def test_all_numeric_with_no_numeric_columns(self):
    """Write the fixture frame with ``allNumeric=True`` and check the output.

    The written text must contain no ``>`` character and neither of the
    numeric markers that the sibling numeric tests look for.
    """
    buffer = StringIO()
    PandasTools.WriteSDF(self.df, buffer, allNumeric=True)
    sdf_text = buffer.getvalue()
    self.assertFalse(">" in sdf_text, sdf_text)
    # double-check that the numeric tests don't pass by accident
    for marker in ("7\n\n", "8\n\n"):
        self.assertNotIn(marker, sdf_text)
def SaveXlsxFromFrame(frame, outFile, molCol='ROMol', size=(300, 300)):
    """
    Saves pandas DataFrame as a xlsx file with embedded images.
    It maps numpy data types to excel cell types:
    int, float -> number (NaN and +/-inf cells are left empty)
    datetime -> datetime
    object -> string (limited to 32k character - xlsx limitations)

    Cells with compound images are a bit larger than images due to excel.
    Column width weirdness explained (from xlsxwriter docs):
    The width corresponds to the column width value that is specified in Excel.
    It is approximately equal to the length of a string in the default font of
    Calibri 11. Unfortunately, there is no way to specify "AutoFit" for a column
    in the Excel file format. This feature is only available at runtime from
    within Excel.

    Arguments:
      - frame: the DataFrame to export
      - outFile: passed straight to xlsxwriter.Workbook
      - molCol: name of the column holding the molecules; rendered as images
        in column A. ValueError is raised if the column is not present.
      - size: (width, height) of the rendered molecule images
    """
    from io import BytesIO  # PNG data is binary, so a bytes buffer is required

    import xlsxwriter  # don't want to make this a RDKit dependency

    cols = list(frame.columns)
    cols.remove(molCol)
    dataTypes = dict(frame.dtypes)
    # Stringify each dtype once instead of once per cell.
    typeNames = dict((x, str(dataTypes[x])) for x in cols)

    workbook = xlsxwriter.Workbook(outFile)  # New workbook
    worksheet = workbook.add_worksheet()  # New work sheet
    worksheet.set_column('A:A', size[0] / 6.)  # column width

    # Write first row with column names
    for c2, x in enumerate(cols, 1):
        worksheet.write_string(0, c2, x)

    # The buffers are handed to xlsxwriter by reference, so they are kept
    # open until the workbook has been closed.
    imageBuffers = []
    for c, (index, row) in enumerate(frame.iterrows(), 1):
        image_data = BytesIO()
        imageBuffers.append(image_data)
        img = Draw.MolToImage(row[molCol], size=size)
        img.save(image_data, format='PNG')

        worksheet.set_row(c, height=size[1])  # looks like height is not in px?
        worksheet.insert_image(c, 0, "f", {'image_data': image_data})

        for c2, x in enumerate(cols, 1):
            typeName = typeNames[x]
            value = row[x]
            if typeName == "object":
                # string length is limited in xlsx
                worksheet.write_string(c, c2, str(value)[:32000])
            elif ('float' in typeName) or ('int' in typeName):
                # The previous "!= nan or != inf" test was always true;
                # isfinite really skips NaN and infinities.
                if np.isfinite(value):
                    worksheet.write_number(c, c2, value)
            elif 'datetime' in typeName:
                worksheet.write_datetime(c, c2, value)

    workbook.close()
    # Close every buffer; with an empty frame there is simply nothing to close.
    for image_data in imageBuffers:
        image_data.close()
def test_specify_numeric_column(self):
    """Request a single derived numeric column through ``properties``.

    A ``len2`` column (length of each ``ID``) is added to the fixture frame
    and must appear as a tag twice, together with both numeric markers.
    """
    frame = self.df
    frame["len2"] = frame["ID"].map(len)
    buffer = StringIO()
    PandasTools.WriteSDF(frame, buffer, properties=["len2"])
    sdf_text = buffer.getvalue()
    self.assertEqual(sdf_text.count("<len2>"), 2)
    for marker in ("7\n\n", "8\n\n"):
        self.assertIn(marker, sdf_text)
def test_all_numeric_with_numeric_columns(self):
    """Write with ``allNumeric=True`` after adding a numeric column.

    A ``len`` column (length of each ``ID``) is added to the fixture frame
    and must appear as a tag twice, together with both numeric markers.
    """
    frame = self.df
    frame["len"] = frame["ID"].map(len)
    buffer = StringIO()
    PandasTools.WriteSDF(frame, buffer, allNumeric=True)
    sdf_text = buffer.getvalue()
    self.assertEqual(sdf_text.count("<len>"), 2)
    for marker in ("7\n\n", "8\n\n"):
        self.assertIn(marker, sdf_text)
def SaveXlsxFromFrame(frame, outFile, molCol='ROMol', size=(300,300)):
    """
    Saves pandas DataFrame as a xlsx file with embedded images.
    It maps numpy data types to excel cell types:
    int, float -> number (NaN and +/-inf cells are left empty)
    datetime -> datetime
    object -> string (limited to 32k character - xlsx limitations)

    Cells with compound images are a bit larger than images due to excel.
    Column width weirdness explained (from xlsxwriter docs):
    The width corresponds to the column width value that is specified in Excel.
    It is approximately equal to the length of a string in the default font of
    Calibri 11. Unfortunately, there is no way to specify "AutoFit" for a column
    in the Excel file format. This feature is only available at runtime from
    within Excel.

    Arguments:
      - frame: the DataFrame to export
      - outFile: passed straight to xlsxwriter.Workbook
      - molCol: name of the column holding the molecules; rendered as images
        in column A. ValueError is raised if the column is not present.
      - size: (width, height) of the rendered molecule images
    """
    from io import BytesIO  # PNG data is binary, so a bytes buffer is required

    import xlsxwriter  # don't want to make this a RDKit dependency

    cols = list(frame.columns)
    cols.remove(molCol)
    dataTypes = dict(frame.dtypes)

    workbook = xlsxwriter.Workbook(outFile)  # New workbook
    worksheet = workbook.add_worksheet()  # New work sheet
    worksheet.set_column('A:A', size[0]/6.)  # column width

    # Write first row with column names
    c2 = 1
    for x in cols:
        worksheet.write_string(0, c2, x)
        c2 += 1

    # xlsxwriter keeps a reference to each image buffer, so every buffer is
    # remembered here and only closed once the workbook has been closed.
    openBuffers = []
    c = 1
    for index, row in frame.iterrows():
        image_data = BytesIO()
        openBuffers.append(image_data)
        img = Draw.MolToImage(row[molCol], size=size)
        img.save(image_data, format='PNG')

        worksheet.set_row(c, height=size[1])  # looks like height is not in px?
        worksheet.insert_image(c, 0, "f", {'image_data': image_data})

        c2 = 1
        for x in cols:
            dtypeName = str(dataTypes[x])
            if dtypeName == "object":
                # string length is limited in xlsx
                worksheet.write_string(c, c2, str(row[x])[:32000])
            elif ('float' in dtypeName) or ('int' in dtypeName):
                # "x != nan or x != inf" can never be false; use isfinite so
                # NaN and infinities are actually skipped.
                if np.isfinite(row[x]):
                    worksheet.write_number(c, c2, row[x])
            elif 'datetime' in dtypeName:
                worksheet.write_datetime(c, c2, row[x])
            c2 += 1
        c += 1

    workbook.close()
    # Safe for an empty frame: the list is simply empty.
    for buf in openBuffers:
        buf.close()
def _get_image(x):
    """displayhook function for PIL Images, rendered as PNG

    Saves ``x`` as PNG into an in-memory buffer and returns the
    base64-encoded result. As a side effect the pandas display options
    ``display.max_columns`` and ``display.max_rows`` are set to the encoded
    length plus 1000, and ``display.max_colwidth`` is raised to the same
    value when the encoded length plus 100 exceeds the current setting.
    """
    import pandas as pd

    buffer = StringIO()
    x.save(buffer, format='PNG')
    encoded = b64encode(buffer.getvalue())

    roomy = len(encoded) + 1000
    for option in ('display.max_columns', 'display.max_rows'):
        pd.set_option(option, roomy)
    if pd.get_option("display.max_colwidth") < len(encoded) + 100:
        pd.set_option("display.max_colwidth", roomy)
    return encoded
def test2(self):
    """Convert the first five NCI molecules with ``AMW`` as the key column.

    Expects six non-blank output lines, a 20-field first line, and ``AMW``
    followed by ``SMILES`` as its first two fields.
    """
    sdf_path = os.path.join(RDConfig.RDDataDir, 'NCI', 'first_200.props.sdf')
    supplier = Chem.SDMolSupplier(sdf_path)
    out = StringIO()
    try:
        Convert(supplier, out, keyCol='AMW', stopAfter=5)
    except Exception:
        import traceback
        traceback.print_exc()
        self.fail('conversion failed')

    # Blank lines are ignored when counting rows.
    rows = [row for row in out.getvalue().split('\n') if row.strip()]
    self.assertTrue(len(rows) == 6, 'bad num lines: %d' % len(rows))
    header = rows[0].split(',')
    self.assertEqual(len(header), 20)
    self.assertTrue(header[0] == 'AMW')
    self.assertTrue(header[1] == 'SMILES')
def test1(self):
    """Convert the whole NCI sample file with default settings.

    Expects 201 output lines (after dropping one trailing empty line), a
    20-field first line, and ``SMILES`` as its first field.
    """
    sdf_path = os.path.join(RDConfig.RDDataDir, 'NCI', 'first_200.props.sdf')
    supplier = Chem.SDMolSupplier(sdf_path)
    out = StringIO()
    try:
        Convert(supplier, out)
    except Exception:
        import traceback
        traceback.print_exc()
        self.fail('conversion failed')

    rows = out.getvalue().split('\n')
    # Drop the empty trailing entry left by a final newline.
    if not rows[-1]:
        rows.pop()
    self.assertTrue(len(rows) == 201, 'bad num lines: %d' % len(rows))
    header = rows[0].split(',')
    self.assertEqual(len(header), 20)
    self.assertTrue(header[0] == 'SMILES')
def test2(self):
    """Convert the first five NCI molecules with ``AMW`` as the key column.

    Expects six output lines (after dropping one trailing empty line), a
    20-field first line, and ``AMW`` followed by ``SMILES`` as its first
    two fields.
    """
    import os
    from rdkit.six.moves import cStringIO as StringIO
    fName = os.path.join(RDConfig.RDDataDir,'NCI','first_200.props.sdf')
    suppl = Chem.SDMolSupplier(fName)
    io = StringIO()
    try:
        Convert(suppl,io,keyCol='AMW',stopAfter=5)
    except Exception:
        # Only real errors are turned into a test failure; a bare "except:"
        # would also swallow KeyboardInterrupt and SystemExit.
        import traceback
        traceback.print_exc()
        self.fail('conversion failed')
    txt = io.getvalue()
    lines = txt.split('\n')
    # Drop the empty trailing entry left by a final newline.
    if not lines[-1]:
        del lines[-1]
    self.assertTrue(len(lines)==6,'bad num lines: %d'%len(lines))
    line0 = lines[0].split(',')
    self.assertEqual(len(line0),20)
    self.assertTrue(line0[0]=='AMW')
    self.assertTrue(line0[1]=='SMILES')
def test2(self):
    """Convert the first five NCI molecules with ``AMW`` as the key column.

    Expects six output lines (after dropping one trailing empty line), a
    20-field first line, and ``AMW`` followed by ``SMILES`` as its first
    two fields.
    """
    import os
    from rdkit.six.moves import cStringIO as StringIO  #@UnresolvedImport #pylint: disable=F0401

    sdf_path = os.path.join(RDConfig.RDDataDir, 'NCI', 'first_200.props.sdf')
    supplier = Chem.SDMolSupplier(sdf_path)
    out = StringIO()
    try:
        Convert(supplier, out, keyCol='AMW', stopAfter=5)
    except Exception:
        import traceback
        traceback.print_exc()
        self.fail('conversion failed')

    rows = out.getvalue().split('\n')
    # Drop the empty trailing entry left by a final newline.
    if not rows[-1]:
        rows.pop()
    self.assertTrue(len(rows) == 6, 'bad num lines: %d' % len(rows))
    header = rows[0].split(',')
    self.assertEqual(len(header), 20)
    self.assertTrue(header[0] == 'AMW')
    self.assertTrue(header[1] == 'SMILES')
def test1(self):
    """Convert the whole NCI sample file with default settings.

    Expects 201 output lines (after dropping one trailing empty line), a
    20-field first line, and ``SMILES`` as its first field.
    """
    import os
    from rdkit.six.moves import cStringIO as StringIO  # @UnresolvedImport #pylint: disable=F0401

    sdf_path = os.path.join(RDConfig.RDDataDir, "NCI", "first_200.props.sdf")
    supplier = Chem.SDMolSupplier(sdf_path)
    out = StringIO()
    try:
        Convert(supplier, out)
    except Exception:
        import traceback
        traceback.print_exc()
        self.fail("conversion failed")

    rows = out.getvalue().split("\n")
    # Drop the empty trailing entry left by a final newline.
    if not rows[-1]:
        rows.pop()
    self.assertTrue(len(rows) == 201, "bad num lines: %d" % len(rows))
    header = rows[0].split(",")
    self.assertEqual(len(header), 20)
    self.assertTrue(header[0] == "SMILES")
def BuildFuncGroupHierarchy(fileNm=None,data=None,force=False):
    """Parse a functional-group definition file into a tree of nodes.

    Each non-empty line (text after ``//`` is ignored) holds tab-separated
    fields: a dotted label, a SMARTS pattern, a name and, optionally, a
    reaction SMARTS. A label such as ``A.B`` makes the node a child of the
    node labelled ``A``, which must already have been defined.

    Arguments:
      - fileNm: file to read; when neither fileNm nor data is given the
        ``Functional_Group_Hierarchy.txt`` file in RDConfig.RDDataDir is used
      - data: the definitions as a string, used when fileNm is not given
      - force: rebuild even when a previously built hierarchy is available

    Returns the list of top-level nodes. When a hierarchy has already been
    built and the arguments do not ask for something different, a copy of
    that list is returned without re-parsing.

    Raises FuncGroupFileParseError for short lines, duplicate labels,
    missing ancestors and unparseable SMARTS.

    Side effects: rebinds the module globals groupDefns, hierarchy, lastData
    and lastFilename.
    """
    global groupDefns,hierarchy,lastData,lastFilename
    if not force and hierarchy and (not data or data==lastData) and \
       (not fileNm or fileNm==lastFilename):
        return hierarchy[:]
    lastData=data
    splitter = re.compile('\t+')
    from rdkit import Chem
    if not fileNm and not data:
        fileNm = os.path.join(RDConfig.RDDataDir,'Functional_Group_Hierarchy.txt')

    if fileNm:
        # Read everything up front so the file handle is closed promptly,
        # including when a parse error is raised further down.
        with open(fileNm,'r') as inF:
            inLines = inF.readlines()
        lastFilename = fileNm
    elif data:
        inLines = StringIO(data).readlines()
    else:
        raise ValueError("need data or filename")

    groupDefns={}
    res = []
    for lineNo,line in enumerate(inLines,1):
        line = line.strip()
        line = line.split('//')[0]
        if not line:
            continue
        splitL = splitter.split(line)
        if len(splitL)<3:
            raise FuncGroupFileParseError("Input line %d (%s) is not long enough."%(lineNo,repr(line)))
        label = splitL[0].strip()
        if label in groupDefns:
            raise FuncGroupFileParseError("Duplicate label on line %d."%lineNo)

        labelHierarchy = label.split('.')
        if len(labelHierarchy)>1:
            # Every ancestor prefix of the label must already be known.
            for depth in range(1,len(labelHierarchy)):
                tmp = '.'.join(labelHierarchy[:depth])
                if tmp not in groupDefns:
                    raise FuncGroupFileParseError("Hierarchy member %s (line %d) not found."%(tmp,lineNo))
            parent = groupDefns['.'.join(labelHierarchy[:-1])]
        else:
            parent = None

        smarts = splitL[1]
        try:
            patt = Chem.MolFromSmarts(smarts)
        except Exception:
            # Report the underlying problem, then fall through to the
            # uniform parse error below. A bare "except:" would also trap
            # KeyboardInterrupt and SystemExit.
            import traceback
            traceback.print_exc()
            patt = None
        if not patt:
            raise FuncGroupFileParseError('Smarts "%s" (line %d) could not be parsed.'%(smarts,lineNo))

        name = splitL[2].strip()
        rxnSmarts = splitL[3] if len(splitL)>3 else ''
        node = FGHierarchyNode(name,patt,smarts=smarts,label=label,parent=parent,rxnSmarts=rxnSmarts)
        if parent:
            parent.children.append(node)
        else:
            res.append(node)
        groupDefns[label] = node
    hierarchy=res[:]
    return res
def BuildFuncGroupHierarchy(fileNm=None,data=None,force=False):
    """Parse a functional-group definition file into a tree of nodes.

    Each non-empty line (text after ``//`` is ignored) holds tab-separated
    fields: a dotted label, a SMARTS pattern, a name and, optionally, a
    reaction SMARTS. A label such as ``A.B`` makes the node a child of the
    node labelled ``A``, which must already have been defined.

    Arguments:
      - fileNm: file to read; when neither fileNm nor data is given the
        ``Functional_Group_Hierarchy.txt`` file in RDConfig.RDDataDir is used
      - data: the definitions as a string, used when fileNm is not given
      - force: rebuild even when a previously built hierarchy is available

    Returns the list of top-level nodes. When a hierarchy has already been
    built and the arguments do not ask for something different, a copy of
    that list is returned without re-parsing.

    Raises FuncGroupFileParseError for short lines, duplicate labels,
    missing ancestors and unparseable SMARTS.

    Side effects: rebinds the module globals groupDefns, hierarchy, lastData
    and lastFilename.
    """
    global groupDefns,hierarchy,lastData,lastFilename
    if not force and hierarchy and (not data or data==lastData) and \
       (not fileNm or fileNm==lastFilename):
        return hierarchy[:]
    lastData=data
    splitter = re.compile('\t+')
    from rdkit import Chem
    if not fileNm and not data:
        fileNm = os.path.join(RDConfig.RDDataDir,'Functional_Group_Hierarchy.txt')

    if fileNm:
        # Read everything up front so the file handle is closed promptly,
        # including when a parse error is raised further down.
        with open(fileNm,'r') as inF:
            inLines = inF.readlines()
        lastFilename = fileNm
    elif data:
        inLines = StringIO(data).readlines()
    else:
        raise ValueError("need data or filename")

    groupDefns={}
    res = []
    lineNo=0
    for line in inLines:
        lineNo+=1
        line = line.strip()
        line = line.split('//')[0]
        if not line:
            continue
        splitL = splitter.split(line)
        if len(splitL)<3:
            raise FuncGroupFileParseError("Input line %d (%s) is not long enough."%(lineNo,repr(line)))
        label = splitL[0].strip()
        if label in groupDefns:
            raise FuncGroupFileParseError("Duplicate label on line %d."%lineNo)

        labelHierarchy = label.split('.')
        if len(labelHierarchy)>1:
            # Every ancestor prefix of the label must already be known.
            for i in range(len(labelHierarchy)-1):
                tmp = '.'.join(labelHierarchy[:i+1])
                if tmp not in groupDefns:
                    raise FuncGroupFileParseError("Hierarchy member %s (line %d) not found."%(tmp,lineNo))
            parent = groupDefns['.'.join(labelHierarchy[:-1])]
        else:
            parent = None

        smarts = splitL[1]
        patt = Chem.MolFromSmarts(smarts)
        if not patt:
            raise FuncGroupFileParseError('Smarts "%s" (line %d) could not be parsed.'%(smarts,lineNo))

        name = splitL[2].strip()
        rxnSmarts=''
        if len(splitL)>3:
            rxnSmarts=splitL[3]
        node = FGHierarchyNode(name,patt,smarts=smarts,label=label,parent=parent,rxnSmarts=rxnSmarts)
        if parent:
            parent.children.append(node)
        else:
            res.append(node)
        groupDefns[label] = node
    hierarchy=res[:]
    return res
def test_default_write_does_not_include_tags(self):
    """Write the fixture frame with default options and check for ``prop2``.

    The written text must not mention ``prop2``.
    """
    sio = StringIO()
    PandasTools.WriteSDF(self.df, sio)
    s = sio.getvalue()
    # assertNotIn takes (member, container). The arguments used to be
    # swapped, which asked whether the whole output was a substring of
    # "prop2" and therefore passed regardless of what was written.
    self.assertNotIn("prop2", s)
def test_identifier_from_a_column(self):
    """Write with ``idName="prop2"`` and check the first output line.

    The first line of the written text must be ``qwe``.
    """
    buffer = StringIO()
    PandasTools.WriteSDF(self.df, buffer, idName="prop2")
    # Everything before the first newline is the first line of the output.
    first_line, _, _ = buffer.getvalue().partition("\n")
    self.assertEqual(first_line, "qwe")