def outputRedirect():
    """Temporarily replace sys.stdout and sys.stderr with StringIO buffers.

    Yields a ``(stdoutBuffer, stderrBuffer)`` tuple. The streams that were
    active on entry are reinstated once the generator is finished or closed.
    """
    try:
        savedOut = sys.stdout
        savedErr = sys.stderr
        capturedOut = StringIO()
        sys.stdout = capturedOut
        capturedErr = StringIO()
        sys.stderr = capturedErr
        yield (capturedOut, capturedErr)
    finally:
        # Always put the original streams back, even if the consumer raised.
        sys.stdout = savedOut
        sys.stderr = savedErr
def _testStreamRoundtrip(self):
    """Round-trip the molecules of ``self.fName`` through an SD writer.

    The molecules are written to a temporary SD file, the file is read back
    as text, parsed again through a stream-based supplier and dumped. The
    record counts and the dumped text must match what was written.
    """
    with open(self.fName) as inFile:
        inD = inFile.read()
    supp = Chem.SDMolSupplier(self.fName)
    # mkstemp creates the file itself, avoiding the name race of mktemp;
    # only the path is needed, so the descriptor is closed immediately.
    fd, outName = tempfile.mkstemp('.sdf')
    os.close(fd)
    writer = Chem.SDWriter(outName)
    # NOTE(review): the first record is fetched and discarded; the count check
    # below only holds if iterating ``supp`` restarts from the beginning -
    # confirm against the supplier's iterator behaviour.
    next(supp)
    for m in supp:
        writer.write(m)
    writer.flush()
    # Drop the only reference so the writer lets go of the output file.
    writer = None
    with open(outName, 'r') as outFile:
        outD = outFile.read()
    try:
        os.unlink(outName)
    except OSError:
        # The file may still be held briefly; wait and retry once,
        # then give up quietly (cleanup is best effort).
        import time
        time.sleep(1)
        try:
            os.unlink(outName)
        except OSError:
            pass
    assert inD.count('$$$$') == outD.count('$$$$'), 'bad nMols in output'
    io = StringIO(outD)
    supp = Chem.SDMolSupplier(stream=io)
    outD2 = supp.Dump()
    assert outD2.count('$$$$') == len(supp), 'bad nMols in output'
    assert outD2.count('$$$$') == outD.count('$$$$'), 'bad nMols in output'
    assert outD2 == outD, 'bad outd'
def _initPatterns(self):
    """ Build ``self.salts`` from the salt definitions.

    Definitions come from ``self.defnData`` when it is set, otherwise from
    the file named by ``self.defnFilename``. Each non-empty line (after
    removing anything following ``//``) contributes the pattern parsed from
    its first whitespace-separated field.

    Raises ValueError (carrying the offending line) when a pattern cannot be
    turned into a query molecule.

    >>> remover = SaltRemover()
    >>> len(remover.salts)>0
    True

    >>> remover = SaltRemover(defnData="[Cl,Br]")
    >>> len(remover.salts)
    1

    """
    whitespace = re.compile(r'[\t ]+')
    if self.defnData:
        from rdkit.six.moves import cStringIO as StringIO
        inF = StringIO(self.defnData)
    else:
        inF = open(self.defnFilename, 'r')
    self.salts = []
    try:
        for line in inF:
            line = line.strip().split('//')[0]
            if not line:
                continue
            splitL = whitespace.split(line)
            try:
                salt = Chem.MolFromSmarts(splitL[0])
            except Exception:
                import traceback
                traceback.print_exc()
                raise ValueError(line)
            # A failed parse yields None rather than an exception; do not
            # let an unusable pattern slip into the salt list.
            if salt is None:
                raise ValueError(line)
            self.salts.append(salt)
    finally:
        # Release the definition source whether or not parsing succeeded.
        inF.close()
def test_load_from_sio(self):
    # Two records supplied through an in-memory stream must give two rows
    # with the expected identifiers and atom counts.
    sdfStream = StringIO(methane + peroxide)
    frame = PandasTools.LoadSDF(sdfStream)
    self.assertEqual(len(frame), 2)
    identifiers = list(frame["ID"])
    self.assertEqual(identifiers, ["Methane", "Peroxide"])
    atom_counts = []
    for mol in frame["ROMol"]:
        atom_counts.append(mol.GetNumAtoms())
    self.assertEqual(atom_counts, [1, 2])
def test_load_specify_column_names(self):
    # Custom names for the identifier and molecule columns must be honoured.
    sdfStream = StringIO(methane + peroxide)
    frame = PandasTools.LoadSDF(sdfStream, idName="CorpID", molColName="_rdmol")
    self.assertEqual(len(frame), 2)
    identifiers = list(frame["CorpID"])
    self.assertEqual(identifiers, ["Methane", "Peroxide"])
    atom_counts = []
    for mol in frame["_rdmol"]:
        atom_counts.append(mol.GetNumAtoms())
    self.assertEqual(atom_counts, [1, 2])
def test_all_numeric_with_no_numeric_columns(self):
    # With allNumeric=True and no numeric columns in the frame, no data
    # items should appear in the written text.
    buffer = StringIO()
    PandasTools.WriteSDF(self.df, buffer, allNumeric=True)
    s = buffer.getvalue()
    hasTag = ">" in s
    self.assertFalse(hasTag, s)
    # double-check that the numeric tests don't pass by accident
    for numericValue in ("7\n\n", "8\n\n"):
        self.assertNotIn(numericValue, s)
def _initPatterns(self):
    """ Build ``self.salts`` from the salt definitions.

    Definitions come from ``self.defnData`` when it is set, otherwise from
    the file named by ``self.defnFilename``. Each non-empty line (after
    removing anything following ``//``) contributes the pattern parsed from
    its first whitespace-separated field.

    Raises ValueError (carrying the offending line) when a pattern cannot be
    parsed.

    >>> remover = SaltRemover()
    >>> len(remover.salts)>0
    True

    >>> remover = SaltRemover(defnData="[Cl,Br]")
    >>> len(remover.salts)
    1

    >>> from rdkit import RDLogger
    >>> RDLogger.DisableLog('rdApp.error')
    >>> remover = SaltRemover(defnData="[Cl,fail]")
    Traceback (most recent call last):
      ...
    ValueError: [Cl,fail]

    >>> RDLogger.EnableLog('rdApp.error')
    """
    whitespace = re.compile(r'[\t ]+')
    if self.defnData:
        from rdkit.six.moves import cStringIO as StringIO
        inF = StringIO(self.defnData)
    else:
        inF = open(self.defnFilename, 'r')
    self.salts = []
    try:
        for line in inF:
            line = line.strip().split('//')[0]
            if not line:
                continue
            splitL = whitespace.split(line)
            salt = Chem.MolFromSmarts(splitL[0])
            if salt is None:
                raise ValueError(line)
            self.salts.append(salt)
    finally:
        # Release the definition source whether or not parsing succeeded;
        # previously a file opened here was never closed.
        inF.close()
def SaveXlsxFromFrame(frame, outFile, molCol='ROMol', size=(300, 300)):
    """
    Saves pandas DataFrame as a xlsx file with embedded images.
    It maps numpy data types to excel cell types:
    int, float -> number
    datetime -> datetime
    object -> string (limited to 32k character - xlsx limitations)

    Numeric cells holding NaN or +/-inf are left empty.

    Cells with compound images are a bit larger than images due to excel.
    Column width weirdness explained (from xlsxwriter docs):
    The width corresponds to the column width value that is specified in Excel.
    It is approximately equal to the length of a string in the default font of Calibri 11.
    Unfortunately, there is no way to specify "AutoFit" for a column in the Excel file format.
    This feature is only available at runtime from within Excel.

    Arguments:
      - frame: the DataFrame to save; must contain the column named by molCol
      - outFile: passed to xlsxwriter.Workbook as the output target
      - molCol: name of the column holding the molecules to depict
      - size: (width, height) passed to Draw.MolToImage; also drives the
        width of the first column and the height of each data row
    """
    import xlsxwriter  # don't want to make this a RDKit dependency
    from io import BytesIO  # PNG data is binary, so a bytes buffer is required

    cols = list(frame.columns)
    cols.remove(molCol)
    dataTypes = dict(frame.dtypes)

    workbook = xlsxwriter.Workbook(outFile)  # New workbook
    worksheet = workbook.add_worksheet()  # New work sheet
    worksheet.set_column('A:A', size[0] / 6.)  # column width

    # Write first row with column names (column 0 is reserved for the images)
    for colIdx, colName in enumerate(cols, 1):
        worksheet.write_string(0, colIdx, colName)

    # Every image buffer is kept open until the workbook has been written.
    imageBuffers = []
    for rowIdx, (index, row) in enumerate(frame.iterrows(), 1):
        image_data = BytesIO()
        imageBuffers.append(image_data)
        img = Draw.MolToImage(row[molCol], size=size)
        img.save(image_data, format='PNG')

        worksheet.set_row(rowIdx, height=size[1])  # looks like height is not in px?
        worksheet.insert_image(rowIdx, 0, "f", {'image_data': image_data})

        for colIdx, colName in enumerate(cols, 1):
            typeName = str(dataTypes[colName])
            value = row[colName]
            if typeName == "object":
                # string length is limited in xlsx
                worksheet.write_string(rowIdx, colIdx, str(value)[:32000])
            elif ('float' in typeName) or ('int' in typeName):
                # The old test "(v != nan) or (v != inf)" was always true;
                # skip values that are not finite numbers instead.
                if np.isfinite(value):
                    worksheet.write_number(rowIdx, colIdx, value)
            elif 'datetime' in typeName:
                worksheet.write_datetime(rowIdx, colIdx, value)

    workbook.close()
    # Safe for an empty frame too: there is simply nothing to close.
    for buf in imageBuffers:
        buf.close()
def test_specify_numeric_column(self):
    # A numeric column requested through ``properties`` must be written
    # once per record, with its integer values.
    buffer = StringIO()
    frame = self.df
    frame["len2"] = frame["ID"].map(len)
    PandasTools.WriteSDF(frame, buffer, properties=["len2"])
    s = buffer.getvalue()
    tagCount = s.count("<len2>")
    self.assertEqual(tagCount, 2)
    for expectedValue in ("7\n\n", "8\n\n"):
        self.assertIn(expectedValue, s)
def test_all_numeric_with_numeric_columns(self):
    # With allNumeric=True a numeric column must be written once per record.
    buffer = StringIO()
    frame = self.df
    frame["len"] = frame["ID"].map(len)
    PandasTools.WriteSDF(frame, buffer, allNumeric=True)
    s = buffer.getvalue()
    tagCount = s.count("<len>")
    self.assertEqual(tagCount, 2)
    for expectedValue in ("7\n\n", "8\n\n"):
        self.assertIn(expectedValue, s)
def _initPatterns(self):
    """ Populate ``self.salts`` from in-memory data or from a definition file.

    >>> remover = SaltRemover()
    >>> len(remover.salts)>0
    True

    Default input format is SMARTS
    >>> remover = SaltRemover(defnData="[Cl,Br]")
    >>> len(remover.salts)
    1

    >>> remover = SaltRemover(defnData="[Na+]\\nCC(=O)O", defnFormat=InputFormat.SMILES)
    >>> len(remover.salts)
    2

    >>> from rdkit import RDLogger
    >>> RDLogger.DisableLog('rdApp.error')
    >>> remover = SaltRemover(defnData="[Cl,fail]")
    Traceback (most recent call last):
      ...
    ValueError: [Cl,fail]

    >>> RDLogger.EnableLog('rdApp.error')
    """
    if not self.defnData:
        # File-based definitions: the reader is chosen by the declared format.
        if self.defnFormat == InputFormat.SMARTS:
            self.salts = [entry for entry in _getSmartsSaltsFromFile(self.defnFilename)]
        elif self.defnFormat == InputFormat.MOL:
            self.salts = [entry for entry in SDMolSupplier(self.defnFilename)]
        elif self.defnFormat == InputFormat.SMILES:
            self.salts = [entry for entry in SmilesMolSupplier(self.defnFilename)]
        else:
            raise ValueError('Unsupported format for supplier.')
        return

    # In-memory definitions: one pattern per line of the supplied text.
    from rdkit.six.moves import cStringIO as StringIO
    inF = StringIO(self.defnData)
    with closing(inF):
        self.salts = []
        for entryLine in inF:
            if not entryLine:
                continue
            if self.defnFormat == InputFormat.SMARTS:
                parsed = _smartsFromSmartsLine(entryLine)
            elif self.defnFormat == InputFormat.SMILES:
                parsed = Chem.MolFromSmiles(entryLine)
            else:
                raise ValueError('Unsupported format for supplier.')
            if parsed is None:
                raise ValueError(entryLine)
            self.salts.append(parsed)
def test_properties(self):
    # Data items become columns; a record lacking an item gets NaN there.
    sdfStream = StringIO(peroxide + methane)
    frame = PandasTools.LoadSDF(sdfStream)
    expectedColumns = set("ROMol ID prop1 prop2 prop3".split())
    self.assertEqual(set(frame.columns), expectedColumns)

    first1, second1 = list(frame["prop1"])[:2]
    self.assertTrue(numpy.isnan(first1), first1)
    self.assertEqual(second1, "12.34")

    self.assertEqual(list(frame["prop2"]), ["rtz", "qwe"])

    first3, second3 = list(frame["prop3"])[:2]
    self.assertEqual(first3, "yxcv")
    self.assertTrue(numpy.isnan(second3), second3)
def test2(self):
    # Convert with a key column and a stop limit: a header plus five rows
    # are expected, with the key column first.
    sdfPath = os.path.join(RDConfig.RDDataDir, 'NCI', 'first_200.props.sdf')
    suppl = Chem.SDMolSupplier(sdfPath)
    io = StringIO()
    try:
        Convert(suppl, io, keyCol='AMW', stopAfter=5)
    except Exception:
        import traceback
        traceback.print_exc()
        self.fail('conversion failed')
    txt = io.getvalue()
    # Ignore lines that are empty or whitespace-only.
    lines = []
    for candidate in txt.split('\n'):
        if candidate.strip() != '':
            lines.append(candidate)
    self.assertTrue(len(lines) == 6, 'bad num lines: %d' % len(lines))
    header = lines[0].split(',')
    self.assertEqual(len(header), 20)
    self.assertTrue(header[0] == 'AMW')
    self.assertTrue(header[1] == 'SMILES')
def test1(self):
    # Convert the full file: a header plus 200 rows are expected, with the
    # SMILES column first.
    sdfPath = os.path.join(RDConfig.RDDataDir, 'NCI', 'first_200.props.sdf')
    suppl = Chem.SDMolSupplier(sdfPath)
    io = StringIO()
    try:
        Convert(suppl, io)
    except Exception:
        import traceback
        traceback.print_exc()
        self.fail('conversion failed')
    txt = io.getvalue()
    lines = txt.split('\n')
    # A trailing newline leaves one empty final element; discard it.
    if lines[-1] == '':
        lines.pop()
    self.assertTrue(len(lines) == 201, 'bad num lines: %d' % len(lines))
    header = lines[0].split(',')
    self.assertEqual(len(header), 20)
    self.assertTrue(header[0] == 'SMILES')
def test2(self):
    # Convert with a key column and a stop limit: a header plus five rows
    # are expected, with the key column first.
    import os
    from rdkit.six.moves import cStringIO as StringIO  #@UnresolvedImport #pylint: disable=F0401
    sdfPath = os.path.join(RDConfig.RDDataDir, 'NCI', 'first_200.props.sdf')
    suppl = Chem.SDMolSupplier(sdfPath)
    io = StringIO()
    try:
        Convert(suppl, io, keyCol='AMW', stopAfter=5)
    except Exception:
        import traceback
        traceback.print_exc()
        self.fail('conversion failed')
    txt = io.getvalue()
    lines = txt.split('\n')
    # A trailing newline leaves one empty final element; discard it.
    if lines[-1] == '':
        lines.pop()
    self.assertTrue(len(lines) == 6, 'bad num lines: %d' % len(lines))
    header = lines[0].split(',')
    self.assertEqual(len(header), 20)
    self.assertTrue(header[0] == 'AMW')
    self.assertTrue(header[1] == 'SMILES')
def test_passed_in_file_is_not_closed(self):
    # A caller-supplied stream must still be open after loading.
    sdfStream = StringIO(methane)
    frame = PandasTools.LoadSDF(sdfStream)
    rowCount = len(frame)
    self.assertEqual(rowCount, 1)
    self.assertFalse(sdfStream.closed)
def test_empty_file(self):
    # Should return an empty data frame with no rows or columns
    emptyStream = StringIO()
    frame = PandasTools.LoadSDF(emptyStream)
    rowCount = len(frame)
    indexCount = len(frame.index)
    self.assertEqual(rowCount, 0)
    self.assertEqual(indexCount, 0)
def test_ignore_mol_column(self):
    # molColName=None must leave the molecule column out of the frame.
    sdfStream = StringIO(peroxide + methane)
    frame = PandasTools.LoadSDF(sdfStream, molColName=None)
    expectedColumns = set("ID prop1 prop2 prop3".split())
    actualColumns = set(frame.columns)
    self.assertEqual(actualColumns, expectedColumns)
def setUp(self):
    # Give every test a freshly loaded two-record frame.
    sdfStream = StringIO(methane + peroxide)
    loaded = PandasTools.LoadSDF(sdfStream)
    self.df = loaded
def BuildFuncGroupHierarchy(fileNm=None, data=None, force=False):
    """Parse functional-group definitions into a tree of FGHierarchyNode objects.

    Definitions are read from ``fileNm`` if given, otherwise from the text in
    ``data``; with neither, ``Functional_Group_Hierarchy.txt`` in
    ``RDConfig.RDDataDir`` is used. Each line holds tab-separated fields:
    label, SMARTS, name and an optional reaction SMARTS. Text after ``//``
    and empty lines are ignored. A dotted label (``a.b``) names a child of
    the node labelled ``a``, which must already have been defined.

    The result is cached in module globals: unless ``force`` is true, a copy
    of the previous hierarchy is returned when one exists and the supplied
    ``data`` / ``fileNm`` are empty or equal to those last used.

    Returns the list of top-level nodes (those without a parent).

    Raises FuncGroupFileParseError for short lines, duplicate labels,
    undefined ancestors and unparsable SMARTS.
    """
    global groupDefns, hierarchy, lastData, lastFilename
    if not force and hierarchy and (not data or data == lastData) and \
       (not fileNm or fileNm == lastFilename):
        return hierarchy[:]
    lastData = data
    splitter = re.compile('\t+')
    from rdkit import Chem
    if not fileNm and not data:
        fileNm = os.path.join(RDConfig.RDDataDir, 'Functional_Group_Hierarchy.txt')

    # After the default above, either fileNm or data is set, so the old
    # "need data or filename" branch could never run.
    if fileNm:
        # Read everything up front so the file is closed before parsing.
        with open(fileNm, 'r') as inF:
            lastFilename = fileNm
            lines = inF.readlines()
    else:
        lines = StringIO(data).readlines()

    groupDefns = {}
    res = []
    for lineNo, line in enumerate(lines, 1):
        line = line.strip()
        line = line.split('//')[0]
        if not line:
            continue
        splitL = splitter.split(line)
        if len(splitL) < 3:
            raise FuncGroupFileParseError("Input line %d (%s) is not long enough." %
                                          (lineNo, repr(line)))
        label = splitL[0].strip()
        if label in groupDefns:
            raise FuncGroupFileParseError("Duplicate label on line %d." % lineNo)
        labelHierarchy = label.split('.')
        if len(labelHierarchy) > 1:
            # Every ancestor prefix must already be known.
            for depth in range(1, len(labelHierarchy)):
                tmp = '.'.join(labelHierarchy[:depth])
                if tmp not in groupDefns:
                    raise FuncGroupFileParseError("Hierarchy member %s (line %d) not found." %
                                                  (tmp, lineNo))
            parent = groupDefns['.'.join(labelHierarchy[:-1])]
        else:
            parent = None
        smarts = splitL[1]
        patt = Chem.MolFromSmarts(smarts)
        if not patt:
            raise FuncGroupFileParseError('Smarts "%s" (line %d) could not be parsed.' %
                                          (smarts, lineNo))

        name = splitL[2].strip()

        rxnSmarts = ''
        if len(splitL) > 3:
            rxnSmarts = splitL[3]

        node = FGHierarchyNode(name, patt, smarts=smarts, label=label, parent=parent,
                               rxnSmarts=rxnSmarts)
        if parent:
            parent.children.append(node)
        else:
            res.append(node)
        groupDefns[label] = node
    # Cache a copy so later changes to the returned list do not affect it.
    hierarchy = res[:]
    return res
def test_default_write_does_not_include_tags(self):
    # A default WriteSDF call must not emit the frame's property columns.
    sio = StringIO()
    PandasTools.WriteSDF(self.df, sio)
    s = sio.getvalue()
    # assertNotIn(member, container): the arguments used to be swapped, which
    # only checked that the whole output is not a substring of "prop2".
    self.assertNotIn("prop2", s)
def test_identifier_from_a_column(self):
    # With idName set, the first output line must be that column's value
    # for the first record.
    buffer = StringIO()
    PandasTools.WriteSDF(self.df, buffer, idName="prop2")
    written = buffer.getvalue()
    first_line, _sep, _rest = written.partition("\n")
    self.assertEqual(first_line, "qwe")