Example #1
0
def outputRedirect():
    """Temporarily replace sys.stdout and sys.stderr with in-memory buffers.

    Generator that yields a ``(stdout_buffer, stderr_buffer)`` pair of
    StringIO objects while the redirection is active. The streams that were
    installed on entry are put back once the generator is resumed, closed,
    or has an exception thrown into it.
    """
    try:
        saved_streams = (sys.stdout, sys.stderr)
        captured_out = StringIO()
        sys.stdout = captured_out
        captured_err = StringIO()
        sys.stderr = captured_err
        yield (captured_out, captured_err)
    finally:
        # Restore whatever was installed when the redirection started.
        sys.stdout, sys.stderr = saved_streams
Example #2
0
 def _testStreamRoundtrip(self):
   """Round-trip an SD file through SDWriter and a stream-based supplier.

   Reads ``self.fName``, writes molecules from it to a temporary SD file,
   then re-reads that output from an in-memory stream and checks that the
   number of ``$$$$`` record terminators and the dumped text agree.
   """
   with open(self.fName) as inF:
     inD = inF.read()
   supp = Chem.SDMolSupplier(self.fName)
   # mkstemp creates the file itself, so the name cannot be claimed by
   # another process between choosing it and opening it (unlike mktemp).
   fd, outName = tempfile.mkstemp('.sdf')
   os.close(fd)
   try:
     writer = Chem.SDWriter(outName)
     # NOTE(review): the first record is pulled here and discarded; whether
     # the loop below restarts from the beginning depends on the supplier's
     # iterator behaviour -- confirm.
     next(supp)
     for m in supp:
       writer.write(m)
     writer.flush()
     # presumably dropping the reference lets the writer release the file
     # before it is re-read and deleted -- TODO confirm
     writer = None
     with open(outName, 'r') as outF:
       outD = outF.read()
   finally:
     # Best-effort cleanup: retry once after a pause, then give up quietly.
     try:
       os.unlink(outName)
     except OSError:
       import time
       time.sleep(1)
       try:
         os.unlink(outName)
       except OSError:
         pass
   assert inD.count('$$$$')==outD.count('$$$$'),'bad nMols in output'
   io = StringIO(outD)
   supp = Chem.SDMolSupplier(stream=io)
   outD2 = supp.Dump()
   assert outD2.count('$$$$')==len(supp),'bad nMols in output'
   assert outD2.count('$$$$')==outD.count('$$$$'),'bad nMols in output'
   assert outD2==outD,'bad outd'
Example #3
0
    def _initPatterns(self):
        """Populate ``self.salts`` with query molecules parsed from SMARTS.

        Definitions come from ``self.defnData`` when it is set, otherwise
        from the file named by ``self.defnFilename``. Each line is stripped,
        everything after ``//`` is dropped, and the first whitespace-delimited
        token of what remains is passed to ``Chem.MolFromSmarts``. Lines that
        end up empty are skipped.

        Raises:
            ValueError: if parsing a definition raises or yields ``None``.

    >>> remover = SaltRemover()
    >>> len(remover.salts)>0
    True

    >>> remover = SaltRemover(defnData="[Cl,Br]")
    >>> len(remover.salts)
    1
    
    """
        whitespace = re.compile(r'[\t ]+')
        if self.defnData:
            from rdkit.six.moves import cStringIO as StringIO
            inF = StringIO(self.defnData)
        else:
            inF = open(self.defnFilename, 'r')
        self.salts = []
        # try/finally so the definition source is closed even when a line
        # is rejected part-way through.
        try:
            for line in inF:
                line = line.strip().split('//')[0]
                if not line:
                    continue
                splitL = whitespace.split(line)
                try:
                    salt = Chem.MolFromSmarts(splitL[0])
                except Exception:
                    import traceback
                    traceback.print_exc()
                    raise ValueError(line)
                # A None result would otherwise end up in self.salts.
                if salt is None:
                    raise ValueError(line)
                self.salts.append(salt)
        finally:
            inF.close()
 def test_load_from_sio(self):
     """LoadSDF on a StringIO of methane + peroxide: row count, IDs, atoms."""
     frame = PandasTools.LoadSDF(StringIO(methane + peroxide))
     self.assertEqual(len(frame), 2)
     ids = list(frame["ID"])
     self.assertEqual(ids, ["Methane", "Peroxide"])
     self.assertEqual([mol.GetNumAtoms() for mol in frame["ROMol"]], [1, 2])
 def test_load_specify_column_names(self):
     """LoadSDF with custom idName / molColName uses those column names."""
     frame = PandasTools.LoadSDF(
         StringIO(methane + peroxide), idName="CorpID", molColName="_rdmol")
     self.assertEqual(len(frame), 2)
     ids = list(frame["CorpID"])
     self.assertEqual(ids, ["Methane", "Peroxide"])
     self.assertEqual([mol.GetNumAtoms() for mol in frame["_rdmol"]], [1, 2])
Example #6
0
 def test_all_numeric_with_no_numeric_columns(self):
     """WriteSDF(allNumeric=True) on the unmodified frame emits no '>' at all."""
     buf = StringIO()
     PandasTools.WriteSDF(self.df, buf, allNumeric=True)
     written = buf.getvalue()
     self.assertFalse(">" in written, written)
     # double-check that the numeric tests don't pass by accident
     for value_block in ("7\n\n", "8\n\n"):
         self.assertNotIn(value_block, written)
Example #7
0
    def _initPatterns(self):
        """Populate ``self.salts`` with query molecules parsed from SMARTS.

        Definitions come from ``self.defnData`` when it is set, otherwise
        from the file named by ``self.defnFilename``. Each line is stripped,
        everything after ``//`` is dropped, and the first whitespace-delimited
        token of what remains is passed to ``Chem.MolFromSmarts``. Lines that
        end up empty are skipped.

        Raises:
            ValueError: if ``Chem.MolFromSmarts`` returns ``None`` for a line.

    >>> remover = SaltRemover()
    >>> len(remover.salts)>0
    True

    >>> remover = SaltRemover(defnData="[Cl,Br]")
    >>> len(remover.salts)
    1

    >>> from rdkit import RDLogger
    >>> RDLogger.DisableLog('rdApp.error')
    >>> remover = SaltRemover(defnData="[Cl,fail]")
    Traceback (most recent call last):
      ...
    ValueError: [Cl,fail]

    >>> RDLogger.EnableLog('rdApp.error')
    """
        whitespace = re.compile(r'[\t ]+')
        if self.defnData:
            from rdkit.six.moves import cStringIO as StringIO
            inF = StringIO(self.defnData)
        else:
            inF = open(self.defnFilename, 'r')
        self.salts = []
        # try/finally so the definition source is closed even when a line
        # is rejected with ValueError.
        try:
            for line in inF:
                line = line.strip().split('//')[0]
                if not line:
                    continue
                splitL = whitespace.split(line)
                salt = Chem.MolFromSmarts(splitL[0])
                if salt is None:
                    raise ValueError(line)
                self.salts.append(salt)
        finally:
            inF.close()
Example #8
0
def SaveXlsxFromFrame(frame, outFile, molCol='ROMol', size=(300, 300)):
    """
    Saves pandas DataFrame as a xlsx file with embedded images.
    It maps numpy data types to excel cell types:
    int, float -> number
    datetime -> datetime
    object -> string (limited to 32k character - xlsx limitations)

    Cells with compound images are a bit larger than images due to excel.
    Column width weirdness explained (from xlsxwriter docs):
    The width corresponds to the column width value that is specified in Excel.
    It is approximately equal to the length of a string in the default font of Calibri 11.
    Unfortunately, there is no way to specify "AutoFit" for a column in the Excel file format.
    This feature is only available at runtime from within Excel.

    Arguments:
      - frame: the DataFrame to export
      - outFile: passed straight to ``xlsxwriter.Workbook``
      - molCol: name of the column holding the molecules; it is drawn into
        column A and is not written as a data column
      - size: (width, height) passed to ``Draw.MolToImage``; also used to
        set the width of column A and the height of every data row

    Numeric cells holding NaN or +/-inf are left empty.

    Raises ValueError if ``molCol`` is not one of the frame's columns.
    """
    from io import BytesIO  # PNG data is binary, so a text buffer will not do

    import xlsxwriter  # don't want to make this a RDKit dependency

    cols = list(frame.columns)
    cols.remove(molCol)
    dataTypes = dict(frame.dtypes)

    workbook = xlsxwriter.Workbook(outFile)  # New workbook
    worksheet = workbook.add_worksheet()  # New work sheet
    worksheet.set_column('A:A', size[0] / 6.)  # column width

    # Write first row with column names (column 0 is reserved for the images)
    for colIdx, name in enumerate(cols, 1):
        worksheet.write_string(0, colIdx, name)

    # The buffers are kept open until after workbook.close(); they are only
    # released afterwards in case xlsxwriter reads them lazily at that point.
    imageBuffers = []
    for rowIdx, (_, row) in enumerate(frame.iterrows(), 1):
        image_data = BytesIO()
        imageBuffers.append(image_data)
        img = Draw.MolToImage(row[molCol], size=size)
        img.save(image_data, format='PNG')

        worksheet.set_row(rowIdx, height=size[1])  # looks like height is not in px?
        worksheet.insert_image(rowIdx, 0, "f", {'image_data': image_data})

        for colIdx, name in enumerate(cols, 1):
            typeName = str(dataTypes[name])
            value = row[name]
            if typeName == "object":
                # string length is limited in xlsx
                worksheet.write_string(rowIdx, colIdx, str(value)[:32000])
            elif 'float' in typeName or 'int' in typeName:
                # Comparing against np.nan with != is always True, so test
                # finiteness explicitly to skip NaN and infinities.
                if np.isfinite(value):
                    worksheet.write_number(rowIdx, colIdx, value)
            elif 'datetime' in typeName:
                worksheet.write_datetime(rowIdx, colIdx, value)

    workbook.close()
    for buf in imageBuffers:
        buf.close()
 def test_specify_numeric_column(self):
     """A numeric column named in ``properties`` is written as a <len2> tag."""
     self.df["len2"] = self.df["ID"].map(len)
     buf = StringIO()
     PandasTools.WriteSDF(self.df, buf, properties=["len2"])
     written = buf.getvalue()
     self.assertEqual(written.count("<len2>"), 2)
     for value_block in ("7\n\n", "8\n\n"):
         self.assertIn(value_block, written)
 def test_all_numeric_with_numeric_columns(self):
     """With allNumeric=True an added numeric column is written as <len>."""
     self.df["len"] = self.df["ID"].map(len)
     buf = StringIO()
     PandasTools.WriteSDF(self.df, buf, allNumeric=True)
     written = buf.getvalue()
     self.assertEqual(written.count("<len>"), 2)
     for value_block in ("7\n\n", "8\n\n"):
         self.assertIn(value_block, written)
Example #11
0
    def _initPatterns(self):
        """Fill ``self.salts`` according to ``self.defnFormat``.

        With ``self.defnData`` set, the data is read line by line: SMARTS
        lines go through ``_smartsFromSmartsLine`` and SMILES lines through
        ``Chem.MolFromSmiles``. Otherwise ``self.defnFilename`` is read with
        the SMARTS file reader, ``SDMolSupplier`` (MOL) or
        ``SmilesMolSupplier`` (SMILES).

        Raises ValueError for an unsupported format, or when a line of
        in-memory data produces ``None``.

    >>> remover = SaltRemover()
    >>> len(remover.salts)>0
    True

    Default input format is SMARTS
    >>> remover = SaltRemover(defnData="[Cl,Br]")
    >>> len(remover.salts)
    1

    >>> remover = SaltRemover(defnData="[Na+]\\nCC(=O)O", defnFormat=InputFormat.SMILES)
    >>> len(remover.salts)
    2

    >>> from rdkit import RDLogger
    >>> RDLogger.DisableLog('rdApp.error')
    >>> remover = SaltRemover(defnData="[Cl,fail]")
    Traceback (most recent call last):
      ...
    ValueError: [Cl,fail]

    >>> RDLogger.EnableLog('rdApp.error')
    """
        if not self.defnData:
            # File-based definitions: choose the reader, then collect it.
            if self.defnFormat == InputFormat.SMARTS:
                source = _getSmartsSaltsFromFile(self.defnFilename)
            elif self.defnFormat == InputFormat.MOL:
                source = SDMolSupplier(self.defnFilename)
            elif self.defnFormat == InputFormat.SMILES:
                source = SmilesMolSupplier(self.defnFilename)
            else:
                raise ValueError('Unsupported format for supplier.')
            self.salts = [mol for mol in source]
            return

        # In-memory definitions: one entry per line of defnData.
        from rdkit.six.moves import cStringIO as StringIO
        with closing(StringIO(self.defnData)) as stream:
            self.salts = []
            for entry in stream:
                if not entry:
                    continue
                if self.defnFormat == InputFormat.SMARTS:
                    parsed = _smartsFromSmartsLine(entry)
                elif self.defnFormat == InputFormat.SMILES:
                    parsed = Chem.MolFromSmiles(entry)
                else:
                    raise ValueError('Unsupported format for supplier.')
                if parsed is None:
                    raise ValueError(entry)
                self.salts.append(parsed)
Example #12
0
    def test_properties(self):
        """Check the prop1/prop2/prop3 columns LoadSDF builds, NaNs included."""
        frame = PandasTools.LoadSDF(StringIO(peroxide + methane))
        expected_columns = set("ROMol ID prop1 prop2 prop3".split())
        self.assertEqual(set(frame.columns), expected_columns)

        prop1_values = list(frame["prop1"])
        self.assertTrue(numpy.isnan(prop1_values[0]), prop1_values[0])
        self.assertEqual(prop1_values[1], "12.34")

        prop2_values = list(frame["prop2"])
        self.assertEqual(prop2_values, ["rtz", "qwe"])

        prop3_values = list(frame["prop3"])
        self.assertEqual(prop3_values[0], "yxcv")
        self.assertTrue(numpy.isnan(prop3_values[1]), prop3_values[1])
Example #13
0
 def test2(self):
     """Convert with keyCol='AMW' and stopAfter=5: six non-blank lines, AMW first."""
     sdf_path = os.path.join(RDConfig.RDDataDir, 'NCI', 'first_200.props.sdf')
     supplier = Chem.SDMolSupplier(sdf_path)
     buf = StringIO()
     try:
         Convert(supplier, buf, keyCol='AMW', stopAfter=5)
     except Exception:
         import traceback
         traceback.print_exc()
         self.fail('conversion failed')
     # Only non-blank lines count towards the expected total.
     rows = [row for row in buf.getvalue().split('\n') if row.strip()]
     row_count = len(rows)
     self.assertTrue(row_count == 6, 'bad num lines: %d' % row_count)
     header = rows[0].split(',')
     self.assertEqual(len(header), 20)
     self.assertTrue(header[0] == 'AMW')
     self.assertTrue(header[1] == 'SMILES')
Example #14
0
 def test1(self):
     """Convert with default options: 201 lines, header starting with SMILES."""
     sdf_path = os.path.join(RDConfig.RDDataDir, 'NCI', 'first_200.props.sdf')
     supplier = Chem.SDMolSupplier(sdf_path)
     buf = StringIO()
     try:
         Convert(supplier, buf)
     except Exception:
         import traceback
         traceback.print_exc()
         self.fail('conversion failed')
     rows = buf.getvalue().split('\n')
     # A trailing newline leaves one empty final element; drop it.
     if rows[-1] == '':
         rows.pop()
     row_count = len(rows)
     self.assertTrue(row_count == 201, 'bad num lines: %d' % row_count)
     header = rows[0].split(',')
     self.assertEqual(len(header), 20)
     self.assertTrue(header[0] == 'SMILES')
Example #15
0
 def test2(self):
   """Convert with keyCol='AMW' and stopAfter=5: six lines, AMW then SMILES."""
   import os
   from rdkit.six.moves import cStringIO as StringIO  #@UnresolvedImport #pylint: disable=F0401
   sdf_path = os.path.join(RDConfig.RDDataDir, 'NCI', 'first_200.props.sdf')
   supplier = Chem.SDMolSupplier(sdf_path)
   buf = StringIO()
   try:
     Convert(supplier, buf, keyCol='AMW', stopAfter=5)
   except Exception:
     import traceback
     traceback.print_exc()
     self.fail('conversion failed')
   rows = buf.getvalue().split('\n')
   # A trailing newline leaves one empty final element; drop it.
   if rows[-1] == '':
     rows.pop()
   row_count = len(rows)
   self.assertTrue(row_count == 6, 'bad num lines: %d' % row_count)
   header = rows[0].split(',')
   self.assertEqual(len(header), 20)
   self.assertTrue(header[0] == 'AMW')
   self.assertTrue(header[1] == 'SMILES')
 def test_passed_in_file_is_not_closed(self):
     """LoadSDF must leave a caller-supplied stream open."""
     stream = StringIO(methane)
     frame = PandasTools.LoadSDF(stream)
     row_count = len(frame)
     self.assertEqual(row_count, 1)
     self.assertFalse(stream.closed)
 def test_empty_file(self):
     """LoadSDF on an empty stream gives a frame of length zero."""
     # Should return an empty data frame with no rows or columns
     frame = PandasTools.LoadSDF(StringIO())
     self.assertEqual(len(frame), 0)
     index_size = len(frame.index)
     self.assertEqual(index_size, 0)
 def test_ignore_mol_column(self):
     """With molColName=None the frame has only the ID and property columns."""
     frame = PandasTools.LoadSDF(StringIO(peroxide + methane), molColName=None)
     expected_columns = set("ID prop1 prop2 prop3".split())
     self.assertEqual(set(frame.columns), expected_columns)
 def setUp(self):
     """Load methane + peroxide from an in-memory stream into ``self.df``."""
     self.df = PandasTools.LoadSDF(StringIO(methane + peroxide))
Example #20
0
def BuildFuncGroupHierarchy(fileNm=None,data=None,force=False):
  """Parse functional-group definitions into a tree of FGHierarchyNode objects.

  Arguments:
    - fileNm: path of a definition file; when neither fileNm nor data is
      given, 'Functional_Group_Hierarchy.txt' in RDConfig.RDDataDir is used
    - data: definition text, used only when no fileNm is given
    - force: rebuild even if the cached hierarchy matches the request

  Each non-empty line (after dropping everything from '//' on) holds
  tab-separated fields: label, SMARTS, name and an optional reaction SMARTS.
  A dotted label such as 'A.B' makes the node a child of 'A', which must
  already have been defined.

  Returns the list of top-level nodes. The module globals groupDefns,
  hierarchy, lastData and lastFilename are updated as a side effect; a cache
  hit returns a copy of the stored hierarchy list.

  Raises FuncGroupFileParseError for short lines, duplicate labels, missing
  hierarchy members and SMARTS that do not parse.
  """
  global groupDefns,hierarchy,lastData,lastFilename
  if not force and hierarchy and (not data or data==lastData) and \
        (not fileNm or fileNm==lastFilename):
    return hierarchy[:]
  lastData=data
  splitter = re.compile('\t+')
  from rdkit import Chem

  if not fileNm and not data:
    fileNm = os.path.join(RDConfig.RDDataDir,'Functional_Group_Hierarchy.txt')

  if fileNm:
    inF = open(fileNm,'r')
    lastFilename = fileNm
  else:
    # data is guaranteed to be set here because of the default above
    inF = StringIO(data)
    # Forget the previous filename: otherwise a later call naming that file
    # would hit the cache and get the hierarchy built from this data.
    lastFilename = None

  groupDefns={}
  res = []
  # try/finally so the input is closed even when a parse error is raised
  try:
    for lineNo,line in enumerate(inF,1):
      line=line.strip()
      line = line.split('//')[0]
      if not line:
        continue
      splitL = splitter.split(line)
      if len(splitL)<3:
        raise FuncGroupFileParseError("Input line %d (%s) is not long enough."%(lineNo,repr(line)))
      label = splitL[0].strip()
      if label in groupDefns:
        raise FuncGroupFileParseError("Duplicate label on line %d."%lineNo)
      labelHierarchy = label.split('.')
      if len(labelHierarchy)>1:
        # every ancestor prefix of the dotted label must already exist
        for i in range(len(labelHierarchy)-1):
          tmp = '.'.join(labelHierarchy[:i+1])
          if tmp not in groupDefns:
            raise FuncGroupFileParseError("Hierarchy member %s (line %d) not found."%(tmp,lineNo))
        parent = groupDefns['.'.join(labelHierarchy[:-1])]
      else:
        parent = None
      smarts = splitL[1]
      patt = Chem.MolFromSmarts(smarts)
      if not patt:
        raise FuncGroupFileParseError('Smarts "%s" (line %d) could not be parsed.'%(smarts,lineNo))

      name = splitL[2].strip()

      rxnSmarts=''
      if len(splitL)>3:
        rxnSmarts=splitL[3]

      node = FGHierarchyNode(name,patt,smarts=smarts,label=label,parent=parent,rxnSmarts=rxnSmarts)
      if parent:
        parent.children.append(node)
      else:
        res.append(node)
      groupDefns[label] = node
  finally:
    inF.close()
  hierarchy=res[:]
  return res
 def test_default_write_does_not_include_tags(self):
     """WriteSDF called without ``properties`` must not mention prop2."""
     sio = StringIO()
     PandasTools.WriteSDF(self.df, sio)
     s = sio.getvalue()
     # assertNotIn takes (member, container). With the arguments the other
     # way round the check only asks whether the whole output is a substring
     # of "prop2", which says nothing about the tags that were written.
     self.assertNotIn("prop2", s)
 def test_identifier_from_a_column(self):
     """With idName="prop2" the first output line is that column's value."""
     buf = StringIO()
     PandasTools.WriteSDF(self.df, buf, idName="prop2")
     # Everything before the first newline is the first line of the output.
     first_line = buf.getvalue().partition("\n")[0]
     self.assertEqual(first_line, "qwe")