Exemple #1
0
 def test_all_numeric_with_no_numeric_columns(self):
     """With allNumeric=True and the unmodified frame, the output has no '>' at all."""
     buffer = StringIO()
     PandasTools.WriteSDF(self.df, buffer, allNumeric=True)
     written = buffer.getvalue()
     self.assertFalse(">" in written, written)
     # Double-check that the numeric tests don't pass by accident: the values
     # they look for must be absent from this output.
     for unexpected in ("7\n\n", "8\n\n"):
         self.assertNotIn(unexpected, written)
 def test_all_numeric_with_no_numeric_columns(self):
     """Writing the unmodified frame with allNumeric=True must produce no '>' character."""
     out = StringIO()
     PandasTools.WriteSDF(self.df, out, allNumeric=True)
     sdf_text = out.getvalue()
     self.assertFalse(">" in sdf_text, sdf_text)
     # The numeric-column tests search for these fragments; make sure they
     # are not present by accident when no numeric column exists.
     self.assertNotIn("7\n\n", sdf_text)
     self.assertNotIn("8\n\n", sdf_text)
Exemple #3
0
def SaveXlsxFromFrame(frame, outFile, molCol='ROMol', size=(300, 300)):
    """
    Saves pandas DataFrame as a xlsx file with embedded images.
    It maps numpy data types to excel cell types:
    int, float -> number (NaN and +/-inf cells are left empty)
    datetime -> datetime
    object -> string (limited to 32k character - xlsx limitations)

    Arguments:
      frame   -- DataFrame to export; it must contain the column named by molCol
                 (otherwise ValueError is raised when that column is removed
                 from the list of data columns)
      outFile -- target handed to xlsxwriter.Workbook
      molCol  -- name of the column whose values are drawn with Draw.MolToImage
                 and placed in column A
      size    -- (width, height) passed to Draw.MolToImage; also used to derive
                 the width of column A and the height of every data row

    Cells with compound images are a bit larger than images due to excel.
    Column width weirdness explained (from xlsxwriter docs):
    The width corresponds to the column width value that is specified in Excel.
    It is approximately equal to the length of a string in the default font of Calibri 11.
    Unfortunately, there is no way to specify "AutoFit" for a column in the Excel file format.
    This feature is only available at runtime from within Excel.
    """
    from io import BytesIO  # PNG data is binary, a text buffer cannot hold it

    import xlsxwriter  # don't want to make this a RDKit dependency

    cols = list(frame.columns)
    cols.remove(molCol)
    dataTypes = dict(frame.dtypes)
    # The dtype names are loop invariant, so stringify them only once.
    typeNames = dict((x, str(dataTypes[x])) for x in cols)

    workbook = xlsxwriter.Workbook(outFile)  # New workbook
    worksheet = workbook.add_worksheet()  # New work sheet
    worksheet.set_column('A:A', size[0] / 6.)  # column width

    # Write first row with column names
    for c2, x in enumerate(cols, 1):
        worksheet.write_string(0, c2, x)

    # Every image buffer is kept open until the workbook has been written,
    # because xlsxwriter may still read the buffers when the file is closed.
    imageBuffers = []
    try:
        for c, (index, row) in enumerate(frame.iterrows(), 1):
            image_data = BytesIO()
            imageBuffers.append(image_data)
            img = Draw.MolToImage(row[molCol], size=size)
            img.save(image_data, format='PNG')

            worksheet.set_row(c, height=size[1])  # looks like height is not in px?
            worksheet.insert_image(c, 0, "f", {'image_data': image_data})

            for c2, x in enumerate(cols, 1):
                typeName = typeNames[x]
                value = row[x]
                if typeName == "object":
                    # string length is limited in xlsx
                    worksheet.write_string(c, c2, str(value)[:32000])
                elif ('float' in typeName) or ('int' in typeName):
                    # NaN/inf cannot be stored as an xlsx number: leave the cell empty.
                    # (Comparing with "!= np.nan" is always true, so test explicitly.)
                    if np.isfinite(value):
                        worksheet.write_number(c, c2, value)
                elif 'datetime' in typeName:
                    worksheet.write_datetime(c, c2, value)

        workbook.close()
    finally:
        for buf in imageBuffers:
            buf.close()
 def test_specify_numeric_column(self):
     """Request the derived "len2" column explicitly and check it is written twice."""
     frame = self.df
     frame["len2"] = frame["ID"].map(len)
     out = StringIO()
     PandasTools.WriteSDF(frame, out, properties=["len2"])
     written = out.getvalue()
     self.assertEqual(written.count("<len2>"), 2)
     # The ID lengths (7 and 8) must show up as property values.
     for expected in ("7\n\n", "8\n\n"):
         self.assertIn(expected, written)
 def test_all_numeric_with_numeric_columns(self):
     """With allNumeric=True the derived numeric "len" column is written twice."""
     frame = self.df
     frame["len"] = frame["ID"].map(len)
     out = StringIO()
     PandasTools.WriteSDF(frame, out, allNumeric=True)
     written = out.getvalue()
     self.assertEqual(written.count("<len>"), 2)
     # The ID lengths (7 and 8) must show up as property values.
     for expected in ("7\n\n", "8\n\n"):
         self.assertIn(expected, written)
 def test_specify_numeric_column(self):
     """Only "len2" is passed via properties; its tag must be written twice."""
     table = self.df
     table["len2"] = table["ID"].map(len)
     stream = StringIO()
     PandasTools.WriteSDF(table, stream, properties=["len2"])
     sdf_text = stream.getvalue()
     tag_count = sdf_text.count("<len2>")
     self.assertEqual(tag_count, 2)
     # Values of the derived column (lengths of the IDs).
     self.assertIn("7\n\n", sdf_text)
     self.assertIn("8\n\n", sdf_text)
 def test_all_numeric_with_numeric_columns(self):
     """allNumeric=True must pick up the derived "len" column and write its tag twice."""
     table = self.df
     table["len"] = table["ID"].map(len)
     stream = StringIO()
     PandasTools.WriteSDF(table, stream, allNumeric=True)
     sdf_text = stream.getvalue()
     tag_count = sdf_text.count("<len>")
     self.assertEqual(tag_count, 2)
     # Values of the derived column (lengths of the IDs).
     self.assertIn("7\n\n", sdf_text)
     self.assertIn("8\n\n", sdf_text)
Exemple #8
0
def SaveXlsxFromFrame(frame, outFile, molCol='ROMol', size=(300,300)):
    """
    Saves pandas DataFrame as a xlsx file with embedded images.
    It maps numpy data types to excel cell types:
    int, float -> number (NaN and +/-inf cells are left empty)
    datetime -> datetime
    object -> string (limited to 32k character - xlsx limitations)

    Arguments:
      frame   -- DataFrame to export; it must contain the column named by molCol
                 (otherwise ValueError is raised when that column is removed
                 from the list of data columns)
      outFile -- target handed to xlsxwriter.Workbook
      molCol  -- name of the column whose values are drawn with Draw.MolToImage
                 and placed in column A
      size    -- (width, height) passed to Draw.MolToImage; also used to derive
                 the width of column A and the height of every data row

    Cells with compound images are a bit larger than images due to excel.
    Column width weirdness explained (from xlsxwriter docs):
    The width corresponds to the column width value that is specified in Excel.
    It is approximately equal to the length of a string in the default font of Calibri 11.
    Unfortunately, there is no way to specify "AutoFit" for a column in the Excel file format.
    This feature is only available at runtime from within Excel.
    """
    from io import BytesIO # PNG data is binary, a text buffer cannot hold it

    import xlsxwriter # don't want to make this a RDKit dependency

    cols = list(frame.columns)
    cols.remove(molCol)
    dataTypes = dict(frame.dtypes)
    # The dtype names are loop invariant, so stringify them only once.
    typeNames = dict((x, str(dataTypes[x])) for x in cols)

    workbook = xlsxwriter.Workbook(outFile) # New workbook
    worksheet = workbook.add_worksheet() # New work sheet
    worksheet.set_column('A:A', size[0]/6.) # column width

    # Write first row with column names
    for c2, x in enumerate(cols, 1):
        worksheet.write_string(0, c2, x)

    # Every image buffer is kept open until the workbook has been written,
    # because xlsxwriter may still read the buffers when the file is closed.
    imageBuffers = []
    try:
        for c, (index, row) in enumerate(frame.iterrows(), 1):
            image_data = BytesIO()
            imageBuffers.append(image_data)
            img = Draw.MolToImage(row[molCol], size=size)
            img.save(image_data, format='PNG')

            worksheet.set_row(c, height=size[1]) # looks like height is not in px?
            worksheet.insert_image(c, 0, "f", {'image_data': image_data})

            for c2, x in enumerate(cols, 1):
                typeName = typeNames[x]
                value = row[x]
                if typeName == "object":
                    # string length is limited in xlsx
                    worksheet.write_string(c, c2, str(value)[:32000])
                elif ('float' in typeName) or ('int' in typeName):
                    # NaN/inf cannot be stored as an xlsx number: leave the cell empty.
                    # (Comparing with "!= np.nan" is always true, so test explicitly.)
                    if np.isfinite(value):
                        worksheet.write_number(c, c2, value)
                elif 'datetime' in typeName:
                    worksheet.write_datetime(c, c2, value)

        workbook.close()
    finally:
        for buf in imageBuffers:
            buf.close()
Exemple #9
0
def _get_image(x):
  """displayhook function for PIL Images, rendered as PNG

  Saves x as PNG into an in-memory buffer and returns the base64 encoding of
  that data. As a side effect the pandas display options max_columns and
  max_rows are set to len(result)+1000, and max_colwidth is raised to the
  same value when it is currently too small to hold the encoded image.
  """
  from io import BytesIO  # PNG output is binary; a text buffer rejects it on Python 3
  import pandas as pd
  bio = BytesIO()
  x.save(bio,format='PNG')
  s = b64encode(bio.getvalue())
  pd.set_option('display.max_columns',len(s)+1000)
  pd.set_option('display.max_rows',len(s)+1000)
  if len(s)+100 > pd.get_option("display.max_colwidth"):
    pd.set_option("display.max_colwidth",len(s)+1000)
  return s
Exemple #10
0
 def test2(self):
   """Convert with keyCol='AMW' and stopAfter=5, then check line count and header fields."""
   sdfPath = os.path.join(RDConfig.RDDataDir, 'NCI', 'first_200.props.sdf')
   supplier = Chem.SDMolSupplier(sdfPath)
   out = StringIO()
   try:
     Convert(supplier, out, keyCol='AMW', stopAfter=5)
   except Exception:
     import traceback
     traceback.print_exc()
     self.fail('conversion failed')
   # Blank (whitespace-only) lines are not counted.
   rows = [row for row in out.getvalue().split('\n') if row.strip() != '']
   self.assertTrue(len(rows) == 6, 'bad num lines: %d' % len(rows))
   header = rows[0].split(',')
   self.assertEqual(len(header), 20)
   # The key column comes first, followed by SMILES.
   self.assertTrue(header[0] == 'AMW')
   self.assertTrue(header[1] == 'SMILES')
Exemple #11
0
 def test2(self):
     """Convert with keyCol='AMW' and stopAfter=5, then check line count and header fields."""
     sdfPath = os.path.join(RDConfig.RDDataDir, 'NCI', 'first_200.props.sdf')
     supplier = Chem.SDMolSupplier(sdfPath)
     out = StringIO()
     try:
         Convert(supplier, out, keyCol='AMW', stopAfter=5)
     except Exception:
         import traceback
         traceback.print_exc()
         self.fail('conversion failed')
     # Blank (whitespace-only) lines are not counted.
     rows = [row for row in out.getvalue().split('\n') if row.strip() != '']
     self.assertTrue(len(rows) == 6, 'bad num lines: %d' % len(rows))
     header = rows[0].split(',')
     self.assertEqual(len(header), 20)
     # The key column comes first, followed by SMILES.
     self.assertTrue(header[0] == 'AMW')
     self.assertTrue(header[1] == 'SMILES')
Exemple #12
0
 def test1(self):
     """Convert the whole NCI sample file and check line count and header fields."""
     sdfPath = os.path.join(RDConfig.RDDataDir, 'NCI', 'first_200.props.sdf')
     supplier = Chem.SDMolSupplier(sdfPath)
     out = StringIO()
     try:
         Convert(supplier, out)
     except Exception:
         import traceback
         traceback.print_exc()
         self.fail('conversion failed')
     rows = out.getvalue().split('\n')
     # A trailing newline leaves one empty element at the end; drop it.
     if not rows[-1]:
         rows.pop()
     self.assertTrue(len(rows) == 201, 'bad num lines: %d' % len(rows))
     header = rows[0].split(',')
     self.assertEqual(len(header), 20)
     self.assertTrue(header[0] == 'SMILES')
Exemple #13
0
 def test1(self):
   """Convert the whole NCI sample file and check line count and header fields."""
   sdfPath = os.path.join(RDConfig.RDDataDir, 'NCI', 'first_200.props.sdf')
   supplier = Chem.SDMolSupplier(sdfPath)
   out = StringIO()
   try:
     Convert(supplier, out)
   except Exception:
     import traceback
     traceback.print_exc()
     self.fail('conversion failed')
   rows = out.getvalue().split('\n')
   # A trailing newline leaves one empty element at the end; drop it.
   if not rows[-1]:
     rows.pop()
   self.assertTrue(len(rows) == 201, 'bad num lines: %d' % len(rows))
   header = rows[0].split(',')
   self.assertEqual(len(header), 20)
   self.assertTrue(header[0] == 'SMILES')
Exemple #14
0
 def test2(self):
   """Convert with keyCol='AMW' and stopAfter=5, then check line count and header fields."""
   import os
   from rdkit.six.moves import cStringIO as StringIO
   fName = os.path.join(RDConfig.RDDataDir,'NCI','first_200.props.sdf')
   suppl = Chem.SDMolSupplier(fName)
   io = StringIO()
   try:
     Convert(suppl,io,keyCol='AMW',stopAfter=5)
   except Exception:
     # Only real errors are turned into a test failure; a bare "except:"
     # would also swallow KeyboardInterrupt and SystemExit.
     import traceback
     traceback.print_exc()
     self.fail('conversion failed')
   txt = io.getvalue()
   lines = txt.split('\n')
   # A trailing newline leaves one empty element at the end; drop it.
   if not lines[-1]:
     del lines[-1]
   self.assertTrue(len(lines)==6,'bad num lines: %d'%len(lines))
   line0 = lines[0].split(',')
   self.assertEqual(len(line0),20)
   # The key column comes first, followed by SMILES.
   self.assertTrue(line0[0]=='AMW')
   self.assertTrue(line0[1]=='SMILES')
Exemple #15
0
 def test2(self):
   """Convert with keyCol='AMW' and stopAfter=5, then check line count and header fields."""
   import os
   from rdkit.six.moves import cStringIO as StringIO  #@UnresolvedImport #pylint: disable=F0401
   sdfPath = os.path.join(RDConfig.RDDataDir, 'NCI', 'first_200.props.sdf')
   supplier = Chem.SDMolSupplier(sdfPath)
   out = StringIO()
   try:
     Convert(supplier, out, keyCol='AMW', stopAfter=5)
   except Exception:
     import traceback
     traceback.print_exc()
     self.fail('conversion failed')
   rows = out.getvalue().split('\n')
   # A trailing newline leaves one empty element at the end; drop it.
   if not rows[-1]:
     rows.pop()
   self.assertTrue(len(rows) == 6, 'bad num lines: %d' % len(rows))
   header = rows[0].split(',')
   self.assertEqual(len(header), 20)
   # The key column comes first, followed by SMILES.
   self.assertTrue(header[0] == 'AMW')
   self.assertTrue(header[1] == 'SMILES')
Exemple #16
0
    def test1(self):
        """Convert the whole NCI sample file and check line count and header fields."""
        import os
        from rdkit.six.moves import cStringIO as StringIO  # @UnresolvedImport #pylint: disable=F0401

        sdfPath = os.path.join(RDConfig.RDDataDir, "NCI", "first_200.props.sdf")
        supplier = Chem.SDMolSupplier(sdfPath)
        out = StringIO()
        try:
            Convert(supplier, out)
        except Exception:
            import traceback

            traceback.print_exc()
            self.fail("conversion failed")
        rows = out.getvalue().split("\n")
        # A trailing newline leaves one empty element at the end; drop it.
        if not rows[-1]:
            rows.pop()
        self.assertTrue(len(rows) == 201, "bad num lines: %d" % len(rows))
        header = rows[0].split(",")
        self.assertEqual(len(header), 20)
        self.assertTrue(header[0] == "SMILES")
Exemple #17
0
def BuildFuncGroupHierarchy(fileNm=None,data=None,force=False):
  """Parses functional group definitions and returns the top-level nodes.

  The definitions are read from the file fileNm if it is given, otherwise
  from the string data; when neither is given the file
  Functional_Group_Hierarchy.txt in RDConfig.RDDataDir is used.

  Each non-empty input line holds tab-separated fields:
    label, SMARTS, name and, optionally, a reaction SMARTS.
  Everything after '//' on a line is ignored. A label of the form "A.B"
  makes the group a child of the group labelled "A", which has to be
  defined on an earlier line.

  Unless force is set, a copy of the previously built hierarchy is returned
  when one exists and the arguments do not ask for different input.

  Side effects: updates the module globals groupDefns (label -> node),
  hierarchy, lastData and lastFilename.

  Raises FuncGroupFileParseError for lines with fewer than three fields,
  duplicate labels, missing hierarchy members or unparseable SMARTS.
  """
  global groupDefns,hierarchy,lastData,lastFilename
  if not force and hierarchy and (not data or data==lastData) and \
        (not fileNm or fileNm==lastFilename): 
    return hierarchy[:]
  lastData=data
  splitter = re.compile('\t+')
  from rdkit import Chem

  if not fileNm and not data:
    fileNm = os.path.join(RDConfig.RDDataDir,'Functional_Group_Hierarchy.txt')

  # At this point either fileNm or data is set; fileNm takes precedence.
  if fileNm:
    # read everything up front so that the file handle is always closed
    with open(fileNm,'r') as inF:
      lines = inF.readlines()
    lastFilename = fileNm
  else:
    lines = StringIO(data).readlines()

  groupDefns={}
  res = []
  for lineNo,line in enumerate(lines,1):
    line = line.strip()
    line = line.split('//')[0]
    if not line:
      continue
    splitL = splitter.split(line)
    if len(splitL)<3:
      raise FuncGroupFileParseError("Input line %d (%s) is not long enough."%(lineNo,repr(line)))
    label = splitL[0].strip()
    if label in groupDefns:
      raise FuncGroupFileParseError("Duplicate label on line %d."%lineNo)
    labelHierarchy = label.split('.')
    if len(labelHierarchy)>1:
      # every ancestor of the label must already have been defined
      for i in range(len(labelHierarchy)-1):
        tmp = '.'.join(labelHierarchy[:i+1])
        if tmp not in groupDefns:
          raise FuncGroupFileParseError("Hierarchy member %s (line %d) not found."%(tmp,lineNo))
      parent = groupDefns['.'.join(labelHierarchy[:-1])]
    else:
      parent = None
    smarts = splitL[1]
    try:
      patt = Chem.MolFromSmarts(smarts)
    except Exception:
      # report the original problem, then fall through to the parse error below
      import traceback
      traceback.print_exc()
      patt = None
    if not patt:
      raise FuncGroupFileParseError('Smarts "%s" (line %d) could not be parsed.'%(smarts,lineNo))
      
    name = splitL[2].strip()
    
    rxnSmarts=''
    if len(splitL)>3:
      rxnSmarts=splitL[3]

    node = FGHierarchyNode(name,patt,smarts=smarts,label=label,parent=parent,rxnSmarts=rxnSmarts)
    if parent:
      parent.children.append(node)
    else:
      res.append(node)
    groupDefns[label] = node
  hierarchy=res[:]
  return res
Exemple #18
0
def BuildFuncGroupHierarchy(fileNm=None,data=None,force=False):
  """Parses functional group definitions and returns the top-level nodes.

  The definitions are read from the file fileNm if it is given, otherwise
  from the string data; when neither is given the file
  Functional_Group_Hierarchy.txt in RDConfig.RDDataDir is used.

  Each non-empty input line holds tab-separated fields:
    label, SMARTS, name and, optionally, a reaction SMARTS.
  Everything after '//' on a line is ignored. A label of the form "A.B"
  makes the group a child of the group labelled "A", which has to be
  defined on an earlier line.

  Unless force is set, a copy of the previously built hierarchy is returned
  when one exists and the arguments do not ask for different input.

  Side effects: updates the module globals groupDefns (label -> node),
  hierarchy, lastData and lastFilename.

  Raises FuncGroupFileParseError for lines with fewer than three fields,
  duplicate labels, missing hierarchy members or unparseable SMARTS.
  """
  global groupDefns,hierarchy,lastData,lastFilename
  if not force and hierarchy and (not data or data==lastData) and \
        (not fileNm or fileNm==lastFilename): 
    return hierarchy[:]
  lastData=data
  splitter = re.compile('\t+')
  from rdkit import Chem

  if not fileNm and not data:
    fileNm = os.path.join(RDConfig.RDDataDir,'Functional_Group_Hierarchy.txt')

  # At this point either fileNm or data is set; fileNm takes precedence.
  if fileNm:
    # read everything up front so that the file handle is always closed
    with open(fileNm,'r') as inF:
      lines = inF.readlines()
    lastFilename = fileNm
  else:
    lines = StringIO(data).readlines()

  groupDefns={}
  res = []
  for lineNo,line in enumerate(lines,1):
    line = line.strip()
    line = line.split('//')[0]
    if not line:
      continue
    splitL = splitter.split(line)
    if len(splitL)<3:
      raise FuncGroupFileParseError("Input line %d (%s) is not long enough."%(lineNo,repr(line)))
    label = splitL[0].strip()
    if label in groupDefns:
      raise FuncGroupFileParseError("Duplicate label on line %d."%lineNo)
    labelHierarchy = label.split('.')
    if len(labelHierarchy)>1:
      # every ancestor of the label must already have been defined
      for i in range(len(labelHierarchy)-1):
        tmp = '.'.join(labelHierarchy[:i+1])
        if tmp not in groupDefns:
          raise FuncGroupFileParseError("Hierarchy member %s (line %d) not found."%(tmp,lineNo))
      parent = groupDefns['.'.join(labelHierarchy[:-1])]
    else:
      parent = None
    smarts = splitL[1]
    patt = Chem.MolFromSmarts(smarts)
    if not patt:
      raise FuncGroupFileParseError('Smarts "%s" (line %d) could not be parsed.'%(smarts,lineNo))
      
    name = splitL[2].strip()
    
    rxnSmarts=''
    if len(splitL)>3:
      rxnSmarts=splitL[3]

    node = FGHierarchyNode(name,patt,smarts=smarts,label=label,parent=parent,rxnSmarts=rxnSmarts)
    if parent:
      parent.children.append(node)
    else:
      res.append(node)
    groupDefns[label] = node
  hierarchy=res[:]
  return res
Exemple #19
0
 def test_default_write_does_not_include_tags(self):
     """WriteSDF called without properties must not mention "prop2" in its output."""
     sio = StringIO()
     PandasTools.WriteSDF(self.df, sio)
     s = sio.getvalue()
     # assertNotIn takes (member, container). With the arguments swapped the
     # check asked whether the whole output is a substring of "prop2", which
     # is practically never the case, so the test could not detect a stray tag.
     self.assertNotIn("prop2", s)
Exemple #20
0
 def test_identifier_from_a_column(self):
     """With idName="prop2" the first line of the written output must be "qwe"."""
     out = StringIO()
     PandasTools.WriteSDF(self.df, out, idName="prop2")
     # Everything before the first newline is the first line.
     first_line, _, _ = out.getvalue().partition("\n")
     self.assertEqual(first_line, "qwe")
 def test_identifier_from_a_column(self):
     """Write with idName="prop2" and expect "qwe" as the first output line."""
     stream = StringIO()
     PandasTools.WriteSDF(self.df, stream, idName="prop2")
     written = stream.getvalue()
     # Split once only: just the text before the first newline is needed.
     head = written.split("\n", 1)
     self.assertEqual(head[0], "qwe")
 def test_default_write_does_not_include_tags(self):
     """WriteSDF called without properties must not mention "prop2" in its output."""
     sio = StringIO()
     PandasTools.WriteSDF(self.df, sio)
     s = sio.getvalue()
     # assertNotIn takes (member, container). With the arguments swapped the
     # check asked whether the whole output is a substring of "prop2", which
     # is practically never the case, so the test could not detect a stray tag.
     self.assertNotIn("prop2", s)