コード例 #1
0
def data_generator(positive_hits, data, radius=2, n_bits=2048):
    """Build a Morgan-fingerprint feature matrix and binary labels from SDF files.

    Parameters
    ----------
    positive_hits : SDF input accepted by ``PandasTools.LoadSDF``
        Compounds regarded as positives; must provide a ``NAME`` property.
    data : SDF input accepted by ``PandasTools.LoadSDF``
        The full dataset; must provide a ``NAME`` property.
    radius : int
        Morgan fingerprint radius (default 2, as before).
    n_bits : int
        Length of the fingerprint bit vector (default 2048, as before).

    Returns
    -------
    X : pandas.DataFrame
        One row per molecule in ``data`` with columns ``Bit_0 .. Bit_{n_bits-1}``.
    y : pandas.Series
        ``uint8`` labels, 1 when the molecule's ``NAME`` is among the hits.

    Raises
    ------
    ValueError
        If the hit names are not unique. The duplicated names are printed
        first, as the previous implementation did before failing on an
        unbound ``X``.
    """
    # load dataset
    data = PandasTools.LoadSDF(data,
                               smilesName='SMILES',
                               molColName='mol',
                               includeFingerprints=True,
                               embedProps=True)
    positive_hits = PandasTools.LoadSDF(positive_hits,
                                        smilesName='SMILES',
                                        molColName='mol',
                                        includeFingerprints=True,
                                        embedProps=True)

    # generate hit names
    hitname = list(positive_hits.NAME)

    # the hit list must not contain the same compound twice
    duplicates = [
        item for item, count in collections.Counter(hitname).items()
        if count > 1
    ]
    if duplicates:
        print(duplicates)
        raise ValueError(
            f'Duplicate compound names in positive hits: {duplicates}')

    # get the label 0-negative,1-positive
    y = pd.to_numeric(data["NAME"].isin(hitname).astype('uint8'))
    fingerprints = [
        AllChem.GetMorganFingerprintAsBitVect(m, radius, nBits=n_bits)
        for m in data['mol']
    ]
    data_name = [f'Bit_{i}' for i in range(n_bits)]
    data_bits = [list(fp) for fp in fingerprints]
    X = pd.DataFrame(data_bits, columns=data_name)
    return X, y
コード例 #2
0
 def test_load_specify_column_names(self):
     """Check LoadSDF output when ``idName`` and ``molColName`` are given."""
     frame = PandasTools.LoadSDF(StringIO(methane + peroxide),
                                 idName="CorpID",
                                 molColName="_rdmol")
     self.assertEqual(len(frame), 2)
     # The ID values land in the requested column ...
     corp_ids = list(frame["CorpID"])
     self.assertEqual(corp_ids, ["Methane", "Peroxide"])
     # ... and the molecules in the requested molecule column.
     self.assertEqual([m.GetNumAtoms() for m in frame["_rdmol"]], [1, 2])
コード例 #3
0
def get_dataframe_from_file(filename, mol_col="ROMol", smiles_col="SMILES"):
    """Determine file type from filename extension and produce a Pandas dataframe
    accordingly.

    Supported filename extensions: .sdf, .csv, .tsv, .smi
    (the extension is compared case-insensitively).

    If the file is an SDF, the structures are stored in the MOL_COL column.
    Otherwise the file is read with ``pandas.read_csv`` (tab-separated for
    .tsv, comma-separated for .csv and .smi) and the SMILES strings in
    SMILES_COL are converted to structures with
    ``PandasTools.AddMoleculeColumnToFrame`` (its default molecule column).

    Parameters
    ----------

    filename: filepath string
        The name of the file from which to read the data; can be an absolute
        or relative path.

    mol_col: string containing column name
        For an SDF file, the name of the column that receives the structures.

    smiles_col: string containing column name
        For a non-SDF file, the name of the column containing the SMILES strings.


    Returns
    -------

    df: Pandas dataframe
        A Pandas dataframe containing the data from FILENAME.

    Raises
    ------

    ValueError
        If FILENAME has no extension or an unsupported one. Previously these
        cases fell through to ``return df`` with ``df`` unbound.

    """
    logging.info(f'Reading {filename}')

    file_ext = pathlib.Path(filename).suffix.lower()

    if file_ext == ".sdf":
        ## Maintaining the standard parameters: /idName/ = 'ID', /includeFingerprints/ = False,
        ##                                      /isomericSmiles/ = True, /embedProps/ = False,
        ##                                      /removeHs/ = True, /strictParsing/ = True, /smilesName/ = None
        return PandasTools.LoadSDF(filename, molColName=mol_col)

    if file_ext in (".csv", ".tsv", ".smi"):
        sep = "\t" if file_ext == ".tsv" else ","
        df = pandas.read_csv(filename, sep=sep)
        # Generate structures from SMILES
        # Yields /None/ if conversion fails
        PandasTools.AddMoleculeColumnToFrame(df, smilesCol=smiles_col)
        return df

    if file_ext == '':
        raise ValueError(
            f'Cannot determine file type of {filename!r}: no extension')
    raise ValueError(
        f'File type {file_ext!r} of {filename!r} is not supported '
        '(expected .sdf, .csv, .tsv or .smi)')
コード例 #4
0
ファイル: cal_dERMSD.py プロジェクト: zhenglz/deltaVinaXGB
def get_lowest_energy(lowest):
    ''' Get the global minimum energy

    `lowest` is the SDF input handed to ``PandasTools.LoadSDF``; its
    ``energy_abs`` property is converted to float and the smallest value is
    returned. The minimum is taken directly, without sorting the frame first.
    '''
    df_confs = PandasTools.LoadSDF(lowest)
    return df_confs["energy_abs"].astype(float).min()
コード例 #5
0
def read_synonyms_file(sdfFile, fix_names=False, path=''):
    """Read comptox synonyms file.

    Loads ``os.path.join(path, sdfFile)`` without a molecule column, splits
    the newline-separated ``Synonyms`` field into individual names and
    returns a de-duplicated single-column (``chemname``) frame holding the
    preferred names followed by the synonyms. When ``fix_names`` is true,
    ``fix_row`` is applied row-wise to the split synonyms, in four
    consecutive slices.
    """
    print('Reading synonyms file...')
    records = PandasTools.LoadSDF(os.path.join(path, sdfFile),
                                  molColName=None,
                                  removeHs=False,
                                  strictParsing=False)
    records = records[['Preferred_Name', 'Synonyms']]

    preferred = records[['Preferred_Name']].copy()
    preferred = preferred.rename(
        columns={'Preferred_Name': 'chemname'}).dropna()
    synonyms = records[['Synonyms']].copy()
    synonyms = synonyms.rename(columns={'Synonyms': 'chemname'}).dropna()
    del records

    print('Formatting...')
    split_names = synonyms['chemname'].str.split('\n', expand=True)
    if fix_names:
        step = int(round(len(split_names) / 4))
        # Same four slices as [:q], [q:2q], [2q:3q], [3q:].
        bounds = [(None, step), (step, step * 2), (step * 2, step * 3),
                  (step * 3, None)]
        fixed_parts = tuple(
            split_names[start:stop].apply(fix_row, axis=1)
            for start, stop in bounds)
        split_names = pd.concat(fixed_parts, axis=0)

    per_column = [split_names[col].dropna() for col in split_names.columns]
    stacked = pd.concat(per_column).dropna().drop_duplicates()
    synonym_frame = pd.DataFrame(stacked, columns=['chemname'])
    combined = pd.concat([preferred, synonym_frame]).drop_duplicates()
    print('Done')
    print('Chemicals found in synonyms file: ' + str(len(combined)))
    return combined
コード例 #6
0
def getTestFrame():
    """Load the bundled ``pandas_load.sdf.gz`` test file into a DataFrame.

    RDKit's ``rdApp.error`` log is disabled while the file is parsed and is
    re-enabled afterwards, even when loading raises.
    """
    sdfFile = os.path.join(RDConfig.RDCodeDir, 'Chem', 'test_data',
                           'pandas_load.sdf.gz')
    rdBase.DisableLog('rdApp.error')
    try:
        return PandasTools.LoadSDF(sdfFile, smilesName='SMILES')
    finally:
        # Always restore logging so a failed load does not silence later errors.
        rdBase.EnableLog('rdApp.error')
コード例 #7
0
def RDkitRead(in_file, removeHs=True, add_Hs=False):
    """Read molecules from an SDF or SMILES file into a DataFrame.

    The file type is decided by searching ``in_file`` for a literal ``.sdf``
    or ``.smi``; ``.sdf`` takes priority when both occur. The returned frame
    has the columns ``mol``, ``smiles`` and ``ID``.

    Parameters
    ----------
    in_file : str
        Name of the input file.
    removeHs : bool
        Forwarded to ``LoadSDF`` (SDF input only).
    add_Hs : bool
        If true, ``Chem.AddHs`` is applied to every molecule (SDF input only).

    Raises
    ------
    ValueError
        If ``in_file`` contains neither ``.sdf`` nor ``.smi``.
    """
    ## Read in SDF file; can choose to add hydrogens or not
    # The dot is escaped so names such as "my_smiles.sdf" no longer match
    # the SMILES pattern as well.
    if re.search(r'\.sdf', in_file):
        print(' # Reading SDF')
        with file_handle(in_file) as fi:
            df = rdpd.LoadSDF(fi,
                              removeHs=removeHs,
                              idName='ID',
                              molColName='mol')
        df['smiles'] = df.mol.apply(
            lambda m: Chem.MolToSmiles(Chem.RemoveHs(m)))
        if add_Hs:
            df['mol'] = df.mol.apply(Chem.AddHs)

    ## Read in SMILES file, check if there is a header "smiles"
    elif re.search(r'\.smi', in_file):
        print('# Reading SMI')
        with file_handle(in_file) as fi:
            has_header = re.search('smi', str(fi.readline()), re.IGNORECASE)
        if has_header:
            print('# Smiles input has Header #\n')
            df = pd.read_csv(in_file, sep=r'\s+', comment='#').dropna()
        else:
            print('# Smiles input has NO Header #\n')
            df = pd.read_csv(in_file, header=None, sep=r'\s+',
                             comment='#').dropna()
        df.columns = ['smiles', 'ID']
        rdpd.AddMoleculeColumnToFrame(df, smilesCol='smiles', molCol='mol')
        df['smiles'] = df.mol.apply(Chem.MolToSmiles)

    else:
        raise ValueError(
            'Unsupported input file (expected .sdf or .smi): {}'.format(
                in_file))

    print('## Number of MOL read from {}: {}\n'.format(in_file,
                                                       len(df.smiles)))
    return df
コード例 #8
0
ファイル: QueryHandler.py プロジェクト: SohanCSERU/PhytoChem
def validate_sdf(path):
    """Raise ``ValidationError`` when ``path`` loads as an empty DataFrame."""
    loaded = PandasTools.LoadSDF(path)
    if not loaded.empty:
        return
    raise ValidationError(
        'File Validation Failed',
        params={'file': path},
    )
コード例 #9
0
def RDkitRead( in_file, idnm, removeHs=False, add_Hs=False ):
  """Read molecules from an SDF or SMILES file into a DataFrame.

  The file type is decided by searching ``in_file`` for a literal ``.sdf``
  or ``.smi``; ``.sdf`` takes priority when both occur. Molecules are stored
  in the ``MOL`` column and SMILES in ``smiles``. For SMILES input the second
  column is named ``idnm``.

  Parameters
  ----------
  in_file : str
      Name of the input file.
  idnm : str
      Column name given to the identifier column of a SMILES file.
  removeHs : bool
      Forwarded to ``LoadSDF`` (SDF input only).
  add_Hs : bool
      If true, ``Chem.AddHs`` is applied to every molecule (SDF input only).

  Raises
  ------
  ValueError
      If ``in_file`` contains neither ``.sdf`` nor ``.smi``.
  """

  ## Read in SDF file; can choose to add hydrogens or not
  if re.search(r'\.sdf', in_file):
    print(' \033[34m## Reading SDF ##\033[0m')
    with file_handle(in_file) as fi:
      df = rdpd.LoadSDF(  fi, removeHs=removeHs,
                          smilesName='smiles', molColName='MOL' )
    if add_Hs:
      # The molecule column is 'MOL'; the former `df.mol` lookup could not work.
      df['MOL'] = df['MOL'].apply(Chem.AddHs)

  ## Read in SMILES file, check if there is a header "smiles"
  elif re.search(r'\.smi', in_file):
    print('  \033[34m## Reading SMI ##\033[0m')
    with file_handle(in_file) as fi:
      has_header = re.search('smi', str(fi.readline()), re.IGNORECASE)
    if has_header:
      print(' \033[36m# Smiles input has Header #\033[0m\n')
      df = pd.read_csv(in_file, sep=r'\s+').dropna()
    else:
      print(' \033[35m# Smiles input has NO Header #\033[0m\n')
      df = pd.read_csv(in_file, header=None, sep=r'\s+', comment='#').dropna()
    df.columns = ['smiles', idnm]
    df['MOL'] = df.smiles.apply(Chem.MolFromSmiles)

  else:
    raise ValueError(
      'Unsupported input file (expected .sdf or .smi): {0}'.format(in_file))

  print('## Number of MOL read from \033[34m{0}: \033[31m{1}\033[0m\n'.format(in_file,len(df)))
  return df
コード例 #10
0
    def process_input(self, data_input: Union[pd.DataFrame,
                                              str]) -> pd.DataFrame:
        """
            Converts the input into a pandas dataframe.
            If it already is a pandas dataframe, it is returned unchanged.
            A string is treated as a file name and read according to its
            suffix: .xlsx, .csv, .tsv or .sdf.

            For .csv/.tsv files ``self.separator`` is used; when it is unset
            it is first set to ',' (.csv) or a tab (.tsv).

            :param data_input: either a pandas dataframe or the name of an
                               xlsx, csv, tsv or sdf file

            :return i_data: input data to be curated

            :raises TypeError: if data_input is neither a dataframe nor a string
            :raises SystemExit: (exit status 1) if the file suffix is not supported
        """

        if isinstance(data_input, pd.DataFrame):
            return data_input

        if not isinstance(data_input, str):
            # Previously this fell through to an unbound local variable.
            raise TypeError(
                'data_input must be a pandas DataFrame or a file name, got '
                + type(data_input).__name__)

        if data_input.endswith('.xlsx'):
            i_data = pd.read_excel(data_input, engine='openpyxl')
        elif data_input.endswith('.csv'):
            if not self.separator:
                self.separator = ','
            i_data = pd.read_csv(data_input, sep=self.separator)
        elif data_input.endswith('.tsv'):
            if not self.separator:
                self.separator = '\t'
            i_data = pd.read_csv(data_input, sep=self.separator)
        elif data_input.endswith('.sdf'):
            i_data = PandasTools.LoadSDF(data_input)
        else:
            sys.stderr.write(
                'Please provide a file with a valid format (xlsx, csv, tsv, sdf)\n'
            )
            # Non-zero status: this is an error exit, not a successful one.
            sys.exit(1)

        return i_data
コード例 #11
0
 def test_load_from_sio(self):
     """Check LoadSDF reading two records from an in-memory text stream."""
     frame = PandasTools.LoadSDF(StringIO(methane + peroxide))
     self.assertEqual(len(frame), 2)
     # Default column names are used: "ID" and "ROMol".
     loaded_ids = list(frame["ID"])
     self.assertEqual(loaded_ids, ["Methane", "Peroxide"])
     self.assertEqual([m.GetNumAtoms() for m in frame["ROMol"]], [1, 2])
コード例 #12
0
def load_sdf(path):
    '''Loads data from SDF files.

    The returned frame has the ``ID`` column renamed to ``CAS``, a ``SMILES``
    column generated from the molecules, and the first column other than
    ``ID``/``ROMol`` (in file column order) renamed to ``Dependent``. The
    ``ROMol`` column is dropped and the index labels are converted to str.

    Raises ValueError when the file provides no column besides ``ID`` and
    ``ROMol``.
    '''
    # No direct structure (png?) but labels and names
    dataset = PandasTools.LoadSDF(path)

    # Get the name of the dependent column as the remaining one. Column order
    # is used so the choice is deterministic when several columns remain.
    candidates = [
        col for col in dataset.columns if col not in ('ID', 'ROMol')
    ]
    if not candidates:
        raise ValueError(
            'No dependent column found in %s: only ID/ROMol present' % path)
    dependent_col = candidates[0]

    dataset['SMILES'] = [Chem.MolToSmiles(mol) for mol in dataset['ROMol']]

    # Drop ROMol column to standardize all datasets from different sources
    dataset = dataset.drop(['ROMol'], axis='columns')

    column_names = {
        'ID': 'CAS',
        'SMILES': 'SMILES',
        dependent_col: 'Dependent',
    }

    # Rename the dataset with the required column names
    dataset.rename(index=str, columns=column_names, inplace=True)

    return dataset
コード例 #13
0
 def test_load_gzip_file(self):
     """Load the gzipped SDF fixture and check which records were kept."""
     rdBase.DisableLog('rdApp.error')
     try:
         df = PandasTools.LoadSDF(self.gz_filename)
     finally:
         # Restore logging even if loading fails, so other tests are unaffected.
         rdBase.EnableLog('rdApp.error')
     self.assertEqual(len(df), 13)
     # The molecule with index 1 is invalid, so it should be missing from the index
     self.assertEqual(list(df.index),
                      [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
コード例 #14
0
def load_valid_atom_or_bond_features(path: str,
                                     smiles: List[str]) -> List[np.ndarray]:
    """
    Loads features saved in a variety of formats.

    Supported formats:

    * :code:`.npz` descriptors are saved as 2D array for each molecule in the order of that in the data.csv
    * :code:`.pkl` / :code:`.pckl` / :code:`.pickle` containing a pandas dataframe with smiles as index and numpy array of descriptors as columns
    * :code:`.sdf` containing all mol blocks with descriptors as entries

    :param path: Path to file containing atomwise features.
    :param smiles: SMILES strings used to select and order the rows of an
                   :code:`.sdf` file; not used for the other formats.
    :return: A list of 2D array.
    :raises ValueError: If the extension is unsupported, the pickled arrays
                        are neither 1D nor 2D, or an :code:`.sdf` file lacks
                        descriptors for one of the requested SMILES.
    """

    extension = os.path.splitext(path)[1]

    if extension == '.npz':
        # Materialize every array while the archive is open, then close it
        # instead of leaving the underlying file handle dangling.
        with np.load(path) as container:
            features = [container[key] for key in container]

    elif extension in ['.pkl', '.pckl', '.pickle']:
        # NOTE(review): unpickling executes arbitrary code; only load trusted files.
        features_df = pd.read_pickle(path)
        if features_df.iloc[0, 0].ndim == 1:
            features = features_df.apply(
                lambda x: np.stack(x.tolist(), axis=1), axis=1).tolist()
        elif features_df.iloc[0, 0].ndim == 2:
            features = features_df.apply(
                lambda x: np.concatenate(x.tolist(), axis=1), axis=1).tolist()
        else:
            raise ValueError(
                f'Atom/bond descriptors input {path} format not supported')

    elif extension == '.sdf':
        features_df = PandasTools.LoadSDF(path).drop(
            ['ID', 'ROMol'], axis=1).set_index('SMILES')

        features_df = features_df[~features_df.index.duplicated()]

        # locate atomic descriptors columns: those whose first entry is a
        # comma-separated string
        features_df = features_df.iloc[:, features_df.iloc[
            0, :].apply(lambda x: isinstance(x, str) and ',' in x).to_list()]
        features_df = features_df.reindex(smiles)
        if features_df.isnull().any().any():
            raise ValueError(
                'Invalid custom atomic descriptors file, Nan found in data')

        # DataFrame.applymap was renamed to DataFrame.map in newer pandas;
        # use whichever this pandas version provides.
        elementwise = (features_df.map if hasattr(pd.DataFrame, 'map') else
                       features_df.applymap)
        features_df = elementwise(lambda x: np.array(
            x.replace('\r', '').replace('\n', '').split(',')).astype(float))

        features = features_df.apply(lambda x: np.stack(x.tolist(), axis=1),
                                     axis=1).tolist()

    else:
        raise ValueError(f'Extension "{extension}" is not supported.')

    return features
コード例 #15
0
def get_chembl(terms_to_keep):
    """Load the ChEMBL 21 SDF dump into a DataFrame.

    Adds a ``source_database`` column set to ``'chembl'`` and creates an
    empty-string column for every entry of ``terms_to_keep`` that the loaded
    frame does not already contain.
    """
    sdf_file = '/project/projectdirs/openmsi/projects/compound_data/chembl/chembl_21.sdf.gz'
    frame = PandasTools.LoadSDF(sdf_file)
    frame['source_database'] = 'chembl'
    # Snapshot of the column names before any placeholder columns are added.
    existing = list(frame.keys())
    for term in terms_to_keep:
        if term in existing:
            continue
        frame[term] = ''
    return frame
コード例 #16
0
 def test_AddMoleculeColumnToFrame(self):
     """Drop the molecule column and re-create it with AddMoleculeColumnToFrame."""
     stream = getStreamIO(methane + peroxide)
     frame = PandasTools.LoadSDF(stream,
                                 isomericSmiles=True,
                                 smilesName='Smiles')
     PandasTools.ChangeMoleculeRendering(frame=frame, renderer='String')
     # Remove the molecule column and confirm it is gone from the rendering.
     del frame['ROMol']
     self.assertNotIn('ROMol', str(frame))
     # Adding the molecule column again brings it back.
     PandasTools.AddMoleculeColumnToFrame(frame, includeFingerprints=False)
     self.assertIn('ROMol', str(frame))
コード例 #17
0
def get_mols_from_files(filenames, targets, verbose=True):
    """
    Read each file into its own Pandas dataframe. File type is based on the file
    extension. Currently supported filetypes are .sdf, .smi, .csv, and .tsv.

    For each file, extract the mols, stats, and the molecules that require review.

    Bring cleaned mols from all files into one list, /all_mols/, and all mols
    requiring review into one dict, /all_for_review/.

    Raises ValueError for a file with an unsupported extension. Previously
    such a file was processed with the dataframe of the preceding file (or
    failed on an unbound variable when it came first).

    /verbose/ is accepted for interface compatibility and is not used here.
    """

    all_for_review = {}
    all_mols = []

    for filename in filenames:
        logging.info(filename)

        # Determine the type of the filename by the extension
        file_ext = pathlib.Path(filename).suffix
        ## Mol_field should probably be a passable argument, defaulting to "mol"?
        mol_field = "mol"

        # Read file depending on file extension
        if file_ext == ".sdf":
            df = PandasTools.LoadSDF(filename, molColName=mol_field)
        elif file_ext in (".csv", ".tsv", ".smi"):
            sep = "\t" if file_ext == ".tsv" else ","
            if file_ext == ".smi":
                mol_field = "smiles"
            df = pandas.read_csv(filename, sep=sep)
        else:
            raise ValueError(
                f"Unsupported file type {file_ext!r} for {filename!r} "
                "(expected .sdf, .smi, .csv or .tsv)")

        # The stats returned by get_activities are not used here.
        mols, _stats, for_review = get_activities(df,
                                                  original_filename=filename,
                                                  activity_fields=targets,
                                                  mol_field=mol_field)

        # Report the number of mols with activity for each target
        for target in targets:
            hits = sum(1 for x in mols if x.has_activity(target))
            logging.info(f"{filename} {target} hits: {hits}")

        # Add the mols from the file to the list of all mols
        all_mols.extend(mols)
        # Add the mols that require review from this file to the dict of all mols requiring review
        extend_dict(all_for_review, for_review)

    # Return the list of /all_mols/ that have at least one valid activity and the mols that need to be reviewed
    return all_mols, all_for_review
コード例 #18
0
ファイル: cal_dERMSD.py プロジェクト: zhenglz/deltaVinaXGB
def num_structure_change(confs, native):
    ''' Get number of conformations satisfying requirements --> for entropy

    Loads the SDF input `confs` and reads its ``energy_abs`` property as
    float. Returns a tuple ``(num_1, num_2)``:

    * num_1: conformations with energy below (lowest energy + 1.0)
    * num_2: conformations with energy below `native`
    '''
    df_confs = PandasTools.LoadSDF(confs)
    energies = df_confs["energy_abs"].astype(float)
    # The minimum is taken directly; sorting the whole frame is not needed.
    lowest = energies.min()
    num_1 = int((energies < lowest + 1.0).sum())
    num_2 = int((energies < native).sum())

    return num_1, num_2
コード例 #19
0
ファイル: cluster.py プロジェクト: lenselinkbart/APCluster
def parse_sd_file(file, tgz=False):
    """
    parse a sd file and return molecules

    `file` is passed to ``PandasTools.LoadSDF``; when `tgz` is true it is
    first opened with ``gzip.open`` and the handle is closed once the frame
    has been built. Molecules are stored in the ``Molecule`` column and
    SMILES in ``smiles``.
    """
    if tgz:
        # Close the gzip handle after loading instead of leaking it.
        with gzip.open(file) as handle:
            return PandasTools.LoadSDF(handle,
                                       molColName='Molecule',
                                       smilesName='smiles')
    return PandasTools.LoadSDF(file,
                               molColName='Molecule',
                               smilesName='smiles')
コード例 #20
0
def split_sdf(sdf_file_name, outdir="data/"):
    """Split an SDF file into per-ligand files and protein/ligand lookups.

    Loads ``sdf_file_name``, keeps the merged PDB-ID column plus ``SMILES``,
    ``IC50 (nM)`` and ``Molecule``, drops incomplete rows, and calls
    ``write_lig_file`` once per remaining row with the row's molecule and a
    path under ``outdir``.

    Returns ``(uligs, prots_ligs)``: ``uligs`` maps the row position to the
    row values, ``prots_ligs`` maps each PDB ID to the list of row positions
    that mention it.
    """
    print("Loading sdf.")
    # Parse the SDF file into a Pandas dataframe, with RDKit's logger level
    # set to CRITICAL.
    RDLogger.logger().setLevel(RDLogger.CRITICAL)
    raw = PandasTools.LoadSDF(sdf_file_name,
                              smilesName='SMILES',
                              molColName='Molecule',
                              includeFingerprints=False)
    print("Raw cols = ", [str(x) for x in raw.columns])

    # Select only the needed columns and merge the two PDB cols.
    complex_col = 'PDB ID(s) for Ligand-Target Complex'
    chain_col = 'PDB ID(s) of Target Chain'
    payload_cols = ['SMILES', 'IC50 (nM)', 'Molecule']
    selected = raw[[complex_col, chain_col] + payload_cols].copy()
    selected["PDB IDs"] = selected[complex_col] + ',' + selected[chain_col]
    print("Selected cols = ", [str(x) for x in selected.columns])
    selected = selected[["PDB IDs"] + payload_cols]

    # Drop any rows with missing data ('' and a lone ',' count as missing).
    selected = selected.replace('', np.nan).replace(',', np.nan).dropna()

    r_rows = len(raw.index)
    s_rows = len(selected.index)
    print("Raw rows = ", r_rows)
    print("Sel rows = ", s_rows)
    keep_pct = (float(s_rows) / float(r_rows)) * 100.0
    print("Keep pct = %.2f%s" % (keep_pct, '%'))

    # Build ligand dictionary and a protein dictionary.
    print("Building protein-ligand dictionary.")
    uligs = {}
    prots_ligs = {}
    for position, values in enumerate(selected.values):
        for pdb in values[0].split(','):
            if pdb != '':
                prots_ligs.setdefault(pdb, []).append(position)
        uligs[position] = values
    print("Unique proteins = ", len(prots_ligs))

    print("Writing per-ligand output files.")
    # Write out .lig files and return the data dictionaries.
    for position, values in uligs.items():
        # values[3] is the 'Molecule' entry of the selected row.
        write_lig_file(values[3], outdir + "/lig/lig%s.lig" % str(position))
    return uligs, prots_ligs
コード例 #21
0
    def test_svgRendering(self):
        """Check that ``PandasTools.molRepresentation`` switches the rendering.

        The module-level setting is restored afterwards so that this test
        does not leak state into other tests.
        """
        df = PandasTools.LoadSDF(getStreamIO(methane + peroxide))
        previous_representation = PandasTools.molRepresentation
        try:
            self.assertIn('image/png', str(df))
            self.assertNotIn('svg', str(df))

            PandasTools.molRepresentation = 'svg'
            self.assertIn('svg', str(df))
            self.assertNotIn('image/png', str(df))

            # we can use upper case for the molRepresentation
            PandasTools.molRepresentation = 'PNG'
            self.assertNotIn('svg', str(df))
            self.assertIn('image/png', str(df))
        finally:
            PandasTools.molRepresentation = previous_representation
コード例 #22
0
def read_activity_data(sdf_name, name_tag, activity_tag):
    """
    Read activity data from an SD file
    :param sdf_name: input sd file
    :param name_tag: the SD tag with the molecule name
    :param activity_tag: the SD tag with activity data (will be converted to float)
    :return: dataframe with Name and activity; the activity column holds floats
    """
    sdf_df = PandasTools.LoadSDF(sdf_name)
    name_list = list(sdf_df[name_tag])
    activity_list = [float(x) for x in sdf_df[activity_tag]]
    # Build the frame from (name, activity) pairs. Going through a single
    # numpy array of names and floats would coerce the floats back to strings.
    return pd.DataFrame(list(zip(name_list, activity_list)),
                        columns=["Name", activity_tag])
コード例 #23
0
def RescaleRename(in_sdf, out_sdf, mol_id, score):
    """Load an SDF, name each molecule after its ``mol_id`` value, rescale ``score``.

    Parameters
    ----------
    in_sdf : str
        Path of the SDF file to read.
    out_sdf :
        Not used by this function.
    mol_id : str
        Column whose value becomes each molecule's ``_Name`` property.
    score : str
        Column to which ``gold_scale`` is applied.

    Returns
    -------
    The loaded DataFrame (missing values replaced by ``''``), or ``None``
    when ``in_sdf`` does not exist.
    """
    if not os.path.exists(in_sdf):
        return None

    df = rdpd.LoadSDF(in_sdf, removeHs=False,
                      molColName='ROMol').fillna('')
    s_id = df[mol_id]
    print('## Reading in mol {0}: {1}'.format(in_sdf, len(df)))
    # Pair molecules with their ids positionally; avoids building a row
    # Series per molecule just to look the same values up again by label.
    for mol, name in zip(df['ROMol'], s_id):
        mol.SetProp('_Name', name)
    df[score] = df[score].apply(gold_scale)

    return df
コード例 #24
0
ファイル: UnitTestPandasTools.py プロジェクト: kozo2/rdkit
    def test_properties(self):
        """Check the property columns produced for two records with different tags."""
        frame = PandasTools.LoadSDF(StringIO(peroxide + methane))
        expected_columns = set("ROMol ID prop1 prop2 prop3".split())
        self.assertEqual(set(frame.columns), expected_columns)

        # prop1: NaN for the first record, a string for the second.
        first_missing, second_value = list(frame["prop1"])[:2]
        self.assertTrue(numpy.isnan(first_missing), first_missing)
        self.assertEqual(second_value, "12.34")

        self.assertEqual(list(frame["prop2"]), ["rtz", "qwe"])

        # prop3: a string for the first record, NaN for the second.
        first_value, second_missing = list(frame["prop3"])[:2]
        self.assertEqual(first_value, "yxcv")
        self.assertTrue(numpy.isnan(second_missing), second_missing)
コード例 #25
0
def get_hmdb(terms_to_keep):
    """Load the HMDB structures SDF into a DataFrame.

    Adds ``source_database='hmdb'``, renames ``GENERIC_NAME``, ``SYNONYMS``
    and ``HMDB_ID`` to ``common_name``, ``synonyms`` and ``hmdb_id``, turns
    each ``synonyms`` entry into a list of stripped names split on ';', and
    creates an empty-string column for every entry of ``terms_to_keep`` that
    is not yet present.
    """
    frame = PandasTools.LoadSDF('/project/projectdirs/openmsi/projects/compound_data/hmdb/structures.sdf')
    frame['source_database'] = 'hmdb'
    frame.rename(columns={
        'GENERIC_NAME': 'common_name',
        'SYNONYMS': 'synonyms',
        'HMDB_ID': 'hmdb_id',
    }, inplace=True)

    # A raw synonyms entry is one ';'-separated string, e.g.
    #     2-Hydroxyestrone 2-methyl ether; 2-Methoxy-17-oxoestra-1,3,5(10)-trien-3-ol; Methoxy-Estrone
    raw_synonyms = frame['synonyms'].astype(str).tolist()
    frame.loc[:, 'synonyms'] = [
        [part.strip() for part in entry.split(';')] for entry in raw_synonyms
    ]

    # Snapshot of the column names before any placeholder columns are added.
    existing = list(frame.keys())
    for term in terms_to_keep:
        if term in existing:
            continue
        frame[term] = ''
    return frame
コード例 #26
0
def merge_sumner_data(path_to_msp: str, path_to_csv: str, path_to_sdf: str):
    """Merge CSV metadata and SDF structure data into the records of an MSP file.

    The MSP file is read record by record (records are terminated by a blank
    line). For every record whose ``Name`` field yields a compound id, the
    matching metadata and structure rows are looked up with ``get_rows`` and
    the lines produced by ``get_modified_spectrum_lines`` are written to
    ``<msp name>.merged<ext>`` next to the input. Records without a matching
    structure row are skipped.

    :param path_to_msp: MSP file with the spectra
    :param path_to_csv: CSV file with the metadata (read as strings)
    :param path_to_sdf: SDF file with the structures
    """

    meta_data = pd.read_csv(path_to_csv, header=0)
    meta_data = meta_data.astype(str)
    structure_data = PandasTools.LoadSDF(path_to_sdf, molColName='Molecule')

    filename, extension = splitext(path_to_msp)
    path_to_output = filename + '.merged' + extension
    # Both files are managed by the `with` block so the input handle is
    # closed as well (it used to be left open).
    with open(path_to_msp) as msp, open(path_to_output, 'w') as output:

        spectrum_lines = []
        compound_id = None
        compound_name = None
        # NOTE(review): a final record that is not followed by a blank line
        # is never written -- confirm whether input files always end with one.
        for line in msp:
            spectrum_lines.append(line)
            line = line.strip()
            if len(line) == 0:
                # End of a record is reached. Time to write this record
                if len(spectrum_lines) > 1 and compound_id is not None:
                    meta_rows = get_rows(meta_data,
                                         compound_id,
                                         compound_name,
                                         id_columns=['Compound ID'],
                                         name_columns=[])
                    structure_rows = get_rows(
                        structure_data,
                        compound_id,
                        compound_name,
                        id_columns=['ID', 'CAS', 'HMDB_ID'],
                        name_columns=['GENERIC_NAME'])
                    if len(structure_rows) > 0:
                        # Only the first matching structure row is used.
                        structure_row = structure_rows[0]
                        for row in meta_rows:
                            lines = get_modified_spectrum_lines(
                                spectrum_lines, row, structure_row)
                            if len(lines) == 0:
                                print(
                                    'Unknown neutral mass for ID = {:s} and Name = {:s}'
                                    .format(compound_id, compound_name),
                                    file=sys.stderr)
                            output.writelines(lines)
                spectrum_lines = []
                compound_id = None
                compound_name = None

            elif ':' in line:
                key, value = line.split(':', maxsplit=1)
                if key.strip() == 'Name':
                    compound_id, compound_name = get_id_from_name(
                        value.strip())
コード例 #27
0
def get_lipid_maps(terms_to_keep):
    """Load the LIPID MAPS SDF download into a DataFrame.

    Adds ``source_database='lipidmaps'``, renames ``KEGG_ID``,
    ``PUBCHEM_CID``, ``COMMON_NAME``, ``SYNONYMS`` and ``ID`` to their
    lower-case target names, turns each ``synonyms`` entry into a list of
    stripped names split on ';', and creates an empty-string column for
    every entry of ``terms_to_keep`` that is not yet present.
    """
    frame = PandasTools.LoadSDF('/project/projectdirs/openmsi/projects/compound_data/lipidmaps/LMSDFDownload28Jun15FinalAll.sdf')
    frame['source_database'] = 'lipidmaps'
    frame.rename(columns={
        'KEGG_ID': 'kegg_id',
        'PUBCHEM_CID': 'pubchem_compound_id',
        'COMMON_NAME': 'common_name',
        'SYNONYMS': 'synonyms',
        'ID': 'lipidmaps_id',
    }, inplace=True)

    # A raw synonyms entry is one ';'-separated string, e.g.
    #     Decanohydroxamic acid; caprinohydroxamic acid; n-Decanohydroxamic acid
    raw_synonyms = frame['synonyms'].astype(str).tolist()
    frame.loc[:, 'synonyms'] = [
        [part.strip() for part in entry.split(';')] for entry in raw_synonyms
    ]

    # Snapshot of the column names before any placeholder columns are added.
    existing = list(frame.keys())
    for term in terms_to_keep:
        if term in existing:
            continue
        frame[term] = ''
    return frame
コード例 #28
0
  def _read_sdf( self, sdf_file ):
    """Load an SDF file and return the rows whose name matches ``self.desc``.

    The ``name`` column is vectorized with ``self.vec``; a row is kept when
    at least one feature whose name is in ``self.desc`` has the value 1.
    """
    ## Build a library of molecules found in the Top-Selection List
    print("  # Reading SDF file: "+sdf_file)
    df = rdpd.LoadSDF(sdf_file, molColName='ROMol', removeHs=False)
    print('  # SDF mol read in from > {0} <: {1}'.format(sdf_file, len(df)))

    X   = self.vec.fit_transform(df['name'])
    Xds = self.vec.get_feature_names()

    common_name_idx = [i for i,col in enumerate(Xds) if col in self.desc]
    # NOTE(review): the statement here used to read `top_df[] = ...`, which is
    # not valid Python and left `top_df` undefined. It is interpreted as a
    # row filter on `df` -- confirm this matches the intended behavior.
    is_selected = (X.toarray()[:, common_name_idx] == 1).any(axis=1)
    top_df = df[is_selected]

    del df
    gc.collect()    # active collection of memory to avoid crash

    return top_df
コード例 #29
0
ファイル: rdkit_support.py プロジェクト: mrcblt/gafp
def load_sdf_as_dataframe(path: str,
                          value_tag: str = None,
                          keep_props: bool = False,
                          unfiltered: bool = False) -> Union[DataFrame, str]:
    """
    Read the SD file at `path` and return its molecules as a DataFrame.

    Parameters
    ----------
    path : str
        Location of the SD file
    value_tag : str
        Optional property to keep next to the molecule column; molecules
        lacking it are dropped (a warning is logged)
    keep_props : bool
        Passed to LoadSDF as `embedProps`
    unfiltered : bool
        If true, all loaded columns are returned and `value_tag` is ignored

    Returns
    -------
    Union[DataFrame, str]
        The DataFrame with a `ROMol` column (plus `value_tag` when given, or
        every column when `unfiltered`).
        A string with an error message is returned instead when the file is
        empty, the value tag is absent from the file, or an OSError occurs.
    """

    try:
        molecules = PandasTools.LoadSDF(path, embedProps=keep_props)
        molecules = molecules.reset_index(drop=True)
        if not len(molecules):
            return 'Empty sdfile'

        if not unfiltered:
            if value_tag:
                # The tag has to exist as a column at all ...
                if value_tag not in molecules:
                    return 'No molecule contains the given value tag'
                molecules = molecules[['ROMol', value_tag]]
                # ... and molecules without a value for it are removed.
                with_value = molecules.dropna(subset=[value_tag])
                missing = len(molecules) - len(with_value)
                if missing:
                    logging.warning(f'{missing} molecules don\'t contain the given value tag')
                    molecules = with_value
            else:
                molecules = molecules[['ROMol']]

        logging.info(f'Working with {len(molecules)} molecules')
        return molecules
    except OSError as err:
        return str(err)
コード例 #30
0
def main():
    """Rank docked molecules from an SDF file and write SDF and text output.

    Reads the options from ``UserInput()``: ``name`` (default ``'ID'``),
    ``score`` (default ``'Chemgauss4'``), ``dock`` (default ``'fred'``),
    ``top`` (number of leading molecules to keep; all when unset),
    ``infile`` and ``outpref``. Every kept molecule gets the name
    ``<name>::<rank>::<score>::<dock>``; results go to
    ``<outpref>.<dock>_docked.sdf.gz`` and ``<outpref>.<dock>_docked.txt.bz2``.
    """
    args = UserInput()
    name = args.name or 'ID'
    score = args.score or 'Chemgauss4'
    dock = args.dock or 'fred'
    # None keeps every molecule. The former sentinel -1 made the slice
    # `[:top]` silently drop the last molecule.
    top = int(args.top) if args.top else None

    df = rdpd.LoadSDF(args.infile,
                      removeHs=False,
                      molColName='ROMol',
                      idName='mol_ID')[:top].fillna('')
    print('\033[34m> select mol: \033[32m{0}\033[0m'.format(len(df)))
    df[score] = df[score].apply(float)
    df['Rank'] = df.index

    for _, row in df.iterrows():
        row['ROMol'].SetProp(
            '_Name',
            '{0}::{1}::{2:.2f}::{3}'.format(row[name], row['Rank'] + 1,
                                            row[score], dock))

    sdf_out = '{0}.{1}_docked.sdf.gz'.format(args.outpref, dock)
    csv_out = '{0}.{1}_docked.txt.bz2'.format(args.outpref, dock)

    rdpd.WriteSDF(df, sdf_out, properties=list(df.columns))
    df.to_csv(csv_out,
              header=False,
              index=False,
              sep='\t',
              columns=[name, score],
              float_format='%.3f')