def data_generator(positive_hits, data):
    """Build a fingerprint feature matrix and binary hit labels from two SDFs.

    Parameters
    ----------
    positive_hits : SDF source accepted by ``PandasTools.LoadSDF``
        Compounds treated as positives; a ``NAME`` column is read from it.
    data : SDF source accepted by ``PandasTools.LoadSDF``
        Full data set; a ``NAME`` column is read from it.

    Returns
    -------
    X : pandas.DataFrame
        One row per molecule in ``data`` and one ``Bit_<i>`` column per bit
        of its radius-2 Morgan bit-vector fingerprint.
    y : pandas.Series
        ``uint8`` labels: 1 when the molecule's ``NAME`` is one of the hit
        names, 0 otherwise.

    Raises
    ------
    ValueError
        If the hit names are not unique. The duplicated names are printed
        first (as before); previously the function then ended without
        producing a usable ``(X, y)`` pair.
    """
    # Single source of truth for fingerprint length and column count.
    n_bits = 2048

    # load dataset
    data = PandasTools.LoadSDF(data,
                               smilesName='SMILES',
                               molColName='mol',
                               includeFingerprints=True,
                               embedProps=True)
    positive_hits = PandasTools.LoadSDF(positive_hits,
                                        smilesName='SMILES',
                                        molColName='mol',
                                        includeFingerprints=True,
                                        embedProps=True)

    # generate hit names
    hitname = list(positive_hits.NAME)

    # test if the compounds in the hit list are unique
    duplicates = [
        item for item, count in collections.Counter(hitname).items()
        if count > 1
    ]
    if duplicates:
        print(duplicates)
        raise ValueError(
            'positive_hits contains duplicated NAME entries: %r' % duplicates)

    # get the label 0-negative, 1-positive
    y = pd.to_numeric(data["NAME"].isin(hitname).astype('uint8'))

    fingerprints = [
        AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=n_bits)
        for m in data['mol']
    ]
    data_name = [f'Bit_{i}' for i in range(n_bits)]
    data_bits = [list(fp) for fp in fingerprints]
    X = pd.DataFrame(data_bits, columns=data_name)
    return X, y
def test_load_specify_column_names(self):
    """LoadSDF must honour custom ``idName`` and ``molColName`` arguments."""
    stream = StringIO(methane + peroxide)
    frame = PandasTools.LoadSDF(stream, idName="CorpID", molColName="_rdmol")

    self.assertEqual(len(frame), 2)

    corp_ids = list(frame["CorpID"])
    self.assertEqual(corp_ids, ["Methane", "Peroxide"])

    # Atom counts of the parsed molecules, in file order.
    n_atoms = []
    for molecule in frame["_rdmol"]:
        n_atoms.append(molecule.GetNumAtoms())
    self.assertEqual(n_atoms, [1, 2])
def get_dataframe_from_file(filename, mol_col="ROMol", smiles_col="SMILES"):
    """Read a structure file into a Pandas dataframe based on its extension.

    Supported filename extensions: .sdf, .csv, .tsv, .smi

    If the file is an SDF, the structures are placed in the MOL_COL column.
    Otherwise the file is read with ``pandas.read_csv`` (tab separator for
    .tsv, comma for .csv and .smi) and the SMILES strings in SMILES_COL are
    used to add a molecule column.

    Parameters
    ----------
    filename: filepath string
        The name of the file from which to read the data; can be an absolute
        or relative path.
    mol_col: string containing column name
        For an SDF file, the name of the column that receives the structures.
    smiles_col: string containing column name
        For a non-SDF file, the name of the column containing the SMILES
        strings.

    Returns
    -------
    df: Pandas dataframe
        A Pandas dataframe containing the data from FILENAME.

    Raises
    ------
    ValueError
        If FILENAME has no extension or an unsupported one. (Previously this
        surfaced as an unbound-variable error at the ``return``.)
    """
    logging.info(f'Reading {filename}')
    file_ext = pathlib.Path(filename).suffix

    if file_ext == ".sdf":
        # All other LoadSDF parameters are deliberately left at their
        # library defaults.
        return PandasTools.LoadSDF(filename, molColName=mol_col)

    if file_ext in (".csv", ".tsv", ".smi"):
        sep = "\t" if file_ext == ".tsv" else ","
        df = pandas.read_csv(filename, sep=sep)
        # Generate structures from SMILES (done in place on df).
        PandasTools.AddMoleculeColumnToFrame(df, smilesCol=smiles_col)
        return df

    if file_ext == '':
        raise ValueError(
            f'Cannot determine file type of {filename!r}: no extension')
    raise ValueError(
        f'File type {file_ext!r} of {filename!r} is not supported '
        '(expected .sdf, .csv, .tsv or .smi)')
def get_lowest_energy(lowest):
    '''
    Get the global minimum energy.

    Loads the SDF given by ``lowest`` with ``PandasTools.LoadSDF``, converts
    its ``energy_abs`` column to float and returns the smallest value.
    '''
    df_confs = PandasTools.LoadSDF(lowest)
    # min() does not need sorted input, so the former full sort is dropped.
    energies = df_confs["energy_abs"].astype(float)
    return energies.min()
def read_synonyms_file(sdfFile, fix_names=False, path=''):
    """Collect preferred names and synonyms from a comptox synonyms SDF.

    The file ``os.path.join(path, sdfFile)`` is loaded without a molecule
    column. Its ``Synonyms`` entries are split on newlines, optionally passed
    row-wise through ``fix_row`` (when ``fix_names`` is true), flattened and
    de-duplicated, then appended below the ``Preferred_Name`` values.

    Returns a dataframe with a single ``chemname`` column.
    """
    print('Reading synonyms file...')
    sdf_path = os.path.join(path, sdfFile)
    table = PandasTools.LoadSDF(sdf_path,
                                molColName=None,
                                removeHs=False,
                                strictParsing=False)
    table = table[['Preferred_Name', 'Synonyms']]

    preferred = table[['Preferred_Name']].copy()
    preferred = preferred.rename(
        columns={'Preferred_Name': 'chemname'}).dropna()
    synonyms = table[['Synonyms']].copy()
    synonyms = synonyms.rename(columns={'Synonyms': 'chemname'}).dropna()
    del table

    print('Formatting...')
    split_names = synonyms['chemname'].str.split('\n', expand=True)

    if fix_names:
        # fix_row is applied to four consecutive row slices, in order.
        step = int(round(len(split_names) / 4))
        bounds = (
            (None, step),
            (step, step * 2),
            (step * 2, step * 3),
            (step * 3, None),
        )
        fixed_parts = tuple(
            split_names[start:stop].apply(fix_row, axis=1)
            for start, stop in bounds)
        split_names = pd.concat(fixed_parts, axis=0)

    # Stack every split column into one series of unique, non-null names.
    per_column = [split_names[col].dropna() for col in split_names.columns]
    comb = pd.concat(per_column).dropna().drop_duplicates()

    synonym_frame = pd.DataFrame(comb, columns=['chemname'])
    all_names = pd.concat([preferred, synonym_frame]).drop_duplicates()

    print('Done')
    print('Chemicals found in synonyms file: ' + str(len(all_names)))
    return all_names
def getTestFrame():
    """Load the bundled ``pandas_load.sdf.gz`` test file into a dataframe.

    RDKit's ``rdApp.error`` log is disabled while loading and re-enabled
    afterwards, including when loading raises, so a failure here no longer
    leaves error logging switched off for the rest of the process.
    """
    rdBase.DisableLog('rdApp.error')
    try:
        sdfFile = os.path.join(RDConfig.RDCodeDir, 'Chem', 'test_data',
                               'pandas_load.sdf.gz')
        return PandasTools.LoadSDF(sdfFile, smilesName='SMILES')
    finally:
        rdBase.EnableLog('rdApp.error')
def RDkitRead(in_file, removeHs=True, add_Hs=False):
    """Read molecules from an SDF or SMILES file into a dataframe.

    The file type is chosen by searching ``in_file`` for ``.sdf`` or
    ``.smi``. The resulting frame has the columns ``ID``, ``mol`` and
    ``smiles``.

    Parameters
    ----------
    in_file : str
        Input file name.
    removeHs : bool
        Forwarded to ``LoadSDF`` (SDF input only).
    add_Hs : bool
        For SDF input, apply ``Chem.AddHs`` to every molecule after loading.

    Raises
    ------
    ValueError
        If ``in_file`` contains neither ``.sdf`` nor ``.smi``. (Previously
        this failed on an unbound variable.)
    """
    df = None

    ## Read in SDF file; can choose to add hydrogens or not
    # The dot is escaped so that only a literal ".sdf" matches.
    if re.search(r'\.sdf', in_file):
        print(' # Reading SDF')
        # Close the handle once LoadSDF has built the frame.
        with file_handle(in_file) as sdf_handle:
            df = rdpd.LoadSDF(sdf_handle,
                              removeHs=removeHs,
                              idName='ID',
                              molColName='mol')
        df['smiles'] = df.mol.apply(
            lambda m: Chem.MolToSmiles(Chem.RemoveHs(m)))
        if add_Hs:
            df['mol'] = df.mol.apply(Chem.AddHs)

    ## Read in SMILES file, check if there is a header "smiles"
    if re.search(r'\.smi', in_file):
        print('# Reading SMI')
        with file_handle(in_file) as fi:
            has_header = re.search('smi', str(fi.readline()), re.IGNORECASE)
        if has_header:
            print('# Smiles input has Header #\n')
            df = pd.read_csv(in_file, sep=r'\s+', comment='#').dropna()
        else:
            print('# Smiles input has NO Header #\n')
            df = pd.read_csv(in_file, header=None, sep=r'\s+',
                             comment='#').dropna()
        df.columns = ['smiles', 'ID']
        rdpd.AddMoleculeColumnToFrame(df, smilesCol='smiles', molCol='mol')
        df['smiles'] = df.mol.apply(Chem.MolToSmiles)

    if df is None:
        raise ValueError(
            'Unsupported input file (expected .sdf or .smi): {}'.format(
                in_file))

    print('## Number of MOL read from {}: {}\n'.format(in_file,
                                                       len(df.smiles)))
    return df
def validate_sdf(path):
    """Raise ``ValidationError`` if the SDF at ``path`` loads as an empty frame."""
    loaded = PandasTools.LoadSDF(path)
    if not loaded.empty:
        return
    raise ValidationError('File Validation Failed', params={'file': path})
def RDkitRead(in_file, idnm, removeHs=False, add_Hs=False):
    """Read molecules from an SDF or SMILES file into a dataframe.

    The file type is chosen by searching ``in_file`` for ``.sdf`` or
    ``.smi``. Molecules are stored in the ``MOL`` column and SMILES in the
    ``smiles`` column.

    Parameters
    ----------
    in_file : str
        Input file name.
    idnm : str
        Name given to the identifier column of SMILES input.
    removeHs : bool
        Forwarded to ``LoadSDF`` (SDF input only).
    add_Hs : bool
        For SDF input, apply ``Chem.AddHs`` to every molecule after loading.

    Raises
    ------
    ValueError
        If ``in_file`` contains neither ``.sdf`` nor ``.smi``. (Previously
        this failed on an unbound variable.)
    """
    df = None

    ## Read in SDF file; can choose to add hydrogens or not
    # The dot is escaped so that only a literal ".sdf" matches.
    if re.search(r'\.sdf', in_file):
        print('  \033[34m## Reading SDF ##\033[0m')
        # Close the handle once LoadSDF has built the frame.
        with file_handle(in_file) as sdf_handle:
            df = rdpd.LoadSDF(sdf_handle,
                              removeHs=removeHs,
                              smilesName='smiles',
                              molColName='MOL')
        if add_Hs:
            # The molecule column is 'MOL'; the former `df.mol` attribute
            # lookup does not exist on this frame.
            df['MOL'] = df['MOL'].apply(Chem.AddHs)

    ## Read in SMILES file, check if there is a header "smiles"
    if re.search(r'\.smi', in_file):
        print('  \033[34m## Reading SMI ##\033[0m')
        with file_handle(in_file) as fi:
            has_header = re.search('smi', str(fi.readline()), re.IGNORECASE)
        if has_header:
            print('  \033[36m# Smiles input has Header #\033[0m\n')
            df = pd.read_csv(in_file, sep=r'\s+').dropna()
        else:
            print('  \033[35m# Smiles input has NO Header #\033[0m\n')
            df = pd.read_csv(in_file, header=None, sep=r'\s+',
                             comment='#').dropna()
        df.columns = ['smiles', idnm]
        df['MOL'] = df.smiles.apply(Chem.MolFromSmiles)

    if df is None:
        raise ValueError(
            'Unsupported input file (expected .sdf or .smi): {0}'.format(
                in_file))

    print('## Number of MOL read from \033[34m{0}: \033[31m{1}\033[0m\n'.format(
        in_file, len(df)))
    return df
def process_input(self, data_input: Union[pd.DataFrame, str]) -> pd.DataFrame:
    """
        Return the input data as a pandas dataframe.

        A dataframe is returned unchanged. A string is treated as a file
        path and read according to its extension: ``.xlsx``, ``.csv``,
        ``.tsv`` or ``.sdf``. For csv/tsv, ``self.separator`` is used; when it
        is not set it is initialised to ``','`` or a tab respectively and kept
        on the instance.

        :param data_input: either a pandas dataframe or a file path

        :return i_data: input data to be curated

        :raises TypeError: if data_input is neither a dataframe nor a string
            (previously this failed on an unbound variable)

        A path with an unsupported extension writes a message to stderr and
        exits the process with status 1 (formerly status 0).
    """
    if isinstance(data_input, pd.DataFrame):
        return data_input

    if not isinstance(data_input, str):
        raise TypeError(
            'data_input must be a pandas DataFrame or a file path, got '
            f'{type(data_input).__name__}')

    if data_input.endswith('.xlsx'):
        i_data = pd.read_excel(data_input, engine='openpyxl')
    elif data_input.endswith('.csv'):
        if not self.separator:
            self.separator = ','
        i_data = pd.read_csv(data_input, sep=self.separator)
    elif data_input.endswith('.tsv'):
        if not self.separator:
            self.separator = '\t'
        i_data = pd.read_csv(data_input, sep=self.separator)
    elif data_input.endswith('.sdf'):
        i_data = PandasTools.LoadSDF(data_input)
    else:
        sys.stderr.write(
            'Please provide a file with a valid format (xlsx, csv, tsv, sdf)\n'
        )
        # Non-zero status so that callers and shells see the failure.
        sys.exit(1)

    return i_data
def test_load_from_sio(self):
    """LoadSDF must read from an in-memory text stream with default columns."""
    stream = StringIO(methane + peroxide)
    frame = PandasTools.LoadSDF(stream)

    self.assertEqual(len(frame), 2)

    ids = list(frame["ID"])
    self.assertEqual(ids, ["Methane", "Peroxide"])

    # Atom counts of the parsed molecules, in file order.
    n_atoms = []
    for molecule in frame["ROMol"]:
        n_atoms.append(molecule.GetNumAtoms())
    self.assertEqual(n_atoms, [1, 2])
def load_sdf(path):
    '''Loads data from SDF files.

    The frame returned by ``PandasTools.LoadSDF`` is reshaped so that the
    ``ID`` column is renamed to ``CAS``, a ``SMILES`` column is derived from
    the molecules, the ``ROMol`` column is dropped, and the first remaining
    column (in column order) is renamed to ``Dependent``. The index labels
    are converted to strings.

    Raises ``IndexError`` when the file has no column besides ``ID`` and
    ``ROMol``.
    '''
    # No direct structure (png?) but labels and names
    dataset = PandasTools.LoadSDF(path)

    # The dependent column is the one left after ID and ROMol. Taking it in
    # column order keeps the choice deterministic if there are several;
    # the former set difference depended on hash ordering.
    remaining = [
        col for col in dataset.columns if col not in ('ID', 'ROMol')
    ]
    dependent_col = remaining[0]

    dataset['SMILES'] = [Chem.MolToSmiles(mol) for mol in dataset['ROMol']]

    # Drop ROMol column to standardize all datasets from different sources
    dataset = dataset.drop(['ROMol'], axis='columns')

    column_names = {
        'ID': 'CAS',
        'SMILES': 'SMILES',
        dependent_col: 'Dependent',
    }

    # Rename the dataset with the required column names
    dataset.rename(index=str, columns=column_names, inplace=True)

    return dataset
def test_load_gzip_file(self):
    """LoadSDF must read a gzip-compressed SDF and skip the invalid record."""
    rdBase.DisableLog('rdApp.error')
    try:
        df = PandasTools.LoadSDF(self.gz_filename)
    finally:
        # Re-enable logging even if loading fails, so that later tests are
        # not left with error logging switched off.
        rdBase.EnableLog('rdApp.error')
    self.assertEqual(len(df), 13)
    # The molecule with index 1 is invalid, so it should be missing from the index
    self.assertEqual(list(df.index),
                     [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
def load_valid_atom_or_bond_features(path: str,
                                     smiles: List[str]) -> List[np.ndarray]:
    """
    Loads features saved in a variety of formats.

    Supported formats:

    * :code:`.npz` descriptors are saved as 2D array for each molecule in the
      order of that in the data.csv
    * :code:`.pkl` / :code:`.pckl` / :code:`.pickle` containing a pandas
      dataframe with smiles as index and numpy array of descriptors as columns
    * :code:`.sdf` containing all mol blocks with descriptors as entries

    :param path: Path to file containing atomwise features.
    :param smiles: SMILES strings used to select and order the rows of an
                   :code:`.sdf` input; not used for the other formats.
    :return: A list of 2D array.
    :raises ValueError: If the extension is not supported, the pickled arrays
                        are neither 1D nor 2D, or the SDF lacks descriptors
                        for one of the requested SMILES.
    """

    extension = os.path.splitext(path)[1]

    if extension == '.npz':
        # Use the archive as a context manager so its file handle is closed;
        # every array is read out while the archive is still open.
        with np.load(path) as container:
            features = [container[key] for key in container]

    elif extension in ['.pkl', '.pckl', '.pickle']:
        # NOTE(review): unpickling executes arbitrary code - only load
        # descriptor files from trusted sources.
        features_df = pd.read_pickle(path)
        if features_df.iloc[0, 0].ndim == 1:
            features = features_df.apply(
                lambda x: np.stack(x.tolist(), axis=1), axis=1).tolist()
        elif features_df.iloc[0, 0].ndim == 2:
            features = features_df.apply(
                lambda x: np.concatenate(x.tolist(), axis=1),
                axis=1).tolist()
        else:
            raise ValueError(
                f'Atom/bond descriptors input {path} format not supported')

    elif extension == '.sdf':
        features_df = PandasTools.LoadSDF(path).drop(
            ['ID', 'ROMol'], axis=1).set_index('SMILES')

        features_df = features_df[~features_df.index.duplicated()]

        # locate atomic descriptors columns: string cells with commas in the
        # first row
        features_df = features_df.iloc[:, features_df.iloc[0, :].apply(
            lambda x: isinstance(x, str) and ',' in x).to_list()]
        features_df = features_df.reindex(smiles)
        if features_df.isnull().any().any():
            raise ValueError(
                'Invalid custom atomic descriptors file, Nan found in data')

        # Parse each comma-separated cell into a float array.
        features_df = features_df.applymap(lambda x: np.array(
            x.replace('\r', '').replace('\n', '').split(',')).astype(float))

        features = features_df.apply(lambda x: np.stack(x.tolist(), axis=1),
                                     axis=1).tolist()

    else:
        raise ValueError(f'Extension "{extension}" is not supported.')

    return features
def get_chembl(terms_to_keep):
    """Load the ChEMBL 21 SDF and make sure every requested column exists.

    The frame is tagged with ``source_database = 'chembl'``; each entry of
    ``terms_to_keep`` that is not already a column is added as a column of
    empty strings.
    """
    sdf_file = '/project/projectdirs/openmsi/projects/compound_data/chembl/chembl_21.sdf.gz'
    df = PandasTools.LoadSDF(sdf_file)
    df['source_database'] = 'chembl'

    # Snapshot of the columns before any placeholder is added.
    present = list(df.keys())
    missing = [term for term in terms_to_keep if term not in present]
    for term in missing:
        df[term] = ''
    return df
def test_AddMoleculeColumnToFrame(self):
    """AddMoleculeColumnToFrame must restore a deleted ``ROMol`` column."""
    stream = getStreamIO(methane + peroxide)
    frame = PandasTools.LoadSDF(stream,
                                isomericSmiles=True,
                                smilesName='Smiles')
    PandasTools.ChangeMoleculeRendering(frame=frame, renderer='String')

    # Drop the molecule column and confirm it is gone from the rendering.
    del frame['ROMol']
    self.assertNotIn('ROMol', str(frame))

    # Re-adding the molecule column brings it back.
    PandasTools.AddMoleculeColumnToFrame(frame, includeFingerprints=False)
    self.assertIn('ROMol', str(frame))
def get_mols_from_files(filenames, targets, verbose=True):
    """
    Read each file into its own Pandas dataframe. File type is based on the
    file extension. Currently supported filetypes are .sdf, .smi, .csv, and
    .tsv.

    For each file, ``get_activities`` extracts the mols, stats, and the
    molecules that require review. Cleaned mols from all files are collected
    in one list, /all_mols/, and all mols requiring review in one dict,
    /all_for_review/.

    Parameters
    ----------
    filenames: iterable of file paths to read.
    targets: activity fields passed to ``get_activities`` and counted per file.
    verbose: currently unused; kept for interface compatibility.

    Returns
    -------
    (all_mols, all_for_review)

    Raises
    ------
    ValueError
        If a file has an unsupported extension. Previously such a file
        silently reused the dataframe of the preceding file, or failed on an
        unbound variable if it came first.
    """
    all_for_review = {}
    all_mols = []
    for filename in filenames:
        logging.info(filename)

        # Determine the type of the filename by the extension
        file_ext = pathlib.Path(filename).suffix

        ## Mol_field should probably be a passable argument, defaulting to "mol"?
        mol_field = "mol"

        # Read file depending on file extension
        if file_ext == ".sdf":
            df = PandasTools.LoadSDF(filename, molColName=mol_field)
        elif file_ext in (".csv", ".tsv", ".smi"):
            sep = "\t" if file_ext == ".tsv" else ","
            if file_ext == ".smi":
                mol_field = "smiles"
            df = pandas.read_csv(filename, sep=sep)
        else:
            raise ValueError(
                f"Unsupported file type {file_ext!r} for {filename!r} "
                "(expected .sdf, .smi, .csv or .tsv)")

        # NOTE: /stats/ is returned by get_activities but not used here.
        mols, stats, for_review = get_activities(df,
                                                 original_filename=filename,
                                                 activity_fields=targets,
                                                 mol_field=mol_field)

        # Report the number of mols with activity for each target
        for target in targets:
            hits = sum(1 for x in mols if x.has_activity(target))
            logging.info(f"{filename} {target} hits: {hits}")

        # Add the mols from the file to the list of all mols
        all_mols.extend(mols)

        # Add the mols that require review from this file to the dict of all
        # mols requiring review
        extend_dict(all_for_review, for_review)

    # Return the list of /all_mols/ that have at least one valid activity and
    # the mols that need to be reviewed
    return all_mols, all_for_review
def num_structure_change(confs, native):
    '''
    Get number of conformations satisfying requirements --> for entropy

    Loads the SDF ``confs`` and reads its ``energy_abs`` column as floats.

    Returns a tuple ``(num_1, num_2)``:
      num_1 -- conformations with energy below the lowest energy + 1.0
      num_2 -- conformations with energy below ``native``
    '''
    df_confs = PandasTools.LoadSDF(confs)
    energies = df_confs["energy_abs"].astype(float)
    # min() does not need sorted input, so the former full sort is dropped.
    lowest = energies.min()
    num_1 = int((energies < lowest + 1.0).sum())
    num_2 = int((energies < native).sum())
    return num_1, num_2
def parse_sd_file(file, tgz=False):
    """
    Parse an SD file and return its molecules as a dataframe.

    file -- path (or source accepted by ``LoadSDF``) of the SD file
    tgz  -- when true, ``file`` is first opened with ``gzip.open``

    The molecules are stored in the ``Molecule`` column and their SMILES in
    the ``smiles`` column.
    """
    if tgz:
        # Close the gzip handle once the frame has been built.
        # NOTE(review): assumes LoadSDF consumes the stream before returning.
        with gzip.open(file) as handle:
            return PandasTools.LoadSDF(handle,
                                       molColName='Molecule',
                                       smilesName='smiles')
    return PandasTools.LoadSDF(file,
                               molColName='Molecule',
                               smilesName='smiles')
def split_sdf(sdf_file_name, outdir="data/"):
    """Split an SDF into per-ligand files and protein/ligand lookup tables.

    The SDF is loaded into a dataframe, reduced to the PDB-id, SMILES,
    ``IC50 (nM)`` and molecule columns, and rows with missing data are
    dropped. One ``.lig`` file is written per remaining row via
    ``write_lig_file``.

    Returns ``(uligs, prots_ligs)``: ``uligs`` maps the row position to the
    row values, ``prots_ligs`` maps each PDB id to the list of row positions
    that mention it.
    """
    print("Loading sdf.")

    # Parse the SDF file into a Pandas dataframe, with RDKit logging limited
    # to critical messages.
    RDLogger.logger().setLevel(RDLogger.CRITICAL)
    raw = PandasTools.LoadSDF(sdf_file_name,
                              smilesName='SMILES',
                              molColName='Molecule',
                              includeFingerprints=False)
    print("Raw cols = ", [str(col) for col in raw.columns])

    # Select only the needed columns and merge the two PDB cols.
    wanted = [
        'PDB ID(s) for Ligand-Target Complex', 'PDB ID(s) of Target Chain',
        'SMILES', 'IC50 (nM)', 'Molecule'
    ]
    complex_col, chain_col = wanted[0], wanted[1]
    selected = raw[wanted].copy()
    selected["PDB IDs"] = selected[complex_col] + ',' + selected[chain_col]
    print("Selected cols = ", [str(col) for col in selected.columns])
    selected = selected[["PDB IDs"] + wanted[2:]]

    # Drop any rows with missing data.
    for placeholder in ('', ','):
        selected = selected.replace(placeholder, np.nan)
    selected = selected.dropna()

    r_rows = len(raw.index)
    s_rows = len(selected.index)
    keep_pct = (float(s_rows) / float(r_rows)) * 100.0
    print("Raw rows = ", r_rows)
    print("Sel rows = ", s_rows)
    print("Keep pct = %.2f%s" % (keep_pct, '%'))

    # Build ligand dictionary and a protein dictionary.
    print("Building protein-ligand dictionary.")
    uligs = {}
    prots_ligs = {}
    for position, values in enumerate(selected.values):
        for pdb_id in values[0].split(','):
            if pdb_id != '':
                prots_ligs.setdefault(pdb_id, []).append(position)
        uligs[position] = values
    print("Unique proteins = ", len(prots_ligs))

    # Write out .lig files and return the data dictionaries.
    print("Writing per-ligand output files.")
    for position, ligand in uligs.items():
        write_lig_file(ligand[3], outdir + "/lig/lig%s.lig" % str(position))

    return uligs, prots_ligs
def test_svgRendering(self):
    """Switching ``PandasTools.molRepresentation`` must change the rendering."""
    frame = PandasTools.LoadSDF(getStreamIO(methane + peroxide))

    # Default rendering is PNG.
    self.assertIn('image/png', str(frame))
    self.assertNotIn('svg', str(frame))

    # Switch to SVG.
    PandasTools.molRepresentation = 'svg'
    self.assertIn('svg', str(frame))
    self.assertNotIn('image/png', str(frame))

    # we can use upper case for the molRepresentation
    PandasTools.molRepresentation = 'PNG'
    self.assertNotIn('svg', str(frame))
    self.assertIn('image/png', str(frame))
def read_activity_data(sdf_name, name_tag, activity_tag):
    """
    Read activity data from an SD file

    :param sdf_name: input sd file
    :param name_tag: the SD tag with the molecule name
    :param activity_tag: the SD tag with activity data (will be converted to float)
    :return: dataframe with the columns "Name" and ``activity_tag``; the
        activity column holds floats
    """
    sdf_df = PandasTools.LoadSDF(sdf_name)
    name_list = list(sdf_df[name_tag])
    activity_list = [float(x) for x in sdf_df[activity_tag]]
    # Build the frame from row tuples. Going through a single NumPy array
    # (as before) coerced the floats back into strings.
    return pd.DataFrame(list(zip(name_list, activity_list)),
                        columns=["Name", activity_tag])
def RescaleRename(in_sdf, out_sdf, mol_id, score):
    """Load an SDF, rename each molecule by its id and rescale a score column.

    Returns ``None`` when ``in_sdf`` does not exist. Otherwise every molecule
    gets its ``_Name`` property set from the ``mol_id`` column and the
    ``score`` column is passed through ``gold_scale``. ``out_sdf`` is not
    used here.
    """
    if not os.path.exists(in_sdf):
        return None

    frame = rdpd.LoadSDF(in_sdf, removeHs=False, molColName='ROMol')
    frame = frame.fillna('')
    names = frame[mol_id]
    print('## Reading in mol {0}: {1}'.format(in_sdf, len(frame)))

    # Stamp each molecule with its identifier.
    molecules = frame['ROMol']
    for label in frame.index:
        molecules[label].SetProp('_Name', names[label])

    frame[score] = frame[score].apply(gold_scale)
    return frame
def test_properties(self):
    """SD properties must become columns, with NaN where a record lacks one."""
    stream = StringIO(peroxide + methane)
    frame = PandasTools.LoadSDF(stream)

    expected_columns = set("ROMol ID prop1 prop2 prop3".split())
    self.assertEqual(set(frame.columns), expected_columns)

    # prop1 is missing on the first record.
    first_prop = list(frame["prop1"])
    self.assertTrue(numpy.isnan(first_prop[0]), first_prop[0])
    self.assertEqual(first_prop[1], "12.34")

    # prop2 is present on both records.
    self.assertEqual(list(frame["prop2"]), ["rtz", "qwe"])

    # prop3 is missing on the second record.
    third_prop = list(frame["prop3"])
    self.assertEqual(third_prop[0], "yxcv")
    self.assertTrue(numpy.isnan(third_prop[1]), third_prop[1])
def get_hmdb(terms_to_keep):
    """Load the HMDB structures SDF and normalise its column names.

    The frame is tagged with ``source_database = 'hmdb'``; ``GENERIC_NAME``,
    ``SYNONYMS`` and ``HMDB_ID`` are renamed to ``common_name``, ``synonyms``
    and ``hmdb_id``. Each synonyms entry is split on ';' into a list of
    stripped names, and every entry of ``terms_to_keep`` that is not already
    a column is added as a column of empty strings.
    """
    df = PandasTools.LoadSDF('/project/projectdirs/openmsi/projects/compound_data/hmdb/structures.sdf')
    df['source_database'] = 'hmdb'
    df.rename(columns={'GENERIC_NAME': 'common_name', 'SYNONYMS': 'synonyms'},
              inplace=True)

    # Synonyms arrive as one ';'-separated string, e.g.
    # "2-Hydroxyestrone 2-methyl ether; Methoxy-Estrone"
    raw_synonyms = df['synonyms'].astype(str).tolist()
    df.loc[:,'synonyms'] = [
        [name.strip() for name in entry.split(';')] for entry in raw_synonyms
    ]

    df.rename(columns={'HMDB_ID': 'hmdb_id'}, inplace=True)

    # Snapshot of the columns before any placeholder is added.
    present = list(df.keys())
    missing = [term for term in terms_to_keep if term not in present]
    for term in missing:
        df[term] = ''
    return df
def merge_sumner_data(path_to_msp: str, path_to_csv: str, path_to_sdf: str):
    """Merge CSV metadata and SDF structure data into the records of an MSP file.

    The MSP file is read line by line; an empty line ends a record. For each
    record whose ``Name`` field yields a compound id (via
    ``get_id_from_name``), matching metadata rows and structure rows are
    looked up with ``get_rows``. If at least one structure row matches, one
    modified copy of the record is written per metadata row to
    ``<msp stem>.merged<msp extension>``.

    :param path_to_msp: input MSP file
    :param path_to_csv: CSV file with metadata (read with all values as str)
    :param path_to_sdf: SDF file with structures
    """
    meta_data = pd.read_csv(path_to_csv, header=0)
    meta_data = meta_data.astype(str)

    structure_data = PandasTools.LoadSDF(path_to_sdf, molColName='Molecule')

    filename, extension = splitext(path_to_msp)
    path_to_output = filename + '.merged' + extension

    # Both files are managed by the with-statement; the input handle used to
    # be left open.
    with open(path_to_output, 'w') as output, open(path_to_msp) as msp:
        spectrum_lines = []
        compound_id = None
        compound_name = None
        for line in msp:
            spectrum_lines.append(line)
            line = line.strip()

            if len(line) == 0:
                # End of a record is reached. Time to write this record
                if len(spectrum_lines) > 1 and compound_id is not None:
                    meta_rows = get_rows(meta_data,
                                         compound_id,
                                         compound_name,
                                         id_columns=['Compound ID'],
                                         name_columns=[])
                    structure_rows = get_rows(
                        structure_data,
                        compound_id,
                        compound_name,
                        id_columns=['ID', 'CAS', 'HMDB_ID'],
                        name_columns=['GENERIC_NAME'])

                    if len(structure_rows) > 0:
                        # Only the first matching structure is used.
                        structure_row = structure_rows[0]
                        for row in meta_rows:
                            lines = get_modified_spectrum_lines(
                                spectrum_lines, row, structure_row)
                            if len(lines) == 0:
                                print(
                                    'Unknown neutral mass for ID = {:s} and Name = {:s}'
                                    .format(compound_id, compound_name),
                                    file=sys.stderr)
                            output.writelines(lines)

                # Start collecting the next record.
                spectrum_lines = []
                compound_id = None
                compound_name = None

            elif ':' in line:
                key, value = line.split(':', maxsplit=1)
                if key.strip() == 'Name':
                    compound_id, compound_name = get_id_from_name(
                        value.strip())
        # NOTE(review): a final record that is not followed by an empty line
        # is never written - confirm whether input files always end with one.
def get_lipid_maps(terms_to_keep):
    """Load the LIPID MAPS SDF and normalise its column names.

    The frame is tagged with ``source_database = 'lipidmaps'``; ``KEGG_ID``,
    ``PUBCHEM_CID``, ``COMMON_NAME``, ``SYNONYMS`` and ``ID`` are renamed to
    ``kegg_id``, ``pubchem_compound_id``, ``common_name``, ``synonyms`` and
    ``lipidmaps_id``. Each synonyms entry is split on ';' into a list of
    stripped names, and every entry of ``terms_to_keep`` that is not already
    a column is added as a column of empty strings.
    """
    df = PandasTools.LoadSDF('/project/projectdirs/openmsi/projects/compound_data/lipidmaps/LMSDFDownload28Jun15FinalAll.sdf')
    df['source_database'] = 'lipidmaps'
    df.rename(columns={
        'KEGG_ID': 'kegg_id',
        'PUBCHEM_CID': 'pubchem_compound_id',
        'COMMON_NAME': 'common_name',
        'SYNONYMS': 'synonyms',
    }, inplace=True)

    # Synonyms arrive as one ';'-separated string, e.g.
    # "Decanohydroxamic acid; caprinohydroxamic acid; n-Decanohydroxamic acid"
    raw_synonyms = df['synonyms'].astype(str).tolist()
    df.loc[:,'synonyms'] = [
        [name.strip() for name in entry.split(';')] for entry in raw_synonyms
    ]

    df.rename(columns={'ID': 'lipidmaps_id'}, inplace=True)

    # Snapshot of the columns before any placeholder is added.
    present = list(df.keys())
    missing = [term for term in terms_to_keep if term not in present]
    for term in missing:
        df[term] = ''
    return df
def _read_sdf(self, sdf_file):
    """Read an SDF and keep the molecules matching the selection list.

    The ``name`` column of the loaded frame is vectorised with ``self.vec``;
    a row is kept when any feature whose name is in ``self.desc`` equals 1
    for it.

    NOTE(review): the original assignment read ``top_df[] = ...``, which is
    not valid Python and left ``top_df`` undefined. It is reconstructed here
    as a row filter on the loaded frame - confirm this is the intended
    result.
    """
    ## Build a library of molecules found in the Top-Selection List
    print("  # Reading SDF file: " + sdf_file)
    df = rdpd.LoadSDF(sdf_file, molColName='ROMol', removeHs=False)
    print('  # SDF mol read in from > {0} <: {1}'.format(sdf_file, len(df)))

    X = self.vec.fit_transform(df['name'])
    # Prefer get_feature_names_out when the vectoriser offers it; presumably
    # self.vec is a scikit-learn vectoriser, where the older method was
    # removed in recent releases - TODO confirm.
    feature_names = getattr(self.vec, 'get_feature_names_out', None)
    if feature_names is None:
        feature_names = self.vec.get_feature_names
    Xds = feature_names()

    common_name_idx = [i for i, col in enumerate(Xds) if col in self.desc]
    selected = (X.toarray()[:, common_name_idx] == 1).any(axis=1)
    top_df = df[selected]

    del df
    gc.collect()  # active collection of memory to avoid crash
    return top_df
def load_sdf_as_dataframe(path: str, value_tag: str = None, keep_props: bool = False, unfiltered: bool = False) -> Union[DataFrame, str]:
    """
    Read an SD file into a DataFrame with a fresh 0..n-1 index.

    Parameters
    ----------
    path : str
        Path of the SD file
    value_tag : str
        Optional value tag to keep in the resulting DataFrame
    keep_props : bool
        Passed to ``LoadSDF`` as ``embedProps``
    unfiltered : bool
        If true, all columns are returned and `value_tag` is ignored.

    Returns
    -------
    Union[DataFrame, str]
        Unless `unfiltered` is set, the DataFrame holds the ``ROMol`` column
        and, when a `value_tag` is given, that column as well (rows lacking
        the tag are dropped with a warning).
        A string with an error message is returned instead when the file is
        empty, when no molecule carries the value tag, or when loading
        raises ``OSError``.
    """
    try:
        frame = PandasTools.LoadSDF(path, embedProps=keep_props).reset_index(drop=True)
        if len(frame) == 0:
            return 'Empty sdfile'

        if not unfiltered:
            if value_tag:
                # Check value_tag is set for all molecules
                if value_tag not in frame:
                    return 'No molecule contains the given value tag'
                frame = frame[['ROMol', value_tag]]
                with_value = frame.dropna(subset=[value_tag])
                n_dropped = len(frame) - len(with_value)
                if n_dropped != 0:
                    logging.warning(f'{n_dropped} molecules don\'t contain the given value tag')
                frame = with_value
            else:
                frame = frame[['ROMol']]

        logging.info(f'Working with {len(frame)} molecules')
        return frame
    except OSError as err:
        return str(err)
def main():
    """Tag docked molecules with a rank/score name and write SDF and text output.

    Reads the command-line arguments via ``UserInput``, loads ``args.infile``,
    keeps the first ``args.top`` molecules (all of them when ``args.top`` is
    not given), sets each molecule's ``_Name`` to
    ``<name>::<rank>::<score>::<dock>`` and writes
    ``<outpref>.<dock>_docked.sdf.gz`` and ``<outpref>.<dock>_docked.txt.bz2``.
    """
    args = UserInput()

    name = args.name if args.name else 'ID'
    score = args.score if args.score else 'Chemgauss4'
    dock = args.dock if args.dock else 'fred'
    # None keeps every row. The former default of -1 made the slice below
    # silently drop the last molecule.
    top = int(args.top) if args.top else None

    df = rdpd.LoadSDF(args.infile,
                      removeHs=False,
                      molColName='ROMol',
                      idName='mol_ID')[:top].fillna('')
    print('\033[34m> select mol: \033[32m{0}\033[0m'.format(len(df)))

    df[score] = df[score].apply(float)
    df['Rank'] = df.index
    for idx, row in df.iterrows():
        df.at[idx, 'ROMol'].SetProp(
            '_Name',
            '{0}::{1}::{2:.2f}::{3}'.format(row[name], row['Rank'] + 1,
                                            row[score], dock))

    sdf_out = '{0}.{1}_docked.sdf.gz'.format(args.outpref, dock)
    csv_out = '{0}.{1}_docked.txt.bz2'.format(args.outpref, dock)

    rdpd.WriteSDF(df, sdf_out, properties=list(df.columns))
    df.to_csv(csv_out,
              header=False,
              index=False,
              sep='\t',
              columns=[name, score],
              float_format='%.3f')