def test_all_numeric_with_numeric_columns(self):
    """Check that ``allNumeric=True`` exports a numeric column as an SD tag.

    A ``len`` column (length of each ``ID`` string) is added to the fixture
    frame -- note that this mutates ``self.df`` in place -- and the frame is
    written to an in-memory buffer.  The output must contain two ``<len>``
    tags and the strings ``"7\\n\\n"`` and ``"8\\n\\n"``.
    """
    frame = self.df
    frame["len"] = frame["ID"].map(len)

    buffer = StringIO()
    PandasTools.WriteSDF(frame, buffer, allNumeric=True)
    sdf_text = buffer.getvalue()

    self.assertEqual(sdf_text.count("<len>"), 2)
    for expected in ("7\n\n", "8\n\n"):
        self.assertIn(expected, sdf_text)
def make_input():
    """Combine the active and decoy SMILES files into ``dude_ace.csv``.

    Reads ``actives_final.ism`` (columns SMILES, ID, ChEMBL_ID) and
    ``decoys_final.ism`` (columns SMILES, ID), both space separated and
    without a header row, labels each row with ``is_active`` (1 for actives,
    0 for decoys) and writes the ``SMILES``, ``ID`` and ``is_active`` columns
    of the stacked table to ``dude_ace.csv`` in the working directory.

    Returns:
        None.  The result is the CSV file written to disk.
    """
    active_df = pd.read_csv("actives_final.ism", header=None, sep=" ")
    active_df.columns = ["SMILES", "ID", "ChEMBL_ID"]
    active_df["label"] = "Active"
    # NOTE(review): the MOL column is not part of the CSV written below; the
    # call is kept so SMILES are still parsed by RDKit as before.
    PandasTools.AddMoleculeColumnToFrame(active_df, "SMILES", "MOL")

    decoy_df = pd.read_csv("decoys_final.ism", header=None, sep=" ")
    decoy_df.columns = ["SMILES", "ID"]
    decoy_df["label"] = "Decoy"
    PandasTools.AddMoleculeColumnToFrame(decoy_df, "SMILES", "MOL")

    active_df["is_active"] = 1
    decoy_df["is_active"] = 0

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows
    # the same way (original row labels are kept, they are not written out).
    combined_df = pd.concat([active_df, decoy_df])[["SMILES", "ID", "is_active"]]
    combined_df.to_csv("dude_ace.csv", index=False)
def _read_mol_table(filename):
    """Load one input file into a DataFrame, picking the reader by extension.

    Returns:
        A ``(df, mol_field)`` tuple where ``mol_field`` is the name of the
        column handed to ``get_activities`` as the molecule field: ``"mol"``
        for .sdf/.csv/.tsv and ``"smiles"`` for .smi.

    Raises:
        ValueError: if the extension is not .sdf, .csv, .tsv or .smi.
    """
    file_ext = pathlib.Path(filename).suffix
    if file_ext == ".sdf":
        mol_field = "mol"
        return PandasTools.LoadSDF(filename, molColName=mol_field), mol_field
    if file_ext in (".csv", ".tsv", ".smi"):
        # NOTE(review): .smi files are read comma-separated, exactly as before.
        sep = "\t" if file_ext == ".tsv" else ","
        mol_field = "smiles" if file_ext == ".smi" else "mol"
        return pandas.read_csv(filename, sep=sep), mol_field
    raise ValueError(
        f"Unsupported file extension {file_ext!r} for {filename!r}; "
        "expected one of .sdf, .csv, .tsv, .smi"
    )


def get_mols_from_files(filenames, targets, verbose=True):
    """
    Read each file into its own Pandas dataframe. File type is based on the
    file extension. Currently supported filetypes are .sdf, .smi, .csv, and
    .tsv.

    For each file, extract the mols, stats, and the molecules that require
    review. Bring cleaned mols from all files into one list, /all_mols/, and
    all mols requiring review into one dict, /all_for_review/.

    :param filenames: iterable of paths to read.
    :param targets: activity field names, passed to ``get_activities`` and
        used for the per-target hit counts that are logged.
    :param verbose: currently unused; kept for interface compatibility.
    :return: ``(all_mols, all_for_review)``.
    :raises ValueError: if a file has an unsupported extension.  Previously
        such a file fell through silently and the next step ran on an
        undefined (or the previous file's) dataframe.
    """
    all_for_review = {}
    all_mols = []

    for filename in filenames:
        logging.info(filename)

        df, mol_field = _read_mol_table(filename)

        # The stats returned by get_activities are not used here.
        mols, _stats, for_review = get_activities(
            df,
            original_filename=filename,
            activity_fields=targets,
            mol_field=mol_field,
        )

        # Report the number of mols with activity for each target.
        for target in targets:
            hits = sum(1 for mol in mols if mol.has_activity(target))
            logging.info(f"{filename} {target} hits: {hits}")

        all_mols.extend(mols)
        extend_dict(all_for_review, for_review)

    return all_mols, all_for_review
def test_all_numeric_with_no_numeric_columns(self):
    """Check that ``allNumeric=True`` writes no SD tags for the plain fixture.

    The fixture frame is written unchanged; the output must not contain any
    ``>`` character (which would start a data-item header).
    """
    buffer = StringIO()
    PandasTools.WriteSDF(self.df, buffer, allNumeric=True)
    sdf_text = buffer.getvalue()

    self.assertFalse(">" in sdf_text, sdf_text)
    # double-check that the numeric tests don't pass by accident
    for unexpected in ("7\n\n", "8\n\n"):
        self.assertNotIn(unexpected, sdf_text)
def test_specify_numeric_column(self):
    """Check that a numeric column named in ``properties`` is exported.

    A ``len2`` column (length of each ``ID`` string) is added to the fixture
    frame -- this mutates ``self.df`` in place -- and requested explicitly
    through ``properties``.  The output must contain two ``<len2>`` tags and
    the strings ``"7\\n\\n"`` and ``"8\\n\\n"``.
    """
    frame = self.df
    frame["len2"] = frame["ID"].map(len)

    buffer = StringIO()
    PandasTools.WriteSDF(frame, buffer, properties=["len2"])
    sdf_text = buffer.getvalue()

    self.assertEqual(sdf_text.count("<len2>"), 2)
    for expected in ("7\n\n", "8\n\n"):
        self.assertIn(expected, sdf_text)
def num_structure_change(confs, native):
    """Count conformations below two energy thresholds (used for entropy).

    Parameters
    ----------
    confs
        SDF input accepted by ``PandasTools.LoadSDF``; every record must
        carry an ``energy_abs`` property that converts to float.
    native
        Energy value compared against ``energy_abs``.

    Returns
    -------
    tuple of (int, int)
        ``num_1`` -- number of conformations with ``energy_abs`` strictly
        less than the lowest ``energy_abs`` plus 1.0.
        ``num_2`` -- number of conformations with ``energy_abs`` strictly
        less than ``native``.
    """
    df_confs = PandasTools.LoadSDF(confs)
    energies = df_confs["energy_abs"].astype(float)

    # The minimum does not need the frame to be sorted first.
    lowest = energies.min()

    num_1 = int((energies < lowest + 1.0).sum())
    num_2 = int((energies < native).sum())
    return num_1, num_2
def load_valid_atom_or_bond_features(path: str, smiles: List[str]) -> List[np.ndarray]:
    """
    Loads features saved in a variety of formats.

    Supported formats:

    * :code:`.npz` descriptors are saved as 2D array for each molecule in the order of that in the data.csv
    * :code:`.pkl` / :code:`.pckl` / :code:`.pickle` containing a pandas dataframe with smiles as index and numpy array of descriptors as columns
    * :code:`.sdf` containing all mol blocks with descriptors as entries

    :param path: Path to file containing atomwise features.
    :param smiles: SMILES strings used to select and order the rows read from
                   an :code:`.sdf` file. Not used for the other formats.
    :return: A list of 2D array.
    :raises ValueError: If the extension is not supported, if the pickled
                        descriptors are neither 1D nor 2D arrays, or if an
                        :code:`.sdf` file has no complete descriptor row for
                        one of the requested SMILES.
    """
    extension = os.path.splitext(path)[1]

    if extension == '.npz':
        # np.load keeps the archive open until the NpzFile is closed, so read
        # every array inside a context manager.
        with np.load(path) as container:
            features = [container[key] for key in container]

    elif extension in ['.pkl', '.pckl', '.pickle']:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        features_df = pd.read_pickle(path)
        if features_df.iloc[0, 0].ndim == 1:
            features = features_df.apply(
                lambda x: np.stack(x.tolist(), axis=1), axis=1).tolist()
        elif features_df.iloc[0, 0].ndim == 2:
            features = features_df.apply(
                lambda x: np.concatenate(x.tolist(), axis=1), axis=1).tolist()
        else:
            raise ValueError(
                f'Atom/bond descriptors input {path} format not supported')

    elif extension == '.sdf':
        features_df = PandasTools.LoadSDF(path).drop(
            ['ID', 'ROMol'], axis=1).set_index('SMILES')

        features_df = features_df[~features_df.index.duplicated()]

        # locate atomic descriptors columns: those whose first entry is a
        # comma-separated string
        features_df = features_df.iloc[:, features_df.iloc[
            0, :].apply(lambda x: isinstance(x, str) and ',' in x).to_list()]
        features_df = features_df.reindex(smiles)
        if features_df.isnull().any().any():
            raise ValueError(
                'Invalid custom atomic descriptors file, Nan found in data')

        def _parse_descriptor(text):
            # "1.0,2.0,\n3.0" -> array([1., 2., 3.])
            return np.array(
                text.replace('\r', '').replace('\n', '').split(',')).astype(float)

        # Element-wise conversion, column by column.  Series.map is used
        # instead of the deprecated DataFrame.applymap.
        features_df = features_df.apply(
            lambda column: column.map(_parse_descriptor))

        features = features_df.apply(
            lambda x: np.stack(x.tolist(), axis=1), axis=1).tolist()

    else:
        raise ValueError(f'Extension "{extension}" is not supported.')

    return features
def get_chembl(terms_to_keep):
    """Load the ChEMBL 21 SDF into a DataFrame tagged with its source.

    The SDF is read from a fixed path.  A ``source_database`` column set to
    ``'chembl'`` is added, and every name in ``terms_to_keep`` that is not
    already a column is added as a column of empty strings.

    :param terms_to_keep: iterable of column names that must exist in the
        returned frame.
    :return: the loaded DataFrame.
    """
    sdf_file = '/project/projectdirs/openmsi/projects/compound_data/chembl/chembl_21.sdf.gz'
    df = PandasTools.LoadSDF(sdf_file)
    df['source_database'] = 'chembl'

    # Column names as they are before any placeholder column is added.
    existing = list(df.keys())
    missing_terms = [term for term in terms_to_keep if term not in existing]
    for term in missing_terms:
        df[term] = ''
    return df
def main():
    """Convert a CSV of SMILES into an SDF of embedded, H-added molecules.

    Reads ``args.infile`` (needs ``rdkit_smiles`` and ``compound_id``
    columns), builds an RDKit molecule per row, adds hydrogens, runs
    ``AllChem.EmbedMolecule`` on it, stores the molecule's formal charge in
    ``i_user_TOTAL_CHARGE`` and writes everything to ``args.outfile``.
    """
    args = getArgs()
    print(args.infile, args.outfile)

    smiles_df = pd.read_csv(args.infile)
    # Work on an independent copy: columns are added below, and doing that on
    # a slice of smiles_df triggers pandas' SettingWithCopy handling.
    pp = smiles_df[['rdkit_smiles', 'compound_id']].copy()
    PandasTools.AddMoleculeColumnToFrame(pp, 'rdkit_smiles', 'Molecule')

    # Snapshot the (label, molecule) pairs so the frame can be updated while
    # looping.
    for index, mol in list(pp['Molecule'].items()):
        mol_h = Chem.AddHs(mol)
        AllChem.EmbedMolecule(mol_h)
        # Store the processed molecule back in the frame.  The previous code
        # assigned it to the row copy yielded by iterrows(), so the SDF was
        # written from the original molecules, without the added hydrogens or
        # the embedding result.
        pp.at[index, 'Molecule'] = mol_h
        pp.at[index, 'i_user_TOTAL_CHARGE'] = Chem.rdmolops.GetFormalCharge(mol_h)

    PandasTools.WriteSDF(pp,
                         args.outfile,
                         molColName='Molecule',
                         idName='compound_id',
                         properties=list(pp.columns))
def add_mol_column(df, smiles_col, molecule_col='mol'):
    """
    Add RDKit Mol objects to a data frame.

    Column 'molecule_col' is added to 'df' (in place) from the SMILES strings
    in column 'smiles_col', with ``includeFingerprints=True``.  The same
    frame is returned for convenience.
    """
    PandasTools.AddMoleculeColumnToFrame(
        frame=df,
        smilesCol=smiles_col,
        molCol=molecule_col,
        includeFingerprints=True,
    )
    return df
def test_force_Kekulize():
    """Display the atoms chosen by ``find_custom_Kekulize_set`` for every SMILES.

    Reads ``All_Moles_Tested_Data.csv`` (needs a ``smiles`` column).  For
    each molecule the atoms whose indices are returned by
    ``find_custom_Kekulize_set`` are changed to atomic number 32 so they
    stand out in the drawing, then the molecules are displayed as grid
    images, five per row and five per image.
    """
    df = pd.read_csv('All_Moles_Tested_Data.csv')

    mol_list = []
    for smile in df['smiles']:
        mol = MolFromSmiles(smile)
        marked = find_custom_Kekulize_set(smile, max_atoms=60, max_degree=5, printMe=False)
        for index in marked:
            mol.GetAtomWithIdx(index).SetAtomicNum(32)
        mol_list.append(mol)

    # Assign the list directly (positional) rather than a one-column
    # DataFrame, which relies on index alignment.
    df['mol'] = mol_list

    unit = 5
    # One grid per chunk of `unit` rows; the last chunk may be shorter.
    for start in range(0, len(df), unit):
        display(PandasTools.FrameToGridImage(df.iloc[start:start + unit],
                                             column='mol',
                                             legendsCol='',
                                             molsPerRow=unit))
def stop_df_from_stream(stream, summary=None, comp_id="stop_df_from_stream"):
    """Generates a Pandas DataFrame out of the data stream.

    The stream is first collected with `stop_dict_from_stream`; image
    rendering for DataFrames is switched on beforehand.
    The molecules need to be present in the stream,
    e.g. generated by `pipe_mol_from_smiles`.

    Raises ImportError when pandas is not available."""
    if not PANDAS:
        raise ImportError("pandas is not available.")

    PT.RenderImagesInAllDataFrames(images=True)
    collected = stop_dict_from_stream(stream, summary=summary, comp_id=comp_id)
    return pandas.DataFrame.from_dict(collected)
def main():
    """Rename docked molecules by rank and score, then export SDF and a table.

    Options read from ``UserInput()``:

    * ``name``  -- column holding the molecule name (default ``'ID'``)
    * ``score`` -- column holding the docking score (default ``'Chemgauss4'``)
    * ``dock``  -- docking program label (default ``'fred'``)
    * ``top``   -- keep only the first ``top`` records (default: all)

    Each molecule's ``_Name`` becomes ``name::rank::score::dock``.  Output is
    ``<outpref>.<dock>_docked.sdf.gz`` plus a tab-separated
    ``<outpref>.<dock>_docked.txt.bz2`` with the name and score columns.
    """
    args = UserInput()

    name = args.name or 'ID'
    score = args.score or 'Chemgauss4'
    dock = args.dock or 'fred'
    # None keeps every record.  The previous default of -1 made ``[:top]``
    # silently drop the last molecule.
    top = int(args.top) if args.top else None

    df = rdpd.LoadSDF(args.infile,
                      removeHs=False,
                      molColName='ROMol',
                      idName='mol_ID')[:top].fillna('')
    print('\033[34m> select mol: \033[32m{0}\033[0m'.format(len(df)))

    df[score] = df[score].apply(float)
    df['Rank'] = df.index

    for idx, row in df.iterrows():
        # Rank is the zero-based row label, reported one-based in the name.
        df.at[idx, 'ROMol'].SetProp(
            '_Name',
            '{0}::{1}::{2:.2f}::{3}'.format(row[name], row['Rank'] + 1,
                                            row[score], dock))

    sdf_out = '{0}.{1}_docked.sdf.gz'.format(args.outpref, dock)
    csv_out = '{0}.{1}_docked.txt.bz2'.format(args.outpref, dock)

    rdpd.WriteSDF(df, sdf_out, properties=list(df.columns))
    df.to_csv(csv_out,
              header=False,
              index=False,
              sep='\t',
              columns=[name, score],
              float_format='%.3f')
def update_sdf():
    """Regenerate ``media/all_data.sdf`` from all ``Compound`` records.

    The records are loaded into a DataFrame (without the ``id``,
    ``created_at`` and ``updated_at`` fields), an ``ROMol`` column is built
    from the ``Smiles`` column, the ``media`` directory is created if it
    does not exist, and the frame is passed to ``df_to_sdf``.
    """
    records = list(Compound.objects.all().values())
    bookkeeping_fields = ['id', 'created_at', 'updated_at']
    compounds_df = pd.DataFrame(records).drop(bookkeeping_fields, axis=1)

    PandasTools.AddMoleculeColumnToFrame(compounds_df,
                                         'Smiles',
                                         'ROMol',
                                         includeFingerprints=True)

    media_dir = 'media'
    if not os.path.exists(media_dir):
        os.makedirs(media_dir)

    df_to_sdf(compounds_df, 'media/all_data.sdf')
def add_mol_to_frame(self):
    """
    Adds an ``ROMol`` molecule column, built from the ``smiles`` column, to
    the :py:class:`pandas.Dataframe` held in ``self._data``

    :return: None
    """
    frame = self._data
    PandasTools.AddMoleculeColumnToFrame(
        frame,
        smilesCol="smiles",
        molCol="ROMol",
        includeFingerprints=False,
    )

    def _first_item(value):
        return value[0]

    # NOTE(review): the result of this apply is discarded, so it has no
    # effect on the frame; confirm whether it was meant to be assigned back
    # or can be removed.
    frame["ROMol"].apply(_first_item)
def test_write_to_sdf_gz(self):
    """Check that ``WriteSDF`` writes a readable gzip file for ``.sdf.gz``.

    The fixture frame is written to a temporary ``test.sdf.gz``; the
    decompressed text must contain two record terminators and start with
    the line ``Methane``.  The temporary directory is always removed.
    """
    dirname = tempfile.mkdtemp()
    try:
        filename = os.path.join(dirname, "test.sdf.gz")
        PandasTools.WriteSDF(self.df, filename)
        # Read as text so the str comparisons below are valid on Python 3
        # (binary mode returns bytes), and close the handle before the
        # directory is removed.
        with gzip.open(filename, "rt") as handle:
            s = handle.read()
        self.assertEqual(s.count("\n$$$$\n"), 2)
        self.assertEqual(s.split("\n", 1)[0], "Methane")
    finally:
        shutil.rmtree(dirname)
def test_FrameToGridImage(self):
    """Smoke test: ``FrameToGridImage`` must not raise for three legend setups."""
    # This test only makes sure that we get no exception. To see the created
    # images, set interactive to True
    interactive = False
    self.assertTrue(True)

    df = self.df
    # No legend, a named column as legend, and the index name as legend.
    legend_options = (
        {},
        {'legendsCol': 'PUBCHEM_IUPAC_INCHIKEY'},
        {'legendsCol': df.index.name},
    )
    for options in legend_options:
        result = PandasTools.FrameToGridImage(df, **options)
        if interactive:
            result.show()
def _write_filter_result(df, out_file):
    """Write ``df`` as a SMILES table if ``out_file`` contains '.smi', else as SDF."""
    if re.search(r'\.smi', out_file, re.IGNORECASE):
        # Copy first so the new column is added to an independent frame, and
        # use item assignment: ``df.smiles = ...`` only sets a Python
        # attribute when the column does not exist yet.
        df = df.copy()
        df['smiles'] = df.MOL.apply(
            lambda m: Chem.MolToSmiles(Chem.RemoveHs(m)))
        df.to_csv(out_file,
                  columns=['smiles', 'ID'],
                  sep=' ',
                  header=False,
                  index=False)
    else:
        rdpd.WriteSDF(df,
                      out_file,
                      molColName='MOL',
                      properties=list(df.columns))


def main():
    """Split the input molecules by their ``NeverSee_Groups`` flag.

    Molecules flagged ``'Y'`` are written to ``args.nsee_file`` and those
    flagged ``'N'`` to ``args.pass_file``.  A file name containing ``.smi``
    (case-insensitive) is written as a space-separated ``smiles ID`` table
    without header; any other name is written as SDF.
    """
    args = UserInput()
    df = RDkitRead(args.infile, args.id, removeHs=False, add_Hs=False)

    nsee_df = df[df['NeverSee_Groups'] == 'Y']
    pass_df = df[df['NeverSee_Groups'] == 'N']
    print('\033[34m Passed NeverSee Filter: \033[32m{0}\033[0m'.format(
        len(pass_df)))
    print('\033[34m Failed NeverSee Filter: \033[31m{0}\033[0m'.format(
        len(nsee_df)))

    _write_filter_result(nsee_df, args.nsee_file)
    _write_filter_result(pass_df, args.pass_file)
    print('')
def parse_sd_file(file, tgz=False):
    """
    parse a sd file and return molecules

    :param file: path of the SD file (or whatever ``PandasTools.LoadSDF``
        accepts when ``tgz`` is false).
    :param tgz: when true, ``file`` is opened with ``gzip.open`` first.
    :return: DataFrame from ``PandasTools.LoadSDF`` with the molecules in
        column ``Molecule`` and their SMILES in column ``smiles``.
    """
    if tgz:
        # Close the gzip handle once LoadSDF returns instead of leaking it.
        with gzip.open(file) as handle:
            return PandasTools.LoadSDF(handle,
                                       molColName='Molecule',
                                       smilesName='smiles')
    return PandasTools.LoadSDF(file,
                               molColName='Molecule',
                               smilesName='smiles')
def compute_unique_smiles(self,
                          interp_df,
                          embeddings,
                          embedding_funct,
                          scaled_radius=0.5):
    """
    Identify duplicate SMILES and distorts the embedding. The input df
    must have columns 'SMILES' and 'Generated' at 0th and 1st position.
    'Generated' colunm must contain boolean to classify SMILES into input
    SMILES(False) and generated SMILES(True).

    This function does not make any assumptions about order of embeddings.
    Instead it simply orders the df by SMILES to identify the duplicates.

    Both passes (de-duplication, then validity) run at most 5 times.  After
    each round of jitter the SMILES are regenerated with
    ``embedding_funct(embeddings)`` and written back to
    ``interp_df['SMILES']``; previously the regenerated SMILES were
    discarded, so a retry could never change the outcome.

    :param interp_df: frame described above; its 'SMILES' column is updated
        in place.
    :param embeddings: indexable collection of embeddings, modified in place.
    :param embedding_funct: callable turning the embeddings into SMILES
        (assumed to return one SMILES per embedding, in row order -- TODO
        confirm).
    :param scaled_radius: passed to ``self._compute_radius`` to obtain the
        jitter distance.
    :return: ``interp_df`` without the temporary 'ROMol' column.
    """
    distance = self._compute_radius(scaled_radius)

    for _ in range(5):
        # Sorting puts identical SMILES next to each other.
        smiles = interp_df['SMILES'].sort_values()
        duplicates = set()
        for idx in range(0, smiles.shape[0] - 1):
            if smiles.iat[idx] == smiles.iat[idx + 1]:
                duplicates.add(smiles.index[idx])
                duplicates.add(smiles.index[idx + 1])

        if not duplicates:
            break

        for dup_idx in duplicates:
            # NOTE(review): index labels are used as row positions here, which
            # presumes a default 0..n-1 index -- confirm.
            if interp_df.iat[dup_idx, 1]:
                # add jitter to generated molecules only
                embeddings[dup_idx] = self.addjitter(
                    embeddings[dup_idx], distance, 1)
        interp_df['SMILES'] = embedding_funct(embeddings)

    # Ensure all generated molecules are valid.
    for _ in range(5):
        PandasTools.AddMoleculeColumnToFrame(interp_df, 'SMILES')
        invalid_index = interp_df[interp_df['ROMol'].isnull()].index.to_list()
        if not invalid_index:
            break

        for idx in invalid_index:
            embeddings[idx] = self.addjitter(embeddings[idx],
                                             distance,
                                             cnt=1)
        interp_df['SMILES'] = embedding_funct(embeddings)

    # Cleanup
    if 'ROMol' in interp_df.columns:
        interp_df = interp_df.drop('ROMol', axis=1)

    return interp_df
def get_all(self):
    """Draw the current molecule and fill in its predicted properties.

    Clears the output fields, puts the isomeric SMILES of ``molecule[0]``
    into ``t2``, renders the structure via ``tmp.png`` onto the canvas,
    writes a one-row ``tmp.csv`` and featurizes it, then sets ``t_sol`` and
    ``t_lip`` to ``10 ** prediction`` (rounded to 3 places) from the two
    models and ``t_sasc`` to the ``sascorer`` score (rounded to 2 places).
    """
    # Module-level reference to the canvas image (presumably so it stays
    # alive after this method returns).
    global image_

    for field in (t2, t_sol, t_lip, t_sasc):
        field.set('')

    compound = molecule[0]
    print('molecule')
    print(compound)
    print('isomeric_smile', compound.isomeric_smiles)
    _canonical = compound.canonical_smiles  # looked up but not used below
    isomeric = compound.isomeric_smiles
    t2.set(isomeric)

    # Render the structure to a file and show it on the canvas.
    Draw.MolToFile(Chem.MolFromSmiles(isomeric), 'tmp.png')
    image_ = ImageTk.PhotoImage(Image.open('tmp.png'), master=frame1)
    canvas.create_image(150, 75, image=image_)

    _entered = t2.get()  # read but not used below
    df = pd.DataFrame({
        'name': [t1.get()],
        'smiles': [t2.get()],
        'solubility': [0.00],
    })
    df.to_csv('tmp.csv')

    loader_p = dc.data.data_loader.CSVLoader(
        tasks=['solubility'],
        smiles_field="smiles",
        id_field="name",
        featurizer=dc.feat.graph_features.ConvMolFeaturizer())
    predictset = loader_p.featurize('tmp.csv')

    for model, field in ((model_sol, t_sol), (model_lip, t_lip)):
        prediction = model.predict(predictset)
        field.set(round(10**prediction[0][0], 3))

    PandasTools.AddMoleculeColumnToFrame(frame=df, smilesCol='smiles')
    sa_score = df.ROMol.map(sascorer.calculateScore)
    t_sasc.set(round(sa_score[0], 2))
    print(sa_score[0])
def get_most_common_fragments(fragments, top_x=50):
    """
    Get most common fragments.

    Parameters
    ----------
    fragments : pandas.DataFrame
        Fragment details, i.e. SMILES, kinase groups, and fragment RDKit
        molecules, for input subpocket.
    top_x : int
        Top x most common fragments.

    Returns
    -------
    pandas.DataFrame
        Most common fragments (sorted in descending order), including
        fragments' SMILES, ROMol, and count.  Fragments tied with the
        x-th fragment's count are included as well.
    """
    # Occurrences per SMILES, as a frame with columns smiles / fragment_count
    counts = fragments.smiles.value_counts()
    counts.name = "fragment_count"
    counts = counts.reset_index().rename(columns={"index": "smiles"})

    # Add ROMol column
    PandasTools.AddMoleculeColumnToFrame(counts, "smiles")

    # Order by count (descending); the fresh row number is the molecule ID
    counts = counts.sort_values("fragment_count", ascending=False)
    counts = counts.reset_index(drop=True)
    counts.index.name = "molecule_id"

    # Fewer fragments than requested: return all of them
    if counts.shape[0] < top_x:
        return counts

    # Keep everything whose count reaches that of the fragment at rank top_x,
    # so ties at the cut-off are not split
    cutoff = counts.iloc[top_x - 1].fragment_count
    return counts[counts.fragment_count >= cutoff]
def __init__(self, data, output_name):
    """Build an SDF download response from ``data``.

    :param data: object whose ``values()`` yields one mapping per compound
        (presumably a queryset -- confirm); needs ``id``, ``Smiles`` and
        ``PID`` fields.
    :param output_name: download file name without extension; ``.sdf`` is
        appended.
    """
    output = StringIO()
    compounds_df = pd.DataFrame(list(data.values())).drop('id', axis=1)
    PandasTools.AddMoleculeColumnToFrame(compounds_df,
                                         'Smiles',
                                         'ROMol',
                                         includeFingerprints=True)
    PandasTools.WriteSDF(compounds_df,
                         output,
                         molColName='ROMol',
                         idName='PID',
                         properties=list(compounds_df.columns))

    mimetype = 'text/plain'
    file_ext = 'sdf'
    super(SDFResponse, self).__init__(content=output.getvalue(),
                                      content_type=mimetype)

    # Escape backslashes and double quotes so the name cannot terminate the
    # quoted filename early.  The previous replace('"', '\"') was a no-op
    # because '\"' is just '"'.
    safe_name = output_name.replace('\\', '\\\\').replace('"', '\\"')
    self['Content-Disposition'] = 'attachment;filename="%s.%s"' % \
        (safe_name, file_ext)
def molgrid_image(smiles, file_name, labels=None, molPerRow=5):
    """Draw the given SMILES as an SVG grid and save it to ``file_name + '.svg'``.

    :param smiles: sequence of SMILES strings.
    :param file_name: output path without the ``.svg`` extension.
    :param labels: legends, one per molecule; defaults to the row numbers.
    :param molPerRow: number of molecules per grid row.  This value was
        previously ignored in favour of a hard-coded 5.
    :return: None.
    """
    df = pd.DataFrame({'smiles': smiles})
    PandasTools.AddMoleculeColumnToFrame(df, 'smiles', 'mol')
    if labels is None:
        labels = ['{:d}'.format(i) for i in df.index]
    svg = Draw.MolsToGridImage(df['mol'],
                               molsPerRow=molPerRow,
                               legends=labels,
                               useSVG=True)
    save_svg(svg, file_name + '.svg', dpi=150)
    return
def readProjectData(filename, FP, smilesCol):
    """Read the project CSV and compute a fingerprint per molecule.

    :param filename: CSV file to read.
    :param FP: fingerprint type; only ``'Morgan2'`` is implemented.
    :param smilesCol: name of the column holding the SMILES strings.
    :return: the frame restricted to rows whose SMILES could be parsed, with
        ``Molecule`` and ``FP`` columns added; ``None`` (after printing a
        message) for any other ``FP`` value.
    """
    df_proj = pd.read_csv(filename)
    PandasTools.AddMoleculeColumnToFrame(df_proj,
                                         smilesCol=smilesCol,
                                         molCol='Molecule')

    # Keep only the rows for which a molecule object was created.
    parsed = df_proj['Molecule'].map(lambda mol: mol is not None)
    df_proj = df_proj.loc[parsed]

    if FP != 'Morgan2':
        print(FP, ' fingerprint not implemented.')
        return None

    df_proj['FP'] = df_proj.Molecule.map(
        lambda mol: AllChem.GetMorganFingerprint(mol, 2))
    return df_proj
def return_files_sdf():
    """Convert ``temp.pickle`` to ``temp.sdf`` and send it as an attachment.

    The pickled DataFrame is written as SDF with its ``structures`` column
    as the molecule column and every column listed as a property.  If
    sending the file fails, the error text is returned instead.
    """
    frame = pd.read_pickle('temp.pickle')
    PandasTools.WriteSDF(frame,
                         'temp.sdf',
                         molColName='structures',
                         properties=list(frame.columns),
                         allNumeric=False)
    try:
        return send_file('temp.sdf', as_attachment=True)
    except Exception as err:
        return str(err)
def binding_affinity(self,
                     prot_in,
                     lig_in,
                     outpath="results/results_affinity_binding.csv"):
    """Predict affinity and binding probability, then export CSV and SDF.

    Features come from ``self.preprocessing(prot_in, lig_in)``; every column
    from the third onwards is fed to the two stored models
    (``models/gbdt_regression.joblib`` and ``models/gbdt_model.joblib``).
    The result table (smiles, Uniprot ID, affinity, probability,
    predicted_label) is written to ``outpath`` and, with a molecule column
    added, to ``results/affinity_out.sdf``.

    ``predicted_label`` is ``"high"`` for probability > 0.7, ``"low"`` for
    probability < 0.4 and ``"medium"`` otherwise.

    :raises FileNotFoundError: if either model file is missing.  Previously
        this was only logged and the method failed later on an undefined
        variable.
    """
    DF = self.preprocessing(prot_in, lig_in)
    X = DF.iloc[:, 2:]
    print(DF.columns)
    logger.info(X.shape)

    jl_filename = "models/gbdt_regression.joblib"
    cl_filename = "models/gbdt_model.joblib"
    for model_path in (jl_filename, cl_filename):
        if not os.path.isfile(model_path):
            logger.info("no model available")
            raise FileNotFoundError("model file not found: %s" % model_path)

    with open(jl_filename, 'rb') as file:
        regressor = joblib.load(file)
    ya = pd.Series(regressor.predict(X)).rename("predicted_affinity")

    with open(cl_filename, 'rb') as file:
        classifier = joblib.load(file)
    # Second probability column of predict_proba.
    yb = pd.Series(classifier.predict_proba(X)[:, 1])

    smiles = DF["smiles"]
    prot = DF["UniProtID"]
    # NOTE(review): concat aligns on the index; ya/yb carry a fresh 0..n-1
    # index, so this presumes DF has a default index as well -- confirm.
    final = pd.concat([smiles, prot, ya, yb], axis=1)
    final.columns = ["smiles", "Uniprot ID", "affinity", "probability"]
    final["predicted_label"] = np.where(
        final.probability > 0.7, "high",
        np.where(final.probability < 0.4, "low", "medium"))
    logger.info(final.columns)
    logger.info(final[0:10])
    final.to_csv(outpath)

    pp_out = "results/affinity_out.sdf"
    PandasTools.AddMoleculeColumnToFrame(final, 'smiles', 'Molecule')
    PandasTools.WriteSDF(final,
                         pp_out,
                         molColName='Molecule',
                         properties=list(final.columns))
def split_sdf(sdf_file_name, outdir="data/"):
    """Split an SDF into per-ligand files and index the ligands by PDB ID.

    Rows with an empty SMILES, IC50 or molecule entry, or without any PDB
    ID, are dropped.  Every remaining ligand that has at least one
    non-empty PDB ID is written with ``write_lig_file`` to
    ``<outdir>/lig/lig<N>.lig``, where N is its row number in the filtered
    table.

    :param sdf_file_name: SDF file to parse.
    :param outdir: directory under which the ``lig`` files are written.
    :return: ``(uligs, prots_ligs)`` -- ``uligs`` maps the ligand row number
        to its row ``[PDB IDs, SMILES, IC50 (nM), Molecule]`` and
        ``prots_ligs`` maps each PDB ID to the list of ligand row numbers.
    """
    print("Loading sdf.")

    # Parse the SDF file into a Pandas dataframe, with RDKit logging limited
    # to critical messages.
    RDLogger.logger().setLevel(RDLogger.CRITICAL)
    df = PandasTools.LoadSDF(sdf_file_name,
                             smilesName='SMILES',
                             molColName='Molecule',
                             includeFingerprints=False)
    print("Raw cols = ", [str(x) for x in df.columns])

    # Select only the needed columns and merge the two PDB cols.
    complex_col = 'PDB ID(s) for Ligand-Target Complex'
    chain_col = 'PDB ID(s) of Target Chain'
    data_cols = ['SMILES', 'IC50 (nM)', 'Molecule']
    df_selected = df[[complex_col, chain_col] + data_cols].copy()
    df_selected["PDB IDs"] = (df_selected[complex_col] + ',' +
                              df_selected[chain_col])
    print("Selected cols = ", [str(x) for x in df_selected.columns])
    df_selected = df_selected[["PDB IDs"] + data_cols]

    # Drop any rows with missing data ('' or a lone ',' count as missing).
    df_selected = (df_selected.replace('', np.nan)
                   .replace(',', np.nan)
                   .dropna())

    r_rows = len(df.index)
    s_rows = len(df_selected.index)
    print("Raw rows = ", r_rows)
    print("Sel rows = ", s_rows)
    keep_pct = (float(s_rows) / float(r_rows)) * 100.0
    print("Keep pct = %.2f%s" % (keep_pct, '%'))

    # Build ligand dictionary and a protein dictionary.
    print("Building protein-ligand dictionary.")
    uligs = {}
    prots_ligs = {}
    for lndx, row in enumerate(df_selected.values):
        for pdb in row[0].split(','):
            if not pdb:
                continue
            prots_ligs.setdefault(pdb, []).append(lndx)
            uligs[lndx] = row
    print("Unique proteins = ", len(prots_ligs))

    # Write out .lig files and return the data dictionaries.
    print("Writing per-ligand output files.")
    for key, lig in uligs.items():
        # lig[3] is the 'Molecule' entry of the row.
        write_lig_file(lig[3], outdir + "/lig/lig%s.lig" % str(key))

    return uligs, prots_ligs
def compile_filters(self):
    """Apply every configured filter and keep the compounds passing all of them.

    Each ``limit_*`` helper is called with ``self.df`` and its bounds or
    flag from ``self.command``; their results are combined with
    ``set.intersection``.  The surviving keys are stored in
    ``self.filtered_inchi``, the matching rows (selected on the
    ``'InChI Keys'`` column) in ``self.filtered_df`` with a
    ``'Molecule picture'`` column added.

    :return: the result of ``self.frame_manage()``.
    """
    cmd = self.command

    RS_inchi = self.limit_RS(self.df, cmd['RS_min'], cmd['RS_max'])
    MW_inchi = self.limit_MW(self.df, cmd['MW_min'], cmd['MW_max'])
    nRing_inchi = self.limit_nRing(self.df, cmd['nRing_min'],
                                   cmd['nRing_max'])
    Lipinski_inchi = self.limit_Lipinski(self.df, cmd['Lipinski'])
    nG12Ring_inchi = self.limit_nG12Ring(self.df, cmd['nG12Ring_min'],
                                         cmd['nG12Ring_max'])
    SlogP_inchi = self.limit_SlogP(self.df, cmd['SlogP_min'],
                                   cmd['SlogP_max'])
    # Upper bound is nSugars_max; nSugars_min used to be passed twice.
    Sugars_inchi = self.limit_nSugars(self.df, cmd['nSugars_min'],
                                      cmd['nSugars_max'])
    nFRing_inchi = self.limit_nFusedRing(self.df, cmd['nFRing_min'],
                                         cmd['nFRing_max'])
    core_ester_inchi = self.limit_core_ester(self.df, cmd['core_ester_min'],
                                             cmd['core_ester_max'])
    naRing_inchi = self.limit_naRing(self.df, cmd['naRing_min'],
                                     cmd['naRing_max'])
    activity_reported_inchi = self.limit_activity_reported(
        self.df, cmd['activity_reported'])

    sets = [
        RS_inchi, MW_inchi, nRing_inchi, Lipinski_inchi, nG12Ring_inchi,
        SlogP_inchi, Sugars_inchi, nFRing_inchi, core_ester_inchi,
        naRing_inchi, activity_reported_inchi
    ]
    self.filtered_inchi = list(set.intersection(*sets))

    # Copy the selection so the molecule column is added to an independent
    # frame rather than to a slice of self.df.
    self.filtered_df = self.df.loc[self.df['InChI Keys'].isin(
        self.filtered_inchi)].copy()
    PandasTools.AddMoleculeColumnToFrame(self.filtered_df, 'smiles',
                                         'Molecule picture')

    smiles_frame = self.frame_manage()
    return smiles_frame
def mol_diversity(smiles):
    """Return the mean pairwise fingerprint distance of a set of molecules.

    Every SMILES is converted to a molecule and to a 2048-bit Morgan
    fingerprint of radius 4; the fingerprints are passed to ``tanimoto_1d``
    and the mean of its result is returned.

    :param smiles: sequence of SMILES strings.
    :return: ``np.mean`` of the values returned by ``tanimoto_1d``.

    NOTE(review): a normalisation step,
    ``(mean_dist - 0.91549) / (0.94170 - 0.91549)`` (mean random / mean
    diverse distance), used to follow the ``return`` and never ran; it was
    removed as unreachable.  The returned value is the un-normalised mean,
    as before.
    """
    df = pd.DataFrame({'smiles': smiles})
    PandasTools.AddMoleculeColumnToFrame(df, 'smiles', 'mol')
    fps = [
        Chem.GetMorganFingerprintAsBitVect(m, 4, nBits=2048)
        for m in df['mol']
    ]
    dist_1d = tanimoto_1d(fps)
    mean_dist = np.mean(dist_1d)
    return mean_dist