def loadDrugDatabase(filename):
    """Load a compound database from an SDF or CSV file into a DataFrame.

    A CSV input must contain a ``smiles`` column, for example::

        smiles, id, name, ....
        CCC(CO)NC, 1, HD0001, ...

    For both formats the SMILES as read are kept in ``smiles_tmp``, the
    output of ``removeSalt`` is written to ``smiles`` and an RDKit molecule
    column ``mol`` is built from the desalted SMILES.

    :param filename: path to a ``.sdf`` or ``.csv`` file
    :return: DataFrame holding the SDF properties / CSV columns plus
        ``smiles_tmp``, ``smiles`` and ``mol``
    :raises Exception: if the file does not exist or its extension is
        neither ``sdf`` nor ``csv``
    """
    if not os.path.exists(filename):
        raise Exception("数据库文件不存在!")

    # Same rule as before: the text after the last dot, compared case-sensitively.
    extension = filename.rsplit('.', 1)[-1]
    if extension not in ('csv', 'sdf'):
        raise Exception("数据库文件仅支持sdf和csv格式!")

    print('')
    print('Step 3')
    print('###############################')
    print('正在加在数据库!!!')

    if extension == 'sdf':
        descr = []
        smiles = []
        for mol in Chem.SDMolSupplier(filename):
            # The supplier yields None for records it cannot parse; skip
            # them instead of failing on the attribute access below.
            if mol is None:
                continue
            descr.append(mol.GetPropsAsDict())
            smiles.append(Chem.MolToSmiles(mol))
        # One row per molecule, one column per SDF property.
        df = pd.DataFrame(descr)
        df['smiles_tmp'] = smiles
    else:
        df = pd.read_csv(filename)
        df['smiles_tmp'] = df['smiles']

    print('正在去除化合物中的盐离子!!!')
    df['smiles'] = [removeSalt(smi) for smi in df.smiles_tmp]
    print('正在产生结构文件!!!')
    PandasTools.AddMoleculeColumnToFrame(df, 'smiles', 'mol')

    print('共加载%s个化合物!!!' % df.shape[0])
    print('###############################')
    return df
def add_mol_column_to_df(self, data: pd.DataFrame, smiles_column: str) -> pd.DataFrame:
    """
    Build RDKit molecules from a SMILES column and keep only the rows that
    could be converted, with hydrogens added to every remaining molecule.

    Rows whose 'ROMol' entry is missing are removed from ``data`` in place
    and, when there are any, handed to ``self.get_output_file`` as an
    'xlsx' output named 'Non_processed_molecules'.

    :param data: dataframe to be modified (in place)
    :param smiles_column: SMILES column in the dataframe to be processed
    :return data: the same dataframe, restricted to the converted rows
    """
    PandasTools.AddMoleculeColumnToFrame(data, smiles_column)

    # Rows for which no molecule object was produced.
    failed_rows = data[data['ROMol'].isna()]
    data.drop(failed_rows.index, axis=0, inplace=True)

    molecules = data['ROMol'].values.tolist()
    data.loc[:, 'ROMol'] = list(map(Chem.AddHs, molecules))

    if not failed_rows.empty:
        self.get_output_file(outfile_type='xlsx',
                             data=failed_rows,
                             outfile_name='Non_processed_molecules')
    return data
def RDkitRead(in_file, removeHs=True, add_Hs=False):
    """Read an SDF or SMILES file into a DataFrame with 'smiles', 'ID' and 'mol'.

    The format is chosen from the file name: a name containing ``.sdf`` is
    loaded as SDF, a name containing ``.smi`` as a whitespace-separated
    SMILES table (SMILES first, ID second; lines starting with ``#`` are
    comments). A name containing both is processed by both branches and the
    SMILES result is returned, as before.

    Parameters
    ----------
    in_file : str
        Path of the input file; it is opened through ``file_handle``.
    removeHs : bool
        Forwarded to ``LoadSDF`` (SDF input only).
    add_Hs : bool
        If true, hydrogens are added to the molecules of an SDF input.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If the file name contains neither ``.sdf`` nor ``.smi``.
    """
    df = None

    ## Read in SDF file; can choose to add hydrogens or not
    # The dot is escaped so that only a literal ".sdf" selects this branch.
    if re.search(r'\.sdf', in_file):
        print(' # Reading SDF')
        df = rdpd.LoadSDF(file_handle(in_file),
                          removeHs=removeHs,
                          idName='ID',
                          molColName='mol')
        df['smiles'] = df.mol.apply(
            lambda m: Chem.MolToSmiles(Chem.RemoveHs(m)))
        if add_Hs:
            df['mol'] = df.mol.apply(Chem.AddHs)

    ## Read in SMILES file, check if there is a header "smiles"
    if re.search(r'\.smi', in_file):
        print('# Reading SMI')
        with file_handle(in_file) as fi:
            # A first line mentioning "smi" (any case) is taken as a header.
            has_header = re.search('smi', str(fi.readline()), re.IGNORECASE)
        if has_header:
            print('# Smiles input has Header #\n')
            df = pd.read_csv(in_file, sep=r'\s+', comment='#').dropna()
        else:
            print('# Smiles input has NO Header #\n')
            df = pd.read_csv(in_file, header=None, sep=r'\s+',
                             comment='#').dropna()
        df.columns = ['smiles', 'ID']
        rdpd.AddMoleculeColumnToFrame(df, smilesCol='smiles', molCol='mol')
        df['smiles'] = df.mol.apply(Chem.MolToSmiles)

    if df is None:
        # Previously this fell through to an UnboundLocalError on `df`.
        raise ValueError(
            'Cannot determine input format of {}: expected .sdf or .smi'.format(
                in_file))

    print('## Number of MOL read from {}: {}\n'.format(in_file, len(df.smiles)))
    return df
def read_csv(
    urlpath: Union[str, os.PathLike, TextIO],
    smiles_column: str = None,
    mol_column: str = "mol",
    **kwargs,
) -> pd.DataFrame:
    """Load a CSV file into a dataframe, optionally adding a molecule column.

    Args:
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        smiles_column: Column holding SMILES from which the mol column is
            built. When None, no mol column is added.
        mol_column: Name given to the mol column (only used when
            `smiles_column` is set). Avoid building it for very large files.
        kwargs: Arguments to pass to `pd.read_csv()`.

    Returns:
        df: a `pandas.DataFrame`
    """
    frame: pd.DataFrame = pd.read_csv(urlpath, **kwargs)  # type: ignore

    # Nothing more to do when no SMILES column was requested.
    if smiles_column is None:
        return frame

    PandasTools.AddMoleculeColumnToFrame(frame, smiles_column, mol_column)
    return frame
def __call__(self):
    """Return ``self.dataframe`` keyed by the molecule built from ``self.smiles``.

    A one-row helper frame is used to turn ``self.smiles`` into a 'ROMol'
    object; that object becomes the outer index key (level name
    "Structure") of the concatenated result.
    """
    helper = pd.DataFrame(columns=["smiles"])
    helper.loc[1] = self.smiles
    PandasTools.AddMoleculeColumnToFrame(helper, smilesCol="smiles")

    structure = helper.loc[1, "ROMol"]
    keyed = {structure: self.dataframe}
    return pd.concat(keyed, names=["Structure"])
def regression_gc():
    """Predict properties for the SMILES in the input field and show them.

    The name/SMILES pair read from ``t_name`` / ``t_smiles`` is written to
    'tmp.csv', featurized with a ConvMolFeaturizer and fed to the three
    models; each prediction is shown as ``10 ** value`` rounded to three
    decimals. The synthetic-accessibility score is computed with
    ``sascorer`` and shown rounded to two decimals (and printed).
    """
    smiles = t_smiles.get()  # read here as in the original; not used below
    df = pd.DataFrame({
        'name': [t_name.get()],
        'smiles': [t_smiles.get()],
        'solubility': [0.00],
    })
    df.to_csv('tmp.csv')

    featurizer = dc.feat.graph_features.ConvMolFeaturizer()
    csv_loader = dc.data.data_loader.CSVLoader(tasks=['solubility'],
                                               smiles_field="smiles",
                                               id_field="name",
                                               featurizer=featurizer)
    predictset = csv_loader.featurize('tmp.csv')

    # Same order as before: solubility, lipophilicity, GWP100.
    for model, target in ((model_sol, t_sol),
                          (model_lip, t_lip),
                          (model_GWP100, t_GWP100)):
        prediction = model.predict(predictset)
        target.set(round(10**prediction[0][0], 3))

    PandasTools.AddMoleculeColumnToFrame(frame=df, smilesCol='smiles')
    sa_score = df.ROMol.map(sascorer.calculateScore)
    t_sasc.set(round(sa_score[0], 2))
    print(sa_score[0])
def pandas_structure(active_df):
    """Add a 'Smiles' column derived from 'inchi' and a 'Molecule' column.

    Each row's 'inchi' value is converted with ``inchi_smiles``; the
    resulting SMILES are then turned into molecule objects in a new
    'Molecule' column. The input frame is modified in place and returned.
    """

    def _row_to_smiles(row):
        return inchi_smiles(row['inchi'])

    active_df['Smiles'] = active_df.apply(_row_to_smiles, axis=1)
    PandasTools.AddMoleculeColumnToFrame(active_df, 'Smiles', 'Molecule')
    return active_df
def parse_text_file(file):
    """Read a CSV file and add a "Molecule" column built from its "SMILES" column.

    Returns the resulting dataframe.
    """
    table = pd.read_csv(file)
    PandasTools.AddMoleculeColumnToFrame(table,
                                         smilesCol="SMILES",
                                         molCol="Molecule")
    return table
def painthis(smidf, prop):
    """Plot a 50-bin histogram of one RDKit descriptor for the given SMILES.

    The descriptor values are stored in a new 'pr' column of ``smidf``
    (the frame is modified in place); the temporary 'ROMol' column is
    removed again before plotting.

    Parameters
    ----------
    smidf : pandas.DataFrame
        Frame with a "smiles" column.
    prop : str
        Name of a function in ``rdkit.Chem.Descriptors`` (used as the
        x-axis label as well).

    Raises
    ------
    AttributeError
        If ``prop`` is not an attribute of the Descriptors module.
    """
    # Imported locally: the original only mentioned the module inside a string.
    from rdkit.Chem import Descriptors

    # Series.map needs the callable itself; the previous code passed the
    # string 'Descriptors.<prop>', which is not callable.
    descriptor = getattr(Descriptors, prop)

    pt.AddMoleculeColumnToFrame(smidf, "smiles")
    # Rows without a molecule object are left as missing values.
    smidf['pr'] = smidf['ROMol'].map(descriptor, na_action='ignore')
    del smidf["ROMol"]
    ax = smidf['pr'].hist(bins=50)
    ax.set_xlabel(prop)
def get_dataframe_from_file(filename, mol_col="ROMol", smiles_col="SMILES"):
    """Determine file type from filename extension and produce a Pandas dataframe
    accordingly.

    Supported filename extensions: .sdf, .csv, .tsv, .smi

    If the file is an SDF, the structures are placed in the MOL_COL column.
    Otherwise, the SMILES strings in SMILES_COL are used to build the
    structures (in PandasTools' default molecule column).

    Parameters
    ----------
    filename: filepath string
        The name of the file from which to read the data; can be an
        absolute or relative path.
    mol_col: string containing column name
        In an SDF formatted file, the name of the column containing the structures
    smiles_col: string containing column name
        In a non-SDF formatted file, the name of the column containing the SMILES
        strings

    Returns
    -------
    df: Pandas dataframe
        A Pandas dataframe containing the data from FILENAME.

    Raises
    ------
    ValueError
        If FILENAME has no extension or an unsupported one.
    """
    logging.info(f'Reading {filename}')
    file_ext = pathlib.Path(filename).suffix

    if file_ext == ".sdf":
        ## Maintaining the standard parameters: /idName/ = 'ID', /includeFingerprints/ = False,
        ## /isomericSmiles/ = True, /embedProps/ = False,
        ## /removeHs/ = True, /strictParsing/ = True, /smilesName/ = None
        return PandasTools.LoadSDF(filename, molColName=mol_col)

    if file_ext in (".csv", ".tsv", ".smi"):
        # Only .tsv uses a tab; .csv and .smi are read comma-separated.
        sep = "\t" if file_ext == ".tsv" else ","
        df = pandas.read_csv(filename, sep=sep)
        # Generate structures from SMILES
        # Yields /None/ if conversion fails
        PandasTools.AddMoleculeColumnToFrame(df, smilesCol=smiles_col)
        return df

    # Previously both cases fell through to `return df` with `df` unbound.
    if file_ext == '':
        raise ValueError(
            f'Cannot determine file type of {filename!r}: no extension')
    raise ValueError(
        f'File type {file_ext!r} of {filename!r} is not supported; '
        'expected .sdf, .csv, .tsv or .smi')
def process_ligands(self, ligands):
    """Encode ligands according to ``self.drug_format``.

    * "labeled_smiles": every ligand is encoded with ``label_smiles`` using
      ``self.SMILEN`` and ``self.charsmiset``. ``ligands`` may be a mapping
      (iterated by key) or an object with ``shape`` that is indexed by
      position.
    * "mol2vec": ligands are treated as SMILES, converted to molecules,
      and the ones that could be parsed are embedded with the word2vec
      model loaded from ``self.mol2vec_model_path``.

    Any other format yields an empty list.

    :param ligands: ligand SMILES in one of the containers described above
    :return: list or array of encoded ligands
    """
    XD = []
    if self.drug_format == "labeled_smiles":
        # isinstance also covers OrderedDict subclasses and plain dicts.
        if isinstance(ligands, dict):
            iterator = ligands.keys()
        else:
            iterator = range(ligands.shape[0])
        XD = [
            label_smiles(ligands[d], self.SMILEN, self.charsmiset)
            for d in iterator
        ]
    elif self.drug_format == "mol2vec":
        from gensim.models import word2vec
        from mol2vec.features import (MolSentence, mol2alt_sentence,
                                      sentences2vec)
        from rdkit.Chem import PandasTools

        word2vec_model = word2vec.Word2Vec.load(self.mol2vec_model_path)
        df_ligands = pd.DataFrame({"smiles": ligands})
        PandasTools.AddMoleculeColumnToFrame(df_ligands, "smiles", "ROMol")
        # Copy the filtered rows so the new column is written to an
        # independent frame rather than to a slice of df_ligands.
        dtc_train = df_ligands[df_ligands["ROMol"].notnull()].copy()
        dtc_train.loc[:, "mol-sentence"] = dtc_train.apply(
            lambda x: MolSentence(
                mol2alt_sentence(x["ROMol"], self.mol2vec_radius)),
            axis=1,
        )
        XD = sentences2vec(dtc_train["mol-sentence"],
                           word2vec_model,
                           unseen="UNK")
    return XD
def get_molecules():
    """Load './data/smiles.csv' and add a 'molecule' column built from 'smiles'.

    Returns the resulting dataframe.
    """
    frame = pd.read_csv('./data/smiles.csv')
    PandasTools.AddMoleculeColumnToFrame(frame,
                                         smilesCol='smiles',
                                         molCol='molecule',
                                         includeFingerprints=False)
    return frame
def read_excel(
    urlpath: Union[str, os.PathLike, TextIO],
    sheet_name: Optional[Union[str, int, list]] = 0,
    smiles_column: str = None,
    mol_column: str = "mol",
    **kwargs,
) -> pd.DataFrame:
    """Load an excel file into a dataframe, optionally adding a molecule column.

    Args:
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        sheet_name: see `pandas.read_excel()` doc.
        smiles_column: Column holding SMILES from which the mol column is
            built. When None, no mol column is added.
        mol_column: Name given to the mol column (only used when
            `smiles_column` is set). Avoid building it for very large files.
        kwargs: Arguments to pass to `pd.read_excel()`.

    Returns:
        df: a `pandas.DataFrame`
    """
    frame = pd.read_excel(urlpath, sheet_name=sheet_name, **kwargs)  # type: ignore

    # Nothing more to do when no SMILES column was requested.
    if smiles_column is None:
        return frame

    PandasTools.AddMoleculeColumnToFrame(frame, smiles_column, mol_column)
    return frame
def grid_image(df, filename, molobj=True, smi='smiles'):
    """
    Draw the molecules of a dataframe as a 2D grid and save it as a PNG.

    The molecules are taken from the column "Molecule"; they are drawn three
    per row with 1-based running numbers as legends. The image is written to
    ``filename + '.png'`` (an existing file of that name is overwritten).
    Returns nothing.
    _____________________________
    Keyword Arguments:

    molobj=True, if the "Molecule" column already exists in df.
    smi='smiles', if molobj=False the "Molecule" column is first created
        from the column named by smi.
    """
    if not molobj:
        # No molecule column yet: build it from the SMILES column.
        PandasTools.AddMoleculeColumnToFrame(df,
                                             smi,
                                             'Molecule',
                                             includeFingerprints=True)

    # Legends are simply 1, 2, 3, ... in row order.
    numbering = [str(n) for n in range(1, len(df['Molecule']) + 1)]
    grid = PandasTools.FrameToGridImage(df,
                                        column='Molecule',
                                        molsPerRow=3,
                                        subImgSize=(800, 400),
                                        legends=numbering)
    grid.save(filename + '.png')
def fetch_learning_data(datasets,
                        datasets_cols=(),
                        bioacitivities_cols=('value', ),
                        compute_descriptors=False,
                        create_rdkit_mols=False,
                        col_names_map=(),
                        duplicates_handler=None):
    """Fetch bioactivity data for the given datasets into a DataFrame.

    :param datasets: values matched against the datasets table ``unique_id``
    :param datasets_cols: columns of the datasets table to include
    :param bioacitivities_cols: columns of the bioactivities table to include
    :param compute_descriptors: falsy to skip; an iterable of descriptor
        names to compute only those; any other truthy value (e.g. ``True``)
        to compute every descriptor in ``Descriptors.descList``
    :param create_rdkit_mols: if truthy, add an 'rdmol' molecule column
    :param col_names_map: mapping passed to ``DataFrame.rename(columns=...)``
    :param duplicates_handler: optional ``f(smiles, values)`` called for each
        SMILES occurring more than once; all rows of that SMILES are
        removed and, unless the handler returns a bool or a value equal to
        ``False``, a single row with the SMILES and the returned value is
        added back (its remaining columns are left empty)
    :return: pandas DataFrame with the requested columns
    """
    DB_CONNECTION, TB_COMPOUNDS, TB_DATASETS, TB_BIOACTIVITIES = db.fetch_all()
    session = sessionmaker(bind=DB_CONNECTION)()
    try:
        cols = _gather_columns(TB_BIOACTIVITIES, bioacitivities_cols)
        cols.extend(_gather_columns(TB_DATASETS, datasets_cols))
        cols.append(TB_COMPOUNDS.c.smiles)
        query = session.query(
            *cols
        ).join(TB_COMPOUNDS).join(TB_DATASETS)\
            .filter(
            TB_DATASETS.c.unique_id.in_(datasets)
        )

        # make the DB query and export the data to pandas DataFrame object
        data = pandas.read_sql_query(query.selectable, DB_CONNECTION)
    finally:
        # The session is only needed to build the query.
        session.close()

    smiles_col_name = settings.COMPOUNDS_TABLE + '_smiles'
    ic50_col_name = settings.BIOACTIVITIES_TABLE + '_value'

    # remove duplicate values
    if duplicates_handler:
        duplicates = set(
            data[smiles_col_name][data[smiles_col_name].duplicated()])
        for smiles in duplicates:
            is_duplicate = data[smiles_col_name] == smiles
            duplicate_ic50s = data[is_duplicate][ic50_col_name]
            ret = duplicates_handler(smiles, duplicate_ic50s)
            data = data[~is_duplicate]
            if not isinstance(ret, bool) and ret != False:
                # DataFrame.update only overwrites existing index labels, so
                # it could not add the resolved row back; append it instead.
                resolved = pandas.DataFrame(
                    [[smiles, ret]],
                    columns=[smiles_col_name, ic50_col_name])
                data = pandas.concat([data, resolved], ignore_index=True)

    if compute_descriptors:
        desc_list = Descriptors.descList
        try:
            desc_list = [x for x in desc_list if x[0] in compute_descriptors]
        except TypeError:
            # Not a container of names (e.g. True): keep every descriptor.
            pass
        # Parse each SMILES once and reuse the molecule for all descriptors.
        mols = [MolFromSmiles(smiles) for smiles in data[smiles_col_name]]
        for desc_name, function in desc_list:
            data[desc_name] = [
                function(mol) if mol is not None else None for mol in mols
            ]

    if create_rdkit_mols:
        PandasTools.AddMoleculeColumnToFrame(data, smiles_col_name, 'rdmol')

    if col_names_map:
        data.rename(columns=col_names_map, inplace=True)

    return data
def query_to_df(queryset):
    """Turn a queryset into a dataframe with an added 'ROMol' column.

    The rows come from ``queryset.values()``; the 'id' column is dropped and
    the molecules are built from the 'Smiles' column.
    """
    rows = list(queryset.values())
    compounds_df = pd.DataFrame(rows).drop('id', axis=1)
    PandasTools.AddMoleculeColumnToFrame(compounds_df,
                                         smilesCol='Smiles',
                                         molCol='ROMol',
                                         includeFingerprints=True)
    return compounds_df
def initial_population(self) -> None:
    """Seed the archive with molecules sampled from ``self.data_file``.

    The CSV is read, molecules are built from its 'smiles' column, and
    ``self.initial_size`` of them are sampled. The sample is passed through
    ``self.unique_molecules`` and ``self.arbiter``, processed with
    ``self.process_molecules`` and the results are added to ``self.archive``.
    """
    table = pd.read_csv(self.data_file)
    pdtl.AddMoleculeColumnToFrame(table, 'smiles', 'molecule')

    sampled = table['molecule'].sample(n=self.initial_size).tolist()
    accepted = self.arbiter(self.unique_molecules(sampled))

    molecules, descriptors, fitnesses = self.process_molecules(accepted)
    self.archive.add_to_archive(molecules, descriptors, fitnesses)
    return None
def sn_scaff_smiles(self, murcko_smiles):
    """Extract the preferred scaffold, based on Scaffold Tree rules, from the
    scaffold network created from a Murcko scaffold.

    Args:
        murcko_smiles(str): valid smiles string of a Murcko scaffold, or None

    Returns:
        str: smiles string of the preferred scaffold; None if the input is
            None; the input itself if it has at most ``self.nrings_target``
            rings.

    Raises:
        ValueError: if the smiles cannot be read by rdkit or the scaffold
            network cannot be calculated.
    """
    if murcko_smiles is None:
        return None

    mol = Chem.MolFromSmiles(murcko_smiles)
    if mol is None:
        raise ValueError(
            "murcko_smiles {} cannot be read by rdkit".format(murcko_smiles))

    # if the murcko scaffold has less or equal than the targeted number of
    # rings, then the Murcko scaffold is already the sn_scaffold, so no
    # further decomposition is needed
    if Chem.rdMolDescriptors.CalcNumRings(mol) <= self.nrings_target:
        return murcko_smiles

    # otherwise start decomposition
    try:
        sn = rdScaffoldNetwork.CreateScaffoldNetwork([mol], self.snparams)
    except Exception as err:
        # Narrowed from a bare except (which also caught KeyboardInterrupt);
        # the original error is kept as the cause.
        raise ValueError("failed to calculate scaffold network for {}".format(
            murcko_smiles)) from err

    # create data frame with node smiles and the properties used for ranking
    node_df = pd.DataFrame({"node_smiles": [str(n) for n in sn.nodes]})
    PandasTools.AddMoleculeColumnToFrame(node_df,
                                         "node_smiles",
                                         "mol",
                                         includeFingerprints=False)
    node_df["num_rings"] = node_df["mol"].apply(
        Chem.rdMolDescriptors.CalcNumRings)
    node_df["num_rings_delta"] = (node_df["num_rings"] -
                                  self.nrings_target).abs()
    node_df["num_rbonds"] = node_df["mol"].apply(
        Chem.rdMolDescriptors.CalcNumRotatableBonds, strict=False)
    node_df["num_hrings"] = node_df["mol"].apply(
        Chem.rdMolDescriptors.CalcNumHeterocycles)
    node_df["num_arings"] = node_df["mol"].apply(
        Chem.rdMolDescriptors.CalcNumAromaticRings)
    node_df["num_bridge"] = node_df["mol"].apply(
        Chem.rdMolDescriptors.CalcNumBridgeheadAtoms)
    node_df["num_spiro"] = node_df["mol"].apply(
        Chem.rdMolDescriptors.CalcNumSpiroAtoms)
    node_df["has_macrocyle"] = node_df["mol"].apply(self.has_macrocycle)
    node_df["has_unusual_ring_size"] = node_df["mol"].apply(
        self.has_unusual_ringsize)

    # The best node is the first one after sorting by the priority columns.
    node_df.sort_values(self.priority_cols,
                        ascending=self.priority_asc,
                        inplace=True)
    return node_df.iloc[0]["node_smiles"]
def make_input():
    """Combine the active and decoy SMILES files into 'dude_ace.csv'.

    'actives_final.ism' (columns SMILES, ID, ChEMBL_ID) and
    'decoys_final.ism' (columns SMILES, ID) are read as space-separated,
    header-less tables. Actives get ``is_active`` = 1, decoys 0, and the
    columns SMILES, ID and is_active of both are written, actives first,
    without the index.
    """
    active_df = pd.read_csv("actives_final.ism", header=None, sep=" ")
    active_df.columns = ["SMILES", "ID", "ChEMBL_ID"]
    active_df["label"] = ["Active"] * active_df.shape[0]
    PandasTools.AddMoleculeColumnToFrame(active_df, "SMILES", "MOL")

    decoy_df = pd.read_csv("decoys_final.ism", header=None, sep=" ")
    decoy_df.columns = ["SMILES", "ID"]
    decoy_df["label"] = ["Decoy"] * decoy_df.shape[0]
    PandasTools.AddMoleculeColumnToFrame(decoy_df, "SMILES", "MOL")

    active_df["is_active"] = [1] * active_df.shape[0]
    decoy_df["is_active"] = [0] * decoy_df.shape[0]

    # DataFrame.append was removed in pandas 2.0; concat gives the same rows.
    combined_df = pd.concat([active_df, decoy_df])[["SMILES", "ID", "is_active"]]
    combined_df.to_csv("dude_ace.csv", index=False)
def test_AddMoleculeColumnToFrame(self):
    """The 'ROMol' column can be rebuilt after it has been deleted."""
    sdf_stream = getStreamIO(methane + peroxide)
    frame = PandasTools.LoadSDF(sdf_stream,
                                isomericSmiles=True,
                                smilesName='Smiles')
    PandasTools.ChangeMoleculeRendering(frame=frame, renderer='String')

    # Remove the molecule column and check it is really gone ...
    del frame['ROMol']
    self.assertNotIn('ROMol', str(frame))

    # ... then rebuild it and check it is back.
    PandasTools.AddMoleculeColumnToFrame(frame, includeFingerprints=False)
    self.assertIn('ROMol', str(frame))
def add_mol_column(df, smiles_col, molecule_col='mol'):
    """
    Build molecule objects from the SMILES strings in column 'smiles_col' of
    data frame 'df' and store them in column 'molecule_col'.
    The frame is modified in place and returned.
    """
    PandasTools.AddMoleculeColumnToFrame(frame=df,
                                         smilesCol=smiles_col,
                                         molCol=molecule_col,
                                         includeFingerprints=True)
    return df
def update_sdf():
    """Export all ``Compound`` records to 'media/all_data.sdf'.

    The records are loaded into a dataframe without the 'id', 'created_at'
    and 'updated_at' columns, a 'ROMol' column is built from 'Smiles', the
    'media' directory is created if missing and the frame is handed to
    ``df_to_sdf``.
    """
    records = list(Compound.objects.all().values())
    compounds_df = pd.DataFrame(records).drop(
        ['id', 'created_at', 'updated_at'], axis=1)
    PandasTools.AddMoleculeColumnToFrame(compounds_df,
                                         smilesCol='Smiles',
                                         molCol='ROMol',
                                         includeFingerprints=True)

    media_dir = 'media'
    if not os.path.exists(media_dir):
        os.makedirs(media_dir)
    df_to_sdf(compounds_df, 'media/all_data.sdf')
def update_sdf():
    """Write all ``Compound`` records to 'media/all_data.sdf'.

    Nothing is written when there are no records. Otherwise the 'id',
    'created_at' and 'updated_at' columns are dropped, a 'ROMol' column is
    built from 'Smiles', the 'media' directory is created if missing and
    the frame is written with 'PID' as the molecule identifier and every
    column as an SDF property.
    """
    compounds_df = pd.DataFrame(list(Compound.objects.all().values()))

    # The previous check `not compounds_df.isnull` tested a bound method,
    # which is always truthy, so the export below never ran.
    if compounds_df.empty:
        return

    compounds_df = compounds_df.drop(['id', 'created_at', 'updated_at'],
                                     axis=1)
    PandasTools.AddMoleculeColumnToFrame(compounds_df,
                                         'Smiles',
                                         'ROMol',
                                         includeFingerprints=True)
    if not os.path.exists('media'):
        os.makedirs('media')
    with open('media/all_data.sdf', 'w') as fi:
        PandasTools.WriteSDF(compounds_df,
                             fi,
                             molColName='ROMol',
                             idName='PID',
                             properties=list(compounds_df.columns))
def add_mol_to_frame(self):
    """
    Builds the ``ROMol`` molecule column of the internal
    :py:class:`pandas.Dataframe` from its ``smiles`` column.

    :return: None
    """
    frame = self._data
    PandasTools.AddMoleculeColumnToFrame(frame,
                                         smilesCol="smiles",
                                         molCol="ROMol",
                                         includeFingerprints=False)
    # NOTE(review): the result of this apply is discarded, so it does not
    # change the frame; kept as in the original - confirm whether the
    # intent was to assign it back.
    frame["ROMol"].apply(lambda mol: mol[0])
def molgrid_image(smiles, file_name, labels=None, molPerRow=5):
    """Draw the given SMILES as an SVG grid and save it.

    :param smiles: iterable of SMILES strings
    :param file_name: output path without extension; '.svg' is appended
    :param labels: legends for the molecules; defaults to the running
        index of each molecule
    :param molPerRow: number of molecules per grid row
    :return: None
    """
    df = pd.DataFrame({'smiles': smiles})
    PandasTools.AddMoleculeColumnToFrame(df, 'smiles', 'mol')
    if labels is None:
        labels = ['{:d}'.format(i) for i in df.index]
    # Honour the caller's molPerRow; it used to be ignored in favour of a
    # hard-coded 5.
    svg = Draw.MolsToGridImage(df['mol'],
                               molsPerRow=molPerRow,
                               legends=labels,
                               useSVG=True)
    save_svg(svg, file_name + '.svg', dpi=150)
    return
def compute_unique_smiles(self, interp_df, embeddings, embedding_funct, scaled_radius=0.5):
    """
    Identify duplicate SMILES and distort the corresponding embeddings.

    The input df must have columns 'SMILES' and 'Generated' at 0th and 1st
    position. The 'Generated' column must contain booleans classifying SMILES
    into input SMILES (False) and generated SMILES (True).

    This function does not make any assumptions about the order of
    embeddings. Instead it simply orders the df by SMILES to identify the
    duplicates.

    Two passes of at most five attempts each are made: the first adds jitter
    to the embeddings of duplicated generated SMILES, the second to the
    embeddings of rows whose SMILES yields no molecule. ``embeddings`` is
    modified in place. The returned frame has the temporary 'ROMol' column
    removed.
    """
    # Jitter magnitude derived from the scaled radius.
    distance = self._compute_radius(scaled_radius)

    for i in range(5):
        smiles = interp_df['SMILES'].sort_values()
        duplicates = set()
        # After sorting, equal SMILES are adjacent; collect the index labels
        # of both members of every equal neighbouring pair.
        for idx in range(0, smiles.shape[0] - 1):
            if smiles.iat[idx] == smiles.iat[idx + 1]:
                duplicates.add(smiles.index[idx])
                duplicates.add(smiles.index[idx + 1])

        if len(duplicates) > 0:
            for dup_idx in duplicates:
                # NOTE(review): dup_idx is an index label but is used here
                # as a position (.iat); equivalent only for a default
                # 0..n-1 index - confirm.
                if interp_df.iat[dup_idx, 1]:
                    # add jitter to generated molecules only
                    embeddings[dup_idx] = self.addjitter(
                        embeddings[dup_idx], distance, 1)
            # NOTE(review): the regenerated SMILES are bound to a local and
            # not written back to interp_df here, so the next attempt
            # appears to re-read the same 'SMILES' column - confirm whether
            # embedding_funct updates interp_df itself.
            smiles = embedding_funct(embeddings)
        else:
            break

    # Ensure all generated molecules are valid.
    for i in range(5):
        PandasTools.AddMoleculeColumnToFrame(interp_df,'SMILES')
        # Rows for which no molecule object could be built.
        invalid_mol_df = interp_df[interp_df['ROMol'].isnull()]

        if not invalid_mol_df.empty:
            invalid_index = invalid_mol_df.index.to_list()
            for idx in invalid_index:
                embeddings[idx] = self.addjitter(embeddings[idx], distance, cnt=1)
            # NOTE(review): as above, the result is not stored in interp_df.
            smiles = embedding_funct(embeddings)
        else:
            break

    # Cleanup
    if 'ROMol' in interp_df.columns:
        # drop() returns a new frame; the caller's object keeps 'ROMol'.
        interp_df = interp_df.drop('ROMol', axis=1)

    return interp_df
def get_all(self):
    """Show structure and predicted properties for ``molecule[0]``.

    The output fields are cleared, the isomeric SMILES of the first entry
    of ``molecule`` is shown and drawn on the canvas (via 'tmp.png'), and
    the solubility and lipophilicity models are run on a one-row
    'tmp.csv'; each prediction is shown as ``10 ** value`` rounded to
    three decimals. The synthetic-accessibility score is shown rounded to
    two decimals and printed.
    """
    # Kept in a module-level name so the image object stays referenced.
    global image_

    for field in (t2, t_sol, t_lip, t_sasc):
        field.set('')

    print('molecule')
    print(molecule[0])
    #print('canocical_smile', molecule[0].canonical_smiles)
    print('isomeric_smile', molecule[0].isomeric_smiles)
    mol_canonical_smiles = molecule[0].canonical_smiles  # read but not used below
    mol_isomeric_smiles = molecule[0].isomeric_smiles
    t2.set(mol_isomeric_smiles)

    # Render the structure to a file and place it on the canvas.
    mol_ = Chem.MolFromSmiles(mol_isomeric_smiles)
    Draw.MolToFile(mol_, 'tmp.png')
    image_open = Image.open('tmp.png')
    image_ = ImageTk.PhotoImage(image_open, master=frame1)
    canvas.create_image(150, 75, image=image_)

    smiles = t2.get()  # read but not used below
    df = pd.DataFrame({
        'name': [t1.get()],
        'smiles': [t2.get()],
        'solubility': [0.00],
    })
    df.to_csv('tmp.csv')

    featurizer = dc.feat.graph_features.ConvMolFeaturizer()
    csv_loader = dc.data.data_loader.CSVLoader(tasks=['solubility'],
                                               smiles_field="smiles",
                                               id_field="name",
                                               featurizer=featurizer)
    predictset = csv_loader.featurize('tmp.csv')

    # Same order as before: solubility, then lipophilicity.
    for model, target in ((model_sol, t_sol), (model_lip, t_lip)):
        prediction = model.predict(predictset)
        target.set(round(10**prediction[0][0], 3))

    PandasTools.AddMoleculeColumnToFrame(frame=df, smilesCol='smiles')
    sa_score = df.ROMol.map(sascorer.calculateScore)
    t_sasc.set(round(sa_score[0], 2))
    print(sa_score[0])
def get_most_common_fragments(fragments, top_x=50):
    """
    Select the most frequent fragments, counted by SMILES.

    Parameters
    ----------
    fragments : pandas.DataFrame
        Fragment details with a `smiles` column.
    top_x : int
        Number of most common fragments to select.

    Returns
    -------
    pandas.DataFrame
        Most common fragments sorted by descending count, with columns
        `smiles`, `fragment_count` and `ROMol`; the index is named
        `molecule_id`. Fragments tied with the last selected count are
        included as well, so more than `top_x` rows may be returned.
    """
    # Occurrences per SMILES as a two-column frame
    counts = (fragments.smiles.value_counts()
              .rename_axis("smiles")
              .reset_index(name="fragment_count"))
    PandasTools.AddMoleculeColumnToFrame(counts, "smiles")

    # Highest counts first, with a fresh running index as molecule ID
    counts = counts.sort_values("fragment_count",
                                ascending=False).reset_index(drop=True)
    counts.index.name = "molecule_id"

    # Fewer fragments than requested: return them all
    if counts.shape[0] < top_x:
        return counts

    # Count of the last fragment inside the top X; every fragment with at
    # least this count is kept so that ties are not split
    cutoff = counts.iloc[top_x - 1].fragment_count
    return counts[counts.fragment_count >= cutoff]
def readProjectData(filename, FP, smilesCol):
    """Read the project data and calculate fingerprints.

    :param filename: CSV file with the project data
    :param FP: fingerprint type; only 'Morgan2' is implemented
    :param smilesCol: name of the column holding the SMILES
    :return: dataframe restricted to rows with a molecule object, with
        the added columns 'Molecule' and 'FP'; None (after printing a
        message) if ``FP`` is not implemented
    """
    df_proj = pd.read_csv(filename)
    PandasTools.AddMoleculeColumnToFrame(df_proj,
                                         smilesCol=smilesCol,
                                         molCol='Molecule')
    # Keep only rows with a molecule; copy so that the FP column below is
    # added to an independent frame rather than to a slice.
    df_proj = df_proj.loc[df_proj['Molecule'].map(
        lambda x: x is not None)].copy()

    if FP != 'Morgan2':
        print(FP, ' fingerprint not implemented.')
        return None

    df_proj['FP'] = df_proj.Molecule.map(
        lambda x: AllChem.GetMorganFingerprint(x, 2))
    return df_proj
def compile_filters(self):
    """Apply every configured filter and keep the compounds passing all of them.

    Each ``limit_*`` helper is called with ``self.df`` and its bounds from
    ``self.command``; the results are intersected into
    ``self.filtered_inchi``. ``self.filtered_df`` holds the rows of
    ``self.df`` whose 'InChI Keys' value is in that intersection, with an
    added 'Molecule picture' column built from 'smiles'.

    :return: the result of ``self.frame_manage()``
    """
    df = self.df
    cmd = self.command

    RS_inchi = self.limit_RS(df, cmd['RS_min'], cmd['RS_max'])
    MW_inchi = self.limit_MW(df, cmd['MW_min'], cmd['MW_max'])
    nRing_inchi = self.limit_nRing(df, cmd['nRing_min'], cmd['nRing_max'])
    Lipinski_inchi = self.limit_Lipinski(df, cmd['Lipinski'])
    nG12Ring_inchi = self.limit_nG12Ring(df, cmd['nG12Ring_min'],
                                         cmd['nG12Ring_max'])
    SlogP_inchi = self.limit_SlogP(df, cmd['SlogP_min'], cmd['SlogP_max'])
    # The upper bound used to be 'nSugars_min' as well; every other filter
    # takes its *_min / *_max pair.
    # NOTE(review): assumes self.command has an 'nSugars_max' key like the
    # other filters - confirm.
    Sugars_inchi = self.limit_nSugars(df, cmd['nSugars_min'],
                                      cmd['nSugars_max'])
    nFRing_inchi = self.limit_nFusedRing(df, cmd['nFRing_min'],
                                         cmd['nFRing_max'])
    core_ester_inchi = self.limit_core_ester(df, cmd['core_ester_min'],
                                             cmd['core_ester_max'])
    naRing_inchi = self.limit_naRing(df, cmd['naRing_min'],
                                     cmd['naRing_max'])
    activity_reported_inchi = self.limit_activity_reported(
        df, cmd['activity_reported'])

    sets = [
        RS_inchi, MW_inchi, nRing_inchi, Lipinski_inchi, nG12Ring_inchi,
        SlogP_inchi, Sugars_inchi, nFRing_inchi, core_ester_inchi,
        naRing_inchi, activity_reported_inchi
    ]
    self.filtered_inchi = list(set.intersection(*sets))

    # Copy the selection so the molecule column is added to an independent
    # frame rather than to a slice of self.df.
    self.filtered_df = self.df.loc[self.df['InChI Keys'].isin(
        self.filtered_inchi)].copy()
    PandasTools.AddMoleculeColumnToFrame(self.filtered_df, 'smiles',
                                         'Molecule picture')

    smiles_frame = self.frame_manage()
    return smiles_frame