def test_parallelized(self):
    def fn(x):
        return x**2

    results = dm.parallelized(
        fn,
        [{"x": i} for i in range(10)],
        scheduler="processes",
        n_jobs=None,
        arg_type="kwargs",
        progress=True,
    )
    assert results == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

    results = dm.parallelized(
        fn,
        [[i] for i in range(10)],
        scheduler="processes",
        n_jobs=None,
        arg_type="args",
        progress=True,
    )
    assert results == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

    results = dm.parallelized(
        fn,
        [i for i in range(10)],
        scheduler="processes",
        n_jobs=None,
        progress=False,
    )
    assert results == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
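# A minimal serial sketch of the semantics exercised above (hypothetical
# helper, not part of datamol): `arg_type` only changes how each work item is
# unpacked before calling `fn`; results come back in input order.
def _serial_reference(fn, inputs, arg_type=None):
    if arg_type == "kwargs":
        return [fn(**item) for item in inputs]  # items are dicts
    if arg_type == "args":
        return [fn(*item) for item in inputs]  # items are sequences
    return [fn(item) for item in inputs]  # items are bare values

assert _serial_reference(lambda x: x**2, [{"x": i} for i in range(10)], "kwargs") == [i**2 for i in range(10)]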
def pdist(
    mols: List[Chem.rdchem.Mol],
    n_jobs: Optional[int] = 1,
    **fp_args,
) -> Tuple[np.ndarray, np.ndarray]:
    """Compute the pairwise Tanimoto distance between the fingerprints of all the
    molecules in the input set.

    Args:
        mols: list of molecules.
        n_jobs: Number of jobs for parallelization. Set to 1 for no
            parallelization. Set to None to use all available cores.
        **fp_args: Keyword args to pass to `to_fp()`.

    Returns:
        distmat, valid_idx: Distance matrix, and the indices of the molecules
            that passed the conversion to fingerprint.
    """

    fps = dm.parallelized(
        functools.partial(dm.to_fp, as_array=False, **fp_args),
        mols,
        n_jobs=n_jobs,
    )

    # Keep only the molecules whose fingerprint conversion succeeded.
    valid_idx, fps = zip(*[(i, fp) for i, fp in enumerate(fps) if fp is not None])
    fps = list(fps)

    # Build the full symmetric matrix from the condensed upper triangle.
    dist = GetTanimotoDistMat(fps)
    dist_mat = np.zeros((len(fps), len(fps)))
    dist_mat[np.triu_indices_from(dist_mat, 1)] = dist
    dist_mat += dist_mat.T

    return dist_mat, np.array(valid_idx)
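# A hedged usage sketch for pdist, assuming datamol is importable as `dm` and
# the default fingerprint settings of `dm.to_fp` are acceptable; the SMILES
# below are arbitrary examples.
import datamol as dm

mols = [dm.to_mol(s) for s in ["CCO", "CCN", "c1ccccc1"]]
dist_mat, valid_idx = pdist(mols, n_jobs=1)
assert dist_mat.shape == (len(valid_idx), len(valid_idx))  # symmetric, zero diagonal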
def ingest_chembl_smi(smi_path, smiles_column, canonical_id_column, activity_column):
    """Convert an smi file with a smiles column to a molchunk.

    It is assumed that the SMI file has been cleaned (no header, and other
    columns have been removed).

    Args:
        smi_path: path to the smi file.
        smiles_column: column where the SMILES are located: f0 = col 1,
            f1 = col 2, etc.
        canonical_id_column: name/id for the molecule: f0 = col 1,
            f1 = col 2, etc.
        activity_column: column where the bioactivity is listed (ki, ec50,
            etc.): f0 = col 1, f1 = col 2, etc.
    """
    # Set the multithreaded read options that pyarrow allows for. Since the
    # file has no header, pyarrow autogenerates the column names f0, f1, ...
    opts = pa.csv.ReadOptions(use_threads=True, autogenerate_column_names=True)

    # Tell pyarrow that the columns in the smi file are separated by a space.
    # For tab-separated files use '\t'; for comma-separated files use ','.
    parse_options = pa.csv.ParseOptions(delimiter=' ')

    # Read the file into a pyarrow table. This is a columnar dataset; note how
    # the options defined above are passed in.
    table = pa.csv.read_csv(smi_path, opts, parse_options)

    # Convert the pyarrow table into a pandas dataframe. We could have done
    # this without arrow, but arrow grants us very powerful tools later on.
    df_new = table.to_pandas()

    # In a cleaned smi file the SMILES sit in the first column, so use the
    # autogenerated name f0.
    smiles_column = 'f0'

    # Run the initial mapper on the smiles column to generate basic
    # information and a fingerprint on bits.
    df_clean_mapped = dm.parallelized(
        _preprocess,
        list(df_new.iterrows()),
        arg_type='args',
        progress=True,
    )
    df_clean_mapped = pd.DataFrame(df_clean_mapped)

    # Rename the columns.
    df_clean_mapped['smiles'] = df_clean_mapped[smiles_column]
    df_clean_mapped['canonical_id'] = df_clean_mapped[canonical_id_column]
    df_clean_mapped['ki'] = df_clean_mapped[activity_column]

    # Delete the old columns.
    del df_clean_mapped['f2']
    del df_clean_mapped['f1']
    del df_clean_mapped['f0']

    # Remove duplicated standard SMILES and reindex.
    duplicateRowsDF2 = df_clean_mapped[df_clean_mapped.duplicated(['standard_smiles'])]
    print("Duplicate rows based on a single column are:", duplicateRowsDF2, sep='\n')
    df_clean_mapped = df_clean_mapped.drop_duplicates(subset='standard_smiles', keep="first", inplace=False)
    df = df_clean_mapped.reset_index(drop=True)

    return df
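# `_preprocess` is not defined in this excerpt. A minimal sketch of the row
# mapper it is assumed to be, modeled on datamol's preprocessing tutorial
# (the helper name and the exact standardization flags are assumptions):
def _preprocess_sketch(i, row):
    # Reads the module-level `smiles_column` set before dm.parallelized runs.
    mol = dm.to_mol(row[smiles_column], ordered=True)
    mol = dm.fix_mol(mol)
    mol = dm.sanitize_mol(mol, sanifix=True, charge_neutral=False)
    mol = dm.standardize_mol(
        mol,
        disconnect_metals=False,
        normalize=True,
        reionize=True,
        uncharge=False,
        stereo=True,
    )
    row["standard_smiles"] = dm.standardize_smiles(dm.to_smiles(mol))
    return row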
def pick_diverse(
    mols: List[Chem.rdchem.Mol],
    npick: int,
    initial_picks: List[int] = None,
    feature_fn: Callable = None,
    dist_fn: Callable = None,
    seed: int = 42,
    n_jobs: Optional[int] = 1,
):
    r"""Pick a set of diverse molecules based on their fingerprints.

    Args:
        mols: a list of molecules.
        npick: Number of elements to pick from mols, including the preselection.
        initial_picks: Starting list of indices for molecules that should be in
            the set of picked molecules. Default to None.
        feature_fn: A feature function that takes a Chem.rdchem.Mol object and
            returns molecular features. By default, `dm.to_fp()` is used.
            Default to None.
        dist_fn: A function that takes two indexes (i,j) and returns the
            distance between them. You might use partial to set the
            fingerprints as input. By default, the Tanimoto similarity will be
            used. Default to None.
        seed: seed for reproducibility.
        n_jobs: Number of jobs for parallelization. Set to 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        picked_inds: indices of the molecules that have been picked.
        mols: molecules that have been picked.
    """

    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    features = dm.parallelized(feature_fn, mols, n_jobs=n_jobs)

    def distij(i, j, features=features):
        return 1.0 - DataStructs.TanimotoSimilarity(features[i], features[j])

    if dist_fn is None:
        dist_fn = distij

    picker = MaxMinPicker()
    initial_picks = [] if initial_picks is None else initial_picks
    picked_inds = picker.LazyPick(dist_fn, len(mols), npick, firstPicks=initial_picks, seed=seed)
    picked_inds = np.array(picked_inds)
    picked_mols = [mols[x] for x in picked_inds]

    return picked_inds, picked_mols
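# Hedged usage sketch: pick 2 diverse molecules from a toy set (assumes
# datamol is importable as `dm`; SMILES chosen arbitrarily).
import datamol as dm

mols = [dm.to_mol(s) for s in ["CCO", "CCCCO", "c1ccccc1", "c1ccccc1O"]]
inds, picks = pick_diverse(mols, npick=2, seed=42)
print(inds, [dm.to_smiles(m) for m in picks])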
def assign_to_centroids(
    mols: List[Chem.rdchem.Mol],
    centroids: List[Chem.rdchem.Mol],
    feature_fn: Callable = None,
    dist_fn: Callable = None,
    n_jobs: Optional[int] = 1,
):
    r"""Assign molecules to centroids. Each molecule will be assigned to the
    closest centroid.

    Args:
        mols: a list of molecules to assign to centroids.
        centroids: list of molecules to use as centroids.
        feature_fn: A feature function that takes a Chem.rdchem.Mol object and
            returns molecular features. By default, `dm.to_fp()` is used.
            Default to None.
        dist_fn: A function that takes two indexes (i,j) and returns the
            distance between them. You might use partial to set the
            fingerprints as input. By default, the Tanimoto similarity will be
            used. Default to None.
        n_jobs: Number of jobs for parallelization. Set to 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        clusters_map: dict mapping each centroid index to the indices of the
            molecules in its cluster.
        clusters_list: list of all molecules in each cluster. The cluster index
            follows the index of the centroid. Note that the centroid molecule
            is not added to the cluster.
    """

    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    # Featurize the queries and the centroids together so that one index space
    # covers both: [0, len(mols)) are queries, the rest are centroids.
    all_mols = [x for x in mols] + [c for c in centroids]
    features = dm.parallelized(feature_fn, all_mols, n_jobs=n_jobs)

    def distij(i, j, features=features):
        return 1.0 - DataStructs.TanimotoSimilarity(features[int(i)], features[int(j)])

    if dist_fn is None:
        dist_fn = distij

    clusters_map = ddict(list)
    clusters_list = [[] for _ in centroids]
    query_inds = np.expand_dims(np.arange(len(mols), dtype=int), axis=1)
    centroid_inds = np.expand_dims(np.arange(len(centroids), dtype=int), axis=1) + len(mols)
    # Use dist_fn (not distij directly) so a user-supplied distance is honored.
    dist_mat = distance.cdist(query_inds, centroid_inds, metric=dist_fn)
    closest = np.argmin(dist_mat, axis=1)

    for ind, cluster_ind in enumerate(closest):  # type: ignore
        clusters_map[cluster_ind].append(ind)
        clusters_list[cluster_ind].append(mols[ind])

    return clusters_map, clusters_list
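# Hedged usage sketch: assign a toy set to two centroids (assumes datamol as
# `dm`; SMILES chosen arbitrarily, so the exact assignment may vary with the
# fingerprint settings).
import datamol as dm

mols = [dm.to_mol(s) for s in ["CCO", "CCCO", "c1ccccc1", "Cc1ccccc1"]]
centroids = [dm.to_mol("CO"), dm.to_mol("c1ccccc1")]
cmap, clist = assign_to_centroids(mols, centroids)
print(dict(cmap))  # e.g. {0: [0, 1], 1: [2, 3]}: aliphatic vs aromatic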
def cdist(
    mols1: List[Chem.rdchem.Mol],
    mols2: List[Chem.rdchem.Mol],
    n_jobs: Optional[int] = 1,
    **fp_args,
) -> np.ndarray:
    """Compute the pairwise Tanimoto distance between the fingerprints of each
    pair of molecules from the two collections of inputs.

    Args:
        mols1: list of molecules.
        mols2: list of molecules.
        n_jobs: Number of jobs for parallelization. Set to 1 for no
            parallelization. Set to None to use all available cores.
        **fp_args: Keyword args to pass to `to_fp()`.

    Returns:
        distmat
    """

    fps1 = dm.parallelized(
        functools.partial(dm.to_fp, as_array=True, **fp_args),
        mols1,
        n_jobs=n_jobs,
    )
    fps2 = dm.parallelized(
        functools.partial(dm.to_fp, as_array=True, **fp_args),
        mols2,
        n_jobs=n_jobs,
    )

    fps1 = np.array(fps1)
    fps2 = np.array(fps2)

    # On binary fingerprint arrays the Jaccard distance equals the Tanimoto distance.
    dist_mat = distance.cdist(fps1, fps2, metric="jaccard")

    return dist_mat
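# Hedged usage sketch for cdist (assumes datamol as `dm`; SMILES chosen
# arbitrarily): the returned matrix has one row per molecule in mols1 and one
# column per molecule in mols2.
import datamol as dm

mols1 = [dm.to_mol(s) for s in ["CCO", "c1ccccc1"]]
mols2 = [dm.to_mol(s) for s in ["CCN", "c1ccccc1O"]]
dist = cdist(mols1, mols2)
assert dist.shape == (len(mols1), len(mols2))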
def cluster_mols(
    mols: List[Chem.rdchem.Mol],
    cutoff: float = 0.2,
    feature_fn: Callable = None,
    n_jobs: Optional[int] = 1,
):
    """Cluster a set of molecules using the Butina clustering algorithm and a
    given distance cutoff.

    Args:
        mols: a list of molecules.
        cutoff: Cutoff for the clustering. Default to 0.2.
        feature_fn: A feature function that takes a Chem.rdchem.Mol object and
            returns molecular features. By default, `dm.to_fp()` is used.
            Default to None.
        n_jobs: Number of jobs for parallelization. Set to 1 for no
            parallelization. Set to None to use all available cores.
    """

    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    features = dm.parallelized(feature_fn, mols, n_jobs=n_jobs)

    # Build the condensed lower-triangle distance list that Butina expects.
    dists = []
    n_mols = len(mols)
    for i in range(1, n_mols):
        dist = DataStructs.BulkTanimotoSimilarity(features[i], features[:i], returnDistance=True)
        dists.extend([x for x in dist])

    # Now cluster the data.
    cluster_indices = Butina.ClusterData(dists, n_mols, cutoff, isDistData=True)
    cluster_mols = [operator.itemgetter(*cluster)(mols) for cluster in cluster_indices]

    # Make single-mol clusters a list.
    cluster_mols = [[c] if isinstance(c, Chem.rdchem.Mol) else c for c in cluster_mols]

    return cluster_indices, cluster_mols
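# Hedged usage sketch: Butina clustering on a toy set (assumes datamol as
# `dm`; with a loose cutoff the two alcohols may co-cluster, but the exact
# grouping depends on the fingerprint settings).
import datamol as dm

mols = [dm.to_mol(s) for s in ["CCO", "CCCO", "c1ccccc1"]]
indices, clusters = cluster_mols(mols, cutoff=0.7)
print([len(c) for c in clusters])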
def prep_parquet_db(df, n_jobs, smiles_col, catalog_id_col, canonical_id_col):
    '''Take a cleaned df that contains protonated/tautomerized smiles, the
    vendor database ID, and a canonical ID (whose trailing number indicates
    the protomer/tautomer), then 1) enumerate stereoisomers, 2) generate
    chiral/achiral fingerprints, and 3) generate SMARTS and a new canonical ID
    that references the stereoisomer.

    Returns:
        elaborated dataframe - pandas dataframe

    Args:
        df: dataframe to be passed in - pandas dataframe
        n_jobs: number of jobs utilized by joblib - integer
        smiles_col: the name of the smiles column - string
        catalog_id_col: name of the column referencing the catalog ID - string
        canonical_id_col: name of the column referencing the canonical ID,
            usually Z123456789_1 where _1 is the protomer/tautomer number - string
    '''
    smiles_column = smiles_col

    # Clean the mols, standardize, and generate lists of enumerated smiles and
    # fingerprints, both chiral and achiral, at 8k bits.
    df_clean_mapped = dm.parallelized(
        _preprocess,
        list(df.iterrows()),
        arg_type='args',
        progress=True,
        n_jobs=n_jobs,
    )
    df_clean_mapped = pd.DataFrame(df_clean_mapped)

    # Keep only the following columns.
    columns_to_keep = [
        'enumerated_smiles',
        catalog_id_col,
        canonical_id_col,
        'achiral_fp',
        'chiral_fp',
        'smarts',
        'selfies',
    ]
    df2 = df_clean_mapped[columns_to_keep]

    # Remove dropped smiles; these fail due to invalid mols from rdkit.
    df_dropped = df2[df2.smarts == 'dropped']
    df3 = df2[df2.smarts != 'dropped']

    # Explode all the lists into new rows, then drop duplicated smiles.
    df4 = df3.set_index(['CatalogID', 'ID_Index', 'smarts', 'selfies']).apply(pd.Series.explode).reset_index()
    df5 = df4.drop_duplicates(subset='enumerated_smiles', keep="first", inplace=False)
    df5 = df5.reset_index(drop=True)

    # Generate a new indexing system that creates unique canonical IDs of the
    # form Z123456_1_1, where Z123456_1 is the taut/prot ID and the additional
    # _1 is the stereoisomer ID.
    df6 = df5.set_index('ID_Index')
    df6.index = df6.index + '_' + df6.groupby(level=0).cumcount().add(1).astype(str).replace('0', '')
    df7 = df6.reset_index()

    # Clean up the columns and return.
    df7.columns = [
        'canonical_ID',
        'CatalogID',
        'smarts',
        'selfies',
        'enumerated_smiles',
        'achiral_fp',
        'chiral_fp',
    ]
    return df7
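# A small standalone sketch of the suffixing trick used above (toy data, not
# the real schema): groupby(level=0).cumcount() numbers repeated IDs
# 0, 1, 2, ... and add(1) shifts them to 1, 2, 3, ..., so Z1_1 expands to
# Z1_1_1, Z1_1_2, ... per stereoisomer.
import pandas as pd

s = pd.DataFrame({'smiles': ['A', 'B', 'C']}, index=['Z1_1', 'Z1_1', 'Z2_1'])
s.index = s.index + '_' + s.groupby(level=0).cumcount().add(1).astype(str)
print(s.index.tolist())  # ['Z1_1_1', 'Z1_1_2', 'Z2_1_1']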
def pick_centroids(
    mols: List[Chem.rdchem.Mol],
    npick: int = 0,
    initial_picks: List[int] = None,
    threshold: float = 0.5,
    feature_fn: Callable = None,
    dist_fn: Callable = None,
    seed: int = 42,
    method: str = "sphere",
    n_jobs: Optional[int] = 1,
):
    r"""Pick a set of `npick` centroids from a list of molecules.

    Args:
        mols: a list of molecules.
        npick: Number of elements to pick from mols, including the preselection.
        threshold: Minimum distance between centroids for the `maxmin` and
            sphere exclusion (`sphere`) methods.
        initial_picks: Starting list of indices for molecules that should be in
            the set of picked molecules. Default to None.
        feature_fn: A feature function that takes a Chem.rdchem.Mol object and
            returns molecular features. By default, `dm.to_fp()` is used.
            Default to None.
        dist_fn: A function that takes two indexes (i,j) and returns the
            distance between them. You might use partial to set the
            fingerprints as input. By default, the Tanimoto similarity will be
            used. Default to None.
        seed: seed for reproducibility.
        method: Picking method to use. One of `sphere`, `maxmin` or any
            supported rdkit hierarchical clustering method, such as `centroid`,
            `clink`, `upgma`.
        n_jobs: Number of jobs for parallelization. Set to 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        picked_inds: indices of the molecules that have been selected as centroids.
        mols: molecules that have been picked.
    """

    n_mols = len(mols)
    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    features = dm.parallelized(feature_fn, mols, n_jobs=n_jobs)

    def distij(i, j, features=features):
        return 1.0 - DataStructs.TanimotoSimilarity(features[i], features[j])

    if dist_fn is None:
        dist_fn = distij

    initial_picks = [] if initial_picks is None else initial_picks

    if method == "maxmin":
        picker = MaxMinPicker()
        picked_inds, _ = picker.LazyPickWithThreshold(
            dist_fn,
            n_mols,
            pickSize=npick,
            threshold=threshold,
            firstPicks=initial_picks,
            seed=seed,
        )

    elif method == "sphere":
        picker = LeaderPicker()
        picked_inds = picker.LazyPick(
            dist_fn,
            n_mols,
            threshold=threshold,
            pickSize=npick,
            firstPicks=initial_picks,
        )

    elif method.upper() in ClusterMethod.names.keys() and npick:
        if initial_picks:
            logger.warning(
                "Initial picks are not supported by hierarchical clustering. Your picks have been discarded."
            )

        # Compute the lower-triangle distances in parallel; use dist_fn so a
        # user-supplied distance is honored.
        dist_mat = dm.parallelized(
            dist_fn,
            list(zip(*np.tril_indices(len(mols), k=-1))),
            arg_type="args",
        )
        dist_mat = np.asarray(dist_mat)
        picker = HierarchicalClusterPicker(ClusterMethod.names[method.upper()])
        picked_inds = picker.Pick(dist_mat, n_mols, npick)

    else:
        raise ValueError(f"Picking method {method} with {npick} elements to pick is not supported.")

    picked_inds = np.array(picked_inds)
    picked_mols = [mols[x] for x in picked_inds]

    return picked_inds, picked_mols
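# Hedged usage sketch: sphere-exclusion picking on a toy set (assumes datamol
# as `dm`; the threshold is arbitrary and controls how far apart the picked
# centroids must be).
import datamol as dm

mols = [dm.to_mol(s) for s in ["CCO", "CCCO", "c1ccccc1", "c1ccccc1O", "CCN"]]
inds, centroids = pick_centroids(mols, npick=2, threshold=0.7, method="sphere")
print(inds)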
molid = names[count]
m.SetProp('_Name', molid)
probe = Chem.Mol(m.ToBinary())
v.ShowMol(probe, name=molid, showOnly=False)

df = pa.feather.read_feather('/data/mol_chunk_tests_cluster/test_2.molchunk')
# df['combined_smiles'] = df[['standard_smiles', 'enumerated_smiles']].values.tolist()

columns_to_keep = ['enumerated_smiles', 'CatalogID', 'ID_Index']
df2 = df[columns_to_keep]
df3 = df2.explode('enumerated_smiles')
df5 = df3.reset_index(drop=True)

smiles_column = 'enumerated_smiles'

# Run the initial mapper on the smiles column to generate basic information
# and a fingerprint on bits.
df_clean_mapped = dm.parallelized(
    _preprocess,
    list(df5.iterrows()),
    arg_type='args',
    progress=True,
)
df_clean_mapped = pd.DataFrame(df_clean_mapped)
# del df_clean_mapped['combined_smiles']

# Remove duplicated standard SMILES and reindex.
duplicateRowsDF2 = df_clean_mapped[df_clean_mapped.duplicated(['standard_smiles'])]
# print("Duplicate Rows based on a single column are:", duplicateRowsDF2, sep='\n')
df_clean_mapped = df_clean_mapped.drop_duplicates(subset='standard_smiles', keep="first", inplace=False)
df6 = df_clean_mapped.reset_index(drop=True)

limit = 13000
results_list = []
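# Standalone sketch of the explode step above (toy data): each list of
# enumerated SMILES becomes one row per isomer, with the other columns copied.
import pandas as pd

tiny = pd.DataFrame({'enumerated_smiles': [['C/C=C/C', 'C/C=C\\C']], 'CatalogID': ['X1']})
print(tiny.explode('enumerated_smiles'))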
        columns = ['names', 'input_pdbqt_path', 'output_docking_pose_paths', 'output_docking_scores'])
    try:
        out_df = pd.merge(df, docked_df, on="names")
        return out_df
    except Exception:
        print("merging the dataframes failed")
        return None

col_to_dock = 'pdbqt_ambcc'
working_dir = '/data/dockop_glide_d3/dock_test'
smiles_column = 'standard_smiles'

df2 = dm.parallelized(
    _generate_pdbqt_outfiles_for_docking,
    list(df2.iterrows()),
    arg_type='args',
    progress=True,
)
df2 = pd.DataFrame(df2)

autodock_gpu = '/home/schrogpu/ADFRsuite-1.0/AutoDock-GPU/bin/autodock_gpu_128wi'
receptor_path = '/home/schrogpu/ADFRsuite-1.0/d3_docking/pocket2fixer/rigidReceptor.maps.fld'
lsmet = 'sw'
num_runs = 50
dev_num = 0

names_to_dock = list(df2['canonical_id'])
filenames = list(df2['pdbqt_out_path'])

batch_list = f'{working_dir}/{col_to_dock}_batch.txt'
with open(batch_list, 'w') as f:
    f.write(f'{receptor_path}\n')
    for i, filepath in enumerate(filenames):
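# The truncated loop above is presumably completing an AutoDock-GPU batch
# file. Assumed layout (an assumption; check the AutoDock-GPU docs for your
# version): the receptor .maps.fld on the first line, then one ligand .pdbqt
# path followed by a result name per ligand, e.g.:
#
#   rigidReceptor.maps.fld
#   lig_0001.pdbqt
#   lig_0001
#   lig_0002.pdbqt
#   lig_0002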
row["inchi"] = 'dropped' row["inchikey"] = 'dropped' row["enumerated_smiles"] = list('dropped') return row # Load the dataset from parquet one by one dataset = ds.dataset(dataset_dir, format="parquet") # Create a list of fragments that are not memory loaded fragments = [file for file in dataset.get_fragments()] for count, element in enumerate(fragments): #cast the fragment as a pandas df df_docked = element.to_table().to_pandas() #reset the index df_docked = df_docked.reset_index(drop=True) #now write the nearest neighbor name and smiles to the df smiles_column = 'Smile' df_add_nn = dm.parallelized(_preprocess, list(df_docked.iterrows()), arg_type='args', progress=True, n_jobs=54) df_add_nn = pd.DataFrame(df_add_nn) #write the mochunk to disk feather.write_feather(df_add_nn, f'{output_dir}/er_enumisomers_{count}.molchunk')
df5 = df5.set_index('ID_Index')
df5.index = df5.index + '_' + df5.groupby(level=0).cumcount().add(1).astype(str).replace('0', '')
df5 = df5.reset_index()
df5.columns = ['canonical_id', 'enumerated_smiles', 'CatalogID']

# columns_to_keep = ['enumerated_smiles', 'CatalogID', 'ID_Index']
# df2 = df[columns_to_keep]
# df3 = df2.explode('enumerated_smiles')
# df5 = df3.reset_index(drop=True)

smiles_column = 'enumerated_smiles'

# Run the initial mapper on the smiles column to generate basic information
# and a fingerprint on bits.
df_clean_mapped = dm.parallelized(
    _preprocess,
    list(df5.iterrows()),
    arg_type='args',
    progress=True,
    n_jobs=4,
)
df_clean_mapped = pd.DataFrame(df_clean_mapped)

df_dropped = df_clean_mapped[df_clean_mapped.standard_smiles == 'dropped']
print(f'The number of dropped entries is: {len(df_dropped)}')
feather.write_feather(df_dropped, f'{output_dir}/er_d3sim_dropped_{count}.molchunk')

df_clean_mapped = df_clean_mapped[df_clean_mapped.standard_smiles != 'dropped']
print(f'The number of successful entries is: {len(df_clean_mapped)}')
# del df_clean_mapped['combined_smiles']
row["mol2_block_am1bcc"] = mol2_block_am1bcc row["pdb_am1bcc"] = pdb_am1bcc row["pdbqt_am1bcc"] = pdbqt_am1bcc row["pdbqt_gast"] = pdbqt_gast # print(f'{name} with smiles {smiles} is complete') return row except: smiles = str(row[smiles_column]) name = row[6] row["mol2_block_am1bcc"] = 'dropped' row["pdb_am1bcc"] = 'dropped' row["pdbqt_am1bcc"] = 'dropped' row["pdbqt_gast"] = 'dropped' # print(f'{name} with smiles {smiles} is failed!!') return row smiles_column = 'standard_smiles' df_clean_mapped_3d = dm.parallelized(_preprocess_3d, list(d3_df.iterrows()), arg_type='args', progress=True) df_clean_mapped_3d_1 = pd.DataFrame(df_clean_mapped_3d) df2 = df_clean_mapped_3d_1 df2 = df2.set_index('canonical_id') df2.index = df2.index + df2.groupby(level=0).cumcount().astype(str).replace('0','') df2 = df2.reset_index() df2['canonical_id'] = df2['index'] del df2['index'] df2 feather.write_feather(df_clean_mapped_3d_1, '/data/dockop_glide_d3/chembld3.molchunk')