def get_rdkit_smiles_parent(data):
    """Add a salt-stripped SMILES column, 'rdkit_smiles_parent', to data.

    Every value in column 'rdkit_smiles' is passed to
    base_smiles_from_smiles to obtain the base/parent SMILES string. Values
    that are floats (e.g. the NaN pandas stores for a missing entry) are not
    passed on; they are mapped to an empty string instead.

    Args:
        data (DataFrame): Table with a column named 'rdkit_smiles'. It is
            modified in place.

    Returns:
        DataFrame: The same object as ``data``, with column
        'rdkit_smiles_parent' added (or overwritten).
    """
    print("")
    print(
        "Adding SMILES column 'rdkit_smiles_parent' with salts stripped...(may take a while)",
        flush=True)

    # isinstance (rather than an exact type comparison) also recognises
    # numpy.float64, which is what a column made up only of NaNs contains.
    data['rdkit_smiles_parent'] = [
        '' if isinstance(smile, float) else base_smiles_from_smiles(smile)
        for smile in data['rdkit_smiles']
    ]
    return data
def get_rdkit_smiles_parent(data):
    """Strip the salts off the rdkit SMILES strings

    Calls base_smiles_from_smiles for each entry in the column
    'rdkit_smiles' and stores the resulting base/parent SMILES strings in a
    new column, 'rdkit_smiles_parent'. Entries that are floats (e.g. the NaN
    used for a missing value) are not converted; they yield an empty string.

    Args:
        data (DataFrame): A DataFrame with a column named 'rdkit_smiles'.
            The new column is written into this object.

    Returns:
        DataFrame with column 'rdkit_smiles_parent' with salts stripped.
        This is the same object that was passed in.
    """
    print("")
    print(
        "Adding SMILES column 'rdkit_smiles_parent' with salts stripped...(may take a while)",
        flush=True)

    rdkit_smiles_parent = []
    for smile in data['rdkit_smiles'].tolist():
        # An exact `type(...) is float` test would let numpy.float64 NaNs
        # (from an all-missing, float64 column) through to the SMILES parser.
        if isinstance(smile, float):
            rdkit_smiles_parent.append('')
        else:
            rdkit_smiles_parent.append(base_smiles_from_smiles(smile))

    # Add base smiles string (stripped smiles) to dataset
    data['rdkit_smiles_parent'] = rdkit_smiles_parent
    return data
def _prepare_input_data(input_df, id_col, smiles_col, response_col, conc_col,
                        dont_standardize):
    """Prepare input data frame for running predictions.

    Args:
        input_df (DataFrame): Compounds to predict on. The frame passed in
            may be modified: a 'compound_id' column is added when id_col is
            unusable, and when standardizing, an 'orig_smiles' column is
            added and the smiles_col column is overwritten.
        id_col (str or None): Name of the compound ID column. If None or not
            a column of input_df, IDs of the form 'compound_000000' are
            generated in a new 'compound_id' column.
        smiles_col (str): Name of the column holding SMILES strings.
        response_col (str or None): Name of a response column; only used if
            present in input_df.
        conc_col (str or None): Name of a concentration column; only used if
            present in input_df.
        dont_standardize (bool): If true, the SMILES strings are used as
            given; otherwise they are run through base_smiles_from_smiles.

    Returns:
        tuple: ``(input_df, pred_params)``. When standardizing, input_df
        contains only the rows whose standardized SMILES string is non-empty.
        pred_params is a dict with keys 'featurizer', 'result_dir' (a freshly
        created temporary directory), 'id_col', 'smiles_col' and, when a
        response and/or concentration column is available, 'response_cols'.

    Raises:
        ValueError: If smiles_col is not a column of input_df, or if no row
            has a non-empty SMILES string after standardization.
    """
    colnames = set(input_df.columns.values)
    if (id_col is None) or (id_col not in colnames):
        input_df['compound_id'] = [
            'compound_%.6d' % i for i in range(input_df.shape[0])
        ]
        id_col = 'compound_id'
    if smiles_col not in colnames:
        raise ValueError(
            'smiles_col parameter not specified or column not in input file.')

    if not dont_standardize:
        orig_ncmpds = input_df.shape[0]
        print("Standardizing SMILES strings for %d compounds." % orig_ncmpds)
        std_smiles = base_smiles_from_smiles(
            input_df[smiles_col].values.tolist(), workers=16)
        # Keep the caller's SMILES strings around under 'orig_smiles'; the
        # standardized ones take over smiles_col.
        input_df['orig_smiles'] = input_df[smiles_col]
        input_df[smiles_col] = std_smiles
        # Rows with an empty standardized SMILES are dropped and reported
        # below as unparseable.
        input_df = input_df[input_df[smiles_col] != '']
        if input_df.shape[0] == 0:
            raise ValueError("No valid SMILES strings to predict on.")
        nlost = orig_ncmpds - input_df.shape[0]
        if nlost > 0:
            print(
                "Could not parse %d SMILES strings; will predict on the remainder."
                % nlost)

    pred_params = {
        'featurizer': 'computed_descriptors',
        'result_dir': tempfile.mkdtemp(),
        'id_col': id_col,
        'smiles_col': smiles_col
    }

    has_response = (response_col is not None) and (
        response_col in input_df.columns.values)
    has_conc = (conc_col is not None) and (conc_col in input_df.columns.values)
    if has_response:
        pred_params['response_cols'] = response_col
        if has_conc:
            pred_params['response_cols'] += "," + conc_col
    elif has_conc:
        # NOTE(review): "ACTIVITY" is used as the response name when only a
        # concentration column is available -- confirm this is the name the
        # downstream model expects.
        pred_params['response_cols'] = "ACTIVITY," + conc_col
    return input_df, pred_params
def mcs_vs_tanimoto(pred_dset, pred_smiles_col='smiles'):
    """Compare within-dataset Tanimoto and MCS distances.

    Computes within-dataset distance matrices for the compounds in pred_dset
    with both the 'tanimoto' and the 'mcs' metric, tabulates the two
    distances for every unordered pair of compounds, and draws a scatter
    plot of Tanimoto distance against MCS distance on a new 15x15 figure.

    Args:
        pred_dset (str or DataFrame): Either the path of a CSV file or an
            already loaded DataFrame. It must have a 'compound_id' column and
            the column named by pred_smiles_col.
        pred_smiles_col (str): Name of the column holding SMILES strings.
            Each string is passed through base_smiles_from_smiles before the
            distances are computed.

    Returns:
        DataFrame: One row per compound pair (i < j) with columns
        'compound_i', 'compound_j', 'tanimoto_distance' and 'mcs_distance'.
    """
    if isinstance(pred_dset, str):
        pred_df = pd.read_csv(pred_dset, index_col=False)
    else:
        pred_df = pred_dset

    pred_smiles = [
        base_smiles_from_smiles(s) for s in pred_df[pred_smiles_col].values
    ]
    cmpd_ids = pred_df.compound_id.values
    ncmpd = pred_df.shape[0]

    tani_dist_mat = cd.calc_dist_smiles('ecfp', 'tanimoto', pred_smiles,
                                        calc_type='all')
    mcs_dist_mat = cd.calc_dist_smiles('ecfp', 'mcs', pred_smiles,
                                       calc_type='all')

    # Index pairs above the diagonal, in row-major order, so that each
    # unordered pair of compounds is listed exactly once.
    pairs = [(i, j) for i in range(ncmpd - 1) for j in range(i + 1, ncmpd)]
    dist_df = pd.DataFrame(
        dict(compound_i=[cmpd_ids[i] for i, _ in pairs],
             compound_j=[cmpd_ids[j] for _, j in pairs],
             tanimoto_distance=[tani_dist_mat[i, j] for i, j in pairs],
             mcs_distance=[mcs_dist_mat[i, j] for i, j in pairs]))

    # Open a fresh figure; the scatter plot below is drawn on its axes.
    plt.subplots(figsize=(15, 15))
    sns.scatterplot(x='mcs_distance', y='tanimoto_distance', data=dist_df)
    return dist_df
def nearest_neighbor_distances(pred_dset,
                               ref_dset,
                               pred_smiles_col='smiles',
                               ref_smiles_col='base_rdkit_smiles'):
    """Find each predicted compound's nearest neighbor in a reference set.

    For every compound in pred_dset, finds the compound in ref_dset with the
    smallest Tanimoto distance and records its ID and that distance. Also
    counts, for each reference compound, how many predicted compounds have it
    as their nearest neighbor.

    Args:
        pred_dset (str or DataFrame): CSV file path or DataFrame of predicted
            compounds. When a DataFrame is passed, the columns 'nearest_cmpd'
            and 'nearest_dist' are added to it in place.
        ref_dset (str or DataFrame): CSV file path or DataFrame of reference
            compounds; must have a 'compound_id' column.
        pred_smiles_col (str): SMILES column in pred_dset. Its values are
            passed through base_smiles_from_smiles first.
        ref_smiles_col (str): SMILES column in ref_dset. Its values are used
            as they are.

    Returns:
        DataFrame: The predicted-compound table with added columns
        'nearest_cmpd', 'nearest_dist' and 'pred_cmpd_count', sorted by
        'pred_cmpd_count' and then 'nearest_cmpd', both descending.
    """
    if isinstance(pred_dset, str):
        pred_df = pd.read_csv(pred_dset, index_col=False)
    else:
        pred_df = pred_dset
    if isinstance(ref_dset, str):
        ref_df = pd.read_csv(ref_dset, index_col=False)
    else:
        ref_df = ref_dset

    pred_smiles = [
        base_smiles_from_smiles(s) for s in pred_df[pred_smiles_col].values
    ]
    ref_smiles = ref_df[ref_smiles_col].values
    dist_arr = cd.calc_dist_smiles('ecfp', 'tanimoto', pred_smiles, ref_smiles,
                                   calc_type='all')

    # The reductions run along axis 1, i.e. over the second axis of dist_arr;
    # the resulting indices are used to look up reference compound IDs.
    ref_cmpd_ids = ref_df.compound_id.values
    nn_ind = np.argmin(dist_arr, axis=1)
    pred_df['nearest_cmpd'] = ref_cmpd_ids[nn_ind]
    pred_df['nearest_dist'] = np.min(dist_arr, axis=1)

    # How often each reference compound was picked as a nearest neighbor.
    uniq_neighbors, counts = np.unique(pred_df.nearest_cmpd.values,
                                       return_counts=True)
    nnfreq_df = pd.DataFrame(
        dict(nearest_cmpd=uniq_neighbors,
             pred_cmpd_count=counts)).sort_values(by='pred_cmpd_count',
                                                  ascending=False)
    nn_pred_df = pred_df.merge(nnfreq_df, how='left',
                               on='nearest_cmpd').sort_values(
                                   by=['pred_cmpd_count', 'nearest_cmpd'],
                                   ascending=False)
    return nn_pred_df
def compute_dist_matrix(pred_file,
                        ref_dset_file,
                        pred_smiles_col='smiles',
                        ref_smiles_col='base_rdkit_smiles'):
    """Compute the Tanimoto distance matrix between two sets of SMILES.

    Args:
        pred_file: CSV input (anything pandas.read_csv accepts) containing
            the compounds of interest.
        ref_dset_file: CSV input containing the reference compounds.
        pred_smiles_col (str): SMILES column in pred_file. Its values are
            passed through base_smiles_from_smiles before the comparison.
        ref_smiles_col (str): SMILES column in ref_dset_file. Its values are
            used as they are.

    Returns:
        The result of cd.calc_dist_smiles('ecfp', 'tanimoto', ...) for the
        two SMILES lists with calc_type='all'.
    """
    pred_df = pd.read_csv(pred_file, index_col=False)
    ref_df = pd.read_csv(ref_dset_file, index_col=False)

    pred_smiles = [
        base_smiles_from_smiles(s) for s in pred_df[pred_smiles_col].values
    ]
    ref_smiles = ref_df[ref_smiles_col].values
    return cd.calc_dist_smiles('ecfp', 'tanimoto', pred_smiles, ref_smiles,
                               calc_type='all')
def aggregate_assay_data(assay_df,
                         value_col='VALUE_NUM',
                         output_value_col=None,
                         label_actives=True,
                         active_thresh=None,
                         id_col='CMPD_NUMBER',
                         smiles_col='rdkit_smiles',
                         relation_col='VALUE_FLAG',
                         date_col=None):
    """
    Map RDKit SMILES strings in assay_df to base structures, then compute an MLE estimate of the mean value
    over replicate measurements for the same SMILES strings, taking censoring into account. Generate an
    aggregated result table with one value for each unique base SMILES string, to be used in an ML-ready dataset.

    Rows whose SMILES string is missing or empty are dropped (and counted in a printed message) before
    aggregation. The input data frame itself is not modified.

    :param assay_df: The input data frame to be processed.
    :param value_col: The column in the data frame containing assay values to be averaged.
    :param output_value_col: Optional; the column name to use in the output data frame for the averaged data.
        Defaults to value_col.
    :param label_actives: If True, generate an additional column 'active' indicating whether the mean value is
        above a threshold specified by active_thresh. Compounds whose reported relation is '<' are never
        labeled active. If no threshold is available, every compound is labeled 1.
    :param active_thresh: The threshold to be used for labeling compounds as active or inactive.
        If active_thresh is None (the default), the threshold used is the minimum reported value across all
        records with left-censored values (i.e., those with '<' in the relation column).
    :param id_col: The input data frame column containing compound IDs.
    :param smiles_col: The input data frame column containing SMILES strings.
    :param relation_col: The input data frame column containing relational operators (<, >, etc.).
    :param date_col: The input data frame column containing dates when the assay data was uploaded. If not None,
        the code will assign the earliest date among replicates to the aggregate data record.
    :return: A data frame containing averaged assay values, with one value per compound. Its columns are
        'compound_id', 'base_rdkit_smiles', 'relation' and output_value_col, plus date_col and 'active'
        when requested.
    """
    assay_df = assay_df.fillna({relation_col: '', smiles_col: ''})

    # Filter out rows where SMILES is missing. dtype=bool keeps the mask a
    # valid boolean indexer even when the table has no rows at all.
    has_smiles = np.array(
        [len(smiles) > 0 for smiles in assay_df[smiles_col].values],
        dtype=bool)
    n_missing_smiles = int((~has_smiles).sum())
    print("%d entries in input table are missing SMILES strings" %
          n_missing_smiles)
    assay_df = assay_df[has_smiles].copy()

    # Estimate the measurement error across replicates for this assay
    std_est = replicate_rmsd(assay_df,
                             smiles_col=smiles_col,
                             value_col=value_col,
                             relation_col=relation_col)

    # Map SMILES strings to base structure SMILES strings, then map these to indices into the list of
    # unique base structures
    orig_smiles_strs = assay_df[smiles_col].values
    norig = len(set(orig_smiles_strs))
    smiles_strs = [
        base_smiles_from_smiles(smiles, True) for smiles in orig_smiles_strs
    ]
    assay_df['base_rdkit_smiles'] = smiles_strs
    uniq_smiles_strs = list(set(smiles_strs))
    nuniq = len(uniq_smiles_strs)
    print(
        "%d unique SMILES strings are reduced to %d unique base SMILES strings"
        % (norig, nuniq))
    smiles_map = {smiles: i for i, smiles in enumerate(uniq_smiles_strs)}
    smiles_indices = np.array([smiles_map[smiles] for smiles in smiles_strs])

    # Compute a maximum likelihood estimate of the mean assay value for each compound, averaging over replicates
    # and factoring in censoring. The relation reported for a compound is whatever mle_censored_mean returns
    # alongside the mean.
    # NOTE(review): nothing here removes rows whose base SMILES came back empty; if base_smiles_from_smiles
    # returns '' for unparseable input, those rows are aggregated together under '' -- confirm that is intended.
    cmpd_ids = assay_df[id_col].values
    reported_cmpd_ids = [''] * nuniq
    reported_value_flags = [''] * nuniq
    if date_col is not None:
        reported_dates = [''] * nuniq
    reported_assay_val = np.zeros(nuniq, dtype=float)
    for i in range(nuniq):
        cmpd_ind = np.where(smiles_indices == i)[0]
        cmpd_df = assay_df.iloc[cmpd_ind]
        reported_assay_val[i], reported_value_flags[i] = mle_censored_mean(
            cmpd_df, std_est, value_col=value_col, relation_col=relation_col)
        # When multiple compound IDs map to the same base SMILES string, use the lexicographically smallest one.
        reported_cmpd_ids[i] = min(cmpd_ids[cmpd_ind])
        # If a date column is specified, use the earliest one among replicates
        if date_col is not None:
            # pd.to_datetime is used because np.datetime64 doesn't seem to understand the date format in
            # GSK's crit res tables.
            # NOTE(review): infer_datetime_format is deprecated in pandas >= 2.0; drop it once older pandas
            # versions no longer need to be supported.
            earliest_date = sorted(
                pd.to_datetime(cmpd_df[date_col],
                               infer_datetime_format=True).values)[0]
            reported_dates[i] = np.datetime_as_string(earliest_date)

    if output_value_col is None:
        output_value_col = value_col
    agg_df = pd.DataFrame({
        'compound_id': reported_cmpd_ids,
        'base_rdkit_smiles': uniq_smiles_strs,
        'relation': reported_value_flags,
        output_value_col: reported_assay_val
    })
    if date_col is not None:
        agg_df[date_col] = reported_dates

    # Label each compound as active or not, based on the reported relation and values relative to a common threshold
    if label_actives:
        inactive_df = agg_df[agg_df.relation == '<']
        if inactive_df.shape[0] > 0 and active_thresh is None:
            active_thresh = np.min(inactive_df[output_value_col].values)
        if active_thresh is not None:
            is_active = ((agg_df.relation != '<') &
                         (agg_df[output_value_col].values > active_thresh))
            agg_df['active'] = [int(a) for a in is_active]
        else:
            # No explicit threshold and no left-censored records to derive one from.
            agg_df['active'] = 1
    return agg_df
def predict_activity(args):
    """Run a packaged BSEP classification model on a CSV file of compounds.

    Reads ``args.input_file``, optionally standardizes the SMILES strings
    with base_smiles_from_smiles, runs the model selected by
    ``args.model_type`` ('random' or 'scaffold') and writes the predictions
    to ``args.output_file``. If ``args.activity_col`` is given, performance
    metrics comparing predicted to measured activity are printed as well.

    Args:
        args: Object with the attributes input_file, output_file, id_col,
            smiles_col, activity_col, dont_standardize and model_type.
            ``args.id_col`` is reassigned to 'compound_id' when the given ID
            column is not in the input file.

    Returns:
        None. Returns early, after printing a message, when standardization
        leaves no compound with a non-empty SMILES string.

    Raises:
        ValueError: If ``args.smiles_col`` is not a column of the input file
            or ``args.model_type`` is not one of the known model types.
    """
    compounds = pd.read_csv(args.input_file, index_col=False)
    available_cols = set(compounds.columns.values)

    # Make up sequential IDs when the requested ID column is unavailable.
    if args.id_col not in available_cols:
        n_rows = compounds.shape[0]
        compounds['compound_id'] = ['compound_%.6d' % k for k in range(n_rows)]
        args.id_col = 'compound_id'
    if args.smiles_col not in available_cols:
        raise ValueError(
            'smiles_col parameter not specified or column not in input file.')

    standardize = not args.dont_standardize
    model_smiles_col = args.smiles_col
    if standardize:
        n_before = compounds.shape[0]
        print("Standardizing SMILES strings for %d compounds." % n_before)
        compounds['standardized_smiles'] = [
            base_smiles_from_smiles(smi)
            for smi in compounds[args.smiles_col].values
        ]
        # Keep only the rows whose standardized SMILES string is non-empty.
        compounds = compounds[compounds.standardized_smiles != '']
        n_after = compounds.shape[0]
        if n_after == 0:
            print("No valid SMILES strings to predict on.")
            return None
        # Sort by ID so the original SMILES line up with the (ID-sorted)
        # predictions when they are written back further down.
        compounds = compounds.sort_values(by=args.id_col)
        orig_smiles = compounds[args.smiles_col].values
        n_dropped = n_before - n_after
        if n_dropped > 0:
            print(
                "Could not parse %d SMILES strings; will predict on the remainder."
                % n_dropped)
        model_smiles_col = 'standardized_smiles'

    has_activity = args.activity_col is not None
    pred_params = {'id_col': args.id_col, 'smiles_col': model_smiles_col}
    if has_activity:
        pred_params['response_cols'] = args.activity_col
    pred_params = parse.wrapper(pred_params)

    model_files = {
        'random': 'bsep_classif_random_split.tar.gz',
        'scaffold': 'bsep_classif_scaffold_split.tar.gz',
    }
    if args.model_type not in model_files:
        raise ValueError("model_type %s is not a recognizied value." %
                         args.model_type)

    # The model tarballs are looked up under examples/BSEP/models, two
    # directory levels above the mp module's file.
    package_root = os.path.dirname(os.path.dirname(mp.__file__))
    model_tarfile = os.path.join(package_root, 'examples', 'BSEP', 'models',
                                 model_files[args.model_type])
    pipe = mp.create_prediction_pipeline_from_file(pred_params,
                                                   reload_dir=None,
                                                   model_path=model_tarfile)
    pred_df = pipe.predict_full_dataset(compounds,
                                        contains_responses=has_activity,
                                        dset_params=pred_params)
    pred_df = pred_df.sort_values(by=args.id_col)
    if standardize:
        # Report the SMILES strings as supplied by the user.
        pred_df[args.smiles_col] = orig_smiles

    # Write predictions to output file
    pred_df.to_csv(args.output_file, index=False)
    print("Wrote predictions to file %s" % args.output_file)

    # Performance metrics are only possible when measured activities exist.
    if not has_activity:
        return None

    actual = pred_df['%s_actual' % args.activity_col].values
    predicted = pred_df['%s_pred' % args.activity_col].values
    probs = pred_df['%s_prob' % args.activity_col].values

    conf_matrix = metrics.confusion_matrix(actual, predicted)
    roc_auc = metrics.roc_auc_score(actual, probs)
    prc_auc = metrics.average_precision_score(actual, probs)
    accuracy = metrics.accuracy_score(actual, predicted)
    precision = metrics.precision_score(actual, predicted)
    npv = negative_predictive_value(actual, predicted)
    recall = metrics.recall_score(actual, predicted)
    mcc = metrics.matthews_corrcoef(actual, predicted)
    ncorrect = sum(actual == predicted)

    print("Performance metrics:\n")
    print("%d out of %d predictions correct." % (ncorrect, pred_df.shape[0]))
    score_lines = (
        ("Accuracy: %.3f", accuracy),
        ("Precision: %.3f", precision),
        ("Recall: %.3f", recall),
        ("NPV: %.3f", npv),
        ("ROC AUC: %.3f", roc_auc),
        ("PRC AUC: %.3f", prc_auc),
        ("Matthews correlation coefficient: %.3f", mcc),
    )
    for template, score in score_lines:
        print(template % score)

    print("Confusion matrix:")
    print("\t\tpredicted activity")
    print("actual\nactivity\t0\t1\n")
    print(" 0\t\t%d\t%d" % (conf_matrix[0][0], conf_matrix[0][1]))
    print(" 1\t\t%d\t%d" % (conf_matrix[1][0], conf_matrix[1][1]))