# Assumed imports (not shown in the original snippets): pd, np, plt and sns are the
# usual pandas/numpy/matplotlib/seaborn aliases, while cd, dp and
# base_smiles_from_smiles are assumed to come from the AMPL (atomsci-ampl) package.
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from atomsci.ddm.pipeline import chem_diversity as cd
from atomsci.ddm.pipeline import diversity_plots as dp
from atomsci.ddm.utils.struct_utils import base_smiles_from_smiles


def mcs_vs_tanimoto(pred_dset, pred_smiles_col='smiles'):
    """
    Compute within-dataset distance matrices for compounds in pred_dset based on both Tanimoto and MCS
    distances, and compare the resulting distances.

    """
    if isinstance(pred_dset, str):
        pred_df = pd.read_csv(pred_dset, index_col=False)
    else:
        pred_df = pred_dset
    pred_smiles = pred_df[pred_smiles_col].values
    pred_smiles = [base_smiles_from_smiles(s) for s in pred_smiles]
    cmpd_ids = pred_df.compound_id.values
    ncmpd = pred_df.shape[0]

    cmpd_i_list = []
    cmpd_j_list = []
    tani_dist = []
    mcs_dist = []

    tani_dist_mat = cd.calc_dist_smiles('ecfp',
                                        'tanimoto',
                                        pred_smiles,
                                        calc_type='all')
    mcs_dist_mat = cd.calc_dist_smiles('ecfp',
                                       'mcs',
                                       pred_smiles,
                                       calc_type='all')
    for i in range(ncmpd - 1):
        cmpd_i = cmpd_ids[i]
        for j in range(i + 1, ncmpd):
            cmpd_j = cmpd_ids[j]
            cmpd_i_list.append(cmpd_i)
            cmpd_j_list.append(cmpd_j)
            tani_dist.append(tani_dist_mat[i, j])
            mcs_dist.append(mcs_dist_mat[i, j])

    dist_df = pd.DataFrame(
        dict(compound_i=cmpd_i_list,
             compound_j=cmpd_j_list,
             tanimoto_distance=tani_dist,
             mcs_distance=mcs_dist))
    fig, ax = plt.subplots(figsize=(15, 15))
    sns.scatterplot(x='mcs_distance', y='tanimoto_distance', data=dist_df, ax=ax)

    return dist_df
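
# Hedged usage sketch (added for illustration, not part of the original code). The
# file name is hypothetical; the CSV is assumed to have 'smiles' and 'compound_id'
# columns, as mcs_vs_tanimoto expects.
def _demo_mcs_vs_tanimoto(pred_file='predicted_compounds.csv'):
    dist_df = mcs_vs_tanimoto(pred_file, pred_smiles_col='smiles')
    # How strongly do the two distance measures agree across compound pairs?
    print(dist_df[['tanimoto_distance', 'mcs_distance']].corr())
    plt.show()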
# Example 2
    def _get_dists(self):
        '''
        Calculate pairwise compound distances between training and test subsets.

        Args:
            None

        Returns:
            Array of floats. Pairwise Tanimoto distances between training and test subsets.
        '''
        return cd.calc_dist_smiles('ECFP', 'tanimoto',
                                   self.train_df[self.smiles_col].values,
                                   self.test_df[self.smiles_col].values)
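
# Hedged standalone sketch (added for illustration): roughly what _get_dists above
# does, outside its class. The class it is excerpted from is assumed to provide
# train_df, test_df and smiles_col attributes; the SMILES below are toy examples.
def _demo_train_test_dists():
    train_smiles = ['CCO', 'c1ccccc1', 'CC(=O)O']
    test_smiles = ['CCN', 'Oc1ccccc1']
    # Expected to be an (n_train, n_test) distance matrix, assuming the default
    # calc_type computes all pairwise distances, as in the calls above that pass
    # calc_type='all'.
    dists = cd.calc_dist_smiles('ECFP', 'tanimoto', train_smiles, test_smiles)
    print(dists.shape)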
def nearest_neighbor_distances(pred_dset,
                               ref_dset,
                               pred_smiles_col='smiles',
                               ref_smiles_col='base_rdkit_smiles'):
    """
    Find the nearest neighbor compound in the reference dataset for each predicted compound and its distance to
    the predicted compound. Add this information to the table of predicted properties.
    """

    if isinstance(pred_dset, str):
        pred_df = pd.read_csv(pred_dset, index_col=False)
    else:
        pred_df = pred_dset
    if isinstance(ref_dset, str):
        ref_df = pd.read_csv(ref_dset, index_col=False)
    else:
        ref_df = ref_dset
    pred_smiles = pred_df[pred_smiles_col].values
    pred_smiles = [base_smiles_from_smiles(s) for s in pred_smiles]
    ref_smiles = ref_df[ref_smiles_col].values

    dist_arr = cd.calc_dist_smiles('ecfp',
                                   'tanimoto',
                                   pred_smiles,
                                   ref_smiles,
                                   calc_type='all')
    ref_cmpd_ids = ref_df.compound_id.values

    nn_ind = np.argmin(dist_arr, axis=1)
    nn_dist = np.min(dist_arr, axis=1)
    pred_df['nearest_cmpd'] = ref_cmpd_ids[nn_ind]
    pred_df['nearest_dist'] = nn_dist
    uniq_neighbors, counts = np.unique(pred_df.nearest_cmpd.values,
                                       return_counts=True)
    nnfreq_df = pd.DataFrame(
        dict(nearest_cmpd=uniq_neighbors,
             pred_cmpd_count=counts)).sort_values(by='pred_cmpd_count',
                                                  ascending=False)
    nn_pred_df = pred_df.merge(nnfreq_df, how='left',
                               on='nearest_cmpd').sort_values(
                                   by=['pred_cmpd_count', 'nearest_cmpd'],
                                   ascending=False)
    return nn_pred_df
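
# Hedged usage sketch (added for illustration): file names are hypothetical. The
# reference CSV is assumed to have 'base_rdkit_smiles' and 'compound_id' columns,
# and the prediction CSV a 'smiles' column, as nearest_neighbor_distances expects.
def _demo_nearest_neighbors(pred_file='predicted_compounds.csv',
                            ref_file='training_compounds.csv'):
    nn_df = nearest_neighbor_distances(pred_file, ref_file)
    # Reference compounds that serve as nearest neighbor for many predictions sort first.
    print(nn_df[['nearest_cmpd', 'nearest_dist', 'pred_cmpd_count']].head())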
def compute_dist_matrix(pred_file,
                        ref_dset_file,
                        pred_smiles_col='smiles',
                        ref_smiles_col='base_rdkit_smiles'):
    """
    Compute the Tanimoto distance matrix between the SMILES strings in pred_file and those in ref_dset_file.
    """
    base = os.path.splitext(os.path.basename(pred_file))[0]  # dataset name without extension (unused below)

    pred_df = pd.read_csv(pred_file, index_col=False)
    ref_df = pd.read_csv(ref_dset_file, index_col=False)
    pred_smiles = pred_df[pred_smiles_col].values
    pred_smiles = [base_smiles_from_smiles(s) for s in pred_smiles]
    ref_smiles = ref_df[ref_smiles_col].values

    dist_arr = cd.calc_dist_smiles('ecfp',
                                   'tanimoto',
                                   pred_smiles,
                                   ref_smiles,
                                   calc_type='all')
    return dist_arr
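
# Hedged usage sketch (added for illustration): paths are hypothetical.
def _demo_dist_matrix(pred_file='predicted_compounds.csv',
                      ref_file='reference_dataset.csv'):
    dist_arr = compute_dist_matrix(pred_file, ref_file)
    # Row i, column j is the Tanimoto distance between predicted compound i and
    # reference compound j.
    print(dist_arr.shape, dist_arr.min(), dist_arr.max())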
# Example 5
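# Fragment excerpted from a larger curation loop in the original source; cur_ofile,
# act_data, sub_df, fig, pdf, output_img_dir, target_name, comb and
# CustomActivityDump are all defined in the surrounding (omitted) code.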
                            dp.diversity_plots(
                                dset_key=cur_ofile,
                                datastore=False,
                                id_col=act_data.id_col,
                                smiles_col=act_data.base_smiles_col,
                                is_base_smiles=True,
                                response_col=act_data.value_col,
                                max_for_mcs=100,
                                out_dir=output_img_dir)

                        feat_type = 'ECFP'
                        dist_metric = 'tanimoto'
                        smiles_lst1 = sub_df[act_data.base_smiles_col].tolist()
                        calc_type = 'nearest'
                        dist_sample = cd.calc_dist_smiles(
                            feat_type, dist_metric, smiles_lst1, None,
                            calc_type)
                        sns.distplot(dist_sample,
                                     kde=False,
                                     axlabel=feat_type + '_' + dist_metric +
                                     '_' + calc_type)
                        label = "Nearest distance between compound pairs"
                        plt.title(label)
                        pdf.savefig(fig)

                ## save curated form of each dataset
                if target_name not in comb:
                    comb[target_name] = []
                ndata = CustomActivityDump(dataset=act_data, df=sub_df)
                comb[target_name].append(ndata)
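
# Hedged standalone sketch (added for illustration): the nearest-neighbor distance
# histogram from the fragment above, rewritten as a self-contained function. It uses
# sns.histplot, the non-deprecated counterpart of sns.distplot; the SMILES list is
# supplied by the caller.
def _demo_nearest_dist_histogram(smiles_list,
                                 feat_type='ECFP',
                                 dist_metric='tanimoto'):
    # dist_sample is expected to be a 1-D array of each compound's nearest-neighbor
    # distance within smiles_list.
    dist_sample = cd.calc_dist_smiles(feat_type, dist_metric, smiles_list, None,
                                      'nearest')
    fig, ax = plt.subplots()
    sns.histplot(dist_sample, ax=ax)
    ax.set_xlabel(feat_type + '_' + dist_metric + '_nearest')
    ax.set_title("Nearest distance between compound pairs")
    return fig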
# Example 6
def generateTestset(dtc_df, ex_df, ch_df, test_fraction=0.15):
    """
    Build test and training subsets spanning the DTC, ExCAPE and ChEMBL datasets by
    drawing a random candidate pool and then selecting, from each source in proportion
    to its size, the candidates most distant (by Tanimoto distance) from the remaining
    training compounds. Returns test and training dataframes for each of the three
    sources.
    """
    n_dtc = dtc_df.shape[0]
    n_ex = ex_df.shape[0]
    n_ch = ch_df.shape[0]

    sumOfthree = n_dtc + n_ex + n_ch
    print("Number of total samples: ", sumOfthree)

    dtc_s = dtc_df['base_rdkit_smiles'].tolist()
    ex_s = ex_df['base_rdkit_smiles'].tolist()
    ch_s = ch_df['base_rdkit_smiles'].tolist()

    #============================
    # Union data sets in order
    #============================
    union_df = pd.concat(
        [
            dtc_df['base_rdkit_smiles'], ex_df['base_rdkit_smiles'],
            ch_df['base_rdkit_smiles']
        ],
        ignore_index=True).drop_duplicates().reset_index(drop=True)
    unionOrder_s = union_df.tolist()
    print("Number of unique rdkit smiles in all three data sets: ",
          union_df.shape[0])
    print("Number of rdkit smiles in the union set: ", len(unionOrder_s))
    n_union = union_df.shape[0]

    union_df = union_df.to_frame()

    # Compute nearest distances to each of the data sources
    feat_type = 'ECFP'
    dist_metric = 'tanimoto'
    calc_type = 'nearest'
    #calc_type='all'
    unionOrder_dtc_nndist = cd.calc_dist_smiles(feat_type, dist_metric,
                                                unionOrder_s, dtc_s, calc_type)
    unionOrder_ex_nndist = cd.calc_dist_smiles(feat_type, dist_metric,
                                               unionOrder_s, ex_s, calc_type)
    unionOrder_ch_nndist = cd.calc_dist_smiles(feat_type, dist_metric,
                                               unionOrder_s, ch_s, calc_type)
    # Sum of the nearest distances to all three data sources; since each compound has
    # zero distance to its own source, this amounts to the distance to the other two.
    unionOrder_all_nndist = unionOrder_dtc_nndist + unionOrder_ex_nndist + unionOrder_ch_nndist

    # Add sum of distances as a column to the dataframe
    union_df['sumDist'] = unionOrder_all_nndist
    print(unionOrder_all_nndist.shape)
    print(union_df.shape)
    print(len(unionOrder_s))

    #----------------------------
    # Randomly select a candidate subset (deliberately larger than the final test
    # fraction; it is trimmed down below)
    #----------------------------
    fraction = 0.4
    subset_tmp_df = union_df.sample(frac=fraction)
    print("Number of randomly selected samples: ", subset_tmp_df.shape[0])

    subset_tmp_s = subset_tmp_df['base_rdkit_smiles'].tolist()
    # Find the intersection with the DTC, ExCAPE and ChEMBL sets. (intersection() and
    # the Union() helper used further below are utility functions defined elsewhere in
    # the original module.)
    subset_dtc = intersection(subset_tmp_s, dtc_s)
    subset_ex = intersection(subset_tmp_s, ex_s)
    subset_ch = intersection(subset_tmp_s, ch_s)

    #=====================================================
    # Compute distances to training

    union_df['temp_split'] = 'train'
    union_df.loc[union_df['base_rdkit_smiles'].isin(subset_tmp_s),
                 'temp_split'] = 'test'

    union_train_s = union_df.loc[union_df['temp_split'] == 'train',
                                 'base_rdkit_smiles']
    union_test_s = union_df.loc[union_df['temp_split'] == 'test',
                                'base_rdkit_smiles']

    feat_type = 'ECFP'
    dist_metric = 'tanimoto'
    calc_type = 'nearest'
    testDist2train = cd.calc_dist_smiles(feat_type, dist_metric, union_test_s,
                                         union_train_s, calc_type)
    print('distance from test set to training set, length: ',
          testDist2train.shape)

    subset_tmp_df['dist2train'] = testDist2train
    subset_tmp_df = subset_tmp_df.sort_values(by=['dist2train'],
                                              ascending=False)

    #============================================================
    # Sample equal proportions from each source, starting from the top distances.
    # Force-assign a source group to each compound; the assignments are applied in
    # sequence, so a compound present in more than one source ends up labeled by the
    # last matching rule ('dtc').
    subset_tmp_df['source'] = 'dtc'
    subset_tmp_df.loc[subset_tmp_df['base_rdkit_smiles'].isin(ex_s),
                      'source'] = 'excape'
    subset_tmp_df.loc[subset_tmp_df['base_rdkit_smiles'].isin(ch_s),
                      'source'] = 'chembl'
    subset_tmp_df.loc[subset_tmp_df['base_rdkit_smiles'].isin(dtc_s),
                      'source'] = 'dtc'

    # Work on copies so the in-place sorts below do not trigger pandas
    # SettingWithCopy warnings.
    subset_dtc_df = subset_tmp_df.loc[subset_tmp_df['source'] == 'dtc'].copy()
    subset_ex_df = subset_tmp_df.loc[subset_tmp_df['source'] == 'excape'].copy()
    subset_ch_df = subset_tmp_df.loc[subset_tmp_df['source'] == 'chembl'].copy()

    subset_dtc_df.sort_values(['dist2train', 'sumDist'],
                              inplace=True,
                              ascending=False)
    subset_ex_df.sort_values(['dist2train', 'sumDist'],
                             inplace=True,
                             ascending=False)
    subset_ch_df.sort_values(['dist2train', 'sumDist'],
                             inplace=True,
                             ascending=False)

    #================
    # Size each source's selection so the combined test set is about test_fraction of
    # the union set, with each source contributing in proportion to its size.
    nSample_dtc = round(n_union * test_fraction * n_dtc / sumOfthree)
    nSample_ex = round(n_union * test_fraction * n_ex / sumOfthree)
    nSample_ch = round(n_union * test_fraction * n_ch / sumOfthree)

    subset_sel_dtc_df = subset_dtc_df.head(nSample_dtc)
    subset_sel_ex_df = subset_ex_df.head(nSample_ex)
    subset_sel_ch_df = subset_ch_df.head(nSample_ch)

    subset_sel_dtc = subset_sel_dtc_df['base_rdkit_smiles']
    subset_sel_ex = subset_sel_ex_df['base_rdkit_smiles']
    subset_sel_ch = subset_sel_ch_df['base_rdkit_smiles']

    dtc_set = set(subset_sel_dtc)
    ex_set = set(subset_sel_ex)
    ch_set = set(subset_sel_ch)

    # Find the true source groups
    # Union three selected subsets
    subset_sel_s = Union(dtc_set, ex_set, ch_set)
    subset_sel_df = subset_tmp_df.loc[subset_tmp_df['base_rdkit_smiles'].isin(
        subset_sel_s)]

    subset_sel_dtc_df = subset_sel_df.loc[
        subset_sel_df['base_rdkit_smiles'].isin(subset_dtc)]
    subset_sel_ex_df = subset_sel_df.loc[
        subset_sel_df['base_rdkit_smiles'].isin(subset_ex)]
    subset_sel_ch_df = subset_sel_df.loc[
        subset_sel_df['base_rdkit_smiles'].isin(subset_ch)]

    # Gather original dataframe features
    testset_dtc_df = dtc_df.loc[dtc_df['base_rdkit_smiles'].isin(subset_sel_s)]
    trainset_dtc_df = dtc_df.loc[~dtc_df['base_rdkit_smiles'].isin(subset_sel_s
                                                                   )]
    testset_ex_df = ex_df.loc[ex_df['base_rdkit_smiles'].isin(subset_sel_s)]
    trainset_ex_df = ex_df.loc[~ex_df['base_rdkit_smiles'].isin(subset_sel_s)]
    testset_ch_df = ch_df.loc[ch_df['base_rdkit_smiles'].isin(subset_sel_s)]
    trainset_ch_df = ch_df.loc[~ch_df['base_rdkit_smiles'].isin(subset_sel_s)]
    #testset_union_df = union_df.loc[union_df['base_rdkit_smiles'].isin(subset_sel_s)]
    #trainset_union_df = union_df.loc[~union_df['base_rdkit_smiles'].isin(subset_sel_s)]

    return testset_dtc_df, testset_ex_df, testset_ch_df, trainset_dtc_df, trainset_ex_df, trainset_ch_df
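
# Hedged usage sketch (added for illustration): file names are hypothetical and each
# CSV is assumed to carry a 'base_rdkit_smiles' column, as generateTestset expects.
def _demo_generate_testset():
    dtc_df = pd.read_csv('dtc_curated.csv')
    ex_df = pd.read_csv('excape_curated.csv')
    ch_df = pd.read_csv('chembl_curated.csv')
    (test_dtc, test_ex, test_ch,
     train_dtc, train_ex, train_ch) = generateTestset(dtc_df, ex_df, ch_df,
                                                      test_fraction=0.15)
    print(len(test_dtc), len(test_ex), len(test_ch))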