Python base_mol_from_smilesの例

プログラミング言語: Python

名前空間/パッケージ名: atomsci.ddm.utils.struct_utils

メソッド/関数: base_mol_from_smiles

hotexamples.comのコード掲載数: 5

Python base_mol_from_smiles - 5件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのatomsci.ddm.utils.struct_utils.base_mol_from_smilesの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示

def diversity_plots(dset_key, datastore=True, bucket='gsk_ml', title_prefix=None, ecfp_radius=4, out_dir=None,
                    id_col='compound_id', smiles_col='rdkit_smiles', max_for_mcs=300):
    """
    Plot visualizations of diversity for an arbitrary table of compounds. At minimum, the file should contain
    columns for a compound ID and a SMILES string.

    Produces a UMAP projection and a clustered heatmap based on the Tanimoto distance between
    ECFP fingerprints. When the number of usable compounds is at most `max_for_mcs`, the same two
    plots are also produced from the MCS distance matrix, together with a table of pairwise distances.

    Args:
        dset_key (str): Datastore key or file path of the dataset.
        datastore (bool): If True, load the dataset from the datastore; otherwise read it as a CSV file.
        bucket (str): Datastore bucket, used only when `datastore` is True.
        title_prefix (str): Prefix for plot titles. Defaults to the dataset file name with
            underscores replaced by spaces.
        ecfp_radius (int): Radius passed to the Morgan fingerprint calculation.
        out_dir (str): If given, plots are written there as PDF files, along with CSV tables and
            PNG images of the closest compound pairs.
        id_col (str): Column containing compound IDs.
        smiles_col (str): Column containing SMILES strings.
        max_for_mcs (int): Largest number of compounds for which the MCS-based plots are generated.
    """
    # Load table of compound names, IDs and SMILES strings
    if datastore:
        cmpd_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket)
    else:
        cmpd_df = pd.read_csv(dset_key, index_col=False)
    file_prefix = os.path.splitext(os.path.basename(dset_key))[0]
    if title_prefix is None:
        title_prefix = file_prefix.replace('_', ' ')
    compound_ids = cmpd_df[id_col].values
    smiles_strs = cmpd_df[smiles_col].values
    print(len(smiles_strs))

    def _save_pdf(figure, suffix):
        """Write `figure` to '<out_dir>/<file_prefix>_<suffix>.pdf' when an output directory was given."""
        if out_dir is None:
            return
        # The context manager closes the PDF even if savefig raises.
        with PdfPages('%s/%s_%s.pdf' % (out_dir, file_prefix, suffix)) as pdf:
            pdf.savefig(figure)

    # Strip salts, canonicalize SMILES strings and create RDKit Mol objects
    print("Canonicalizing molecules...")
    base_mols = [struct_utils.base_mol_from_smiles(smiles) for smiles in smiles_strs]
    has_mol = [mol is not None for mol in base_mols]
    for i, ok in enumerate(has_mol):
        if not ok:
            print('Unable to get base molecule for compound %d = %s' % (i, compound_ids[i]))
    # Drop compounds without a usable molecule so that molecules, fingerprints and IDs stay
    # aligned; otherwise the ID column would not match the distance matrices below.
    base_mols = [mol for mol in base_mols if mol is not None]
    compound_ids = cmpd_df[has_mol][id_col].values
    ncmpds = len(base_mols)
    print("Done")

    # Generate ECFP fingerprints
    print("Computing fingerprints...")
    fps = [AllChem.GetMorganFingerprintAsBitVect(mol, ecfp_radius, 1024) for mol in base_mols]
    print("Done")

    if ncmpds <= max_for_mcs:
        # Get MCS distance matrix and draw a heatmap
        print("Computing MCS distance matrix...")
        mcs_dist = dm.mcs(base_mols)
        print("Done")
        # Tabulate the distance for every unordered pair (i < j) of compounds
        pairs = [(i, j) for i in range(ncmpds - 1) for j in range(i + 1, ncmpds)]
        dist_df = pd.DataFrame({'compound_1' : [compound_ids[i] for i, _ in pairs],
                                'compound_2' : [compound_ids[j] for _, j in pairs],
                                'dist' : [mcs_dist[i, j] for i, j in pairs],
                                'i' : [i for i, _ in pairs],
                                'j' : [j for _, j in pairs]})
        dist_df = dist_df.sort_values(by='dist')
        print(dist_df.head(10))
        if out_dir is not None:
            dist_df.to_csv('%s/%s_mcs_dist_table.csv' % (out_dir, file_prefix), index=False)
            # Draw the (up to) 10 closest pairs; small datasets have fewer than 10 pairs.
            for k in range(min(10, len(dist_df))):
                idx_i = dist_df.i.values[k]
                idx_j = dist_df.j.values[k]
                img_file_i = '%s/%d_%s.png' % (out_dir, k, compound_ids[idx_i])
                img_file_j = '%s/%d_%s.png' % (out_dir, k, compound_ids[idx_j])
                Draw.MolToFile(base_mols[idx_i], img_file_i, size=(500,500), fitImage=False)
                Draw.MolToFile(base_mols[idx_j], img_file_j, size=(500,500), fitImage=False)

        # NOTE(review): scipy's linkage() treats a 2-D array as observation vectors rather than
        # as a square distance matrix -- confirm whether a condensed matrix was intended here.
        mcs_linkage = linkage(mcs_dist, method='complete')
        mcs_df = pd.DataFrame(mcs_dist, columns=compound_ids, index=compound_ids)
        g = sns.clustermap(mcs_df, row_linkage=mcs_linkage, col_linkage=mcs_linkage, figsize=(12,12), cmap='plasma')
        _save_pdf(g.fig, 'mcs_clustermap')

        # Draw a UMAP projection based on MCS distance
        mapper = umap.UMAP(n_neighbors=10, n_components=2, metric='precomputed', random_state=17)
        reps = mapper.fit_transform(mcs_dist)
        rep_df = pd.DataFrame.from_records(reps, columns=['x', 'y'])
        rep_df['compound_id'] = compound_ids
        fig, ax = plt.subplots(figsize=(12,12))
        sns.scatterplot(x='x', y='y', data=rep_df, ax=ax)
        ax.set_title("%s, 2D projection based on MCS distance" % title_prefix)
        _save_pdf(fig, 'mcs_umap_proj')
        if out_dir is not None:
            rep_df.to_csv('%s/%s_mcs_umap_proj.csv' % (out_dir, file_prefix), index=False)

    # Get Tanimoto distance matrix
    print("Computing Tanimoto distance matrix...")
    tani_dist = dm.tanimoto(fps)
    print("Done")
    # Draw a UMAP projection based on Tanimoto distance
    mapper = umap.UMAP(n_neighbors=10, n_components=2, metric='precomputed', random_state=17)
    reps = mapper.fit_transform(tani_dist)
    rep_df = pd.DataFrame.from_records(reps, columns=['x', 'y'])
    rep_df['compound_id'] = compound_ids
    fig, ax = plt.subplots(figsize=(12,12))
    sns.scatterplot(x='x', y='y', data=rep_df, ax=ax)
    ax.set_title("%s, 2D projection based on Tanimoto distance" % title_prefix)
    _save_pdf(fig, 'tani_umap_proj')

    # Draw a cluster heatmap based on Tanimoto distance
    # NOTE(review): same linkage() caveat as for the MCS matrix if dm.tanimoto returns a square matrix.
    tani_linkage = linkage(tani_dist, method='complete')
    tani_df = pd.DataFrame(tani_dist, columns=compound_ids, index=compound_ids)
    g = sns.clustermap(tani_df, row_linkage=tani_linkage, col_linkage=tani_linkage, figsize=(12,12), cmap='plasma')
    _save_pdf(g.fig, 'tanimoto_clustermap')

コード例 #2

ファイルを表示

def compare_obach_gsk_aq_sol(ecfp_radius=6):
    """
    Plot projections of Obach and GSK solubility datasets using the same UMAP projectors.

    A random sample of 732 Obach compounds and the GSK aqueous solubility compounds (minus any
    SMILES strings shared with the Obach sample) are converted to ECFP fingerprints. A UMAP
    projector is fitted on the Obach fingerprints and used to project both sets; the combined
    scatter plot is written to a PDF file in the hard-coded output directory.

    Args:
        ecfp_radius (int): Radius passed to the Morgan fingerprint calculation.
    """
    obach_cmpd_file = '/usr/local/data/diversity_plots/obach/LombardoSupplemental_Data_rdsmiles.csv'
    out_dir = '/usr/local/data/diversity_plots/obach'
    obach_id_col = 'Name'
    gsk_id_col = 'compound_id'
    smiles_col = 'rdkit_smiles'

    def _fingerprint_dataset(cmpd_df, id_col):
        """Return (fingerprints, compound IDs) for the rows whose SMILES yield a base molecule."""
        ids = cmpd_df[id_col].values
        # Strip salts, canonicalize SMILES strings and create RDKit Mol objects
        mols = [struct_utils.base_mol_from_smiles(smiles) for smiles in cmpd_df[smiles_col].values]
        has_mol = [mol is not None for mol in mols]
        for i, ok in enumerate(has_mol):
            if not ok:
                # Report against this dataset's own IDs
                print('Unable to get base molecule for compound %d = %s' % (i, ids[i]))
        fps = [AllChem.GetMorganFingerprintAsBitVect(mol, ecfp_radius, 1024) for mol in mols if mol is not None]
        # Keep only the IDs of compounds that produced a fingerprint so both stay aligned
        return fps, cmpd_df[has_mol][id_col].values

    # Load table of compound names, IDs and SMILES strings
    obach_cmpd_df = pd.read_csv(obach_cmpd_file, index_col=False)
    # Sample the same number of compounds as in the GSK set
    obach_cmpd_df = obach_cmpd_df.sample(n=732, axis=0)
    obach_smiles_strs = obach_cmpd_df[smiles_col].values
    obach_fps, obach_compound_ids = _fingerprint_dataset(obach_cmpd_df, obach_id_col)

    # Load the GSK dataset
    gsk_data_dir = '/ds/data/gsk_data/GSK_datasets/solubility'
    gsk_cmpd_file = '%s/ATOM_GSK_Solubility_Aqueous.csv' % gsk_data_dir
    gsk_cmpd_df = pd.read_csv(gsk_cmpd_file, index_col=False)
    gsk_smiles_strs = gsk_cmpd_df[smiles_col].values
    # Check for common structures between datasets
    dup_smiles = list(set(gsk_smiles_strs) & set(obach_smiles_strs))
    print("GSK and Obach compound sets have %d SMILES strings in common" % len(dup_smiles))
    if dup_smiles:
        gsk_cmpd_df = gsk_cmpd_df[~gsk_cmpd_df[smiles_col].isin(dup_smiles)]
    gsk_fps, gsk_compound_ids = _fingerprint_dataset(gsk_cmpd_df, gsk_id_col)

    # Train a UMAP projector with Obach set, then use it to project both data sets
    obach_mapper = umap.UMAP(n_neighbors=10, n_components=2, metric='jaccard', random_state=17)
    obach_reps = obach_mapper.fit_transform(obach_fps)
    gsk_reps = obach_mapper.transform(gsk_fps)
    obach_rep_df = pd.DataFrame.from_records(obach_reps, columns=['x', 'y'])
    obach_rep_df['compound_id'] = obach_compound_ids
    obach_rep_df['dataset'] = 'Obach'
    gsk_rep_df = pd.DataFrame.from_records(gsk_reps, columns=['x', 'y'])
    gsk_rep_df['compound_id'] = gsk_compound_ids
    gsk_rep_df['dataset'] = 'GSK Aq Sol'
    rep_df = pd.concat((obach_rep_df, gsk_rep_df), ignore_index=True)

    dataset_pal = {'Obach' : 'blue', 'GSK Aq Sol' : 'orange'}
    fig, ax = plt.subplots(figsize=(12,12))
    sns.scatterplot(x='x', y='y', ax=ax, hue='dataset', style='dataset', palette=dataset_pal, data=rep_df)
    ax.set_title("Obach and GSK solubility dataset fingerprints, UMAP projection trained on Obach data", fontdict={'fontsize' : 12})
    # The context manager closes the PDF even if savefig raises
    with PdfPages('%s/obach_gsk_aq_sol_umap_proj.pdf' % out_dir) as pdf:
        pdf.savefig(fig)

コード例 #3

ファイルを表示

def obach_diversity_plots(ecfp_radius=6):
    """
    Plot visualizations of diversity for the compounds in the Obach, Lombardo et al PK dataset

    Writes two PDF files to the hard-coded output directory: a two-page UMAP projection based on
    Tanimoto distance (all points, then only the points with x > -20 and y > -20), and a
    clustered heatmap of the Tanimoto distance matrix.

    Args:
        ecfp_radius (int): Radius passed to the Morgan fingerprint calculation.
    """
    # TODO: Put this dataset in the datastore where everybody else can see it
    cmpd_file = '/usr/local/data/diversity_plots/obach/LombardoSupplemental_Data_rdsmiles.csv'
    out_dir = '/usr/local/data/diversity_plots/obach'
    os.makedirs(out_dir, exist_ok=True)
    file_prefix = 'obach'
    title_prefix = 'Obach PK compound set'
    id_col = 'Name'
    smiles_col = 'rdkit_smiles'

    # Load table of compound names, IDs and SMILES strings
    cmpd_df = pd.read_csv(cmpd_file, index_col=False)
    compound_ids = cmpd_df[id_col].values
    smiles_strs = cmpd_df[smiles_col].values

    # Strip salts, canonicalize SMILES strings and create RDKit Mol objects
    print("Canonicalizing molecules...")
    base_mols = [struct_utils.base_mol_from_smiles(smiles) for smiles in smiles_strs]
    has_mol = [mol is not None for mol in base_mols]
    for i, ok in enumerate(has_mol):
        if not ok:
            print('Unable to get base molecule for compound %d = %s' % (i, compound_ids[i]))
    # Drop compounds without a usable molecule so fingerprints and IDs stay aligned
    base_mols = [mol for mol in base_mols if mol is not None]
    compound_ids = cmpd_df[has_mol][id_col].values
    print("Done")

    # Generate ECFP fingerprints
    print("Computing fingerprints...")
    fps = [AllChem.GetMorganFingerprintAsBitVect(mol, ecfp_radius, 1024) for mol in base_mols]
    print("Done")

    # Get Tanimoto distance matrix
    print("Computing Tanimoto distance matrix...")
    tani_dist = dm.tanimoto(fps)
    print("Done")
    # Draw a UMAP projection based on Tanimoto distance
    mapper = umap.UMAP(n_neighbors=10, n_components=2, metric='precomputed', random_state=17)
    reps = mapper.fit_transform(tani_dist)
    rep_df = pd.DataFrame.from_records(reps, columns=['x', 'y'])
    rep_df['compound_id'] = compound_ids
    full_fig, full_ax = plt.subplots(figsize=(12,12))
    sns.scatterplot(x='x', y='y', data=rep_df, ax=full_ax)
    full_ax.set_title("%s, 2D projection based on Tanimoto distance" % title_prefix)

    # Second view restricted to the main cluster of points
    main_rep_df = rep_df[(rep_df.x > -20) & (rep_df.y > -20)]
    main_fig, main_ax = plt.subplots(figsize=(12,12))
    sns.scatterplot(x='x', y='y', data=main_rep_df, ax=main_ax)
    main_ax.set_title("%s, main portion, 2D projection based on Tanimoto distance" % title_prefix)

    # Save both projections as pages of one PDF; previously the full projection was never
    # written because its figure handle was overwritten before the only savefig call.
    with PdfPages('%s/%s_tani_umap_proj.pdf' % (out_dir, file_prefix)) as pdf:
        pdf.savefig(full_fig)
        pdf.savefig(main_fig)

    # Draw a cluster heatmap based on Tanimoto distance
    # NOTE(review): scipy's linkage() treats a 2-D array as observation vectors rather than as a
    # square distance matrix -- confirm the shape returned by dm.tanimoto.
    tani_linkage = linkage(tani_dist, method='complete')
    tani_df = pd.DataFrame(tani_dist, columns=compound_ids, index=compound_ids)
    g = sns.clustermap(tani_df, row_linkage=tani_linkage, col_linkage=tani_linkage, figsize=(12,12), cmap='plasma')
    with PdfPages('%s/%s_tanimoto_clustermap.pdf' % (out_dir, file_prefix)) as pdf:
        pdf.savefig(g.fig)

コード例 #4

ファイルを表示

def compare_solubility_datasets(ecfp_radius=6):
    """
    Plot projections of Delaney and GSK solubility datasets using the same UMAP projectors.

    Both datasets are converted to ECFP fingerprints (GSK compounds whose SMILES strings also
    occur in the Delaney set are removed first). Two PDF plots are written to the hard-coded
    output directory: one using a UMAP projector fitted on the Delaney fingerprints, and one
    using a projector fitted on the GSK fingerprints.

    Args:
        ecfp_radius (int): Radius passed to the Morgan fingerprint calculation.
    """
    data_dir = '/ds/data/gsk_data/GSK_datasets/solubility'
    del_cmpd_file = '%s/delaney-processed.csv' % data_dir
    gsk_cmpd_file = '%s/ATOM_GSK_Solubility_Aqueous.csv' % data_dir
    out_dir = '/usr/local/data/diversity_plots/solubility'
    smiles_col = 'rdkit_smiles'
    del_id_col = 'Compound ID'
    gsk_id_col = 'compound_id'
    dataset_pal = {'Delaney' : 'forestgreen', 'GSK Aq Sol' : 'orange'}

    def _fingerprint_dataset(cmpd_df, id_col):
        """Return (fingerprints, compound IDs) for the rows whose SMILES yield a base molecule."""
        ids = cmpd_df[id_col].values
        # Strip salts, canonicalize SMILES strings and create RDKit Mol objects
        mols = [struct_utils.base_mol_from_smiles(smiles) for smiles in cmpd_df[smiles_col].values]
        has_mol = [mol is not None for mol in mols]
        for i, ok in enumerate(has_mol):
            if not ok:
                # Report against this dataset's own IDs
                print('Unable to get base molecule for compound %d = %s' % (i, ids[i]))
        fps = [AllChem.GetMorganFingerprintAsBitVect(mol, ecfp_radius, 1024) for mol in mols if mol is not None]
        # Keep only the IDs of compounds that produced a fingerprint so both stay aligned
        return fps, cmpd_df[has_mol][id_col].values

    def _labeled_frame(reps, ids, dataset):
        """Build a table of 2D coordinates tagged with compound IDs and the dataset name."""
        frame = pd.DataFrame.from_records(reps, columns=['x', 'y'])
        frame['compound_id'] = ids
        frame['dataset'] = dataset
        return frame

    def _plot_projection(rep_df, title, pdf_name):
        """Draw the combined scatter plot and write it to `pdf_name` in the output directory."""
        fig, ax = plt.subplots(figsize=(12,12))
        sns.scatterplot(x='x', y='y', ax=ax, hue='dataset', style='dataset', palette=dataset_pal, data=rep_df)
        ax.set_title(title, fontdict={'fontsize' : 12})
        # The context manager closes the PDF even if savefig raises
        with PdfPages('%s/%s' % (out_dir, pdf_name)) as pdf:
            pdf.savefig(fig)

    # Load table of compound names, IDs and SMILES strings
    del_cmpd_df = pd.read_csv(del_cmpd_file, index_col=False)
    del_smiles_strs = del_cmpd_df[smiles_col].values
    del_fps, del_compound_ids = _fingerprint_dataset(del_cmpd_df, del_id_col)

    gsk_cmpd_df = pd.read_csv(gsk_cmpd_file, index_col=False)
    gsk_smiles_strs = gsk_cmpd_df[smiles_col].values
    # Check for common structures between datasets
    dup_smiles = list(set(gsk_smiles_strs) & set(del_smiles_strs))
    print("GSK and Delaney compound sets have %d SMILES strings in common" % len(dup_smiles))
    if dup_smiles:
        gsk_cmpd_df = gsk_cmpd_df[~gsk_cmpd_df[smiles_col].isin(dup_smiles)]
    gsk_fps, gsk_compound_ids = _fingerprint_dataset(gsk_cmpd_df, gsk_id_col)

    # Train a UMAP projector with Delaney set, then use it to project both data sets
    del_mapper = umap.UMAP(n_neighbors=10, n_components=2, metric='jaccard', random_state=17)
    del_reps = del_mapper.fit_transform(del_fps)
    gsk_reps = del_mapper.transform(gsk_fps)
    rep_df = pd.concat((_labeled_frame(del_reps, del_compound_ids, 'Delaney'),
                        _labeled_frame(gsk_reps, gsk_compound_ids, 'GSK Aq Sol')), ignore_index=True)
    _plot_projection(rep_df,
                     "Solubility dataset fingerprints, UMAP projection trained on Delaney data",
                     'delaney_gsk_aq_sol_umap_proj.pdf')

    # Train a UMAP projector with GSK set, then use it to project both data sets
    gsk_mapper = umap.UMAP(n_neighbors=10, n_components=2, metric='jaccard', random_state=17)
    gsk_reps = gsk_mapper.fit_transform(gsk_fps)
    del_reps = gsk_mapper.transform(del_fps)
    rep_df = pd.concat((_labeled_frame(gsk_reps, gsk_compound_ids, 'GSK Aq Sol'),
                        _labeled_frame(del_reps, del_compound_ids, 'Delaney')), ignore_index=True)
    _plot_projection(rep_df,
                     "Solubility dataset fingerprints, UMAP projection trained on GSK aqueous solubility data",
                     'gsk_aq_sol_delaney_umap_proj.pdf')

コード例 #5

ファイルを表示

ファイル: diversity_plots.py プロジェクト: stewarthe6/AMPL-1

def diversity_plots(dset_key,
                    datastore=True,
                    bucket='public',
                    title_prefix=None,
                    ecfp_radius=4,
                    umap_file=None,
                    out_dir=None,
                    id_col='compound_id',
                    smiles_col='rdkit_smiles',
                    is_base_smiles=False,
                    response_col=None,
                    max_for_mcs=300):
    """
    Plot visualizations of diversity for an arbitrary table of compounds. At minimum, the file should contain
    columns for a compound ID and a SMILES string. Produces a clustered heatmap display of Tanimoto distances between
    compounds along with a 2D UMAP projection plot based on ECFP fingerprints, with points colored according to the response
    variable.

    Rows with duplicate SMILES strings are dropped, as are compounds for which no molecule could be
    created from the SMILES string.

    Args:
        dset_key (str): Datastore key or filepath for dataset.

        datastore (bool): Whether to load dataset from datastore or from filesystem.

        bucket (str): Name of datastore bucket containing dataset.

        title_prefix (str): Prefix for plot titles.

        ecfp_radius (int): Radius for ECFP fingerprint calculation.

        umap_file (str, optional): Path to file to write UMAP coordinates to.

        out_dir (str, optional):  Output directory for plots and tables. If provided, plots will be output as PDF files rather
            than in the current notebook, and some additional CSV files will be generated.

        id_col (str): Column in dataset containing compound IDs.

        smiles_col (str): Column in dataset containing SMILES strings.

        is_base_smiles (bool): True if SMILES strings do not need to be salt-stripped and standardized.

        response_col (str): Column in dataset containing response values.

        max_for_mcs (int): Maximum dataset size for plots based on MCS distance. If the number of compounds is at most this
            value, an additional cluster heatmap and UMAP projection plot will be produced based on maximum common substructure
            distance.

    """
    # Load table of compound names, IDs and SMILES strings
    if datastore:
        cmpd_df = dsf.retrieve_dataset_by_datasetkey(dset_key, bucket)
    else:
        cmpd_df = pd.read_csv(dset_key, index_col=False)
    cmpd_df = cmpd_df.drop_duplicates(subset=smiles_col)
    file_prefix = os.path.splitext(os.path.basename(dset_key))[0]
    if title_prefix is None:
        title_prefix = file_prefix.replace('_', ' ')
    compound_ids = cmpd_df[id_col].values
    smiles_strs = cmpd_df[smiles_col].values
    ncmpds = len(smiles_strs)

    def _save_pdf(figure, suffix):
        """Write `figure` to '<out_dir>/<file_prefix>_<suffix>.pdf' when an output directory was given."""
        if out_dir is None:
            return
        # The context manager closes the PDF even if savefig raises.
        with PdfPages('%s/%s_%s.pdf' % (out_dir, file_prefix, suffix)) as pdf:
            pdf.savefig(figure)

    def _scatter(rep_df, ax):
        """Draw the projection, colored by response when response values are available."""
        if responses is None:
            sns.scatterplot(x='x', y='y', data=rep_df, ax=ax)
        else:
            sns.scatterplot(x='x',
                            y='y',
                            hue='response',
                            palette=colorpal,
                            data=rep_df,
                            ax=ax)

    # Strip salts, canonicalize SMILES strings and create RDKit Mol objects
    if is_base_smiles:
        base_mols = np.array([Chem.MolFromSmiles(s) for s in smiles_strs])
    else:
        print("Canonicalizing %d molecules..." % ncmpds)
        base_mols = np.array([
            struct_utils.base_mol_from_smiles(smiles) for smiles in smiles_strs
        ])
        for i, mol in enumerate(base_mols):
            if mol is None:
                print('Unable to get base molecule for compound %d = %s' %
                      (i, compound_ids[i]))
        print("Done")

    # Keep only the compounds for which a molecule could be created, so that molecules,
    # IDs and responses stay aligned.
    has_good_smiles = np.array([mol is not None for mol in base_mols])
    base_mols = base_mols[has_good_smiles]

    cmpd_df = cmpd_df[has_good_smiles]
    ncmpds = cmpd_df.shape[0]
    compound_ids = cmpd_df[id_col].values
    responses = None
    if response_col is not None:
        responses = cmpd_df[response_col].values
        uniq_responses = set(responses)
        if uniq_responses == set([0, 1]):
            # Binary response
            colorpal = {0: 'forestgreen', 1: 'red'}
        elif len(uniq_responses) <= 10:
            # Categorical response: one color per distinct value
            colorpal = sns.color_palette('husl', n_colors=len(uniq_responses))
        else:
            # Continuous response: use a blended colormap
            colorpal = sns.blend_palette(['red', 'green', 'blue'],
                                         12,
                                         as_cmap=True)

    # Generate ECFP fingerprints
    print("Computing fingerprints...")
    fps = [
        AllChem.GetMorganFingerprintAsBitVect(mol, ecfp_radius, 1024)
        for mol in base_mols
    ]
    print("Done")

    if ncmpds <= max_for_mcs:
        # Get MCS distance matrix and draw a heatmap
        print("Computing MCS distance matrix...")
        mcs_dist = dm.mcs(base_mols)
        print("Done")
        # Tabulate the distance for every unordered pair (i < j) of compounds
        pairs = [(i, j) for i in range(ncmpds - 1)
                 for j in range(i + 1, ncmpds)]
        dist_df = pd.DataFrame({
            'compound_1': [compound_ids[i] for i, _ in pairs],
            'compound_2': [compound_ids[j] for _, j in pairs],
            'dist': [mcs_dist[i, j] for i, j in pairs],
            'i': [i for i, _ in pairs],
            'j': [j for _, j in pairs]
        })
        dist_df = dist_df.sort_values(by='dist')
        print(dist_df.head(10))
        if out_dir is not None:
            dist_df.to_csv('%s/%s_mcs_dist_table.csv' % (out_dir, file_prefix),
                           index=False)
            # Draw the (up to) 10 closest pairs; small datasets have fewer than 10 pairs.
            for k in range(min(10, len(dist_df))):
                idx_i = dist_df.i.values[k]
                idx_j = dist_df.j.values[k]
                img_file_i = '%s/%d_%s.png' % (out_dir, k, compound_ids[idx_i])
                img_file_j = '%s/%d_%s.png' % (out_dir, k, compound_ids[idx_j])
                Draw.MolToFile(base_mols[idx_i],
                               img_file_i,
                               size=(500, 500),
                               fitImage=False)
                Draw.MolToFile(base_mols[idx_j],
                               img_file_j,
                               size=(500, 500),
                               fitImage=False)

        # NOTE(review): scipy's linkage() treats a 2-D array as observation vectors rather than
        # as a square distance matrix -- confirm whether a condensed matrix was intended here.
        mcs_linkage = linkage(mcs_dist, method='complete')
        mcs_df = pd.DataFrame(mcs_dist,
                              columns=compound_ids,
                              index=compound_ids)
        g = sns.clustermap(mcs_df,
                           row_linkage=mcs_linkage,
                           col_linkage=mcs_linkage,
                           figsize=(12, 12),
                           cmap='plasma')
        _save_pdf(g.fig, 'mcs_clustermap')

        # Draw a UMAP projection based on MCS distance
        mapper = umap.UMAP(n_neighbors=20,
                           min_dist=0.1,
                           n_components=2,
                           metric='precomputed',
                           random_state=17)
        reps = mapper.fit_transform(mcs_dist)
        rep_df = pd.DataFrame.from_records(reps, columns=['x', 'y'])
        rep_df['compound_id'] = compound_ids
        if responses is not None:
            rep_df['response'] = responses
        fig, ax = plt.subplots(figsize=(12, 12))
        _scatter(rep_df, ax)
        ax.set_title("%s, 2D projection based on MCS distance" % title_prefix)
        _save_pdf(fig, 'mcs_umap_proj')
        if out_dir is not None:
            rep_df.to_csv('%s/%s_mcs_umap_proj.csv' % (out_dir, file_prefix),
                          index=False)

    # Get Tanimoto distance matrix
    print("Computing Tanimoto distance matrix...")
    tani_dist = dm.tanimoto(fps)
    print("Done")
    # Draw a UMAP projection based on Tanimoto distance
    mapper = umap.UMAP(n_neighbors=20,
                       min_dist=0.1,
                       n_components=2,
                       metric='precomputed',
                       random_state=17)
    reps = mapper.fit_transform(tani_dist)
    rep_df = pd.DataFrame.from_records(reps, columns=['x', 'y'])
    rep_df['compound_id'] = compound_ids
    if responses is not None:
        rep_df['response'] = responses
    if umap_file is not None:
        rep_df.to_csv(umap_file, index=False)
        print("Wrote UMAP mapping to %s" % umap_file)
    fig, ax = plt.subplots(figsize=(12, 12))
    _scatter(rep_df, ax)
    ax.set_title("%s, 2D projection based on Tanimoto distance" % title_prefix)
    _save_pdf(fig, 'tani_umap_proj')

    # Draw a cluster heatmap based on Tanimoto distance
    # NOTE(review): same linkage() caveat as for the MCS matrix if dm.tanimoto returns a square matrix.
    tani_linkage = linkage(tani_dist, method='complete')
    tani_df = pd.DataFrame(tani_dist, columns=compound_ids, index=compound_ids)
    g = sns.clustermap(tani_df,
                       row_linkage=tani_linkage,
                       col_linkage=tani_linkage,
                       figsize=(12, 12),
                       cmap='plasma')
    _save_pdf(g.fig, 'tanimoto_clustermap')