Ejemplo n.º 1
0
def plot_tsne(func):
    """Save a t-SNE scatter plot of molecules coloured by an objective value.

    A sample of up to 250 molecules is taken from ``get_chembl``, their
    pairwise OT distances are embedded with t-SNE (precomputed metric), and
    the points are coloured by the objective looked up via
    ``get_objective_by_name(func)``.  The figure is written as an EPS file
    into ``VIS_DIR``; nothing is returned.

    Args:
        func: Name of the objective, passed to ``get_objective_by_name`` and
            also used in the plot title / output file name.
    """
    sample_size = 250

    # Distances are computed on the SMILES strings of the sampled molecules.
    molecules = get_chembl(max_size=sample_size, as_mols=True)
    smiles = [molecule.smiles for molecule in molecules]

    title = f"{func} ot-dist"
    ot_distance = OTChemDistanceComputer(
        mass_assignment_method='molecular_mass',
        normalisation_method='total_mass',
        struct_pen_method='bond_frac')
    # The computer returns a sequence of matrices; only the first is used.
    pairwise = ot_distance(smiles, smiles)[0]

    # Other distance choices tried here in the past: the inverse of the
    # 'similarity_kernel' matrix, and the squared difference of numeric
    # fingerprints.

    embedding = TSNE(metric='precomputed').fit_transform(pairwise)

    # NOTE(review): the molecules are fetched a second time; the colours only
    # line up with the embedded points if both get_chembl calls return the
    # same molecules in the same order -- confirm.
    molecules = get_chembl(max_size=sample_size)
    for molecule in molecules:
        # Result is unused; the call is kept so the call sequence is unchanged.
        molecule.to_smiles()
    objective = get_objective_by_name(func)
    colours = [objective(molecule) for molecule in molecules]

    plt.title(title, fontsize=22)
    x_coords, y_coords = embedding[:, 0], embedding[:, 1]
    plt.scatter(x_coords, y_coords,
                c=colours, cmap=plt.cm.Spectral, s=15, alpha=0.8)
    plt.xticks([])
    plt.yticks([])

    out_name = title.replace(" ", "_") + '.eps'
    plt.savefig(os.path.join(VIS_DIR, out_name), format='eps', dpi=1000)
def compute_sa_score_datasets():
    """Print mean and standard deviation of the "sascore" objective.

    The objective is evaluated on up to 50 molecules from ``get_chembl`` and
    then on up to 50 molecules from ``get_zinc250``; one summary line is
    printed per dataset.  Nothing is returned.
    """
    scorer = get_objective_by_name("sascore")

    def _report(template, molecules):
        # Score every molecule, then print the mean / std summary line.
        scores = [scorer(molecule) for molecule in molecules]
        print(template.format(np.mean(scores), np.std(scores)))

    _report("ChEMBL: {:.3f} +- std {:.3f}", get_chembl(max_size=50))
    _report("ZINC: {:.3f} +- std {:.3f}", get_zinc250(max_size=50))
def compute_novel_percentage(mol_list):
    """Return the fraction of `mol_list` entries found in neither dataset.

    Each entry is compared against the ``.smiles`` attribute of every
    molecule returned by ``get_chembl(max_size=-1)`` and
    ``get_zinc250(max_size=-1)``.

    Args:
        mol_list: Sized collection of entries to look up.  They are compared
            with the datasets' ``.smiles`` values, so they are presumably
            SMILES strings (and must be hashable) -- verify against callers.

    Returns:
        ``1 - (number of entries present in either dataset) / len(mol_list)``
        as a float.

    Raises:
        ZeroDivisionError: If `mol_list` is empty.
    """
    # A single set gives O(1) membership tests; the previous list-based
    # lookup rescanned both full datasets for every entry in mol_list.
    known_smiles = {m.smiles for m in get_chembl(max_size=-1)}
    known_smiles.update(m.smiles for m in get_zinc250(max_size=-1))

    n_mols = len(mol_list)
    n_in_data = sum(1 for mol in tqdm(mol_list) if mol in known_smiles)
    return 1 - n_in_data / n_mols
def make_pairwise(func, n_mols, to_randomize=True):
    """Plot |property difference| against OT distance for molecule pairs.

    One subplot is drawn per distance matrix returned by a default
    ``OTChemDistanceComputer``.  Every unordered pair ``(i, j)`` with
    ``i <= j`` contributes one point.  With more than 12 molecules the points
    are drawn as a scatter; otherwise each point is printed and rendered as a
    text label (``*`` for a molecule paired with itself).  The figure is
    saved into ``VIS_DIR`` with a timestamped name, and the SMILES list is
    printed at the end.

    Args:
        func: ``'prop'`` to use the values from ``get_chembl_prop``, otherwise
            the name of an objective for ``get_objective_by_name``.
        n_mols: Number of molecules to compare.
        to_randomize: When true, five times as many molecules are requested
            before shuffling and truncating to `n_mols`.
    """
    if func == 'prop':
        smile_strings, smiles_to_prop = get_chembl_prop(n_mols=n_mols)
        prop_list = [smiles_to_prop[smiles] for smiles in smile_strings]
    else:
        if to_randomize:
            pool_size = 5 * n_mols
        else:
            pool_size = n_mols
        molecules = get_chembl(n_mols=pool_size)
        # The shuffle happens regardless of `to_randomize`; the flag only
        # controls how large the pool is.
        np.random.shuffle(molecules)
        molecules = molecules[:n_mols]
        smile_strings = [molecule.to_smiles() for molecule in molecules]
        objective = get_objective_by_name(func)
        prop_list = [objective(molecule) for molecule in molecules]

    dist_computer = OTChemDistanceComputer()  # default settings
    all_dists = dist_computer(smile_strings, smile_strings)

    # Four panels per row, and never fewer than two rows.
    num_rows = max(2, int(np.ceil(dist_computer.get_num_distances() / 4.0)))
    print(num_rows)
    _, axes_grid = plt.subplots(num_rows, 4, figsize=(15, 15))
    flat_axes = itertools.chain.from_iterable(axes_grid)

    for dist_index, (axis, dist_matrix) in enumerate(zip(flat_axes, all_dists)):
        x_vals, y_vals, index_pairs = [], [], []
        # Upper triangle including the diagonal, in row-major order.
        for row, col in itertools.combinations_with_replacement(
                range(n_mols), 2):
            x_vals.append(dist_matrix[row, col])
            y_vals.append(np.abs(prop_list[row] - prop_list[col]))
            index_pairs.append((row, col))

        axis.set_title(f'Distance {dist_index}')  # TODO: parameters of distance
        if n_mols > 12:
            axis.scatter(x_vals, y_vals, s=1, alpha=0.6)
            continue

        # Few molecules: label every point with its index pair instead.
        for x_val, y_val, pair in zip(x_vals, y_vals, index_pairs):
            print(x_val, y_val, pair)
            if pair[0] == pair[1]:
                axis.text(x_val, y_val, '*', fontsize=14)
            else:
                axis.text(x_val, y_val, '(%d, %d)' % (pair[0], pair[1]))
        axis.set_xlim((0.0, max(x_vals) * 1.25))

    timestamp = datetime.now().strftime('%m%d%H%M%S')
    out_name = "dist_vs_value_%d_%s_%s" % (n_mols, func, timestamp)
    plt.savefig(os.path.join(VIS_DIR, out_name))
    print(smile_strings, len(smile_strings))
Ejemplo n.º 5
0
def test(N=100):
    """Time pairwise OT-distance computations, grouped by total atom count.

    For every pair ``(i, j)`` with ``j < i`` among the first `N` molecules
    from ``get_chembl``, the time of one distance call (including the two
    ``to_smiles()`` conversions) is recorded under the key
    ``natoms[i] + natoms[j]``.

    Args:
        N: Number of molecules requested and number of indices iterated.

    Returns:
        The ``defaultdict`` mapping each total atom count to the mean elapsed
        time (seconds, as returned by ``time()``) over all pairs with that
        count.
    """
    computer = OTChemDistanceComputer()
    molecules = get_chembl(max_size=N, as_mols=True)
    atom_counts = [molecule.to_rdkit().GetNumAtoms() for molecule in molecules]

    times = defaultdict(list)
    for first in range(N):
        for second in range(first):
            started = time()
            computer([molecules[first].to_smiles()],
                     [molecules[second].to_smiles()])
            elapsed = time() - started
            pair_size = atom_counts[first] + atom_counts[second]
            times[pair_size].append(elapsed)

    # Collapse each list of timings to its mean, in place (keys are unchanged).
    for pair_size in times:
        times[pair_size] = np.mean(times[pair_size])
    return times
Ejemplo n.º 6
0
def make_pairwise_kernel(kernel_name, func, **kwargs):
    """Plot |objective difference| against a kernel-derived pair distance.

    Up to 100 molecules are taken from ``get_chembl``; every ordered pair
    ``(i, j)`` contributes one point whose x-value depends on ``mode``:

    * ``"inverse_sim"``: ``1 / K[i, j]``
    * ``"scaled_kernel"``: ``1 / K[i, j] / sqrt(K[i, i] * K[j, j])``
    * ``"fps_distance"``: squared difference of the numeric fingerprints

    The scatter plot (log-scaled x axis, x limits 11..80, no ticks) is saved
    as ``{kernel_name}_{func}.eps`` in ``VIS_DIR`` and the figure is cleared.

    Args:
        kernel_name: Kernel name passed to ``mol_kern_factory``; also used in
            the output file name.
        func: Objective name passed to ``get_objective_by_name``; also used in
            the output file name.
        **kwargs: Forwarded unchanged to ``mol_kern_factory``.

    Raises:
        ValueError: If ``mode`` is not one of the three values listed above.
    """
    n_mols = 100

    mols = get_chembl(max_size=n_mols)
    func_ = get_objective_by_name(func)
    kernel = mol_kern_factory(kernel_name, **kwargs)
    kern_mat = kernel(mols, mols)
    prop_list = [func_(mol) for mol in mols]

    # NOTE(review): `mode` is not defined inside this function and is not a
    # parameter; it is presumably a module-level global -- confirm, otherwise
    # this raises NameError.
    # The mode is resolved once here rather than for every pair.
    if mode == "inverse_sim":
        def pair_distance(i, j):
            return 1 / kern_mat[i, j]
    elif mode == "scaled_kernel":
        # NOTE(review): this divides the inverse kernel by sqrt(K_ii * K_jj);
        # the inverse of a normalised kernel would multiply instead -- confirm
        # which is intended.  The original arithmetic is kept as is.
        def pair_distance(i, j):
            return (1 / kern_mat[i, j]
                    / np.sqrt(kern_mat[i, i] * kern_mat[j, j]))
    elif mode == "fps_distance":
        # One fingerprint per molecule instead of two per pair.
        fingerprints = [mol.to_fingerprint(ftype='numeric') for mol in mols]

        def pair_distance(i, j):
            return np.sum((fingerprints[i] - fingerprints[j]) ** 2)
    else:
        raise ValueError(
            f"Unknown mode {mode!r}; expected 'inverse_sim', "
            "'scaled_kernel' or 'fps_distance'")

    xs, ys = [], []
    for i in range(n_mols):
        for j in range(n_mols):
            xs.append(pair_distance(i, j))
            ys.append(np.abs(prop_list[i] - prop_list[j]))

    fig = plt.figure()
    fig.add_subplot(1, 1, 1)
    plt.scatter(xs, ys, s=2, alpha=0.6)
    plt.xscale('log')
    plt.xlim([11, 80])
    plt.xticks([])
    plt.yticks([])
    plt.savefig(os.path.join(VIS_DIR, f"{kernel_name}_{func}.eps"),
                format='eps',
                dpi=1000)
    plt.clf()
def gen_gp_test_data():
    """Build three train/test splits of molecules and numeric values.

    ``get_chembl(300)`` is cut into three consecutive folds of 100 molecules;
    each fold is split 80/20 into train and test parts.

    Returns:
        A list of three tuples ``(X_train, Y_train, X_test, Y_test)`` where
        the X parts are slices of the molecule collection and the Y parts are
        slices of a NumPy array of ``func`` values.
    """
    n_all = 100
    n_folds = 3

    molecules = get_chembl(n_all * n_folds)
    # NOTE(review): `func` is not defined inside this function; presumably a
    # module-level name -- confirm.
    targets = np.array([func(molecule) for molecule in molecules])

    folds = []
    for fold in range(n_folds):
        start, stop = fold * n_all, (fold + 1) * n_all
        folds.append((molecules[start:stop], targets[start:stop]))

    n_train = int(n_all * 0.8)
    # NOTE(review): these SAScore values are computed but never used (the
    # folds above were already cut from the `func` values).  Kept so the
    # function's calls are unchanged; confirm which labelling is intended.
    _unused_sa_scores = np.array([SAScore(molecule) for molecule in molecules])

    return [(fold_x[:n_train], fold_y[:n_train],
             fold_x[n_train:], fold_y[n_train:])
            for fold_x, fold_y in folds]
Ejemplo n.º 8
0
def make_tsne(func, as_subplots=False):
    """
    Plot TSNE embeddings colored with property
    for several distance computers.

    Four ``OTChemDistanceComputer`` configurations (equal / molecular-mass
    assignment, with and without total-mass normalisation) are each used to
    build a distance matrix over up to 200 molecules, which is embedded with
    t-SNE on a precomputed metric.

    Args:
        func: ``'prop'`` to colour by the values from ``get_chembl_prop``,
            otherwise the name of an objective for ``get_objective_by_name``.
        as_subplots: If true, draw all four embeddings into one 2x2 figure
            saved as ``tsne_vis_{func}.eps``; otherwise save one EPS file per
            distance computer.  All files go to ``VIS_DIR``.
    """
    n_mols = 200

    dist_computers = [
        OTChemDistanceComputer(mass_assignment_method='equal',
                               normalisation_method='none',
                               struct_pen_method='bond_frac'),
        OTChemDistanceComputer(mass_assignment_method='equal',
                               normalisation_method='total_mass',
                               struct_pen_method='bond_frac'),
        OTChemDistanceComputer(mass_assignment_method='molecular_mass',
                               normalisation_method='none',
                               struct_pen_method='bond_frac'),
        OTChemDistanceComputer(mass_assignment_method='molecular_mass',
                               normalisation_method='total_mass',
                               struct_pen_method='bond_frac')
    ]
    titles = [
        'Equal mass assign, no norm', 'Equal mass assign, total mass norm',
        'Mol mass assign, no norm', 'Mol mass assign, total mass norm'
    ]

    # The data is fetched exactly once, in whichever branch applies; an
    # earlier unconditional get_chembl_prop call was always overwritten.
    if func == 'prop':
        smile_strings, smiles_to_prop = get_chembl_prop(n_mols=n_mols)
        prop_list = [smiles_to_prop[sm] for sm in smile_strings]
    else:
        mols = get_chembl(max_size=n_mols)
        smile_strings = [mol.to_smiles() for mol in mols]
        func_ = get_objective_by_name(func)
        prop_list = [func_(mol) for mol in mols]

    f, ll_ax = plt.subplots(2, 2, figsize=(15, 15))
    axes = itertools.chain.from_iterable(ll_ax)
    for ax, dist_computer, title in zip(axes, dist_computers, titles):
        # The computer returns a sequence of matrices; only the first is used.
        distances_mat = dist_computer(smile_strings, smile_strings)[0]

        # plot them
        tsne = TSNE(metric='precomputed')
        points_to_plot = tsne.fit_transform(distances_mat)
        if as_subplots:
            ax.set_title(title)
            ax.scatter(points_to_plot[:, 0],
                       points_to_plot[:, 1],
                       c=prop_list,
                       cmap=plt.cm.Spectral,
                       s=9,
                       alpha=0.8)
            ax.set_xticks([])
            ax.set_yticks([])
        else:
            # save separately: clear whatever figure is current, then draw
            # on a fresh one
            plt.clf()
            fig = plt.figure()
            ax = fig.add_subplot(1, 1, 1)
            plt.title(title)
            plt.scatter(points_to_plot[:, 0],
                        points_to_plot[:, 1],
                        c=prop_list,
                        cmap=plt.cm.Spectral,
                        s=9,
                        alpha=0.8)
            plt.xticks([])
            plt.yticks([])
            # NOTE(review): the file name embeds str(dist_computer); unless
            # the class defines a readable string form this is the default
            # object repr -- confirm the resulting names are as intended.
            plt.savefig(os.path.join(VIS_DIR,
                                     f'tsne_vis_{func}_{dist_computer}.eps'),
                        format='eps',
                        dpi=1000)
            plt.clf()

    if as_subplots:
        plt.savefig(os.path.join(VIS_DIR, f'tsne_vis_{func}.eps'),
                    format='eps',
                    dpi=1000)
        plt.clf()
Ejemplo n.º 9
0
def make_pairwise(func, as_subplots=False):
    """Plot |property difference| against OT distance for four computers.

    For each of four ``OTChemDistanceComputer`` configurations, every ordered
    pair ``(i, j)`` of 100 molecules contributes one point: x is the OT
    distance, y is the absolute difference of the property values.

    Args:
        func: ``'prop'`` to use the values from ``get_chembl_prop``, otherwise
            the name of an objective for ``get_objective_by_name``.
        as_subplots: If true, draw the four panels into one 2x2 figure saved
            as ``dist_vs_value_{func}.eps``; otherwise save each panel as its
            own PDF, ``dist_vs_value_{func}_{1..4}.pdf``, cropped to a
            hand-tuned bounding box.  All files go to ``VIS_DIR``.
    """
    n_mols = 100

    if func == 'prop':
        smile_strings, smiles_to_prop = get_chembl_prop(n_mols=n_mols)
        prop_list = [smiles_to_prop[smiles] for smiles in smile_strings]
    else:
        molecules = get_chembl(max_size=n_mols)
        smile_strings = [molecule.to_smiles() for molecule in molecules]
        objective = get_objective_by_name(func)
        prop_list = [objective(molecule) for molecule in molecules]

    dist_computers = [
        OTChemDistanceComputer(mass_assignment_method='equal',
                               normalisation_method='none',
                               struct_pen_method='bond_frac'),
        OTChemDistanceComputer(mass_assignment_method='equal',
                               normalisation_method='total_mass',
                               struct_pen_method='bond_frac'),
        OTChemDistanceComputer(mass_assignment_method='molecular_mass',
                               normalisation_method='none',
                               struct_pen_method='bond_frac'),
        OTChemDistanceComputer(mass_assignment_method='molecular_mass',
                               normalisation_method='total_mass',
                               struct_pen_method='bond_frac')
    ]
    titles = [
        'Unit weight, Unnormalized', 'Unit weight, Normalized',
        'Molecular mass weight, Unnormalized',
        'Molecular mass weight, Normalized'
    ]

    _, axes_grid = plt.subplots(2, 2, figsize=(15, 15))
    flat_axes = itertools.chain.from_iterable(axes_grid)
    panels = zip(flat_axes, dist_computers, titles)
    for panel, (grid_ax, dist_computer, title) in enumerate(panels):
        # The computer returns a sequence of matrices; only the first is used.
        dist_matrix = dist_computer(smile_strings, smile_strings)[0]

        x_vals, y_vals = [], []
        # All ordered pairs, row-major, including each molecule with itself.
        for row, col in itertools.product(range(n_mols), repeat=2):
            x_vals.append(dist_matrix[row, col])
            y_vals.append(np.abs(prop_list[row] - prop_list[col]))

        if as_subplots:
            grid_ax.set_title(title)
            grid_ax.scatter(x_vals, y_vals, s=2, alpha=0.6)
            grid_ax.set_xticks([])
            grid_ax.set_yticks([])
            continue

        # Separate output: clear the current figure, draw on a fresh one.
        plt.clf()
        fig = plt.figure()
        single_ax = fig.add_subplot(1, 1, 1)
        plt.title(title, fontsize=22)
        plt.scatter(x_vals, y_vals, s=2, alpha=0.6)
        plt.xscale('log')
        plt.xticks([])
        plt.yticks([])
        plt.xlim([None, 1.03 * max(x_vals)])
        plt.xlabel("OT-distance, log scale", fontsize=20)
        if panel == 0:
            # Only the first panel carries the y-axis label.
            plt.ylabel("Difference in SA score", fontsize=20)

        # Axes box in inches, grown by fixed margins so title and labels
        # fall inside the saved area.
        extent = single_ax.get_window_extent().transformed(
            fig.dpi_scale_trans.inverted())
        extent.x0 -= 0.5
        extent.x1 += 0.1
        extent.y0 -= 0.6
        extent.y1 += 0.7

        out_path = os.path.join(VIS_DIR,
                                f"dist_vs_value_{func}_{panel + 1}.pdf")
        plt.savefig(out_path, bbox_inches=extent, pad_inches=0)
        plt.clf()

    if as_subplots:
        plt.savefig(os.path.join(VIS_DIR, f"dist_vs_value_{func}.eps"),
                    format='eps',
                    dpi=1000)
        plt.clf()