コード例 #1
0
 def setUp(self):
     """ Prepare the data file names and the distance computers under test. """
     self.data_file_names = ['data_alkanes.txt', 'data_prop.txt']

     # One computer with default parameters, one with equal mass assignment
     # and several penalty values, and one without normalisation.
     default_computer = OTChemDistanceComputer()
     equal_mass_computer = OTChemDistanceComputer(
         mass_assignment_method='equal',
         nonexist_non_assignment_penalty_vals=[1, 5, 10])
     unnormalised_computer = OTChemDistanceComputer(normalisation_method='none')

     self.dist_computers = [
         default_computer,
         equal_mass_computer,
         unnormalised_computer,
     ]
コード例 #2
0
def make_pairwise(func, n_mols, to_randomize=True):
    """Plot pairwise distance against pairwise difference in property value.

    One subplot is drawn for every distance matrix returned by a default
    ``OTChemDistanceComputer``; the resulting figure is saved under
    ``VIS_DIR`` with a timestamped file name.

    Args:
        func: ``'prop'`` to use the values returned by ``get_chembl_prop``;
            any other value is passed to ``get_objective_by_name`` and the
            returned callable is evaluated on each molecule.
        n_mols: number of molecules to compare.
        to_randomize: when true (and ``func`` is not ``'prop'``),
            ``5 * n_mols`` molecules are requested before shuffling and
            keeping the first ``n_mols``.
    """
    if func == 'prop':
        smile_strings, smiles_to_prop = get_chembl_prop(n_mols=n_mols)
        prop_list = [smiles_to_prop[sm] for sm in smile_strings]
    else:
        n_mols_to_get = 5 * n_mols if to_randomize else n_mols
        mols = get_chembl(n_mols=n_mols_to_get)
        np.random.shuffle(mols)
        mols = mols[:n_mols]
        smile_strings = [mol.to_smiles() for mol in mols]
        func_ = get_objective_by_name(func)
        prop_list = [func_(mol) for mol in mols]

    # The loaders may hand back fewer molecules than requested; index only
    # what is actually available so the pair loop cannot run off the end.
    n_avail = min(n_mols, len(prop_list))

    # Upper-triangular pairs (i <= j), in the same order as a nested loop.
    pairs = list(itertools.combinations_with_replacement(range(n_avail), 2))
    # The property differences do not depend on the distance matrix, so
    # they are computed once instead of once per subplot.
    ys = [np.abs(prop_list[i] - prop_list[j]) for i, j in pairs]

    dist_computer = OTChemDistanceComputer()  # <-- default computer
    dists = dist_computer(smile_strings, smile_strings)

    # At least two rows so that plt.subplots returns a 2-D array of axes.
    num_rows = max(2, int(np.ceil(dist_computer.get_num_distances() / 4.0)))
    print(num_rows)
    fig, ll_ax = plt.subplots(num_rows, 4, figsize=(15, 15))
    axes = itertools.chain.from_iterable(ll_ax)
    for ind, (ax, distmat) in enumerate(zip(axes, dists)):
        xs = [distmat[i, j] for i, j in pairs]

        ax.set_title(f'Distance {ind}')  # TODO: parameters of distance
        if n_mols > 12:
            ax.scatter(xs, ys, s=1, alpha=0.6)
        else:
            # Few enough points to label each one with its index pair.
            for xval, yval, pval in zip(xs, ys, pairs):
                print(xval, yval, pval)
                if pval[0] == pval[1]:
                    ax.text(xval, yval, '*', fontsize=14)
                else:
                    ax.text(xval, yval, '(%d, %d)' % (pval[0], pval[1]))
            # Only the x limits are set explicitly here.
            # NOTE(review): y limits are left at the axes default - confirm
            # this is intended for properties whose differences exceed it.
            if xs:
                ax.set_xlim((0.0, max(xs) * 1.25))

    fig.savefig(os.path.join(VIS_DIR, "dist_vs_value_%d_%s_%s" % (
        n_mols, func, datetime.now().strftime('%m%d%H%M%S'))))
    # Release the figure; pyplot otherwise keeps it alive across calls.
    plt.close(fig)
    print(smile_strings, len(smile_strings))
コード例 #3
0
ファイル: evaluate_time.py プロジェクト: limberc/chembo
def test(N=100):
    """Time the default ``OTChemDistanceComputer`` on pairs of molecules.

    Args:
        N: upper bound on the number of molecules whose pairs are timed;
            also passed to ``get_chembl`` as ``max_size``.

    Returns:
        The ``defaultdict`` used for accumulation, mapping the summed atom
        count of a pair (``natoms[i] + natoms[j]``) to the mean elapsed
        time, as measured by ``time()``, of the calls made for such pairs.
    """
    dist_computer = OTChemDistanceComputer()
    mols = get_chembl(max_size=N, as_mols=True)
    natoms = [mol.to_rdkit().GetNumAtoms() for mol in mols]

    # get_chembl is not guaranteed to return N molecules; cap the loop so a
    # shorter list cannot raise IndexError.
    n_timed = min(N, len(mols))

    times = defaultdict(list)
    for i in range(n_timed):
        for j in range(i):
            # NOTE(review): the SMILES conversion happens inside the timed
            # region, so it is included in the measurement - confirm intended.
            t0 = time()
            dist_computer([mols[i].to_smiles()], [mols[j].to_smiles()])
            time_elapsed = time() - t0
            times[natoms[i] + natoms[j]].append(time_elapsed)

    # Replace each list of samples by its mean (keys are unchanged, so
    # assigning while iterating is safe).
    for k, res_lst in times.items():
        times[k] = np.mean(res_lst)
    return times
コード例 #4
0
def make_tsne():
    """
    Plot TSNE embeddings colored with property
    for several distance computers.

    One image per distance computer is saved under ``VIS_DIR``, using
    ``str(dist_computer)`` as the file name.
    """
    dist_computers = [
        OTChemDistanceComputer(),  # default parameters
        OTChemDistanceComputer(mass_assignment_method='equal',
                               nonexist_non_assignment_penalty_vals=[1, 5, 10]),
        OTChemDistanceComputer(normalisation_method='atomic_mass'),
    ]

    smile_strings, smiles_to_prop = get_chembl_prop(n_mols=200)
    prop_list = [smiles_to_prop[sm] for sm in smile_strings]

    for dist_computer in dist_computers:
        # Average along axis 0 of whatever the computer returns to obtain a
        # single matrix (presumably one matrix per distance - verify).
        distances_mat = np.mean(dist_computer(smile_strings, smile_strings), axis=0)

        tsne = TSNE(metric='precomputed')
        points_to_plot = tsne.fit_transform(distances_mat)
        plt.scatter(points_to_plot[:, 0], points_to_plot[:, 1],
                    c=prop_list, cmap=plt.cm.Spectral)

        name = str(dist_computer)
        print(name)
        plt.savefig(os.path.join(VIS_DIR, name))
        # pyplot keeps drawing on the current figure; clear it so the next
        # computer's image does not also contain this computer's points.
        plt.clf()
コード例 #5
0
def plot_tsne(func):
    """Save a TSNE embedding of molecules, coloured by an objective.

    Pairwise distances come from an ``OTChemDistanceComputer`` (first matrix
    of its output); colours are the values of the objective obtained from
    ``get_objective_by_name(func)``. The image is written to ``VIS_DIR`` as
    an EPS file named after the plot title.

    Args:
        func: objective name passed to ``get_objective_by_name``; also used
            in the plot title and file name.
    """
    n_mols = 250
    mols = get_chembl(max_size=n_mols, as_mols=True)
    smile_strings = [m.smiles for m in mols]

    title = f"{func} ot-dist"
    distance_computer = OTChemDistanceComputer(
        mass_assignment_method='molecular_mass',
        normalisation_method='total_mass',
        struct_pen_method='bond_frac')
    distances_mat = distance_computer(smile_strings, smile_strings)[0]

    # Alternative distances that were tried (kept for reference):
    #
    # title = f"{func} similarity kernel"
    # kernel = mol_kern_factory('similarity_kernel')
    # kern_mat = kernel(mols, mols)
    # distances_mat = 1/kern_mat
    #
    # title = f"{func} fingerprint dist"
    # distances_mat = np.zeros((len(smile_strings), len(smile_strings)))
    # for i in tqdm(range(len(smile_strings))):
    #     for j in range(len(smile_strings)):
    #         distances_mat[i, j] = np.sum((mols[i].to_fingerprint(ftype='numeric') -
    #             mols[j].to_fingerprint(ftype='numeric')) ** 2 )

    tsne = TSNE(metric='precomputed')
    points_to_plot = tsne.fit_transform(distances_mat)

    # NOTE(review): the colours are computed from a second get_chembl call;
    # this assumes it returns the same molecules in the same order as the
    # call above - confirm against the loader.
    mols = get_chembl(max_size=n_mols)
    func_ = get_objective_by_name(func)
    prop_list = [func_(mol) for mol in mols]

    plt.title(title, fontsize=22)
    plt.scatter(points_to_plot[:, 0],
                points_to_plot[:, 1],
                c=prop_list,
                cmap=plt.cm.Spectral,
                s=15,
                alpha=0.8)
    plt.xticks([])
    plt.yticks([])
    plt.savefig(os.path.join(VIS_DIR,
                             title.replace(" ", "_") + '.eps'),
                format='eps',
                dpi=1000)
    # Clear the current figure so that a later call (e.g. for another
    # objective) does not draw on top of this scatter plot.
    plt.clf()
コード例 #6
0
 def get_dist_computers(self, chemist_args):
     """ Build one distance computer (or None) per domain of the function caller.

     For every "molecule" domain the kernel type is resolved in this order:
     the function caller's kernel ordering, then
     `chemist_args["dom_mol_kernel_type"]`, then the default kernel type.
     A computer is created only if the resolved type is one of
     MOL_DISTANCE_KERNEL_TYPES; otherwise None is stored for that domain.

     Raises NotImplementedError for any domain that is not "molecule".
     """
     caller = self.func_caller
     computers = []
     domains_and_kernels = zip(caller.domain.list_of_domains,
                               caller.domain_orderings.kernel_ordering)
     for dom, k_type in domains_and_kernels:
         dom_type = dom.get_type()
         if dom_type != "molecule":
             raise NotImplementedError(
                 "Distance computers not implemented for other domains.")

         # Fall back to `chemist_args` when the caller gives no kernel type.
         if k_type is None or k_type == '':
             k_type = chemist_args["dom_mol_kernel_type"]
         if k_type == "default":
             k_type = get_default_kernel_type(dom_type)

         computers.append(
             OTChemDistanceComputer()
             if k_type in MOL_DISTANCE_KERNEL_TYPES else None)

     logging.info(f"domain_dist_computers: {computers}")
     return computers
コード例 #7
0
def make_tsne(func, as_subplots=False):
    """
    Plot TSNE embeddings colored with property
    for several distance computers.

    Args:
        func: ``'prop'`` to colour by the values returned by
            ``get_chembl_prop``; any other value is passed to
            ``get_objective_by_name`` and evaluated on each molecule.
        as_subplots: when true, all embeddings are drawn on one 2x2 figure
            saved as ``tsne_vis_{func}.eps``; otherwise one EPS file per
            distance computer is saved. All files go to ``VIS_DIR``.
    """
    n_mols = 200

    dist_computers = [
        OTChemDistanceComputer(mass_assignment_method='equal',
                               normalisation_method='none',
                               struct_pen_method='bond_frac'),
        OTChemDistanceComputer(mass_assignment_method='equal',
                               normalisation_method='total_mass',
                               struct_pen_method='bond_frac'),
        OTChemDistanceComputer(mass_assignment_method='molecular_mass',
                               normalisation_method='none',
                               struct_pen_method='bond_frac'),
        OTChemDistanceComputer(mass_assignment_method='molecular_mass',
                               normalisation_method='total_mass',
                               struct_pen_method='bond_frac')
    ]
    titles = [
        'Equal mass assign, no norm', 'Equal mass assign, total mass norm',
        'Mol mass assign, no norm', 'Mol mass assign, total mass norm'
    ]

    # Load the data exactly once, from the source selected by `func`.
    if func == 'prop':
        smile_strings, smiles_to_prop = get_chembl_prop(n_mols=n_mols)
        prop_list = [smiles_to_prop[sm] for sm in smile_strings]
    else:
        mols = get_chembl(max_size=n_mols)
        smile_strings = [mol.to_smiles() for mol in mols]
        func_ = get_objective_by_name(func)
        prop_list = [func_(mol) for mol in mols]

    # The shared 2x2 grid is only needed in subplot mode.
    if as_subplots:
        grid_fig, ll_ax = plt.subplots(2, 2, figsize=(15, 15))
        axes = itertools.chain.from_iterable(ll_ax)
    else:
        grid_fig = None
        axes = itertools.repeat(None)

    for ax, dist_computer, title in zip(axes, dist_computers, titles):
        distances_mat = dist_computer(smile_strings, smile_strings)[0]

        tsne = TSNE(metric='precomputed')
        points_to_plot = tsne.fit_transform(distances_mat)

        if not as_subplots:
            # Each computer gets its own single-axes figure.
            fig = plt.figure()
            ax = fig.add_subplot(1, 1, 1)

        ax.set_title(title)
        ax.scatter(points_to_plot[:, 0],
                   points_to_plot[:, 1],
                   c=prop_list,
                   cmap=plt.cm.Spectral,
                   s=9,
                   alpha=0.8)
        ax.set_xticks([])
        ax.set_yticks([])

        if not as_subplots:
            fig.savefig(os.path.join(VIS_DIR,
                                     f'tsne_vis_{func}_{dist_computer}.eps'),
                        format='eps',
                        dpi=1000)
            # Close the figure so figures do not pile up across iterations.
            plt.close(fig)

    if as_subplots:
        grid_fig.savefig(os.path.join(VIS_DIR, f'tsne_vis_{func}.eps'),
                         format='eps',
                         dpi=1000)
        plt.close(grid_fig)
コード例 #8
0
def make_pairwise(func, as_subplots=False):
    """Plot OT distance against difference in property for molecule pairs.

    Four ``OTChemDistanceComputer`` configurations are compared; for each,
    the first matrix of its output supplies the x values and the absolute
    difference in property value supplies the y values.

    Args:
        func: ``'prop'`` to use the values returned by ``get_chembl_prop``;
            any other value is passed to ``get_objective_by_name`` and
            evaluated on each molecule.
        as_subplots: when true, all four plots share one 2x2 figure saved as
            ``dist_vs_value_{func}.eps``; otherwise each plot is saved as
            ``dist_vs_value_{func}_{k}.pdf`` (k = 1..4). Files go to
            ``VIS_DIR``.
    """
    n_mols = 100

    if func == 'prop':
        smile_strings, smiles_to_prop = get_chembl_prop(n_mols=n_mols)
        prop_list = [smiles_to_prop[sm] for sm in smile_strings]
    else:
        mols = get_chembl(max_size=n_mols)
        smile_strings = [mol.to_smiles() for mol in mols]
        func_ = get_objective_by_name(func)
        prop_list = [func_(mol) for mol in mols]

    dist_computers = [
        OTChemDistanceComputer(mass_assignment_method='equal',
                               normalisation_method='none',
                               struct_pen_method='bond_frac'),
        OTChemDistanceComputer(mass_assignment_method='equal',
                               normalisation_method='total_mass',
                               struct_pen_method='bond_frac'),
        OTChemDistanceComputer(mass_assignment_method='molecular_mass',
                               normalisation_method='none',
                               struct_pen_method='bond_frac'),
        OTChemDistanceComputer(mass_assignment_method='molecular_mass',
                               normalisation_method='total_mass',
                               struct_pen_method='bond_frac')
    ]
    titles = [
        'Unit weight, Unnormalized', 'Unit weight, Normalized',
        'Molecular mass weight, Unnormalized',
        'Molecular mass weight, Normalized'
    ]

    # The loaders may return fewer than n_mols molecules; only index what
    # is actually there so the pair loop cannot raise IndexError.
    n_avail = min(n_mols, len(prop_list))
    # All ordered pairs (i, j), in nested-loop order.
    index_pairs = list(itertools.product(range(n_avail), repeat=2))
    # Property differences are the same for every distance computer.
    ys = [np.abs(prop_list[i] - prop_list[j]) for i, j in index_pairs]

    # The shared 2x2 grid is only needed in subplot mode.
    if as_subplots:
        grid_fig, ll_ax = plt.subplots(2, 2, figsize=(15, 15))
        axes = itertools.chain.from_iterable(ll_ax)
    else:
        grid_fig = None
        axes = itertools.repeat(None)

    for ind, (ax, dist_computer,
              title) in enumerate(zip(axes, dist_computers, titles)):
        distmat = dist_computer(smile_strings, smile_strings)[0]
        xs = [distmat[i, j] for i, j in index_pairs]

        if as_subplots:
            ax.set_title(title)
            ax.scatter(xs, ys, s=2, alpha=0.6)
            ax.set_xticks([])
            ax.set_yticks([])
            continue

        # Save each plot separately on its own single-axes figure.
        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)
        ax.set_title(title, fontsize=22)
        ax.scatter(xs, ys, s=2, alpha=0.6)
        ax.set_xscale('log')
        ax.set_xticks([])
        ax.set_yticks([])
        if xs:
            ax.set_xlim([None, 1.03 * max(xs)])
        ax.set_xlabel("OT-distance, log scale", fontsize=20)
        if ind == 0:
            # Only the first plot carries a y label.
            # NOTE(review): the label text is fixed regardless of `func`.
            ax.set_ylabel("Difference in SA score", fontsize=20)

        # Crop box in inches: the axes area, padded to leave room for the
        # title and axis labels (same padding for every plot).
        extent = ax.get_window_extent().transformed(
            fig.dpi_scale_trans.inverted())
        extent.x0 -= 0.5
        extent.x1 += 0.1
        extent.y0 -= 0.6
        extent.y1 += 0.7
        fig.savefig(
            os.path.join(VIS_DIR, f"dist_vs_value_{func}_{ind+1}.pdf"),
            bbox_inches=extent,
            pad_inches=0)
        # Close the figure so figures do not pile up across iterations.
        plt.close(fig)

    if as_subplots:
        grid_fig.savefig(os.path.join(VIS_DIR, f"dist_vs_value_{func}.eps"),
                         format='eps',
                         dpi=1000)
        plt.close(grid_fig)