def setUp(self):
    """ Prepare the data file names and the distance computers used by the tests. """
    self.data_file_names = ['data_alkanes.txt', 'data_prop.txt']
    # One computer with default parameters, one with equal mass assignment and
    # several penalty values, and one without normalisation.
    default_computer = OTChemDistanceComputer()
    equal_mass_computer = OTChemDistanceComputer(
        mass_assignment_method='equal',
        nonexist_non_assignment_penalty_vals=[1, 5, 10])
    unnormalised_computer = OTChemDistanceComputer(normalisation_method='none')
    self.dist_computers = [
        default_computer,
        equal_mass_computer,
        unnormalised_computer,
    ]
def make_pairwise(func, n_mols, to_randomize=True):
    """ Plot OT distance against property difference for pairs of molecules.

    One subplot is drawn per distance matrix returned by a default
    `OTChemDistanceComputer`; the figure is saved under `VIS_DIR` with a
    timestamped name and the SMILES strings are printed at the end.

    Args:
        func: 'prop' to take values from `get_chembl_prop`; any other value is
            passed to `get_objective_by_name` and evaluated on molecules from
            `get_chembl`.
        n_mols: number of molecules to compare.
        to_randomize: if true, `5 * n_mols` molecules are fetched before
            shuffling and keeping the first `n_mols`; otherwise exactly
            `n_mols` are fetched (and still shuffled).
    """
    if func != 'prop':
        pool_size = n_mols if not to_randomize else 5 * n_mols
        mols = get_chembl(n_mols=pool_size)
        np.random.shuffle(mols)
        mols = mols[:n_mols]
        smile_strings = [mol.to_smiles() for mol in mols]
        objective = get_objective_by_name(func)
        prop_list = [objective(mol) for mol in mols]
    else:
        smile_strings, smiles_to_prop = get_chembl_prop(n_mols=n_mols)
        prop_list = [smiles_to_prop[sm] for sm in smile_strings]

    dist_computer = OTChemDistanceComputer()  # default computer
    dists = dist_computer(smile_strings, smile_strings)
    # Four subplots per row, and never fewer than two rows.
    num_rows = max(2, int(np.ceil(dist_computer.get_num_distances() / 4.0)))
    print(num_rows)
    _, axes_grid = plt.subplots(num_rows, 4, figsize=(15, 15))
    flat_axes = itertools.chain.from_iterable(axes_grid)

    for ind, (ax, distmat) in enumerate(zip(flat_axes, dists)):
        # Upper triangle including the diagonal: every unordered pair once.
        pairs = [(row, col)
                 for row in range(n_mols)
                 for col in range(row, n_mols)]
        xs = [distmat[row, col] for row, col in pairs]
        ys = [np.abs(prop_list[row] - prop_list[col]) for row, col in pairs]

        ax.set_title(f'Distance {ind}')  # TODO: parameters of distance
        if n_mols > 12:
            ax.scatter(xs, ys, s=1, alpha=0.6)
        else:
            # Few molecules: label every point with its index pair instead.
            for xval, yval, pair in zip(xs, ys, pairs):
                print(xval, yval, pair)
                if pair[0] == pair[1]:
                    ax.text(xval, yval, '*', fontsize=14)
                else:
                    ax.text(xval, yval, '(%d, %d)' % pair)
        # NOTE(review): kept at loop-body level (applies to both branches);
        # confirm against the original indentation.
        ax.set_xlim((0.0, max(xs) * 1.25))

    timestamp = datetime.now().strftime('%m%d%H%M%S')
    file_name = "dist_vs_value_%d_%s_%s" % (n_mols, func, timestamp)
    plt.savefig(os.path.join(VIS_DIR, file_name))
    print(smile_strings, len(smile_strings))
def test(N=100):
    """ Time the default OT distance computer on pairs of molecules.

    Each pair of distinct molecules among the first `N` loaded (or fewer, if
    fewer were loaded) is timed once, and the timings are grouped by the total
    number of atoms in the pair.

    Args:
        N: number of molecules requested from `get_chembl` and the upper bound
            on how many are paired.

    Returns:
        The `defaultdict` keyed by combined atom count, whose values are the
        mean elapsed time (as measured by `time()`) for that count.
    """
    dist_computer = OTChemDistanceComputer()
    mols = get_chembl(max_size=N, as_mols=True)
    natoms = [mol.to_rdkit().GetNumAtoms() for mol in mols]
    # The loader may return fewer molecules than requested; never index past
    # what was actually loaded.
    n_used = min(N, len(mols))
    times = defaultdict(list)
    for i in range(n_used):
        for j in range(i):
            # The SMILES conversion is deliberately kept inside the timed region.
            t0 = time()
            dist_computer([mols[i].to_smiles()], [mols[j].to_smiles()])
            time_elapsed = time() - t0
            times[natoms[i] + natoms[j]].append(time_elapsed)
    # Replace each list of timings by its mean; only existing keys are
    # reassigned, so the mapping's size does not change during iteration.
    for total_atoms in times:
        times[total_atoms] = np.mean(times[total_atoms])
    return times
def make_tsne():
    """ Plot TSNE embeddings colored with property for several distance computers.

    For each distance computer, the distance matrices it returns are averaged
    into one matrix, embedded with TSNE, scattered with the property values as
    colours, and saved under `VIS_DIR` using `str(dist_computer)` as file name.
    """
    dist_computers = [
        OTChemDistanceComputer(),  # default parameters
        OTChemDistanceComputer(mass_assignment_method='equal',
                               nonexist_non_assignment_penalty_vals=[1, 5, 10]),
        OTChemDistanceComputer(normalisation_method='atomic_mass'),
    ]
    smile_strings, smiles_to_prop = get_chembl_prop(n_mols=200)
    prop_list = [smiles_to_prop[sm] for sm in smile_strings]
    for dist_computer in dist_computers:
        distances_mat = np.mean(dist_computer(smile_strings, smile_strings), axis=0)
        # A precomputed metric cannot be combined with PCA initialisation, which
        # is the default in recent scikit-learn; request random init explicitly
        # (the historical default).
        tsne = TSNE(metric='precomputed', init='random')
        points_to_plot = tsne.fit_transform(distances_mat)
        plt.scatter(points_to_plot[:, 0], points_to_plot[:, 1],
                    c=prop_list, cmap=plt.cm.Spectral)
        computer_name = str(dist_computer)
        print(computer_name)
        plt.savefig(os.path.join(VIS_DIR, computer_name))
        # Clear the canvas so the next computer's image does not also contain
        # the points that were just drawn.
        plt.clf()
def plot_tsne(func):
    """ Save a TSNE embedding of OT distances, coloured by an objective.

    Molecules are loaded with `get_chembl`, their pairwise distances are taken
    from the first matrix returned by an `OTChemDistanceComputer`, and the
    embedding is saved as an EPS file under `VIS_DIR`.

    Args:
        func: name passed to `get_objective_by_name`; the resulting callable is
            evaluated on every molecule to colour the points. It is also used
            in the plot title and the output file name.
    """
    n_mols = 250
    mols = get_chembl(max_size=n_mols, as_mols=True)
    smile_strings = [m.smiles for m in mols]

    title = f"{func} ot-dist"
    distance_computer = OTChemDistanceComputer(
        mass_assignment_method='molecular_mass',
        normalisation_method='total_mass',
        struct_pen_method='bond_frac')
    distances_mat = distance_computer(smile_strings, smile_strings)[0]
    # Alternatives tried previously (not active): a similarity-kernel based
    # distance (1 / kernel matrix) and a squared fingerprint distance.

    # A precomputed metric cannot be combined with PCA initialisation, which is
    # the default in recent scikit-learn; request random init explicitly.
    tsne = TSNE(metric='precomputed', init='random')
    points_to_plot = tsne.fit_transform(distances_mat)

    # Colour with the very molecules that were embedded, so that point k and
    # colour k always refer to the same molecule.
    func_ = get_objective_by_name(func)
    prop_list = [func_(mol) for mol in mols]

    plt.title(title, fontsize=22)
    plt.scatter(points_to_plot[:, 0], points_to_plot[:, 1],
                c=prop_list, cmap=plt.cm.Spectral, s=15, alpha=0.8)
    plt.xticks([])
    plt.yticks([])
    plt.savefig(os.path.join(VIS_DIR, title.replace(" ", "_") + '.eps'),
                format='eps', dpi=1000)
def get_dist_computers(self, chemist_args):
    """ Build one distance computer (or None) per domain of the function caller.

    Args:
        chemist_args: mapping read for "dom_mol_kernel_type" when the function
            caller does not specify a kernel for a molecule domain.

    Returns:
        A list aligned with the domains: an `OTChemDistanceComputer` when the
        resolved kernel type is in `MOL_DISTANCE_KERNEL_TYPES`, else None.

    Raises:
        NotImplementedError: if any domain is not of type "molecule".
    """
    caller = self.func_caller
    domain_dist_computers = []
    for dom, ordering_kernel in zip(caller.domain.list_of_domains,
                                    caller.domain_orderings.kernel_ordering):
        dom_type = dom.get_type()
        if dom_type != "molecule":
            raise NotImplementedError(
                "Distance computers not implemented for other domains.")
        # Resolution order: function caller kernel, then `chemist_args`,
        # and finally the default kernel.
        resolved = ordering_kernel
        if resolved is None or resolved == '':
            resolved = chemist_args["dom_mol_kernel_type"]
        if resolved == "default":
            resolved = get_default_kernel_type(dom_type)
        domain_dist_computers.append(
            OTChemDistanceComputer()
            if resolved in MOL_DISTANCE_KERNEL_TYPES
            else None)
    logging.info(f"domain_dist_computers: {domain_dist_computers}")
    return domain_dist_computers
def make_tsne(func, as_subplots=False):
    """ Plot TSNE embeddings colored with property for several distance computers.

    Args:
        func: 'prop' to colour by the values from `get_chembl_prop`; any other
            value is passed to `get_objective_by_name` and evaluated on
            molecules from `get_chembl`.
        as_subplots: if true, all embeddings are drawn on a single 2x2 grid
            saved as `tsne_vis_{func}.eps`; otherwise one EPS file is saved per
            distance computer. All files go under `VIS_DIR`.
    """
    n_mols = 200
    dist_computers = [
        OTChemDistanceComputer(mass_assignment_method='equal',
                               normalisation_method='none',
                               struct_pen_method='bond_frac'),
        OTChemDistanceComputer(mass_assignment_method='equal',
                               normalisation_method='total_mass',
                               struct_pen_method='bond_frac'),
        OTChemDistanceComputer(mass_assignment_method='molecular_mass',
                               normalisation_method='none',
                               struct_pen_method='bond_frac'),
        OTChemDistanceComputer(mass_assignment_method='molecular_mass',
                               normalisation_method='total_mass',
                               struct_pen_method='bond_frac'),
    ]
    titles = [
        'Equal mass assign, no norm',
        'Equal mass assign, total mass norm',
        'Mol mass assign, no norm',
        'Mol mass assign, total mass norm',
    ]

    # Load the data exactly once, from whichever source `func` selects.
    if func == 'prop':
        smile_strings, smiles_to_prop = get_chembl_prop(n_mols=n_mols)
        prop_list = [smiles_to_prop[sm] for sm in smile_strings]
    else:
        mols = get_chembl(max_size=n_mols)
        smile_strings = [mol.to_smiles() for mol in mols]
        func_ = get_objective_by_name(func)
        prop_list = [func_(mol) for mol in mols]

    def draw(ax, points, title):
        # Scatter one embedding on `ax`, without axis ticks.
        ax.set_title(title)
        ax.scatter(points[:, 0], points[:, 1], c=prop_list,
                   cmap=plt.cm.Spectral, s=9, alpha=0.8)
        ax.set_xticks([])
        ax.set_yticks([])

    # The shared grid is only needed when everything goes into one image.
    if as_subplots:
        grid_fig, ll_ax = plt.subplots(2, 2, figsize=(15, 15))
        axes = list(itertools.chain.from_iterable(ll_ax))
    else:
        grid_fig = None
        axes = [None] * len(dist_computers)

    for ax, dist_computer, title in zip(axes, dist_computers, titles):
        distances_mat = dist_computer(smile_strings, smile_strings)[0]
        # A precomputed metric cannot be combined with PCA initialisation, which
        # is the default in recent scikit-learn; request random init explicitly.
        tsne = TSNE(metric='precomputed', init='random')
        points_to_plot = tsne.fit_transform(distances_mat)
        if as_subplots:
            draw(ax, points_to_plot, title)
            continue
        # Save separately: one fresh figure per computer, closed after saving
        # so figures do not pile up in memory.
        fig = plt.figure()
        draw(fig.add_subplot(1, 1, 1), points_to_plot, title)
        fig.savefig(os.path.join(VIS_DIR, f'tsne_vis_{func}_{dist_computer}.eps'),
                    format='eps', dpi=1000)
        plt.close(fig)

    if as_subplots:
        grid_fig.savefig(os.path.join(VIS_DIR, f'tsne_vis_{func}.eps'),
                         format='eps', dpi=1000)
        plt.close(grid_fig)
def make_pairwise(func, as_subplots=False):
    """ Plot OT distance against property difference for all molecule pairs.

    Four distance computers are compared. For each, the first distance matrix
    it returns is scattered against the absolute difference in property value
    for every ordered pair of molecules.

    Args:
        func: 'prop' to take values from `get_chembl_prop`; any other value is
            passed to `get_objective_by_name` and evaluated on molecules from
            `get_chembl`.
        as_subplots: if true, all scatters are drawn on a single 2x2 grid saved
            as `dist_vs_value_{func}.eps`; otherwise one PDF per distance
            computer is saved as `dist_vs_value_{func}_{k}.pdf` with a
            log-scaled x axis. All files go under `VIS_DIR`.
    """
    n_mols = 100
    if func == 'prop':
        smile_strings, smiles_to_prop = get_chembl_prop(n_mols=n_mols)
        prop_list = [smiles_to_prop[sm] for sm in smile_strings]
    else:
        mols = get_chembl(max_size=n_mols)
        smile_strings = [mol.to_smiles() for mol in mols]
        func_ = get_objective_by_name(func)
        prop_list = [func_(mol) for mol in mols]

    dist_computers = [
        OTChemDistanceComputer(mass_assignment_method='equal',
                               normalisation_method='none',
                               struct_pen_method='bond_frac'),
        OTChemDistanceComputer(mass_assignment_method='equal',
                               normalisation_method='total_mass',
                               struct_pen_method='bond_frac'),
        OTChemDistanceComputer(mass_assignment_method='molecular_mass',
                               normalisation_method='none',
                               struct_pen_method='bond_frac'),
        OTChemDistanceComputer(mass_assignment_method='molecular_mass',
                               normalisation_method='total_mass',
                               struct_pen_method='bond_frac'),
    ]
    titles = [
        'Unit weight, Unnormalized',
        'Unit weight, Normalized',
        'Molecular mass weight, Unnormalized',
        'Molecular mass weight, Normalized',
    ]

    # The loader may return fewer molecules than requested; never index past
    # what was actually loaded.
    n_used = min(n_mols, len(prop_list))
    index_pairs = list(itertools.product(range(n_used), repeat=2))
    # Property differences do not depend on the distance computer, so they are
    # computed once and shared by every plot.
    ys = [np.abs(prop_list[i] - prop_list[j]) for i, j in index_pairs]

    # The shared grid is only needed when everything goes into one image.
    if as_subplots:
        grid_fig, ll_ax = plt.subplots(2, 2, figsize=(15, 15))
        axes = list(itertools.chain.from_iterable(ll_ax))
    else:
        grid_fig = None
        axes = [None] * len(dist_computers)

    for ind, (ax, dist_computer, title) in enumerate(
            zip(axes, dist_computers, titles)):
        distmat = dist_computer(smile_strings, smile_strings)[0]
        xs = [distmat[i, j] for i, j in index_pairs]

        if as_subplots:
            ax.set_title(title)
            ax.scatter(xs, ys, s=2, alpha=0.6)
            ax.set_xticks([])
            ax.set_yticks([])
            continue

        # Save separately: one fresh figure per computer.
        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)
        plt.title(title, fontsize=22)
        plt.scatter(xs, ys, s=2, alpha=0.6)
        plt.xscale('log')
        plt.xticks([])
        plt.yticks([])
        plt.xlim([None, 1.03 * max(xs)])
        plt.xlabel("OT-distance, log scale", fontsize=20)
        if ind == 0:
            # Only the first plot carries the y-axis label.
            plt.ylabel("Difference in SA score", fontsize=20)
        # Crop to the axes area (in inches), padded by fixed margins on each
        # side so that the title and axis labels stay inside the saved region.
        extent = ax.get_window_extent().transformed(
            fig.dpi_scale_trans.inverted())
        extent.x0 -= 0.5
        extent.x1 += 0.1
        extent.y0 -= 0.6
        extent.y1 += 0.7
        plt.savefig(
            os.path.join(VIS_DIR, f"dist_vs_value_{func}_{ind+1}.pdf"),
            bbox_inches=extent, pad_inches=0
        )
        # Close the figure so per-computer figures do not pile up in memory.
        plt.close(fig)

    if as_subplots:
        grid_fig.savefig(os.path.join(VIS_DIR, f"dist_vs_value_{func}.eps"),
                         format='eps', dpi=1000)
        plt.close(grid_fig)