def test_metrics(self):
    """Check that kernels can be built with several scikit-learn metrics.

    The test only verifies that construction and evaluation run without
    raising; no numerical values are asserted.
    """
    # SOAP features of a single water molecule are used as the input
    soap = SOAP(
        species=[1, 8],
        rcut=5.0,
        nmax=2,
        lmax=2,
        sigma=0.2,
        periodic=False,
        crossover=True,
        sparse=False,
    )
    water_features = soap.create(molecule("H2O"))

    # One entry per metric: linear dot-product, Gaussian and Laplacian
    metric_options = (
        {"metric": "linear"},
        {"metric": "rbf", "gamma": 1},
        {"metric": "laplacian", "gamma": 1},
    )
    for options in metric_options:
        kernel = REMatchKernel(alpha=0.1, threshold=1e-6, **options)
        kernel.create([water_features, water_features])
def test_difference(self):
    """Check the similarity values for identical, unrelated and related
    molecule pairs.
    """
    soap = SOAP(
        species=[1, 6, 7, 8],
        rcut=5.0,
        nmax=2,
        lmax=2,
        sigma=0.2,
        periodic=False,
        crossover=True,
        sparse=False,
    )

    def features_of(name):
        # SOAP features of the named ASE molecule
        return soap.create(molecule(name))

    tolerance = 1e-3

    # A molecule compared with itself must give similarity one everywhere.
    water = features_of("H2O")
    kernel = REMatchKernel(metric="linear", alpha=1, threshold=1e-6)
    K = kernel.create([water, water])
    self.assertTrue((np.abs(K - 1) < tolerance).all())

    # Molecules without common elements must only be similar to themselves.
    K = kernel.create([features_of("N2"), features_of("H2O")])
    self.assertTrue((np.abs(K - np.eye(2)) < tolerance).all())

    # Related molecules must give a high off-diagonal similarity.
    K = kernel.create([features_of("H2O"), features_of("H2O2")])
    self.assertTrue(K[0, 1] > 0.9)
def test_xy(self):
    """Check that the kernel can be evaluated between two different sets.

    This is what kernel-based methods need at prediction time. Two
    structures against one structure must give a kernel of shape (2, 1).
    """
    soap = SOAP(
        species=[1, 8],
        rcut=5.0,
        nmax=2,
        lmax=2,
        sigma=0.2,
        periodic=False,
        crossover=True,
        sparse=False,
    )

    # Build all molecules first, then their features, in the same order
    systems = [molecule(name) for name in ("H2O", "O2", "H2O2")]
    water_feat, oxygen_feat, peroxide_feat = [soap.create(s) for s in systems]

    # Linear dot-product kernel
    kernel = REMatchKernel(metric="linear", alpha=0.1, threshold=1e-6)
    K = kernel.create([water_feat, oxygen_feat], [peroxide_feat])
    self.assertEqual(K.shape, (2, 1))
def test_convergence_infinity(self):
    """Check that the REMatch kernel approaches the average kernel in the
    limit of infinite alpha.
    """
    soap = SOAP(
        species=[1, 8],
        rcut=5.0,
        nmax=2,
        lmax=2,
        sigma=0.2,
        periodic=False,
        crossover=True,
        sparse=False,
    )
    systems = [molecule(name) for name in ("H2O", "H2O2")]
    features = [soap.create(system) for system in systems]

    # REMatch kernel evaluated with a very large alpha
    K_re = REMatchKernel(metric="linear", alpha=1e20, threshold=1e-6).create(features)

    # Average kernel on the same features
    K_ave = AverageKernel(metric="linear").create(features)

    # The two matrices must agree within the default allclose tolerances
    self.assertTrue(np.allclose(K_re, K_ave))
def remax_listcomp(desc_list):
    """Return the REMatch similarity of every consecutive descriptor pair.

    For each index ``i`` the descriptor ``desc_list[i]`` is compared with
    ``desc_list[i + 1]``, the latter truncated to as many columns as the
    first row of ``desc_list[i]`` has. Both are passed through
    ``normalize`` before an RBF REMatch kernel is evaluated, and the
    off-diagonal element ``[0][1]`` of that kernel is collected.

    :param desc_list: sequence of 2D descriptor arrays
    :return: list with ``len(desc_list) - 1`` similarity values
    """
    kernel = REMatchKernel(metric='rbf', gamma=1, alpha=1, threshold=1e-6)

    def pair_similarity(current, following):
        # Trim the second descriptor to the feature width of the first
        width = len(current[0])
        normed = [normalize(current), normalize(following[:, 0:width])]
        return kernel.create(normed)[0][1]

    return [
        pair_similarity(desc_list[i], desc_list[i + 1])
        for i in range(len(desc_list) - 1)
    ]
def test_sparse(self):
    """Check that sparse SOAP output is accepted when building a kernel.

    Only successful execution is verified; no values are asserted.
    """
    soap = SOAP(
        species=[1, 8],
        rcut=5.0,
        nmax=2,
        lmax=2,
        sigma=0.2,
        periodic=False,
        crossover=True,
        sparse=True,
    )
    water_feat = soap.create(molecule('H2O'))

    kernel = REMatchKernel(metric="linear", alpha=0.1, threshold=1e-6)
    kernel.create([water_feat])
def update_soap_similarities(struct, all_kwargs):
    """Rank Materials Project structures by SOAP similarity to the input.

    The input structure is compared, through a REMatch kernel over
    normalised SOAP vectors, with every structure returned by the query
    for the chemical systems that can be formed from the input's elements.
    One graph per retrieved structure is returned, ordered from most to
    least similar.

    :param struct: serialised input structure; a falsy value aborts the
        update
    :param all_kwargs: not read here; the keyword values are rebuilt from
        ``callback_context.inputs`` instead
    :return: ``html.Div`` wrapping the list of SOAP graphs
    :raises PreventUpdate: when ``struct`` is falsy
    """
    if not struct:
        raise PreventUpdate

    input_structure = self.from_data(struct)
    structs = {"input": input_structure}
    kwargs = self.reconstruct_kwargs_from_state(callback_context.inputs)

    # Every chemical system that is a non-empty subset of the input elements
    symbols = [str(el) for el in input_structure.composition.elements]
    all_chemsyses = [
        "-".join(sorted(combo))
        for size in range(1, len(symbols) + 1)
        for combo in itertools.combinations(symbols, size)
    ]

    with MPRester() as mpr:
        docs = mpr.query(
            {"chemsys": {"$in": all_chemsyses}},
            ["task_id", "structure"],
        )
    for doc in docs:
        structs[doc["task_id"]] = doc["structure"]

    # "input" is always present at this point, so this guard never fires;
    # it is retained unchanged.
    if not structs:
        raise PreventUpdate

    species = {
        el for candidate in structs.values()
        for el in candidate.composition.elements
    }

    # TODO: make sure is_int kwarg information is enforced so that int() conversion is unnecessary
    desc = SOAP(
        species=[el.number for el in species],
        sigma=kwargs["sigma"],
        rcut=kwargs["rcut"],
        nmax=int(kwargs["nmax"]),
        lmax=int(kwargs["lmax"]),
        periodic=True,
        crossover=kwargs["crossover"],
        sparse=False,
        average=kwargs["average"],
    )

    adaptor = AseAtomsAdaptor()
    atomss = {}
    for mpid, candidate in structs.items():
        atomss[mpid] = adaptor.get_atoms(candidate)

    print(f"Calculating {len(atomss)} SOAP vectors")
    features = {}
    for mpid, atoms in atomss.items():
        features[mpid] = normalize(desc.create(atoms, n_jobs=cpu_count()))

    re = REMatchKernel(
        metric=kwargs["metric"],
        alpha=kwargs["alpha"],
        threshold=kwargs["threshold"],
        normalize_kernel=kwargs["normalize_kernel"],
    )

    print("Calculating similarity kernel")
    re_kernel = re.create(list(features.values()))

    # Row 0 belongs to "input" because it was inserted into the dict first
    similarities = {}
    for mpid, score in zip(features, re_kernel[0]):
        if mpid != "input":
            similarities[mpid] = score

    # Descending similarity; ties keep their insertion order
    sorted_mpids = sorted(similarities, key=lambda mpid: -similarities[mpid])

    all_graphs = []
    for mpid in sorted_mpids:
        caption = [
            html.Span(
                f"{unicodeify(structs[mpid].composition.reduced_formula)}"
            ),
            dcc.Markdown(f"[{mpid}](https://materialsproject.org/{mpid})"),
            html.Span(f"{similarities[mpid]:.5f}"),
        ]
        all_graphs.append(_get_soap_graph(features[mpid], caption))

    return html.Div(all_graphs)
def main(args):
    """Generate SOAP descriptors and, optionally, a REMatch kernel with MPI.

    The molecules are read from an ``.xyz`` file under ``data/``, split
    into contiguous index ranges by ``return_borders`` (one range per MPI
    rank), and described by two SOAP descriptors (a short-range and a
    long-range one) whose outputs are concatenated column-wise. Depending
    on the flags, the per-molecule descriptors and/or the full kernel
    matrix are written to disk.

    :param args: object providing the attributes read below: ``task``,
        ``subtask`` (only read when ``task == 'IC50'``), ``save_soap``,
        ``soap_path``, ``save_kernel`` and ``kernel_path``
    """
    # The IC50 task keeps one .xyz file per subtask in its own directory.
    if args.task != 'IC50':
        mols, num_list, atom_list, species = read_xyz('data/' + args.task + '.xyz')
    else:
        mols, num_list, atom_list, species = read_xyz('data/' + args.task + '/' + args.subtask + '.xyz')
    # NOTE: atom_list is not used anywhere below.

    dat_size = len(mols)

    mpi_comm = MPI.COMM_WORLD
    mpi_rank = mpi_comm.Get_rank()
    mpi_size = mpi_comm.Get_size()

    # Only rank 0 reports progress.
    # NOTE(review): data_name is not defined in this function; presumably it
    # is a module-level global - confirm.
    if mpi_rank == 0:
        print("\nEvaluating " + data_name + " rematch on " + str(mpi_size) + " MPI processes.\n")
        print('No. of molecules = {}\n'.format(dat_size))
        print('Elements present = {}\n'.format(species))

    # Setting up the SOAP descriptor
    rcut_small = 3.0
    sigma_small = 0.2
    rcut_large = 6.0
    sigma_large = 0.4

    small_soap = SOAP(species=species, periodic=False, rcut=rcut_small, nmax=12, lmax=8, sigma=sigma_small, sparse=True)

    large_soap = SOAP(species=species, periodic=False, rcut=rcut_large, nmax=12, lmax=8, sigma=sigma_large, sparse=True)

    t0 = time.time()
    my_border_low, my_border_high = return_borders(mpi_rank, dat_size, mpi_size)  # split indices between MPI processes

    my_mols = mols[my_border_low:my_border_high]
    soap = scipy.sparse.hstack([small_soap.create(my_mols), large_soap.create(my_mols)])  # generate atomic descriptors

    t1 = time.time()
    if mpi_rank == 0:
        print("SOAP: {:.2f}s\n".format(t1 - t0))
        print("rcut_small = {:.1f}, sigma_small = {:.1f}, rcut_large = {:.1f}, sigma_large = {:.1f}".format(rcut_small, sigma_small, rcut_large, sigma_large))

    soap = normalize(soap, copy=False)
    my_soap = split_by_lengths(soap, num_list[my_border_low:my_border_high])  # group atomic descriptors by molecule
    my_len = len(my_soap)

    t2 = time.time()
    if mpi_rank == 0:
        print("Normalise & Split Descriptors: {:.2f}s\n".format(t2 - t1))

    if args.save_soap:  # save to args.soap_path for use with gpr_onthefly.py
        # Files are numbered by global molecule index (local index + offset).
        for i, mat in enumerate(my_soap):
            if args.task != 'IC50':
                scipy.sparse.save_npz(args.soap_path + args.task + '_soap_' + str(i + my_border_low), mat)
            else:
                scipy.sparse.save_npz(args.soap_path + args.subtask + '_soap_' + str(i + my_border_low), mat)

    if args.save_kernel:  # save to args.kernel_path for use with gpr_soap.py
        re = REMatchKernel(metric="polynomial", degree=3, gamma=1, coef0=0, alpha=0.5, threshold=1e-6, normalize_kernel=True)

        # Each rank owns the kernel rows of its own molecules, all columns.
        K = np.zeros((my_len, dat_size), dtype=np.float32)
        # Number of elements each rank contributes; gathered on the root only.
        sendcounts = np.array(mpi_comm.gather(my_len * dat_size, root=0))

        if mpi_rank == 0:
            K_full = np.empty((dat_size, dat_size), dtype=np.float32)
            print("K memory usage(bytes): {}".format(K.nbytes + K_full.nbytes))
        else:
            K_full = None

        # row-parallelised kernel computation: one column block per rank index
        for index in range(0, mpi_size):
            if index == mpi_rank:
                # Own block: kernel of the local descriptors with themselves.
                K[:, my_border_low:my_border_high] += re.create(my_soap).astype(np.float32)
                continue  # skip useless calculation
            # Descriptors of the other rank's molecules are recomputed locally.
            start, end = return_borders(index, dat_size, mpi_size)
            ref_mols = mols[start:end]
            ref_soap = scipy.sparse.hstack([small_soap.create(ref_mols), large_soap.create(ref_mols)])
            ref_soap = normalize(ref_soap, copy=False)
            ref_soap = split_by_lengths(ref_soap, num_list[start:end])
            K[:, start:end] += re.create(my_soap, ref_soap).astype(np.float32)

        # Gather kernel rows
        mpi_comm.Gatherv(sendbuf=K, recvbuf=(K_full, sendcounts), root=0)
        # After this rebinding K is the full matrix on rank 0 and None elsewhere.
        K = K_full

        if mpi_rank == 0:
            t3 = time.time()
            print("Normalised Kernel: {:.2f}s\n".format(t3 - t2))
            np.save(args.kernel_path + data_name + '_soap', K)
            print(K)

    mpi_comm.Barrier()
    MPI.Finalize()
from dscribe.descriptors import SOAP
from dscribe.kernels import REMatchKernel
from ase.build import molecule

# Example: compare two similar molecules with the REMatch kernel.
a = molecule("H2O")
b = molecule("H2O2")

# First we have to create the features for the atomic environments. Let's
# use SOAP.
desc = SOAP(species=[1, 6, 7, 8], rcut=5.0, nmax=2, lmax=2, sigma=0.2, periodic=False, crossover=True, sparse=False)
a_features = desc.create(a)
b_features = desc.create(b)

# Calculate the similarity with a REMatch kernel and a linear metric. The
# result is a full similarity matrix between the given structures.
re = REMatchKernel(metric="linear", alpha=1, threshold=1e-6)
re_kernel = re.create([a_features, b_features])

# Any metric supported by scikit-learn will work, e.g. a Gaussian. Note that
# this rebinds both re and re_kernel.
re = REMatchKernel(metric="rbf", gamma=1, alpha=1, threshold=1e-6)
re_kernel = re.create([a_features, b_features])
def pop_fitness(population, rcut, sigma, kernel, tgt_atoms, tgt_species, tgt_atoms2=None, max_score=[-9999, '']):
    """
    Calculates the fitness (ie SOAP similarity score) of the population by generating conformers for each of the
    population molecules, then evaluating their SOAP descriptors and calculating its similarity score with the SOAP
    descriptor of the binding ligand 'field'

    Conformer generation and similarity calculation are the computational bottlenecks - might be worth splitting
    the task up with MPI.

    Molecules for which conformer embedding fails are deleted from ``population`` IN PLACE, so that on return
    ``population[i]`` corresponds to ``fitness[i]``.

    :param population: list of RDKit molecule objects (mutated in place, see above)
    :param tgt_atoms: list of ASE atom objects of the target ligand field - from read_xyz, second is optional if separate sites
    :param tgt_species: list of the atomic species present in the target ligand field - from read_xyz
    :param rcut, sigma: SOAP parameters
    :param kernel: 'rematch' or 'average'
    :param tgt_atoms2: optional second target; when given the fitness is the product of both similarities
    :param max_score: Maximum SOAP similarity found so far, as [score, smiles]. The default list is only ever
        rebound here, never modified in place.
    :return: fitness (normalised to sum to one), max_score
    :raises ValueError: if ``kernel`` is not one of the supported names
    """
    t0 = time.time()

    # loop over RDKit mols and turn them into lists of ASE atom objects for dscribe SOAP atomic feature generation
    population_ase = []
    num_list = []
    species = ['C']
    bad_indices = []
    for idx, m in enumerate(population):
        m = Chem.AddHs(m)
        conf_result = AllChem.EmbedMolecule(m, maxAttempts=1000)
        if conf_result != 0:
            bad_indices.append(idx)
            continue
        m = Chem.RemoveHs(m)
        num_list.append(len(m.GetAtoms()))
        # fetch the coordinates once per molecule instead of once per atom
        positions = m.GetConformer().GetPositions()
        for i, atom in enumerate(m.GetAtoms()):
            symbol = atom.GetSymbol()
            # NOTE(review): every atom is wrapped in its own single-atom Atoms object, so its SOAP
            # environment contains no other atoms of the molecule - confirm this is intended.
            population_ase.append(Atoms(symbol, [positions[i]]))
            if symbol not in species:  # find unique atomic species for SOAP generation
                species.append(symbol)

    # Filter out molecules which have no conformers. Deleting by index is required: the molecule returned by
    # Chem.AddHs / Chem.RemoveHs is a new object, so searching the list for it never finds the original entry.
    # Going backwards keeps the remaining indices valid.
    for idx in reversed(bad_indices):
        del population[idx]

    # Check that we also include the atom types present in the ligand targets
    for atom in tgt_species:
        if atom not in species:
            species.append(atom)

    t1 = time.time()
    print('Time taken to generate conformers: {}'.format(t1 - t0))

    # Generate SOAP descriptors using dscribe
    soap_generator = SOAP(species=species, periodic=False, rcut=rcut, nmax=8, lmax=6, sigma=sigma, sparse=True)
    soap = soap_generator.create(population_ase)
    tgt_soap = soap_generator.create(tgt_atoms)
    if tgt_atoms2 is not None:
        tgt_soap2 = [normalize(soap_generator.create(tgt_atoms2), copy=False)]

    # normalize SOAP atom descriptors and group by molecule
    soap = normalize(soap, copy=False)
    tgt_soap = [normalize(tgt_soap, copy=False)]
    soap = split_by_lengths(soap, num_list)

    t2 = time.time()
    print('Time taken to generate SOAP descriptors: {}'.format(t2 - t1))

    # TODO make REMatch kernel args as input args
    if kernel == 'rematch':
        soap_similarity = REMatchKernel(metric="polynomial", degree=3, gamma=1, coef0=0, alpha=0.1, threshold=1e-3, normalize_kernel=True)
    elif kernel == 'average':
        soap_similarity = AverageKernel(metric="polynomial", degree=3, gamma=1, coef0=0, normalize_kernel=True)
    else:
        raise ValueError("Unknown kernel '{}': expected 'rematch' or 'average'".format(kernel))

    fitness = soap_similarity.create(soap, tgt_soap)
    if tgt_atoms2 is not None:
        # calculate fitness score as product of the two fitnesses
        fitness = np.multiply(fitness, soap_similarity.create(soap, tgt_soap2))
    fitness = fitness.flatten()

    t3 = time.time()
    print('Time taken to calculate fitness: {}'.format(t3 - t2))

    # update max_score, include new champion
    if np.amax(fitness) > max_score[0]:
        max_score = [np.amax(fitness), Chem.MolToSmiles(population[np.argmax(fitness)])]

    # Print the top 5 scores (or fewer for a small population) and corresponding molecules for a particular generation
    ranking = np.argsort(fitness)[::-1]
    for i, best in enumerate(ranking[:5]):
        print("Mol {}: {} (fitness = {:.3f})".format(i, Chem.MolToSmiles(population[best]), fitness[best]))

    fitness = fitness / np.sum(fitness)

    return fitness, max_score
def pop_fitness(mpi_comm, mpi_rank, mpi_size, population, rcut, sigma, kernel, tgt_atoms, tgt_species, tgt_atoms2=None, max_score=[-9999, '']):
    """
    Calculates the fitness (ie SOAP similarity score) of the population by generating conformers for each of the
    population molecules, then evaluating their SOAP descriptors and calculating its similarity score with the SOAP
    descriptor of the binding ligand 'field'

    The population is partitioned between the MPI processes with ``return_borders``; the partial fitness arrays are
    gathered on rank 0 and broadcast back, so every rank returns the full array. Molecules whose conformer
    embedding fails stay in the population and receive a fitness of 0.

    :param mpi_comm, mpi_rank, mpi_size: MPI communicator, rank of this process and number of processes
    :param population: list of RDKit molecule objects
    :param tgt_atoms: list of ASE atom objects of the target ligand field - from read_xyz, second is optional if separate sites
    :param tgt_species: list of the atomic species present in the target ligand field - from read_xyz
    :param rcut, sigma: SOAP parameters
    :param kernel: 'rematch' or 'average'
    :param tgt_atoms2: optional second target; when given the fitness is the product of both similarities
    :param max_score: Maximum SOAP similarity found so far, as [score, smiles]. The default list is only ever
        rebound here, never modified in place.
    :return: fitness (normalised to sum to one), max_score
    :raises ValueError: if ``kernel`` is not one of the supported names
    """
    t0 = time.time()

    # partition the population between the MPI cpus
    my_border_low, my_border_high = return_borders(mpi_rank, len(population), mpi_size)
    my_pop = population[my_border_low:my_border_high]

    # loop over RDKit mols and turn them into lists of ASE atom objects for dscribe SOAP atomic feature generation
    population_ase = []
    num_list = []
    species = ['C']
    bad_mols = []
    for ind, m in enumerate(my_pop):
        m = Chem.AddHs(m)
        embedded = AllChem.EmbedMolecule(m, maxAttempts=1000) == 0
        m = Chem.RemoveHs(m)
        num_list.append(len(m.GetAtoms()))
        if embedded:
            # fetch the coordinates once per molecule instead of once per atom
            positions = m.GetConformer().GetPositions()
        else:
            # record the failure once per molecule (not once per atom)
            bad_mols.append(ind)
        for i, atom in enumerate(m.GetAtoms()):
            # this is actually wrong, should have an Atoms object for each mol...
            symbol = atom.GetSymbol()
            if embedded:
                population_ase.append(Atoms(symbol, [positions[i]]))
            else:
                # placeholder at the origin keeps the row count aligned with num_list
                population_ase.append(Atoms(symbol, [[0, 0, 0]]))
            if symbol not in species:  # find unique atomic species for SOAP generation
                species.append(symbol)

    # Check that we also include the atom types present in the ligand targets
    for atom in tgt_species:
        if atom not in species:
            species.append(atom)

    t1 = time.time()
    if mpi_rank == 0:
        print('Time taken to generate conformers: {}'.format(t1 - t0))

    # Generate SOAP descriptors using dscribe
    soap_generator = SOAP(species=species, periodic=False, rcut=rcut, nmax=8, lmax=6, sigma=sigma, sparse=True)
    soap = soap_generator.create(population_ase)
    tgt_soap = soap_generator.create(tgt_atoms)
    if tgt_atoms2 is not None:
        tgt_soap2 = [normalize(soap_generator.create(tgt_atoms2), copy=False)]

    # normalize SOAP atom descriptors and group by molecule
    soap = normalize(soap, copy=False)
    tgt_soap = [normalize(tgt_soap, copy=False)]
    soap = split_by_lengths(soap, num_list)

    t2 = time.time()
    if mpi_rank == 0:
        print('Time taken to generate SOAP descriptors: {}'.format(t2 - t1))

    # TODO make REMatch kernel args as input args
    if kernel == 'rematch':
        soap_similarity = REMatchKernel(metric="polynomial", degree=3, gamma=1, coef0=0, alpha=0.1, threshold=1e-3, normalize_kernel=True)
    elif kernel == 'average':
        soap_similarity = AverageKernel(metric="polynomial", degree=3, gamma=1, coef0=0, normalize_kernel=True)
    else:
        raise ValueError("Unknown kernel '{}': expected 'rematch' or 'average'".format(kernel))

    fitness = soap_similarity.create(soap, tgt_soap)
    if tgt_atoms2 is not None:
        # calculate fitness score as product of the two fitnesses
        fitness = np.multiply(fitness, soap_similarity.create(soap, tgt_soap2))
    # flatten() returns a contiguous copy, which is what Gatherv needs as a send buffer
    fitness = fitness.flatten()

    fitness[bad_mols] = 0  # set fitness of bad conformers to 0

    sendcounts = np.array(mpi_comm.gather(len(fitness), root=0))

    if mpi_rank == 0:
        fitness_full = np.empty(len(population))
    else:
        fitness_full = None

    # Gather fitness arrays from MPI cpus into the root cpu, then broadcast the gathered array to all cpus
    mpi_comm.Gatherv(sendbuf=fitness, recvbuf=(fitness_full, sendcounts), root=0)
    fitness = mpi_comm.bcast(fitness_full, root=0)

    t3 = time.time()
    if mpi_rank == 0:
        print('Time taken to calculate fitness: {}'.format(t3 - t2))

    # update max_score, include new champion
    if np.amax(fitness) > max_score[0]:
        max_score = [np.amax(fitness), Chem.MolToSmiles(population[np.argmax(fitness)])]

    # Print the top 5 scores (or fewer for a small population) and corresponding molecules for a particular
    # generation; the ranking is only needed on the printing rank
    if mpi_rank == 0:
        ranking = np.argsort(fitness)[::-1]
        for i, best in enumerate(ranking[:5]):
            print("Mol {}: {} (fitness = {:.3f})".format(i, Chem.MolToSmiles(population[best]), fitness[best]))

    fitness = fitness / np.sum(fitness)

    return fitness, max_score
sparse=False, rbf='gto') descriptors = [soapgen_rcut.create(i) for i in structures] descdiffs.append(descriptors[1][0][0] - descriptors[0][0][0]) tic_1 = time.perf_counter() re = AverageKernel(metric='linear') kern = re.create(descriptors) toc_1 = time.perf_counter() ctime.append(toc_1 - tic_1) kerndiffs.append(kern[0][1]) tic_2 = time.perf_counter() normed = [normalize(i) for i in descriptors] rem = REMatchKernel(metric='rbf', gamma=1, alpha=1, threshold=1e-6) remkern = rem.create(descriptors) toc_2 = time.perf_counter() rectime.append(toc_2 - tic_2) remkerndiffs.append(remkern[0][1]) allkerndiffs.append(abs(remkern[0][1] - kern[0][1])) plt.plot(xax, ctime, label='Average Kernel') plt.plot(xax, rectime, label='REMatch Kernel') plt.xlabel('Number of radial basis functions') plt.ylabel('Kernel comparison time') plt.title(f'lmax = {lmax}, rcut = {rcut}') plt.legend() plt.savefig(outputdir + f"/{test_name}_{comp_name}_kerncomp_lmax={lmax}_rcut={rcut}.png")
nmax=9, lmax=9, sigma=0.3, periodic=True, crossover=True, sparse=False) ref_features = desc.create(reference_frame) ref_features = normalize(ref_features) re = REMatchKernel(metric="linear", alpha=1, threshold=1e-6, gamma=1) similarities = [] for i, image in enumerate(images): image_features = desc.create(image) image_features = normalize(image_features) re_kernel = re.create([image_features, ref_features]) print(i, re_kernel[0][1]) similarities.append(re_kernel[0][1]) scorer = AnharmonicScore(md_frames='./vasprun_md.xml', ref_frame='./SPOSCAR', unit_cell_frame='./SPOSCAR') sigmas, _ = scorer.structural_sigma(return_trajectory=True) fig, ax1 = plt.subplots(figsize=(7, 5)) color = '#1fbfb8' ax1.set_xlabel('$t$ (fs)') ax1.set_ylabel('$\\sigma(t)$', color=color) ax1.plot(range(len(sigmas)), sigmas, '.-', color=color) ax1.tick_params(axis='y', labelcolor=color)
class SOAP_onthefly(gpflow.kernels.Kernel):
    """
    A kernel class that dynamically calculates the SOAP kernel on-the-fly by loading SOAP descriptors from
    args.soap_path by index and calculating the SOAP kernel.

    The inputs ``X`` / ``X2`` hold integer molecule indices (one active dimension); the descriptor of index ``i``
    is read from ``<args.soap_path><args.task>_soap_<i>.npz``.
    """

    def __init__(self, args):
        """
        :param args: object providing ``soap_path`` and ``task``, used to build the descriptor file names
        """
        super().__init__(active_dims=[0])
        self.var = gpflow.Parameter(10.0, transform=positive())
        self.mag = gpflow.Parameter(1.0, transform=positive())
        self.args = args
        self.re = REMatchKernel(metric="polynomial", degree=3, gamma=1, coef0=0, alpha=0.5, threshold=1e-6, normalize_kernel=True)

    def _load_soap(self, X):
        """Load, stack, normalise and re-split the descriptors of the molecule indices stored in ``X``."""
        indices = X.numpy().flatten().astype(int)
        # Use the args stored on the instance rather than relying on a module-level ``args`` being present.
        prefix = self.args.soap_path + self.args.task + '_soap_'
        mats = [scipy.sparse.load_npz(prefix + str(i) + '.npz') for i in indices]
        # number of rows (atomic environments) contributed by each molecule
        lengths = [mat.get_shape()[0] for mat in mats]
        # a single vstack avoids re-copying the growing matrix once per molecule
        stacked = scipy.sparse.vstack(mats, format='csr')
        stacked = normalize(stacked, copy=False)
        return split_by_lengths(stacked, lengths)

    def _matern32(self, K_mat):
        """Apply the Matern v=3/2 form to the REMatch kernel matrix ``K_mat``."""
        max_rem = K_mat.max()
        z = tf.math.sqrt(6 * (max_rem - tf.constant(K_mat, dtype=tf.float64))) * self.var
        return self.mag * (1 + z) * tf.math.exp(-z)

    def K(self, X, X2=None, presliced=None):
        """
        :param X: tensor of molecule indices
        :param X2: optional second tensor of molecule indices; when None the kernel of ``X`` with itself is returned
        :param presliced: accepted for interface compatibility; not read here
        :return: tensor holding the transformed kernel matrix
        """
        X_soap = self._load_soap(X)
        if X2 is None:
            K_mat = self.re.create(X_soap)
        else:
            K_mat = self.re.create(X_soap, self._load_soap(X2))
        return self._matern32(K_mat)

    def K_diag(self, X, presliced=None):
        """Return the kernel diagonal: a flat tensor of ones scaled by ``self.mag``."""
        return self.mag * tf.reshape(tf.ones_like(X), -1)  # diagonal of ones * self.mag