def dist_angle_calculator(u_frag, u_overlap, v_frag, v_overlap, u_connections=1, v_connections=1, verbose=True):
    """
    Evaluates distance_angle for every pairing of an exit found on the u fragment with an exit
    found on the v fragment.

    :param u_frag, u_overlap: inputs handed to read_xyz for the u fragment and its overlap region
    :param v_frag, v_overlap: inputs handed to read_xyz for the v fragment and its overlap region
    :param u_connections, v_connections: forwarded unchanged to coordinate_finder for u and v
    :param verbose: forwarded unchanged to coordinate_finder
    :return: (distances, angles) - two arrays of shape (number of u exits, number of v exits);
             entry [i, j] holds the pair of values distance_angle returned for u exit i and v exit j
    """
    def _positions(source):
        # only the positions of the first item returned by read_xyz are used
        return read_xyz(source)[0].get_positions()

    u_frag = _positions(u_frag)
    u_overlap = _positions(u_overlap)
    v_frag = _positions(v_frag)
    v_overlap = _positions(v_overlap)

    # each call yields the exit coordinates plus one "linked" coordinate row per exit
    u_exits, u_linkeds = coordinate_finder(u_frag, u_overlap, u_connections, verbose)
    v_exits, v_linkeds = coordinate_finder(v_frag, v_overlap, v_connections, verbose)

    grid_shape = (u_exits.shape[0], v_exits.shape[0])
    distances = np.zeros(grid_shape)
    angles = np.zeros(grid_shape)

    for row, u_exit in enumerate(u_exits):
        u_linked = u_linkeds[row, :]
        for col, v_exit in enumerate(v_exits):
            distances[row, col], angles[row, col] = distance_angle(
                u_exit, u_linked, v_exit, v_linkeds[col, :])

    return distances, angles
def return_overlap(x1, x2, radius=0.8, n_overlap=3, make_plot=False, verbose=False):
    """
    Calculates the Euclidean distance matrix between 2 fragments, and reports whether or not the
    fragments overlap in space, as determined by a threshold on the Euclidean distance.

    :param x1, x2: .xyz files of the fragment atom coordinates (passed to read_xyz)
    :param radius: threshold for determining atom overlap (in angstroms); a pair of atoms counts
                   as overlapping when its distance is <= radius
    :param n_overlap: minimum number of overlapping (x1 atom, x2 atom) pairs for the fragment
                      pair to be considered overlapping
    :param make_plot: set to True to plot the euclidean distance matrix and save it to
                      'euc_dist.png' in the current working directory
    :param verbose: set to True to print the number of overlapping atom pairs for x1, x2
    :return: tuple (overlap, u_atoms, v_atoms, u_overlaps, v_overlaps).
             If overlap is True, u_overlaps / v_overlaps hold the atoms selected by the
             overlapping index pairs and u_atoms / v_atoms have those atoms deleted.
             If overlap is False, u_atoms / v_atoms are returned as read and the last two
             entries are None.
    """
    # read 2 xyz files
    u_atoms = read_xyz(x1)[0]
    v_atoms = read_xyz(x2)[0]

    # loop over atom xyz vectors, calculate euclidean distance
    # (positions are fetched once instead of once per outer iteration)
    u_positions = u_atoms.get_positions()
    v_positions = v_atoms.get_positions()
    dist_mat = np.empty((len(u_atoms), len(v_atoms)))
    for i, u in enumerate(u_positions):
        for j, v in enumerate(v_positions):
            dist_mat[i, j] = euc_dist(u, v)

    if make_plot:
        # Draw on the figure created here (not a hard-coded figure number) and close it after
        # saving, so repeated calls neither overdraw an old figure nor accumulate open figures.
        fig = plt.figure(figsize=(7, 7))
        plt.matshow(dist_mat, fignum=fig.number)
        plt.colorbar(fraction=0.046, pad=0.04)
        plt.title('Euclidean Distance Matrix')
        plt.savefig('euc_dist.png')
        plt.close(fig)

    # apply distance threshold; the count is over (i, j) pairs, so one atom close to several
    # atoms of the other fragment is counted once per pair
    within_radius = np.less_equal(dist_mat, radius)
    num_overlap_atoms = int(np.count_nonzero(within_radius))

    if num_overlap_atoms < n_overlap:  # choose n_overlap=4 for planar overlap
        if verbose:
            print('No overlap found :(')
        return False, u_atoms, v_atoms, None, None

    # row 0: indices into u_atoms, row 1: indices into v_atoms
    overlap_atom_indices = np.argwhere(within_radius).T
    u_overlaps = u_atoms[overlap_atom_indices[0]]
    v_overlaps = v_atoms[overlap_atom_indices[1]]
    # remove the overlapping atoms from the fragments that are handed back
    del u_atoms[overlap_atom_indices[0]]
    del v_atoms[overlap_atom_indices[1]]
    if verbose:
        print('Overlap found! Number of overlapping atoms: {}'.format(num_overlap_atoms))
    return True, u_atoms, v_atoms, u_overlaps, v_overlaps
def initialise_system(args):
    """
    Reads in a .csv file and generates a population of RDKit molecules, as well as reading in
    target ligand coordinates

    :param args: system arguments parsed into main - should contain args.csv (csv file with a
                 header row and a 'SMILES' column), args.tgt and args.tgt2 (may be None)
    :return: (population, tgt_atoms, tgt_species) when args.tgt2 is None, otherwise
             (population, tgt_atoms, tgt_species, tgt_atoms2) where tgt_species is the sorted
             union of the species of both targets. Note that the species of the second target
             are not returned separately.
    """
    csv = pd.read_csv(args.csv, header=0)
    # one molecule per row of the SMILES column, in file order
    population = [Chem.MolFromSmiles(smiles) for smiles in csv['SMILES']]

    tgt_atoms, _, _, tgt_species = read_xyz(args.tgt)
    if args.tgt2 is None:
        return population, tgt_atoms, tgt_species

    tgt_atoms2, _, _, tgt_species2 = read_xyz(args.tgt2)
    # creates a single tgt_species list; sorted so the order does not depend on set iteration
    # order (which can differ between runs for strings)
    tgt_species = sorted(set(tgt_species).union(tgt_species2))
    return population, tgt_atoms, tgt_species, tgt_atoms2
def main(args):
    """
    Builds SOAP descriptors for the structures read from args.xyz, normalises them and writes
    the result to args.tgt with np.save.

    :param args: parsed arguments; args.xyz (input file), args.rcut and args.sigma (passed to
                 SOAP) and args.tgt (output target for np.save) are read
    :return: None
    """
    # only the structures and the species list are needed; the two middle values are unused
    mols, _, _, species = read_xyz(args.xyz)

    soap_settings = dict(
        species=species,
        periodic=False,
        rcut=args.rcut,
        nmax=8,
        lmax=6,
        sigma=args.sigma,
        sparse=True,
    )
    descriptors = SOAP(**soap_settings).create(mols)
    descriptors = normalize(descriptors, copy=False)

    # the descriptor object is saved wrapped in a one-element list
    np.save(args.tgt, [descriptors])
def main(args):
    """
    Generates two-scale SOAP descriptors for a dataset, with the molecules split between MPI
    processes, and optionally saves the per-molecule descriptors and/or the REMatch kernel.

    :param args: parsed arguments. Reads args.task (dataset name; 'IC50' additionally uses
                 args.subtask to pick the file), args.save_soap / args.soap_path and
                 args.save_kernel / args.kernel_path.
    :return: None. MPI is finalised before returning.

    NOTE(review): `data_name` is used below (start-up message and kernel file name) but is never
    assigned inside this function - it must exist at module level, otherwise rank 0 raises
    NameError. Confirm against the rest of the file.
    """
    # the IC50 task keeps one .xyz file per subtask in its own sub-directory
    if args.task != 'IC50':
        mols, num_list, atom_list, species = read_xyz('data/' + args.task + '.xyz')
    else:
        mols, num_list, atom_list, species = read_xyz('data/' + args.task + '/' + args.subtask + '.xyz')
    dat_size = len(mols)

    mpi_comm = MPI.COMM_WORLD
    mpi_rank = mpi_comm.Get_rank()
    mpi_size = mpi_comm.Get_size()

    # only rank 0 reports progress
    if mpi_rank == 0:
        print("\nEvaluating " + data_name + " rematch on " + str(mpi_size) + " MPI processes.\n")
        print('No. of molecules = {}\n'.format(dat_size))
        print('Elements present = {}\n'.format(species))

    # Setting up the SOAP descriptor: one short-range and one long-range generator whose
    # outputs are concatenated column-wise below
    rcut_small = 3.0
    sigma_small = 0.2
    rcut_large = 6.0
    sigma_large = 0.4
    small_soap = SOAP(species=species, periodic=False, rcut=rcut_small, nmax=12, lmax=8, sigma=sigma_small, sparse=True)
    large_soap = SOAP(species=species, periodic=False, rcut=rcut_large, nmax=12, lmax=8, sigma=sigma_large, sparse=True)

    t0 = time.time()
    my_border_low, my_border_high = return_borders(
        mpi_rank, dat_size, mpi_size)  # split indices between MPI processes
    my_mols = mols[my_border_low:my_border_high]
    soap = scipy.sparse.hstack(
        [small_soap.create(my_mols), large_soap.create(my_mols)])  # generate atomic descriptors
    t1 = time.time()
    if mpi_rank == 0:
        print("SOAP: {:.2f}s\n".format(t1 - t0))
        print(
            "rcut_small = {:.1f}, sigma_small = {:.1f}, rcut_large = {:.1f}, sigma_large = {:.1f}"
            .format(rcut_small, sigma_small, rcut_large, sigma_large))

    soap = normalize(soap, copy=False)
    my_soap = split_by_lengths(soap, num_list[my_border_low:my_border_high]
                               )  # group atomic descriptors by molecule
    my_len = len(my_soap)
    t2 = time.time()
    if mpi_rank == 0:
        print("Normalise & Split Descriptors: {:.2f}s\n".format(t2 - t1))

    if args.save_soap:  # save to args.soap_path for use with gpr_onthefly.py
        # files are numbered by the molecule's global index (local index + this rank's offset)
        for i, mat in enumerate(my_soap):
            if args.task != 'IC50':
                scipy.sparse.save_npz(
                    args.soap_path + args.task + '_soap_' + str(i + my_border_low), mat)
            else:
                scipy.sparse.save_npz(
                    args.soap_path + args.subtask + '_soap_' + str(i + my_border_low), mat)

    if args.save_kernel:  # save to args.kernel_path for use with gpr_soap.py
        # NOTE(review): the local name `re` coincides with the standard-library module name
        re = REMatchKernel(metric="polynomial", degree=3, gamma=1, coef0=0, alpha=0.5, threshold=1e-6, normalize_kernel=True)

        # this rank's slab of the kernel: its own molecules (rows) against all molecules (columns)
        K = np.zeros((my_len, dat_size), dtype=np.float32)
        # every rank contributes the element count of its slab; used as receive counts by Gatherv
        sendcounts = np.array(mpi_comm.gather(my_len * dat_size, root=0))

        # the full kernel is only allocated on rank 0
        if mpi_rank == 0:
            K_full = np.empty((dat_size, dat_size), dtype=np.float32)
            print("K memory usage(bytes): {}".format(K.nbytes + K_full.nbytes))
        else:
            K_full = None

        # row-parallelised kernel computation: fill K one column block per rank
        for index in range(0, mpi_size):
            if index == mpi_rank:
                # diagonal block: kernel of this rank's molecules with themselves
                K[:, my_border_low:my_border_high] += re.create(my_soap).astype(
                    np.float32)
                continue  # skip useless calculation
            # off-diagonal block: the other rank's descriptors are recomputed locally from mols
            # rather than communicated
            start, end = return_borders(index, dat_size, mpi_size)
            ref_mols = mols[start:end]
            ref_soap = scipy.sparse.hstack(
                [small_soap.create(ref_mols), large_soap.create(ref_mols)])
            ref_soap = normalize(ref_soap, copy=False)
            ref_soap = split_by_lengths(ref_soap, num_list[start:end])
            K[:, start:end] += re.create(my_soap, ref_soap).astype(np.float32)

        # Gather kernel rows into K_full on rank 0
        # NOTE(review): presumably return_borders yields contiguous, rank-ordered row ranges so
        # the gathered slabs line up with the (dat_size, dat_size) layout - confirm
        mpi_comm.Gatherv(sendbuf=K, recvbuf=(K_full, sendcounts), root=0)
        K = K_full  # None on every rank except 0

        if mpi_rank == 0:
            t3 = time.time()
            print("Normalised Kernel: {:.2f}s\n".format(t3 - t2))
            np.save(args.kernel_path + data_name + '_soap', K)
            print(K)

    mpi_comm.Barrier()
    # MPI is shut down here, inside main
    MPI.Finalize()