def is_same_mol(smiles1, smiles2):
    """Report whether two SMILES strings canonicalize to the same string.

    Both inputs are passed through ``Chem.CanonSmiles`` and the results are
    compared.  Any exception raised while canonicalizing yields ``False``.
    """
    try:
        return Chem.CanonSmiles(smiles1) == Chem.CanonSmiles(smiles2)
    except Exception:
        return False
def test_queryStep(self):
    # Seed the environment, append a benzene ring, then query a removal
    # for 'Arom6'; afterwards the molecule must canonicalize to the seed.
    self.env.seed('C(=O)O')
    self.action.setAction("add", pos="back", mol='C1=CC=CC=C1')
    self.env.step(self.action)

    outcome = self.env._queryStep("remove", query=np.array(['Arom6']))
    self.assertTrue(outcome)

    observed = Chem.CanonSmiles(self.env._listToSmiles())
    expected = Chem.CanonSmiles('C(=O)O')
    self.assertEqual(expected, observed)
def testDeprotect(self):
    # Input carries two Boc groups; both are expected to be removed.
    protected = Chem.MolFromSmiles("N(C(=O)OC(C)(C)C)Cc1ccccc1NC(=O)OC(C)(C)C")
    deprotected = rd.Deprotect(protected)

    actual_smiles = Chem.MolToSmiles(deprotected)
    expected_smiles = Chem.CanonSmiles("NCc1ccccc1N")
    self.assertEqual(actual_smiles, expected_smiles)

    # The applied deprotections and their count are recorded as properties.
    applied = list(deprotected.GetPropsAsDict()["DEPROTECTIONS"])
    self.assertEqual(applied, ["Boc", "Boc"])
    applied_count = deprotected.GetPropsAsDict()["DEPROTECTION_COUNT"]
    self.assertEqual(applied_count, 2)
def main():
    """Convert a SMILES file or SD file into Astex Fragment network input.

    Command-line options:
        --input         molecule file handed to ``parse_mols``
        --input_format  format passed to ``parse_mols`` (default ``smi``)
        --base_dir      output directory, created when it does not exist
        --iso_flag      boolean forwarded to ``NodeHolder`` (default true)
    """

    def parse_bool(text):
        # argparse delivers option values as strings; without a converter
        # "--iso_flag False" would arrive as the truthy string "False".
        lowered = text.strip().lower()
        if lowered in ("1", "true", "t", "yes", "y"):
            return True
        if lowered in ("0", "false", "f", "no", "n"):
            return False
        raise argparse.ArgumentTypeError(
            "expected a boolean value, got %r" % text)

    # Read in a SD or SMILES file - then write out into a specified directory
    parser = argparse.ArgumentParser(
        description=
        "Convert a SMILES or SDFile to input for Astex Fragment network.")
    parser.add_argument("--input")
    parser.add_argument("--input_format", default="smi")
    parser.add_argument("--base_dir")
    parser.add_argument("--iso_flag", type=parse_bool, default=True)
    args = parser.parse_args()

    attrs = []
    for x in tqdm(parse_mols(args.input, args.input_format)):
        # Unparsable entries come back as None and are skipped.
        if x is None:
            continue
        attrs.append(
            Attr(
                Chem.CanonSmiles(Chem.MolToSmiles(x, isomericSmiles=True)),
                ["EM", x.GetProp("_Name")],
            ))

    if not os.path.isdir(args.base_dir):
        # makedirs also creates missing parent directories.
        os.makedirs(args.base_dir)

    # Build the network
    node_holder = NodeHolder(iso_flag=args.iso_flag)
    node_holder = build_network(attrs, node_holder)

    # Write the data out
    write_data(args.base_dir, node_holder, attrs)
def A2AR(input, out):
    """Construct the A2AR set used for fine-tuned model and predictor training.

    Arguments:
        input (str): path of the tab-delimited data file that contains
            CANONICAL_SMILES.
        out (str): path where the refined table is saved.  Refinement
            replaces charged nitrogen groups with a plain "N", strips
            isotope labels, keeps the largest fragment, canonicalizes the
            SMILES with RDKit, and drops molecules containing the listed
            metal/metalloid atoms or fewer than two carbon characters.
    """
    df = pd.read_table(input)
    df = df[[
        'CMPD_CHEMBLID', 'CANONICAL_SMILES', 'PCHEMBL_VALUE',
        'ACTIVITY_COMMENT'
    ]]
    df = df.dropna()

    # Raw string: '\[' and '\d' are invalid escapes in a normal literal.
    isotope_re = re.compile(r'\[\d+')
    excluded_atoms = ('[Au]', '[As]', '[Hg]', '[Se]')

    canonical = []
    keep = []
    for raw in df['CANONICAL_SMILES']:
        # replacing the charged nitrogen groups with the nitrogen atom "N"
        smile = raw.replace('[NH+]', 'N').replace('[NH2+]', 'N').replace(
            '[NH3+]', 'N')
        # removing the isotope label of each atom
        smile = isotope_re.sub('[', smile)
        # keeping the largest fragment (first one wins on ties)
        if '.' in smile:
            smile = max(smile.split('.'), key=len)
        # Transforming into canonical SMILES based on the RDKit algorithm.
        canonical.append(Chem.CanonSmiles(smile, 0))
        # removing molecules that contain a metal atom or too little carbon
        has_excluded = any(atom in smile for atom in excluded_atoms)
        too_few_carbons = smile.count('C') + smile.count('c') < 2
        keep.append(not (has_excluded or too_few_carbons))

    # Apply the results once instead of rewriting/dropping rows one by one.
    df = df.assign(CANONICAL_SMILES=canonical)
    df = df.loc[np.array(keep, dtype=bool)]
    # df = df.drop_duplicates(subset='CANONICAL_SMILES')
    df.to_csv(out, index=False, sep='\t')
def smiles(dir="../data/xyz/DB3/", verbose=1):
    """Collect one SMILES string per file found in a directory.

    Each file is first read as an xyz structure through pybel and
    canonicalized with RDKit.  When that fails, the file's first line is
    used as-is, provided it is longer than five characters.

    Arguments:
        dir (str): directory to scan; it is concatenated directly with the
            file names, so it should end with a path separator.
        verbose (int): when 1, a progress counter is written to stdout.

    Returns:
        tuple: ``(names, ret_list)`` - the file names that produced an entry
        and the matching SMILES strings, in listing order.
    """
    with os.popen("ls " + str(dir) + " | sort -d ") as pipe:
        filenames = str(pipe.read()).split()
    total = str(len(filenames))

    ret_list = []
    names = []

    def report(position):
        if verbose == 1:
            sys.stdout.write("\r %s / " % position + total)
            sys.stdout.flush()

    for j, i in enumerate(filenames):
        try:
            mol = next(pybel.readfile("xyz", dir + i))
            smi = Chem.CanonSmiles(mol.write(format="smi"))
            ret_list.append(smi.split()[0].strip())
            names.append(i)
            report(j)
            continue
        except Exception:
            # Fall back to treating the file as plain text below.
            pass

        try:
            with open(dir + i, "r") as handle:
                first_line = handle.readline()
        except (OSError, ValueError):
            # Unreadable or undecodable file: skip it.
            continue

        # The original tested len(smi > 5), which raises TypeError and was
        # silently swallowed, so this fallback never produced an entry.
        if len(first_line) > 5:
            ret_list.append(first_line)
            names.append(i)
            report(j)

    return names, ret_list
def main():
    """Convert a SMILES file or SD file into Astex Fragment network input.

    Validates ``--input`` (must be an existing file) and ``--base_dir``
    (must be an existing directory), exiting with status 1 and an error
    message otherwise.  ``--isomeric`` / ``--non_isomeric`` set the flag
    passed to ``NodeHolder`` (isomeric by default); ``-v`` / ``-vv`` set the
    verbosity forwarded to ``build_network``.
    """
    # Read in a SD or SMILES file - then write out into a specified directory
    parser = argparse.ArgumentParser(
        description=
        "Convert a SMILES or SDFile to input for Astex Fragment network.")
    parser.add_argument("--input")
    parser.add_argument("--input_format", default="smi")
    parser.add_argument("--base_dir")
    parser.add_argument("--isomeric", dest="iso_flag", action="store_true")
    parser.add_argument("--non_isomeric", dest="iso_flag",
                        action="store_false")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-v", dest="verbosity", action="store_const", const=1)
    group.add_argument("-vv", dest="verbosity", action="store_const", const=2)
    parser.set_defaults(verbosity=0)
    parser.set_defaults(iso_flag=True)
    args = parser.parse_args()

    # Do we have an input and base directory?
    if not args.input:
        print('ERROR: Must specify an input')
        sys.exit(1)
    if not os.path.isfile(args.input):
        print('ERROR: input (%s) does not exist' % args.input)
        sys.exit(1)
    if not args.base_dir:
        print('ERROR: Must specify a base directory')
        sys.exit(1)
    if not os.path.isdir(args.base_dir):
        print('ERROR:input base directory (%s) does not exist' % args.base_dir)
        sys.exit(1)
    # base_dir is known to exist from here on, so the original's second
    # "create it if missing" step was unreachable and has been dropped.

    # The progress bar is switched off whenever a verbosity level is set.
    tqdm_disable = bool(args.verbosity)

    attrs = []
    mols = parse_mols(args.input, args.input_format)
    for x in tqdm(mols, disable=tqdm_disable):
        # Guard first: the original printed the SMILES before this check
        # and therefore crashed on molecules that failed to parse.
        if x is None:
            continue
        iso_smiles = Chem.MolToSmiles(x, isomericSmiles=True)
        print("Processing " + iso_smiles)
        attrs.append(
            Attr(Chem.CanonSmiles(iso_smiles), ["EM", x.GetProp("_Name")]))

    # Build the network
    node_holder = NodeHolder(iso_flag=args.iso_flag)
    max_frags = 0
    node_holder = build_network(attrs, node_holder, max_frags, args.base_dir,
                                args.verbosity)

    # Write the data out
    write_data(args.base_dir, node_holder, attrs)
def graph_corpus(input, output, suffix='sdf'):
    """Encode molecules as graph codes and save them as a tab-separated table.

    Arguments:
        input (str): path of the molecule file: a gzip-compressed SD file
            when ``suffix == 'sdf'``, otherwise a tab-delimited table with
            ``Smiles`` and ``Molecule ChEMBL ID`` columns.
        output (str): path of the tab-separated output table; one row of
            codes per accepted molecule, indexed by molecule id.
        suffix (str): ``'sdf'`` (default) selects the SD-file reader, any
            other value selects the table reader.

    Molecules are skipped when they have fewer than 4 or at least 63 bonds,
    contain no carbon, contain one of the listed metals, or when the
    encoded graph does not decode back to the cleaned canonical SMILES.
    """
    metals = {'Na', 'Zn', 'Li', 'K', 'Ca', 'Mg', 'Ag', 'Cs', 'Ra', 'Rb', 'Al',
              'Sr', 'Ba', 'Bi'}
    voc = utils.VocGraph('data/voc_atom.txt')

    # Only the SD-file path needs the gzip stream; it must stay open while
    # the lazy supplier is being iterated and is closed in `finally`.
    inf = None
    if suffix == 'sdf':
        inf = gzip.open(input)
    try:
        if suffix == 'sdf':
            mols = Chem.ForwardSDMolSupplier(inf)
            total = 2e6
        else:
            mols = pd.read_table(input).drop_duplicates(
                subset=['Smiles']).dropna(subset=['Smiles'])
            total = len(mols)
            mols = mols.iterrows()

        vals = {}
        exps = {}
        codes, ids = [], []
        chooser = rdMolStandardize.LargestFragmentChooser()
        disconnector = rdMolStandardize.MetalDisconnector()
        normalizer = rdMolStandardize.Normalizer()

        for mol in tqdm(mols, total=total):
            if mol is None:
                continue
            if suffix != 'sdf':
                # iterrows() yields (index, row) pairs.
                idx = mol[1]['Molecule ChEMBL ID']
                mol = Chem.MolFromSmiles(mol[1].Smiles)
            else:
                idx = mol.GetPropsAsDict()
                idx = idx['chembl_id']
            try:
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                mol = chooser.choose(mol)
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
            except Exception:
                # Best effort: report the id and carry on with whatever
                # state the molecule reached.
                print(idx)
            if mol is None:
                # Unparsable SMILES: nothing to inspect (the original
                # crashed here with AttributeError).
                continue

            symb = [a.GetSymbol() for a in mol.GetAtoms()]
            # Nr. of the bonds
            bonds = mol.GetBonds()
            if len(bonds) < 4 or len(bonds) >= 63:
                continue
            if {'C'}.isdisjoint(symb):
                continue
            if not metals.isdisjoint(symb):
                continue

            smile = Chem.MolToSmiles(mol)
            try:
                s0 = smile.replace('[O]', 'O').replace('[C]', 'C') \
                    .replace('[N]', 'N').replace('[B]', 'B') \
                    .replace('[2H]', '[H]').replace('[3H]', '[H]')
                s0 = Chem.CanonSmiles(s0, 0)
                code = voc.encode([smile])
                s1 = voc.decode(code)[0]
                # Explicit check instead of `assert`, which is removed
                # when Python runs with -O.
                if s0 != s1:
                    raise ValueError(
                        'round-trip mismatch: %s != %s' % (s0, s1))
                codes.append(code[0].reshape(-1).tolist())
                ids.append(idx)
            except Exception as ex:
                print(ex)
                print('Parse Error:', idx)
    finally:
        if inf is not None:
            inf.close()

    df = pd.DataFrame(codes, index=ids,
                      columns=['C%d' % i for i in range(64 * 4)])
    df.to_csv(output, sep='\t', index=True)
    print(vals)
    print(exps)
def make_canon_smiles(smiles):
    """Canonicalize every SMILES string in an iterable.

    Arguments:
        smiles: iterable of SMILES strings.

    Returns:
        list of canonical SMILES in input order, or ``None`` as soon as one
        entry cannot be canonicalized (the offending entry is printed).
    """
    canon_smiles = []
    for mol in smiles:
        try:
            canon_smiles.append(Chem.CanonSmiles(mol))
        except Exception:
            # `except Exception` keeps KeyboardInterrupt/SystemExit
            # propagating, unlike the previous bare `except`.
            print('Invalid SMILE:', mol)
            return None
    return canon_smiles
def __call__(self, molecule) -> "Molecule":
    """Score a molecule by its Tanimoto similarity to the target.

    The molecule's SMILES is canonicalized and parsed, a fingerprint is
    computed with ``self.get_fingerprint`` for ``self.fingerprint_type``,
    and its similarity to ``self.target_fingerprint`` is stored in
    ``molecule.fitness``.

    Returns:
        The same ``molecule`` object with ``fitness`` updated.  (The former
        ``-> None`` annotation contradicted this return.)
    """
    molecular_graph = Chem.MolFromSmiles(Chem.CanonSmiles(molecule.smiles))
    molecule_fingerprint = self.get_fingerprint(molecular_graph,
                                                self.fingerprint_type)
    molecule.fitness = TanimotoSimilarity(self.target_fingerprint,
                                          molecule_fingerprint)
    return molecule
def load_from_database(self) -> List[Molecule]:
    """Draw ``self.initial_size`` random molecules from the CSV data file.

    The file at ``self.data_file`` (resolved through hydra) is read, its
    ``smiles`` column is sampled, and each sampled SMILES is canonicalized
    and wrapped in a ``Molecule`` carrying the database pedigree.
    """
    csv_path = hydra.utils.to_absolute_path(self.data_file)
    table = pd.read_csv(csv_path)
    sampled = table['smiles'].sample(n=self.initial_size).tolist()

    pedigree = ("database", "no reaction", "no parent")
    molecules = []
    for smiles in sampled:
        molecules.append(Molecule(Chem.CanonSmiles(smiles), pedigree))
    return molecules
def __call__(self, molecule_pair):
    """Build offspring molecules from a pair of parents.

    ``self.merge`` proposes SMILES strings for the pair; those that RDKit
    parses are canonicalized and returned as ``Molecule`` objects whose
    pedigree names the crossover and both parents' SMILES.
    """
    first_parent, second_parent = molecule_pair[0], molecule_pair[1]
    pedigree = ("crossover", first_parent.smiles, second_parent.smiles)

    offspring = []
    for candidate in self.merge(molecule_pair):
        # Unparsable candidates (MolFromSmiles -> None) are dropped.
        if Chem.MolFromSmiles(candidate):
            offspring.append(Molecule(Chem.CanonSmiles(candidate), pedigree))
    return offspring
def substructure_split(ligand: Molecule, idx: Tuple[int, int], split: bool = True) -> Molecule:
    """Delete the hydrogen or mono-/polyatomic counterion attached to the functional group.

    When ``split=True`` the second indexed atom is deleted, the fragment that
    still contains the first indexed atom is kept, and that atom's charge is
    set to -1 if it has no (non-zero) charge yet.

    Parameters
    ----------
    ligand: |plams.Molecule|_
        The ligand molecule.  It is copied; the input is not modified here.

    idx : |tuple|_ [|int|_]
        A tuple with 2 atomic indices associated with a functional group.
        Each index is offset by one before indexing the molecule.

    split : bool
        If a functional group should be split from **ligand** (``True``) or not (``False``).

    Returns
    -------
    |plams.Molecule|_
        A copy of **ligand**, with part of its functional group removed (see **split**)
        and with its ``dummies``, ``anchor``, ``charge``, ``smiles``, ``name``
        and ``path`` properties updated.

    """
    lig = ligand.copy()
    # First and last entry of idx, shifted by one for molecule indexing.
    at1 = lig[idx[0] + 1]
    at2 = lig[idx[-1] + 1]

    if split:
        lig.delete_atom(at2)
        mol_list = lig.separate_mod()
        # Keep the first fragment that contains the anchor atom.
        for mol in mol_list:
            if at1 not in mol:
                continue
            lig = mol
            break

        # Check if the ligand heteroatom has a charge assigned, assigns a charge if not
        # NOTE(review): nested under ``if split`` to match the docstring -
        # confirm against the original indentation.
        if not at1.properties.charge:
            at1.properties.charge = -1

    # Update ligand properties
    lig.properties.dummies = at1
    # Anchor label: element symbol followed by the atom's 1-based position.
    lig.properties.anchor = at1.symbol + str(lig.atoms.index(at1) + 1)
    # Total charge is the sum of the per-atom charges (missing -> 0).
    lig.properties.charge = sum(
        atom.properties.get('charge', 0) for atom in lig)

    # Update the ligand smiles string
    rdmol = molkit.to_rdmol(lig)
    smiles = Chem.MolToSmiles(rdmol)
    lig.properties.smiles = Chem.CanonSmiles(smiles)
    lig.properties.name = santize_smiles(
        lig.properties.smiles) + '@' + lig.properties.anchor
    lig.properties.path = ligand.properties.path
    return lig
def get_canonical_smiles(mols):
    """Return canonical SMILES for the distinct molecules in ``mols``.

    Each molecule is written with ``Chem.MolToSmiles``; duplicates are
    removed and the remaining strings are canonicalized.  Strings that fail
    to canonicalize are printed and left out of the result.

    Arguments:
        mols: iterable of RDKit molecules.

    Returns:
        list of canonical SMILES (order follows set iteration order).
    """
    unique_smiles = set(Chem.MolToSmiles(m) for m in mols)
    canon_smiles = []
    for s in unique_smiles:
        try:
            canon_smiles.append(Chem.CanonSmiles(s))
        except Exception:
            # Narrowed from a bare `except` so interrupts still propagate.
            print(s)
    return canon_smiles
def test_step(self):
    # Walks the environment through add/remove actions at both ends and
    # checks the resulting SMILES after each step.

    def check_env_smiles(expected):
        # Compare the environment's current SMILES with `expected`, both
        # canonicalized first.
        smile = Chem.CanonSmiles(expected)
        m = Chem.CanonSmiles(self.env._listToSmiles())
        self.assertEqual(m, smile)

    def perform(kind, pos, mol):
        self.action.setAction(kind, pos=pos, mol=mol)
        self.env.step(self.action)

    def record(reference_smiles):
        # Keep a reference molecule plus a legend built from the current
        # environment state.
        mols.append(RWMol(Chem.MolFromSmiles(reference_smiles)))
        legends.append("3. " + self.env._listToSmiles())

    #test add-back
    check_env_smiles("CC")
    mols = [RWMol(Chem.MolFromSmiles("C"))]
    legends = ["1. C"]
    mols.append(RWMol(Chem.MolFromSmiles("CC")))
    legends.append("2. CC")

    #test add-front
    perform("add", "front", "C1=CC=CC=C1")
    check_env_smiles("CCC1=CC=CC=C1")
    record("CCC1=CC=CC=C1")

    #test remove-back
    perform("remove", "back", "C")
    check_env_smiles("C1=CC=CC=C1C")
    record("CC1=CC=CC=C1")

    #test remove-front
    perform("remove", "front", "C1=CC=CC=C1")
    check_env_smiles("C")
    record("C")

    #test current molecule
    previous = self.env.current_molecule
    perform("add", "front", "CC")
    self.assertNotEqual(self.env.current_molecule, previous)
    record("CCC")
def main():
    """Fragment a single SMILES and print its nodes, edges and attributes.

    Requires ``--smiles``; exits with status 1 when it is missing or cannot
    be parsed by RDKit.  ``--id`` labels the molecule (default
    ``smiles1``), ``--standardize`` runs ``standardize`` on it first,
    ``--isomeric`` / ``--non_isomeric`` set the ``NodeHolder`` flag, and
    ``-v`` / ``-vv`` set the verbosity forwarded to ``build_network``.
    """
    parser = argparse.ArgumentParser(
        description="Convert a SMILES to nodes, edges and attributes"
    )
    parser.add_argument("--smiles")
    parser.add_argument("--id")
    parser.add_argument("--standardize", action="store_true")
    parser.add_argument("--isomeric", dest="iso_flag", action="store_true")
    parser.add_argument("--non_isomeric", dest="iso_flag",
                        action="store_false")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-v", dest="verbosity", action="store_const", const=1)
    group.add_argument("-vv", dest="verbosity", action="store_const", const=2)
    parser.set_defaults(verbosity=0)
    parser.set_defaults(iso_flag=True)
    args = parser.parse_args()

    # Do we have a SMILES to work on?
    if not args.smiles:
        print('ERROR: Must specify a SMILES')
        sys.exit(1)

    attrs = []
    print("Original SMILES: " + args.smiles)
    mol = Chem.MolFromSmiles(args.smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; without
        # this guard the next RDKit call fails with an opaque error.
        print('ERROR: Could not parse SMILES (%s)' % args.smiles)
        sys.exit(1)

    if args.standardize:
        mol = standardize(mol)
        print("Standardized SMILES: " + Chem.MolToSmiles(mol))

    smiles = Chem.CanonSmiles(Chem.MolToSmiles(mol, isomericSmiles=True))
    print("Canonical SMILES: " + smiles)

    mol_id = args.id if args.id is not None else "smiles1"
    attrs.append(Attr(smiles, ["EM", mol_id]))

    # Build the network
    node_holder = NodeHolder(iso_flag=args.iso_flag)
    max_frags = 0
    node_holder = build_network(attrs, node_holder, max_frags, smiles,
                                args.verbosity)

    # Write the data out
    for node in node_holder.node_list:
        print(str(node))
    for edge in node_holder.get_edges():
        print(str(edge))
    for attr in attrs:
        print(str(attr))

    print("Number of nodes: " + str(len(node_holder.node_list)))
    print("Number of edges: " + str(len(node_holder.get_edges())))
def default_molecules():
    """Return the two built-in molecules as ``Molecule`` objects.

    Each hard-coded SMILES is canonicalized and paired with the database
    pedigree.
    """
    pedigree = ("database", "no reaction", "no parent")
    molecules = []
    for smiles in (
            "Clc1ccc(cc1)C(c2ccccc2)N3CCN(CC3)CCOCC(=O)O",
            "CC1=CC(Cl)=CC(C(=O)N[C@@H]2C[C@@H]3CCCC[C@@H]32)=C1C",
    ):
        molecules.append(Molecule(Chem.CanonSmiles(smiles), pedigree))
    return molecules
def test_examples(self):
    # Every deprotection that ships an "start>>end" example must turn
    # `start` into `end` when applied on its own.
    count = 0
    for data in rd.GetDeprotections():
        if not data.example:
            continue
        start, end = data.example.split(">>")
        m = Chem.MolFromSmiles(start)
        m2 = rd.Deprotect(m, [data])
        print("Testing", data.full_name)
        self.assertEqual(Chem.MolToSmiles(m2), Chem.CanonSmiles(end))
        count += 1
    # A unittest assertion is used instead of a bare `assert`, which is
    # removed under `python -O` and would let an empty run pass.
    self.assertGreater(count, 0)
def sanitize(smi):
    """Return a canonical isomeric SMILES for ``smi`` after sanitization.

    The SMILES is parsed without sanitization, sanitized with every
    operation except radical detection, written back out and canonicalized.
    Raises ``Exception`` when the SMILES cannot be parsed at all.
    """
    mol = Chem.MolFromSmiles(smi, sanitize=False)
    if mol is not None:
        # if we don't exclude SANITIZE_FINDRADICALS, then we get [C] in about 10% of generated mols
        sanitize_ops = (Chem.SanitizeFlags.SANITIZE_ALL
                        ^ Chem.SanitizeFlags.SANITIZE_FINDRADICALS)
        Chem.SanitizeMol(mol, sanitizeOps=sanitize_ops)
        # we somehow still get SMILES that cannot be later converted to a Mol, if we just call Chem.MolToSmiles, and
        # not follow it with a call to Chem.CanonSmiles
        isomeric = Chem.MolToSmiles(mol, isomericSmiles=True)
        return Chem.CanonSmiles(isomeric)
    raise Exception("could not convert SMILES to RDKit Mol: %s" % smi)
def main(args):
    """Write the canonical form of every SMILES line to the output file.

    Arguments:
        args: object with ``input_file`` (one SMILES per line) and
            ``output_file`` attributes.

    Lines that cannot be canonicalized are skipped.  Errors while writing
    are no longer swallowed.
    """
    with open(args.input_file, 'r') as f, open(args.output_file, 'w') as w:
        for l in tqdm(f):
            l = l.strip()
            try:
                canonical = Chem.CanonSmiles(l)
            except Exception:
                # Best effort: drop lines that cannot be canonicalized.
                continue
            w.write("{}\n".format(canonical))
def create_sd_file(name, smiles, save_directory):
    """
    Create a 2D sdf file in the proasis project directory for successfully detected ligands

    Arguments:
        name: label used only in the progress message.
        smiles: SMILES of the ligand; it is canonicalized before use.
        save_directory: path handed to ``Chem.SDWriter`` as the output file.
    """
    # create sdf file for ligand and save to hit directory
    canon_smiles = Chem.CanonSmiles(smiles)
    mol = Chem.MolFromSmiles(canon_smiles)
    AllChem.Compute2DCoords(mol)
    print('Generating sdf file and saving to ' + name + ' directory...\n')
    sd_file = Chem.SDWriter(save_directory)
    try:
        sd_file.write(mol)
    finally:
        # Closing flushes the record to disk; the writer was previously
        # left open for the garbage collector to deal with.
        sd_file.close()
def tanimoto_dist(x_test, autoencoded_selfies, dim, selfies_alphabet):
    '''
    Compute, per molecule, the fingerprint similarity between a test
    encoding and its autoencoded counterpart.

    Each encoding is reshaped to ``(dim[0], dim[1])``, snapped to a one-hot
    matrix by row-wise argmax, decoded SELFIES -> SMILES, canonicalized,
    and compared through RDKit fingerprints.

    Returns a numpy array with one similarity value per entry of ``x_test``
    (empty when ``x_test`` is empty).
    '''

    def to_canonical_smiles(flat_encoding):
        rows, cols = dim[0], dim[1]
        matrix = flat_encoding.reshape(rows, cols)
        one_hot = np.zeros((rows, cols))
        # argmax picks the first maximum per row, as the row loop did.
        one_hot[np.arange(rows), np.argmax(matrix, axis=1)] = 1
        selfies = sf.encoding_to_selfies(one_hot.tolist(), selfies_alphabet,
                                         "one_hot")
        return Chem.CanonSmiles(sf.decoder(selfies))

    dist = []
    for i, mol in enumerate(x_test):
        # single point - through vae
        autoencoded_smiles = to_canonical_smiles(autoencoded_selfies[i])
        # single point - non vae
        true_smiles = to_canonical_smiles(mol)

        fps_autoencoded = Chem.RDKFingerprint(
            Chem.MolFromSmiles(autoencoded_smiles))
        fps_true = Chem.RDKFingerprint(Chem.MolFromSmiles(true_smiles))
        similarity = DataStructs.FingerprintSimilarity(fps_autoencoded,
                                                       fps_true)
        dist.append(float(similarity))
    return np.array(dist)
def corpus(input: str, out: str, *, vocab_path: str):
    """Construct the molecular corpus by splitting each SMILES into tokens
    contained in the vocabulary.

    Arguments:
        input : the path of tab-delimited data file that contains CANONICAL_SMILES.
        out : path prefix for the outputs: ``out + '_voc.txt'`` receives the
            sorted tokens and ``out + '_corpus.txt'`` the table of
            CANONICAL_SMILES and whitespace-delimited token sentences.
        vocab_path : path passed to ``Voc`` to build the tokenizer.
    """
    df = pd.read_table(input).CANONICAL_SMILES
    voc = Voc(vocab_path)
    canons = []
    tokens = []
    smiles = set()
    it = tqdm(df, desc='Reading SMILES')
    for smile in it:
        # replacing the isotope-labelled atom with the unlabelled atom
        smile = SUB_RE.sub('[', smile)
        # keeping the largest one if the molecule contains more than one
        # fragment, which are separated by '.'.
        if '.' in smile:
            smile = max(smile.split('.'), key=len)
        # without at least two carbon characters it cannot be a drug-like
        # molecule, so it is removed
        if smile.count('C') + smile.count('c') < 2:
            continue
        if smile in smiles:
            it.write('duplicate: {}'.format(smile))
        smiles.add(smile)

    # collecting all of the tokens in the sentences for vocabulary construction.
    words = set()
    it = tqdm(smiles, desc='Collecting tokens')
    for smile in it:
        try:
            token = voc.tokenize(smile)
            if len(token) <= 100:
                words.update(token)
                canons.append(Chem.CanonSmiles(smile, 0))
                tokens.append(' '.join(token))
        except Exception as e:
            it.write('{} {}'.format(e, smile))

    # persisting the vocabulary on the hard drive.
    with open(out + '_voc.txt', 'w') as file:
        file.write('\n'.join(sorted(words)))

    # saving the canonical smiles and token sentences as a table into hard drive.
    log = pd.DataFrame({'CANONICAL_SMILES': canons, 'SENT': tokens},
                       columns=['CANONICAL_SMILES', 'SENT'])
    # drop_duplicates returns a new frame; the result was previously
    # discarded, so duplicate canonical SMILES reached the output.
    log = log.drop_duplicates(subset='CANONICAL_SMILES')
    log.to_csv(out + '_corpus.txt', sep='\t', index=False)
def fragment_molecule_on_explicit_hydrogens(smiles):
    """Return one ``Fragmentation`` per explicit-hydrogen bond match.

    Each bond matched by ``_hydrogen_cut_pat`` is cut in turn; the
    ``[*][H]`` piece is discarded and the remaining piece becomes the
    variable part of the fragmentation record.
    """
    num_heavies = get_num_heavies_from_smiles(smiles)
    # The result is not used below; the call is kept as in the original.
    smiles_with_H = Chem.CanonSmiles(smiles)
    # use santize=False to preserve explicit hydrogens
    input_mol = Chem.MolFromSmiles(smiles, sanitize=False)
    Chem.SanitizeMol(input_mol, Chem.SANITIZE_ALL)

    fragmentations = []
    for cut_pair in input_mol.GetSubstructMatches(_hydrogen_cut_pat):
        bond = input_mol.GetBondBetweenAtoms(*cut_pair)
        fragmented_mol = Chem.FragmentOnBonds(input_mol, [bond.GetIdx()],
                                              dummyLabels=[(0, 0)])
        new_smiles = Chem.MolToSmiles(fragmented_mol, isomericSmiles=True)

        left, mid, right = new_smiles.partition(".")
        assert mid == ".", new_smiles

        if left == "[*][H]":  # Hard-coded
            cut_smiles = right
        elif right == "[*][H]":
            cut_smiles = left
        else:
            raise AssertionError("did not split hydrogen correctly: %r %r" %
                                 (smiles, new_smiles))

        if "[H]" in cut_smiles:
            # If there were multiple [H] atoms, then we cut on one but others remain.
            # Recanonicalize to remove them.
            cut_smiles = Chem.CanonSmiles(cut_smiles)

        fragmentations.append(
            Fragmentation(1, EnumerationLabel.NO_ENUMERATION, 0, "1",
                          "[*][H]", "0", num_heavies, "1", cut_smiles, None))
    return fragmentations
def corpus(input, output, suffix='sdf'):
    """Build a SMILES corpus and vocabulary from a molecule file.

    Arguments:
        input (str): path of the molecule file: a gzip-compressed SD file
            when ``suffix == 'sdf'``, otherwise a tab-delimited table with
            a ``Smiles`` column.
        output (str): path prefix for the outputs: ``output + '_voc.txt'``
            receives the sorted tokens and ``output + '_corpus.txt'`` the
            table of SMILES and their token sentences.
        suffix (str): ``'sdf'`` (default) selects the SD-file reader, any
            other value selects the table reader.

    Molecules are standardized (metal disconnection, normalization, largest
    fragment, uncharging) before canonicalization.  Entries without carbon
    tokens, with ``[Na]``/``[Zn]`` tokens, or whose token count is not in
    ``(10, 100]`` are left out.
    """
    # Only the SD-file path needs the gzip stream; it must stay open while
    # the lazy supplier is iterated and is closed in `finally`.
    inf = gzip.open(input) if suffix == 'sdf' else None
    smiles = set()
    try:
        if suffix == 'sdf':
            mols = Chem.ForwardSDMolSupplier(inf)
        else:
            df = pd.read_table(input).Smiles.dropna()
            mols = [Chem.MolFromSmiles(s) for s in df]

        voc = Voc('data/voc_smiles.txt')
        charger = rdMolStandardize.Uncharger()
        chooser = rdMolStandardize.LargestFragmentChooser()
        disconnector = rdMolStandardize.MetalDisconnector()
        normalizer = rdMolStandardize.Normalizer()

        for mol in tqdm(mols):
            try:
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                mol = chooser.choose(mol)
                mol = charger.uncharge(mol)
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                smileR = Chem.MolToSmiles(mol, 0)
                smiles.add(Chem.CanonSmiles(smileR))
            except Exception:
                # Best effort: molecules that fail any step are skipped.
                print('Parsing Error:')
    finally:
        if inf is not None:
            inf.close()

    words = set()
    canons = []
    tokens = []
    for smile in tqdm(smiles):
        token = voc.split(smile) + ['EOS']
        if {'C', 'c'}.isdisjoint(token):
            print('Warning:', smile)
            continue
        if not {'[Na]', '[Zn]'}.isdisjoint(token):
            print('Redudent', smile)
            continue
        if 10 < len(token) <= 100:
            words.update(token)
            canons.append(smile)
            tokens.append(' '.join(token))

    with open(output + '_voc.txt', 'w') as voc_file:
        voc_file.write('\n'.join(sorted(words)))

    log = pd.DataFrame()
    log['Smiles'] = canons
    log['Token'] = tokens
    # drop_duplicates returns a new frame; keep the result.
    log = log.drop_duplicates(subset='Smiles')
    log.to_csv(output + '_corpus.txt', sep='\t', index=False)
def compare_equality(x_test, autoencoded_selfies, dim, selfies_alphabet):
    '''
    Print the fraction of test molecules whose autoencoded reconstruction
    canonicalizes to the same SMILES as the original encoding.

    For the entry at index 1 both SMILES strings are printed as a sample.
    '''

    def snap_to_one_hot(flat_encoding):
        # Row-wise argmax turned into a hard one-hot matrix.
        snapped = np.zeros((dim[0], dim[1]))
        for row_index, row in enumerate(
                flat_encoding.reshape(dim[0], dim[1])):
            snapped[row_index][np.argmax(row)] = 1
        return snapped

    test_size = len(x_test)
    count_good = 0
    for position, true_encoding in enumerate(x_test):
        # single point - through vae
        one_hot = snap_to_one_hot(autoencoded_selfies[position])
        # single point - non vae
        one_hot_true = snap_to_one_hot(true_encoding)

        self_test = sf.encoding_to_selfies(one_hot.tolist(),
                                           selfies_alphabet, "one_hot")
        self_true = sf.encoding_to_selfies(one_hot_true.tolist(),
                                           selfies_alphabet, "one_hot")

        canonical_smiles = Chem.CanonSmiles(sf.decoder(self_true))
        canonical_autoencoder_smiles = Chem.CanonSmiles(
            sf.decoder(self_test))

        if position == 1:
            print("Autoencoded Smiles: " + canonical_autoencoder_smiles)
            print("True Smiles: " + canonical_smiles)

        if canonical_autoencoder_smiles == canonical_smiles:
            count_good += 1

    print("Percent Reconstructed Molescules: " + str(count_good / test_size))
def parse_node(input_str):
    """
    Build a Node from a whitespace-separated record such as:
    NODE O=CCCc1ccc(cc1)c2ccccc2 16 12 OCCCC1CCC(CC1)C2CCCCC2 0

    Field 1 is canonicalized into ``SMILES``; fields 2-4 are stored
    unchanged (as strings) in ``HAC``, ``RAC`` and ``RING_SMILES``.
    :param input_str: the record text
    :return: the populated Node
    """
    fields = input_str.split()
    smiles = fields[1]
    new_node = Node()
    new_node.SMILES = Chem.CanonSmiles(smiles)
    new_node.HAC = fields[2]
    new_node.RAC = fields[3]
    new_node.RING_SMILES = fields[4]
    return new_node
def clean_mol(smile, is_deep=True):
    """Return a cleaned canonical SMILES, or ``None`` when parsing fails.

    Bracketed bare atoms ``[O]``, ``[C]``, ``[N]``, ``[B]`` are replaced by
    their unbracketed form and ``[2H]``/``[3H]`` by ``[H]``.  The result is
    parsed, optionally reduced to its charge parent (``is_deep``), written
    back out and canonicalized.

    Arguments:
        smile (str): input SMILES.
        is_deep (bool): apply ``rdMolStandardize.ChargeParent`` when true.
    """
    replacements = (
        ('[O]', 'O'), ('[C]', 'C'), ('[N]', 'N'), ('[B]', 'B'),
        ('[2H]', '[H]'), ('[3H]', '[H]'),
    )
    for old, new in replacements:
        smile = smile.replace(old, new)
    try:
        mol = Chem.MolFromSmiles(smile)
        if is_deep:
            mol = rdMolStandardize.ChargeParent(mol)
        smileR = Chem.MolToSmiles(mol, 0)
        smile = Chem.CanonSmiles(smileR)
    except Exception:
        # Narrowed from a bare `except` so KeyboardInterrupt/SystemExit
        # are not mistaken for parse failures.
        print('Parsing Error:', smile)
        smile = None
    return smile
def replace_wildcard_with_H(smiles):
    """Return the canonical SMILES with the single ``[*]`` replaced by ``[H]``.

    Results are memoized in ``_H_cache``; the cache is emptied once it holds
    more than 10000 entries.
    """
    # The cache gives about 2% overall performance improvement.
    # My tests suggest there's about a 50% cache hit.
    if smiles in _H_cache:
        return _H_cache[smiles]

    assert smiles.count("[*]") == 1, smiles
    new_smiles = Chem.CanonSmiles(smiles.replace("[*]", "[H]"))

    if len(_H_cache) > 10000:
        _H_cache.clear()
    _H_cache[smiles] = new_smiles
    return new_smiles
def post(self, request):
    """Run a substructure or similarity search for a posted structure.

    Reads ``smiles``, ``is_sub`` and ``tanimoto`` (default 0.8) from the
    POST data of an AJAX request.  A truthy ``is_sub`` selects the
    substructure search, otherwise the similarity search is run with the
    given threshold.  The first page (10 compounds per page) is rendered
    with ``result/compounds_result.html`` and returned.

    A structure that cannot be canonicalized, or is empty, yields a
    response carrying an apology message instead.
    """
    if not request.is_ajax():
        # NOTE(review): non-AJAX requests get no response, exactly as
        # before - Django rejects a view that returns None; confirm the
        # intended handling.
        return None

    data = request.POST
    is_sub = data.get("is_sub")
    tanimoto = float(data.get("tanimoto", 0.8))
    smiles = data.get("smiles")

    invalid_message = "Sorry, the structure is not appropriate!"
    try:
        smiles = Chem.CanonSmiles(smiles)
    except Exception:
        smiles = None
    if not smiles:
        # A view must return a response object rather than a bare string.
        # An empty structure used to fall through and return None.
        return StreamingHttpResponse([invalid_message])

    if is_sub:
        compounds = self._substructure_search(smiles)
    else:
        compounds = self._similarity_search(smiles, tanimoto=tanimoto)

    paginator = Paginator(compounds, 10)
    try:
        paginator_compounds = paginator.page(1)
    except EmptyPage:
        paginator_compounds = paginator.page(paginator.num_pages)

    response = render(
        request,
        template_name="result/compounds_result.html",
        context={
            "compounds": paginator_compounds,
        }
    )
    # Wrap the body in a list: iterating a bytes object directly yields
    # one integer per byte on Python 3, which would not stream the
    # rendered page as intended.
    return StreamingHttpResponse([response.content])