def time_roundtrip(file_path: str, sample_size: int = -1):
    """Measures the time taken to encode and then decode an entire .txt file
    of SMILES strings. If <sample_size> is positive, a random sample of that
    size is drawn from the file instead.
    """
    curr_dir = os.path.dirname(__file__)
    file_path = os.path.join(curr_dir, file_path)

    # load data (dropping the header line)
    with open(file_path, 'r') as file:
        smiles = [line.rstrip() for line in file.readlines()]
    smiles.pop(0)

    if sample_size > 0:
        smiles = random.sample(smiles, sample_size)
    selfies = list(map(sf.encoder, smiles))

    print(f"Timing {len(smiles)} SMILES from {file_path}")

    # time sf.encoder
    start = time.time()
    for s in smiles:
        sf.encoder(s)
    enc_time = time.time() - start
    print(f"--> selfies.encoder: {enc_time:0.7f}s")

    # time sf.decoder
    start = time.time()
    for s in selfies:
        sf.decoder(s)
    dec_time = time.time() - start
    print(f"--> selfies.decoder: {dec_time:0.7f}s")
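# Example invocation of time_roundtrip (a sketch; the dataset path and sample
# size are placeholders, not from the source -- the file is expected to have a
# header line followed by one SMILES per line):
time_roundtrip("test_sets/dataset.txt", sample_size=1000)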
def test_standardized_alphabet():
    """Tests that equivalent SMILES atom symbols are translated into the
    same SELFIES atom symbol.
    """
    assert sf.encoder("[C][O][N][P][F]") == "[CH0][OH0][NH0][PH0][FH0]"
    assert sf.encoder("[Fe][Si]") == "[Fe][Si]"
    assert sf.encoder("[Fe++][Fe+2]") == "[Fe+2][Fe+2]"
    assert sf.encoder("[CH][CH1]") == "[CH1][CH1]"
def get_selfie_and_smiles_encodings_for_dataset(filename_data_set_file_smiles):
    """Returns the encodings, alphabets and lengths of the largest molecule in
    SMILES and SELFIES, given a file containing SMILES molecules.

    input: CSV file with molecules. The column must be named 'smiles'.
    output:
        - selfies encoding
        - selfies alphabet
        - longest selfies string
        - smiles encoding (equivalent to file content)
        - smiles alphabet (character based)
        - longest smiles string
    """
    df = pd.read_csv(filename_data_set_file_smiles)
    smiles_list = np.asanyarray(df.smiles)
    smiles_alphabet = list(set(''.join(smiles_list)))
    largest_smiles_len = len(max(smiles_list, key=len))

    selfies_list = []
    selfies_len = []
    print('--> Translating SMILES to SELFIES...')
    for individual_smiles in smiles_list:
        individual_selfies = selfies.encoder(individual_smiles)
        selfies_list.append(individual_selfies)
        # the symbol length of a SELFIES equals its number of '[' characters
        selfies_len.append(individual_selfies.count('['))

    selfies_alphabet_pre = list(set(''.join(selfies_list)[1:-1].split('][')))
    selfies_alphabet = ['[' + symbol + ']' for symbol in selfies_alphabet_pre]
    largest_selfies_len = max(selfies_len)

    print('Finished translating SMILES to SELFIES.')
    return (selfies_list, selfies_alphabet, largest_selfies_len,
            smiles_list, smiles_alphabet, largest_smiles_len)
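# The alphabet and length bookkeeping above can also lean on library helpers
# instead of manual string counting. A minimal sketch, assuming selfies >= 1.0
# (where get_alphabet_from_selfies and len_selfies exist); note the returned
# tokens keep their brackets, e.g. '[C]', '[Ring1]':
import selfies as sf

def selfies_alphabet_and_max_len(smiles_list):
    selfies_list = [sf.encoder(s) for s in smiles_list]
    alphabet = list(sf.get_alphabet_from_selfies(selfies_list))
    max_len = max(sf.len_selfies(s) for s in selfies_list)
    return alphabet, max_len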
def selfies(dir="../data/xyz/"):
    temp = xyz_to_smiles(dir)
    ret = []
    for i in temp:
        selfies_temp = encoder(i)
        ret.append(selfies_temp)
    ret = np.array(ret)
    return ret
def mol2string(mol):
    smiles = Chem.MolToSmiles(mol)

    if string_type == 'selfies':
        return encoder(smiles).split('][')

    if string_type == 'deepsmiles':
        string = converter.encode(smiles)
        return list(string)

    return list(smiles)
def mol2string(mol):
    smiles = Chem.MolToSmiles(mol)

    if string_type == 'SELFIES':
        return encoder(smiles).split('][')

    if string_type == 'DeepSMILES':
        string = converter.encode(smiles)
        return list(string)

    return list(smiles)
def generate_ops(smi):
    smi = smi.strip()
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        print(f"None value: {smi}\n")
        return []

    scaffold = Chem.Scaffolds.MurckoScaffold.GetScaffoldForMol(mol)
    frags = sg.get_next_murcko_fragments(scaffold)

    smi_selfies = selfies.encoder(smi)
    scaffold_selfies = selfies.encoder(Chem.MolToSmiles(scaffold))

    data = [('SCAFFOLD', smi_selfies, scaffold_selfies),
            ('EXPAND', scaffold_selfies, smi_selfies)]

    for frag in frags:
        frag_selfies = selfies.encoder(Chem.MolToSmiles(frag))
        data.append(('LOWER', scaffold_selfies, frag_selfies))
        data.append(('UPPER', frag_selfies, scaffold_selfies))

    return data
def __init__(self, smiles_file, percentage, vocab):
    """
    smiles_file: path to the .smi file containing SMILES.
    percentage: percentage of the dataset to use.
    """
    super(SMILESDataset, self).__init__()
    assert 0 < percentage <= 1

    self.percentage = percentage
    self.vocab = vocab

    # load an equal portion of data from each tranche
    self.data = self.read_smiles_file(smiles_file)
    print("total number of SMILES loaded: ", len(self.data))

    # convert the SMILES to SELFIES (encoding each string only once)
    if self.vocab.name == "selfies":
        encoded = [sf.encoder(x) for x in self.data]
        self.data = [x for x in encoded if x is not None]
        print("total number of valid SELFIES: ", len(self.data))
def mol2string(mol):
    Chem.Kekulize(mol, clearAromaticFlags=True)
    smiles = Chem.MolToSmiles(mol, canonical=False)

    if string_type == 'selfies':
        return encoder(smiles).split('][')

    if string_type == 'deepsmiles':
        string = converter.encode(smiles)
        return list(string)

    return list(smiles)
def create_annotations(input_file, output_file, idx2selfies, selfies2idx, max_length):
    with open(input_file, 'r') as f:
        for l in f:
            l = l.strip().split('\t')
            if len(l) > 1:
                smiles = l[0]
                img = l[1]
                try:
                    # split the SELFIES into its bracketed tokens
                    selfies = [i + ']' for i in sf.encoder(smiles).split(']')[:-1]]
                except Exception:
                    pass
def test_invalid_or_unsupported_smiles_encoder():
    malformed_smiles = [
        "",
        "(",
        "C(Cl)(Cl)CC[13C",
        "C(CCCOC",
        "C=(CCOC",
        "CCCC)",
        "C1CCCCC",
        "C(F)(F)(F)(F)(F)F",  # violates bond constraints
        "C=C1=CCCCCC1",  # violates bond constraints
        "CC*CC",  # uses wildcard
        "C$C",  # uses $ bond
        "S[As@TB1](F)(Cl)(Br)N",  # unrecognized chirality
        "SOMETHINGWRONGHERE",
        "1243124124",
    ]

    for smiles in malformed_smiles:
        with pytest.raises(sf.EncoderError):
            sf.encoder(smiles)
def selfies_scanner(*, parent_smiles: str):
    # get the parent mol set up properly with defined aromaticity
    parent_mol = Chem.MolFromSmiles(parent_smiles, sanitize=True)
    Chem.rdmolops.Kekulize(parent_mol)
    parent_smiles = Chem.MolToSmiles(parent_mol, isomericSmiles=True, kekuleSmiles=True)
    logger.info(f"Generating children from: {parent_smiles}")

    children = []  # finished children
    spawns = 0  # counter for children produced
    parent_selfies = encoder(parent_smiles)
    # get the SELFIES symbols into a list
    symbols = re.findall(r"[^[]*\[([^]]*)\]", parent_selfies)

    for i, symb in enumerate(symbols):
        if not (symb == "epsilon" or "Branch" in symb or "Ring" in symb):
            if symb in ALLOWED_SUBS:  # if we have rules for how to handle this symbol
                for replacement in ALLOWED_SUBS[symb]:
                    mut_symbols = symbols.copy()  # don't manipulate the original
                    mut_symbols[i] = replacement

                    child_symbols = [f"[{symb}]" for symb in mut_symbols]
                    child = "".join(child_symbols)
                    child_smiles = decoder(child)  # get the smiles

                    # test that the smiles is valid
                    try:
                        # same as parent, have to have explicit aromaticity
                        child_mol = Chem.MolFromSmiles(child_smiles, sanitize=True)
                        Chem.rdmolops.Kekulize(child_mol)
                        child_smiles = Chem.MolToSmiles(child_mol,
                                                        isomericSmiles=True,
                                                        kekuleSmiles=True)
                        assert child_mol  # if MolFromSmiles fails, it will be None
                        if child_smiles == parent_smiles:
                            # ignore this child if it's the same as the parent
                            continue
                    except Exception:  # pylint: disable=broad-except
                        logger.warning("Produced improper SELFIES. Ignoring and trying again. Details below:")
                        logger.warning(f"Child SELFIES: {child}")
                        logger.warning(f"Parent SELFIES: {parent_selfies}")
                        logger.warning(f"Child SMILES: {child_smiles}")
                        logger.warning(f"Parent SMILES: {parent_smiles}")
                        continue

                    # Every good child deserves fudge
                    children.append(child_smiles)
                    spawns += 1  # update our counter

    return children
def iterate_dataframe(dataset: pd.DataFrame) -> Iterable[Sentence]:
    for _, row in dataset.iterrows():
        res = encoder(row.smiles)
        if not res:
            continue
        res = res.replace("]", "] ").replace(".", "DOT ")
        sent = Sentence(res.strip(), use_tokenizer=plain_tokenizer)
        for col, val in row.items():
            if isinstance(val, float):
                if val == 1.0:
                    sent.add_label(None, col.replace(" ", "_") + "_P ")
                if val == 0.0:
                    sent.add_label(None, col.replace(" ", "_") + "_N ")
        yield sent
def test_encoder_attribution():
    smiles = "C1([O-])C=CC=C1Cl"
    indices = [0, 3, 3, 3, 5, 7, 8, 10, None, None, 12]
    s, am = sf.encoder(smiles, attribute=True)

    # check that Cl lined up
    for i, ta in enumerate(am):
        if ta[1]:
            assert indices[i] == ta[1][0][0], \
                f'found {ta[1]}; should be {indices[i]}'
        if ta[0] == '[Cl]':
            for i, v in ta[1]:
                if v == 'Cl':
                    return
    raise ValueError('Failed to find Cl in attribution map')
def test_roundtrip_translation(test_path, dataset_samples):
    """Tests SMILES -> SELFIES -> SMILES translation on various datasets."""

    # very relaxed constraints
    constraints = sf.get_preset_constraints("hypervalent")
    constraints.update({"P": 7, "P-1": 8, "P+1": 6, "?": 12})
    sf.set_semantic_constraints(constraints)

    error_path = ERROR_LOG_DIR / "{}.csv".format(test_path.stem)
    with open(error_path, "w+") as error_log:
        error_log.write("In, Out\n")
    error_data = []
    error_found = False

    n_lines = sum(1 for _ in open(test_path)) - 1
    n_keep = dataset_samples if (0 < dataset_samples <= n_lines) else n_lines
    skip = random.sample(range(1, n_lines + 1), n_lines - n_keep)
    reader = pd.read_csv(test_path, chunksize=10000, header=0, skiprows=skip)

    for chunk in reader:
        for in_smiles in chunk["smiles"]:
            in_smiles = in_smiles.strip()

            mol = Chem.MolFromSmiles(in_smiles, sanitize=True)
            if (mol is None) or ("*" in in_smiles):
                continue

            try:
                selfies = sf.encoder(in_smiles, strict=True)
                out_smiles = sf.decoder(selfies)
            except (sf.EncoderError, sf.DecoderError):
                error_data.append((in_smiles, ""))
                continue

            if not is_same_mol(in_smiles, out_smiles):
                error_data.append((in_smiles, out_smiles))

        with open(error_path, "a") as error_log:
            for entry in error_data:
                error_log.write(",".join(entry) + "\n")
        error_found = error_found or error_data
        error_data = []

    sf.set_semantic_constraints()  # restore constraints

    assert not error_found
def smiles2string(smiles):
    if string_type == 'smiles':
        string = smiles

    if string_type == 'selfies':
        try:
            string = encoder(smiles, PrintErrorMessage=False)
        except Exception:
            return None

    if string_type == 'deepsmiles':
        try:
            string = converter.encode(smiles)
        except deepsmiles.DecodeError:
            return None

    return string
def batch_mode(input_file, output_file, model_size):
    outfile = open(output_file, "w")
    with open(input_file, "r") as f:
        for i, line in enumerate(f):
            smiles_string = line.strip()
            canonical_smiles = subprocess.check_output([
                'java', '-cp', 'Java_dependencies/cdk-2.1.1.jar:.',
                'SMILEStoCanonicalSMILES', smiles_string
            ])
            iupac_name = translate(
                selfies.encoder(canonical_smiles.decode('utf-8').strip())
                .replace("][", "] ["),
                model_size)
            outfile.write(iupac_name.replace(" ", "").replace("<end>", "") + "\n")
    outfile.close()
    return output_file
def to_selfies(mol: Union[str, Chem.rdchem.Mol]) -> Optional[str]:
    """Convert a mol to SELFIES.

    Args:
        mol: a molecule or a SMILES.

    Returns:
        selfies: SELFIES string.
    """
    if mol is None:
        return None

    if isinstance(mol, Chem.rdchem.Mol):
        mol = to_smiles(mol)

    selfies = sf.encoder(mol)  # type: ignore

    if selfies == -1:
        return None

    return selfies
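# Example usage of to_selfies (a sketch; the inputs are illustrative and
# to_smiles must be in scope):
from rdkit import Chem

assert to_selfies("CCO") == "[C][C][O]"
print(to_selfies(Chem.MolFromSmiles("c1ccccc1")))
# -> '[C][=C][C][=C][C][=C][Ring1][=Branch1]' with selfies 2.x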
def roundtrip_translation():
    sf.set_semantic_constraints("hypervalent")

    n_entries = 0
    for chunk in make_reader():
        n_entries += len(chunk)
    pbar = tqdm(total=n_entries)

    reader = make_reader()
    error_log = open(ERROR_LOG_DIR / f"{TEST_SET_PATH.stem}.txt", "a+")

    curr_idx = 0
    for chunk_idx, chunk in enumerate(reader):
        for in_smiles in chunk[args.col_name]:
            pbar.update(1)
            curr_idx += 1
            if curr_idx < args.start_from:
                continue

            in_smiles = in_smiles.strip()
            mol = Chem.MolFromSmiles(in_smiles, sanitize=True)
            if (mol is None) or ("*" in in_smiles):
                continue

            try:
                selfies = sf.encoder(in_smiles, strict=True)
                out_smiles = sf.decoder(selfies)
            except (sf.EncoderError, sf.DecoderError):
                error_log.write(in_smiles + "\n")
                tqdm.write(in_smiles)
                continue

            if not is_same_mol(in_smiles, out_smiles):
                error_log.write(in_smiles + "\n")
                tqdm.write(in_smiles)

    error_log.close()
def create_tokenized_smiles_json(tokenizer, data_dir, split, config_output_name,
                                 max_length, label_filename, idx2selfies, selfies2idx):
    data = {"images": []}
    start_token_id = tokenizer.encode('<start>').ids[0]
    end_token_id = tokenizer.encode('<end>').ids[0]
    pad_token_id = tokenizer.get_vocab_size()
    print("The <start> token id is: {}\nThe <end> token id is: {}\nThe <pad> token id is: {}".format(
        start_token_id, end_token_id, pad_token_id))

    with open(os.path.join(data_dir, label_filename), "r") as f:
        for i, l in enumerate(tqdm(f)):
            try:
                smiles, idx = l.strip().split("\t")
                encoding = tokenizer.encode(smiles)

                selfies = sf.encoder(smiles)
                if selfies is None:
                    selfies = ''
                selfies_cap = [selfies2idx['[start]']] + \
                    [selfies2idx[tok] for tok in sf.split_selfies(selfies)]
                if len(selfies_cap) > max_length - 1:
                    selfies_cap = selfies_cap[:max_length - 1]
                selfies_cap = selfies_cap + [selfies2idx['[end]']]
                selfies_len = len(selfies_cap)
                selfies_len_orig = selfies_len
                while selfies_len < max_length:
                    selfies_cap = selfies_cap + [selfies2idx['[pad]']]
                    selfies_len += 1

                encodingids = encoding.ids
                # add <start> token and shorten to 150
                encodingids = [start_token_id] + encodingids[:-1]
                cap_len = max_length
                for j in range(0, len(encodingids)):
                    if encodingids[j] == pad_token_id:
                        cap_len = j + 1
                        encodingids[j] = end_token_id
                        break

                current_sample = {
                    "filepath": data_dir,
                    "filename": "{}".format(idx),
                    "imgid": 0,
                    "split": split,
                    "sentences": [{
                        "tokens": encoding.tokens,
                        "raw": smiles,
                        "ids": encodingids,
                        "length": cap_len,
                        "selfies_raw": selfies,
                        "selfies_ids": selfies_cap,
                        "selfies_length": selfies_len_orig,
                    }]
                }
                # note: if image augmentation ever happens, a sentence id token
                # needs to be introduced; see the MS-COCO JSON for an example
                data["images"].append(current_sample)
            except Exception:
                pass

    pickle.dump(data, open(os.path.join(data_dir, config_output_name), 'wb'))
    del data
def __getitem__(self, idx):
    # Returns a tuple.
    # SMILES has to be in the first column of the csv !!
    row = self.df.iloc[idx, :]
    smiles = row.smiles  # needed anyway to build the graph

    m = Chem.MolFromSmiles(smiles)
    if m is None:  # check validity before kekulizing / graph building
        return None, 0, 0, 0

    if self.compute_selfies:
        Chem.Kekulize(m)
        k = Chem.MolToSmiles(m, isomericSmiles=False, kekuleSmiles=True)
        selfie = encoder(k)
        """
        if selfie != row.selfies:
            print('new selfie:', selfie)
            print('prev : ', row.selfies)
        """
    else:
        selfie = row.selfies

    # 1 - Graph building
    graph = smiles_to_nx(smiles)

    one_hot = {
        edge: torch.tensor(self.edge_map[label])
        for edge, label in (nx.get_edge_attributes(graph, 'bond_type')).items()
    }
    nx.set_edge_attributes(graph, name='one_hot', values=one_hot)

    try:
        at_type = {
            a: oh_tensor(self.at_map[label], self.num_atom_types)
            for a, label in (nx.get_node_attributes(graph, 'atomic_num')).items()
        }
        nx.set_node_attributes(graph, name='atomic_num', values=at_type)
    except KeyError:
        print('!!!! Atom type to one-hot error for input ', smiles, ' ignored')
        return None, 0, 0, 0

    at_charge = {
        a: oh_tensor(self.charges_map[label], self.num_charges)
        for a, label in (nx.get_node_attributes(graph, 'formal_charge')).items()
    }
    nx.set_node_attributes(graph, name='formal_charge', values=at_charge)

    try:
        hydrogens = {
            a: torch.tensor(self.chi_map[label], dtype=torch.float)
            for a, label in (nx.get_node_attributes(graph, 'num_explicit_hs')).items()
        }
        nx.set_node_attributes(graph, name='num_explicit_hs', values=hydrogens)
    except KeyError:
        print('!!!! Number of explicit hydrogens to one-hot error for input ',
              smiles, ' ignored')
        return None, 0, 0, 0

    aromatic = {
        a: torch.tensor(self.chi_map[label], dtype=torch.float)
        for a, label in (nx.get_node_attributes(graph, 'is_aromatic')).items()
    }
    nx.set_node_attributes(graph, name='is_aromatic', values=aromatic)

    at_chir = {
        a: torch.tensor(self.chi_map[label], dtype=torch.float)
        for a, label in (nx.get_node_attributes(graph, 'chiral_tag')).items()
    }
    nx.set_node_attributes(graph, name='chiral_tag', values=at_chir)

    # to dgl
    g_dgl = dgl.DGLGraph()
    node_features = ['atomic_num', 'formal_charge', 'num_explicit_hs',
                     'is_aromatic', 'chiral_tag']
    g_dgl.from_networkx(nx_graph=graph, node_attrs=node_features,
                        edge_attrs=['one_hot'])
    N = g_dgl.number_of_nodes()
    g_dgl.ndata['h'] = torch.cat(
        [g_dgl.ndata[f].view(N, -1) for f in node_features], dim=1)

    if self.graph_only:  # give only the graph (to encode in latent space)
        return g_dgl, 0, 0, 0

    # 2 - SMILES / SELFIES to integer indices array
    if self.language == 'selfies':
        a, valid_flag = self.selfies_to_hot(selfie)
        if valid_flag == 0:  # no one-hot encoding for this selfie, ignore
            print('!!! Selfie to one-hot failed with current alphabet')
            return None, 0, 0, 0
    else:
        a = np.zeros(self.max_len)
        idces = [self.char_to_index[c] for c in smiles]
        a[:len(idces)] = idces

    # 3 - Optional props and affinities
    props, targets = 0, 0
    if len(self.props) > 0:
        props = np.array(row[self.props], dtype=np.float32)

    if len(self.targets) > 0 and self.binned_scores:
        targets = np.array(row[self.targets], dtype=np.int64)  # for torch.long class labels
    elif len(self.targets) > 0:
        targets = np.array(row[self.targets], dtype=np.float32)  # for torch.float values

    targets = np.nan_to_num(targets)  # if nan somewhere, change to 0.

    return g_dgl, a, props, targets
def test_selfies_split(self) -> None:
    """Test tokenization by selfies package has not changed."""
    benzene = 'c1ccccc1'
    encoded_selfies = sf.encoder(benzene)
    # '[c][c][c][c][c][c][Ring1][Branch1_1]' v0.2.4
    # '[C][=C][C][=C][C][=C][Ring1][Branch1_2]' v1.0.2 (no aromatic)
    # sf.split_selfies returns a generator
    symbols_benzene = list(sf.split_selfies(encoded_selfies))
    # before selfies 2.0.0 the last token is [Branch1_2]
    self.assertListEqual(
        symbols_benzene,
        ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[=Branch1]'],
    )

    for smiles, ground_truth in [
        (
            'c1cnoc1',
            # before selfies 2.0.0 the last 2 tokens are [Expl=Ring1] and [Branch1_1]
            ['[C]', '[C]', '[=N]', '[O]', '[C]', '[=Ring1]', '[Branch1]'],
        ),
        (
            '[O-][n+]1ccccc1S',
            # before selfies 2.0.0 it is: [O-expl], [N+expl] and [=Branch1_2]
            [
                '[O-1]', '[N+1]', '[=C]', '[C]', '[=C]', '[C]', '[=C]',
                '[Ring1]', '[=Branch1]', '[S]',
            ],
        ),
        (
            'c1snnc1-c1ccccn1',
            # before selfies 2.0.0 it is: [Expl=Ring1], [Branch1_1] and [Branch1_2]
            [
                '[C]', '[S]', '[N]', '[=N]', '[C]', '[=Ring1]', '[Branch1]',
                '[C]', '[=C]', '[C]', '[=C]', '[C]', '[=N]', '[Ring1]',
                '[=Branch1]',
            ],
        ),
    ]:
        self.assertListEqual(
            split_selfies(sf.encoder(smiles)),  # list-wrapping version
            ground_truth,
        )
'''
Written by Jan H. Jensen 2019
'''
from selfies import encoder, decoder
import pandas as pd
from rdkit import Chem

pd.set_option('max_colwidth', 200)

df = pd.read_csv('ZINC_250k.smi', sep=" ", header=None)
df.columns = ["smiles"]

rows = 1000
symbols_list = []
for index, row in df.iterrows():
    smiles = row['smiles']
    mol = Chem.MolFromSmiles(smiles)
    Chem.Kekulize(mol, clearAromaticFlags=True)
    smiles = Chem.MolToSmiles(mol)
    symbols = encoder(smiles).split('][')
    for symbol in symbols:
        symbol = symbol.replace(']', '').replace('[', '')
        if symbol not in symbols_list:
            symbols_list.append(symbol)

print(symbols_list)
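# The alphabet scan above can also be done with a library helper. A minimal
# sketch, assuming selfies >= 1.0 (where get_alphabet_from_selfies exists);
# note the returned tokens keep their brackets, unlike the stripped symbols above:
import selfies as sf

selfies_strings = [sf.encoder(smi) for smi in df['smiles']]
alphabet = sf.get_alphabet_from_selfies(selfies_strings)  # a set of tokens like '[C]'
print(sorted(alphabet))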
def selfies_substitution(*,
                         parent_smiles: str,
                         n_children: int = 100,
                         mut_rate: float = 0.03,
                         mut_min: int = 1,
                         mut_max: int = 2,
                         max_trials: int = 100):
    """
    Takes a parent molecule and generates a number of children derived from it,
    by converting the parent into SELFIES format and substituting symbols for
    other symbols, based on primitive chemical rules.

    :param parent_smiles: The SMILES of the parent molecule.
    :param n_children: How many children to produce.
    :param mut_rate: How frequent mutations should be, from 0.0 to 1.0.
    :param mut_min: The minimum number of mutations to allow in a given child,
        relative to the parent.
    :param mut_max: Same as above, but the maximum.
    :param max_trials: Number of attempts to create valid mutations before
        moving on; useful for pathological SELFIES.
    :return:
    """
    # get the parent mol set up properly with defined aromaticity
    parent_mol = Chem.MolFromSmiles(parent_smiles, sanitize=True)
    Chem.rdmolops.Kekulize(parent_mol)
    parent_smiles = Chem.MolToSmiles(parent_mol, isomericSmiles=True, kekuleSmiles=True)
    logger.info(f"Generating children from: {parent_smiles}")

    children = []  # finished children
    spawns = 0  # counter for children produced
    parent_selfies = encoder(parent_smiles)
    # get the SELFIES symbols into a list
    symbols = re.findall(r"[^[]*\[([^]]*)\]", parent_selfies)

    while spawns < n_children:  # try to produce the correct number of children
        muts = 0
        mutations = []  # which parts of the SELFIES to mutate
        mut_symbols = symbols.copy()  # don't manipulate the original
        mut_positions = list(range(len(symbols)))  # need the index
        t = 0
        while (muts < mut_min) and (t <= max_trials):
            random.shuffle(mut_positions)  # shuffle the order so that mutations will be random
            for pos in mut_positions:  # try to mutate
                if pos not in mutations:
                    if random.random() <= mut_rate:
                        # ignore special symbols, leave them alone
                        if not (symbols[pos] == "epsilon"
                                or "Branch" in symbols[pos]
                                or "Ring" in symbols[pos]):
                            if symbols[pos] in ALLOWED_SUBS:  # if we have rules for this symbol
                                mutations.append(pos)  # record which symbol this is
                                muts += 1  # record the intention to mutate
                                if muts == mut_max:  # when we're done, stop looking
                                    break
            t += 1
        if t > max_trials:
            logger.warning(f"Failed to produce any selfies after {max_trials} trials. "
                           "Returning empty list.")
            return list()

        # for each planned mutation, actually execute it,
        # on the non-shuffled original SELFIES list
        for index in sorted(mutations, reverse=True):
            mut_symbols[index] = random.choice(ALLOWED_SUBS[mut_symbols[index]])

        # convert the new child into a SELFIES (rather than a list)
        child_symbols = [f"[{symb}]" for symb in mut_symbols]
        child = "".join(child_symbols)
        child_smiles = decoder(child)  # get the smiles

        # test that the smiles is valid
        try:
            # same as parent, have to have explicit aromaticity
            child_mol = Chem.MolFromSmiles(child_smiles, sanitize=True)
            Chem.rdmolops.Kekulize(child_mol)
            child_smiles = Chem.MolToSmiles(child_mol, isomericSmiles=True, kekuleSmiles=True)
            assert child_mol  # if MolFromSmiles fails, it will be None
            if child_smiles == parent_smiles:
                # ignore this child if it's the same as the parent
                continue
        except Exception:  # pylint: disable=broad-except
            logger.warning("Produced improper SELFIES. Ignoring and trying again. Details below:")
            logger.warning(f"Child SELFIES: {child}")
            logger.warning(f"Parent SELFIES: {parent_selfies}")
            logger.warning(f"Child SMILES: {child_smiles}")
            logger.warning(f"Parent SMILES: {parent_smiles}")
            continue

        # Every good child deserves fudge
        children.append(child_smiles)
        spawns += 1  # update our counter

    return children
num_iterations = 1
results_dir = du.make_clean_results_dir()

total_time = time.time()
for i in range(num_iterations):
    for beta in beta_params:
        max_fitness_collector = []
        image_dir, saved_models_dir, data_dir = du.make_clean_directories(beta, results_dir, i)
        torch.cuda.empty_cache()
        writer = SummaryWriter()

        smiles_all_counter = train(
            num_generations=100,
            generation_size=500,
            starting_selfies=[encoder('C')],
            max_molecules_len=81,
            disc_epochs_per_generation=10,
            disc_enc_type='properties_rdkit',
            disc_layers=[100, 10],
            training_start_gen=0,
            device='cuda',
            properties_calc_ls=['logP', 'SAS', 'RingP', 'QED'],
            num_processors=multiprocessing.cpu_count(),
            beta=beta,
            max_fitness_collector=max_fitness_collector,
            impose_time_adapted_pen=True,
        )

print('Total time: ', (time.time() - total_time) / 60, ' mins')
            'VAE_dependencies/Saved_models/VAE_decode_epoch_{}'.format(
                settings['training_VAE']['num_epochs']))

        # plot epoch vs reconstruction loss / quality
        print(recons_quality_valid, recons_quality_train, recons_loss)
        if settings['plot']['plot_quality']:
            line1, = plt.plot(recons_quality_valid, label='Validation set')
            line2, = plt.plot(recons_quality_train, label='Training set')
            plt.xlabel('Epochs')
            plt.ylabel('Reconstruction Quality (%)')
            plt.legend(handles=[line1, line2])
            plt.show()
        if settings['plot']['plot_loss']:
            plt.plot(recons_loss)
            plt.xlabel('Epochs')
            plt.ylabel('Reconstruction Loss')
            plt.show()
    else:
        print('Linear interpolation of 10 steps between ' + test_mol1 +
              ' and ' + test_mol2 + ':')
        mol_inter = linear_interpolation(encoder(test_mol1), encoder(test_mol2), 10)
        print(mol_inter)

    with open('COMPLETED', 'w') as content:
        content.write('exit code: 0')

except AttributeError:
    _, error_message, _ = sys.exc_info()
    print(error_message)
def test_roundtrip_translation(test_name, column_name, dataset_samples):
    """Tests a roundtrip SMILES -> SELFIES -> SMILES translation of the
    SMILES examples in QM9, NonFullerene, Zinc, etc.
    """
    # modify semantic bond constraints
    sf.set_semantic_constraints({
        'H': 1, 'F': 1, 'Cl': 1, 'Br': 1, 'I': 1,
        'O': 2, 'O+1': 3, 'O-1': 1,
        'N': 6, 'N+1': 4, 'N-1': 2,
        'C': 4, 'C+1': 5, 'C-1': 3,
        'S': 6, 'S+1': 7, 'S-1': 5,
        'P': 7, 'P+1': 8, 'P-1': 6,
        '?': 8,
    })

    # file I/O
    curr_dir = os.path.dirname(__file__)
    test_path = os.path.join(curr_dir, 'test_sets', test_name + ".txt")
    error_path = os.path.join(curr_dir, 'error_sets',
                              "errors_{}.csv".format(test_name))

    # create error directory
    os.makedirs(os.path.dirname(error_path), exist_ok=True)
    error_list = []

    # add header in error log text file
    with open(error_path, "w+") as error_log:
        error_log.write("In, Out\n")
    error_found_flag = False

    # make pandas reader
    N = sum(1 for _ in open(test_path)) - 1
    S = dataset_samples if (0 < dataset_samples <= N) else N
    skip = sorted(random.sample(range(1, N + 1), N - S))
    reader = pd.read_csv(test_path, chunksize=10000, header=0, skiprows=skip)

    # roundtrip testing
    for chunk in reader:
        for in_smiles in chunk[column_name]:
            # check if the SMILES in the chunk is a valid RDKit molecule;
            # if not, skip testing. All inputted SMILES must be valid
            # RDKit Mol objects to be encoded.
            if (MolFromSmiles(in_smiles) is None) or ('*' in in_smiles):
                continue

            # encode the SMILES into a SELFIES string
            selfies = sf.encoder(in_smiles)

            # if unable to encode the SMILES, write to the list of errors
            if selfies is None:
                error_list.append((in_smiles, ''))
                continue

            # take the encoded SELFIES and decode it
            out_smiles = sf.decoder(selfies)

            # compare the original SMILES to the decoded SELFIES string;
            # if they are not the same molecule, write to the list of errors
            if not is_same_mol(in_smiles, out_smiles):
                error_list.append((in_smiles, str(out_smiles)))

        # open and write all errors to errors_{test_name}.csv
        with open(error_path, "a") as error_log:
            for error in error_list:
                error_log.write(','.join(error) + "\n")
        error_found_flag = error_found_flag or error_list
        error_list = []

    sf.set_semantic_constraints()  # restore defaults

    assert not error_found_flag
def test_malformed_smiles_encoder():
    """Tests that selfies.encoder() terminates on a malformed SMILES."""
    sf.encoder("C(Cl)(Cl)CC[13C")
    assert True
def test_roundtrip_translation():
    """Tests a roundtrip SMILES -> SELFIES -> SMILES translation of the
    SMILES examples in QM9, NonFullerene, Zinc, etc.
    """
    # modify constraints
    constraints = sf.get_semantic_constraints()
    constraints['N'] = 6
    constraints['Br'] = 7
    constraints['Cl'] = 7
    constraints['I'] = 7
    sf.set_semantic_constraints(constraints)

    # file I/O
    ckpt_path = os.path.join(curr_dir, 'checkpoints', 'emolecule_ckpt.txt')
    error_path = os.path.join(curr_dir, 'error_sets', 'errors_emolecules.csv')

    # check if a previous checkpoint exists to continue tests
    if os.path.exists(ckpt_path):
        with open(ckpt_path, 'r') as ckpt_file:
            checkpoint = int(ckpt_file.readlines()[0])
    # if no path to a checkpoint exists,
    # create a new directory for error logging and checkpoints
    else:
        os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)
        os.makedirs(os.path.dirname(error_path), exist_ok=True)
        with open(error_path, "w+") as error_log:
            error_log.write("In, Out\n")
        checkpoint = -1

    error_list = []
    error_found_flag = False

    # make pandas reader
    reader = pd.read_csv(EMOL_PATH, chunksize=10000, compression='gzip',
                         delimiter=' ', header=0)

    # roundtrip testing
    for chunk_idx, chunk in enumerate(reader):
        if chunk_idx <= checkpoint:
            continue

        for in_smiles in chunk[COL_NAME]:
            # check if the SMILES in the chunk is a valid RDKit molecule;
            # if not, skip testing. All inputted SMILES must be valid
            # RDKit Mol objects to be encoded.
            if (MolFromSmiles(in_smiles) is None) or ('*' in in_smiles):
                continue

            # encode selfies
            selfies = sf.encoder(in_smiles)

            # if unable to encode the SMILES, write to the list of errors
            if selfies is None:
                error_list.append((in_smiles, ''))
                continue

            # take the encoded SELFIES and decode it
            out_smiles = sf.decoder(selfies)

            # compare the original SMILES to the decoded SELFIES string;
            # if they are not the same molecule, write to the list of errors
            if not is_same_mol(in_smiles, out_smiles):
                error_list.append((in_smiles, out_smiles))

        # open and write all errors to errors_emolecules.csv
        with open(error_path, "a") as error_log:
            for error in error_list:
                error_log.write(','.join(error) + "\n")
        error_found_flag = error_found_flag or error_list
        error_list = []

        # create a checkpoint from the current pandas reader chunk,
        # to load from and continue testing
        with open(ckpt_path, 'w+') as ckpt_file:
            ckpt_file.write(str(chunk_idx))

    sf.set_semantic_constraints()  # restore defaults
    os.remove(ckpt_path)  # remove checkpoint

    assert not error_found_flag
def main(args):
    with open(args.input_file, 'r') as f:
        with open(args.output_file, 'w') as w:
            for l in f:
                l = l.strip()
                w.write('{}\n'.format(sf.encoder(l)))
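# Hypothetical CLI wiring for main(); the flag names are assumptions, not taken
# from the source:
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(
        description="Encode a file of SMILES strings to SELFIES, one per line.")
    parser.add_argument("--input_file", required=True)
    parser.add_argument("--output_file", required=True)
    main(parser.parse_args())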