def label_smiles(line, MAX_SMI_LEN, smi_ch_ind):
    """Integer-encode a SELFIES string into a fixed-length vector."""
    X = np.zeros(MAX_SMI_LEN)
    smi_to_sel = list(sf.split_selfies(line))
    for i, ch in enumerate(smi_to_sel):
        X[i] = smi_ch_ind[ch]
    return X
def tokenize(self, selfie):
    """Convert a SELFIES string into a sequence of tokens."""
    tokens = list(sf.split_selfies(selfie))
    tokens = ["SOS"] + tokens + ["EOS"]
    return tokens
def one_hot_smiles(line, MAX_SMI_LEN, smi_ch_ind):
    """One-hot encode a SELFIES string; indices in smi_ch_ind are 1-based."""
    X = np.zeros((MAX_SMI_LEN, len(smi_ch_ind)))
    smi_to_sel = list(sf.split_selfies(line))
    for i, ch in enumerate(smi_to_sel):
        X[i, smi_ch_ind[ch] - 1] = 1
    return X
def test_tokenize_selfies_match(self) -> None:
    """Test deprecated tokenize_selfies."""
    for smiles in ['c1cnoc1', '[O-][n+]1ccccc1S', 'c1snnc1-c1ccccn1']:
        transform = Selfies()
        selfies = transform(smiles)
        self.assertListEqual(
            tokenize_selfies(selfies), list(sf.split_selfies(selfies))
        )
def tokenize_smiles(self, mol):
    """Split a SELFIES string and return integer tokens wrapped in <sos>/<eos>."""
    ints = [self.vocab['<sos>']]
    selfies_list = list(sf.split_selfies(mol))
    for token in selfies_list:
        ints.append(self.vocab[token])
    ints.append(self.vocab['<eos>'])
    return ints
def get_smiles_length(input_file, tokenizer):
    molecule_length = {}
    end_token_id = tokenizer.encode('<end>').ids[0]
    with open(input_file, 'r') as f:
        for l in f:
            l = l.strip()
            encoding = tokenizer.encode(l)
            # count tokenizer ids up to the first <end> token
            cur_molecule_length = 0
            for i in range(0, len(encoding.ids)):
                if encoding.ids[i] == end_token_id:
                    break
                cur_molecule_length += 1
            # the SELFIES-based token count overrides the tokenizer-based one
            cur_molecule_length = len(list(sf.split_selfies(l)))
            molecule_length[l] = cur_molecule_length
    return molecule_length
def get_selfies_length(input_file, remove_null):
    molecule_length = []
    with open(input_file, 'r') as f:
        for l in f:
            l = l.strip()
            cur_molecule_length = len(list(sf.split_selfies(l)))
            if remove_null:
                if cur_molecule_length > 0:
                    molecule_length.append(cur_molecule_length)
            else:
                molecule_length.append(cur_molecule_length)
    print("######################################")
    print(
        "The tokenized caption lengths using selfies for {}.\n"
        "The statistics are as follows: Average:{}, Min:{}, Max:{}, Median:{}, "
        "5% percentile:{}, 95% percentile:{}, 50% percentile:{}."
        .format(input_file, np.average(molecule_length), np.min(molecule_length),
                np.max(molecule_length), np.median(molecule_length),
                np.percentile(molecule_length, 5), np.percentile(molecule_length, 95),
                np.percentile(molecule_length, 50)))
    print("######################################")
def selfies_to_hot(selfie, largest_selfie_len, alphabet):
    """Go from a single SELFIES string to a one-hot encoding."""
    symbol_to_int = dict((c, i) for i, c in enumerate(alphabet))

    # pad with [nop]
    selfie += '[nop]' * (largest_selfie_len - sf.len_selfies(selfie))

    # integer encode
    symbol_list = sf.split_selfies(selfie)
    integer_encoded = [symbol_to_int[symbol] for symbol in symbol_list]

    # one-hot encode the integer-encoded selfie
    onehot_encoded = list()
    for index in integer_encoded:
        letter = [0] * len(alphabet)
        letter[index] = 1
        onehot_encoded.append(letter)

    return integer_encoded, np.array(onehot_encoded)
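# A minimal usage sketch for selfies_to_hot, with hypothetical in-memory data.
# It assumes `selfies` is imported as `sf` and numpy as `np`, as in the snippets
# above; the alphabet must contain '[nop]' because the function pads with it.
example_selfies = [sf.encoder(smi) for smi in ('CCO', 'c1ccccc1')]
example_alphabet = sorted(sf.get_alphabet_from_selfies(example_selfies) | {'[nop]'})
example_max_len = max(sf.len_selfies(s) for s in example_selfies)
ints, onehot = selfies_to_hot(example_selfies[0], example_max_len, example_alphabet)
# `onehot` has shape (example_max_len, len(example_alphabet));
# `ints` is the padded integer encoding of the same length.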
def create_tokenized_smiles_json(tokenizer, data_dir, split, config_output_name, max_length, label_filename, idx2selfies, selfies2idx):
    data = {"images": []}
    start_token_id = tokenizer.encode('<start>').ids[0]
    end_token_id = tokenizer.encode('<end>').ids[0]
    pad_token_id = tokenizer.get_vocab_size()
    print("The <start> token id is:{}\nThe <end> token id is:{}\nThe <pad> token id is {}".format(start_token_id, end_token_id, pad_token_id))
    with open(os.path.join(data_dir, label_filename), "r") as f:
        for i, l in enumerate(tqdm(f)):
            try:
                smiles, idx = l.strip().split("\t")
                encoding = tokenizer.encode(smiles)
                selfies = sf.encoder(smiles)
                if selfies is None:
                    selfies = ''
                selfies_cap = [selfies2idx['[start]']] + [selfies2idx[tok] for tok in sf.split_selfies(selfies)]
                if len(selfies_cap) > max_length - 1:
                    selfies_cap = selfies_cap[:max_length - 1]
                selfies_cap = selfies_cap + [selfies2idx['[end]']]
                selfies_len = len(selfies_cap)
                selfies_len_orig = selfies_len
                while selfies_len < max_length:
                    selfies_cap = selfies_cap + [selfies2idx['[pad]']]
                    selfies_len += 1
                encodingids = encoding.ids
                encodingids = [start_token_id] + encodingids[:-1]  # add <start> token and shorten to 150
                cap_len = max_length
                for j in range(0, len(encodingids)):
                    if encodingids[j] == pad_token_id:
                        cap_len = j + 1
                        encodingids[j] = end_token_id
                        break
                current_sample = {
                    "filepath": data_dir,
                    "filename": "{}".format(idx),
                    "imgid": 0,
                    "split": split,
                    "sentences": [{
                        "tokens": encoding.tokens,
                        "raw": smiles,
                        "ids": encodingids,
                        "length": cap_len,
                        "selfies_raw": selfies,
                        "selfies_ids": selfies_cap,
                        "selfies_length": selfies_len_orig,
                    }],
                }
                # note: if image augmentation ever happens, a sentence id token is needed; see the MSCOCO json for an example
                data["images"].append(current_sample)
            except Exception:
                pass
    pickle.dump(data, open(os.path.join(data_dir, config_output_name), 'wb'))
    del data
def sf_tokenizer(sfi):
    # strip the surrounding brackets from each SELFIES symbol; keep '.' as-is
    return ' '.join("." if tok == "." else tok[1:-1] for tok in sf.split_selfies(sfi))
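# Hedged illustration of sf_tokenizer's output (exact symbols depend on the
# installed selfies version): for ethanol, sf.encoder('CCO') yields '[C][C][O]',
# so stripping the brackets gives a plain space-separated token string.
assert sf_tokenizer('[C][C][O]') == 'C C O'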
def __call__(self, smi):
    tokens = selfies.split_selfies(smi)
    return ' '.join(tokens)
def test_selfies_split(self) -> None:
    """Test tokenization by selfies package has not changed."""
    benzene = 'c1ccccc1'
    encoded_selfies = sf.encoder(benzene)
    # '[c][c][c][c][c][c][Ring1][Branch1_1]' v0.2.4
    # '[C][=C][C][=C][C][=C][Ring1][Branch1_2]' v1.0.2 (no aromatic)
    # sf.split_selfies returns a generator
    symbols_benzene = list(sf.split_selfies(encoded_selfies))
    # before selfies 2.0.0 the last token is [Branch1_2]
    self.assertListEqual(
        symbols_benzene,
        ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[=Branch1]'],
    )
    for smiles, ground_truth in [
        (
            'c1cnoc1',
            # before selfies 2.0.0 the last 2 tokens are [Expl=Ring1] and [Branch1_1]
            ['[C]', '[C]', '[=N]', '[O]', '[C]', '[=Ring1]', '[Branch1]'],
        ),
        (
            '[O-][n+]1ccccc1S',
            # before selfies 2.0.0 it is: [O-expl], [N+expl] and [=Branch1_2]
            [
                '[O-1]', '[N+1]', '[=C]', '[C]', '[=C]', '[C]', '[=C]',
                '[Ring1]', '[=Branch1]', '[S]',
            ],
        ),
        (
            'c1snnc1-c1ccccn1',
            # before selfies 2.0.0 it is: [Expl=Ring1], [Branch1_1] and [Branch1_2]
            [
                '[C]', '[S]', '[N]', '[=N]', '[C]', '[=Ring1]', '[Branch1]',
                '[C]', '[=C]', '[C]', '[=C]', '[C]', '[=N]', '[Ring1]', '[=Branch1]',
            ],
        ),
    ]:
        self.assertListEqual(
            # list-wrapping version
            split_selfies(sf.encoder(smiles)),
            ground_truth,
        )
def tokenzie_smile(smi):
    tokens = selfies.split_selfies(smi)
    return ' '.join(tokens)
def test_split_selfies(test_cases):
    for s, (_, symbols) in test_cases.items():
        assert list(sf.split_selfies(s)) == symbols
def _tokenize_selfies(self, selfies_string):
    return list(sf.split_selfies(selfies_string))
if __name__ == "__main__":
    # 6.655716427238491
    vocab = get_arpa_vocab('../resources/chemts_250k_selfies_klm_10gram_210908.arpa')
    lm = KenLMSELFIESLanguageModel('../resources/chemts_250k_selfies_klm_10gram_210908.klm', vocab)

    def smiles_to_selfies(smiles):
        canonical = pybel.readstring("smi", smiles).write("can").strip()
        return sf.encoder(canonical)

    with open("../resources/zinc12_enaminebb_smiles_corpus.txt") as f:
        all_smiles = [s.strip() for s in f.readlines()]

    # the sum of log10 probs of each sentence in the corpus
    sum_log_prob = 0.0
    # the total number of "words" (i.e. tokens) in the corpus
    M = 0

    for smiles in all_smiles:
        s = smiles_to_selfies(smiles.strip())
        tokens = list(sf.split_selfies(s))
        M += len(tokens)
        sum_log_prob += lm.log_prob(' '.join(tokens))

    perplexity = 10 ** (-sum_log_prob / M)  # log probs are in base 10
    print(perplexity)
logger.info("best generated text: %s" % generated_text) decoded = SELFIESLanguageModelUtils.decode(generated_text, start='<s>', end='</s>') smiles = SELFIESLanguageModelUtils.sanitize(decoded) logger.info("best SMILES: %s, J: %s (%s seconds)" % (smiles, scorer.score(smiles), str((end - start)))) log_top_best(all_smiles, 5, logger) logger.info("writing dataset...") name = 'molexit-%d' % n dataset = '../models/molexit/%s.txt' % name with open(dataset, 'w') as f: for smi in list( reversed(sorted(all_smiles.items(), key=lambda kv: kv[1][0])))[:keep_top_n]: ssmi = smiles_to_selfies(smi[0].strip()) if ssmi is None: logger.info("WARNING: could not convert: %s" % smi[0].strip()) continue tokens = sf.split_selfies(ssmi) f.write(' '.join(tokens)) f.write("\n") logger.info('training new LM...') lm_trainer.train(10, dataset, '../models/molexit', name) vocab = get_arpa_vocab('../models/molexit/%s.arpa' % name) lm = KenLMSELFIESLanguageModel('../models/molexit/%s.klm' % name, vocab)
def test_split_selfies(dataset):
    for entry in dataset[0]:
        assert list(sf.split_selfies(entry.selfies)) == entry.symbols