def retrain(self, n, keep_top_n, smiles_and_scores):
    """Write the top-scoring molecules to a dataset file and retrain the language model.

    :param n: iteration number; used to name the dataset and model artifacts
    :param keep_top_n: number of highest-scoring molecules to keep for retraining
    :param smiles_and_scores: iterable of (smiles, score) pairs
    """
    print("writing dataset...")
    name = 'molexit-%d' % n
    dataset = '../models/molexit/%s.txt' % name
    dataset_scores = []
    with open(dataset, 'w') as f:
        # keep only the highest-scoring molecules, best first
        for smi, score in sorted(smiles_and_scores, key=lambda p: p[1], reverse=True)[:keep_top_n]:
            # canonicalize the SMILES via pybel before encoding to DeepSMILES
            dsmi = self.converter.encode(
                pybel.readstring("smi", smi.strip()).write("can").strip())
            tok = DeepSMILESTokenizer(dsmi)
            tokens = tok.get_tokens()
            # one space-separated token sequence per line, as the LM trainer expects
            f.write(' '.join([t.value for t in tokens]))
            f.write("\n")
            dataset_scores.append(score)
    print('dataset: size: %s, mean score: %s, max score: %s' %
          (len(dataset_scores), np.mean(dataset_scores), np.max(dataset_scores)))
    print('training new LM...')
    self.lm_trainer.train(10, dataset, '../models/molexit', name)
    vocab = get_arpa_vocab('../models/molexit/%s.arpa' % name)
    self.lm = KenLMDeepSMILESLanguageModel(
        '../models/molexit/%s.klm' % name, vocab)
def eval_function(text):
    """Reward function for a generated token sequence.

    Returns 0 for sequences that cannot be decoded and sanitized into a
    valid molecule; otherwise returns a score proportional to the number
    of DeepSMILES tokens, so longer valid sequences earn more reward.

    NOTE(review): reads `text_length` and `logger` from the enclosing
    scope — presumably `text_length` is the maximum sequence length;
    confirm against the caller.
    """
    generated = ''.join(text)
    try:
        decoded = DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>')
        DeepSMILESLanguageModelUtils.sanitize(decoded)
    except Exception:
        # undecodable / unsanitizable sequence -> no reward
        return 0
    extracted = DeepSMILESLanguageModelUtils.extract(generated, start='<s>', end='</s>')
    tokenized = DeepSMILESTokenizer(extracted)
    # provide more reward for longer text sequences
    score = len(tokenized.get_tokens()) / (text_length - 1)
    # lazy %-args: the message is only formatted if INFO is enabled
    logger.info("%s, %s", generated, str(score))
    return score
# Report the best sequence found by MCTS, then rebuild the training set from
# the top-scoring molecules seen so far and retrain the language model.
best = mcts.get_best_sequence()
generated_text = ''.join(best[0])
logger.info("best generated text: %s", generated_text)
decoded = DeepSMILESLanguageModelUtils.decode(generated_text, start='<s>', end='</s>')
smiles = DeepSMILESLanguageModelUtils.sanitize(decoded)
logger.info("best SMILES: %s, J: %s (%s seconds)",
            smiles, scorer.score(smiles), str(end - start))
log_top_best(all_smiles, 5, logger)

logger.info("writing dataset...")
name = 'molexit-%d' % n
dataset = '../models/molexit/%s.txt' % name
with open(dataset, 'w') as f:
    # all_smiles maps SMILES -> tuple whose first element is the score;
    # keep only the top-scoring molecules, best first
    for smi, _ in sorted(all_smiles.items(), key=lambda kv: kv[1][0], reverse=True)[:keep_top_n]:
        dsmi = smiles_to_deepsmiles(smi.strip())
        tok = DeepSMILESTokenizer(dsmi)
        tokens = tok.get_tokens()
        # one space-separated token sequence per line, as the LM trainer expects
        f.write(' '.join([t.value for t in tokens]))
        f.write("\n")

logger.info('training new LM...')
lm_trainer.train(10, dataset, '../models/molexit', name)
vocab = get_arpa_vocab('../models/molexit/%s.arpa' % name)
lm = KenLMDeepSMILESLanguageModel('../models/molexit/%s.klm' % name, vocab)