def retrain(self, n, keep_top_n, smiles_and_scores):
    print("writing dataset...")
    name = 'molexit-%d' % n
    dataset = '../models/molexit/%s.txt' % name
    dataset_scores = []
    with open(dataset, 'w') as f:
        # keep only the highest-scoring molecules, best first
        for smi, score in list(reversed(sorted(smiles_and_scores, key=lambda p: p[1])))[:keep_top_n]:
            # canonicalize the SMILES, re-encode as DeepSMILES, and tokenize
            dsmi = self.converter.encode(pybel.readstring("smi", smi.strip()).write("can").strip())
            tok = DeepSMILESTokenizer(dsmi)
            tokens = tok.get_tokens()
            f.write(' '.join([t.value for t in tokens]))
            f.write("\n")
            dataset_scores.append(score)

    print('dataset: size: %s, mean score: %s, max score: %s' %
          (len(dataset_scores), np.mean(dataset_scores), np.max(dataset_scores)))

    print('training new LM...')
    self.lm_trainer.train(10, dataset, '../models/molexit', name)
    vocab = get_arpa_vocab('../models/molexit/%s.arpa' % name)
    self.lm = KenLMDeepSMILESLanguageModel('../models/molexit/%s.klm' % name, vocab)
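# The dataset written above is in the plain-text format KenLM's lmplz expects:
# one sequence per line, tokens separated by single spaces. Assuming a
# character-level DeepSMILES tokenization (an assumption; DeepSMILESTokenizer
# is project-specific), benzene ("cccccc6" in DeepSMILES) would be written as:
#
#   c c c c c c 6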
def __init__(self):
    self.vocab = get_arpa_vocab('../resources/chembl_25_deepsmiles_klm_10gram_200503.arpa')
    self.lm = KenLMDeepSMILESLanguageModel('../resources/chembl_25_deepsmiles_klm_10gram_200503.klm', self.vocab)
    self.best_smiles = None
    self.best_score = -1.0
def generate_optimized_molecules(self, scoring_function, number_molecules, starting_population=None):
    self.new_model_dir()

    vocab = get_arpa_vocab('../resources/chembl_25_deepsmiles_klm_10gram_200503.arpa')
    self.lm = KenLMDeepSMILESLanguageModel('../resources/chembl_25_deepsmiles_klm_10gram_200503.klm', vocab)

    print("generating %s samples..." % number_molecules)
    smiles_and_scores = []
    TIME_PER_ITERATION = self.time_per_iteration_minutes * 60  # in seconds

    found = False
    for n in range(1, self.num_iterations + 1):
        print("iteration %s" % n)
        num_valid = 0
        start = time.time()
        elapsed = time.time() - start
        while elapsed < TIME_PER_ITERATION:
            try:
                generated = self.lm.generate(num_chars=100, text_seed='<s>')
                decoded = DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>')
                smiles = DeepSMILESLanguageModelUtils.sanitize(decoded)
                score = scoring_function.score(smiles)
                num_valid += 1
                smiles_and_scores.append((smiles, score))
                if score == 1.0:
                    # a perfect score: stop searching altogether
                    found = True
                    break
            except Exception:
                # the generated sequence could not be decoded into a valid molecule
                pass
            elapsed = time.time() - start

        print("num valid: %s" % num_valid)
        if found:
            break

        self.retrain(n, self.keep_top_n, smiles_and_scores)

    # return the top-scoring molecules, best first
    return [pair[0] for pair in list(reversed(sorted(smiles_and_scores, key=lambda p: p[1])))[:number_molecules]]
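# Usage sketch: this signature matches GuacaMol's GoalDirectedGenerator interface,
# so, assuming this class subclasses guacamol.goal_directed_generator.GoalDirectedGenerator
# (the class name below is hypothetical), it could be benchmarked with:
#
#   from guacamol.assess_goal_directed_generation import assess_goal_directed_generation
#   assess_goal_directed_generation(DeepSMILESLMGenerator(),
#                                   json_output_file='output_goal_directed.json')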
from rdkit.Chem.Crippen import MolLogP
from rdkit.RDLogger import logger
from rdkit import Chem
from chemgrams import get_arpa_vocab, EmptyDeepSMILESLanguageModel, DeepSMILESLanguageModelUtils

from rdkit import rdBase
rdBase.DisableLog('rdApp.error')
rdBase.DisableLog('rdApp.warning')

logger = logger()

vocab = get_arpa_vocab('../resources/chemts_250k_deepsmiles_klm_6gram_190414.arpa')
lm = EmptyDeepSMILESLanguageModel(vocab, n=6)

current_best_score = None
current_best_smiles = None
# a score beats the current best if there is no best yet, or if it is higher
beats_current = lambda score: current_best_score is None or score > current_best_score

for i in range(1000):
    generated = lm.generate(num_chars=25, text_seed="<s>")
    try:
        decoded = DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>')
        sanitized = DeepSMILESLanguageModelUtils.sanitize(decoded)
        mol = Chem.MolFromSmiles(sanitized)
        logp_score = MolLogP(mol)
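        # Assumed continuation (the original is truncated here): track the
        # best-scoring molecule seen so far. This update is a sketch, not the
        # original code.
        if beats_current(logp_score):
            current_best_score = logp_score
            current_best_smiles = sanitized
            logger.info("new best: %s, logP: %s (iteration %s)" % (sanitized, logp_score, i))
    except Exception:
        # the generated sequence could not be decoded into a valid molecule
        pass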
logger.info("KenLMDeepSMILESLanguageModel('../resources/zinc12_fragments_deepsmiles_klm_10gram_200502.klm', vocab)")
logger.info("width = 12, max_depth = 35, start_state = ['<s>'], c = 5")
logger.info("score: -1.0 if invalid; -1.0 if seen previously; tanimoto distance from abilify if valid")
logger.info("LanguageModelMCTSWithPUCTTerminating")
logger.info("TanimotoScorer(abilify, radius=6)")
logger.info("num_iterations = 100")
logger.info("simulations_per_iteration = 50000")
logger.info("keep_top_n = 5000")

logger.info("loading language model...")
vocab = get_arpa_vocab('../resources/zinc12_fragments_deepsmiles_klm_10gram_200502.arpa')
lm = KenLMDeepSMILESLanguageModel('../resources/zinc12_fragments_deepsmiles_klm_10gram_200502.klm', vocab)

abilify = "Clc4cccc(N3CCN(CCCCOc2ccc1c(NC(=O)CC1)c2)CC3)c4Cl"
scorer = TanimotoScorer(abilify, radius=6)

converter = Converter(rings=True, branches=True)

env = os.environ.copy()
env["PATH"] = "/Users/luis/kenlm/build/bin:" + env["PATH"]
lm_trainer = KenLMTrainer(env)


def log_best(j, all_best, n_valid, lggr):
    if j % 1000 == 0:
        lggr.info("--iteration: %d--" % j)
from rdkit import rdBase
rdBase.DisableLog('rdApp.error')
rdBase.DisableLog('rdApp.warning')

logger = get_logger('chemgrams.log')
THIS_DIR = os.path.dirname(os.path.abspath(__file__))

logger.info("LM-only")
logger.info("KenLMDeepSMILESLanguageModel(n=10, 'chemts_250k_deepsmiles_klm_10gram_200429.klm')")
logger.info("num_chars=100, text_seed='<s>'")

vocab = get_arpa_vocab('../resources/chemts_250k_deepsmiles_klm_10gram_200429.arpa')
lm = KenLMDeepSMILESLanguageModel('../resources/chemts_250k_deepsmiles_klm_10gram_200429.klm', vocab)

all_smiles = set()
num_valid = 0
start = time.time()

for i in range(500000):  # about enough to get ~250,000 valid molecules
    try:
        generated = lm.generate(num_chars=100, text_seed='<s>')
        decoded = DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>')
        sanitized = DeepSMILESLanguageModelUtils.sanitize(decoded)
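        # Assumed continuation (the original is truncated here): record the valid
        # molecule; sequences that fail to decode or sanitize are skipped.
        all_smiles.add(sanitized)
        num_valid += 1
    except Exception:
        pass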
logger.info("KenLMDeepSMILESLanguageModel('../resources/chembl_25_deepsmiles_klm_10gram_200503.klm', vocab)")
logger.info("width = 12, max_depth = 50, start_state = ['<s>'], c = 15")
logger.info("score: -1.0 if invalid; -1.0 if seen in iteration; tanimoto distance from abilify if valid; rescaling from [0,1] to [-1,1]")
logger.info("LanguageModelMCTSWithPUCTTerminating")
logger.info("TanimotoScorer(abilify, radius=6); distance only (no SA or cycle scoring)")
logger.info("num_iterations = 100")
logger.info("time per iteration = 45 min.")
logger.info("keep_top_n = 20000 unique")

vocab = get_arpa_vocab('../resources/chembl_25_deepsmiles_klm_10gram_200503.arpa')
lm = KenLMDeepSMILESLanguageModel('../resources/chembl_25_deepsmiles_klm_10gram_200503.klm', vocab)

abilify = "Clc4cccc(N3CCN(CCCCOc2ccc1c(NC(=O)CC1)c2)CC3)c4Cl"
distance_scorer = TanimotoScorer(abilify, radius=6)

converter = Converter(rings=True, branches=True)

env = os.environ.copy()
env["PATH"] = "/Users/luis/kenlm/build/bin:" + env["PATH"]
lm_trainer = KenLMTrainer(env)


def smiles_to_deepsmiles(smiles):
    canonical = pybel.readstring("smi", smiles).write("can").strip()
    return converter.encode(canonical)
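# Example of what the conversion produces (standard deepsmiles package behavior;
# the exact string depends on Open Babel's canonical ordering): for benzene,
#
#   smiles_to_deepsmiles("c1ccccc1")  ->  "cccccc6"
#
# since DeepSMILES replaces paired ring-closure digits with a ring-size symbol.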
logger.info("KenLMDeepSMILESLanguageModel('../resources/chembl_25_deepsmiles_klm_10gram_200503.klm', vocab)")
logger.info("width = 12, max_depth = 50, start_state = ['<s>'], c = 5")
logger.info("score: -1.0 if invalid; -1.0 if seen previously; TanimotoScorer(abilify, radius=6) if valid; rescaling from [0,1] to [-1,1]")
logger.info("LanguageModelMCTSWithPUCTTerminating")

TIME_LIMIT = 3 * 60 * 60  # three hours, in seconds
# TIME_LIMIT = 2 * 60  # 2 minutes, in seconds
LOG_INTERVAL = 1 * 60 * 60  # one hour, in seconds
# LOG_INTERVAL = 30.0  # 30 seconds
KEEP_TOP_N = 20000

logger.info("loading language model...")
vocab = get_arpa_vocab('../resources/chembl_25_deepsmiles_klm_10gram_200503.arpa')
lm = KenLMDeepSMILESLanguageModel('../resources/chembl_25_deepsmiles_klm_10gram_200503.klm', vocab)

abilify = "Clc4cccc(N3CCN(CCCCOc2ccc1c(NC(=O)CC1)c2)CC3)c4Cl"
distance_scorer = TanimotoScorer(abilify, radius=6)

num_simulations = 15000000  # much more than 8 hours
width = 12
max_depth = 50
start_state = ["<s>"]
c = 5

all_unique = {}
all_valid = []
num_valid = 0
simulations = 0
best = mcts.get_best_sequence()
generated_text = ''.join(best[0])
logger.info("best generated text: %s" % generated_text)

decoded = DeepSMILESLanguageModelUtils.decode(generated_text, start='<s>', end='</s>')
smiles = DeepSMILESLanguageModelUtils.sanitize(decoded)
logger.info("best SMILES: %s, J: %s (%s seconds)" % (smiles, distance_scorer.score(smiles), str(end - start)))

log_top_best(all_smiles, 5, logger)

logger.info("writing dataset...")
name = 'molexit-%d' % n
dataset = '../models/molexit/%s.txt' % name
with open(dataset, 'w') as f:
    # each item is a (smiles, value) pair whose value carries the score at index 0;
    # keep the highest-scoring molecules, best first
    for smi in list(reversed(sorted(all_smiles.items(), key=lambda kv: kv[1][0])))[:keep_top_n]:
        dsmi = smiles_to_deepsmiles(smi[0].strip())
        tok = DeepSMILESTokenizer(dsmi)
        tokens = tok.get_tokens()
        f.write(' '.join([t.value for t in tokens]))
        f.write("\n")

logger.info('training new LM...')
lm_trainer.train(6, dataset, '../models/molexit', name)
vocab = get_arpa_vocab('../models/molexit/%s.arpa' % name)
lm = KenLMDeepSMILESLanguageModel('../models/molexit/%s.klm' % name, vocab)
from chemgrams import get_arpa_vocab, KenLMDeepSMILESLanguageModel, DeepSMILESLanguageModelUtils, \
    LanguageModelMCTSWithUCB1

from rdkit.RDLogger import logger
from rdkit import rdBase, Chem
rdBase.DisableLog('rdApp.error')
rdBase.DisableLog('rdApp.warning')

logger = logger()

if __name__ == '__main__':
    logger.info("loading language model...")
    vocab = get_arpa_vocab('../models/chembl_25_deepsmiles_klm_6gram_190413.arpa')
    lm = KenLMDeepSMILESLanguageModel('../models/chembl_25_deepsmiles_klm_6gram_190413.klm', vocab)

    num_simulations = 1000
    width = 3
    text_length = 25
    start_state = ["<s>"]

    def eval_function(text):
        generated = ''.join(text)
        try:
            decoded = DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>')
            DeepSMILESLanguageModelUtils.sanitize(decoded)
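            # Assumed continuation (the original is truncated here). The scoring
            # convention logged elsewhere in this repo is "-1.0 if invalid", so a
            # plausible body rewards any sequence that decodes to a valid molecule:
            return 1.0
        except Exception:
            return -1.0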
from chemgrams import get_arpa_vocab, KenLMSELFIESLanguageModel
import selfies as sf
from openbabel import pybel

if __name__ == "__main__":
    # 6.655716427238491
    vocab = get_arpa_vocab('../resources/chemts_250k_selfies_klm_10gram_210908.arpa')
    lm = KenLMSELFIESLanguageModel('../resources/chemts_250k_selfies_klm_10gram_210908.klm', vocab)

    def smiles_to_selfies(smiles):
        canonical = pybel.readstring("smi", smiles).write("can").strip()
        return sf.encoder(canonical)

    with open("../resources/zinc12_enaminebb_smiles_corpus.txt") as f:
        all_smiles = [s.strip() for s in f.readlines()]

    # the sum of log10 probs of each sentence in the corpus
    sum_log_prob = 0.0
    # the total number of "words" (i.e. tokens) in the corpus
    M = 0

    for smiles in all_smiles:
        s = smiles_to_selfies(smiles.strip())
        tokens = list(sf.split_selfies(s))
        M += len(tokens)
        sum_log_prob += lm.log_prob(' '.join(tokens))
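    # Assumed final step (the original is truncated here): with sum_log_prob the
    # corpus's total log10 probability and M its token count, the per-token
    # cross-entropy is -sum_log_prob / M, and the corpus perplexity is 10
    # raised to that power:
    perplexity = 10 ** (-sum_log_prob / M)
    print("corpus perplexity: %s" % perplexity)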
logger.info("KenLMSELFIESLanguageModel('../resources/zinc12_fragments_selfies_klm_10gram_210908.klm', vocab)")
logger.info("width = 12, max_depth = 35, start_state = ['<s>'], c = 5")
logger.info("score: -1.0 if invalid; -1.0 if seen previously; tanimoto distance from abilify if valid")
logger.info("LanguageModelMCTSWithPUCTTerminating")
logger.info("TanimotoScorer(abilify, radius=6)")
logger.info("num_iterations = 100")
logger.info("simulations_per_iteration = 50000")
logger.info("keep_top_n = 1500")

logger.info("loading language model...")
vocab = get_arpa_vocab('../resources/zinc12_fragments_selfies_klm_10gram_210908.arpa')
lm = KenLMSELFIESLanguageModel('../resources/zinc12_fragments_selfies_klm_10gram_210908.klm', vocab)

abilify = "Clc4cccc(N3CCN(CCCCOc2ccc1c(NC(=O)CC1)c2)CC3)c4Cl"
scorer = TanimotoScorer(abilify, radius=6)

converter = Converter(rings=True, branches=True)

env = os.environ.copy()
env["PATH"] = "/Users/luis/kenlm/build/bin:" + env["PATH"]
lm_trainer = KenLMTrainer(env)


def log_best(j, all_best, n_valid, lggr):
    if j % 1000 == 0:
        lggr.info("--iteration: %d--" % j)