def retrain(self, n, keep_top_n, smiles_and_scores):
        """Write the top-scoring molecules to a corpus file and retrain the LM on it.

        n: iteration number, used to name the dataset/model files.
        keep_top_n: how many of the best (smiles, score) pairs to keep.
        smiles_and_scores: iterable of (smiles, score) pairs gathered so far.
        """
        print("writing dataset...")
        name = 'molexit-%d' % n
        dataset = '../models/molexit/%s.txt' % name
        dataset_scores = []
        # rank by score ascending, then flip so the best molecules come first
        ranked = sorted(smiles_and_scores, key=lambda pair: pair[1])[::-1][:keep_top_n]
        with open(dataset, 'w') as f:
            for smi, score in ranked:
                # canonicalize via Open Babel, then re-encode as DeepSMILES
                canonical = pybel.readstring("smi", smi.strip()).write("can").strip()
                deep_smiles = self.converter.encode(canonical)
                tokens = DeepSMILESTokenizer(deep_smiles).get_tokens()
                f.write(' '.join(t.value for t in tokens))
                f.write("\n")
                dataset_scores.append(score)

        print('dataset: size: %s, mean score: %s, max score: %s' %
              (len(dataset_scores), np.mean(dataset_scores),
               np.max(dataset_scores)))
        print('training new LM...')
        # train a fresh 10-gram KenLM model on the new corpus and swap it in
        self.lm_trainer.train(10, dataset, '../models/molexit', name)

        vocab = get_arpa_vocab('../models/molexit/%s.arpa' % name)
        self.lm = KenLMDeepSMILESLanguageModel('../models/molexit/%s.klm' % name, vocab)
 def __init__(self):
     """Load the pretrained ChEMBL 25 DeepSMILES 10-gram KenLM model and reset the best result."""
     self.vocab = get_arpa_vocab(
         '../resources/chembl_25_deepsmiles_klm_10gram_200503.arpa')
     self.lm = KenLMDeepSMILESLanguageModel(
         '../resources/chembl_25_deepsmiles_klm_10gram_200503.klm',
         self.vocab)
     # best molecule found so far; -1.0 score means nothing found yet
     self.best_smiles = None
     self.best_score = -1.0
    def generate_optimized_molecules(self,
                                     scoring_function,
                                     number_molecules,
                                     starting_population=None):
        """Generate molecules optimized against `scoring_function`.

        Alternates timed sampling phases with LM retraining: each iteration
        samples from the current language model for a fixed wall-clock
        budget, then retrains the model on the best molecules seen so far.
        Stops early when a molecule scores exactly 1.0.

        Returns up to `number_molecules` SMILES strings, best-scoring first.
        `starting_population` is accepted for interface compatibility but is
        not used here.
        """
        self.new_model_dir()

        # every run starts from the pretrained ChEMBL 25 10-gram model
        vocab = get_arpa_vocab(
            '../resources/chembl_25_deepsmiles_klm_10gram_200503.arpa')
        self.lm = KenLMDeepSMILESLanguageModel(
            '../resources/chembl_25_deepsmiles_klm_10gram_200503.klm', vocab)

        print("generating %s samples..." % number_molecules)
        smiles_and_scores = []

        TIME_PER_ITERATION = self.time_per_iteration_minutes * 60  # in seconds

        found = False
        for n in range(1, self.num_iterations + 1):
            print("iteration %s" % n)
            num_valid = 0

            # sample until the per-iteration wall-clock budget is exhausted
            start = time.time()
            elapsed = time.time() - start
            while elapsed < TIME_PER_ITERATION:
                try:
                    generated = self.lm.generate(num_chars=100,
                                                 text_seed='<s>')

                    decoded = DeepSMILESLanguageModelUtils.decode(generated,
                                                                  start='<s>',
                                                                  end='</s>')
                    smiles = DeepSMILESLanguageModelUtils.sanitize(decoded)

                    score = scoring_function.score(smiles)
                    num_valid += 1
                    smiles_and_scores.append((smiles, score))

                    # a perfect score terminates the whole search
                    if score == 1.0:
                        found = True
                        break

                except Exception:
                    # invalid generations are expected and silently skipped;
                    # only successfully scored molecules count as valid
                    pass
                elapsed = time.time() - start

            print("num valid: %s" % num_valid)

            if found:
                break

            # retrain the LM on the top-scoring molecules seen so far
            self.retrain(n, self.keep_top_n, smiles_and_scores)

        # rank all collected molecules by score, best first, and return
        # the SMILES of the top `number_molecules`
        return [
            pair[0] for pair in list(
                reversed(sorted(smiles_and_scores, key=lambda p: p[1])))
            [:number_molecules]
        ]
# Example #4
# 0
from rdkit.Chem.Crippen import MolLogP
from rdkit.RDLogger import logger
from rdkit import Chem

from chemgrams import get_arpa_vocab, EmptyDeepSMILESLanguageModel, DeepSMILESLanguageModelUtils

from rdkit import rdBase
# silence RDKit's per-molecule error/warning output during bulk generation
rdBase.DisableLog('rdApp.error')
rdBase.DisableLog('rdApp.warning')

# the imported `logger` name is a factory; calling it yields the RDKit logger
logger = logger()

# 6-gram language model over DeepSMILES tokens (ChemTS 250k vocabulary),
# with no pretrained probabilities (Empty LM)
vocab = get_arpa_vocab(
    '../resources/chemts_250k_deepsmiles_klm_6gram_190414.arpa')
lm = EmptyDeepSMILESLanguageModel(vocab, n=6)

# best result tracked so far; None means no molecule has been scored yet
current_best_score = None
current_best_smiles = None


def beats_current(score):
    """Return True if `score` improves on the current best (lower is better,
    preserving the original `<` comparison).

    The original lambda compared `score < current_best_score` directly, which
    raises TypeError in Python 3 while `current_best_score` is still None
    (i.e. on the very first candidate); "no best yet" is now treated as
    trivially beaten.
    """
    return current_best_score is None or score < current_best_score

# NOTE(review): this loop is truncated by the extraction -- the `try` below
# has no matching `except`, and the code consuming `logp_score` is cut off.
# Preserved byte-for-byte; restore the missing handler from the original.
for i in range(1000):
    generated = lm.generate(num_chars=25, text_seed="<s>")
    try:

        decoded = DeepSMILESLanguageModelUtils.decode(generated,
                                                      start='<s>',
                                                      end='</s>')
        sanitized = DeepSMILESLanguageModelUtils.sanitize(decoded)

        # parse the sanitized SMILES with RDKit and score it by Crippen logP
        mol = Chem.MolFromSmiles(sanitized)
        logp_score = MolLogP(mol)
# Example #5
# 0
# record the full experiment configuration in the log before starting
logger.info(
    "KenLMDeepSMILESLanguageModel('../resources/zinc12_fragments_deepsmiles_klm_10gram_200502.klm', vocab)"
)
logger.info("width = 12, max_depth = 35, start_state = ['<s>'], c = 5")
logger.info(
    "score: -1.0 if invalid; -1.0 if seen previously; tanimoto distance from abilify if valid"
)
logger.info("LanguageModelMCTSWithPUCTTerminating")
logger.info("TanimotoScorer(abilify, radius=6)")
logger.info("num_iterations = 100")
logger.info("simulations_per_iteration = 50000")
logger.info("keep_top_n = 5000")

logger.info("loading language model...")

# 10-gram KenLM model over DeepSMILES tokens from ZINC12 fragments
vocab = get_arpa_vocab(
    '../resources/zinc12_fragments_deepsmiles_klm_10gram_200502.arpa')
lm = KenLMDeepSMILESLanguageModel(
    '../resources/zinc12_fragments_deepsmiles_klm_10gram_200502.klm', vocab)

# reference molecule for Tanimoto similarity scoring
abilify = "Clc4cccc(N3CCN(CCCCOc2ccc1c(NC(=O)CC1)c2)CC3)c4Cl"
scorer = TanimotoScorer(abilify, radius=6)

converter = Converter(rings=True, branches=True)
# KenLM's command-line tools must be on PATH for corpus retraining
env = os.environ.copy()
env["PATH"] = "/Users/luis/kenlm/build/bin:" + env["PATH"]
lm_trainer = KenLMTrainer(env)


def log_best(j, all_best, n_valid, lggr):
    """Emit an iteration marker to `lggr` every 1000th iteration."""
    at_interval = (j % 1000 == 0)
    if at_interval:
        lggr.info("--iteration: %d--" % j)
from rdkit import rdBase

# silence RDKit's per-molecule error/warning output
rdBase.DisableLog('rdApp.error')
rdBase.DisableLog('rdApp.warning')

logger = get_logger('chemgrams.log')
# absolute directory of this script, for path-independent file access
THIS_DIR = os.path.dirname(os.path.abspath(__file__))

# log the run configuration: plain LM sampling, no search on top
logger.info("LM-only")
logger.info(
    "KenLMDeepSMILESLanguageModel(n=10, 'chemts_250k_deepsmiles_klm_10gram_200429.klm')"
)
logger.info("num_chars=100, text_seed='<s>'")

# 10-gram KenLM model over DeepSMILES tokens from the ChemTS 250k corpus
vocab = get_arpa_vocab(
    '../resources/chemts_250k_deepsmiles_klm_10gram_200429.arpa')
lm = KenLMDeepSMILESLanguageModel(
    '../resources/chemts_250k_deepsmiles_klm_10gram_200429.klm', vocab)

# accumulators for the sampling run
all_smiles = set()
num_valid = 0

start = time.time()
# NOTE(review): truncated by the extraction -- the `try` below has no
# matching `except` and the rest of the loop body is cut off. Preserved
# byte-for-byte.
for i in range(500000):  # about enough to get ~250,000 valid molecules
    try:
        generated = lm.generate(num_chars=100, text_seed='<s>')

        decoded = DeepSMILESLanguageModelUtils.decode(generated,
                                                      start='<s>',
                                                      end='</s>')
        sanitized = DeepSMILESLanguageModelUtils.sanitize(decoded)
# Example #7
# 0
# record the full experiment configuration in the log before starting
logger.info(
    "KenLMDeepSMILESLanguageModel('../resources/chembl_25_deepsmiles_klm_10gram_200503.klm', vocab)"
)
logger.info("width = 12, max_depth = 50, start_state = ['<s>'], c = 15")
logger.info(
    "score: -1.0 if invalid; -1.0 if seen in iteration; tanimoto distance from abilify if valid; rescaling from [0,1] to [-1,1]"
)
logger.info("LanguageModelMCTSWithPUCTTerminating")
logger.info(
    "TanimotoScorer(abilify, radius=6); distance only (no SA or cycle scoring)"
)
logger.info("num_iterations = 100")
logger.info("time per iteration = 45 min.")
logger.info("keep_top_n = 20000 unique")

# 10-gram KenLM model over DeepSMILES tokens from ChEMBL 25
vocab = get_arpa_vocab(
    '../resources/chembl_25_deepsmiles_klm_10gram_200503.arpa')
lm = KenLMDeepSMILESLanguageModel(
    '../resources/chembl_25_deepsmiles_klm_10gram_200503.klm', vocab)

# reference molecule for Tanimoto distance scoring
abilify = "Clc4cccc(N3CCN(CCCCOc2ccc1c(NC(=O)CC1)c2)CC3)c4Cl"
distance_scorer = TanimotoScorer(abilify, radius=6)

converter = Converter(rings=True, branches=True)
# KenLM's command-line tools must be on PATH for corpus retraining
env = os.environ.copy()
env["PATH"] = "/Users/luis/kenlm/build/bin:" + env["PATH"]
lm_trainer = KenLMTrainer(env)


def smiles_to_deepsmiles(smiles):
    """Canonicalize a SMILES string with Open Babel, then encode it as DeepSMILES."""
    molecule = pybel.readstring("smi", smiles)
    canonical = molecule.write("can").strip()
    return converter.encode(canonical)
    # NOTE(review): this span is the interior of an enclosing block whose
    # header is outside this view; the extraction has also placed it after a
    # `return` statement, so it is unreachable as positioned. Preserved as-is.
    # record the experiment configuration in the log
    logger.info("KenLMDeepSMILESLanguageModel('../resources/chembl_25_deepsmiles_klm_10gram_200503.klm', vocab)")
    logger.info("width = 12, max_depth = 50, start_state = ['<s>'], c = 5")
    logger.info("score: -1.0 if invalid; -1.0 if seen previously; TanimotoScorer(abilify, radius=6) if valid; rescaling from [0,1] to [-1,1]")
    logger.info("LanguageModelMCTSWithPUCTTerminating")

    TIME_LIMIT = 3 * 60 * 60  # three hours in seconds
    # TIME_LIMIT = 2*60  # 2 minutes in seconds

    LOG_INTERVAL = 1 * 60 * 60  # one hour in seconds
    # LOG_INTERVAL = 30.0  # 30 seconds

    KEEP_TOP_N = 20000

    logger.info("loading language model...")

    # 10-gram KenLM model over DeepSMILES tokens from ChEMBL 25
    vocab = get_arpa_vocab('../resources/chembl_25_deepsmiles_klm_10gram_200503.arpa')
    lm = KenLMDeepSMILESLanguageModel('../resources/chembl_25_deepsmiles_klm_10gram_200503.klm', vocab)

    # reference molecule for Tanimoto distance scoring
    abilify = "Clc4cccc(N3CCN(CCCCOc2ccc1c(NC(=O)CC1)c2)CC3)c4Cl"
    distance_scorer = TanimotoScorer(abilify, radius=6)

    # MCTS search hyperparameters (the time limit, not num_simulations,
    # is the effective stopping criterion)
    num_simulations = 15000000  # much more than 8 hours
    width = 12
    max_depth = 50
    start_state = ["<s>"]
    c = 5

    # bookkeeping for generated molecules
    all_unique = {}
    all_valid = []
    num_valid = 0
    simulations = 0
# Example #9
# 0
    # report the single best sequence found by the MCTS search
    best = mcts.get_best_sequence()
    generated_text = ''.join(best[0])
    logger.info("best generated text: %s" % generated_text)
    decoded = DeepSMILESLanguageModelUtils.decode(generated_text,
                                                  start='<s>',
                                                  end='</s>')
    smiles = DeepSMILESLanguageModelUtils.sanitize(decoded)
    logger.info("best SMILES: %s, J: %s (%s seconds)" %
                (smiles, distance_scorer.score(smiles), str((end - start))))

    log_top_best(all_smiles, 5, logger)

    # write the top-scoring molecules to a corpus file, one space-separated
    # tokenized DeepSMILES string per line
    logger.info("writing dataset...")
    name = 'molexit-%d' % n
    dataset = '../models/molexit/%s.txt' % name
    with open(dataset, 'w') as f:
        # all_smiles maps SMILES -> tuple whose first element is the score;
        # rank by score descending and keep the top keep_top_n
        for smi in list(
                reversed(sorted(all_smiles.items(),
                                key=lambda kv: kv[1][0])))[:keep_top_n]:
            dsmi = smiles_to_deepsmiles(smi[0].strip())
            tok = DeepSMILESTokenizer(dsmi)
            tokens = tok.get_tokens()
            f.write(' '.join([t.value for t in tokens]))
            f.write("\n")

    # retrain a 6-gram LM on the new corpus and swap it in
    logger.info('training new LM...')
    lm_trainer.train(6, dataset, '../models/molexit', name)

    vocab = get_arpa_vocab('../models/molexit/%s.arpa' % name)
    lm = KenLMDeepSMILESLanguageModel('../models/molexit/%s.klm' % name, vocab)
from chemgrams import get_arpa_vocab, KenLMDeepSMILESLanguageModel, DeepSMILESLanguageModelUtils, \
    LanguageModelMCTSWithUCB1

from rdkit.RDLogger import logger
from rdkit import rdBase, Chem
# silence RDKit's per-molecule error/warning output
rdBase.DisableLog('rdApp.error')
rdBase.DisableLog('rdApp.warning')

# the imported `logger` name is a factory; calling it yields the RDKit logger
logger = logger()

if __name__ == '__main__':

    logger.info("loading language model...")

    # 6-gram KenLM model over DeepSMILES tokens from ChEMBL 25
    vocab = get_arpa_vocab(
        '../models/chembl_25_deepsmiles_klm_6gram_190413.arpa')
    lm = KenLMDeepSMILESLanguageModel(
        '../models/chembl_25_deepsmiles_klm_6gram_190413.klm', vocab)

    # MCTS (UCB1) search parameters
    num_simulations = 1000
    width = 3
    text_length = 25
    start_state = ["<s>"]

    # NOTE(review): truncated by the extraction -- the `try` below has no
    # matching `except` and the rest of the function is cut off. Preserved
    # byte-for-byte.
    def eval_function(text):
        generated = ''.join(text)
        try:
            decoded = DeepSMILESLanguageModelUtils.decode(generated,
                                                          start='<s>',
                                                          end='</s>')
            DeepSMILESLanguageModelUtils.sanitize(decoded)
# Example #11
# 0
from chemgrams import get_arpa_vocab, KenLMSELFIESLanguageModel

import selfies as sf
from openbabel import pybel

if __name__ == "__main__":

    # 6.655716427238491
    # 10-gram KenLM model over SELFIES tokens from the ChemTS 250k corpus
    vocab = get_arpa_vocab(
        '../resources/chemts_250k_selfies_klm_10gram_210908.arpa')
    lm = KenLMSELFIESLanguageModel(
        '../resources/chemts_250k_selfies_klm_10gram_210908.klm', vocab)

    def smiles_to_selfies(smiles):
        """Canonicalize a SMILES string with Open Babel and encode it as SELFIES."""
        canonical = pybel.readstring("smi", smiles).write("can").strip()
        return sf.encoder(canonical)

    with open("../resources/zinc12_enaminebb_smiles_corpus.txt") as f:
        all_smiles = [s.strip() for s in f.readlines()]

    # the sum of log10 probs of each sentence in the corpus
    sum_log_prob = 0.0

    # the total number of "words" (i.e. tokens) in the corpus
    M = 0

    # accumulate the corpus log-probability and token count; presumably
    # feeds a perplexity computation whose consumer is outside this view
    for smiles in all_smiles:
        s = smiles_to_selfies(smiles.strip())
        tokens = list(sf.split_selfies(s))
        M += len(tokens)
        sum_log_prob += lm.log_prob(' '.join(tokens))
# Example #12
# 0
# record the full experiment configuration in the log before starting
logger.info(
    "KenLMSELFIESLanguageModel('../resources/zinc12_fragments_selfies_klm_10gram_210908.klm', vocab)"
)
logger.info("width = 12, max_depth = 35, start_state = ['<s>'], c = 5")
logger.info(
    "score: -1.0 if invalid; -1.0 if seen previously; tanimoto distance from abilify if valid"
)
logger.info("LanguageModelMCTSWithPUCTTerminating")
logger.info("TanimotoScorer(abilify, radius=6)")
logger.info("num_iterations = 100")
logger.info("simulations_per_iteration = 50000")
logger.info("keep_top_n = 1500")

logger.info("loading language model...")

# 10-gram KenLM model over SELFIES tokens from ZINC12 fragments
vocab = get_arpa_vocab(
    '../resources/zinc12_fragments_selfies_klm_10gram_210908.arpa')
lm = KenLMSELFIESLanguageModel(
    '../resources/zinc12_fragments_selfies_klm_10gram_210908.klm', vocab)

# reference molecule for Tanimoto similarity scoring
abilify = "Clc4cccc(N3CCN(CCCCOc2ccc1c(NC(=O)CC1)c2)CC3)c4Cl"
scorer = TanimotoScorer(abilify, radius=6)

converter = Converter(rings=True, branches=True)
# KenLM's command-line tools must be on PATH for corpus retraining
env = os.environ.copy()
env["PATH"] = "/Users/luis/kenlm/build/bin:" + env["PATH"]
lm_trainer = KenLMTrainer(env)


def log_best(j, all_best, n_valid, lggr):
    """Write an iteration marker to `lggr` once every 1000 iterations."""
    if j % 1000 != 0:
        return
    lggr.info("--iteration: %d--" % j)