def retrain(self, n, keep_top_n, smiles_and_scores):
        print("writing dataset...")
        name = 'molexit-%d' % n
        dataset = '../models/molexit/%s.txt' % name
        dataset_scores = []
        with open(dataset, 'w') as f:
            for smi, score in list(
                    reversed(sorted(smiles_and_scores,
                                    key=lambda p: p[1])))[:keep_top_n]:
                dsmi = self.converter.encode(
                    pybel.readstring("smi", smi.strip()).write("can").strip())
                tok = DeepSMILESTokenizer(dsmi)
                tokens = tok.get_tokens()
                f.write(' '.join([t.value for t in tokens]))
                f.write("\n")
                dataset_scores.append(score)

        print('dataset: size: %s, mean score: %s, max score: %s' %
              (len(dataset_scores), np.mean(dataset_scores),
               np.max(dataset_scores)))
        print('training new LM...')
        self.lm_trainer.train(10, dataset, '../models/molexit', name)

        vocab = get_arpa_vocab('../models/molexit/%s.arpa' % name)
        self.lm = KenLMDeepSMILESLanguageModel(
            '../models/molexit/%s.klm' % name, vocab)
Ejemplo n.º 2
0
    def eval_function(text):
        generated = ''.join(text)
        try:
            decoded = DeepSMILESLanguageModelUtils.decode(generated,
                                                          start='<s>',
                                                          end='</s>')
            DeepSMILESLanguageModelUtils.sanitize(decoded)
        except Exception:
            return 0
        extracted = DeepSMILESLanguageModelUtils.extract(generated,
                                                         start='<s>',
                                                         end='</s>')
        tokenized = DeepSMILESTokenizer(extracted)

        score = len(tokenized.get_tokens()) / (
            text_length - 1)  # provide more reward for longer text sequences

        logger.info("%s, %s" % (generated, str(score)))
        return score
Ejemplo n.º 3
0
    best = mcts.get_best_sequence()
    generated_text = ''.join(best[0])
    logger.info("best generated text: %s" % generated_text)
    decoded = DeepSMILESLanguageModelUtils.decode(generated_text,
                                                  start='<s>',
                                                  end='</s>')
    smiles = DeepSMILESLanguageModelUtils.sanitize(decoded)
    logger.info("best SMILES: %s, J: %s (%s seconds)" %
                (smiles, scorer.score(smiles), str((end - start))))

    log_top_best(all_smiles, 5, logger)

    logger.info("writing dataset...")
    name = 'molexit-%d' % n
    dataset = '../models/molexit/%s.txt' % name
    with open(dataset, 'w') as f:
        for smi in list(
                reversed(sorted(all_smiles.items(),
                                key=lambda kv: kv[1][0])))[:keep_top_n]:
            dsmi = smiles_to_deepsmiles(smi[0].strip())
            tok = DeepSMILESTokenizer(dsmi)
            tokens = tok.get_tokens()
            f.write(' '.join([t.value for t in tokens]))
            f.write("\n")

    logger.info('training new LM...')
    lm_trainer.train(10, dataset, '../models/molexit', name)

    vocab = get_arpa_vocab('../models/molexit/%s.arpa' % name)
    lm = KenLMDeepSMILESLanguageModel('../models/molexit/%s.klm' % name, vocab)