def __init__(self, num_iterations, keep_top_n, time_per_iteration_minutes,
             kenlm_bin_dir="/Users/luis/kenlm/build/bin"):
        """Store the run settings and build the KenLM trainer and converter.

        Args:
            num_iterations: Stored on the instance as ``num_iterations``.
            keep_top_n: Stored on the instance as ``keep_top_n``.
            time_per_iteration_minutes: Stored on the instance as
                ``time_per_iteration_minutes``.
            kenlm_bin_dir: Directory placed first on the ``PATH`` of the
                environment handed to ``KenLMTrainer``. Defaults to the
                location that was previously hard-coded.
        """
        self.num_iterations = num_iterations
        self.keep_top_n = keep_top_n
        self.time_per_iteration_minutes = time_per_iteration_minutes

        # No language model is attached at construction time.
        self.lm = None

        # Work on a copy so the process-wide environment is left untouched.
        env = os.environ.copy()
        inherited_path = env.get("PATH")
        if inherited_path is None:
            # PATH can be unset (e.g. a stripped-down service environment);
            # indexing env["PATH"] directly would raise KeyError.
            env["PATH"] = kenlm_bin_dir
        else:
            env["PATH"] = kenlm_bin_dir + ":" + inherited_path
        self.lm_trainer = KenLMTrainer(env)

        self.converter = Converter(rings=True, branches=True)
Ejemplo n.º 2
0
class DeepSmilesEncoder(FunctionApplier):
    """Function applier whose applied function is DeepSMILES encoding.

    The instance registers its own ``encode`` method with the
    ``FunctionApplier`` base class and delegates the actual conversion to a
    ``DeepSmilesConverter``.
    """

    def __init__(self, astype=Series, branches=True, rings=True):
        """Create the encoder.

        Args:
            astype: Accepted but not used by this class as written.
                NOTE(review): possibly meant to be forwarded to the base
                class -- confirm against ``FunctionApplier``.
            branches: Passed to the converter as its ``branches`` flag.
            rings: Passed to the converter as its ``rings`` flag.
        """
        super().__init__(self.encode)
        # Pass the flags by keyword. The previous positional call used the
        # order (branches, rings); if DeepSmilesConverter is deepsmiles'
        # Converter, whose signature is (rings, branches), that silently
        # swapped the two options whenever they differed.
        self.encoder = DeepSmilesConverter(rings=rings, branches=branches)

    def encode(self, smiles):
        """Return the converter's encoding of ``smiles``."""
        return self.encoder.encode(smiles)
Ejemplo n.º 3
0
# Write the run configuration to the log. These are fixed strings; nothing in
# this block derives them from the objects constructed below.
logger.info("TanimotoScorer(abilify, radius=6)")
logger.info("num_iterations = 100")
logger.info("simulations_per_iteration = 50000")
logger.info("keep_top_n = 5000")

logger.info("loading language model...")

# The vocabulary is read from the .arpa file and passed, together with the
# .klm file of the same base name, to the language-model constructor.
vocab = get_arpa_vocab(
    '../resources/zinc12_fragments_deepsmiles_klm_10gram_200502.arpa')
lm = KenLMDeepSMILESLanguageModel(
    '../resources/zinc12_fragments_deepsmiles_klm_10gram_200502.klm', vocab)

# SMILES string handed to the scorer below.
abilify = "Clc4cccc(N3CCN(CCCCOc2ccc1c(NC(=O)CC1)c2)CC3)c4Cl"
scorer = TanimotoScorer(abilify, radius=6)

converter = Converter(rings=True, branches=True)
# Copy the process environment and put a KenLM build directory first on PATH;
# the copy (not os.environ itself) is what the trainer receives.
# NOTE(review): the directory is a hard-coded, user-specific path, and
# env["PATH"] raises KeyError if PATH is unset.
env = os.environ.copy()
env["PATH"] = "/Users/luis/kenlm/build/bin:" + env["PATH"]
lm_trainer = KenLMTrainer(env)


def log_best(j, all_best, n_valid, lggr):
    """Emit a progress report on every 1000th iteration.

    When ``j`` is a multiple of 1000, the iteration number and the number of
    valid samples are logged through ``lggr.info`` and then
    ``log_top_best(all_best, 5, lggr)`` is called. For any other ``j`` the
    function does nothing.
    """
    if j % 1000 != 0:
        return

    # Format and emit each line in turn, keeping the original ordering of
    # formatting and logging calls.
    report_lines = (
        ("--iteration: %d--", j),
        ("num valid: %d", n_valid),
    )
    for template, value in report_lines:
        lggr.info(template % value)

    log_top_best(all_best, 5, lggr)


def smiles_to_deepsmiles(smiles):
    # NOTE(review): this definition is cut off in the visible excerpt; only
    # the canonicalisation step below is shown, so what the function returns
    # cannot be documented from here.
    # Parse the input with pybel as "smi", write it back out in "can" format,
    # and strip surrounding whitespace from the result.
    canonical = pybel.readstring(
        "smi", smiles).write("can").strip()  # TODO do we need to canonicalize?
Ejemplo n.º 4
0
 def __init__(self, astype=Series, branches=True, rings=True):
     """Register ``self.encode`` with the base class and build the converter.

     Args:
         astype: Accepted but not used in this method as written.
             NOTE(review): possibly meant for the base class -- confirm.
         branches: Passed to the converter as its ``branches`` flag.
         rings: Passed to the converter as its ``rings`` flag.
     """
     super().__init__(self.encode)
     # Pass the flags by keyword. The previous positional call used the
     # order (branches, rings); if DeepSmilesConverter is deepsmiles'
     # Converter, whose signature is (rings, branches), that silently
     # swapped the two options whenever they differed.
     self.encoder = DeepSmilesConverter(rings=rings, branches=branches)
Ejemplo n.º 5
0
from rdkit import Chem
from rdkit.Chem import AllChem
from deepsmiles import Converter
# Annotate each bracketed record of ms-nmr.txt with atom counts and
# functional-group flags (alcohol, ether, carbonyl).

# The SMARTS queries and the DeepSMILES converter do not depend on the input
# line, so they are built once here instead of once per record.
_ALIPHATIC_HYDROXYL = Chem.MolFromSmarts('[$(C[OX2H1])]')
_AROMATIC_HYDROXYL = Chem.MolFromSmarts('[$(c[OX2H1])]')
_AROMATIC_ETHER = Chem.MolFromSmarts('[o]([c])[c]')
_ALIPHATIC_ETHER = Chem.MolFromSmarts('[O]([C])[C]')
_CARBONYL = Chem.MolFromSmarts('[$([CX3]=[OX1]),$([CX3+]-[OX1-])]')
_deepsmiles_converter = Converter(rings=True, branches=True)

# Context managers guarantee both files are closed even if a record fails.
with open('ms-nmr.txt') as f, open('ms-m-ir-nmr.txt', 'w') as f1:
    for line in f:
        # Only lines containing a bracketed list are records.
        if '[' not in line:
            continue

        # The DeepSMILES string is whatever follows the first "],".
        smi = _deepsmiles_converter.decode(line.split('],')[1])
        mol = Chem.MolFromSmiles(smi)
        # Explicit hydrogens are needed for the 'H' atom count below.
        mol = AllChem.AddHs(mol)

        has_aromatic_hydroxyl = mol.HasSubstructMatch(_AROMATIC_HYDROXYL)
        print(smi, has_aromatic_hydroxyl)

        # Flags are 1 when either variant of the group matches, else 0.
        o = int(mol.HasSubstructMatch(_ALIPHATIC_HYDROXYL)
                or has_aromatic_hydroxyl)
        e = int(mol.HasSubstructMatch(_AROMATIC_ETHER)
                or mol.HasSubstructMatch(_ALIPHATIC_ETHER))
        carb = int(mol.HasSubstructMatch(_CARBONYL))

        symbols = [a.GetSymbol() for a in mol.GetAtoms()]
        cs = symbols.count('C')
        hs = symbols.count('H')
        # Deliberately not named `os`: that would rebind the module-level
        # name used for the standard-library module elsewhere in the file.
        o_count = symbols.count('O')

        # Text before the first ']' + the six new values + the text between
        # the first and second ']'.
        # NOTE(review): nothing in the visible code writes l1 to f1.
        l1 = str(line.split(']')[0]) + ', ' + str(
            [cs, hs, o_count, o, e, carb]).split('[')[1] + line.split(']')[1]
class ChemgramsGoalDirectedGenerator(GoalDirectedGenerator):
    """Goal-directed generator driven by a KenLM DeepSMILES language model.

    Each iteration samples from the current language model for a fixed time
    budget, scores every sample that decodes and sanitizes without raising,
    and then retrains the model on the best-scoring molecules collected so
    far.
    """

    def __init__(self, num_iterations, keep_top_n, time_per_iteration_minutes,
                 kenlm_bin_dir="/Users/luis/kenlm/build/bin"):
        """Store the run settings and build the KenLM trainer and converter.

        Args:
            num_iterations: Maximum number of sample/retrain iterations.
            keep_top_n: Number of best-scoring molecules written to each
                retraining dataset.
            time_per_iteration_minutes: Sampling budget per iteration, in
                minutes.
            kenlm_bin_dir: Directory placed first on the ``PATH`` of the
                environment handed to ``KenLMTrainer``. Defaults to the
                location that was previously hard-coded.
        """
        self.num_iterations = num_iterations
        self.keep_top_n = keep_top_n
        self.time_per_iteration_minutes = time_per_iteration_minutes

        # Set by generate_optimized_molecules() and replaced by retrain().
        self.lm = None

        # Work on a copy so the process-wide environment is left untouched.
        env = os.environ.copy()
        inherited_path = env.get("PATH")
        if inherited_path is None:
            # PATH can be unset; indexing env["PATH"] would raise KeyError.
            env["PATH"] = kenlm_bin_dir
        else:
            env["PATH"] = kenlm_bin_dir + ":" + inherited_path
        self.lm_trainer = KenLMTrainer(env)

        self.converter = Converter(rings=True, branches=True)

    @staticmethod
    def _top_scored(smiles_and_scores, limit):
        """Return up to ``limit`` (smiles, score) pairs, highest score first.

        The list is sorted ascending by score and then reversed, so among
        equal scores the most recently collected pair comes first.
        """
        ranked = sorted(smiles_and_scores, key=lambda pair: pair[1])
        ranked.reverse()
        return ranked[:limit]

    def generate_optimized_molecules(self,
                                     scoring_function,
                                     number_molecules,
                                     starting_population=None):
        """Sample, score and retrain, then return the best molecules found.

        Args:
            scoring_function: Object whose ``score(smiles)`` method is called
                on every sanitized sample.
            number_molecules: Maximum number of SMILES strings to return.
            starting_population: Not used by this implementation.

        Returns:
            Up to ``number_molecules`` SMILES strings, highest score first.
        """
        self.new_model_dir()

        vocab = get_arpa_vocab(
            '../resources/chembl_25_deepsmiles_klm_10gram_200503.arpa')
        self.lm = KenLMDeepSMILESLanguageModel(
            '../resources/chembl_25_deepsmiles_klm_10gram_200503.klm', vocab)

        print("generating %s samples..." % number_molecules)
        smiles_and_scores = []

        seconds_per_iteration = self.time_per_iteration_minutes * 60

        found = False
        for n in range(1, self.num_iterations + 1):
            print("iteration %s" % n)
            num_valid = 0

            # A monotonic clock keeps the budget immune to wall-clock jumps.
            deadline = time.monotonic() + seconds_per_iteration
            while time.monotonic() < deadline:
                try:
                    generated = self.lm.generate(num_chars=100,
                                                 text_seed='<s>')
                    decoded = DeepSMILESLanguageModelUtils.decode(generated,
                                                                  start='<s>',
                                                                  end='</s>')
                    smiles = DeepSMILESLanguageModelUtils.sanitize(decoded)
                    score = scoring_function.score(smiles)
                except Exception:
                    # Deliberate best effort: a sample that fails to
                    # generate, decode, sanitize or score is discarded and
                    # sampling carries on.
                    continue

                num_valid += 1
                smiles_and_scores.append((smiles, score))

                # A perfect score ends the whole search early.
                if score == 1.0:
                    found = True
                    break

            print("num valid: %s" % num_valid)

            if found:
                break

            self.retrain(n, self.keep_top_n, smiles_and_scores)

        return [
            smiles
            for smiles, _ in self._top_scored(smiles_and_scores,
                                              number_molecules)
        ]

    def new_model_dir(self):
        """Delete any existing ``../models/molexit/`` directory and recreate it."""
        print(
            "deleting any existing molexit directory, and creating a new one..."
        )
        path = Path("../models/molexit/")
        if path.is_dir():
            shutil.rmtree(path)
        path.mkdir(parents=True, exist_ok=True)

    def retrain(self, n, keep_top_n, smiles_and_scores):
        """Train a new language model on the best molecules collected so far.

        Writes the tokenized DeepSMILES of the top ``keep_top_n`` molecules to
        ``../models/molexit/molexit-<n>.txt``, trains a model from that file
        and installs it as ``self.lm``. If there is nothing to train on, the
        current model is kept.

        Args:
            n: Iteration number, used in the dataset and model file names.
            keep_top_n: Maximum number of molecules to write to the dataset.
            smiles_and_scores: List of ``(smiles, score)`` pairs.
        """
        best = self._top_scored(smiles_and_scores, keep_top_n)
        if not best:
            # An empty dataset used to crash in np.max() below.
            print("no scored molecules to train on; "
                  "keeping the current language model")
            return

        print("writing dataset...")
        name = 'molexit-%d' % n
        dataset = '../models/molexit/%s.txt' % name
        dataset_scores = []
        with open(dataset, 'w') as f:
            for smi, score in best:
                # Round-trip through pybel's "can" format before encoding.
                canonical = pybel.readstring(
                    "smi", smi.strip()).write("can").strip()
                dsmi = self.converter.encode(canonical)
                tokens = DeepSMILESTokenizer(dsmi).get_tokens()
                # One molecule per line, tokens separated by spaces.
                f.write(' '.join(t.value for t in tokens))
                f.write("\n")
                dataset_scores.append(score)

        print('dataset: size: %s, mean score: %s, max score: %s' %
              (len(dataset_scores), np.mean(dataset_scores),
               np.max(dataset_scores)))
        print('training new LM...')
        self.lm_trainer.train(10, dataset, '../models/molexit', name)

        vocab = get_arpa_vocab('../models/molexit/%s.arpa' % name)
        self.lm = KenLMDeepSMILESLanguageModel(
            '../models/molexit/%s.klm' % name, vocab)