def predict_outcome(self, reaction, k=1):
    """
    Predict products of `reaction` and keep the top-k of them.

    The reaction's input string is passed to `self.directcorefinder`, and
    the returned bond predictions are handed to `self.directcandranker`.
    Every outcome with a non-empty "smiles" entry is turned into a Molecule
    (built from the first SMILES of that entry) whose synthesis is set to
    the reaction inputs.

    Params:
        reaction {Reaction} - reaction to predict products for; the kept
            products are also passed to `reaction.set_products`
        k {int} - how many top predictions to set and return
    Returns:
        {list[Molecule]} - at most k products, in the order of the outcomes
    Raises:
        RuntimeError - logged and re-raised if either predictor fails
    """
    reactants = reaction.get_input_str()
    try:
        (reactants, bond_preds, bond_scores, _att_score) = self.directcorefinder.predict(reactants)
        outcomes = self.directcandranker.predict(reactants, bond_preds, bond_scores)
    except RuntimeError as e:
        logging.error(f"Error occured in DirectCandRanker.predict: {e}")
        raise e

    products = []
    for candidate in outcomes:
        candidate_smiles = candidate["smiles"]
        if not candidate_smiles:  # some outcomes carry no SMILES; skip them
            continue
        product = Molecule(candidate_smiles[0])
        product.set_synthesis(reaction.inputs)
        products.append(product)

    # Truncate only after all candidates were converted
    top_products = products[:k]
    reaction.set_products(top_products)
    return top_products
 def _draw_node(self, node: Molecule):
     """
     Add `node` to the graph, labelled according to `self._draw_mode`.

     The node name is always the molecule's SMILES. In "smiles" and
     "formula" mode the label is the SMILES or the formula; in "plot" mode
     an image of the molecule is written to `self._sub_dir` and used in
     place of a text label. Any other mode only advances the node counter.

     :param node: molecule to add as a graph node
     """
     import os
     self._node_counter += 1
     mode = self._draw_mode

     if mode == "plot":
         # The running counter gives every image a distinct file name
         img_name = str(self._node_counter) + ".png"
         mol_img_path = os.path.join(self._sub_dir, img_name)
         visualize_mol(node, path=mol_img_path)
         self._dot.node(name=node.to_smiles(),
                        label="",
                        image=mol_img_path,
                        shape="plaintext")
         return

     if mode == "smiles":
         node_label = node.to_smiles()
     elif mode == "formula":
         node_label = node.to_formula()
     else:
         return
     self._dot.node(name=node.to_smiles(), label=node_label)
Exemple #3
0
    def _test_sas(self):
        """Run a short RandomExplorer optimization of the SA score and print the outcome."""
        def sas_func(mol):
            return calculateSAScore(Chem.MolFromSmiles(mol.smiles))

        print(sas_func(Molecule("CC")))
        seed_smiles = ["CC", "O=C=O", "C#N", "CCN(CC)CC", "CC(=O)O", "C1CCCCC1", "c1ccccc1"]
        test_pool = [Molecule(smiles) for smiles in seed_smiles]
        exp = RandomExplorer(sas_func, initial_pool=test_pool)

        print("Starting SA score optimization")
        t0 = time()
        exp.run(10)

        # Report timing, the final pool and the synthesis path of the best molecule
        print("Completed SA score optimization, time elapsed: %.3fs" % (time()-t0))
        print(exp.pool)
        best_mol = exp.get_best(1)[0]
        print(best_mol.get_synthesis_path())
def get_chembl(n_mols=None, as_mols=True, option='', max_size=1000):
    """
    Load molecules from the ChEMBL.txt file.

    NOTE: this function should be located
    in the same directory as data files.

    Arguments:
        n_mols {int or None} -- read at most this many lines from the start
            of the file; None reads the whole file
        as_mols {bool} -- whether to wrap SMILES into the Molecule class
        option {str} -- '' (no filter), 'small_qed' (qed < 0.6) or
            'large_qed' (qed >= 0.6)
        max_size {int} -- size of the subset drawn (without replacement,
            fixed seed 42) when at least this many entries were read
    Returns:
        {list} -- Molecules, or SMILES strings if as_mols is False. When
            fewer than max_size entries were read they are all returned
            as-is and `option` is not applied.
    Raises:
        ValueError -- if `option` is not a supported filter
    """
    from itertools import islice

    path = os.path.join(__location__, "ChEMBL.txt")
    with open(path, "r") as f:
        # islice stops at end of file, so a file shorter than n_mols never
        # contributes empty padding entries
        lines = f if n_mols is None else islice(f, n_mols)
        res = [line.strip() for line in lines]
    mols = [Molecule(smile) for smile in res] if as_mols else res
    if len(mols) < max_size:
        return mols

    # Fixed seed: the sampled subset is the same on every call
    gen = np.random.RandomState(42)
    mols = list(gen.choice(mols, max_size, replace=False))
    if option == '':
        return mols
    elif option == 'small_qed':
        qed_func = get_objective_by_name("qed")
        return [mol for mol in mols if qed_func(mol) < 0.6]
    elif option == 'large_qed':
        qed_func = get_objective_by_name("qed")
        return [mol for mol in mols if qed_func(mol) >= 0.6]
    else:
        raise ValueError(f"Dataset filter {option} not supported.")
Exemple #5
0
    def test_chembl(self):
        """
        Optimize a tabulated ChEMBL property with RandomExplorer and print a summary.

        Problem with fixed-prop testing:
        Almost all of the results (<10% for init_pool of 50) seem to be outside of the database,
        and even less for smaller pool. Hence cannot get its score for testing;
        setting them to zero leads to slow exploration.
        """
        smiles_all, dd = get_chembl_prop()

        # conv_enabled=False on purpose: loading with mol conversions takes 8 minutes
        pool_all = [Molecule(smiles, conv_enabled=False) for smiles in smiles_all]
        start_pool = list(np.random.choice(pool_all, size=100, replace=False))

        def func(mol):
            # Property value looked up by SMILES in the loaded table
            return dd[mol.smiles]

        def print_props(pool):
            props = [func(mol) for mol in pool]
            print("Props of pool", len(pool), np.min(props), np.mean(props), np.max(props))

        for pool in (pool_all, start_pool):
            print_props(pool)

        exp = RandomExplorer(lambda mol_list: func(mol_list[0]), initial_pool=start_pool)

        print("Starting ChEMBL score 1 optimization")
        t0 = time()
        exp.run(30)
        print("Completed ChEMBL score 1 optimization, time elapsed: %.3fs" % (time()-t0))

        best_mol = exp.get_best(1)[0]
        print(best_mol.get_synthesis_path())

        print("Best achieved score: %.3f" % func(best_mol))
        all_props = [func(mol) for mol in pool_all]
        print("Best possible score: %.3f" % np.max(all_props))
Exemple #6
0
 def get_min_score(syn):
     """
     Return the minimum SA score found in the synthesis graph `syn`.

     `syn` maps a molecule to either a string (that molecule is scored
     with `sa_score`) or a nested mapping of the same form (recursed
     into). Every entry is visited, so the result is the minimum over all
     scored molecules, or infinity if none was scored.
     """
     res = float('inf')
     for mol, syn_graph in syn.items():
         if isinstance(syn_graph, str):
             # Fold the leaf score into the running minimum and keep going,
             # so sibling entries are not skipped
             res = min(res, sa_score(Molecule(mol)))
         else:
             res = min(res, get_min_score(syn_graph))
     return res
Exemple #7
0
def get_graph_data_for_distance_computation(mol):
    """ Build the graph representation of a molecule.

    `mol` may be a Molecule or a string, which is first wrapped into a
    Molecule. Hydrogens are added to the RDKit molecule via `Chem.AddHs`
    before any data is extracted. The result is a Namespace holding the
    RDKit molecule, its adjacency matrix, bond index pairs and bond types,
    per-atom indices, atomic numbers, symbols and masses, the atom count,
    and the per-atom bond descriptions and bond-type counts.
    """
    if isinstance(mol, str):
        from mols.molecule import Molecule
        mol = Molecule(mol)
    rdk_mol = Chem.AddHs(mol.to_rdkit())
    adj_matrix = Chem.rdmolops.GetAdjacencyMatrix(rdk_mol)

    # Bonds as (begin atom index, end atom index) pairs, with matching types
    bonds = []
    for bond in rdk_mol.GetBonds():
        bonds.append((bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()))
    bond_types = [
        rdk_mol.GetBondBetweenAtoms(begin, end).GetBondType()
        for begin, end in bonds
    ]

    # Per-atom properties, all ordered by atom index
    num_atoms = len(rdk_mol.GetAtoms())
    atom_idxs = list(range(num_atoms))
    atoms = [rdk_mol.GetAtomWithIdx(idx) for idx in atom_idxs]
    atomic_numbers = [atom.GetAtomicNum() for atom in atoms]
    atomic_symbols = [atom.GetSymbol() for atom in atoms]
    atomic_masses = [atom.GetMass() for atom in atoms]

    bonds_of_each_atom = [
        get_neighbors_and_bond_types(idx, bonds, atomic_symbols, bond_types)
        for idx in atom_idxs
    ]
    bond_type_counts_of_each_atom = [
        get_bond_type_counts(atom_bonds) for atom_bonds in bonds_of_each_atom
    ]

    return Namespace(
        rdk_mol=rdk_mol,
        adj_matrix=adj_matrix,
        bonds=bonds,
        bond_types=bond_types,
        atom_idxs=atom_idxs,
        atomic_numbers=atomic_numbers,
        atomic_symbols=atomic_symbols,
        atomic_masses=atomic_masses,
        num_atoms=num_atoms,
        bonds_of_each_atom=bonds_of_each_atom,
        bond_type_counts_of_each_atom=bond_type_counts_of_each_atom,
    )
Exemple #8
0
def draw_molecule(mol: Molecule) -> PIL.Image.Image:
    """
    Render a single molecule as an image.
    :param mol: molecule to draw
    :return: the image `Draw.MolToImage` produces for the RDKit form of `mol`
    """
    rdk_mol = mol.to_rdkit()
    return Draw.MolToImage(rdk_mol)
def compute_synthesizability(exp_path):
    """
    Score the molecule reported in the `exp_log` file under `exp_path`.

    The log is scanned for lines containing 'Resulting molecule'; the third
    whitespace-separated token of such a line is taken as the SMILES. If
    several lines match, the last one is used.

    :param exp_path: directory that contains the `exp_log` file
    :return: value of the "sascore" objective for that molecule, or None
        if no molecule was found
    """
    sas = get_objective_by_name("sascore")
    log_path = os.path.join(exp_path, 'exp_log')
    mol = None
    with open(log_path, 'r') as f:
        for line in f:
            if 'Resulting molecule' not in line:
                continue
            mol = Molecule(smiles=line.split()[2])
    if not mol:
        return None
    return sas(mol)
Exemple #10
0
    def _test_len(self):
        """Run RandomExplorer for two steps with SMILES length as the objective and print the pool."""
        def dummy_func(mol):
            return len(mol.smiles)

        seed_smiles = ["CC", "O=C=O", "C#N", "CCN(CC)CC", "CC(=O)O", "C1CCCCC1", "c1ccccc1"]
        test_pool = [Molecule(smiles) for smiles in seed_smiles]
        exp = RandomExplorer(dummy_func, initial_pool=test_pool)

        print("Starting len of SMILES optimization")
        exp.run(2)

        # Show the pool after the run
        print(exp.pool)
Exemple #11
0
def draw_synthesis_path(target_smiles: str, synth_path: str,
                        out_path: str) -> None:
    """Draw the synthesis path and save to provided location.

    :param target_smiles: SMILES of the molecule being synthesized
    :param synth_path: either a dictionary of format SMILES m -> synpath of m,
        or the path of a pickle file holding such a dictionary; when empty
        or None, "./mols/best_molecule.pkl" is read
    :param out_path: where to save the resulting pdf
    """
    if isinstance(synth_path, dict):
        synpath = synth_path
    else:
        pickle_path = synth_path or "./mols/best_molecule.pkl"
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # this at files from a trusted source.
        with open(pickle_path, "rb") as f:
            synpath = pickle.load(f)
    synpath = smile_synpath_to_mols(Molecule(smiles=target_smiles), synpath)
    drawer = SynPathDrawer(synpath, "plot")
    drawer.render(out_path)
Exemple #12
0
def get_chembl(option='', max_size=1000, as_mols=True):
    """
    Return list of Molecules.
    NOTE: this function should be located
    in the same directory as data files.

    Arguments:
        option {str} -- either empty or of format '{small,large}_{objective name}'
        max_size {int} -- number of molecules to sample; if None or -1, returns all,
            else randomly samples a subset. Attention: the sampler is seeded
            with a fixed seed, so the subset will always be the same.
        as_mols {bool} -- whether to wrap SMILES into the Molecule class
    Returns:
        {list} -- the molecules; when the dataset has at most max_size
            entries they are all returned and `option` is not applied
    Raises:
        ValueError -- if `option` has an unsupported format
    """
    path = os.path.join(__location__, "ChEMBL.txt")
    with open(path, "r") as f:
        mols = [line.strip() for line in f]
    if as_mols:
        mols = [Molecule(smile) for smile in mols]

    # None is the documented "return everything" value; -1 is kept as an alias
    if max_size is None or max_size == -1:
        max_size = len(mols)
    if len(mols) <= max_size:
        return mols

    # TODO: this logic is off, if filtering afterwards,
    # we get less than max_size molecules in the end.
    # Fix this if needed.
    gen = np.random.RandomState(42)
    mols = list(gen.choice(mols, max_size, replace=False))
    if option == '':
        return mols
    elif option.startswith('small_'):
        # Split once so objective names containing '_' stay intact
        obj_name = option.split("_", 1)[1]
        obj_func = get_objective_by_name(obj_name)
        small_thresh = get_threshold(obj_name, mode='low')
        return [mol for mol in mols if obj_func(mol) < small_thresh]
    elif option.startswith('large_'):
        obj_name = option.split("_", 1)[1]
        obj_func = get_objective_by_name(obj_name)
        large_thresh = get_threshold(obj_name, mode='high')
        return [mol for mol in mols if obj_func(mol) >= large_thresh]
    else:
        raise ValueError(f"Dataset filter {option} not supported.")
Exemple #13
0
def compute_min_sa_score(mol):
    """ Compute the minimum sas score along the synthesis path of molecule.

    If the synthesis path is a dict, every entry whose value is a string is
    scored and nested dicts are recursed into; the minimum over all scored
    entries is returned. Otherwise the path itself is wrapped into a
    Molecule and scored.
    """
    sa_score = get_objective_by_name("sascore")

    def get_min_score(syn):
        res = float('inf')
        for node, syn_graph in syn.items():
            if isinstance(syn_graph, str):
                # Fold the leaf score into the running minimum and keep
                # going, so sibling entries are not skipped
                res = min(res, sa_score(Molecule(node)))
            else:
                res = min(res, get_min_score(syn_graph))
        return res

    synthesis_path = mol.get_synthesis_path()
    if isinstance(synthesis_path, dict):
        min_sa_score = get_min_score(synthesis_path)
    else:
        min_sa_score = sa_score(Molecule(synthesis_path))
    return min_sa_score
Exemple #14
0
    def __init__(self, mol: Molecule, draw_mode: str):
        """
        Set up a pdf graph for the synthesis path of `mol`.

        :param mol: the molecule to draw synthesis path for
        :param draw_mode: "smiles" | "formula" | "plot" way of plotting each single molecule

        Examples::

            >>> drawer = SynPathDrawer(root_mol, "smiles")  # or "formula" or "plot"
            >>> drawer.render("some_output_dir/some_file_name")  # please, no file extension
        """
        assert draw_mode in ["smiles", "formula", "plot"]
        from graphviz import Digraph
        graph_comment = "Synthesis path for {}".format(mol.to_smiles())

        self._mol = mol
        self._draw_mode = draw_mode
        # Bookkeeping: node counter starts at zero, no sub-directory yet
        self._node_counter = 0
        self._sub_dir = None
        self._dot = Digraph(comment=graph_comment, format="pdf")
def get_zinc250(option='', max_size=1000):
    """
    Return a list of Molecules read from the zinc250k.csv file.

    Arguments:
        option {str} -- '' (no filter), 'small_qed' (qed < 0.6) or
            'large_qed' (qed >= 0.6)
        max_size {int} -- size of the subset drawn (without replacement,
            fixed seed 42) when the dataset has at least this many entries
    Returns:
        {list[Molecule]} -- the molecules; when the dataset has fewer than
            max_size entries they are all returned and `option` is not applied
    Raises:
        ValueError -- if `option` is not a supported filter
    """
    path = os.path.join(__location__, "zinc250k.csv")
    zinc_df = pd.read_csv(path)
    # other columns are logP, qed, and sas
    list_of_smiles = [smile.strip() for smile in zinc_df.smiles.values]
    mols = [Molecule(smile) for smile in list_of_smiles]
    if len(mols) < max_size:
        return mols

    # Fixed seed: the sampled subset is the same on every call
    gen = np.random.RandomState(42)
    mols = list(gen.choice(mols, max_size, replace=False))
    if option == '':
        return mols
    elif option == 'small_qed':
        qed_func = get_objective_by_name("qed")
        return [mol for mol in mols if qed_func(mol) < 0.6]
    elif option == 'large_qed':
        qed_func = get_objective_by_name("qed")
        return [mol for mol in mols if qed_func(mol) >= 0.6]
    else:
        raise ValueError(f"Dataset filter {option} not supported.")
Exemple #16
0
def get_zinc250(option='', max_size=1000, as_mols=True):
    """
    Return list of Molecules.
    NOTE: this function should be located
    in the same directory as data files.

    Arguments:
        option {str} -- either empty or of format '{small,large}_{objective name}';
            'small' keeps objective values < 0.6, 'large' keeps values >= 0.6
        max_size {int} -- number of molecules to sample; if None or -1, returns all,
            else randomly samples a subset. Attention: the sampler is seeded
            with a fixed seed, so the subset will always be the same.
        as_mols {bool} -- whether to wrap SMILES into the Molecule class
    Returns:
        {list} -- the molecules; when the dataset has at most max_size
            entries they are all returned and `option` is not applied
    Raises:
        ValueError -- if `option` has an unsupported format
    """
    path = os.path.join(__location__, "zinc250k.csv")
    zinc_df = pd.read_csv(path)
    # other columns are logP, qed, and sas
    list_of_smiles = [smile.strip() for smile in zinc_df.smiles.values]
    if as_mols:
        mols = [Molecule(smile) for smile in list_of_smiles]
    else:
        mols = list_of_smiles

    # None is the documented "return everything" value; -1 is kept as an alias
    if max_size is None or max_size == -1:
        max_size = len(mols)
    if len(mols) <= max_size:
        return mols

    gen = np.random.RandomState(42)
    mols = list(gen.choice(mols, max_size, replace=False))
    if option == '':
        return mols
    elif option.startswith('small_'):
        # Split once so objective names containing '_' stay intact
        obj_func = get_objective_by_name(option.split("_", 1)[1])
        return [mol for mol in mols if obj_func(mol) < 0.6]
    elif option.startswith('large_'):
        obj_func = get_objective_by_name(option.split("_", 1)[1])
        return [mol for mol in mols if obj_func(mol) >= 0.6]
    else:
        raise ValueError(f"Dataset filter {option} not supported.")
        except RuntimeError as e:
            logging.error(f"Error occured in DirectCandRanker.predict: {e}")
            raise e

        res = []
        for out in outcomes:
            if out["smiles"]:  # may be empty for some reason?
                smiles = out["smiles"][0]
                mol = Molecule(smiles)
                mol.set_synthesis(reaction.inputs)
                res.append(mol)
            else:
                continue
        res = res[:k]

        # setting predicted products, if not already set:
        reaction.set_products(res)
        return res


if __name__=="__main__":
    # Manual run: predict the outcome of one reaction built from three
    # atom-mapped reactant SMILES.
    reactant_smiles = (
        "[CH3:26][c:27]1[cH:28][cH:29][cH:30][cH:31][cH:32]1",
        "[Cl:18][C:19](=[O:20])[O:21][C:22]([Cl:23])([Cl:24])[Cl:25]",
        "[NH2:1][c:2]1[cH:3][cH:4][c:5]([Br:17])[c:6]2[c:10]1[O:9][C:8]"
        "([CH3:11])([C:12](=[O:13])[O:14][CH2:15][CH3:16])[CH2:7]2",
    )
    reactants = [Molecule(smiles) for smiles in reactant_smiles]
    synthesizer = RexgenForwardSynthesizer()
    synthesizer.predict_outcome(Reaction(reactants))
Exemple #18
0
 def setUp(self):
     """Build the three molecules stored in `self.mols`."""
     fixture_smiles = (
         "Cc1ccccc1",
         "C1OC1",
         "CCOC(=O)C1=C[C@@H](OC(CC)CC)[C@H](NC(C)=O)[C@@H](N)C1",
     )
     self.mols = [Molecule(smiles) for smiles in fixture_smiles]
Exemple #19
0
    def members_are_equal(cls, point_1, point_2):
        """ Return True if both points have the same SMILES string.

            Technically, because SMILES are not unique,
            this may sometimes give false negatives.
            TODO: graph structure matching?
        """
        return point_1.to_smiles() == point_2.to_smiles()

    def __str__(self):
        """ Returns a string with the mol type and the constraint checker attributes. """
        checker = getattr(self, "constraint_checker", None)
        if checker is None:
            # No checker available: the attribute part stays empty
            cc_attrs = ""
        else:
            cc_attrs = {}
            for key in checker.constraint_names:
                cc_attrs[key] = getattr(checker, key)
        return 'Mol(%s):%s'%(self.mol_type, cc_attrs)


# Different constraint checker functions(Molecule -> bool) --------------------

def has_carbon(mol):
    """Return True if any atom of `mol` has the symbol 'C'.

    The atom symbols and the result are also printed.
    """
    rdk = mol.to_rdkit()
    n_atoms = len(rdk.GetAtoms())
    atomic_symbols = []
    for idx in range(n_atoms):
        atomic_symbols.append(rdk.GetAtomWithIdx(idx).GetSymbol())
    # Exact match on the symbol, so e.g. 'Cl' does not count as carbon
    mol_has_carbon = any(symbol == 'C' for symbol in atomic_symbols)
    print(atomic_symbols, mol_has_carbon)
    return mol_has_carbon


if __name__ == "__main__":
    # Manual check: has_carbon prints the atom symbols and its verdict
    has_carbon(Molecule("C=C1NC(N(C)C)=NC12CCN(CC(C)c1ccccc1)CC2"))

 def setUp(self):
     """Create the molecule stored in `self.mol`."""
     fixture_smiles = "CC"
     self.mol = Molecule(fixture_smiles)
# def draw_synthesis_path(mol):
#     def compute_depth(syn_path):
#         depth = 1
#         if not mol.begin_flag:
#             for inp, inp_syn_path in syn_path:
#                 inp_depth = compute_depth(inp_syn_path)
#                 depth = max(depth, inp_depth)
#         return depth
#
#     syn_path = mol.get_syn_path()
#     depth = compute_depth(syn_path)  # number of rows to allocate for plotting
#     imgs_per_row = []
#     min_shape = None
#
#     # traverse the synthesis path and append images to imgs_per_row
#     # each row should be concatenated: see
#     # https://stackoverflow.com/questions/30227466/combine-several-images-horizontally-with-python
#
#     # TODO
#
#     imgs_comb = np.vstack([np.asarray(img.resize(min_shape))
#                                     for img in imgs_per_row])
#     result_img = PIL.Image.fromarray(imgs_comb)
#     return result_img

if __name__ == "__main__":
    # Manual check: draw one molecule and write the image to disk
    test_img = draw_molecule(Molecule("CCCC"))
    test_img.save('./experiments/results/test.png')
 def _draw_edge(self, tail: Molecule, head: Molecule):
     """Add an edge from `tail` to `head`, using their SMILES as node names."""
     tail_name = tail.to_smiles()
     head_name = head.to_smiles()
     self._dot.edge(tail_name=tail_name, head_name=head_name)