Example #1
def get_model_path(model_name: str) -> os.PathLike:
    """
    Find correct path to the directory storing the model.

    A `model_name` can be provided in various formats:
    1. `dataset/date`, in which case that model is returned directly.
    2. `dataset`, in which case the latest model for the dataset is returned.
    3. a path to a local directory where the model info is stored.

    Args:
        model_name: name of the model, e.g. pubchem

    Returns:
        model_path: path storing the model
    """
    # download the model stored on Google Drive if not already downloaded
    if model_name.lower() in GOOGLE_DRIVE_MODEL_INFO:
        prefix = model_name.lower()

        # directory to store the model
        model_dir = to_path("bondnet_pretrained_model__").joinpath(prefix)

        # if it does not exist, download it
        if not model_dir.is_dir():
            date = GOOGLE_DRIVE_MODEL_INFO[prefix]["date"][-1]
            file_id = GOOGLE_DRIVE_MODEL_INFO[prefix]["file_id"][-1]
            download_model(file_id, date, model_dir)
        else:
            print(
                f"Found existing model files at {model_dir} for your requested model "
                f"`{model_name}`. We'll reuse it.")

        model_name = model_dir

    # model provided as a path
    if to_path(model_name).is_dir():
        model_path = to_path(model_name)

    # model stored with code
    else:
        model_name = model_name.strip().lower()
        if "/" in model_name:
            prefix, date = model_name.split("/")
            if date not in MODEL_INFO[prefix]["date"]:
                raise ValueError(
                    f"expect model date to be one of { MODEL_INFO[prefix]['date'] }, "
                    f"but got {date}.")
        else:
            prefix = model_name
            date = MODEL_INFO[prefix]["date"][-1]

        model_path = to_path(bondnet.__file__).parent.joinpath(
            "prediction", "pretrained", prefix, date)

    return model_path
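A minimal usage sketch for the lookup above; the names are illustrative and assume the corresponding entries exist in MODEL_INFO / GOOGLE_DRIVE_MODEL_INFO.

# hypothetical calls illustrating the three accepted `model_name` formats
path = get_model_path("bdncm/20200808")        # explicit dataset/date
path = get_model_path("pubchem")               # latest model for the dataset
path = get_model_path("/path/to/local/model")  # local directory holding the model files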
Example #2
def main(
    model_name="bdncm/20200808",
    sdf_file="~/Applications/db_access/mol_builder/struct_rxn_ntwk_rgrn_qc.sdf",
    label_file="~/Applications/db_access/mol_builder/label_rxn_ntwk_rgrn_qc.yaml",
    feature_file="~/Applications/db_access/mol_builder/feature_rxn_ntwk_rgrn_qc.yaml",
    error_file="~/Applications/db_access/mol_builder/post_analysis/evaluation_error.tsv",
    charge_file="~/Applications/db_access/mol_builder/post_analysis/charges.tsv",
):

    seed_torch()

    dataset = load_dataset(model_name, sdf_file, label_file, feature_file)

    # trainset, valset, testset = train_validation_test_split(
    #     dataset, validation=0.1, test=0.1
    # )

    trainset, valset, testset = train_validation_test_split_selected_bond_in_train(
        dataset,
        validation=0.1,
        test=0.1,
        selected_bond_type=(("H", "H"), ("H", "F"), ("F", "F")),
    )

    # data_loader = DataLoaderReactionNetwork(trainset, batch_size=100, shuffle=False)
    # data_loader = DataLoaderReactionNetwork(valset, batch_size=100, shuffle=False)
    data_loader = DataLoaderReactionNetwork(testset,
                                            batch_size=100,
                                            shuffle=False)

    model = load_model(model_name)

    # make predictions
    feature_names = ["atom", "bond", "global"]
    ids, targets, predictions, errors, species = evaluate(
        model, feature_names, data_loader)

    # sort by error
    ids, targets, predictions, errors, species = zip(*sorted(
        zip(ids, targets, predictions, errors, species), key=lambda x: x[3]))

    df = pd.DataFrame({
        "identifier": ids,
        "target": targets,
        "prediction": predictions,
        "error": errors,
        "species": species,
    })

    df.to_csv(to_path(error_file), sep="\t", index=False)

    # charges
    df = get_charges(label_file, feature_file)
    df.to_csv(to_path(charge_file), sep="\t", index=False)
Example #3
    def __init__(self,
                 molecule_file,
                 reaction_file,
                 charge_file=None,
                 format="sdf",
                 nprocs=None):
        self.molecule_file = to_path(molecule_file)
        self.reaction_file = to_path(reaction_file)
        self.charge_file = to_path(
            charge_file) if charge_file is not None else None
        self.format = format
        self.nprocs = nprocs
Example #4
def main(
    model_name="bdncm/20200808",
    sdf_file="~/Applications/db_access/mol_builder/struct_rxn_ntwk_rgrn_qc.sdf",
    label_file="~/Applications/db_access/mol_builder/label_rxn_ntwk_rgrn_qc.yaml",
    feature_file="~/Applications/db_access/mol_builder/feature_rxn_ntwk_rgrn_qc.yaml",
    feat_filename="~/Applications/db_access/mol_builder/post_analysis/feats.tsv",
    meta_filename="~/Applications/db_access/mol_builder/post_analysis/feats_metadata.tsv",
):

    seed_torch()

    dataset = load_dataset(model_name, sdf_file, label_file, feature_file)
    _, _, testset = train_validation_test_split(dataset,
                                                validation=0.1,
                                                test=0.1)
    data_loader = DataLoaderReactionNetwork(testset,
                                            batch_size=100,
                                            shuffle=False)
    # data_loader = DataLoaderReactionNetwork(dataset, batch_size=100, shuffle=False)

    model = load_model(model_name)

    # make predictions
    feature_names = ["atom", "bond", "global"]
    ids, targets, predictions, errors, species, features = evaluate(
        model, feature_names, data_loader, compute_features=True)
    df = pd.DataFrame(features)
    df.to_csv(to_path(feat_filename), sep="\t", header=False, index=False)

    # metadata
    charges = get_charges(label_file, feature_file)
    rct_charges = []
    prdt1_charges = []
    prdt2_charges = []
    for i in ids:
        c = charges[charges["identifier"] == i].to_dict("records")[0]
        rct_charges.append(c["charge"])
        prdt1_charges.append(c["product1 charge"])
        prdt2_charges.append(c["product2 charge"])

    df = pd.DataFrame({
        "identifier": ids,
        "target": targets,
        "prediction": predictions,
        "error": errors,
        "species": species,
        "reactant charge": rct_charges,
        "product1 charge": prdt1_charges,
        "product2 charge": prdt2_charges,
    })
    df.to_csv(to_path(meta_filename), sep="\t", index=False)
Example #5
def select_one_bond_break_reactions(filename, outname="one_bond_break.csv"):

    filename = to_path(filename)
    df = pd.read_csv(filename, header=0, index_col=0)

    reactions = read_dataset(filename)
    num_altered_bucket = bucket_rxns_by_num_altered_bonds(reactions)
    altered_type_bucket = bucket_rxns_by_altered_bond_types(
        num_altered_bucket[1], 1)

    rxn_id = []
    activation_e = []
    enthalpy = []
    broken_bond = []
    breaking = []

    for (rct_bond, prdt_bond), reactions in altered_type_bucket.items():
        for rxn in reactions:

            act_e = df["ea"][rxn.get_id()]
            eth_e = df["dh"][rxn.get_id()]

            # one bond breaking
            if len(rct_bond) == 1 and len(prdt_bond) == 0:
                bond = rct_bond[0]
                brk = True
            # one bond formation; treat it like bond breaking
            elif len(rct_bond) == 0 and len(prdt_bond) == 1:
                bond = prdt_bond[0]
                brk = False
            else:
                raise RuntimeError(
                    "encountered a reaction that is not a one-bond alteration reaction")

            rxn_id.append(rxn.get_id())
            activation_e.append(act_e)
            enthalpy.append(eth_e)
            broken_bond.append("-".join(bond))
            breaking.append(brk)

    df = pd.DataFrame({
        "id": rxn_id,
        "bond type": broken_bond,
        "activation energy": activation_e,
        "enthalpy": enthalpy,
        "breaking bond": breaking,
    })

    df.to_csv(to_path(outname))
Example #6
def write_sdf_csv_dataset(
    molecules,
    struct_file="struct_mols.sdf",
    label_file="label_mols.csv",
    feature_file="feature_mols.yaml",
    exclude_single_atom=True,
):
    """
    Write the molecular atomization free energy to file.
    Args:
        molecules:
        struct_file:
        label_file:
        feature_file:
        exclude_single_atom:

    Returns:

    """
    struct_file = to_path(struct_file)
    label_file = to_path(label_file)

    logger.info("Start writing dataset to files: {} and {}".format(
        struct_file, label_file))

    feats = []

    with open(struct_file, "w") as fx, open(label_file, "w") as fy:

        fy.write("mol_id,atomization_energy\n")

        i = 0
        for m in molecules:

            if exclude_single_atom and m.num_atoms == 1:
                logger.info("Excluding single atom molecule {}".format(
                    m.formula))
                continue

            sdf = m.write(name=m.id + "_index-" + str(i))
            fx.write(sdf)
            fy.write("{},{:.15g}\n".format(m.id, m.atomization_free_energy))

            feats.append(m.pack_features())
            i += 1

    # write feature file
    yaml_dump(feats, feature_file)
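A hedged usage sketch; `molecules` is assumed to be an iterable of molecule wrapper objects providing `num_atoms`, `id`, `formula`, `atomization_free_energy`, `write()`, and `pack_features()`, as the function above requires.

write_sdf_csv_dataset(
    molecules,
    struct_file="struct_mols.sdf",
    label_file="label_mols.csv",
    feature_file="feature_mols.yaml",
    exclude_single_atom=True,
)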
Example #7
    def write_results(self, predictions, filename="bed_result.yaml"):
        """
        Add the prediction value as the 'prediction' field of each reaction (given as
        a dict) in the label file.
        """
        labels = yaml_load(self.label_file)

        failed = []
        for d, p in zip(labels, predictions):
            if p is None:
                failed.append(str(d["id"]))
                d["prediction"] = p
            else:
                d["prediction"] = float(p)

        # if any failed
        if failed:
            msg = ", ".join(failed)
            print(
                f"These reactions failed when converting their molecules, and therefore "
                f"predictions for them are not made: {msg}")

        filename = to_path(filename) if filename is not None else filename
        if filename is not None:
            yaml_dump(labels, filename)
        else:
            print(labels)
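A short, hedged call sketch; `predictor` stands for an instance of the class owning this method, and `predictions` must be aligned with the reactions in its label file.

# predictions[i] corresponds to labels[i] in self.label_file; None marks a
# reaction whose molecules failed to convert
predictor.write_results(predictions, filename="bed_result.yaml")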
Example #8
def detect_bad_mols():
    struct_file = "~/Applications/db_access/mol_builder/struct.sdf"
    struct_file = to_path(struct_file)
    suppl = Chem.SDMolSupplier(struct_file, sanitize=True, removeHs=False)
    for i, mol in enumerate(suppl):
        if mol is None:
            print("bad mol:", i)
Example #9
def plot_zinc_mols(dirname="~/Documents/Dataset/ZINC_BDE"):

    dirname = to_path(dirname)

    filenames = dirname.glob("*.sdf")
    for i, fname in enumerate(filenames):
        m = Chem.MolFromMolFile(str(fname), sanitize=True, removeHs=False)
        if m is not None:
            m = rdkit_mol_to_wrapper_mol(m)
            identifier = fname.stem
            fname = to_path("~/Applications/db_access/zinc_bde/mol_png"
                            ).joinpath(identifier + ".png")
            m.draw(fname, show_atom_idx=True)
            fname = to_path("~/Applications/db_access/zinc_bde/mol_pdb"
                            ).joinpath(identifier + ".pdb")
            m.write(fname, format="pdb")
Example #10
    def __init__(
        self,
        molecule_file,
        charge_file=None,
        format="smiles",
        allowed_product_charges=[0],
        ring_bond=False,
    ):
        super(PredictionMultiReactant, self).__init__()

        self.molecule_file = to_path(molecule_file)
        self.charge_file = to_path(
            charge_file) if charge_file is not None else None
        self.format = format
        self.allowed_product_charges = allowed_product_charges
        self.ring_bond = ring_bond

        self.rxn_idx_to_mol_and_bond_map = None
Example #11
    def plot_molecules(self, prefix=Path.cwd()):
        """
        Plot molecules to .png and write .sdf and .pdb files.

        Args:
            prefix (Path): directory path for the created files.
        """

        prefix = to_path(prefix)
        if prefix.exists():
            if not prefix.is_dir():
                raise ValueError(
                    f"Expect `prefix` be a path to a directory, but got {prefix}"
                )
        else:
            create_directory(prefix, path_is_directory=True)
        for i in ["png", "sdf", "pdb"]:
            create_directory(prefix.joinpath(f"mol_{i}"),
                             path_is_directory=True)
            create_directory(prefix.joinpath(f"mol_{i}_id"),
                             path_is_directory=True)

        for m in self.molecules:

            fname1 = prefix.joinpath(
                "mol_png/{}_{}_{}_{}.png".format(
                    m.formula, m.charge, m.id,
                    str(m.free_energy).replace(".", "dot")), )
            m.draw(filename=fname1, show_atom_idx=True)
            fname2 = prefix.joinpath(
                "mol_png_id/{}_{}_{}_{}.png".format(
                    m.id, m.formula, m.charge,
                    str(m.free_energy).replace(".", "dot")), )
            shutil.copyfile(fname1, fname2)

            for ext in ["sdf", "pdb"]:
                fname1 = prefix.joinpath(
                    "mol_{}/{}_{}_{}_{}.{}".format(
                        ext,
                        m.formula,
                        m.charge,
                        m.id,
                        str(m.free_energy).replace(".", "dot"),
                        ext,
                    ), )
                m.write(fname1, format=ext)
                fname2 = prefix.joinpath(
                    "mol_{}_id/{}_{}_{}_{}.{}".format(
                        ext,
                        m.id,
                        m.formula,
                        m.charge,
                        str(m.free_energy).replace(".", "dot"),
                        ext,
                    ), )
                create_directory(fname2)
                shutil.copyfile(fname1, fname2)
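A hedged usage sketch, assuming `collection` is an instance of the class defining this method (i.e. it has the `molecules` attribute used above); the prefix directory is illustrative.

collection.plot_molecules(prefix="~/Applications/db_access/mol_builder/mol_figures")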
Example #12
    def __init__(
        self,
        grapher,
        molecules,
        labels,
        extra_features=None,
        feature_transformer=True,
        label_transformer=True,
        dtype="float32",
        state_dict_filename=None,
    ):

        if dtype not in ["float32", "float64"]:
            raise ValueError(f"`dtype {dtype}` should be `float32` or `float64`.")

        self.grapher = grapher
        self.molecules = (
            to_path(molecules) if isinstance(molecules, (str, Path)) else molecules
        )
        self.raw_labels = to_path(labels) if isinstance(labels, (str, Path)) else labels
        self.extra_features = (
            to_path(extra_features)
            if isinstance(extra_features, (str, Path))
            else extra_features
        )
        self.feature_transformer = feature_transformer
        self.label_transformer = label_transformer
        self.dtype = dtype
        self.state_dict_filename = state_dict_filename

        self.graphs = None
        self.labels = None
        self._feature_size = None
        self._feature_name = None
        self._feature_scaler_mean = None
        self._feature_scaler_std = None
        self._label_scaler_mean = None
        self._label_scaler_std = None
        self._species = None
        self._failed = None

        self._load()
Example #13
def get_sdfs(fname):
    structs = []
    with open(to_path(fname), "r") as f:
        for line in f:
            if "index" in line:
                body = line
            elif "$$$$" in line:
                structs.append(body)
            else:
                body += line
    return structs
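A small usage sketch (the filename is illustrative); the helper collects one block per molecule from an sdf file such as the one written by `write_sdf_csv_dataset` above, keying on the `index` tag in the title line and the `$$$$` record separator.

structs = get_sdfs("struct_mols.sdf")
print("{} molecule blocks read".format(len(structs)))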
Example #14
def read_sdf(filename):
    filename = to_path(filename)
    supp = Chem.SDMolSupplier(filename, sanitize=True, removeHs=False)
    all_mols = []
    for i, mol in enumerate(supp):
        if mol is None:
            print("bad mol:", i)
        else:
            all_mols.append(mol)
    print("{} molecules read from sdf file".format(len(all_mols)))
    return all_mols
Example #15
    def plot_via_umap_points(
        self,
        filename="embedding.pdf",
        metadata_key_as_color="prediction",
        categorical_color=False,
    ):
        if not categorical_color:
            umap_plot.points(
                to_path(filename),
                self.embedding,
                values=self.metadata[metadata_key_as_color],
                theme="viridis",
            )
        else:
            umap_plot.points(
                to_path(filename),
                self.embedding,
                labels=self.metadata[metadata_key_as_color],
                color_key_cmap="tab20",
            )
Example #16
def download_model(file_id, date, directory):
    """
    Download a shared tarball file of the model from Google Drive with `file_id`,
    untar it and move it to `directory`.

    For info on how to find the file_id, see
    https://medium.com/@acpanjan/download-google-drive-files-using-wget-3c2c025a8b99
    """

    with tempfile.TemporaryDirectory() as dirpath:

        fname = "pretrained_model.tar.gz"
        fname2 = fname.split(".")[0]
        fname = to_path(dirpath).joinpath(fname)
        fname2 = to_path(dirpath).joinpath(fname2)

        print(
            "Start downloading pretrained model from Google Drive; this may take a while."
        )
        download_file_from_google_drive(file_id, fname)

        if not tarfile.is_tarfile(fname):
            model_path = f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
            raise RuntimeError(
                f"Failed downloading model from Google Drive. You can try download the "
                f"model manually at: {model_path}, untar it, and pass the path to the "
                f"model to bondnet to use it; i.e. do something like: "
                f"$ bondnet --model <path_to_model> ...")

        tf = tarfile.open(fname, "r:gz")
        tf.extractall(fname2)

        # copy content to the given directory
        # note, we need to joinpath `date` because the file downloaded from Google Drive
        # extracts to a directory named after the date
        shutil.copytree(fname2.joinpath(date), to_path(directory))

        print(
            f"Finish downloading pretrained model; placed at: {to_path(directory)}"
        )
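A hedged sketch of calling the helper directly; the file id and date are placeholders, as in practice both are taken from `GOOGLE_DRIVE_MODEL_INFO` (see `get_model_path` above).

file_id = "<google-drive-file-id>"  # placeholder
date = "20200808"                   # the tarball extracts into a directory named after the date
download_model(file_id, date, to_path("bondnet_pretrained_model__").joinpath("bdncm"))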
Example #17
def plot_scatter(features, labels, filename):
    """
    Scatter plot for features and use labels as color.

    Args:
        features (list of 2D array)
        labels (list of 1D array)
        filename (str)
    """
    # plot
    all_marker = ["o", "D", "x", "<", "+"]

    # x and y range
    xmin = min([min(d[:, 0]) for d in features])
    xmax = max([max(d[:, 0]) for d in features])
    ymin = min([min(d[:, 1]) for d in features])
    ymax = max([max(d[:, 1]) for d in features])
    del_x = 0.1 * (xmax - xmin)
    del_y = 0.1 * (ymax - ymin)
    xmin -= del_x
    xmax += del_x
    ymin -= del_y
    ymax += del_y

    # ensure different class has the same color range
    cmin = min([min(i) for i in labels])
    cmax = max([max(i) for i in labels])

    for i, (data, color) in enumerate(zip(features, labels)):
        fig, ax = plt.subplots()
        print("Feature array {} num data points {}".format(i, len(data)))
        X = data[:, 0]
        Y = data[:, 1]
        sc = ax.scatter(
            X,
            Y,
            marker=all_marker[i],
            c=color,
            vmin=cmin,
            vmax=cmax,
            cmap=mpl.cm.viridis,
            ec=None,
        )
        fig.colorbar(sc)

        ax.set_xlim(xmin, xmax)
        ax.set_ylim(ymin, ymax)

        # use a separate variable so the base `filename` is not mutated across iterations
        fname = to_path(filename)
        fname = fname.parent.joinpath(fname.stem + f"_{i}" + fname.suffix)
        fig.savefig(fname, bbox_inches="tight")
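A self-contained, hedged example with random data, just to show the expected shapes: a list of 2D feature arrays and a matching list of 1D color-value arrays.

import numpy as np

features = [np.random.rand(50, 2), np.random.rand(80, 2)]  # two classes of 2D points
labels = [np.random.rand(50), np.random.rand(80)]          # per-point color values
plot_scatter(features, labels, "scatter.pdf")              # writes scatter_0.pdf and scatter_1.pdf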
Example #18
def main(
    model_name="mesd/20200808",
    sdf_file="/Users/mjwen/Applications/db_access/mol_builder/post_analysis/lbdc/struct.sdf",
    label_file="/Users/mjwen/Applications/db_access/mol_builder/post_analysis/lbdc/label.yaml",
    feature_file="/Users/mjwen/Applications/db_access/mol_builder/post_analysis/lbdc/feature.yaml",
    feat_meta_prefix=f"~/Applications/db_access/mol_builder/post_analysis/lbdc",
):

    seed_torch()

    dataset = load_dataset(model_name, sdf_file, label_file, feature_file)
    data_loader = DataLoaderReactionNetwork(dataset, batch_size=100, shuffle=False)

    model = load_model(model_name, pretrained=False)

    # make predictions
    feature_names = ["atom", "bond", "global"]
    ids, targets, predictions, broken_bonds, species, features = evaluate(
        model, feature_names, data_loader
    )

    # write to file
    for idx, ft in features.items():
        fname = to_path(feat_meta_prefix).joinpath(f"feats_layer{idx}.tsv")
        df = pd.DataFrame(ft)
        df.to_csv(fname, sep="\t", header=False, index=False)

    df = pd.DataFrame(
        {
            "identifier": ids,
            "target": targets,
            "prediction": predictions,
            "broken_bonds": broken_bonds,
            "species": species,
        }
    )
    fname = to_path(feat_meta_prefix).joinpath("feats_metadata.tsv")
    df.to_csv(fname, sep="\t", index=False)
Example #19
def nrel_plot_molecules(
    filename="~/Documents/Dataset/NREL_BDE/rdf_data_190531_n200.csv",
    plot_prefix="~/Applications/db_access/nrel_bde",
):
    reactions = read_nrel_bde_dataset(filename)
    molecules = get_molecules_from_reactions(reactions)

    unique_mols = {m.id: m for m in molecules}
    molecules = list(unique_mols.values())

    for m in molecules:

        fname = to_path(plot_prefix).joinpath(
            "mol_png/{}_{}_{}_{}.png".format(
                m.id, m.formula, m.charge,
                str(m.free_energy).replace(".", "dot")), )
        m.draw(fname, show_atom_idx=True)

        fname = to_path(plot_prefix).joinpath(
            "mol_pdb/{}_{}_{}_{}.pdb".format(
                m.id, m.formula, m.charge,
                str(m.free_energy).replace(".", "dot")), )
        m.write(fname, format="pdb")
Example #20
    def write_results(self, predictions, filename="result.csv"):
        """
        Append the predictions as the last column of the dataframe and write a csv file.
        """

        df = pd.read_csv(self.reaction_file, header=0, index_col=None)

        all_predictions = []
        all_failed = []
        p_idx = 0
        for i, (fail, reason) in enumerate(self.no_result_reason):
            row = df.iloc[i]
            str_row = ",".join([str(i) for i in row])

            # failed at conversion to mol wrapper stage
            if fail:
                logger.info(
                    f"Reaction {i} fails because molecule {reason} fails")
                all_failed.append(str_row)
                all_predictions.append(None)

            else:
                pred = predictions[p_idx]

                # failed at prediction stage
                if pred is None:
                    logger.info(
                        f"Reaction {i} fails because prediction cannot be made"
                    )
                    all_failed.append(str_row)

                all_predictions.append(pred)
                p_idx += 1

        # if any failed
        if all_failed:
            msg = "\n".join(all_failed)
            print(
                f"\n\nThese reactions failed either at converting smiles to internal "
                f"molecules or converting internal molecules to dgl graph, "
                f"and therefore predictions for them are not made (represented by "
                f"None in the output):\n{msg}\n\n."
                f"See the log file for more info.")

        df["energy"] = all_predictions
        filename = to_path(filename) if filename is not None else filename
        rst = df.to_csv(filename, index=False)
        if rst is not None:
            print(rst)
Example #21
    def plot_via_umap_interactive(
        self,
        filename="embedding.html",
        metadata_key_as_color="species",
        categorical_color=False,
    ):
        if not categorical_color:
            umap_plot.interactive(
                to_path(filename),
                self.model,
                values=self.metadata[metadata_key_as_color],
                hover_data=pd.DataFrame(self.metadata),
                theme="viridis",
                point_size=5,
            )
        else:
            umap_plot.interactive(
                to_path(filename),
                self.model,
                labels=self.metadata[metadata_key_as_color],
                hover_data=pd.DataFrame(self.metadata),
                color_key_cmap="tab20",
                point_size=5,
            )
Example #22
    def extract_one(extractor, s1=None, s2=None):
        results = extractor.group_by_reactant_charge_pair()
        for charge in [(-1, 0), (-1, 1), (0, 1)]:
            reactions = results[charge]
            energies = []
            for rxn1, rxn2 in reactions:
                e_diff = rxn2.get_free_energy() - rxn1.get_free_energy()
                energies.append(e_diff)

            outname = "~/Applications/db_access/mol_builder/reactant_e_diff/"
            outname += "reactant_e_diff_{}{}_{}_{}.pdf".format(s1, s2, *charge)
            create_directory(outname)

            outname = to_path(outname)
            plot_hist(energies, outname, s1, s2, *charge)
Example #23
    def write(self,
              filename=None,
              name=None,
              format="sdf",
              kekulize=True,
              v3000=True):
        """Write a molecule to file or as string using rdkit.

        Args:
            filename (str): name of the file to write the output. If None, return the
                output as string.
            name (str): name of a molecule. If `format` is sdf, this is the first
                line of the molecule block in the sdf.
            format (str): format of the molecule, supporting: sdf, pdb, and smi.
            kekulize (bool): whether to kekulize the mol if format is `sdf`
            v3000 (bool): whether to force v3000 form if format is `sdf`
        """
        if filename is not None:
            create_directory(filename)
            filename = str(to_path(filename))

        name = str(self.id) if name is None else name
        self.rdkit_mol.SetProp("_Name", name)

        if format == "sdf":
            if filename is None:
                sdf = Chem.MolToMolBlock(self.rdkit_mol,
                                         kekulize=kekulize,
                                         forceV3000=v3000)
                return sdf + "$$$$\n"
            else:
                return Chem.MolToMolFile(self.rdkit_mol,
                                         filename,
                                         kekulize=kekulize,
                                         forceV3000=v3000)
        elif format == "pdb":
            if filename is None:
                sdf = Chem.MolToPDBBlock(self.rdkit_mol)
                return sdf + "$$$$\n"
            else:
                return Chem.MolToPDBFile(self.rdkit_mol, filename)
        elif format == "smi":
            return Chem.MolToSmiles(self.rdkit_mol)
        else:
            raise ValueError(f"format {format} currently not supported")
Example #24
    def plot_atom_distance_histogram(self,
                                     bond=True,
                                     filename="atom_dist_hist.pdf",
                                     **kwargs):
        """
        Plot histogram of atom distances (bond lengths).

        Args:
            filename (Path): path to the created plot.
            bond (bool): If `True`, only consider bond length; otherwise consider all
                atom pair distances.
            kwargs: keyword args passed to matplotlib.pyplot.hist.
        """
        def plot_hist(data, filename, **kwargs):
            fig = plt.figure()
            ax = fig.gca()
            ax.hist(data, **kwargs)

            ax.set_xlabel("Atom distance length")
            ax.set_ylabel("counts")

            fig.savefig(filename, bbox_inches="tight")

        def get_distances(m, bond):
            if bond:
                dist = [
                    np.linalg.norm(m.coords[u] - m.coords[v])
                    for (u, v), _ in m.bonds
                ]
            else:
                dist = [
                    np.linalg.norm(m.coords[u] - m.coords[v])
                    for u, v in itertools.combinations(range(m.num_atoms), 2)
                ]
            return dist

        data = [get_distances(m, bond) for m in self.molecules]
        data = np.concatenate(data)

        print("\n\n### atom distance min={}, max={}".format(
            min(data), max(data)))

        filename = to_path(filename)
        plot_hist(data, filename, **kwargs)
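A hedged call sketch, assuming `collection` is an instance of the class defining this method; extra keyword arguments (here `bins`) are forwarded to `matplotlib.pyplot.hist`.

collection.plot_atom_distance_histogram(
    bond=True, filename="atom_dist_hist.pdf", bins=50)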
Example #25
    def draw_with_bond_note(self,
                            bond_note,
                            filename="mol.png",
                            show_atom_idx=True):
        """
        Draw molecule using rdkit and show bond annotation, e.g. bond energy.

        Args:
            bond_note (dict): {bond_index: note}. The note to show for the
                corresponding bond.
            filename (str): path to save the generated image. If `None` the
                molecule is returned and can be viewed in a Jupyter notebook.
        """
        m = self.draw(show_atom_idx=show_atom_idx)

        # set bond annotation
        highlight_bonds = []
        for bond, note in bond_note.items():
            if isinstance(note, (float, np.floating)):
                note = "{:.3g}".format(note)
            idx = m.GetBondBetweenAtoms(*bond).GetIdx()
            m.GetBondWithIdx(idx).SetProp("bondNote", note)
            highlight_bonds.append(idx)

        # set highlight color
        bond_colors = {
            b: (192 / 255, 192 / 255, 192 / 255)
            for b in highlight_bonds
        }

        d = rdMolDraw2D.MolDraw2DCairo(400, 300)

        # smaller font size
        d.SetFontSize(0.8 * d.FontSize())

        rdMolDraw2D.PrepareAndDrawMolecule(d,
                                           m,
                                           highlightBonds=highlight_bonds,
                                           highlightBondColors=bond_colors)
        d.FinishDrawing()

        create_directory(filename)
        with open(to_path(filename), "wb") as f:
            f.write(d.GetDrawingText())
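A hedged usage sketch, assuming `mol` is an instance of the wrapper class; the atom-index pairs and energy values in `bond_note` are illustrative.

bond_note = {(0, 1): 1.23, (1, 2): 0.98}  # {atom index pair: bond energy to display}
mol.draw_with_bond_note(bond_note, filename="mol_with_bde.png")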
Example #26
def check_all(filename="molecules.pkl", outname="molecules_qc.pkl"):

    mol_coll = MoleculeCollection.from_file(filename)

    print("Number of mols before any check:", len(mol_coll))

    mol_coll.filter_by_connectivity(exclude_species=["Li"])
    print("Number of mols after connectivity check:", len(mol_coll))

    mol_coll.filter_by_rdkit_sanitize()
    print("Number of mols after rdkit check:", len(mol_coll))

    mol_coll.filter_by_bond_species()
    print("Number of mols after bond species check:", len(mol_coll))

    mol_coll.filter_by_bond_length()
    print("Number of mols after bond length check:", len(mol_coll))

    mol_coll.filter_by_species(species=["P"])
    print("Number of mols after species check:", len(mol_coll))

    mol_coll.to_file(to_path(outname))
Example #27
    def draw(self, filename=None, show_atom_idx=False):
        """
        Draw the molecule.

        Args:
            filename (str): path to save the generated image. If `None` the
                molecule is returned and can be viewed in a Jupyter notebook.
        """
        m = copy.deepcopy(self.rdkit_mol)
        AllChem.Compute2DCoords(m)

        if show_atom_idx:
            for a in m.GetAtoms():
                a.SetAtomMapNum(a.GetIdx() + 1)
        # d.drawOptions().addAtomIndices = True

        if filename is None:
            return m
        else:
            create_directory(filename)
            filename = str(to_path(filename))
            Draw.MolToFile(m, filename)
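A hedged usage sketch, assuming `mol` is an instance of the wrapper class defining this method.

m = mol.draw()                                    # returns an rdkit mol for inline viewing
mol.draw(filename="mol.png", show_atom_idx=True)  # or write a .png to disk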
Example #28
def plot_molecules(filename, plot_prefix):
    reactions = read_dataset(filename)

    plot_prefix = to_path(plot_prefix)

    for rxn in reactions:
        molecules = rxn.reactants + rxn.products
        rxn_id = rxn.get_id()

        for i, m in enumerate(molecules):
            r_or_p = "rct" if i == 0 else "prdt"

            fname = plot_prefix.joinpath(
                "mol_png/{}_{}_{}_{}.png".format(rxn_id, r_or_p, m.formula,
                                                 m.charge), )
            # since the molecules have AtomMapNum set when reading smarts mol,
            # we set `show_atom_idx` to False to use the AtomMapNum there
            m.draw(fname, show_atom_idx=False)

            fname = plot_prefix.joinpath(
                "mol_pdb/{}_{}_{}_{}.pdb".format(rxn_id, r_or_p, m.formula,
                                                 m.charge), )
            m.write(fname, format="pdb")
Example #29
def predict_single_molecule(
    model_name,
    molecule,
    charge=0,
    ring_bond=False,
    write_result=False,
    figure_name="prediction.png",
    format=None,
):
    """
    Make predictions for a single molecule.

    Breaking a bond may result in products with different combinations of product
    charges; we report the smallest charge w.r.t. the product charge assignment.

    Args:
        model_name (str): The pre-trained model to use for making predictions. A model
            should be of the format `dataset/date`, e.g. `mesd/20200808`,
            `pubchem/20200521`. It is possible to provide only the `dataset` part,
            and in this case, the latest model will be used.
        molecule (str): SMILES or InChI string or a path to a file storing these string.
        charge (int): charge of the molecule.
        ring_bond (bool): whether to make predictions for ring bond.
        write_result (bool): whether to write the returned sdf to stdout.
        figure_name (str): the name of the figure to be created showing the bond energy.
        format (str): format of the molecule, if not provided, will guess based on the
            file extension.

    Returns:
        str: sdf string representing the molecules and energies.
    """

    model_path = get_model_path(model_name)
    model_info = get_model_info(model_path)
    allowed_charge = model_info["allowed_charge"]
    unit_converter = model_info["unit_conversion"]

    assert (charge in allowed_charge
            ), f"expect charge to be one of {allowed_charge}, but got {charge}"

    p = to_path(molecule)
    if p.is_file():
        if format is None:
            suffix = p.suffix.lower()
            if suffix == ".sdf":
                format = "sdf"
            elif suffix == ".pdb":
                format = "pdb"
            else:
                raise RuntimeError(
                    f"Expect file format `.sdf` or `.pdb`, but got {suffix}")
        with open(p, "r") as f:
            molecule = f.read().strip()
    else:
        if format is None:
            if molecule.lower().startswith("inchi="):
                format = "inchi"
            else:
                format = "smiles"

    predictor = PredictionOneReactant(molecule, charge, format, allowed_charge,
                                      ring_bond)

    molecules, labels, extra_features = predictor.prepare_data()
    predictions = get_prediction(model_path, unit_converter, molecules, labels,
                                 extra_features)

    return predictor.write_results(predictions, figure_name, write_result)
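A hedged usage sketch; the SMILES string is illustrative, and the call assumes charge 0 is among the chosen model's allowed charges.

sdf = predict_single_molecule("pubchem", "CCO", charge=0,
                              figure_name="prediction.png", write_result=True)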
Example #30
    mol_coll.filter_by_bond_species()
    print("Number of mols after bond species check:", len(mol_coll))

    mol_coll.filter_by_bond_length()
    print("Number of mols after bond length check:", len(mol_coll))

    mol_coll.filter_by_species(species=["P"])
    print("Number of mols after species check:", len(mol_coll))

    mol_coll.to_file(to_path(outname))


if __name__ == "__main__":

    working_dir = to_path("~/Applications/db_access/mol_builder/")

    num_entries = 500
    filename = working_dir.joinpath("molecules_n200.pkl")
    # num_entries = None
    # filename = working_dir.joinpath("molecules.pkl")
    pickle_molecules(num_entries=num_entries, outname=filename)

    #
    # clean molecules
    #
    filename = working_dir.joinpath("molecules_n200.pkl")
    outname = working_dir.joinpath("molecules_n200_qc.pkl")
    # filename = working_dir.joinpath("molecules.pkl")
    # outname = working_dir.joinpath("molecules_qc.pkl")
    check_all(filename, outname)