def get_model_path(model_name: str) -> os.PathLike:
    """
    Resolve `model_name` to the directory that stores the model.

    `model_name` is handled in this order:

    1. If its lower-cased form is a key of `GOOGLE_DRIVE_MODEL_INFO`, the model
       is looked up in the local directory `bondnet_pretrained_model__/<name>`;
       it is downloaded first when that directory does not exist.
    2. If it is a path to an existing directory, that path is returned.
    3. Otherwise it is treated as `dataset/date` or `dataset` (latest date in
       `MODEL_INFO`) and resolved relative to the installed `bondnet` package.

    Args:
        model_name: name of the model, e.g. pubchem

    Returns:
        model_path: path storing the model

    Raises:
        ValueError: if a `dataset/date` name carries a date not listed in
            `MODEL_INFO` for that dataset.
    """
    key = model_name.lower()
    if key in GOOGLE_DRIVE_MODEL_INFO:
        # local directory that holds (or will hold) the downloaded model
        cache_dir = to_path("bondnet_pretrained_model__").joinpath(key)

        if cache_dir.is_dir():
            print(
                f"Found existing model files at {cache_dir} for your requested model "
                f"`{model_name}`. We'll reuse it."
            )
        else:
            # the last entry of each list is the one that gets downloaded
            date = GOOGLE_DRIVE_MODEL_INFO[key]["date"][-1]
            file_id = GOOGLE_DRIVE_MODEL_INFO[key]["file_id"][-1]
            download_model(file_id, date, cache_dir)

        model_name = cache_dir

    # model provided as a path
    candidate = to_path(model_name)
    if candidate.is_dir():
        return candidate

    # model stored with code
    model_name = model_name.strip().lower()
    if "/" not in model_name:
        prefix = model_name
        date = MODEL_INFO[prefix]["date"][-1]
    else:
        prefix, date = model_name.split("/")
        if date not in MODEL_INFO[prefix]["date"]:
            raise ValueError(
                f"expect model date to be one of { MODEL_INFO[prefix]['date'] }, "
                f"but got {date}."
            )

    package_dir = to_path(bondnet.__file__).parent
    return package_dir.joinpath("prediction", "pretrained", prefix, date)
def main(
    model_name="bdncm/20200808",
    sdf_file="~/Applications/db_access/mol_builder/struct_rxn_ntwk_rgrn_qc.sdf",
    label_file="~/Applications/db_access/mol_builder/label_rxn_ntwk_rgrn_qc.yaml",
    feature_file="~/Applications/db_access/mol_builder/feature_rxn_ntwk_rgrn_qc.yaml",
    error_file="~/Applications/db_access/mol_builder/post_analysis/evaluation_error.tsv",
    charge_file="~/Applications/db_access/mol_builder/post_analysis/charges.tsv",
):
    """
    Evaluate a model on the test split and write the results to TSV files.

    The dataset is split with `train_validation_test_split_selected_bond_in_train`
    (10% validation, 10% test, with H-H, H-F and F-F as selected bond types);
    only the test split is evaluated. The evaluation records are sorted by the
    `errors` value and written to `error_file`; the table returned by
    `get_charges` is written to `charge_file`.

    Args:
        model_name: model identifier passed to `load_dataset` and `load_model`.
        sdf_file: structure file passed to `load_dataset`.
        label_file: label file passed to `load_dataset` and `get_charges`.
        feature_file: feature file passed to `load_dataset` and `get_charges`.
        error_file: output path of the tab-separated evaluation table.
        charge_file: output path of the tab-separated charge table.
    """
    seed_torch()

    dataset = load_dataset(model_name, sdf_file, label_file, feature_file)
    _, _, testset = train_validation_test_split_selected_bond_in_train(
        dataset,
        validation=0.1,
        test=0.1,
        selected_bond_type=(("H", "H"), ("H", "F"), ("F", "F")),
    )
    data_loader = DataLoaderReactionNetwork(testset, batch_size=100, shuffle=False)
    model = load_model(model_name)

    # make predictions
    feature_names = ["atom", "bond", "global"]
    ids, targets, predictions, errors, species = evaluate(
        model, feature_names, data_loader
    )

    # sort the records by error (4th field), then split them back into columns
    records = sorted(
        zip(ids, targets, predictions, errors, species), key=lambda rec: rec[3]
    )
    ids, targets, predictions, errors, species = zip(*records)

    error_table = pd.DataFrame(
        {
            "identifier": ids,
            "target": targets,
            "prediction": predictions,
            "error": errors,
            "species": species,
        }
    )
    error_table.to_csv(to_path(error_file), sep="\t", index=False)

    # charges
    charge_table = get_charges(label_file, feature_file)
    charge_table.to_csv(to_path(charge_file), sep="\t", index=False)
def __init__(self, molecule_file, reaction_file, charge_file=None, format="sdf", nprocs=None):
    """
    Store the input file locations and reading options.

    Args:
        molecule_file: path to the molecule file; converted with `to_path`.
        reaction_file: path to the reaction file; converted with `to_path`.
        charge_file: optional path to the charge file; kept as `None` when not
            given, otherwise converted with `to_path`.
        format: format identifier, stored as given.
        nprocs: stored as given.
    """
    self.molecule_file = to_path(molecule_file)
    self.reaction_file = to_path(reaction_file)

    if charge_file is None:
        self.charge_file = None
    else:
        self.charge_file = to_path(charge_file)

    self.format = format
    self.nprocs = nprocs
def main(
    model_name="bdncm/20200808",
    sdf_file="~/Applications/db_access/mol_builder/struct_rxn_ntwk_rgrn_qc.sdf",
    label_file="~/Applications/db_access/mol_builder/label_rxn_ntwk_rgrn_qc.yaml",
    feature_file="~/Applications/db_access/mol_builder/feature_rxn_ntwk_rgrn_qc.yaml",
    feat_filename="~/Applications/db_access/mol_builder/post_analysis/feats.tsv",
    meta_filename="~/Applications/db_access/mol_builder/post_analysis/feats_metadata.tsv",
):
    """
    Evaluate a model on the test split and export features plus metadata.

    The features returned by `evaluate(..., compute_features=True)` are written
    to `feat_filename` (tab-separated, no header, no index). A metadata table
    with one row per evaluated identifier, including the reactant and product
    charges looked up in the table from `get_charges`, is written to
    `meta_filename`.

    Args:
        model_name: model identifier passed to `load_dataset` and `load_model`.
        sdf_file: structure file passed to `load_dataset`.
        label_file: label file passed to `load_dataset` and `get_charges`.
        feature_file: feature file passed to `load_dataset` and `get_charges`.
        feat_filename: output path of the feature table.
        meta_filename: output path of the metadata table.
    """
    seed_torch()

    dataset = load_dataset(model_name, sdf_file, label_file, feature_file)
    _, _, testset = train_validation_test_split(dataset, validation=0.1, test=0.1)
    data_loader = DataLoaderReactionNetwork(testset, batch_size=100, shuffle=False)
    model = load_model(model_name)

    # make predictions
    feature_names = ["atom", "bond", "global"]
    ids, targets, predictions, errors, species, features = evaluate(
        model, feature_names, data_loader, compute_features=True
    )

    feature_table = pd.DataFrame(features)
    feature_table.to_csv(to_path(feat_filename), sep="\t", header=False, index=False)

    # metadata: first charge record matching each identifier
    charges = get_charges(label_file, feature_file)
    charge_records = [
        charges[charges["identifier"] == identifier].to_dict("records")[0]
        for identifier in ids
    ]
    rct_charges = [rec["charge"] for rec in charge_records]
    prdt1_charges = [rec["product1 charge"] for rec in charge_records]
    prdt2_charges = [rec["product2 charge"] for rec in charge_records]

    meta_table = pd.DataFrame(
        {
            "identifier": ids,
            "target": targets,
            "prediction": predictions,
            "error": errors,
            "species": species,
            "reactant charge": rct_charges,
            "product1 charge": prdt1_charges,
            "product2 charge": prdt2_charges,
        }
    )
    meta_table.to_csv(to_path(meta_filename), sep="\t", index=False)
def select_one_bond_break_reactions(filename, outname="one_bond_break.csv"):
    """
    Collect the reactions that alter exactly one bond and write them to csv.

    The `ea` and `dh` columns of the csv at `filename` (indexed by its first
    column) are looked up by reaction id. The output has one row per reaction
    with its id, the altered bond type (species joined by `-`), the two values
    above, and whether the bond is broken (`True`) or formed (`False`).

    Args:
        filename: path to the input csv; also passed to `read_dataset`.
        outname: path of the csv file to write.

    Raises:
        RuntimeError: if a reaction is neither a single bond breaking nor a
            single bond formation.
    """
    filename = to_path(filename)
    energies = pd.read_csv(filename, header=0, index_col=0)

    reactions = read_dataset(filename)
    by_num_altered = bucket_rxns_by_num_altered_bonds(reactions)
    by_bond_type = bucket_rxns_by_altered_bond_types(by_num_altered[1], 1)

    columns = {
        "id": [],
        "bond type": [],
        "activation energy": [],
        "enthalpy": [],
        "breaking bond": [],
    }

    for (rct_bond, prdt_bond), bucket in by_bond_type.items():
        n_rct, n_prdt = len(rct_bond), len(prdt_bond)
        for rxn in bucket:
            rid = rxn.get_id()
            act_e = energies["ea"][rid]
            eth_e = energies["dh"][rid]

            if n_rct == 1 and n_prdt == 0:
                # one bond breaking
                bond, is_breaking = rct_bond[0], True
            elif n_rct == 0 and n_prdt == 1:
                # one bond formation, we make it like bond breaking
                bond, is_breaking = prdt_bond[0], False
            else:
                raise RuntimeError("not one bond alternation reaction encountered")

            columns["id"].append(rid)
            columns["bond type"].append("-".join(bond))
            columns["activation energy"].append(act_e)
            columns["enthalpy"].append(eth_e)
            columns["breaking bond"].append(is_breaking)

    pd.DataFrame(columns).to_csv(to_path(outname))
def write_sdf_csv_dataset(
    molecules,
    struct_file="struct_mols.sdf",
    label_file="label_mols.csv",
    feature_file="feature_mols.yaml",
    exclude_single_atom=True,
):
    """
    Write the molecular atomization free energy to file.

    Three files are produced: the structures (`struct_file`), a csv with a
    `mol_id,atomization_energy` header and one row per written molecule
    (`label_file`), and the packed features of the written molecules
    (`feature_file`). All three paths are normalized with `to_path`.

    Args:
        molecules: iterable of molecule objects providing `num_atoms`,
            `formula`, `id`, `atomization_free_energy`, `write()` and
            `pack_features()`.
        struct_file: path of the structure file to create.
        label_file: path of the csv label file to create.
        feature_file: path of the yaml feature file to create.
        exclude_single_atom: if `True`, molecules with exactly one atom are
            skipped (and logged).

    Returns:
        None
    """
    struct_file = to_path(struct_file)
    label_file = to_path(label_file)
    # normalize like the other two outputs so the same path conventions apply
    feature_file = to_path(feature_file)

    logger.info(
        "Start writing dataset to files: {} and {}".format(struct_file, label_file)
    )

    feats = []
    with open(struct_file, "w") as fx, open(label_file, "w") as fy:
        fy.write("mol_id,atomization_energy\n")

        # index counts only the molecules actually written, so it matches the
        # row position in the label and feature files
        i = 0
        for m in molecules:
            if exclude_single_atom and m.num_atoms == 1:
                logger.info("Excluding single atom molecule {}".format(m.formula))
                continue

            sdf = m.write(name=m.id + "_index-" + str(i))
            fx.write(sdf)
            fy.write("{},{:.15g}\n".format(m.id, m.atomization_free_energy))
            feats.append(m.pack_features())
            i += 1

    # write feature file
    yaml_dump(feats, feature_file)
def write_results(self, predictions, filename="bed_result.yaml"):
    """
    Attach each prediction to its reaction entry from the label file.

    The entries loaded from `self.label_file` are paired with `predictions` in
    order; each entry gets a `prediction` key holding the value as a float, or
    `None` when no prediction is available. The ids of entries without a
    prediction are reported on stdout.

    Args:
        predictions: sequence of predicted values, `None` for failed ones.
        filename: path of the yaml file to write. If `None`, the updated
            entries are printed instead.
    """
    labels = yaml_load(self.label_file)

    failed = []
    for entry, value in zip(labels, predictions):
        if value is not None:
            entry["prediction"] = float(value)
            continue
        failed.append(str(entry["id"]))
        entry["prediction"] = value

    # if any failed
    if failed:
        msg = ", ".join(failed)
        print(
            f"These reactions failed when converting their molecules, and therefore "
            f"predictions for them are not made: {msg}"
        )

    if filename is None:
        print(labels)
        return

    yaml_dump(labels, to_path(filename))
def detect_bad_mols():
    """
    Print the index of every molecule in the structure file that RDKit cannot
    read (i.e. for which the supplier yields `None`).
    """
    struct_file = to_path("~/Applications/db_access/mol_builder/struct.sdf")

    # RDKit file APIs take the file name as a string (as the other RDKit calls
    # in this file do), so convert the path object explicitly.
    suppl = Chem.SDMolSupplier(str(struct_file), sanitize=True, removeHs=False)

    for i, mol in enumerate(suppl):
        if mol is None:
            print("bad mol:", i)
def plot_zinc_mols(dirname="~/Documents/Dataset/ZINC_BDE"):
    """
    Draw a png and write a pdb file for every readable `.sdf` file in `dirname`.

    Files that RDKit fails to read are skipped. Outputs are named after the
    stem of the source file and placed under
    `~/Applications/db_access/zinc_bde/mol_png` and `.../mol_pdb`.

    Args:
        dirname: directory searched (non-recursively) for `*.sdf` files.
    """
    png_dir = "~/Applications/db_access/zinc_bde/mol_png"
    pdb_dir = "~/Applications/db_access/zinc_bde/mol_pdb"

    for sdf_path in to_path(dirname).glob("*.sdf"):
        rdkit_mol = Chem.MolFromMolFile(str(sdf_path), sanitize=True, removeHs=False)
        if rdkit_mol is None:
            continue

        mol = rdkit_mol_to_wrapper_mol(rdkit_mol)
        identifier = sdf_path.stem

        png_file = to_path(png_dir).joinpath(identifier + ".png")
        mol.draw(png_file, show_atom_idx=True)

        pdb_file = to_path(pdb_dir).joinpath(identifier + ".pdb")
        mol.write(pdb_file, format="pdb")
def __init__(
    self,
    molecule_file,
    charge_file=None,
    format="smiles",
    allowed_product_charges=(0,),
    ring_bond=False,
):
    """
    Store the input file locations and prediction options.

    Args:
        molecule_file: path to the molecule file; converted with `to_path`.
        charge_file: optional path to the charge file; kept as `None` when not
            given, otherwise converted with `to_path`.
        format: format identifier of the molecules, stored as given.
        allowed_product_charges: charges allowed for the products. The values
            are copied into a new list so that instances never share (or
            mutate) the default or the caller's object. `None` is stored as
            `None`.
        ring_bond: stored as given.
    """
    super(PredictionMultiReactant, self).__init__()

    self.molecule_file = to_path(molecule_file)
    self.charge_file = to_path(charge_file) if charge_file is not None else None
    self.format = format

    # Defensive copy: a mutable default (or caller-owned list) stored directly
    # would be shared between all instances created with it.
    if allowed_product_charges is None:
        self.allowed_product_charges = None
    else:
        self.allowed_product_charges = list(allowed_product_charges)

    self.ring_bond = ring_bond
    self.rxn_idx_to_mol_and_bond_map = None
def plot_molecules(self, prefix=None):
    """
    Plot molecules to .png and write .sdf and .pdb files.

    For each of `png`, `sdf` and `pdb`, two directories are created under
    `prefix`: `mol_<kind>` with files named `<formula>_<charge>_<id>_<energy>`
    and `mol_<kind>_id` with copies named `<id>_<formula>_<charge>_<energy>`.
    `<energy>` is the molecule free energy with `.` replaced by `dot`.

    Args:
        prefix (Path): directory path for the created files. Defaults to the
            current working directory at call time.

    Raises:
        ValueError: if `prefix` exists but is not a directory.
    """
    # Resolve the default here; a `Path.cwd()` default in the signature would be
    # frozen to the working directory at the time the module was imported.
    prefix = to_path(Path.cwd() if prefix is None else prefix)

    if prefix.exists():
        if not prefix.is_dir():
            raise ValueError(
                f"Expect `prefix` be a path to a directory, but got {prefix}"
            )
    else:
        create_directory(prefix, path_is_directory=True)

    for kind in ["png", "sdf", "pdb"]:
        create_directory(prefix.joinpath(f"mol_{kind}"), path_is_directory=True)
        create_directory(prefix.joinpath(f"mol_{kind}_id"), path_is_directory=True)

    for m in self.molecules:
        energy = str(m.free_energy).replace(".", "dot")
        by_formula = f"{m.formula}_{m.charge}_{m.id}_{energy}"
        by_id = f"{m.id}_{m.formula}_{m.charge}_{energy}"

        png_file = prefix.joinpath(f"mol_png/{by_formula}.png")
        m.draw(filename=png_file, show_atom_idx=True)
        shutil.copyfile(png_file, prefix.joinpath(f"mol_png_id/{by_id}.png"))

        for ext in ["sdf", "pdb"]:
            mol_file = prefix.joinpath(f"mol_{ext}/{by_formula}.{ext}")
            m.write(mol_file, format=ext)
            # the `mol_<ext>_id` directory already exists (created above)
            shutil.copyfile(mol_file, prefix.joinpath(f"mol_{ext}_id/{by_id}.{ext}"))
def __init__(
    self,
    grapher,
    molecules,
    labels,
    extra_features=None,
    feature_transformer=True,
    label_transformer=True,
    dtype="float32",
    state_dict_filename=None,
):
    """
    Store the dataset configuration and load the data via `self._load()`.

    Args:
        grapher: stored as given.
        molecules: molecules, or a `str`/`Path` pointing to them (converted
            with `to_path`).
        labels: labels, or a `str`/`Path` pointing to them (converted with
            `to_path`); stored as `raw_labels`.
        extra_features: optional extra features, or a `str`/`Path` pointing to
            them (converted with `to_path`).
        feature_transformer: stored as given.
        label_transformer: stored as given.
        dtype: `float32` or `float64`.
        state_dict_filename: stored as given.

    Raises:
        ValueError: if `dtype` is neither `float32` nor `float64`.
    """
    if dtype not in ["float32", "float64"]:
        raise ValueError(f"`dtype {dtype}` should be `float32` or `float64`.")

    def _normalize(source):
        # only path-like inputs are converted; in-memory data passes through
        if isinstance(source, (str, Path)):
            return to_path(source)
        return source

    self.grapher = grapher
    self.molecules = _normalize(molecules)
    self.raw_labels = _normalize(labels)
    self.extra_features = _normalize(extra_features)
    self.feature_transformer = feature_transformer
    self.label_transformer = label_transformer
    self.dtype = dtype
    self.state_dict_filename = state_dict_filename

    # placeholders, all `None` before `_load()` runs
    self.graphs = None
    self.labels = None
    self._feature_size = None
    self._feature_name = None
    self._feature_scaler_mean = None
    self._feature_scaler_std = None
    self._label_scaler_mean = None
    self._label_scaler_std = None
    self._species = None
    self._failed = None

    self._load()
def get_sdfs(fname):
    """
    Split a multi-molecule sdf file into one text block per molecule.

    A line containing `index` starts a new block; a line containing `$$$$`
    closes the current block (the `$$$$` line itself is not included); any
    other line is appended to the current block.

    Args:
        fname: path to the sdf file.

    Returns:
        list of str: the text blocks, in file order.
    """
    # NOTE(review): the first line must contain `index`; otherwise the current
    # block is not defined yet when it is first used.
    blocks = []
    with open(to_path(fname), "r") as handle:
        for text in handle:
            if "index" in text:
                current = text
                continue
            if "$$$$" in text:
                blocks.append(current)
                continue
            current += text
    return blocks
def read_sdf(filename):
    """
    Read all molecules from an sdf file with RDKit.

    Entries RDKit fails to read are reported on stdout with their index and
    left out of the result.

    Args:
        filename: path to the sdf file.

    Returns:
        list: the RDKit molecules that were read successfully, in file order.
    """
    filename = to_path(filename)

    # RDKit file APIs take the file name as a string (as the other RDKit calls
    # in this file do), so convert the path object explicitly.
    supp = Chem.SDMolSupplier(str(filename), sanitize=True, removeHs=False)

    all_mols = []
    for i, mol in enumerate(supp):
        if mol is None:
            print("bad mol:", i)
        else:
            all_mols.append(mol)

    print("{} molecules read from sdf file".format(len(all_mols)))

    return all_mols
def plot_via_umap_points(
    self,
    filename="embedding.pdf",
    metadata_key_as_color="prediction",
    categorical_color=False,
):
    """
    Call `umap_plot.points` on `self.embedding`, colored by a metadata column.

    Args:
        filename: path forwarded (after `to_path`) as the first argument.
        metadata_key_as_color: key of `self.metadata` used for coloring.
        categorical_color: if `True`, pass the column as `labels` with the
            `tab20` color key map; otherwise pass it as `values` with the
            `viridis` theme.
    """
    coloring = self.metadata[metadata_key_as_color]

    if categorical_color:
        options = {"labels": coloring, "color_key_cmap": "tab20"}
    else:
        options = {"values": coloring, "theme": "viridis"}

    umap_plot.points(to_path(filename), self.embedding, **options)
def download_model(file_id, date, directory):
    """
    Download a shared tarball file of the model from Google Drive with `file_id`,
    untar it and move it to `directory`.

    For info on how to find the file_id, see
    https://medium.com/@acpanjan/download-google-drive-files-using-wget-3c2c025a8b99

    Args:
        file_id: Google Drive id of the shared tarball.
        date: name of the top-level directory inside the tarball whose content
            is copied to `directory`.
        directory: destination directory of the model files.

    Raises:
        RuntimeError: if the downloaded file is not a tar archive.
    """
    with tempfile.TemporaryDirectory() as dirpath:
        workdir = to_path(dirpath)
        archive = workdir.joinpath("pretrained_model.tar.gz")
        extracted = workdir.joinpath("pretrained_model")

        print(
            "Start downloading pretrained model from Google Drive; this may take a while."
        )
        download_file_from_google_drive(file_id, archive)

        if not tarfile.is_tarfile(archive):
            model_path = f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
            raise RuntimeError(
                f"Failed downloading model from Google Drive. You can try download the "
                f"model manually at: {model_path}, untar it, and pass the path to the "
                f"model to bondnet to use it; i.e. do something like: "
                f"$ bondnet --model <path_to_model> ..."
            )

        # Use a context manager so the archive handle is always closed.
        with tarfile.open(archive, "r:gz") as tf:
            if hasattr(tarfile, "data_filter"):
                # The archive comes from the network: refuse members that would
                # escape the extraction directory (available on Pythons that
                # ship tarfile extraction filters).
                tf.extractall(extracted, filter="data")
            else:
                tf.extractall(extracted)

        # copy content to the given directory
        # note, we need to joinpath date because the download file from Google Drive
        # with extract to a directory named date
        shutil.copytree(extracted.joinpath(date), to_path(directory))

        print(f"Finish downloading pretrained model; placed at: {to_path(directory)}")
def plot_scatter(features, labels, filename):
    """
    Scatter plot for features and use labels as color.

    One figure is created per feature array and saved next to `filename` as
    `<stem>_<i><suffix>`, where `i` is the position of the array in `features`.
    All figures share the same axis limits (data range padded by 10%) and the
    same color range.

    Args:
        features (list of 2D array): column 0 is plotted as x, column 1 as y.
        labels (list of 1D array): color values, one array per feature array.
        filename (str): base path of the created figures.
    """
    all_marker = ["o", "D", "x", "<", "+"]

    # x and y range
    xmin = min([min(d[:, 0]) for d in features])
    xmax = max([max(d[:, 0]) for d in features])
    ymin = min([min(d[:, 1]) for d in features])
    ymax = max([max(d[:, 1]) for d in features])
    del_x = 0.1 * (xmax - xmin)
    del_y = 0.1 * (ymax - ymin)
    xmin -= del_x
    xmax += del_x
    ymin -= del_y
    ymax += del_y

    # ensure different class has the same color range
    cmin = min([min(i) for i in labels])
    cmax = max([max(i) for i in labels])

    # Resolve the base name once. Deriving each output name from the previous
    # one would accumulate suffixes (`x_0`, `x_0_1`, ...).
    base = to_path(filename)

    for i, (data, color) in enumerate(zip(features, labels)):
        fig, ax = plt.subplots()
        print("Feature array {} num data points {}".format(i, len(data)))
        X = data[:, 0]
        Y = data[:, 1]
        sc = ax.scatter(
            X,
            Y,
            # cycle through the markers when there are more arrays than markers
            marker=all_marker[i % len(all_marker)],
            c=color,
            vmin=cmin,
            vmax=cmax,
            cmap=mpl.cm.viridis,
            ec=None,
        )
        fig.colorbar(sc)

        ax.set_xlim(xmin, xmax)
        ax.set_ylim(ymin, ymax)

        outname = base.parent.joinpath(base.stem + f"_{i}" + base.suffix)
        fig.savefig(outname, bbox_inches="tight")
def main(
    model_name="mesd/20200808",
    sdf_file="/Users/mjwen/Applications/db_access/mol_builder/post_analysis/lbdc/struct.sdf",
    label_file="/Users/mjwen/Applications/db_access/mol_builder/post_analysis/lbdc/label.yaml",
    feature_file="/Users/mjwen/Applications/db_access/mol_builder/post_analysis/lbdc/feature.yaml",
    feat_meta_prefix="~/Applications/db_access/mol_builder/post_analysis/lbdc",
):
    """
    Evaluate a model on the whole dataset and export features and metadata.

    For every `(idx, features)` item returned by `evaluate`, the features are
    written to `<feat_meta_prefix>/feats_layer<idx>.tsv` (tab-separated, no
    header, no index). The metadata table is written to
    `<feat_meta_prefix>/feats_metadata.tsv`.

    Args:
        model_name: model identifier passed to `load_dataset` and `load_model`.
        sdf_file: structure file passed to `load_dataset`.
        label_file: label file passed to `load_dataset`.
        feature_file: feature file passed to `load_dataset`.
        feat_meta_prefix: directory in which the output files are created.
    """
    seed_torch()

    dataset = load_dataset(model_name, sdf_file, label_file, feature_file)
    data_loader = DataLoaderReactionNetwork(dataset, batch_size=100, shuffle=False)
    model = load_model(model_name, pretrained=False)

    # make predictions
    feature_names = ["atom", "bond", "global"]
    ids, targets, predictions, broken_bonds, species, features = evaluate(
        model, feature_names, data_loader
    )

    out_dir = to_path(feat_meta_prefix)

    # write to file
    for idx, ft in features.items():
        feat_file = out_dir.joinpath(f"feats_layer{idx}.tsv")
        pd.DataFrame(ft).to_csv(feat_file, sep="\t", header=False, index=False)

    df = pd.DataFrame(
        {
            "identifier": ids,
            "target": targets,
            "prediction": predictions,
            "broken_bonds": broken_bonds,
            "species": species,
        }
    )

    # Write the metadata to its own file. Reusing the loop variable here would
    # overwrite the last feature file.
    meta_file = out_dir.joinpath("feats_metadata.tsv")
    df.to_csv(meta_file, sep="\t", index=False)
def nrel_plot_molecules(
    filename="~/Documents/Dataset/NREL_BDE/rdf_data_190531_n200.csv",
    plot_prefix="~/Applications/db_access/nrel_bde",
):
    """
    Draw a png and write a pdb file for every unique molecule of the dataset.

    Molecules are de-duplicated by their `id` (the last molecule seen for an id
    is kept). Outputs are named `<id>_<formula>_<charge>_<energy>` and placed in
    `<plot_prefix>/mol_png` and `<plot_prefix>/mol_pdb`; `<energy>` is the free
    energy with `.` replaced by `dot`.

    Args:
        filename: path to the dataset, passed to `read_nrel_bde_dataset`.
        plot_prefix: directory under which the output files are created.
    """
    reactions = read_nrel_bde_dataset(filename)
    molecules = get_molecules_from_reactions(reactions)

    # De-duplicate by id but keep the molecule objects (the dict values), since
    # the loop below needs their attributes and methods.
    unique_mols = {m.id: m for m in molecules}
    molecules = list(unique_mols.values())

    prefix = to_path(plot_prefix)
    for m in molecules:
        stem = "{}_{}_{}_{}".format(
            m.id, m.formula, m.charge, str(m.free_energy).replace(".", "dot")
        )

        fname = prefix.joinpath("mol_png/{}.png".format(stem))
        m.draw(fname, show_atom_idx=True)

        fname = prefix.joinpath("mol_pdb/{}.pdb".format(stem))
        m.write(fname, format="pdb")
def write_results(self, predictions, filename="result.csv"):
    """
    Append prediction as the last column of a dataframe and write csv file.

    The rows of `self.reaction_file` are matched with `self.no_result_reason`
    by position. Reactions flagged as failed get `None`; the others take the
    next value of `predictions`, in order. The values are stored in a new
    `energy` column. Failed rows are logged and listed on stdout.

    Args:
        predictions: predicted values for the reactions that were not flagged
            as failed; an entry may itself be `None`.
        filename: path of the csv file to write. If `None`, the csv text is
            printed instead.
    """
    df = pd.read_csv(self.reaction_file, header=0, index_col=None)

    energies = []
    failed_rows = []
    cursor = 0  # position of the next unused entry in `predictions`

    for idx, (fail, reason) in enumerate(self.no_result_reason):
        row_text = ",".join(str(cell) for cell in df.iloc[idx])

        # failed at conversion to mol wrapper stage
        if fail:
            logger.info(f"Reaction {idx} fails because molecule {reason} fails")
            failed_rows.append(row_text)
            energies.append(None)
            continue

        value = predictions[cursor]
        cursor += 1

        # failed at prediction stage
        if value is None:
            logger.info(f"Reaction {idx} fails because prediction cannot be made")
            failed_rows.append(row_text)

        energies.append(value)

    # if any failed
    if failed_rows:
        msg = "\n".join(failed_rows)
        print(
            f"\n\nThese reactions failed either at converting smiles to internal "
            f"molecules or converting internal molecules to dgl graph, "
            f"and therefore predictions for them are not made (represented by "
            f"None in the output):\n{msg}\n\n."
            f"See the log file for more info."
        )

    df["energy"] = energies

    target = None if filename is None else to_path(filename)
    rst = df.to_csv(target, index=False)
    if rst is not None:
        print(rst)
def plot_via_umap_interactive(
    self,
    filename="embedding.html",
    metadata_key_as_color="species",
    categorical_color=False,
):
    """
    Call `umap_plot.interactive` on `self.model`, colored by a metadata column.

    The whole metadata is passed as hover data and the point size is 5.

    Args:
        filename: path forwarded (after `to_path`) as the first argument.
        metadata_key_as_color: key of `self.metadata` used for coloring.
        categorical_color: if `True`, pass the column as `labels` with the
            `tab20` color key map; otherwise pass it as `values` with the
            `viridis` theme.
    """
    coloring = self.metadata[metadata_key_as_color]

    if categorical_color:
        options = {"labels": coloring, "color_key_cmap": "tab20"}
    else:
        options = {"values": coloring, "theme": "viridis"}

    umap_plot.interactive(
        to_path(filename),
        self.model,
        hover_data=pd.DataFrame(self.metadata),
        point_size=5,
        **options,
    )
def extract_one(extractor, s1=None, s2=None):
    """
    Plot histograms of free energy differences between paired reactions.

    For each of the charge pairs (-1, 0), (-1, 1) and (0, 1), the reaction
    pairs returned by `extractor.group_by_reactant_charge_pair()` are turned
    into energy differences (second minus first) and passed to `plot_hist`.

    Args:
        extractor: object providing `group_by_reactant_charge_pair()`.
        s1: used in the output file name and forwarded to `plot_hist`.
        s2: used in the output file name and forwarded to `plot_hist`.
    """
    out_dir = "~/Applications/db_access/mol_builder/reactant_e_diff/"
    grouped = extractor.group_by_reactant_charge_pair()

    for charge in [(-1, 0), (-1, 1), (0, 1)]:
        energies = [
            rxn2.get_free_energy() - rxn1.get_free_energy()
            for rxn1, rxn2 in grouped[charge]
        ]

        outname = out_dir + "reactant_e_diff_{}{}_{}_{}.pdf".format(s1, s2, *charge)
        create_directory(outname)
        plot_hist(energies, to_path(outname), s1, s2, *charge)
def write(self, filename=None, name=None, format="sdf", kekulize=True, v3000=True):
    """Write a molecule to file or as string using rdkit.

    Args:
        filename (str): name of the file to write the output. If None, return the
            output as string.
        name (str): name of a molecule, set as the `_Name` property of the rdkit
            molecule. Defaults to `str(self.id)`.
        format (str): format of the molecule, supporting: sdf, pdb, and smi.
            For `smi` the SMILES string is always returned and `filename` is
            not written.
        kekulize (bool): whether to kekulize the mol if format is `sdf`
        v3000 (bool): whether to force v3000 form if format is `sdf`

    Returns:
        The string representation when `filename` is None (or format is `smi`);
        otherwise the return value of the rdkit file writer.

    Raises:
        ValueError: if `format` is not one of sdf, pdb, smi.
    """
    as_string = filename is None
    if not as_string:
        create_directory(filename)
        filename = str(to_path(filename))

    mol = self.rdkit_mol
    mol.SetProp("_Name", str(self.id) if name is None else name)

    if format == "smi":
        return Chem.MolToSmiles(mol)

    if format == "sdf":
        if as_string:
            block = Chem.MolToMolBlock(mol, kekulize=kekulize, forceV3000=v3000)
            return block + "$$$$\n"
        return Chem.MolToMolFile(mol, filename, kekulize=kekulize, forceV3000=v3000)

    if format == "pdb":
        if as_string:
            # NOTE(review): the sdf record terminator is appended to the pdb
            # string as well; confirm this is intended.
            return Chem.MolToPDBBlock(mol) + "$$$$\n"
        return Chem.MolToPDBFile(mol, filename)

    raise ValueError(f"format {format} currently not supported")
def plot_atom_distance_histogram(self, bond=True, filename="atom_dist_hist.pdf", **kwargs):
    """
    Plot histogram of atom distances (bond lengths).

    The distances of all molecules in `self.molecules` are pooled into one
    histogram; their minimum and maximum are printed.

    Args:
        bond (bool): If `True`, only consider bond length; otherwise consider
            all atom pair distances.
        filename (Path): path to the created plot.
        kwargs: keyword args passed to matplotlib.pyplot.hist.
    """
    per_molecule = []
    for m in self.molecules:
        if bond:
            pairs = [(u, v) for (u, v), _ in m.bonds]
        else:
            pairs = itertools.combinations(range(m.num_atoms), 2)
        per_molecule.append(
            [np.linalg.norm(m.coords[u] - m.coords[v]) for u, v in pairs]
        )

    data = np.concatenate(per_molecule)
    print("\n\n### atom distance min={}, max={}".format(min(data), max(data)))

    filename = to_path(filename)

    fig = plt.figure()
    ax = fig.gca()
    ax.hist(data, **kwargs)
    ax.set_xlabel("Atom distance length")
    ax.set_ylabel("counts")
    fig.savefig(filename, bbox_inches="tight")
def draw_with_bond_note(self, bond_note, filename="mol.png", show_atom_idx=True):
    """
    Draw molecule using rdkit and show bond annotation, e.g. bond energy.

    The annotated bonds are highlighted in light grey and the image (400x300)
    is written to `filename`.

    Args:
        bond_note (dict): {bond_index: note}. The note to show for the
            corresponding bond, where `bond_index` is the pair of atom indices
            of the bond. Float notes are formatted with 3 significant digits.
        filename (str): path to the save the generated image.
        show_atom_idx (bool): forwarded to `self.draw`.
    """
    mol = self.draw(show_atom_idx=show_atom_idx)

    # set bond annotation
    annotated = []
    for atoms, note in bond_note.items():
        if isinstance(note, (float, np.floating)):
            note = format(note, ".3g")
        bond_idx = mol.GetBondBetweenAtoms(*atoms).GetIdx()
        mol.GetBondWithIdx(bond_idx).SetProp("bondNote", note)
        annotated.append(bond_idx)

    # set highlight color
    grey = (192 / 255, 192 / 255, 192 / 255)
    colors = dict.fromkeys(annotated, grey)

    drawer = rdMolDraw2D.MolDraw2DCairo(400, 300)

    # smaller font size
    drawer.SetFontSize(0.8 * drawer.FontSize())

    rdMolDraw2D.PrepareAndDrawMolecule(
        drawer, mol, highlightBonds=annotated, highlightBondColors=colors
    )
    drawer.FinishDrawing()

    create_directory(filename)
    with open(to_path(filename), "wb") as out:
        out.write(drawer.GetDrawingText())
def check_all(filename="molecules.pkl", outname="molecules_qc.pkl"):
    """
    Run all molecule filters in sequence and save the remaining molecules.

    The number of molecules is printed before the first filter and after each
    one.

    Args:
        filename: file the molecule collection is loaded from.
        outname: file the filtered collection is written to.
    """
    collection = MoleculeCollection.from_file(filename)

    def report(message):
        print(message, len(collection))

    report("Number of mols before any check:")

    collection.filter_by_connectivity(exclude_species=["Li"])
    report("Number of mols after connectivity check:")

    collection.filter_by_rdkit_sanitize()
    report("Number of mols after rdkit check:")

    collection.filter_by_bond_species()
    report("Number of mols after bond species check:")

    collection.filter_by_bond_length()
    report("Number of mols after bond length check:")

    collection.filter_by_species(species=["P"])
    report("Number of mols after species check:")

    collection.to_file(to_path(outname))
def draw(self, filename=None, show_atom_idx=False):
    """
    Draw the molecule.

    The drawing is made from a deep copy of `self.rdkit_mol` with 2D
    coordinates computed, so the stored molecule is not modified.

    Args:
        filename (str): path to the save the generated image. If `None` the
            molecule is returned and can be viewed in Jupyter notebook.
        show_atom_idx (bool): if `True`, set the atom map number of every atom
            to its index plus one.

    Returns:
        The copied rdkit molecule if `filename` is `None`, otherwise `None`.
    """
    mol = copy.deepcopy(self.rdkit_mol)
    AllChem.Compute2DCoords(mol)

    if show_atom_idx:
        for atom in mol.GetAtoms():
            atom.SetAtomMapNum(atom.GetIdx() + 1)

    if filename is None:
        return mol

    create_directory(filename)
    Draw.MolToFile(mol, str(to_path(filename)))
def plot_molecules(filename, plot_prefix):
    """
    Draw a png and write a pdb file for every molecule of every reaction.

    Files are named `<rxn_id>_<rct|prdt>_<formula>_<charge>` and placed in
    `<plot_prefix>/mol_png` and `<plot_prefix>/mol_pdb`. Only the first
    molecule of a reaction is tagged `rct`; all the others are tagged `prdt`.

    Args:
        filename: path to the dataset, passed to `read_dataset`.
        plot_prefix: directory under which the output files are created.
    """
    out_dir = to_path(plot_prefix)

    for rxn in read_dataset(filename):
        rxn_id = rxn.get_id()

        for position, mol in enumerate(rxn.reactants + rxn.products):
            role = "prdt" if position != 0 else "rct"
            stem = "{}_{}_{}_{}".format(rxn_id, role, mol.formula, mol.charge)

            # since the molecules have AtomMapNum set when reading smarts mol,
            # we set `show_atom_idx` to False to use the AtomMapNum there
            mol.draw(out_dir.joinpath("mol_png/" + stem + ".png"), show_atom_idx=False)

            mol.write(out_dir.joinpath("mol_pdb/" + stem + ".pdb"), format="pdb")
def predict_single_molecule(
    model_name,
    molecule,
    charge=0,
    ring_bond=False,
    write_result=False,
    figure_name="prediction.png",
    format=None,
):
    """
    Make predictions for a single molecule.

    Breaking a bond may result in products with different combination of products
    charge, we report the smallest charge w.r.t. the product charge assignation.

    Args:
        model_name (str): The pre-trained model to use for making predictions. A model
            should be of the format format `dataset/date`, e.g. `mesd/20200808`,
            `pubchem/20200521`. It is possible to provide only the `dataset` part,
            and in this case, the latest model will be used.
        molecule (str): SMILES or InChI string, or a path to an existing file
            whose content is read as the molecule.
        charge (int): charge of the molecule. Must be one of the charges allowed
            by the model.
        ring_bond (bool): whether to make predictions for ring bond.
        write_result (bool): whether to write the returned sdf to stdout.
        figure_name (str): the name of the figure to be created showing the bond energy.
        format (str): format of the molecule. If not provided: for a file it is
            guessed from the extension (`.sdf` or `.pdb`); for a string it is
            `inchi` when the string starts with `inchi=` (case-insensitive),
            otherwise `smiles`.

    Returns:
        str: sdf string representing the molecules and energies.

    Raises:
        RuntimeError: if `molecule` is a file, `format` is not given, and the
            file extension is neither `.sdf` nor `.pdb`.
    """
    model_path = get_model_path(model_name)
    info = get_model_info(model_path)
    allowed_charge = info["allowed_charge"]
    unit_converter = info["unit_conversion"]

    assert (
        charge in allowed_charge
    ), f"expect charge to be one of {allowed_charge}, but got {charge}"

    source = to_path(molecule)
    if source.is_file():
        if format is None:
            by_suffix = {".sdf": "sdf", ".pdb": "pdb"}
            suffix = source.suffix.lower()
            if suffix not in by_suffix:
                raise RuntimeError(
                    f"Expect file format `.sdf` or `.pdb`, but got {suffix}"
                )
            format = by_suffix[suffix]
        with open(source, "r") as handle:
            molecule = handle.read().strip()
    elif format is None:
        format = "inchi" if molecule.lower().startswith("inchi=") else "smiles"

    predictor = PredictionOneReactant(
        molecule, charge, format, allowed_charge, ring_bond
    )
    molecules, labels, extra_features = predictor.prepare_data()
    predictions = get_prediction(
        model_path, unit_converter, molecules, labels, extra_features
    )

    return predictor.write_results(predictions, figure_name, write_result)
mol_coll.filter_by_bond_species() print("Number of mols after bond species check:", len(mol_coll)) mol_coll.filter_by_bond_length() print("Number of mols after bond length check:", len(mol_coll)) mol_coll.filter_by_species(species=["P"]) print("Number of mols after species check:", len(mol_coll)) mol_coll.to_file(to_path(outname)) if __name__ == "__main__": working_dir = to_path("~/Applications/db_access/mol_builder/") num_entries = 500 filename = working_dir.joinpath("molecules_n200.pkl") # num_entries = None # filename = working_dir.joinpath("molecules.pkl") pickle_molecules(num_entries=num_entries, outname=filename) # # clean molecules # filename = working_dir.joinpath("molecules_n200.pkl") outname = working_dir.joinpath("molecules_n200_qc.pkl") # filename = working_dir.joinpath("molecules.pkl") # outname = working_dir.joinpath("molecules_qc.pkl") check_all(filename, outname)