def _load(self): logger.info("Start loading dataset") # read label and feature file raw_labels = yaml_load(self.raw_labels) if self.extra_features is not None: features = yaml_load(self.extra_features) else: features = [None] * len(raw_labels) # build graph for mols from sdf file supp = Chem.SDMolSupplier(self.molecules, sanitize=True, removeHs=False) species = get_dataset_species(self.molecules) self.graphs = [] self.labels = [] for i, mol in enumerate(supp): if i % 100 == 0: logger.info("Processing molecule {}/{}".format( i, len(raw_labels))) # bad mol if mol is None: continue # graph g = self.grapher.build_graph_and_featurize( mol, extra_feats_info=features[i], dataset_species=species) # add this for check purpose; some entries in the sdf file may fail g.graph_id = i self.graphs.append(g) # label bonds_class = torch.tensor(raw_labels[i], dtype=torch.int64) label = {"value": bonds_class, "id": i} self.labels.append(label) # Should after grapher.build_graph_and_featurize, which initializes the # feature name and size self._feature_name = self.grapher.feature_name self._feature_size = self.grapher.feature_size logger.info("Feature name: {}".format(self.feature_name)) logger.info("Feature size: {}".format(self.feature_size)) # feature transformers if self.feature_transformer: feature_scaler = HeteroGraphFeatureStandardScaler() self.graphs = feature_scaler(self.graphs) logger.info("Feature scaler mean: {}".format(feature_scaler.mean)) logger.info("Feature scaler std: {}".format(feature_scaler.std)) logger.info("Finish loading {} graphs...".format(len(self.labels)))
def _read_molecules(molecule_file, molecule_attributes_file):
    # read rdkit mols
    rdkit_mols = read_rdkit_mols_from_file(molecule_file)

    # read molecule attributes
    attrs = yaml_load(molecule_attributes_file)
    msg = (
        f"expect the number of molecules given in {molecule_file} "
        f"and the number of molecule attributes given in {molecule_attributes_file} "
        f"to be the same, but got {len(rdkit_mols)} and {len(attrs)}."
    )
    assert len(rdkit_mols) == len(attrs), msg

    # convert rdkit mols to wrapper molecules
    identifiers = [
        m.GetProp("_Name") + f"_index-{i}" if m is not None else None
        for i, m in enumerate(rdkit_mols)
    ]
    charges = [a["charge"] for a in attrs]
    molecules = [
        rdkit_mol_to_wrapper_mol(m, charge=cg, free_energy=None, identifier=idx)
        for m, idx, cg in zip(rdkit_mols, identifiers, charges)
    ]

    return molecules
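# Hypothetical usage sketch for _read_molecules. The attributes file is
# assumed to be a YAML list parallel to the molecule file, each entry with
# at least a "charge" key (the only key accessed above). File names are
# illustrative.
def _example_read_molecules():
    molecules = _read_molecules("molecules.sdf", "molecule_attributes.yaml")
    n_failed = sum(m is None for m in molecules)
    print(f"read {len(molecules)} molecules; {n_failed} failed to parse")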
def _load(self): logger.info("Start loading dataset") # read label and feature file raw_value, raw_indicator, raw_mol_source = self._read_label_file() if self.extra_features is not None: features = yaml_load(self.extra_features) else: features = [None] * len(raw_value) # build graph for mols from sdf file molecules = self.get_molecules(self.molecules) species = get_dataset_species(molecules) self.graphs = [] self.labels = [] for i, mol in enumerate(molecules): if i % 100 == 0: logger.info("Processing molecule {}/{}".format(i, len(raw_value))) # bad mol if mol is None: continue # graph g = self.grapher.build_graph_and_featurize( mol, extra_feats_info=features[i], dataset_species=species ) # we add this for check purpose, because some entries in the sdf file may fail g.graph_id = i self.graphs.append(g) # label bonds_class = torch.tensor(raw_value[i], dtype=torch.int64) bonds_indicator = int(raw_indicator[i]) bonds_mol_source = raw_mol_source[i] label = { "value": bonds_class, # torch.int64 "indicator": bonds_indicator, # int "id": bonds_mol_source, # str } self.labels.append(label) # Should after grapher.build_graph_and_featurize, which initializes the # feature name and size self._feature_name = self.grapher.feature_name self._feature_size = self.grapher.feature_size logger.info("Feature name: {}".format(self.feature_name)) logger.info("Feature size: {}".format(self.feature_size)) # feature transformers if self.feature_transformer: feature_scaler = HeteroGraphFeatureStandardScaler() self.graphs = feature_scaler(self.graphs) logger.info("Feature scaler mean: {}".format(feature_scaler.mean)) logger.info("Feature scaler std: {}".format(feature_scaler.std)) logger.info("Finish loading {} graphs...".format(len(self.labels)))
def write_results(self, predictions, filename="bed_result.yaml"):
    """
    Add the prediction value as the `prediction` key of each reaction
    (given by a dict) in the label file.
    """
    labels = yaml_load(self.label_file)

    failed = []
    for d, p in zip(labels, predictions):
        if p is None:
            failed.append(str(d["id"]))
            d["prediction"] = p
        else:
            d["prediction"] = float(p)

    # report reactions for which prediction failed
    if failed:
        msg = ", ".join(failed)
        print(
            f"These reactions failed when converting their molecules, and therefore "
            f"predictions for them are not made: {msg}"
        )

    filename = to_path(filename) if filename is not None else filename
    if filename is not None:
        yaml_dump(labels, filename)
    else:
        print(labels)
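# Illustrative usage sketch: `predictor` stands in for an instance of the
# class defining write_results; the predictions list is assumed to be aligned
# with the reactions in self.label_file, with None marking failed entries.
def _example_write_results(predictor):
    predictions = [1.23, None, 4.56]  # toy values, one per reaction
    predictor.write_results(predictions, filename="bed_result.yaml")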
def read_molecules(self):

    if self.format == "graph":
        if self.charge_file is not None:
            warnings.warn(
                f"charge file {self.charge_file} ignored for format `graph`"
            )

        file_type = self.molecule_file.suffix
        if file_type == ".json":
            with open(self.molecule_file, "r") as f:
                mol_graph_dicts = json.load(f)
        elif file_type in [".yaml", ".yml"]:
            mol_graph_dicts = yaml_load(self.molecule_file)
        else:
            supported = [".json", ".yaml", ".yml"]
            raise ValueError(
                f"File extension of {self.molecule_file} not supported; "
                f"supported are: {supported}."
            )

        mol_graphs = [MoleculeGraph.from_dict(d) for d in mol_graph_dicts]
        molecules = [
            MoleculeWrapper(g, id=str(i)) for i, g in enumerate(mol_graphs)
        ]

    else:
        # read rdkit mols
        rdkit_mols = read_rdkit_mols_from_file(self.molecule_file, self.format)

        # read charge file
        if self.charge_file is None:
            charges = [0] * len(rdkit_mols)
        else:
            charges = read_charge(self.charge_file)
            msg = (
                f"expect the number of molecules given in {self.molecule_file} "
                f"and the number of charges given in {self.charge_file} to be "
                f"the same, but got {len(rdkit_mols)} and {len(charges)}."
            )
            assert len(rdkit_mols) == len(charges), msg

        # convert rdkit mols to wrapper molecules
        identifiers = [
            m.GetProp("_Name") + f"_index-{i}" if m is not None else None
            for i, m in enumerate(rdkit_mols)
        ]
        molecules = rdkit_mols_to_wrapper_mols(
            rdkit_mols, identifiers, charges, nprocs=self.nprocs
        )

    self._molecules = molecules

    return molecules
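# A minimal sketch of how the `graph` input format read above could be
# produced: a JSON file holding a list of pymatgen MoleculeGraph.as_dict()
# dictionaries (MoleculeGraph is MSONable, so from_dict/as_dict round-trip).
# The path below is hypothetical.
def _example_write_graph_file(mol_graphs, path="mol_graphs.json"):
    import json

    with open(path, "w") as f:
        json.dump([g.as_dict() for g in mol_graphs], f)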
def _check_charge(model_path, features):
    if isinstance(features, (str, Path)):
        features = yaml_load(features)
    charges = {x["charge"] for x in features}

    model_info = get_model_info(model_path)
    allowed_charge = set(model_info["allowed_charge"])

    if not charges.issubset(allowed_charge):
        raise ValueError(
            f"Supported molecular charges include {allowed_charge}; "
            f"but the dataset contains molecules of charge {charges}."
        )
def get_charges(label_file, feature_file):
    """
    Charges of the reactant and product molecules in each reaction.
    """
    labels = yaml_load(label_file)
    features = yaml_load(feature_file)

    ids = []
    num_prdts = []
    rct_charges = []
    prdt1_charges = []
    prdt2_charges = []
    for lb in labels:
        ids.append(lb["id"])

        rct_idx = lb["reactants"][0]
        prdts = lb["products"]
        N = len(prdts)
        num_prdts.append(N)

        rct_charges.append(features[rct_idx]["charge"])

        prdt1_idx = prdts[0]
        prdt1_charges.append(features[prdt1_idx]["charge"])
        if N == 2:
            prdt2_idx = prdts[1]
            prdt2_charges.append(features[prdt2_idx]["charge"])
        else:
            prdt2_charges.append(None)

    df = pd.DataFrame(
        {
            "identifier": ids,
            "num products": num_prdts,
            "charge": rct_charges,
            "product1 charge": prdt1_charges,
            "product2 charge": prdt2_charges,
        }
    )

    return df
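# Illustrative usage: summarize how many reactions occur at each reactant
# charge. File names are hypothetical.
def _example_charge_summary():
    df = get_charges("labels.yaml", "features.yaml")
    print(df["charge"].value_counts())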
def _read_reactions(molecules, reaction_file):
    # read reaction file
    rxns = yaml_load(reaction_file)

    # convert to reactions
    bad_mol_indices = {i for i, m in enumerate(molecules) if m is None}

    reactions = []
    no_result_reason = []  # each element is a tuple (fail, failing_reason)

    for i, rxn in enumerate(rxns):
        idx_r = rxn["reactants"][0]
        idx_p1 = rxn["products"][0]
        if len(rxn["products"]) == 1:
            idx_p2 = None
        else:
            idx_p2 = rxn["products"][1]
        bde = rxn["energy"]

        # skip the reaction if any of its molecules failed to parse;
        # the `else` branch runs only when the loop finishes without `break`
        for idx in (idx_r, idx_p1, idx_p2):
            if idx in bad_mol_indices:
                no_result_reason.append((True, idx))
                break
        else:
            reactants = [molecules[idx_r]]
            products = [molecules[idx_p1]]
            if idx_p2 is not None:
                products.append(molecules[idx_p2])

            reactions.append(
                Reaction(
                    reactants=reactants,
                    products=products,
                    broken_bond=None,
                    free_energy=bde,
                    identifier=f"reaction_{i}",
                )
            )
            no_result_reason.append((False, None))

    for i, reason in enumerate(no_result_reason):
        if reason[0]:
            logger.warning(
                f"Reaction {i} ignored because failed to read molecule {reason[1]}."
            )

    return reactions
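# A sketch (illustrative only) of the reaction file layout implied by the
# keys accessed above: each entry lists molecule indices for one reactant
# and one or two products, plus the bond dissociation energy.
#
# reactions.yaml:
#   - reactants: [0]
#     products: [1, 2]
#     energy: 1.54
#   - reactants: [3]
#     products: [4]
#     energy: 0.97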
def _check_charge(model_path, features):
    if isinstance(features, (str, Path)):
        features = yaml_load(features)
    charges = {x["charge"] for x in features}

    model_info = get_model_info(model_path)
    allowed_charge = set(model_info["allowed_charge"])

    if not charges.issubset(allowed_charge):
        raise ValueError(
            f"Supported molecular charges include {allowed_charge}; "
            f"but the dataset contains molecules of charge {charges}. "
            f"Note that two models trained on different datasets are provided: "
            f"`pubchem` supports neutral molecules and `bdncm` supports "
            f"molecules of charge -1, 0, and 1. "
            f"You may want to switch the model if you see this message."
        )
def get_features(features):
    if isinstance(features, Path):
        features = yaml_load(features)
    return features
def get_labels(labels):
    if isinstance(labels, Path):
        labels = yaml_load(labels)
    return labels
def _load(self): logger.info("Start loading dataset") # read label and feature file raw_labels = yaml_load(self.raw_labels) if self.extra_features is not None: features = yaml_load(self.extra_features) else: features = [None] * len(raw_labels) # build graph for mols from sdf file molecules = self.get_molecules(self.molecules) species = get_dataset_species(molecules) graphs = [] for i, (mol, feats) in enumerate(zip(molecules, features)): if i % 100 == 0: logger.info(f"Processing molecule {i}/{len(raw_labels)}") if mol is not None: g = self.grapher.build_graph_and_featurize( mol, extra_feats_info=feats, dataset_species=species ) # add this for check purpose; some entries in the sdf file may fail g.graph_id = i else: g = None graphs.append(g) # Should after grapher.build_graph_and_featurize, which initializes the # feature name and size self._feature_name = self.grapher.feature_name self._feature_size = self.grapher.feature_size logger.info("Feature name: {}".format(self.feature_name)) logger.info("Feature size: {}".format(self.feature_size)) # regroup graphs to reactions num_mols = [lb["num_mols"] for lb in raw_labels] reactions = list_split_by_size(graphs, num_mols) # global feat mapping global_mapping = [[{0: 0} for _ in range(n)] for n in num_mols] self.graphs = [] self.labels = [] for rxn, lb, gmp in zip(reactions, raw_labels, global_mapping): if None not in rxn: lb["value"] = torch.tensor(lb["value"], dtype=getattr(torch, self.dtype)) lb["global_mapping"] = gmp self.graphs.append(rxn) self.labels.append(lb) # transformers if self.feature_transformer: graphs = list(itertools.chain.from_iterable(self.graphs)) # flatten the list feature_scaler = HeteroGraphFeatureStandardScaler() graphs = feature_scaler(graphs) num_mols = [len(rxn) for rxn in self.graphs] self.graphs = list_split_by_size(graphs, num_mols) logger.info("Feature scaler mean: {}".format(feature_scaler.mean)) logger.info("Feature scaler std: {}".format(feature_scaler.std)) if self.label_transformer: # normalization values = [lb["value"] for lb in self.labels] # list of 0D tensor # np and torch compute slightly differently std (depending on `ddof` of np) # here we choose to use np mean = float(np.mean(values)) std = float(np.std(values)) values = (torch.stack(values) - mean) / std std = torch.tensor(std, dtype=getattr(torch, self.dtype)) mean = torch.tensor(mean, dtype=getattr(torch, self.dtype)) # update label for i, lb in enumerate(values): self.labels[i]["value"] = lb self.labels[i]["scaler_mean"] = mean self.labels[i]["scaler_stdev"] = std logger.info("Label scaler mean: {}".format(mean)) logger.info("Label scaler std: {}".format(std)) logger.info("Finish loading {} reactions...".format(len(self.labels)))
def _load(self): logger.info("Start loading dataset") # read label and feature file raw_labels, extensive = self._read_label_file() if self.extra_features is not None: features = yaml_load(self.extra_features) else: features = [None] * len(raw_labels) # build graph for mols from sdf file molecules = self.get_molecules(self.molecules) species = get_dataset_species(molecules) self.graphs = [] self.labels = [] natoms = [] for i, (mol, feats, lb) in enumerate(zip(molecules, features, raw_labels)): if i % 100 == 0: logger.info("Processing molecule {}/{}".format(i, len(raw_labels))) if mol is None: continue # graph g = self.grapher.build_graph_and_featurize( mol, extra_feats_info=feats, dataset_species=species ) # we add this for check purpose, because some entries in the sdf file may fail g.graph_id = i self.graphs.append(g) # label lb = torch.tensor(lb, dtype=getattr(torch, self.dtype)) self.labels.append({"value": lb, "id": i}) natoms.append(mol.GetNumAtoms()) # this should be called after grapher.build_graph_and_featurize, # which initializes the feature name and size self._feature_name = self.grapher.feature_name self._feature_size = self.grapher.feature_size logger.info("Feature name: {}".format(self.feature_name)) logger.info("Feature size: {}".format(self.feature_size)) # feature and label transformer if self.feature_transformer: feature_scaler = HeteroGraphFeatureStandardScaler() self.graphs = feature_scaler(self.graphs) logger.info("Feature scaler mean: {}".format(feature_scaler.mean)) logger.info("Feature scaler std: {}".format(feature_scaler.std)) if self.label_transformer: labels = np.asarray([lb["value"].numpy() for lb in self.labels]) natoms = np.asarray(natoms, dtype=np.float32) scaled_labels = [] scaler_mean = [] scaler_std = [] label_scaler_mean = [] label_scaler_std = [] for i, is_ext in enumerate(extensive): if is_ext: # extensive labels standardized by the number of atoms in the # molecules, i.e. y' = y/natoms lb = labels[:, i] / natoms mean = np.zeros(len(lb)) std = natoms label_scaler_mean.append(None) label_scaler_std.append("num atoms") else: # intensive labels standardized by y' = (y - mean(y))/std(y) scaler = StandardScaler() lb = labels[:, [i]] # 2D array of shape (N, 1) lb = scaler(lb) lb = lb.ravel() mean = np.repeat(scaler.mean, len(lb)) std = np.repeat(scaler.std, len(lb)) label_scaler_mean.append(scaler.mean) label_scaler_std.append(scaler.std) scaled_labels.append(lb) scaler_mean.append(mean) scaler_std.append(std) scaled_labels = torch.tensor( np.asarray(scaled_labels).T, dtype=getattr(torch, self.dtype) ) scaler_mean = torch.tensor( np.asarray(scaler_mean).T, dtype=getattr(torch, self.dtype) ) scaler_std = torch.tensor( np.asarray(scaler_std).T, dtype=getattr(torch, self.dtype) ) for i, (lb, m, s) in enumerate(zip(scaled_labels, scaler_mean, scaler_std)): self.labels[i]["value"] = lb self.labels[i]["scaler_mean"] = m self.labels[i]["scaler_stdev"] = s logger.info("Label scaler mean: {}".format(label_scaler_mean)) logger.info("Label scaler std: {}".format(label_scaler_std)) logger.info("Finish loading {} labels...".format(len(self.labels)))
def get_model_info(model_path):
    path = model_path.joinpath("model_info.yaml")
    return yaml_load(path)
def get_extra_features(fname):
    return yaml_load(fname)