def _sample_ordering(mol: Chem.Mol,
                     scaffold_nodes: np.ndarray,
                     k: int,
                     p: float,
                     ms: MoleculeSpec = MoleculeSpec.get_default()
                     ) -> t.Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Sample decoding routes of a given molecule `mol`.

    Args:
        mol (Chem.Mol): the given molecule
        scaffold_nodes (np.ndarray): the nodes marked as scaffold
        k (int): the number of importance samples
        p (float): degree of uncertainty during route sampling,
            should be in (0, 1)
        ms (mol_spec.MoleculeSpec): kept for interface compatibility;
            route sampling only needs the graph topology and the canonical
            atom ranks, so the spec is not consulted here

    Returns:
        route_list (np.ndarray): route_list[i][j] is the index of the atom
            reached at step j in sample i (dtype - np.int32)
        step_ids_list (np.ndarray): step_ids_list[i][j] is the step at which
            atom j is reached in sample i (dtype - np.int32)
        logp_list (np.ndarray): logp_list[i] is the log-likelihood value of
            route i (dtype - np.float32)
    """
    # Only topology and canonical ranks are needed to sample a route.
    atom_ranks = np.array(list(Chem.CanonicalRankAtoms(mol)))
    bonds = [[bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()]
             for bond in mol.GetBonds()]

    # build nx graph: one node per atom, one edge per bond
    graph = nx.Graph()
    graph.add_nodes_from(range(len(atom_ranks)))
    graph.add_edges_from(bonds)

    route_list, step_ids_list, logp_list = [], [], []
    for _ in range(k):
        step_ids, log_p = _traverse(graph, atom_ranks, scaffold_nodes, p)
        step_ids_list.append(step_ids)
        # Sorting atoms by the step at which they are reached yields the
        # route (atom index visited at each step).
        route_list.append(np.argsort(step_ids))
        logp_list.append(log_p)

    # cast to numpy array
    route_list = np.array(route_list, dtype=np.int32)
    step_ids_list = np.array(step_ids_list, dtype=np.int32)
    logp_list = np.array(logp_list, dtype=np.float32)

    return route_list, step_ids_list, logp_list
def get_array_from_mol(mol: Chem.Mol,
                       scaffold_nodes: t.Iterable,
                       nh_nodes: t.Iterable,
                       np_nodes: t.Iterable,
                       k: int,
                       p: float,
                       ms: MoleculeSpec = MoleculeSpec.get_default()
                       ) -> t.Tuple[np.ndarray, np.ndarray]:
    """Represent the molecule using `np.ndarray`.

    Args:
        mol (Chem.Mol): The input molecule (left unmodified)
        scaffold_nodes (Iterable): The location of scaffold represented as
            `list`/`np.ndarray`
        nh_nodes (Iterable): Indices of atoms whose explicit hydrogen count
            is incremented by one before the atom type is looked up
        np_nodes (Iterable): Indices of atoms whose formal charge is
            decremented by one before the atom type is looked up
        k (int): The number of importance samples
        p (float): Degree of uncertainty during route sampling,
            should be in (0, 1)
        ms (mol_spec.MoleculeSpec): Specification used to encode atom and
            bond types

    Returns:
        mol_array (np.ndarray): The numpy representation of the molecule
            dtype - np.int32, shape - [k, num_bonds + 1, 5]
            columns - [atom_type_added, start_id, end_id, bond_type,
            is_scaffold]
        logp (np.ndarray): The log-likelihood of each route
            dtype - np.float32, shape - [k, ]
    """
    num_bonds = mol.GetNumBonds()

    # sample route (forward `ms` so a caller-supplied spec is honoured)
    scaffold_nodes = np.array(list(scaffold_nodes), dtype=np.int32)
    route_list, step_ids_list, logp = _sample_ordering(
        mol, scaffold_nodes, k, p, ms)

    # Normalise the index collections once: O(1) membership tests, and
    # one-shot iterables (generators) are not exhausted inside the loop.
    nh_ids = {int(i) for i in nh_nodes}
    np_ids = {int(i) for i in np_nodes}

    # Work on a copy so the hydrogen / charge adjustments below do not leak
    # into the caller's molecule (and do not accumulate across calls).
    mol = Chem.Mol(mol)

    atom_types = []
    for atom_id, atom in enumerate(mol.GetAtoms()):
        if atom_id in nh_ids:
            atom.SetNumExplicitHs(atom.GetNumExplicitHs() + 1)
        if atom_id in np_ids:
            atom.SetFormalCharge(atom.GetFormalCharge() - 1)
        atom_types.append(ms.get_atom_type(atom))

    bond_info = [[bond.GetBeginAtomIdx(),
                  bond.GetEndAtomIdx(),
                  ms.get_bond_type(bond)]
                 for bond in mol.GetBonds()]

    # shape:
    # atom_types: num_atoms
    # bond_info: num_bonds x 3 (kept 2-D even when there are no bonds)
    atom_types = np.array(atom_types, dtype=np.int32)
    bond_info = np.array(bond_info, dtype=np.int32).reshape(-1, 3)

    # initialize packed molecule array data
    mol_array = []

    for sample_id in range(k):
        # get the route and step_ids for the i-th sample
        route_i = route_list[sample_id, :]
        step_ids_i = step_ids_list[sample_id, :]

        # reorder atom types and bond info
        # note: bond_info [start_ids, end_ids, bond_type]
        atom_types_i, bond_info_i, is_append = _reorder(
            atom_types, bond_info, route_i, step_ids_i)

        # atom type added at each step
        # -1 if the current step is connect
        atom_types_added = np.full([num_bonds, ], -1, dtype=np.int32)
        atom_types_added[is_append] = \
            atom_types_i[bond_info_i[:, 1]][is_append]

        # pack into mol_array_i
        # size: num_bonds x 4
        # note: [atom_types_added, start_ids, end_ids, bond_type]
        mol_array_i = np.concatenate(
            [atom_types_added[:, np.newaxis], bond_info_i], axis=-1)

        # add initialization step
        init_step = np.array([[atom_types_i[0], -1, 0, -1]], dtype=np.int32)

        # concat into mol_array
        # size: (num_bonds + 1) x 4
        mol_array_i = np.concatenate([init_step, mol_array_i], axis=0)

        # Mark up scaffold bonds: a step is flagged when both endpoint
        # indices are below the number of scaffold nodes.
        # NOTE(review): this presumes the reordering places scaffold atoms
        # first - confirm against _reorder / _traverse.
        num_scaffold = len(scaffold_nodes)
        is_scaffold = np.logical_and(mol_array_i[:, 1] < num_scaffold,
                                     mol_array_i[:, 2] < num_scaffold)
        is_scaffold = is_scaffold.astype(np.int32)

        # Concatenate
        # shape: (num_bonds + 1) x 5
        mol_array_i = np.concatenate(
            (mol_array_i, is_scaffold[:, np.newaxis]), axis=-1)

        mol_array.append(mol_array_i)

    # k x (num_bonds + 1) x 5
    mol_array = np.stack(mol_array, axis=0)

    # Output size:
    # mol_array: k x (num_bonds + 1) x 5
    # logp: k

    return mol_array, logp