def _run_through(product_path, regressor, mol_chef_wae, seq_to_smi_list_func, cuda_details):
    # == Get the products we want to retrosynthesize ==
    with open(product_path, 'r') as fo:
        products = [x.strip() for x in fo.readlines()]

    # == Get graph representation of the ones we care about ==
    graphs = []
    for mol_smi in tqdm.tqdm(products, desc="creating graphs"):
        mol = rdkit_general_ops.get_molecule(mol_smi, kekulize=True)
        mol, am_to_indx_map = rdkit_general_ops.add_atom_mapping(mol)
        mol_as_adj_list = rdkit_featurization_ops.mol_to_atom_feats_and_adjacency_list(mol, am_to_indx_map)
        graph = atom_features_dataset.trfm_mol_as_adj_list_to_graph_as_adj_list_trsfm(mol_as_adj_list)
        graphs.append(graph)

    # == Now regress to latent space & run decoder ==
    batch_size = 500
    predicted_latents = []
    resultant_reactants = []
    for i in tqdm.tqdm(range(math.ceil(len(graphs) / batch_size)), desc="to_z_and_then_bag"):
        graphs_of_batch = graphs[i * batch_size:(i + 1) * batch_size]
        graphs_of_batch = graphs_of_batch[0].concatenate(graphs_of_batch)
        graphs_of_batch = graphs_of_batch.to_torch(cuda_details)

        latents_ = regressor(graphs_of_batch)
        predicted_latents.append(latents_.cpu().numpy())

        seq_, _ = mol_chef_wae.decode_from_z_no_grad(latents_)
        predicted_seqs_batch_first_np = seq_.cpu().numpy().T
        for seq in predicted_seqs_batch_first_np:
            seq_as_mols = seq_to_smi_list_func(seq)
            reactant_str = '.'.join(sorted(seq_as_mols))
            resultant_reactants.append(reactant_str)

    return resultant_reactants
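A minimal, self-contained sketch of the batching arithmetic used above, with plain Python lists standing in for the graph objects (so the concatenate/to_torch calls are omitted): `math.ceil(len(items) / batch_size)` iterations cover every element, the final slice simply being shorter.

import math

def iterate_batches(items, batch_size):
    # Yield consecutive slices of `items`; the last slice may hold fewer than batch_size elements.
    for i in range(math.ceil(len(items) / batch_size)):
        yield items[i * batch_size:(i + 1) * batch_size]

assert list(iterate_batches(list(range(7)), 3)) == [[0, 1, 2], [3, 4, 5], [6]]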
def transform_text_to_qed(text_line):
    molecules = [rdkit_general_ops.get_molecule(mol_str, kekulize=False) for mol_str in text_line.split('.')]
    qed_scores = [QED.qed(mol) for mol in molecules]
    # May have many products, so take the max (given this is what we are optimising for in the
    # optimisation part). Expect this to be less of an issue in practice as USPTO mostly details
    # single-product reactions. It may be interesting to look at using the Molecular Transformer
    # prediction on these reactions rather than this ground truth, and other ways of combining
    # multiple products, e.g. the mean.
    return np.max(qed_scores)
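For reference, a stand-alone sketch of the same QED logic using RDKit directly instead of the repo's rdkit_general_ops wrapper (the wrapper-free form is an assumption; QED.qed and Chem.MolFromSmiles are standard RDKit calls):

import numpy as np
from rdkit import Chem
from rdkit.Chem import QED

def max_qed_plain_rdkit(text_line):
    # Split a '.'-joined product string into molecules and return the best QED score.
    mols = [Chem.MolFromSmiles(s) for s in text_line.split('.')]
    return np.max([QED.qed(m) for m in mols])

print(max_qed_plain_rdkit('CCO.c1ccccc1O'))  # two products: the larger of the two QED values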
def __call__(self, path_to_smiles_file):
    # Convert each SMILES line in the file into a graph; each graph is wrapped in a single-element tuple.
    graphs = []
    with open(path_to_smiles_file, 'r') as fo:
        lines = fo.readlines()
    for mol_smi in tqdm.tqdm(lines):
        mol = rdkit_general_ops.get_molecule(mol_smi, kekulize=True)
        mol, am_to_indx_map = rdkit_general_ops.add_atom_mapping(mol)
        mol_as_adj_list = rdkit_featurization_ops.mol_to_atom_feats_and_adjacency_list(mol, am_to_indx_map)
        graph = atom_features_dataset.trfm_mol_as_adj_list_to_graph_as_adj_list_trsfm(mol_as_adj_list)
        graphs.append((graph,))
    return graphs
def create_shared_dataset_files(reactants_to_reactant_id_dict):
    print("creating shared files")

    # Create file A: the reactant SMILES -> reactant ID map, stored as JSON.
    with open(path.join(mchef_config.get_processed_data_dir(), 'reactants_to_reactant_id.json'), 'w') as fo:
        json.dump(reactants_to_reactant_id_dict, fo)

    # Create file B: the reactant ID -> atom features/adjacency list map, stored as a pickle.
    print("Creating reactant_id to reactant features map.")
    reactant_feats = {}
    for smiles, reactant_id in tqdm.tqdm(reactants_to_reactant_id_dict.items()):
        mol = rdkit_general_ops.get_molecule(smiles, kekulize=True)
        mol, am_to_indx_map = rdkit_general_ops.add_atom_mapping(mol)
        reactant_feats[reactant_id] = rdkit_featurization_ops.mol_to_atom_feats_and_adjacency_list(mol, am_to_indx_map)
    with open(path.join(mchef_config.get_processed_data_dir(), 'reactants_feats.pick'), 'wb') as fo:
        pickle.dump(reactant_feats, fo)
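A small companion sketch showing how the two shared files written above could be read back. It assumes mchef_config is importable exactly as in the surrounding module; the file names mirror those used in create_shared_dataset_files.

import json
import pickle
from os import path

def load_shared_dataset_files():
    data_dir = mchef_config.get_processed_data_dir()
    # File A: reactant SMILES -> reactant ID.
    with open(path.join(data_dir, 'reactants_to_reactant_id.json'), 'r') as fo:
        reactants_to_reactant_id = json.load(fo)
    # File B: reactant ID -> atom features and adjacency list.
    with open(path.join(data_dir, 'reactants_feats.pick'), 'rb') as fo:
        reactant_feats = pickle.load(fo)
    return reactants_to_reactant_id, reactant_feats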
def _canonicalize(smi_str):
    return rdkit_general_ops.return_canoncailised_smiles_str(
        rdkit_general_ops.get_molecule(smi_str, kekulize=False), kekuleSmiles=False)
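A minimal sketch of the same canonicalisation using RDKit directly, on the assumption that the rdkit_general_ops wrappers map onto Chem.MolFromSmiles / Chem.MolToSmiles (MolToSmiles returns canonical SMILES by default, and kekuleSmiles=False keeps the aromatic lowercase notation):

from rdkit import Chem

def canonicalize_plain_rdkit(smi_str):
    # Parse and re-emit the SMILES in RDKit's canonical (non-kekulised) form.
    return Chem.MolToSmiles(Chem.MolFromSmiles(smi_str), kekuleSmiles=False)

print(canonicalize_plain_rdkit('C1=CC=CC=C1O'))  # e.g. 'Oc1ccccc1'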