def create_final_evaluation_dataset(args):
    """
    Builds the non-filtered version of the test dataset, in which the non-reactive substructures are kept and the
    compounds are handled like real unknown input compounds without mapping or known reaction class.

    The test split of the fold 'args.evaluation_config.best_fold' is read, one entry is generated for every bond
    of every product molecule, and the resulting table is pickled to
    'args.evaluation_config.final_evaluation_dataset'. Nothing is returned.
    """

    # Column layout of the stored dataset. The order has to match the tuples assembled in the loop below.
    column_names = [
        "patent_id", "bond_id", "bond_atoms", "bond_fp", "ext_bond_atoms", "ext_bond_fp", "in_core",
        "reaction_cores",
        "non_reactive_smiles", "non_reactive_smols", "non_reactive_smals", "non_reactive_fps",
        "ext_non_reactive_smiles", "ext_non_reactive_smols", "ext_non_reactive_smals", "ext_non_reactive_fps",
        "reaction_smiles", "reaction_class", "reactants_uq_mol_maps"
    ]

    def build_bond_fp(mol, atom_indices):
        # Fingerprint of the given atoms, using the input configuration selected during the evaluation.
        input_config = args.evaluation_config.best_input_config

        if input_config["type"] == "ecfp":
            return construct_ecfp(mol,
                                  radius=input_config["radius"],
                                  bits=input_config["bits"],
                                  from_atoms=atom_indices,
                                  output_type="np_array",
                                  as_type="np_float")

        return construct_hsfp(mol,
                              radius=input_config["radius"],
                              bits=input_config["bits"],
                              from_atoms=atom_indices,
                              neighbourhood_ext=input_config["ext"])

    def build_similarity_fps(fragment_mols):
        # EC fingerprints of the fragments, using the similarity search configuration.
        search_config = args.descriptor_config.similarity_search

        return [
            construct_ecfp(fragment_mol, radius=search_config["radius"], bits=search_config["bits"])
            for fragment_mol in fragment_mols
        ]

    # Load the test split that belongs to the best fold.
    test_data_path = args.dataset_config.output_folder + \
        "fold_{}/test_data.pkl".format(args.evaluation_config.best_fold)
    test_dataset = pd.read_pickle(test_data_path)

    entries = []

    progress = tqdm(test_dataset.iterrows(),
                    total=len(test_dataset.index),
                    ascii=True,
                    desc="Generating the non-filtered version of the test dataset")

    for row_ind, row in progress:
        # Only the product molecules of the reaction are processed.
        _, _, products = parse_reaction_roles(row["reaction_smiles"], as_what="mol_no_maps")

        # The second element of the reaction core result is indexed per product molecule below.
        products_reaction_cores = get_reaction_core_atoms(row["reaction_smiles"])[1]

        for p_ind, product in enumerate(products):
            for bond in product.GetBonds():
                begin_idx, end_idx = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()

                # The atoms of the current bond and their extended neighbourhood.
                bond_atoms = {begin_idx, end_idx}
                ext_bond_atoms = get_atom_environment(bond_atoms, product)

                bond_fp = build_bond_fp(product, bond_atoms)
                ext_bond_fp = build_bond_fp(product, ext_bond_atoms)

                # A bond counts as part of the core as soon as one of its atoms is a core atom.
                product_core = products_reaction_cores[p_ind]
                in_core = begin_idx in product_core or end_idx in product_core

                # Only the non-reactive halves of the extracted information are stored.
                _, non_reactive_part = extract_info_from_molecule(product, bond_atoms)
                _, ext_non_reactive_part = extract_info_from_molecule(product, ext_bond_atoms)

                non_reactive_fps = build_similarity_fps(non_reactive_part[2])
                ext_non_reactive_fps = build_similarity_fps(ext_non_reactive_part[2])

                entries.append((
                    row["patent_id"] + "_{}".format(row_ind),
                    bond.GetIdx(),
                    bond_atoms,
                    bond_fp,
                    ext_bond_atoms,
                    ext_bond_fp,
                    in_core,
                    products_reaction_cores,
                    non_reactive_part[0],
                    non_reactive_part[2],
                    non_reactive_part[3],
                    non_reactive_fps,
                    ext_non_reactive_part[0],
                    ext_non_reactive_part[2],
                    ext_non_reactive_part[3],
                    ext_non_reactive_fps,
                    row["reaction_smiles"],
                    # Bonds outside of the reaction core are labelled with the class 0.
                    row["reaction_class"] if in_core else 0,
                    row["reactants_uq_mol_maps"]))

    # Save the final evaluation dataset as a .pkl file.
    final_dataset = pd.DataFrame(entries, columns=column_names)
    final_dataset.to_pickle(args.evaluation_config.final_evaluation_dataset)
def generate_unique_compound_pools(args):
    """
    Generates and stores unique (RDKit Canonical SMILES) chemical compound pools of the reactants and products for
    a chemical reaction dataset.

    The dataset is read as a .csv file from 'args.dataset_config.raw_dataset' and needs to contain a column named
    'rxn_smiles' in which the values for the mapped reaction SMILES strings are stored, and a column named 'class'
    holding the reaction class of each entry. The two pools are written to 'unique_reactants_pool.pkl' and
    'unique_products_pool.pkl' in 'args.dataset_config.output_folder'. Nothing is returned.
    """

    reactant_pool_smiles, product_pool_smiles, reactant_pool_mol, product_pool_mol = [], [], [], []
    reactant_reaction_class, product_reaction_class = [], []

    # Read the raw original chemical reaction dataset.
    raw_dataset = pd.read_csv(args.dataset_config.raw_dataset)

    # Iterate through the chemical reaction entries and collect the reactant and product compounds.
    # Reagents are skipped in this research.
    for _, row in tqdm(raw_dataset.iterrows(),
                       total=len(raw_dataset.index),
                       desc="Generating unique reactant and product compound representations"):
        # Extract and save the canonical SMILES from the reaction.
        reactant_smiles, _, product_smiles = parse_reaction_roles(row["rxn_smiles"],
                                                                  as_what="canonical_smiles_no_maps")
        reactant_pool_smiles.extend(reactant_smiles)
        product_pool_smiles.extend(product_smiles)

        # Extract and save the RDKit Mol objects from the reaction.
        reactant_mols, _, product_mols = parse_reaction_roles(row["rxn_smiles"], as_what="mol_no_maps")
        reactant_pool_mol.extend(reactant_mols)
        product_pool_mol.extend(product_mols)

        # Save the reaction class of the entry once for every participating compound.
        reactant_reaction_class.extend(row["class"] for _ in reactant_mols)
        product_reaction_class.extend(row["class"] for _ in product_mols)

    # Replace every saved reaction class by the set of all classes in which the same compound occurs.
    reactant_reaction_class = _aggregate_reaction_classes(
        reactant_pool_smiles, reactant_reaction_class,
        desc="Aggregating reaction class values for the reactant compounds")
    product_reaction_class = _aggregate_reaction_classes(
        product_pool_smiles, product_reaction_class,
        desc="Aggregating reaction class values for the product compounds")

    print("Filtering unique reactant and product compounds...", end="")

    # Filter out duplicate molecules. The returned indices point at the first occurrence of each unique SMILES.
    reactant_pool_smiles, reactants_uq_ind = np.unique(reactant_pool_smiles, return_index=True)
    product_pool_smiles, products_uq_ind = np.unique(product_pool_smiles, return_index=True)

    # Apply the unique indices to the RDKit Mol objects and reaction classes. The Python lists are indexed
    # directly so that the stored objects never have to pass through a NumPy array conversion.
    reactant_pool_mol = [reactant_pool_mol[uq_ind] for uq_ind in reactants_uq_ind]
    product_pool_mol = [product_pool_mol[uq_ind] for uq_ind in products_uq_ind]
    reactant_reaction_class = [reactant_reaction_class[uq_ind] for uq_ind in reactants_uq_ind]
    product_reaction_class = [product_reaction_class[uq_ind] for uq_ind in products_uq_ind]

    print("done.")

    # Generate the fingerprints for similarity searching purposes and store both compound pools.
    _store_compound_pool(args, "reactant", reactant_pool_smiles, reactant_pool_mol, reactant_reaction_class)
    _store_compound_pool(args, "product", product_pool_smiles, product_pool_mol, product_reaction_class)


def _aggregate_reaction_classes(pool_smiles, pool_classes, desc):
    """
    Returns a list parallel to 'pool_smiles' in which every entry is the set of all values from 'pool_classes'
    that were saved for the same SMILES string. The progress is displayed with the description 'desc'.
    """

    # Imported locally so that the helper only relies on names that are already available in the file.
    from collections import defaultdict

    # Group the class values per compound in a single pass instead of rescanning the pool for every entry.
    classes_per_compound = defaultdict(set)

    for smiles, reaction_class in tqdm(zip(pool_smiles, pool_classes), total=len(pool_smiles), desc=desc):
        classes_per_compound[smiles].add(reaction_class)

    # Every row receives its own copy of the aggregated set.
    return [set(classes_per_compound[smiles]) for smiles in pool_smiles]


def _store_compound_pool(args, compound_role, pool_smiles, pool_mols, pool_classes):
    """
    Generates the similarity search fingerprints for a unique compound pool and stores the pool as a .pkl file
    named 'unique_<compound_role>s_pool.pkl' in the output folder. 'compound_role' is 'reactant' or 'product'.
    """

    search_config = args.descriptor_config.similarity_search

    # The fingerprints are constructed from the unique canonical SMILES strings.
    ecfp_1024 = [
        construct_ecfp(uq_smiles, radius=search_config["radius"], bits=search_config["bits"])
        for uq_smiles in tqdm(pool_smiles,
                              total=len(pool_smiles),
                              desc="Generating {} compound fingerprints".format(compound_role))
    ]

    print("Saving the processed {} compound data...".format(compound_role), end="")

    # The column is always named 'ecfp_1024', whatever number of bits is configured.
    pd.DataFrame({"mol_id": list(range(0, len(pool_smiles))),
                  "canonical_smiles": pool_smiles,
                  "mol_object": pool_mols,
                  "ecfp_1024": ecfp_1024,
                  "reaction_class": pool_classes}).\
        to_pickle(args.dataset_config.output_folder + "unique_{}s_pool.pkl".format(compound_role))

    print("done.")
def generate_fps_from_reaction_products(reaction_smiles, fp_data_configs):
    """
    Generates the specified fingerprints for both the reactive and non-reactive substructures of the product
    molecules that are participating in the chemical reaction.

    'fp_data_configs' is iterated once per product molecule. Every configuration is indexed with the keys
    'type', 'radius' and 'bits', and additionally 'ext' whenever 'type' is not 'ecfp'.

    Returns two lists: the reactive and the non-reactive fingerprints. Each of them contains one list of
    fingerprints per (product, configuration) combination, ordered product by product.
    """

    def build_fp(mol, atom_indices, fp_config):
        # Generate an EC or HS fingerprint of the given atoms, depending on the configuration type.
        if fp_config["type"] == "ecfp":
            return construct_ecfp(mol,
                                  radius=fp_config["radius"],
                                  bits=fp_config["bits"],
                                  from_atoms=atom_indices,
                                  output_type="np_array",
                                  as_type="np_float")

        return construct_hsfp(mol,
                              radius=fp_config["radius"],
                              bits=fp_config["bits"],
                              from_atoms=atom_indices,
                              neighbourhood_ext=fp_config["ext"])

    # Generate the RDKit Mol representations of the product molecules and generate the reaction cores.
    _, _, products = parse_reaction_roles(reaction_smiles, as_what="mol_no_maps")
    reaction_cores = get_reaction_core_atoms(reaction_smiles)

    # Separate the reaction cores if they consist out of multiple non-neighbouring parts.
    separated_cores = get_separated_cores(reaction_smiles, reaction_cores)

    # Define variables which will be used for storing the results.
    total_reactive_fps, total_non_reactive_fps = [], []

    # Iterate through the product molecules and generate fingerprints for all reactive and non-reactive substructures.
    for p_ind, product in enumerate(products):
        # The selection of the non-reactive bonds does not depend on the fingerprint configuration, so it is
        # performed once per product instead of once per configuration.
        extended_core_env = get_atom_environment(reaction_cores[1][p_ind], product, degree=1)

        # A bond is non-reactive if its extended environment does not overlap with the extended reaction core.
        non_reactive_bond_atoms = [
            (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
            for bond in product.GetBonds()
            if not get_bond_environment(bond, product, degree=1).intersection(extended_core_env)
        ]

        # Iterate through all of the dataset configurations.
        for fp_config in fp_data_configs:
            # Generate fingerprints from the reactive substructures i.e. the reaction core(s).
            reactive_fps = [build_fp(product, core, fp_config) for core in separated_cores[1][p_ind]]

            # Generate fingerprints from the non-reactive substructures. A fresh atom list is passed to every call.
            non_reactive_fps = [
                build_fp(product, list(bond_atoms), fp_config) for bond_atoms in non_reactive_bond_atoms
            ]

            # Append the generated fingerprints to the final list.
            total_reactive_fps.append(reactive_fps)
            total_non_reactive_fps.append(non_reactive_fps)

    # Return all of the generated fingerprints.
    return total_reactive_fps, total_non_reactive_fps
def extract_relevant_information(reaction_smiles, uq_reactant_mols_pool, uq_product_mols_pool, fp_params):
    """
    Collects the necessary information from a single mapped reaction SMILES string.

    For the reactants and then for the products, the following lists are returned in this order: the positions of
    the compounds in the respective unique pool, followed by the SMILES, the two kinds of fragment molecule
    entries and the EC fingerprints of the reactive fragments, followed by the same four lists for the
    non-reactive fragments. 'fp_params' supplies the 'radius' and 'bits' values for the fingerprints.
    """

    def largest_first(mols, smiles):
        # Sort the (Mol, SMILES) pairs in descending order by the number of atoms and split them up again.
        ordered_pairs = sorted(zip(mols, smiles), key=lambda pair: len(pair[0].GetAtoms()), reverse=True)
        return zip(*ordered_pairs)

    def similarity_fp(fragment):
        return construct_ecfp(fragment, radius=fp_params["radius"], bits=fp_params["bits"])

    # Extract the canonical SMILES and RDKit Mol objects from the reaction SMILES string.
    reactant_smiles, _, product_smiles = parse_reaction_roles(reaction_smiles,
                                                              as_what="canonical_smiles_no_maps")
    reactants, _, products = parse_reaction_roles(reaction_smiles, as_what="mol_no_maps")

    # The largest reactant and the largest product are always placed first.
    reactants, reactant_smiles = largest_first(reactants, reactant_smiles)
    products, product_smiles = largest_first(products, product_smiles)

    r_uq_mol_maps, p_uq_mol_maps = [], []
    rr_smiles, rr_smols, rr_smals, rr_fps = [], [], [], []
    rnr_smiles, rnr_smols, rnr_smals, rnr_fps = [], [], [], []
    pr_smiles, pr_smols, pr_smals, pr_fps = [], [], [], []
    pnr_smiles, pnr_smols, pnr_smals, pnr_fps = [], [], [], []

    # Extract the reactive and non-reactive parts of the reactant and product molecules.
    reactant_frags, product_frags = extract_info_from_reaction(reaction_smiles)

    # Reactants: every fragment entry is stored as a single item per reactant.
    for r_ind, r_smiles in enumerate(reactant_smiles):
        r_uq_mol_maps.append(uq_reactant_mols_pool.index(r_smiles))

        reactive, non_reactive = reactant_frags[r_ind][0], reactant_frags[r_ind][1]

        rr_smiles.append(reactive[0])
        rnr_smiles.append(non_reactive[0])
        rr_smols.append(reactive[2])
        rnr_smols.append(non_reactive[2])
        rr_smals.append(reactive[3])
        rnr_smals.append(non_reactive[3])

        rr_fps.append(similarity_fp(reactive[2]))
        rnr_fps.append(similarity_fp(non_reactive[2]))

    # Products: the fragment entries are collections, which are flattened into the result lists.
    for p_ind, p_smiles in enumerate(product_smiles):
        p_uq_mol_maps.append(uq_product_mols_pool.index(p_smiles))

        reactive, non_reactive = product_frags[p_ind][0], product_frags[p_ind][1]

        pr_smiles.extend(reactive[0])
        pnr_smiles.extend(non_reactive[0])
        pr_smols.extend(reactive[2])
        pnr_smols.extend(non_reactive[2])
        pr_smals.extend(reactive[3])
        pnr_smals.extend(non_reactive[3])

        pr_fps.extend([similarity_fp(fragment) for fragment in reactive[2]])
        pnr_fps.extend([similarity_fp(fragment) for fragment in non_reactive[2]])

    # Return the extracted information.
    return (r_uq_mol_maps, rr_smiles, rr_smols, rr_smals, rr_fps,
            rnr_smiles, rnr_smols, rnr_smals, rnr_fps,
            p_uq_mol_maps, pr_smiles, pr_smols, pr_smals, pr_fps,
            pnr_smiles, pnr_smols, pnr_smals, pnr_fps)