def compare_ffs(in_dict, conf_id_tag, out_prefix, mol_slice=None):
    """
    For 2+ SDF files that are analogous in terms of molecules and their
    conformers, assess them by RMSD, TFD, and relative energy differences.

    As a side effect, every query conformer is written (with its RMSD and
    TFD to the reference stored as SD tags) to a new file named
    ``{out_prefix}_{basename of the query SDF}``.

    Parameters
    ----------
    in_dict : OrderedDict
        dictionary from input file, where key is method and value is
        dictionary. first entry should be reference method. in
        sub-dictionary, keys are 'sdfile' and 'sdtag'
    conf_id_tag : string
        label of the SD tag that should be the same for matching conformers
        in different files
    out_prefix : string
        prefix appended to sdf file name to write out new SDF file
        with RMSD and TFD info added as SD tags
    mol_slice : numpy slice object
        Passed through to reader.read_mols for both reference and query.
        The resulting integers are numerically sorted and duplicates removed.
        e.g., slices = np.s_[0, 3:5, 6::3] would be parsed to return
        [0, 3, 4, 6, 9, 12, 15, 18, ...]
        Can also parse from end: [-3:] gets the last 3 molecules, and
        [-2:-1] is the same as [-2] to get just next to last molecule.

    Returns
    -------
    enes_full : 3D list (innermost level is a numpy array)
        enes_full[i][j][k] = ddE of ith method, jth mol, kth conformer.
        ddE = (dE of query method) - (dE of ref method),
        where the dE is computed as conformer M - conformer N,
        and conformer N is chosen from the lowest energy of the ref confs.
        the reference method is not present; i.e., self-comparison is
        skipped, so the max i value represents total number of files
        minus one.
    rmsds_full : 3D list (innermost level is a numpy array)
        same format as that of enes_full but with conformer RMSDs
    tfds_full : 3D list (innermost level is a numpy array)
        same format as that of enes_full but with conformer TFDs
    smiles_full : 3D list
        same format as that of enes_full but holding, for each conformer,
        the value of the `conf_id_tag` SD tag read from the reference

    Raises
    ------
    ValueError
        if a reference/query molecule pair differs in title or number of
        conformers, if a conformer pair differs in its `conf_id_tag` value,
        or if an energy SD tag cannot be converted to float
    IndexError
        if in_dict is empty (there is no reference entry to read)

    """
    # set RMSD calculation parameters
    automorph = True   # take into acct symmetry related transformations
    heavyOnly = False  # do consider hydrogen atoms for automorphisms
    overlay = True     # find the lowest possible RMSD

    # initiate final data lists
    enes_full = []
    rmsds_full = []
    tfds_full = []
    smiles_full = []

    # the first entry of in_dict represents the reference geometries
    ref_entry = list(in_dict.values())[0]
    sdf_ref = ref_entry['sdfile']
    tag_ref = ref_entry['sdtag']

    # assess each file against reference
    for ff_label, ff_dict in in_dict.items():

        # get details of queried file
        sdf_que = ff_dict['sdfile']
        tag_que = ff_dict['sdtag']

        # skip self-comparison of the reference file
        if sdf_que == sdf_ref:
            continue

        # initiate new sublists
        enes_method = []
        rmsds_method = []
        tfds_method = []
        smiles_method = []

        # open an output file to store query molecules with new SD tags
        out_file = f'{out_prefix}_{os.path.basename(sdf_que)}'
        ofs = oechem.oemolostream()
        if not ofs.open(out_file):
            oechem.OEThrow.Fatal(f"Unable to open {out_file} for writing")

        # the stream must be closed even if a mismatch below raises
        try:
            # load molecules from open reference and query files
            print(f"\n\nOpening reference file {sdf_ref}")
            mols_ref = reader.read_mols(sdf_ref, mol_slice)

            print(f"Opening query file {sdf_que} for [ {ff_label} ] energies")
            mols_que = reader.read_mols(sdf_que, mol_slice)

            # loop over each molecule in reference and query files
            for rmol, qmol in zip(mols_ref, mols_que):

                # initial check that they have same title and number of confs
                rmol_name = rmol.GetTitle()
                rmol_nconfs = rmol.NumConfs()
                if (rmol_name != qmol.GetTitle()) or (
                        rmol_nconfs != qmol.NumConfs()):
                    raise ValueError(
                        "ERROR: Molecules not aligned in iteration. "
                        "Offending molecules and number of conformers:\n"
                        f"'{rmol_name}': {rmol_nconfs} nconfs\n"
                        f"'{qmol.GetTitle()}': {qmol.NumConfs()} nconfs")

                # initialize lists to store conformer energies
                enes_ref = []
                enes_que = []
                rmsds_mol = []
                tfds_mol = []
                smiles_mol = []

                # loop over each conformer of this mol
                for ref_conf, que_conf in zip(rmol.GetConfs(),
                                              qmol.GetConfs()):

                    # check conformer match from the specified tag
                    ref_id = oechem.OEGetSDData(ref_conf, conf_id_tag)
                    que_id = oechem.OEGetSDData(que_conf, conf_id_tag)
                    if ref_id != que_id:
                        raise ValueError(
                            "ERROR: Conformers not aligned in iteration"
                            f" for mol: '{rmol_name}'. The conformer "
                            f"IDs ({conf_id_tag}) for ref and query are:"
                            f"\n{ref_id}\n{que_id}.")

                    # note the conformer identifier
                    smiles_mol.append(ref_id)

                    # get energies
                    enes_ref.append(
                        float(oechem.OEGetSDData(ref_conf, tag_ref)))
                    enes_que.append(
                        float(oechem.OEGetSDData(que_conf, tag_que)))

                    # compute RMSD between reference and query conformers
                    rmsd = oechem.OERMSD(ref_conf, que_conf, automorph,
                                         heavyOnly, overlay)
                    rmsds_mol.append(rmsd)

                    # compute TFD between reference and query conformers
                    tfd = calc_tfd(ref_conf, que_conf, conf_id_tag)
                    tfds_mol.append(tfd)

                    # store data in SD tags for query conf, and write conf
                    oechem.OEAddSDData(que_conf, f'RMSD to {sdf_ref}',
                                       str(rmsd))
                    oechem.OEAddSDData(que_conf, f'TFD to {sdf_ref}',
                                       str(tfd))
                    oechem.OEWriteConstMolecule(ofs, que_conf)

                # compute relative energies against lowest E reference
                # conformer; the same conformer index is the zero point for
                # both the reference and the query energies
                lowest_ref_idx = enes_ref.index(min(enes_ref))
                rel_enes_ref = np.array(enes_ref) - enes_ref[lowest_ref_idx]
                rel_enes_que = np.array(enes_que) - enes_que[lowest_ref_idx]

                # subtract to get ddE = dE (query method) - dE (ref method)
                enes_mol = rel_enes_que - rel_enes_ref

                # store then move on
                enes_method.append(enes_mol)
                rmsds_method.append(np.array(rmsds_mol))
                tfds_method.append(np.array(tfds_mol))
                smiles_method.append(smiles_mol)
        finally:
            ofs.close()

        enes_full.append(enes_method)
        rmsds_full.append(rmsds_method)
        tfds_full.append(tfds_method)
        smiles_full.append(smiles_method)

    return enes_full, rmsds_full, tfds_full, smiles_full
def tailed_parameters(in_sdf, ffxml, cutoff, tag, tag_smiles, metric_type):
    """
    Extract data from SD tags, identify outlier molecules at or above the
    cutoff, and get the associated force field parameters for each
    structure.

    As side effects, the outlier structures are passed to write_mols with
    the file name ``outliers_{metric_type}.mol2``, and the tuple
    (data_all, data_out) is pickled to ``tailed_{metric_type}.pickle``.

    Parameters
    ----------
    in_sdf : string
        name of the input SDF molecule file with RMSD or TFD info as SD tags
    ffxml : string
        name of the FFXML force field file
    cutoff : float
        cutoff value to use for the metric; structures with value greater
        than or equal to the cutoff are considered outliers
    tag : string
        name of the SD tag in the SDF file with the metric information
    tag_smiles : string
        name of the SD tag in the SDF file with the molecule identifier
    metric_type : string
        what metric the tag and cutoff refer to (e.g., TFD or RMSD)
        for plot and figure labeling

    Returns
    -------
    data_all : dict
        key of 'count' has int for count of all structures
        key of 'mols_dict' has dict for dictionary of mols (see note)
        key of 'params_mol' has dict of isosmiles and list of parameter IDs
        key of 'params_id' has dict of parameter IDs and list of isosmiles
        key of 'smi_dict' has dict of isosmiles keys and SD tag identifier
    data_out : dict
        same format as data_all but only containing outlier molecules

    Raises
    ------
    ValueError
        if the value stored under `tag` cannot be converted to float

    Note
    ----
    The mols_dict dictionary is itself a dict of dicts where the first level
    key is the SMILES string (or specified molecular identifier), and the
    value of that key is a dict with the following key/value pairs:
        metric      geometric measurement (stored for outliers only; the
                    entries of data_all['mols_dict'] have no 'metric' key)
        structure   OEGraphMol of the structure

    The counts are incremented once per conformer, whereas the dictionaries
    are keyed by identifier, so a repeated identifier overwrites the earlier
    entry while still being counted.

    """
    # load molecules from the input file
    print(f"\n\n\nOpening SDF file {in_sdf}...")
    mols = reader.read_mols(in_sdf)
    print(
        f"Looking for outlier molecules with {metric_type.upper()} above {cutoff}...\n"
    )

    # find the molecules with the metric at or above the cutoff
    mols_all = OrderedDict()
    mols_out = OrderedDict()
    count_all = 0
    count_out = 0

    for mol in mols:
        for conf in mol.GetConfs():

            smiles = oechem.OEGetSDData(conf, tag_smiles)

            # read the raw tag once so the error message can reuse it
            raw_value = oechem.OEGetSDData(conf, tag)
            try:
                value = float(raw_value)
            except ValueError as e:
                raise ValueError(
                    "There was an error while obtaining the SD "
                    f"tag data of '{raw_value}'. Did you "
                    f"specify the correct SD tag of '{tag}'?") from e

            if value >= cutoff:
                mols_out[smiles] = {
                    'metric': value,
                    'structure': oechem.OEGraphMol(conf)
                }
                count_out += 1

            mols_all[smiles] = {'structure': oechem.OEGraphMol(conf)}
            count_all += 1

    # save outliers molecules to file
    write_mols(mols_out, f'outliers_{metric_type}.mol2')

    # analyze parameters in the outlier and full sets
    params_mol_out, params_id_out, smi_dict_out = get_parameters(
        mols_out, ffxml)
    params_mol_all, params_id_all, smi_dict_all = get_parameters(
        mols_all, ffxml)

    # organize all computed data to encompassing dictionary
    # all values in data_* are dictionaries except for data_*['count']
    data_all = {
        'count': count_all,
        'mols_dict': mols_all,
        'params_mol': params_mol_all,
        'params_id': params_id_all,
        'smi_dict': smi_dict_all
    }
    data_out = {
        'count': count_out,
        'mols_dict': mols_out,
        'params_mol': params_mol_out,
        'params_id': params_id_out,
        'smi_dict': smi_dict_out
    }

    # save both data dictionaries to pickle
    with open(f'tailed_{metric_type}.pickle', 'wb') as f:
        pickle.dump((data_all, data_out), f)

    return data_all, data_out
def match_minima(in_dict, rmsd_cutoff):
    """
    Match the conformer minima of each method to the conformer minima of
    the reference method.

    Ex. Conf G of reference method matches with conf R of method 2.

    Parameters
    ----------
    in_dict : OrderedDict
        dictionary from input file, where key is method and value is
        dictionary. first entry should be reference method. in
        sub-dictionary, keys are 'sdfile' and 'sdtag'
    rmsd_cutoff : float
        cutoff above which two structures are considered diff conformers

    Returns
    -------
    mol_dict : dict of dicts
        mol_dict['mol_name']['energies'] =
            [[file1_conf1_E file1_conf2_E] [file2_conf1_E file2_conf2_E]]
        An analogous structure is followed for mol_dict['mol_name']['indices'].
        When a reference molecule has no same-titled molecule in the query
        file, its indices are filled with -2 and its energies with nan,
        one per reference conformer. For the comparison of the reference
        file against itself, the indices are filled with -1.

    """
    # 1st layer keyed by mol title, 2nd layer holds per-method lists
    matched = {}

    # the reference geometries come from the first entry of in_dict
    ref_file = list(in_dict.values())[0]['sdfile']

    # go through every method, the reference included
    for method_label, method_info in in_dict.items():
        que_file = method_info['sdfile']
        ene_tag = method_info['sdtag']
        is_self = que_file == ref_file

        # (re)load both files for this method
        print(f"\n\nOpening reference file {ref_file}")
        ref_mols = reader.read_mols(ref_file)
        print(f"Opening query file {que_file} for [ {method_label} ] energies")
        que_mols = reader.read_mols(que_file)

        for rmol in ref_mols:
            title = rmol.GetTitle()
            n_ref = rmol.NumConfs()

            # make sure this mol has an entry; each sub-list gains one
            # item per method, indexed by conformer
            entry = matched.setdefault(title, {'energies': [], 'indices': []})

            # advance through the query molecules until the titles agree;
            # same title is taken to mean same molecular identity
            # NOTE(review): if read_mols returns a one-shot generator, the
            # search resumes where the previous one stopped, so a query file
            # ordered differently from the reference may report molecules
            # as missing -- confirm against reader.read_mols
            for qmol in que_mols:
                if qmol.GetTitle() == title:
                    break
            else:
                # nothing in the query carries this title
                print(f"No \"{title}\" molecule found in {que_file}")

                # -2 flags the conformer indices, nan flags the energies
                entry['indices'].append([-2] * n_ref)
                entry['energies'].append([np.nan] * n_ref)

                # start the query molecules over for the next reference mol
                que_mols = reader.read_mols(que_file)
                continue

            # energies of all query conformers, read from the SD tag
            energies = [float(item)
                        for item in reader.get_sd_list(qmol, ene_tag)]
            entry['energies'].append(energies)

            # energies are still recorded for the reference itself, but
            # there is nothing to match against
            if is_self:
                print("Skipping comparison against self.")
                entry['indices'].append([-1] * n_ref)
                continue

            # indices of qmol conformers that match rmol conformers
            entry['indices'].append(
                compare_two_mols(rmol, qmol, rmsd_cutoff))

    return matched