Beispiel #1
0
def compare_ffs(in_dict, conf_id_tag, out_prefix, mol_slice=None):
    """
    For 2+ SDF files that are analogous in terms of molecules and their
    conformers, assess them by RMSD, TFD, and relative energy differences.

    Every query file is compared against the reference file (the first entry
    of in_dict). The reference file is re-read for each query file. Each
    query conformer is written, with its RMSD and TFD added as SD tags, to
    a new file named '<out_prefix>_<basename of the query SDF file>'.

    Parameters
    ----------
    in_dict : OrderedDict
        dictionary from input file, where key is method and value is dictionary
        first entry should be reference method
        in sub-dictionary, keys are 'sdfile' and 'sdtag'
    conf_id_tag : string
        label of the SD tag that should be the same for matching conformers
        in different files
    out_prefix : string
        prefix joined with an underscore to the base name of each query SDF
        file to name the output SDF file with RMSD and TFD info as SD tags
    mol_slice : numpy slice object, optional
        Passed unchanged to reader.read_mols for both reference and query
        files to select a subset of molecules.
        The resulting integers are numerically sorted and duplicates removed.
        e.g., slices = np.s_[0, 3:5, 6::3] would be parsed to return
        [0, 3, 4, 6, 9, 12, 15, 18, ...]
        Can also parse from end: [-3:] gets the last 3 molecules, and
        [-2:-1] is the same as [-2] to get just next to last molecule.

    Returns
    -------
    enes_full : 3D list
        enes_full[i][j][k] = ddE of ith method, jth mol, kth conformer.
        ddE = (dE of query method) - (dE of ref method),
        where the dE is computed as conformer M - conformer N,
        and conformer N is chosen from the lowest energy of the ref confs.
        the reference method is not present; i.e., self-comparison is skipped,
        so the max i value represents total number of files minus one.
        The innermost level is a numpy array.
    rmsds_full : 3D list
        same format as that of enes_full but with conformer RMSDs
    tfds_full : 3D list
        same format as that of enes_full but with conformer TFDs
    smiles_full : 3D list
        same format as that of enes_full but with the conformer identifiers
        read from conf_id_tag (innermost level is a plain list of strings)

    Raises
    ------
    ValueError
        if a reference/query molecule pair differs in title or number of
        conformers, or if a conformer pair differs in its conf_id_tag value
    IndexError
        if in_dict is empty

    """
    # set RMSD calculation parameters
    automorph = True  # take into acct symmetry related transformations
    heavyOnly = False  # do consider hydrogen atoms for automorphisms
    overlay = True  # find the lowest possible RMSD

    # initiate final data lists
    enes_full = []
    rmsds_full = []
    tfds_full = []
    smiles_full = []

    # the first entry represents the reference geometries
    ref_entry = list(in_dict.values())[0]
    sdf_ref = ref_entry['sdfile']
    tag_ref = ref_entry['sdtag']

    # assess each file against reference
    for ff_label, ff_dict in in_dict.items():

        # get details of queried file
        sdf_que = ff_dict['sdfile']
        tag_que = ff_dict['sdtag']

        # skip self-comparison of the reference
        if sdf_que == sdf_ref:
            continue

        # initiate new sublists
        enes_method = []
        rmsds_method = []
        tfds_method = []
        smiles_method = []

        # open an output file to store query molecules with new SD tags
        out_file = f'{out_prefix}_{os.path.basename(sdf_que)}'
        ofs = oechem.oemolostream()
        if not ofs.open(out_file):
            oechem.OEThrow.Fatal(f"Unable to open {out_file} for writing")

        # a separate stream is opened per query file, so it must be closed
        # per query file as well, including when an error interrupts the loop
        try:
            # load molecules from open reference and query files
            print(f"\n\nOpening reference file {sdf_ref}")
            mols_ref = reader.read_mols(sdf_ref, mol_slice)

            print(f"Opening query file {sdf_que} for [ {ff_label} ] energies")
            mols_que = reader.read_mols(sdf_que, mol_slice)

            # loop over each molecule in reference and query files
            for rmol, qmol in zip(mols_ref, mols_que):

                # initial check that they have same title and number of confs
                rmol_name = rmol.GetTitle()
                rmol_nconfs = rmol.NumConfs()
                if (rmol_name != qmol.GetTitle()) or (rmol_nconfs !=
                                                      qmol.NumConfs()):
                    raise ValueError(
                        "ERROR: Molecules not aligned in iteration. "
                        "Offending molecules and number of conformers:\n"
                        f"\'{rmol_name}\': {rmol_nconfs} nconfs\n"
                        f"\'{qmol.GetTitle()}\': {qmol.NumConfs()} nconfs")

                # initialize lists to store per-conformer data of this mol
                enes_ref = []
                enes_que = []
                rmsds_mol = []
                tfds_mol = []
                smiles_mol = []

                # loop over each conformer of this mol
                for ref_conf, que_conf in zip(rmol.GetConfs(),
                                              qmol.GetConfs()):

                    # check conformer match from the specified tag
                    ref_id = oechem.OEGetSDData(ref_conf, conf_id_tag)
                    que_id = oechem.OEGetSDData(que_conf, conf_id_tag)
                    if ref_id != que_id:
                        raise ValueError(
                            "ERROR: Conformers not aligned in iteration"
                            f" for mol: '{rmol_name}'. The conformer "
                            f"IDs ({conf_id_tag}) for ref and query are:"
                            f"\n{ref_id}\n{que_id}.")

                    # note the conformer identifier
                    smiles_mol.append(ref_id)

                    # get energies
                    enes_ref.append(
                        float(oechem.OEGetSDData(ref_conf, tag_ref)))
                    enes_que.append(
                        float(oechem.OEGetSDData(que_conf, tag_que)))

                    # compute RMSD between reference and query conformers
                    rmsd = oechem.OERMSD(ref_conf, que_conf, automorph,
                                         heavyOnly, overlay)
                    rmsds_mol.append(rmsd)

                    # compute TFD between reference and query conformers
                    tfd = calc_tfd(ref_conf, que_conf, conf_id_tag)
                    tfds_mol.append(tfd)

                    # store data in SD tags for query conf, then write conf
                    oechem.OEAddSDData(que_conf, f'RMSD to {sdf_ref}',
                                       str(rmsd))
                    oechem.OEAddSDData(que_conf, f'TFD to {sdf_ref}',
                                       str(tfd))
                    oechem.OEWriteConstMolecule(ofs, que_conf)

                # compute relative energies against lowest E reference
                # conformer; the same conformer index is used for the query
                lowest_ref_idx = enes_ref.index(min(enes_ref))
                rel_enes_ref = np.array(enes_ref) - enes_ref[lowest_ref_idx]
                rel_enes_que = np.array(enes_que) - enes_que[lowest_ref_idx]

                # subtract to get ddE = dE (query method) - dE (ref method)
                enes_mol = rel_enes_que - rel_enes_ref

                # store then move on
                enes_method.append(enes_mol)
                rmsds_method.append(np.array(rmsds_mol))
                tfds_method.append(np.array(tfds_mol))
                smiles_method.append(smiles_mol)
        finally:
            ofs.close()

        enes_full.append(enes_method)
        rmsds_full.append(rmsds_method)
        tfds_full.append(tfds_method)
        smiles_full.append(smiles_method)

    return enes_full, rmsds_full, tfds_full, smiles_full
Beispiel #2
0
def tailed_parameters(in_sdf, ffxml, cutoff, tag, tag_smiles, metric_type):
    """
    Extract data from SD tags, identify outlier molecules at or above cutoff,
    and get the associated force field parameters for each structure.

    Besides returning the data, this function writes the outlier structures
    to 'outliers_<metric_type>.mol2' (via write_mols) and pickles the tuple
    (data_all, data_out) to 'tailed_<metric_type>.pickle'.

    Parameters
    ----------
    in_sdf : string
        name of the input SDF molecule file with RMSD or TFD info as SD tags
    ffxml : string
        name of the FFXML force field file
    cutoff : float
        cutoff value to use for the metric; structures with value greater
        than or equal to the cutoff are considered outliers
    tag : string
        name of the SD tag in the SDF file with the metric information
    tag_smiles : string
        name of the SD tag in the SDF file with the molecule identifier
    metric_type : string
        what metric the tag and cutoff refer to (e.g., TFD or RMSD)
        for plot and figure labeling; also used in the output file names

    Returns
    -------
    data_all : dict
        key of 'count' has int for count of all structures
        key of 'mols_dict' has dict for dictionary of mols (see note)
        key of 'params_mol' has dict of isosmiles and list of parameter IDs
        key of 'params_id' has dict of parameter IDs and list of isosmiles
        key of 'smi_dict' has dict of isosmiles keys and SD tag identifier
    data_out : dict
        same format as data_all but only containing outlier molecules

    Raises
    ------
    ValueError
        if the data stored under `tag` cannot be converted to float

    Note
    ----
    The mols_dict dictionary is itself a dict of dicts where the first
    level key is the SMILES string (or specified molecular identifier),
    and the value of that key is a dict with the following key/value pairs:
        metric      geometric measurement (outlier dictionary only)
        structure   OEGraphMol of the structure

    Counts are incremented once per conformer, whereas the dictionaries are
    keyed by identifier, so conformers sharing an identifier overwrite one
    another in the dictionaries while still being counted.

    """

    # load molecules from the input file
    print(f"\n\n\nOpening SDF file {in_sdf}...")
    mols = reader.read_mols(in_sdf)
    print(
        f"Looking for outlier molecules with {metric_type.upper()} above {cutoff}...\n"
    )

    # find the molecules with the metric at or above the cutoff
    mols_all = OrderedDict()
    mols_out = OrderedDict()
    count_all = 0
    count_out = 0

    for mol in mols:
        for conf in mol.GetConfs():

            smiles = oechem.OEGetSDData(conf, tag_smiles)

            # read the raw tag once so the error message can reuse it
            raw_value = oechem.OEGetSDData(conf, tag)
            try:
                value = float(raw_value)
            except ValueError as e:
                raise ValueError(
                    "There was an error while obtaining the SD "
                    f"tag data of '{raw_value}'. Did you "
                    f"specify the correct SD tag of '{tag}'?") from e

            if value >= cutoff:
                mols_out[smiles] = {
                    'metric': value,
                    'structure': oechem.OEGraphMol(conf)
                }
                count_out += 1

            mols_all[smiles] = {'structure': oechem.OEGraphMol(conf)}
            count_all += 1

    # save outliers molecules to file
    write_mols(mols_out, f'outliers_{metric_type}.mol2')

    # analyze parameters in the outlier and full sets
    params_mol_out, params_id_out, smi_dict_out = get_parameters(
        mols_out, ffxml)
    params_mol_all, params_id_all, smi_dict_all = get_parameters(
        mols_all, ffxml)

    # organize all computed data to encompassing dictionary
    # all values in data_* are dictionaries except for data_*['count']
    data_all = {
        'count': count_all,
        'mols_dict': mols_all,
        'params_mol': params_mol_all,
        'params_id': params_id_all,
        'smi_dict': smi_dict_all
    }
    data_out = {
        'count': count_out,
        'mols_dict': mols_out,
        'params_mol': params_mol_out,
        'params_id': params_id_out,
        'smi_dict': smi_dict_out
    }

    # save the params organized by id to pickle
    with open(f'tailed_{metric_type}.pickle', 'wb') as f:
        pickle.dump((data_all, data_out), f)

    return data_all, data_out
Beispiel #3
0
def match_minima(in_dict, rmsd_cutoff):
    """
    Match the conformer minima of each method to those of the reference
    method. Ex. Conf G of reference method matches with conf R of method 2.

    Parameters
    ----------
    in_dict : OrderedDict
        dictionary from input file, where key is method and value is dictionary
        first entry should be reference method
        in sub-dictionary, keys are 'sdfile' and 'sdtag'
    rmsd_cutoff : float
        cutoff above which two structures are considered diff conformers

    Returns
    -------
    mol_dict : dict of dicts
        mol_dict['mol_name']['energies'] =
            [[file1_conf1_E file1_conf2_E] [file2_conf1_E file2_conf2_E]]
        An analogous structure is followed for mol_dict['mol_name']['indices'].
        For a molecule absent from a query file, the energies are nan and
        the indices are -2, one per reference conformer. For the reference
        compared against itself, the indices are -1.

    """

    # outer keys are mol names; inner keys are 'energies' and 'indices'
    results = {}

    # the first entry supplies the reference geometries
    ref_file = list(in_dict.values())[0]['sdfile']

    # compare every listed file (reference included) against the reference
    for method, method_info in in_dict.items():
        query_file = method_info['sdfile']
        energy_tag = method_info['sdtag']

        # (re)load both sets of molecules for this method
        print(f"\n\nOpening reference file {ref_file}")
        ref_mols = reader.read_mols(ref_file)

        print(f"Opening query file {query_file} for [ {method} ] energies")
        query_mols = reader.read_mols(query_file)

        for rmol in ref_mols:
            mol_name = rmol.GetTitle()
            ref_nconfs = rmol.NumConfs()

            # one record per molecule, shared across methods; each list in
            # the record gains one row per method
            record = results.setdefault(mol_name,
                                        {'energies': [], 'indices': []})

            # advance through the query molecules until the title matches;
            # equal titles are taken to mean the same molecular identity
            for qmol in query_mols:
                if qmol.GetTitle() == mol_name:
                    break
            else:
                # query molecules ran out without a title match
                print(f"No \"{mol_name}\" molecule found in {sdf_query}"
                      if False else
                      f"No \"{mol_name}\" molecule found in {query_file}")

                # -2 marks the conformer indices as missing
                record['indices'].append([-2] * ref_nconfs)

                # nan stands in for each missing conformer energy
                record['energies'].append([np.nan] * ref_nconfs)

                # start over on the query molecules for the next rmol
                query_mols = reader.read_mols(query_file)
                continue

            # pull the per-conformer SD tag data and convert to floats
            energies = [float(item)
                        for item in reader.get_sd_list(qmol, energy_tag)]
            record['energies'].append(energies)

            # the reference needs its energies stored but no self-matching,
            # so this check has to come after the energy extraction
            if query_file == ref_file:
                print("Skipping comparison against self.")
                record['indices'].append([-1] * ref_nconfs)
                continue

            # indices of qmol conformers that match the rmol conformers
            record['indices'].append(
                compare_two_mols(rmol, qmol, rmsd_cutoff))

    return results