Ejemplo n.º 1
0
def substructure_search(db,
                        sub_structure,
                        limit,
                        parent_filter=None,
                        model_db=None,
                        search_projection=DEFAULT_PROJECTION.copy()):
    """Returns compounds in the indicated database which contain the provided
    structure

    Candidates are first selected with a fingerprint query (documents whose
    "RDKit" field contains every on-bit of the query fingerprint) and each
    candidate is then confirmed with an explicit substructure match.

    :param db: DB to search
    :type db: A Mongo Database
    :param sub_structure: A molecule in Molfile or SMILES format
    :type sub_structure: str
    :param limit: The maximum number of compounds to return
    :type limit: int
    :param parent_filter: Forwarded to score_compounds as its third argument.
        Results are only scored when this is truthy and model_db is given.
    :param model_db: Forwarded to score_compounds as its first argument
    :param search_projection: The fields which should be returned. Returned
        documents must carry a "SMILES" field, which is read for every
        candidate.
    :type search_projection: dict
    :raises ValueError: If sub_structure cannot be parsed
    :return: Query results
    :rtype: list
    """
    substructure_search_results = []
    # Create Mol object from Molfile (has newlines)
    if "\n" in sub_structure:
        mol = AllChem.MolFromMolBlock(str(sub_structure))
    # Create Mol object from SMILES string (does not have newlines)
    else:
        mol = AllChem.MolFromSmiles(str(sub_structure))
    if not mol:
        raise ValueError("Unable to parse sub_structure")

    # List the indices of all on-bits in the query's RDKit fingerprint; the
    # "$all" query below keeps only documents whose stored "RDKit" array
    # contains every one of them.
    query_fp = list(AllChem.RDKFingerprint(mol).GetOnBits())
    candidates = db.compounds.find({"RDKit": {"$all": query_fp}},
                                   search_projection)
    for x in candidates:
        # Get Mol object from the candidate's SMILES string (rdkit)
        comp = AllChem.MolFromSmiles(x['SMILES'])
        # The fingerprint query only selects candidates, so confirm each one
        # with HasSubstructMatch (rdkit). Candidates whose SMILES cannot be
        # parsed are skipped. Stop once `limit` results are collected.
        if comp and comp.HasSubstructMatch(mol):
            substructure_search_results.append(x)
            if len(substructure_search_results) == limit:
                break

    # Compare model_db against None instead of testing its truth value:
    # PyMongo 4+ Database objects raise NotImplementedError from bool().
    if parent_filter and model_db is not None:
        substructure_search_results = \
            score_compounds(model_db, substructure_search_results,
                            parent_filter)

    return substructure_search_results
Ejemplo n.º 2
0
def structure_search(db,
                     comp_structure,
                     stereo=True,
                     parent_filter=None,
                     model_db=None,
                     search_projection=DEFAULT_PROJECTION.copy()):
    """Returns compounds in the indicated database which are exact matches to
    the provided structure

    :param db: DB to search
    :type db: A Mongo Database
    :param comp_structure: A molecule in Molfile or SMILES format
    :type comp_structure: str
    :param stereo: If true, uses stereochemistry in finding exact match (the
        full InChIKey is looked up via quick_search). If false, every
        compound whose "Inchikey" starts with the first hyphen-separated
        block of the query's InChIKey is returned.
    :type stereo: bool
    :param parent_filter: Forwarded to score_compounds as its third argument.
        Results are only scored when this is truthy and model_db is given.
    :param model_db: Forwarded to score_compounds as its first argument
    :param search_projection: The fields which should be returned
    :type search_projection: dict
    :raises ValueError: If comp_structure cannot be parsed
    :return: Query results
    :rtype: list
    """
    # Create Mol object from Molfile (has newlines)
    if "\n" in comp_structure:
        mol = AllChem.MolFromMolBlock(str(comp_structure))
    # Create Mol object from SMILES string (does not have newlines)
    else:
        mol = AllChem.MolFromSmiles(str(comp_structure))
    if not mol:
        raise ValueError("Unable to parse comp_structure")

    # Get InChI string from mol file (rdkit)
    inchi = AllChem.MolToInchi(mol)
    # Get InChI key from InChI string (rdkit)
    inchi_key = AllChem.InchiToInchiKey(inchi)
    # Sure, we could look for a matching SMILES but this is faster
    if stereo:
        results = quick_search(db, inchi_key, search_projection)
    else:
        # Anchored prefix match on the first block of the InChIKey only.
        first_block = inchi_key.split('-')[0]
        results = list(
            db.compounds.find({"Inchikey": {'$regex': '^' + first_block}},
                              search_projection))

    # Compare model_db against None instead of testing its truth value:
    # PyMongo 4+ Database objects raise NotImplementedError from bool().
    if parent_filter and model_db is not None:
        results = score_compounds(model_db, results, parent_filter)

    return results
Ejemplo n.º 3
0
def ms2_search(db, keggdb, text, text_type, ms_params):
    """Search for compounds matching MS2 spectra.

    Parameters
    ----------
    db : Mongo DB
        Contains compound documents to search.
    keggdb : Mongo DB
        Contains models with associated compound documents.
    text : str
        Text as in metabolomics datafile for specific peak. For text_type
        'form', the first token of the first non-blank line is the precursor
        m/z and every following non-blank line holds two whitespace-separated
        numbers (read as m/z and intensity).
    text_type : str
        Type of metabolomics datafile ('mgf', 'mzXML'/'mzxml', and 'msp' are
        supported), or 'form' for plain newline-separated text.
    ms_params : dict
        Specifies search settings, using the following key-value pairs:
        ------------------------
        Required Key-Value Pairs
        ------------------------
        'tolerance': float specifying tolerance for m/z, in mDa by default.
            Can specify in ppm if 'ppm' key's value is set to True.
        'charge': bool (1 for positive, 0 for negative).
        'energy_level': int specifying fragmentation energy level to use. May
            be 10, 20, or 40.
        'scoring_function': str describing which scoring function to use. Can
            be either 'jaccard' or 'dot product'.
        ------------------------
        Optional Key-Value Pairs
        ------------------------
        'adducts': list of adducts to use. If not specified, uses all adducts.
        'models': List of model _ids. If supplied, score compounds higher if
            present in model. ['eco'] is used when missing or empty.
        'ppm': bool specifying whether 'tolerance' is in mDa or ppm. Default
            value for ppm is False (so tolerance is in mDa by default).
        'kovats': length 2 tuple specifying min and max kovats retention index
            to filter compounds (e.g. (500, 1000)).
        'logp': length 2 tuple specifying min and max logp to filter compounds
            (e.g. (-1, 2)).
        'halogens': bool specifying whether to filter out compounds containing
            F, Cl, or Br. Filtered out if set to True. False by default.

    Returns
    -------
    ms_adduct_output : list
        Compound JSON documents matching ms2 search query.

    Raises
    ------
    IOError
        If text_type is not one of the supported types.
    ValueError
        If ms_params' scoring function is neither 'jaccard' nor
        'dot product' (checked when the first peak is scored).
    """
    # NOTE(review): the label reads "MS Adduct Search" although this is the
    # MS2 search; the text is kept unchanged in case logs are parsed.
    print("<MS Adduct Search: TextType=%s, Parameters=%s>" %
          (text_type, ms_params))
    name = text_type + time.strftime("_%d-%m-%Y_%H:%M:%S", time.localtime())

    if isinstance(ms_params, dict):
        ms_params = Struct(**ms_params)

    dataset = MetabolomicsDataset(name, ms_params)
    ms_adduct_output = []

    if text_type == 'form':
        # Tokenise every non-blank line; blank lines (e.g. from a pasted
        # form) would otherwise break the two-value unpacking below.
        rows = [line.split() for line in text.split('\n') if line.strip()]
        ms2_data = [(float(mz), float(i)) for mz, i in rows[1:]]
        # Positional arguments are passed exactly as before; their meaning is
        # defined by Peak.
        peak = Peak(rows[0][0],
                    0,
                    float(rows[0][0]),
                    ms_params.charge,
                    "False",
                    ms2=ms2_data)
        dataset.unk_peaks.append(peak)
    elif text_type == 'mgf':
        dataset.unk_peaks = read_mgf(text, ms_params.charge)
    # Accept both spellings, matching ms_adduct_search.
    elif text_type in ('mzXML', 'mzxml'):
        dataset.unk_peaks = read_mzxml(text, ms_params.charge)
    elif text_type == 'msp':
        dataset.unk_peaks = read_msp(text, ms_params.charge)
    else:
        raise IOError('%s files not supported' % text_type)

    # 'models' is documented as optional, so tolerate its absence.
    if not getattr(ms_params, 'models', None):
        ms_params.models = ['eco']

    dataset.native_set = get_KEGG_comps(db, keggdb, ms_params.models)
    dataset.annotate_peaks(db)

    for peak in dataset.unk_peaks:
        if ms_params.scoring_function == 'jaccard':
            metric = jaccard
        elif ms_params.scoring_function == 'dot product':
            metric = dot_product
        else:
            raise ValueError("ms_params['scoring_function'] must be either "
                             "'jaccard' or 'dot product'.")

        score_kwargs = {'metric': metric,
                        'energy_level': ms_params.energy_level}
        # 'ppm' is documented as optional and False by default. The tolerance
        # is given in mDa and divided by 1000 before being forwarded.
        # NOTE(review): when ppm is set, no tolerance is forwarded at all --
        # confirm this is intended.
        if not getattr(ms_params, 'ppm', False):
            score_kwargs['tolerance'] = float(ms_params.tolerance) / 1000
        peak.score_isomers(**score_kwargs)

        ms_adduct_output.extend(peak.isomers)

    # Score once after all peaks are collected, as ms_adduct_search does,
    # instead of re-scoring the cumulative list for every peak.
    ms_adduct_output = score_compounds(db,
                                       ms_adduct_output,
                                       ms_params.models[0],
                                       parent_frac=.75,
                                       reaction_frac=.25)

    return ms_adduct_output
Ejemplo n.º 4
0
def ms_adduct_search(db, keggdb, text, text_type, ms_params):
    """Search for compound-adducts matching precursor mass.

    Parameters
    ----------
    db : Mongo DB
        Contains compound documents to search.
    keggdb : Mongo DB
        Contains models with associated compound documents.
    text : str
        Text as in metabolomics datafile for specific peak. For text_type
        'form', one m/z value per line; blank lines are ignored.
    text_type : str
        Type of metabolomics datafile ('mgf', 'mzXML'/'mzxml', and 'msp' are
        supported), or 'form' for plain newline-separated m/z values.
    ms_params : dict
        Specifies search settings, using the following key-value pairs:
        ------------------------
        Required Key-Value Pairs
        ------------------------
        'tolerance': float specifying tolerance for m/z, in mDa by default.
            Can specify in ppm if 'ppm' key's value is set to True.
        'charge': bool (1 for positive, 0 for negative).
        ------------------------
        Optional Key-Value Pairs
        ------------------------
        'adducts': list of adducts to use. If not specified, uses all adducts.
        'models': List of model _ids. If supplied, score compounds higher if
            present in model. ['eco'] by default (E. coli).
        'ppm': bool specifying whether 'tolerance' is in mDa or ppm. Default
            value for ppm is False (so tolerance is in mDa by default).
        'kovats': length 2 tuple specifying min and max kovats retention index
            to filter compounds (e.g. (500, 1000)).
        'logp': length 2 tuple specifying min and max logp to filter compounds
            (e.g. (-1, 2)).
        'halogens': bool specifying whether to filter out compounds containing
            F, Cl, or Br. Filtered out if set to True. False by default.

    Returns
    -------
    ms_adduct_output : list
        Compound JSON documents matching ms adduct query.

    Raises
    ------
    IOError
        If text_type is not one of the supported types.
    ValueError
        If a non-blank 'form' line cannot be converted to float.
    """
    print("<MS Adduct Search: TextType=%s, Text=%s, Parameters=%s>" %
          (text_type, text, ms_params))
    name = text_type + time.strftime("_%d-%m-%Y_%H:%M:%S", time.localtime())

    if isinstance(ms_params, dict):
        ms_params = Struct(**ms_params)

    dataset = MetabolomicsDataset(name, ms_params)
    ms_adduct_output = []

    if text_type == 'form':
        for line in text.split('\n'):
            mz = line.strip()
            # A trailing newline or an empty line would otherwise reach
            # float('') and abort the whole search.
            if not mz:
                continue
            # Positional arguments are passed exactly as before (the m/z text
            # doubles as the first argument); their meaning is defined by
            # Peak.
            dataset.unk_peaks.append(
                Peak(mz, 0, float(mz), ms_params.charge, "False"))
    elif text_type == 'mgf':
        dataset.unk_peaks = read_mgf(text, ms_params.charge)
    elif text_type in ('mzXML', 'mzxml'):
        dataset.unk_peaks = read_mzxml(text, ms_params.charge)
    elif text_type == 'msp':
        dataset.unk_peaks = read_msp(text, ms_params.charge)
    else:
        raise IOError('%s files not supported' % text_type)

    # 'models' is documented as optional, so tolerate its absence.
    if not getattr(ms_params, 'models', None):
        ms_params.models = ['eco']

    dataset.native_set = get_KEGG_comps(db, keggdb, ms_params.models)
    dataset.annotate_peaks(db)

    for peak in dataset.unk_peaks:
        for hit in peak.isomers:
            # Drop the 'CFM_spectra' entry from each hit before returning it.
            hit.pop('CFM_spectra', None)
            ms_adduct_output.append(hit)

    # Only the first model id is used for scoring.
    ms_adduct_output = score_compounds(db,
                                       ms_adduct_output,
                                       ms_params.models[0],
                                       parent_frac=.75,
                                       reaction_frac=.25)

    return ms_adduct_output
Ejemplo n.º 5
0
def similarity_search(db,
                      comp_structure,
                      min_tc,
                      limit,
                      parent_filter=None,
                      model_db=None,
                      fp_type='RDKit',
                      search_projection=DEFAULT_PROJECTION.copy()):
    """Returns compounds in the indicated database which have structural
    similarity to the provided compound

    :param db: DB to search
    :type db: A Mongo Database
    :param comp_structure: A molecule in Molfile or SMILES format
    :type comp_structure: str
    :param min_tc: Minimum Tanimoto score
    :type min_tc: float
    :param limit: The maximum number of compounds to return
    :type limit: int
    :param parent_filter: Forwarded to score_compounds as its third argument.
        Results are only scored when this is truthy and model_db is given.
    :param model_db: Forwarded to score_compounds as its first argument
    :param fp_type: Fingerprint type. Currently accepts MACCS or RDKit
    :type fp_type: str
    :param search_projection: The fields which should be returned. It is not
        modified; the fingerprint field is added to a private copy and
        removed again from every returned document.
    :type search_projection: dict
    :raises ValueError: If comp_structure cannot be parsed or fp_type is not
        supported
    :return: Query results
    :rtype: list
    """
    similarity_search_results = []
    fp_type = str(fp_type)
    # Create Mol object from Molfile (has newlines)
    if "\n" in comp_structure:
        mol = AllChem.MolFromMolBlock(str(comp_structure))
    # Create Mol object from SMILES string (does not have newlines)
    else:
        mol = AllChem.MolFromSmiles(str(comp_structure))
    if not mol:
        raise ValueError("Unable to parse comp_structure")

    # Based on fingerprint type specified by user, compute the fingerprint
    # and keep the set of all indices whose bit is on.
    if fp_type == 'MACCS':
        query_fp = set(AllChem.GetMACCSKeysFingerprint(mol).GetOnBits())
    elif fp_type == 'RDKit':
        query_fp = set(AllChem.RDKFingerprint(mol).GetOnBits())
    else:
        raise ValueError("Invalid FP_type")

    len_fp = len(query_fp)

    # Request the fingerprint field through a private copy of the projection.
    # Editing search_projection in place would alter the caller's dict (or
    # the shared default) and leave it altered if the query raised.
    projection = dict(search_projection)
    projection[fp_type] = 1

    # Pre-filter on stored fingerprint size: only documents whose
    # "len_<fp_type>" lies in [min_tc * len_fp, len_fp / min_tc] are fetched.
    # The upper bound is skipped for min_tc <= 0, where it would divide by
    # zero (or become negative).
    len_field = "len_" + fp_type
    size_bounds = [{len_field: {"$gte": min_tc * len_fp}}]
    if min_tc > 0:
        size_bounds.append({len_field: {"$lte": len_fp / min_tc}})

    for x in db.compounds.find({"$and": size_bounds}, projection):
        # Put fingerprint in set for fast intersection (&) and union (|)
        # calculations
        test_fp = set(x[fp_type])
        union_size = len(query_fp | test_fp)
        # Calculate tanimoto coefficient. Two empty fingerprints are treated
        # as identical (1.0) rather than dividing by zero.
        if union_size:
            tmc = len(query_fp & test_fp) / float(union_size)
        else:
            tmc = 1.0
        # If a sufficient tanimoto coefficient is calculated, append the
        # compound to the search results (until the limit is reached)
        if tmc >= min_tc:
            # The fingerprint is stripped from every returned document.
            del x[fp_type]
            similarity_search_results.append(x)
            if len(similarity_search_results) == limit:
                break

    # Compare model_db against None instead of testing its truth value:
    # PyMongo 4+ Database objects raise NotImplementedError from bool().
    if parent_filter and model_db is not None:
        similarity_search_results = score_compounds(model_db,
                                                    similarity_search_results,
                                                    parent_filter)

    return similarity_search_results