Beispiel #1
0
def get_significant_metab_chebis(assay_file, input_data_dir):
    """
    Return ChEBI IDs and unmatched names for significantly changed metabolites.

    Reads the tab-separated assay results at ``input_data_dir + assay_file``, keeps
    the rows whose ``ttest_p`` value is below 0.05 (per the original documentation,
    a t-test between sensitive and resistant strains) and runs an exact ChEBI search
    on each remaining row label.

    Args:
        assay_file: file name of the assay results table, i.e. 'assay_results.tsv'.
        input_data_dir: directory prefix. It is joined by plain string concatenation,
            so it must end with a path separator. The directory must also contain
            'user_added_chebis_SIGNIF_ONLY.txt' with one ChEBI ID per line.

    Returns:
        A tuple ``(signif_chebi_ids, signif_no_chebi_ids)``:
            signif_chebi_ids: the manually curated IDs followed by 'CHEBI:<id>' of
                the first exact search hit for every significant metabolite.
            signif_no_chebi_ids: names of significant metabolites with no exact hit.
    """
    # i.e. 'assay_results.tsv'
    assay_results_path = input_data_dir + assay_file
    assay_results = pd.read_csv(assay_results_path, sep='\t', index_col=0, header=0)
    # pandas labels a column with a blank header 'Unnamed: N'; give it a usable name
    assay_results.rename(columns={'Unnamed: 13': 'ttest_p'}, inplace=True)
    signif = assay_results[assay_results['ttest_p'] < 0.05]
    signif_metabs = signif.index.tolist()

    # see chebi_id_metadata for explanations of the pre-specified signif chebi ids
    # The context manager closes the file even if reading fails; blank lines are
    # skipped so that a trailing newline does not produce an empty ID.
    with open(input_data_dir + 'user_added_chebis_SIGNIF_ONLY.txt', 'r') as curated_fh:
        signif_chebi_ids = [line.strip() for line in curated_fh if line.strip()]

    signif_no_chebi_ids = []
    for name in signif_metabs:
        hits = lc.search(name, exact=True)
        if hits:
            # only the first exact hit is recorded
            signif_chebi_ids.append('CHEBI:' + str(hits[0]._ChebiEntity__chebi_id))
        else:
            signif_no_chebi_ids.append(name)

    return signif_chebi_ids, signif_no_chebi_ids
Beispiel #2
0
def get_chebi_ids(dataframe, number_of_items):
    """Fill in InChI and ChEBI ID columns from chemical names using libchebi.

    For every row label from 0 up to (but excluding) ``number_of_items`` the value in
    the 'chemical_name' column is looked up with an exact libchebipy search. The first
    hit supplies the 'inchi' and 'chebi_identifier' cells; without a hit both cells
    are set to the empty string.

    Args:
        dataframe: data frame holding a 'chemical_name' column; modified in place.
        number_of_items: how many rows (labels 0..number_of_items-1) to process.

    Returns:
        The same ``dataframe`` object that was passed in.
    """
    for row in range(number_of_items):
        matches = libchebipy.search(dataframe.loc[row, 'chemical_name'], True)
        # keep only the first exact hit, if there is one
        best = matches[0] if len(matches) > 0 else None
        dataframe.loc[row, 'inchi'] = '' if best is None else best.get_inchi()
        dataframe.loc[row, 'chebi_identifier'] = '' if best is None else best.get_id()

    return dataframe
Beispiel #3
0
def get_chebi_matches(metabs, exact=True):
    """Queries ChEBI API for exact or approximate ChEBI ID matches to provided list
    of metabolites. Note: capitalization is irrelevant, even if exact==True.

    Set exact to False if more inclusive search is desired.
        Uses ChEBI API's non-exact search. Not perfect.
        i.e. includes residues and derivatives related to compound
            i.e. methionine sulfone (CHEBI:132188) and methionine sulfoximine (CHEBI:47833)
            will be returned if queried with methionine (CHEBI:16811)

    Args
        metabs (iterable of str): metabolite names to look up; repeated names are
            tolerated
        exact (bool): passed through to the ChEBI search

    Returns
        metab_map (dict):
            key: user-provided metab name
            value: set (of any size) of derived 'CHEBI:<id>' strings, merged with
            whatever mapping search_exact_matches_compoundstsv returns for the
            names ChEBI could not resolve
        failed_to_map:
            the second value returned by search_exact_matches_compoundstsv when
            given the set of names with no ChEBI hit (presumably the names that
            are still unmatched; verify against that helper)
    """
    metab_map = {m: set() for m in metabs}
    failed_to_map = set()

    print("Communicating with ChEBI API\n" "This will take a few moments")
    for name in metabs:
        # todo: follow up with Neil re: incompatibility with python3 :(
        # todo: try chebi_obj.getId() and getName() when up and running
        hits = lc.search(name, exact=exact)
        if hits:
            metab_map[name].update(
                'CHEBI:' + str(hit._ChebiEntity__chebi_id) for hit in hits)
        else:
            # pop() instead of del: a name listed twice in metabs has already been
            # removed on its first pass and must not raise KeyError on the second
            metab_map.pop(name, None)
            failed_to_map.add(name)

    # scavenge for any left-behind metabs for which we have ChEBI matches in the
    # PathwayCommons compounds.tsv
    addl_chebis, failed_to_map = search_exact_matches_compoundstsv(
        failed_to_map)
    # this kind of dictionary unpacking to combine the 2 only works in python3 :(
    metab_map = {**metab_map, **addl_chebis}

    return metab_map, failed_to_map
Beispiel #4
0
def search_entities(search_term: Union[int, str],
                    exact_search: bool = DEFAULT_EXACT_SEARCH,
                    exact_match: bool = DEFAULT_EXACT_MATCH) \
        -> List[ChebiEntity]:
    """
    Look up ChEBI entities by name or by number.

    Matching is case-insensitive.

    Args:
        search_term:
            Text or integer to look for.
        exact_search:
            Forwarded as the ``exact`` parameter of :func:`libchebipy.search`.
        exact_match:
            When true, keep only results that correspond exactly to the search
            term: for an integer term the entity's ChEBI ID number must equal
            it, for a string term the entity name must equal it ignoring case.
            Example: an exact search for "zopiclone" gives both "zopiclone
            (CHEBI:32315)" and "(5R)-zopiclone (CHEBI:53762)"; this option
            filters to the first.

    Returns:
        The (possibly filtered) list of matching entities.
    """
    log.debug(f"Searching for {search_term!r} "
              f"(exact_search={exact_search}, exact_match={exact_match})")
    results = search(search_term, exact=exact_search)
    log.debug(f"libchebipy.search({search_term!r}, exact={exact_search}) "
              f"-> {results!r}")
    if exact_match:
        if isinstance(search_term, int):
            def is_wanted(entity):
                # numeric terms are compared against the ChEBI ID number
                return get_chebi_id_number(entity) == search_term
        else:
            assert isinstance(search_term, str)
            wanted_name = search_term.lower()

            def is_wanted(entity):
                # textual terms are compared against the entity name, ignoring case
                return entity.get_name().lower() == wanted_name
        results = list(filter(is_wanted, results))
    log.debug(f"search_entities({search_term!r}, exact_search={exact_search}, "
              f"exact_match={exact_match}) -> {results!r}")
    return results
Beispiel #5
0
def _write_two_column_tsv(path, mapping):
    """Write each key/value pair of ``mapping`` to ``path`` as 'key<TAB>value'."""
    with open(path, "w") as outf:
        for k, v in mapping.items():
            outf.write(str(k) + '\t' + str(v) + '\n')


def map_all_metabs_to_chebi_ids(metabs, signif_no_chebi_ids, output_data_dir):
    """
    Maps a list of metabolites to their ChEBI IDs, if available.
    See: https://github.com/libChEBI/libChEBIpy/blob/master/libchebipy/_chebi_entity.py

    Args:
        metabs: iterable of metabolite names to look up.
        signif_no_chebi_ids: collection of significant metabolite names that had no
            exact ChEBI match; membership decides what goes into
            signif_close_matches.tsv.
        output_data_dir: directory prefix for the output files. It is joined by plain
            string concatenation, so it must end with a path separator.

    Writes:
        exact_matches.tsv:
            2 col .tsv with provided metabolite name and matched CHEBI ID
                i.e.
                    malate	CHEBI:25115

        all_close_matches.tsv:
            2 col .tsv with provided metabolite name and dictionaries of
            possible matching CHEBI names and IDs for all metabolites without exact matches
                i.e.
                    GSH	[{'Gsh-prostaglandin A1': 'CHEBI:5548'}, {'S-Decyl GSH': 'CHEBI:8955'}]

        signif_close_matches.tsv: 2 col .tsv
            same format as all_close_matches.tsv but includes only significant metabolites
            with no exact CHEBI id matches. A metabolite whose non-exact search
            returns nothing gets no row here (it still appears in
            all_close_matches.tsv with an empty list).

    Returns:
        Path of the signif_close_matches.tsv file that was written.
    """
    names_map = {}
    no_exact_match = {}
    signif_no_exact_match = {}
    for name in metabs:
        exact_hits = lc.search(name, exact=True)
        if exact_hits:
            # only the first exact hit is recorded
            names_map[name] = 'CHEBI:' + str(exact_hits[0]._ChebiEntity__chebi_id)
            # TODO: search used-to-produce sif for exact_hits[0]._ChebiEntity__chebi_id
            continue

        # if an exact match is not possible, don't use exact match
        close_matches = []
        for hit in lc.search(name):
            # one {alternative_name: alternative_id} dictionary per close match
            alt_id = str(hit._ChebiEntity__chebi_id)
            chebi_entity = lc.ChebiEntity(alt_id)
            close_matches.append({chebi_entity.get_name(): 'CHEBI:' + alt_id})
        no_exact_match[name] = close_matches

        # Built once per metabolite from the list above rather than being
        # re-queried for every close match. The `close_matches` guard keeps the
        # existing output: no row when the non-exact search found nothing.
        if close_matches and name in signif_no_chebi_ids:
            signif_no_exact_match[name] = list(close_matches)

    _write_two_column_tsv(output_data_dir + "exact_matches.tsv", names_map)
    _write_two_column_tsv(output_data_dir + "all_close_matches.tsv", no_exact_match)

    signif_close_matches_path = output_data_dir + "signif_close_matches.tsv"
    _write_two_column_tsv(signif_close_matches_path, signif_no_exact_match)

    return signif_close_matches_path
Beispiel #6
0
 def test_search_hexenal_inexact(self):
     '''Inexact search for (E)-2-Hexenal must include CHEBI:28913.'''
     hits = libchebipy.search('(E)-2-Hexenal', False)
     expected = ChebiEntity('CHEBI:28913')
     self.assertIn(expected, hits)
Beispiel #7
0
 def test_search_aspirin(self):
     '''Inexact search for aspirin must include CHEBI:15365.'''
     hits = libchebipy.search('aspirin', False)
     expected = ChebiEntity('CHEBI:15365')
     self.assertIn(expected, hits)
Beispiel #8
0
 def test_search(self):
     '''Checks result counts of exact and inexact searches.'''
     # the exact search for glucose is expected to give exactly two entities
     exact_glucose = libchebipy.search('glucose', True)
     self.assertEqual(len(exact_glucose), 2)
     # inexact searches are expected to give more than one entity
     for term in ('glucose', 'aspartate'):
         self.assertGreater(len(libchebipy.search(term, False)), 1)
Beispiel #9
0
def get_chemicals(term):
    '''Return the ChEBI search hits for term as a JSON array string.

    Each array element is an object holding the hit's 'id' and 'name'.
    '''
    records = []
    for hit in libchebipy.search(term):
        records.append({'id': hit.get_id(), 'name': hit.get_name()})
    return json.dumps(records)
Beispiel #10
0
                           "Unnamed: 19": "sem_8"}, inplace=True)

# Add two empty placeholder columns, at positions 1 and 2, that will receive the
# chemical information descriptors
data_slice.insert(loc=1, column='inchi', value='')
data_slice.insert(loc=2, column='chebi_identifier', value='')

# Renumber the rows so that labels start at 0 (not 16), which the loop below relies on
data_slice = data_slice.reset_index(drop=True)

# Use LibChebi to fetch the ChEBI identifier and InChI for each chemical name.
# Note: only exact matches on the chemical name are requested here. The libchebi API
# does not allow easy searching on synonyms, so some relevant information is missed.
# This is merely to showcase how to use libchebi

for i in range(60):
    hit = libchebipy.search(data_slice.loc[i, 'chemical_name'], True)
    # take the descriptors from the first hit, or leave the cells empty without one
    data_slice.loc[i, 'inchi'] = hit[0].get_inchi() if len(hit) > 0 else ''
    data_slice.loc[i, 'chebi_identifier'] = hit[0].get_id() if len(hit) > 0 else ''

#  Here, we drop the first row
# data_slice.drop([0], inplace=True)

# We may wish to print intermediate results:
# data_slice.to_csv("slice.txt", sep='\t', encoding='utf-8', index=False)
Beispiel #11
0
 def test_search_hexenal_inexact(self):
     '''Inexact search for (E)-2-Hexenal must include CHEBI:28913.'''
     found = libchebipy.search('(E)-2-Hexenal', False)
     hexenal = ChebiEntity('CHEBI:28913')
     self.assertIn(hexenal, found)
Beispiel #12
0
 def test_search_hexenal_exact(self):
     '''Exact search for (E)-2-Hexenal must return CHEBI:28913 first.'''
     results = libchebipy.search('(E)-2-Hexenal', True)
     # assertEqual, not assertTrue: assertTrue's second argument is only the
     # failure message, so the previous check could never fail.
     self.assertEqual(ChebiEntity('CHEBI:28913'),
                      results[0] if results else None)
Beispiel #13
0
 def test_search_aspirin(self):
     '''Inexact search for aspirin must include CHEBI:15365.'''
     found = libchebipy.search('aspirin', False)
     aspirin = ChebiEntity('CHEBI:15365')
     self.assertIn(aspirin, found)
Beispiel #14
0
 def test_search(self):
     '''Checks result counts of exact and inexact searches.'''
     # exact glucose search: exactly two entities expected
     glucose_exact = libchebipy.search('glucose', True)
     self.assertEqual(len(glucose_exact), 2)
     # inexact searches: more than one entity expected for each term
     for query in ('glucose', 'aspartate'):
         self.assertGreater(len(libchebipy.search(query, False)), 1)