def test_extract_protein_interactions_kgml(self, kgml_file, expected_no_rel):
    """Verify the number of interactions extracted from a KGML fixture.

    The KEGG and UniProt services attached to the extractor have their
    lookup methods replaced with ``MagicMock`` objects returning canned
    values.

    :param kgml_file: fixture path, relative to the directory of this test file
    :param expected_no_rel: expected length of the extraction result
    """
    # Arrange
    extractor = KeggProteinInteractionsExtractor()

    here = os.path.dirname(os.path.realpath(__file__))
    fixture_path = os.path.join(here, kgml_file)
    with open(fixture_path, 'r') as fixture:
        kgml_string = fixture.read()

    # KEGG stand-in
    kegg_stub = KEGG()
    extractor.kegg = kegg_stub
    # Whatever is asked, answer with the same ko -> hsa links
    ko_to_hsa = ("ko:K00922 hsa:5293\n"
                 + "ko:K00922 hsa:5291\n"
                 + "ko:K02649 hsa:5295")
    kegg_stub.link = MagicMock(return_value=ko_to_hsa)
    # Whatever is asked, answer with the same hsa -> uniprot conversion
    kegg_stub.conv = MagicMock(return_value={"hsa:5293": "up:B0LPE5"})

    # UniProt stand-in
    uniprot_stub = UniProt()
    extractor.uniprot = uniprot_stub
    uniprot_stub.mapping = MagicMock(
        return_value={"B0LPE5": ["gene1", "gene2"]})

    # Act
    relations = extractor.extract_protein_interactions_kgml(kgml_string)

    # Assert
    self.assertEqual(expected_no_rel, len(relations))
def kegg_to_uniprot(fr='hsa', cache=False):
    """Download a `KEGG` -> `UniProt` mapping (both `TrEMBL` and `SwissProt`).

    Parameters
    ----------
    fr : str, optional, default: 'hsa'
        KEGG database identifier to convert.
    cache : bool, optional, default: False
        Passed to the `bioservices` KEGG client. Caching can save time, but
        an old cache will eventually miss new database releases.

    Returns
    -------
    `dict`
        Mapping from `KEGG` identifiers to a list of `UniProt` accessions.
    """
    service = KEGG(cache=cache)
    raw_pairs = service.conv(fr, 'uniprot')

    accessions_by_kegg_id = {}
    for prefixed_accession, kegg_id in raw_pairs.items():
        # Keep only the part after the database prefix ('up:')
        accession = prefixed_accession.split(':')[1]
        accessions_by_kegg_id.setdefault(kegg_id, []).append(accession)
    return accessions_by_kegg_id
def tcell_read_metabolomics_data():
    """Fetch the published T-cell metabolomics spreadsheet and return it as a DataFrame.

    Steps:
      1. obtain the workbook through ``cache.UrlFileCache``, stored under the
         cache directory;
      2. read the "normalized by sample mean" sheet;
      3. merge every column named ``<name>.1`` into ``<name>`` using a
         row-wise geometric mean, then drop the ``.1`` column;
      4. apply ``np.exp2`` (the sheet holds log2 values, per the original
         author's note);
      5. expand each KEGG_ID group with ``one_row_per_compound_convert`` and
         index the result by its ``MetaboliteID`` column.
    """
    workbook_path = os.path.join(cache.get_cache_path(),
                                 metabolite_expression_name + ".xlsx")
    workbook = cache.UrlFileCache(workbook_path, metabolomics_data_url)

    table = pd.read_excel(
        workbook.get_file_name(),
        sheet_name="normalized by sample mean",
        index_col=0,
        usecols="A,C:HN",
        skiprows=[0],
    )

    # Technical replicates carry a trailing ".1"; fold each into its base column.
    for replicate in table.columns:
        name_parts = replicate.split('.')
        if len(name_parts) > 1 and name_parts[1] == "1":
            base = name_parts[0]
            table[base] = scipy.stats.gmean(table[[base, replicate]], axis=1)
            table.drop(columns=replicate, inplace=True)

    table.index.name = "KEGG_ID"
    # Leave log2 space
    table = table.apply(np.exp2)

    kegg_service = KEGG(verbose=False)
    map_kegg_chebi = kegg_service.conv("chebi", "compound")

    def _convert_group(group):
        return one_row_per_compound_convert(group, map_kegg_chebi)

    per_compound = table.groupby("KEGG_ID", group_keys=False).apply(_convert_group)
    per_compound = per_compound.reset_index(drop=True)
    return per_compound.set_index("MetaboliteID")
it. '''
import os
import click
import json
import requests
import time
import xmltodict
import bioservices
from bioservices import KEGG, ChEBI
from zeep import Client
from tqdm import tqdm

# Everything below runs at import time: the service clients and the
# conversion table are built once and shared as module-level globals.
k = KEGG(verbose=False)
# NOTE(review): with bioservices' conv(target, source) argument order this
# should be a KEGG compound -> ChEBI id table; confirm against the docs.
map_kegg_chebi = k.conv("chebi", "compound")
c = ChEBI(verbose=False)
# SOAP clients built from the ChEBI and ChemSpider WSDL descriptions.
chebi_client = Client(
    "https://www.ebi.ac.uk/webservices/chebi/2.0/webservice?wsdl")
chemspider_client = Client("https://www.chemspider.com/InChI.asmx?WSDL")

# For compounds that can't be found at all.
not_founds = []

# Need to create a global dictionary for these annotations, as I don't
# want to take the piss with the web services these wonderful people
# provide to us free of charge.
# (The `global` statement has no effect at module scope; the assignment on
# the next line is what creates the module-level name.)
global CONVERTED_COMPOUNDS
CONVERTED_COMPOUNDS = {}
uniprot = UniProt(cache=True) # ---- Set-up QuickGO bioservice quickgo = QuickGO(cache=True) # ---- Set-up KEGG bioservice kegg, kegg_parser = KEGG(cache=True), KEGGParser() kegg.organism = 'mmu' print '[INFO] KEGG service configured' kegg_pathways = {p: kegg.parse_kgml_pathway(p) for p in kegg.pathwayIds} print '[INFO] KEGG pathways extracted: ', len(kegg_pathways) # Convert KEGG pathways Gene Name to UniProt k2u = kegg.conv('uniprot', 'mmu') kegg_pathways_proteins = {p: {k2u[x].split(':')[1] for i in kegg_pathways[p]['entries'] if i['type'] == 'gene' for x in i['name'].split(' ') if x in k2u} for p in kegg_pathways} kegg_uniprot_acc_map = {x for p in kegg_pathways_proteins for x in kegg_pathways_proteins[p]} kegg_uniprot_acc_map = {p: uniprot.get_fasta(str(p)).split(' ')[0].split('|')[2] for p in kegg_uniprot_acc_map} kegg_pathways_proteins = {p: {kegg_uniprot_acc_map[i] for i in kegg_pathways_proteins[p]} for p in kegg_pathways_proteins} print '[INFO] KEGG pathways Ids converted to UniProt: ', len(kegg_pathways_proteins) # ---- Set-up GO Terms gene list go_terms_file = '%s/files/go_terms_uniprot.pickle' % wd if os.path.isfile(go_terms_file): with open(go_terms_file, 'rb') as handle: go_terms = pickle.load(handle)
# Append the 'sita' identifier to each ab_dict entry matched by an
# ecorToSita line, tracking which keys were matched and which were absent.
# NOTE(review): the nesting below was reconstructed from whitespace-collapsed
# source (the `else` is paired with the outer `if`, the counters sit inside
# the inner `if`) -- confirm against the original file.
found = 0
found_list = []
missing = []
for line in ecorToSita:
    # Each line must contain exactly one space, otherwise unpacking fails.
    ecor, sita = line.split(' ')
    if ecor in ab_dict:
        # Only extend entries that still have exactly two items.
        if len(ab_dict[ecor]) == 2:
            ab_dict[ecor].append(sita)
            found += 1
            found_list.append(ecor)
    else:
        missing.append(ecor)

# load kegg module
from bioservices import KEGG
s = KEGG()
convDb = s.conv('sita', 'ncbi-proteinid')
# Bare lookup whose result is discarded (looks like a leftover interactive
# check); it raises KeyError if the key is absent.
convDb['ncbi-proteinid:YP_008815800']

# annotate kegg module
annotated = []
no_joy_for_sita = []
for gene in found_list:
    # The last item of the entry is the value appended in the loop above.
    sita = ab_dict[gene][-1]
    # The last two characters are dropped before the lookup -- presumably a
    # version suffix such as '.1'; TODO confirm.
    sita_q = 'ncbi-proteinid:{g}'.format(g=sita[:-2])
    if sita_q in convDb:
        ab_dict[gene].append(convDb[sita_q])
        annotated.append(gene)
    else:
        no_joy_for_sita.append(sita)

counter = 0