# Example no. 1
    def test_extract_protein_interactions_kgml(self, kgml_file,
                                               expected_no_rel):
        """Check how many interactions are extracted from a KGML document.

        The KEGG and UniProt clients attached to the extractor have their
        lookup methods replaced by ``MagicMock`` objects returning canned
        values, so the outcome depends only on the KGML fixture.

        NOTE(review): the end of the signature, the ``open`` call, the
        ``read`` of the fixture and the target of the first KEGG mock were
        truncated in this copy of the source. They were reconstructed from
        the surrounding code; confirm against the original test module.

        :param kgml_file: path of the KGML fixture; assumed to be relative
            to the directory of this test module -- TODO confirm.
        :param expected_no_rel: number of interactions the extractor is
            expected to return for that fixture.
        """
        # Local import so the block does not depend on a module-level one.
        import os

        # Arrange
        sut = KeggProteinInteractionsExtractor()
        kgml_path = os.path.join(os.path.dirname(__file__), kgml_file)
        with open(kgml_path, 'r') as myfile:
            kgml_string = myfile.read()

        # Mock Kegg ops
        mock_kegg = KEGG()
        sut.kegg = mock_kegg

        # No matter what the input is, return the ko numbers that map to
        # hsa numbers (tab-separated pairs, one per line).
        # NOTE(review): ``link`` is assumed to be the mocked method, since
        # the canned value is raw tab-separated text -- TODO confirm.
        mock_kegg.link = MagicMock(return_value="ko:K00922\thsa:5293\n" +
                                   "ko:K00922\thsa:5291\n" +
                                   "ko:K02649\thsa:5295")

        # No matter what the input is, return the hsa numbers that map to
        # uniprot numbers
        mock_kegg.conv = MagicMock(return_value={"hsa:5293": "up:B0LPE5"})

        # Mock Uni Prot
        mock_uniprot = UniProt()
        sut.uniprot = mock_uniprot
        mock_uniprot.mapping = MagicMock(
            return_value={"B0LPE5": ["gene1", "gene2"]})

        # Act
        actual = sut.extract_protein_interactions_kgml(kgml_string)

        # Assert
        self.assertEqual(expected_no_rel, len(actual))
# Example no. 2
def kegg_to_uniprot(fr='hsa', cache=False):
    """Downloads a mapping from a `KEGG` database to `UniProt`, including
    both `TrEMBL` and `SwissProt`.

    Parameters
    ----------
    fr : str, optional, default: 'hsa'
        KEGG database identifier to convert. Defaults to 'hsa'.

    cache : bool, optional, default: False
        If True, results are cached by `bioservices`. This can save
        time but you will eventually miss out on new database releases if
        your cache is old.

    Returns
    -------
    dict
        Mapping from `KEGG` identifiers to a list of `UniProt` accessions.
    """
    kegg = KEGG(cache=cache)
    mapping = kegg.conv(fr, 'uniprot')

    # The service answers with UniProt ids as keys and KEGG ids as values;
    # invert it, collecting every accession that points at the same KEGG id.
    parsed_mapping = {}
    for upid, org in mapping.items():
        accession = upid.split(':')[1]  # remove the 'up:' prefix
        # setdefault appends to the existing list instead of replacing it,
        # so a KEGG id keeps all of its accessions rather than only the last.
        parsed_mapping.setdefault(org, []).append(accession)
    return parsed_mapping
def tcell_read_metabolomics_data():
    """Load the T-cell metabolomics measurements as a DataFrame.

    Steps, in order:

    1. Fetch the publication's Excel workbook through ``cache.UrlFileCache``
       (stored under the cache path as ``<metabolite_expression_name>.xlsx``).
    2. Read the sheet "normalized by sample mean", using the first column
       as index.
    3. Merge each technical replicate (column named ``<sample>.1``) into
       its ``<sample>`` column with a row-wise geometric mean and drop the
       replicate column.
    4. Convert the values from log2 space back to linear space.
    5. Group the rows by "KEGG_ID" and convert each group with
       ``one_row_per_compound_convert`` and the KEGG compound -> ChEBI
       table.

    Returns
    -------
    pandas.DataFrame
        The converted table, indexed by its "MetaboliteID" column
        (presumably ChEBI identifiers, produced by
        ``one_row_per_compound_convert`` -- confirm against that helper).
    """
    xlsx_path = os.path.join(cache.get_cache_path(),
                             metabolite_expression_name + ".xlsx")
    tcell_metabol_xls = cache.UrlFileCache(xlsx_path, metabolomics_data_url)
    metabolomics_df = pd.read_excel(
        tcell_metabol_xls.get_file_name(),
        sheet_name="normalized by sample mean",
        index_col=0,
        usecols="A,C:HN",
        skiprows=[0],
    )

    # Iterate over a snapshot of the column labels: columns are dropped
    # from the frame inside the loop.
    for col in list(metabolomics_df.columns):
        # Average all technical replicates (Named by trailing ".1")
        name_parts = col.split('.')
        if len(name_parts) > 1 and name_parts[1] == "1":
            remcol = name_parts[0]
            metabolomics_df[remcol] = scipy.stats.gmean(
                metabolomics_df[[remcol, col]], axis=1)
            metabolomics_df.drop(col, axis=1, inplace=True)

    # NOTE(review): the source had this assignment fused onto the ``drop``
    # line (a SyntaxError). Naming the index is the reconstruction that
    # makes the groupby("KEGG_ID") below resolvable -- confirm.
    metabolomics_df.index.name = "KEGG_ID"

    # The excel data is in log2 space, return it to normal
    metabolomics_df = metabolomics_df.apply(np.exp2)

    k = KEGG(verbose=False)
    map_kegg_chebi = k.conv("chebi", "compound")
    metabolomics_df = (
        metabolomics_df
        .groupby("KEGG_ID", group_keys=False)
        .apply(lambda x: one_row_per_compound_convert(x, map_kegg_chebi))
        .reset_index(drop=True)
    )
    metabolomics_df.set_index("MetaboliteID", inplace=True)
    return metabolomics_df
# Example no. 4

import os
import click
import json
import requests
import time
import xmltodict
import bioservices
from bioservices import KEGG, ChEBI
from zeep import Client
from tqdm import tqdm

# Module-level service clients, created once at import time.
# `map_kegg_chebi` is the conversion table returned by KEGG `conv` for
# the "compound" database against "chebi".
k = KEGG(verbose=False)
map_kegg_chebi = k.conv("chebi", "compound")
c = ChEBI(verbose=False)

# NOTE(review): the argument of the first `Client(` call is missing in this
# copy of the source and the call is never closed; the second call is given
# an empty string. Presumably both took a WSDL URL (zeep `Client`) -- restore
# them before running.
chebi_client = Client(
chemspider_client = Client("")

# For compounds that can't be found at all.
not_founds = []

# Need to create a global dictionary for these annotations, as I don't
# want to abuse the web services these wonderful people
# provide to us free of charge.

# ---- Set-up UniProt bioservice
# NOTE(review): this section uses Python 2 `print` statements.
uniprot = UniProt(cache=True)

# ---- Set-up QuickGO bioservice
quickgo = QuickGO(cache=True)

# ---- Set-up KEGG bioservice
kegg, kegg_parser = KEGG(cache=True), KEGGParser()

# Select the organism ('mmu') before `kegg.pathwayIds` is read below.
kegg.organism = 'mmu'
print '[INFO] KEGG service configured'

# Parse the KGML description of every pathway id, keyed by pathway id.
kegg_pathways = {p: kegg.parse_kgml_pathway(p) for p in kegg.pathwayIds}
print '[INFO] KEGG pathways extracted: ', len(kegg_pathways)

# Convert KEGG pathways Gene Name to UniProt
# `k2u` is looked up by KEGG gene name; its values are split on ':' and the
# second part is kept.
k2u = kegg.conv('uniprot', 'mmu')

# Per pathway: the set of UniProt ids of its entries of type 'gene'.
# An entry's 'name' may hold several space-separated gene names; names that
# are absent from `k2u` are skipped.
kegg_pathways_proteins = {p: {k2u[x].split(':')[1] for i in kegg_pathways[p]['entries'] if i['type'] == 'gene' for x in i['name'].split(' ') if x in k2u} for p in kegg_pathways}

# First a flat set of every UniProt id seen in any pathway ...
kegg_uniprot_acc_map = {x for p in kegg_pathways_proteins for x in kegg_pathways_proteins[p]}
# ... then rebound to a dict: id -> third '|'-separated field of the first
# space-separated token of its FASTA text (presumably the UniProt entry
# name -- TODO confirm). One `get_fasta` call is made per id.
kegg_uniprot_acc_map = {p: uniprot.get_fasta(str(p)).split(' ')[0].split('|')[2] for p in kegg_uniprot_acc_map}

# Replace the ids in each pathway set by the values looked up above.
kegg_pathways_proteins = {p: {kegg_uniprot_acc_map[i] for i in kegg_pathways_proteins[p]} for p in kegg_pathways_proteins}
print '[INFO] KEGG pathways Ids converted to UniProt: ', len(kegg_pathways_proteins)

# ---- Set-up GO Terms gene list
# Pickle file under the `wd` directory (`wd` is defined outside this view).
go_terms_file = '%s/files/go_terms_uniprot.pickle' % wd

# Load `go_terms` from the pickle when the file exists.
# NOTE(review): no branch for a missing file is visible here, so `go_terms`
# stays undefined in that case unless it is handled further down.
# NOTE: `pickle.load` can execute arbitrary code; only load files this
# project wrote itself.
if os.path.isfile(go_terms_file):
    with open(go_terms_file, 'rb') as handle:
        go_terms = pickle.load(handle)
# Example no. 6
# Count the `ecor` ids from `ecorToSita` that have exactly two entries in
# `ab_dict` (both names are defined outside this view).
# NOTE(review): `found_list` and `missing` are initialised but never filled
# in the visible code, although `found_list` is iterated further down --
# lines appear to be missing from this copy of the source.
found = 0
found_list = []
missing = []
for line in ecorToSita:
    # Each line must hold exactly two space-separated fields, otherwise the
    # unpacking raises ValueError.
    ecor, sita = line.split(' ')
    if ecor in ab_dict:
        if len(ab_dict[ecor]) == 2:
            found += 1
# load kegg module
from bioservices import KEGG
s = KEGG()
# Conversion table from KEGG `conv` for 'sita' against 'ncbi-proteinid';
# it is queried below with 'ncbi-proteinid:<id>' keys.
convDb = s.conv('sita', 'ncbi-proteinid')

# annotate kegg module
annotated = []
no_joy_for_sita = []
for gene in found_list:
    # Last entry stored for this gene in `ab_dict`.
    sita = ab_dict[gene][-1]
    # Drop the last two characters (presumably a version suffix such as
    # ".1" -- TODO confirm) and build the lookup key.
    sita_q = 'ncbi-proteinid:{g}'.format(g=sita[:-2])
    # NOTE(review): the body of this `if` is missing in this copy of the
    # source.
    if sita_q in convDb:
counter = 0