예제 #1
0
# Load the tab-separated STRING network table stored under the working
# directory `wd` (defined outside this excerpt).
network = read_csv('%s/files/string_mouse_network_filtered_800.txt' % wd, sep='\t')
# Identifiers that occur in BOTH the 'protein1' and the 'protein2' column.
# NOTE(review): intersection drops proteins seen in only one column --
# confirm that a union was not intended.
network_proteins = set(network['protein1']).intersection(network['protein2'])

# ---- Set-up UniProt
uniprot = UniProt(cache=True)

# ---- Set-up QuickGO bioservice
quickgo = QuickGO(cache=True)

# ---- Set-up KEGG bioservice
# NOTE(review): kegg_parser is not used anywhere in the visible part of
# this script.
kegg, kegg_parser = KEGG(cache=True), KEGGParser()

# Select organism 'mmu'; presumably this scopes kegg.pathwayIds used below
# to that organism -- confirm against the bioservices documentation.
kegg.organism = 'mmu'
print '[INFO] KEGG service configured'

# Parse every pathway id exposed by the service:
# {pathway id -> result of kegg.parse_kgml_pathway}. One call per pathway.
kegg_pathways = {p: kegg.parse_kgml_pathway(p) for p in kegg.pathwayIds}
print '[INFO] KEGG pathways extracted: ', len(kegg_pathways)

# Convert KEGG pathways Gene Name to UniProt
# k2u is looked up by KEGG identifier below; each value is split on ':' and
# only the part after the colon is kept.
k2u = kegg.conv('uniprot', 'mmu')

# For each pathway, collect the converted identifiers of all entries whose
# 'type' is 'gene'. An entry 'name' may hold several space-separated
# identifiers; identifiers missing from k2u are skipped.
kegg_pathways_proteins = {p: {k2u[x].split(':')[1] for i in kegg_pathways[p]['entries'] if i['type'] == 'gene' for x in i['name'].split(' ') if x in k2u} for p in kegg_pathways}

# Step 1: gather every distinct identifier used by any pathway (a set).
kegg_uniprot_acc_map = {x for p in kegg_pathways_proteins for x in kegg_pathways_proteins[p]}
# Step 2: rebind the same name to a dict mapping each identifier to the third
# '|'-separated field of the first space-delimited token of its FASTA text.
# One uniprot.get_fasta call is made per identifier.
# NOTE(review): presumably that field is the UniProt entry name of a
# '>db|accession|name ...' header -- confirm.
kegg_uniprot_acc_map = {p: uniprot.get_fasta(str(p)).split(' ')[0].split('|')[2] for p in kegg_uniprot_acc_map}

# Re-express each pathway's protein set using the values looked up in step 2.
kegg_pathways_proteins = {p: {kegg_uniprot_acc_map[i] for i in kegg_pathways_proteins[p]} for p in kegg_pathways_proteins}
print '[INFO] KEGG pathways Ids converted to UniProt: ', len(kegg_pathways_proteins)

# ---- Set-up GO Terms gene list
# Only the path of the pickle file is built here; nothing is read yet.
go_terms_file = '%s/files/go_terms_uniprot.pickle' % wd
예제 #2
0
def pathway_to_dataframe(pathway_id, org='hsa', verbose=False, cache=False):
    """
    Extract protein-protein interactions from a KEGG pathway into
    a pandas DataFrame. NOTE: Interactions will be directionless.

    Every relation of the parsed pathway is expanded into the cross
    product of the identifiers named by its two entries; a pair is kept
    only when the relation's link type is in `links_to_include`, both
    entry types are in `types_to_include`, and both identifiers pass the
    organism/mapping filter described under `org`.

    Parameters
    ----------
    pathway_id : str
        Pathway identifier to parse into a dataframe. Example: 'path:hsa00010'

    org : str, optional, default: 'hsa'
        KEGG organism code. It is assigned to the KEGG service, passed to
        :func:`kegg_to_uniprot`, and used as a filter: an identifier is
        kept only if it contains the service's organism code or 'ec' AND
        is contained in the mapping returned by :func:`kegg_to_uniprot`.
        NOTE(review): earlier documentation stated that ``None`` disables
        this filtering, but there is no such branch in this function --
        confirm before relying on ``org=None``.

    verbose : bool, optional, default: False
        If True, logs a message through `logger` before the pathway is
        parsed.

    cache : bool, optional, default: False
        If True, HTTP responses are cached by `bioservices`. This can save
        time but you will eventually miss out on new database releases if
        your cache is old.

    Returns
    -------
    `pd.DataFrame`
        The result of ``make_interaction_frame(sources, targets, labels)``;
        expected to carry 'source', 'target', 'label', 'pubmed', and
        'experiment_type' columns (defined by that helper).

    Raises
    ------
    ValueError
        If a relation references an entry id that is not present in the
        pathway's entries.

    """
    kegg = KEGG(cache=cache)
    kegg.organism = org
    kegg_to_up = kegg_to_uniprot(org, cache)

    # Announce the work before the (potentially slow) parse, not after it.
    if verbose:
        logger.info("Parsing pathway {}".format(pathway_id))

    res = kegg.parse_kgml_pathway(pathway_id)

    # Index entries by id once instead of rebuilding an id list and scanning
    # it four times per relation. setdefault keeps the FIRST entry for a
    # duplicated id, which is what list.index() used to select.
    entries_by_id = {}
    for entry in res['entries']:
        entries_by_id.setdefault(entry['id'], entry)

    def entry_for(entry_id):
        # Keep raising ValueError for unknown ids, as list.index() did.
        try:
            return entries_by_id[entry_id]
        except KeyError:
            raise ValueError("{!r} is not in list".format(entry_id))

    organism = kegg.organism

    def is_mappable(identifier):
        # Accept identifiers from the organism's gene database or 'ec'
        # (substring match, as before) that also have a UniProt mapping.
        known_db = organism in identifier or 'ec' in identifier
        return known_db and identifier in kegg_to_up

    sources = []
    targets = []
    labels = []

    for rel in res['relations']:
        entry1 = entry_for(rel['entry1'])
        entry2 = entry_for(rel['entry2'])
        name1, name2 = entry1['name'], entry2['name']
        type1, type2 = entry1['type'], entry2['type']
        reaction_type = rel['name'].replace(' ', '-')
        link_type = rel['link']

        if link_type not in links_to_include:
            continue

        if type1 not in types_to_include or type2 not in types_to_include:
            continue

        # An entry name can list several space-separated identifiers.
        # Validity depends on a single identifier only, so filter each side
        # once rather than re-checking inside the nested loop.
        ids_a = [a for a in name1.strip().split(' ') if is_mappable(a)]
        ids_b = [b for b in name2.strip().split(' ') if is_mappable(b)]

        for a in ids_a:
            for b in ids_b:
                sources.append(a)
                targets.append(b)
                labels.append(reaction_type)

    interactions = make_interaction_frame(sources, targets, labels)
    return interactions
result = {}

for inputGene in inputGenes:
    print('Processing gene ' + inputGene + ':')
    try:
        pathways = k.get_pathway_by_gene(inputGene, 'hsa')
    except (AttributeError):
        print('Invalid gene identifier')
        continue

    else:
        if pathways:
            for pathway in pathways:
                if pathway != 'hsa01100':
                    print('\tProcessing pathway ' + pathway)
                    rel = k.parse_kgml_pathway(pathway)
                    genes_result = []

                    ####### Part where the functions are called
                    for entry in rel['entries']:
                        if entry['type'] == 'gene':
                            for aGene in getGeneNameFromEntries(entry['name']):
                                if aGene == inputGene:
                                    getTargetsFromRelations(rel, entry)
                    ############################

                    unique_genes = list(set(genes_result))

                    if pathway in result:
                        result[pathway] += genes_result
                    else: