# ---- Load the filtered network table
# Tab-separated file; only its 'protein1' and 'protein2' columns are used here.
network = read_csv('%s/files/string_mouse_network_filtered_800.txt' % wd, sep='\t')

# Proteins that occur in BOTH interaction columns (set intersection, not union).
network_proteins = set(network['protein1']).intersection(network['protein2'])

# ---- Set-up UniProt
uniprot = UniProt(cache=True)

# ---- Set-up QuickGO bioservice
quickgo = QuickGO(cache=True)

# ---- Set-up KEGG bioservice
kegg, kegg_parser = KEGG(cache=True), KEGGParser()
kegg.organism = 'mmu'
# print is called with a single parenthesised argument throughout so that the
# script runs unchanged on Python 2 and Python 3.
print('[INFO] KEGG service configured')

# One parsed KGML document per pathway id exposed by the service.
kegg_pathways = {p: kegg.parse_kgml_pathway(p) for p in kegg.pathwayIds}
# The double space reproduces what `print 'text: ', n` emitted on Python 2.
print('[INFO] KEGG pathways extracted:  %d' % len(kegg_pathways))

# Convert KEGG pathways Gene Name to UniProt
# k2u is used below as a mapping: KEGG gene id -> '<db>:<accession>' string.
k2u = kegg.conv('uniprot', 'mmu')

# For every pathway, collect the accession part (text after ':') of each gene
# listed in its 'gene' entries. An entry name may hold several space-separated
# ids; ids that have no conversion in k2u are dropped.
kegg_pathways_proteins = {
    pathway_id: {
        k2u[gene].split(':')[1]
        for entry in pathway['entries'] if entry['type'] == 'gene'
        for gene in entry['name'].split(' ') if gene in k2u
    }
    for pathway_id, pathway in kegg_pathways.items()
}

# Map every distinct accession seen in any pathway to the third '|'-separated
# field of the first space-delimited token of its FASTA response (presumably
# the UniProt entry name -- confirm against the FASTA header format).
# One `get_fasta` request is issued per distinct accession.
kegg_uniprot_acc_map = {
    acc: uniprot.get_fasta(str(acc)).split(' ')[0].split('|')[2]
    for acc in set().union(*kegg_pathways_proteins.values())
}

# Re-express each pathway's protein set using the mapped identifiers.
kegg_pathways_proteins = {
    pathway_id: {kegg_uniprot_acc_map[acc] for acc in accessions}
    for pathway_id, accessions in kegg_pathways_proteins.items()
}
print('[INFO] KEGG pathways Ids converted to UniProt:  %d' % len(kegg_pathways_proteins))

# ---- Set-up GO Terms gene list
go_terms_file = '%s/files/go_terms_uniprot.pickle' % wd
def pathway_to_dataframe(pathway_id, org='hsa', verbose=False, cache=False):
    """
    Extract protein-protein interactions from a KEGG pathway to a
    pandas DataFrame. NOTE: Interactions will be directionless.

    Every relation of the parsed KGML pathway whose link type is in
    `links_to_include` and whose two entries both have a type in
    `types_to_include` is expanded into the cross product of the
    space-separated identifiers of its two entries. A pair is kept only
    when both identifiers contain the organism code (or 'ec') and are
    keys of the mapping returned by :func:`kegg_to_uniprot`.

    Parameters
    ----------
    pathway_id : str
        Pathway identifier to parse into a dataframe. Example: 'path:hsa00010'

    org : str or None, optional, default: 'hsa'
        If supplied, filters out all interactions with identifiers that
        are not in the dictionary created from :func:`kegg_to_uniprot`.
        If None, all interactions are parsed regardless of mappability to
        UniProt.
        NOTE(review): the code below always applies both the organism
        check and the `kegg_to_uniprot` membership check, so the
        unfiltered ``None`` mode described above is not implemented by
        this function -- confirm the intended behaviour.

    verbose : bool, optional, default: False
        If True, logs a message through `logger` naming the pathway
        being processed.

    cache : bool, optional, default: False
        If True, HTTP responses are cached by `bioservices`. This can
        save time but you will eventually miss out on new database
        releases if your cache is old.

    Returns
    -------
    `pd.DataFrame`
        The result of ``make_interaction_frame(sources, targets, labels)``;
        documented as a DataFrame with 'source', 'target', 'label',
        'pubmed', and 'experiment_type' columns.

    Raises
    ------
    ValueError
        If a relation references an entry id that is not present in the
        pathway's entries.
    """
    kegg = KEGG(cache=cache)
    kegg.organism = org
    kegg_to_up = kegg_to_uniprot(org, cache)
    res = kegg.parse_kgml_pathway(pathway_id)
    sources = []
    targets = []
    labels = []

    if verbose:
        logger.info("Parsing pathway {}".format(pathway_id))

    # Index the entries by id once instead of rebuilding the id list and
    # scanning it for every field of every relation. `setdefault` keeps the
    # FIRST entry for a repeated id, which is what `list.index` selected.
    entries_by_id = {}
    for entry in res['entries']:
        entries_by_id.setdefault(entry['id'], entry)

    organism = kegg.organism

    def is_mappable(identifier):
        # Substring checks, as in the original filter: the identifier must
        # mention the organism code or 'ec' AND have a UniProt mapping.
        return ((organism in identifier or 'ec' in identifier)
                and identifier in kegg_to_up)

    for rel in res['relations']:
        id1 = rel['entry1']
        id2 = rel['entry2']
        for entry_id in (id1, id2):
            if entry_id not in entries_by_id:
                # Same exception type a failed `list.index` lookup raised.
                raise ValueError(
                    "relation references unknown entry id {!r}".format(entry_id)
                )
        entry1 = entries_by_id[id1]
        entry2 = entries_by_id[id2]
        name1 = entry1['name']
        name2 = entry2['name']
        type1 = entry1['type']
        type2 = entry2['type']

        reaction_type = rel['name'].replace(' ', '-')
        link_type = rel['link']

        if link_type not in links_to_include:
            continue
        if type1 not in types_to_include or type2 not in types_to_include:
            continue

        # An entry name may list several space-separated identifiers; filter
        # each side once, then emit every remaining (a, b) combination in the
        # same order the nested loops over the raw names produced.
        valid_sources = [a for a in name1.strip().split(' ') if is_mappable(a)]
        valid_targets = [b for b in name2.strip().split(' ') if is_mappable(b)]
        for a in valid_sources:
            for b in valid_targets:
                sources.append(a)
                targets.append(b)
                labels.append(reaction_type)

    interactions = make_interaction_frame(sources, targets, labels)
    return interactions
result = {} for inputGene in inputGenes: print('Processing gene ' + inputGene + ':') try: pathways = k.get_pathway_by_gene(inputGene, 'hsa') except (AttributeError): print('Invalid gene identifier') continue else: if pathways: for pathway in pathways: if pathway != 'hsa01100': print('\tProcessing pathway ' + pathway) rel = k.parse_kgml_pathway(pathway) genes_result = [] ####### Part where the functions are called for entry in rel['entries']: if entry['type'] == 'gene': for aGene in getGeneNameFromEntries(entry['name']): if aGene == inputGene: getTargetsFromRelations(rel, entry) ############################ unique_genes = list(set(genes_result)) if pathway in result: result[pathway] += genes_result else: