def kegg_pathways():
    """
    Obtains the proteins and interactions of KEGG Pathways.

    Returns a pair of dicts, both keyed by pathway name: the first maps
    each pathway to the set of protein IDs occurring in it, the second
    maps each pathway to the set of ``(source, target)`` pairs.
    """

    data = kegg_interactions()
    # Index 3 is the pathway field of each interaction record.
    # Indexing (rather than tuple unpacking) keeps this working both
    # with plain 4-tuples and with the 7-field `KeggInteraction`
    # namedtuples returned by the newer `kegg_interactions` — the old
    # `for u1, u2, eff, pw in data` unpacking raised ValueError on the
    # namedtuple records.
    pws = common.uniq_list(i[3] for i in data)
    proteins_pws = dict((pw, set()) for pw in pws)
    interactions_pws = dict((pw, set()) for pw in pws)

    for i in data:
        u1, u2, pw = i[0], i[1], i[3]
        proteins_pws[pw].add(u1)
        proteins_pws[pw].add(u2)
        interactions_pws[pw].add((u1, u2))

    return proteins_pws, interactions_pws
def li2012_interactions():
    """
    Converts table read by ``pypath.inputs.li2012.get_li2012`` to
    list of interactions.
    """

    interactions = []

    for row in get_li2012():

        substrate = row[1].split('/')[0]
        kinase = row[2].split()[0]
        reader = row[3].split()[0]
        route = row[4]

        # each row yields two directed edges: kinase -> substrate
        # and substrate -> phospho-motif reader
        interactions.extend((
            (kinase, substrate, route, 'phosphorylation'),
            (substrate, reader, route, 'phosphomotif_binding'),
        ))

    return [list(i) for i in common.uniq_list(interactions)]
def li2012_enzyme_substrate():
    """
    Converts table read by `pypath.dataio.get_li2012()` to list of dicts
    of kinase-substrate interactions.
    """

    fields = (
        'substrate',
        'kinase',
        'instance',
        'start',
        'end',
        'resaa',
        'resnum',
    )
    non_digit = re.compile(r'[^\d]+')
    records = []

    for row in get_li2012():

        substrate = row[1].split('/')[0]
        kinase = row[2].split()[0]
        # residue number is the digits after the slash
        resnum = int(non_digit.sub('', row[1].split('/')[1]))

        # instance, start and end are not provided by this resource;
        # the residue letter is always 'Y' (tyrosine)
        records.append((substrate, kinase, None, None, None, 'Y', resnum))

    return [dict(zip(fields, rec)) for rec in common.uniq_list(records)]
def kegg_interactions():
    """
    Downloads and processes KEGG Pathways.
    Returns list of interactions.
    """
    # NOTE(review): a second definition of `kegg_interactions` appears
    # later in this module and shadows this one at import time.

    # matches the `hsa<digits>` pathway identifier inside an href
    rehsa = re.compile(r'.*(hsa[0-9]+).*')

    # KEGG's KGML endpoint expects a Referer header; requests without
    # it are presumably rejected — confirm against the service
    req_hdrs = [
        'Referer: http://www.genome.jp/kegg-bin/show_pathway'
        '?map=hsa04710&show_description=show'
    ]
    hsa_list = []
    interactions = []

    # fetch the pathway listing page
    c = curl.Curl(urls.urls['kegg_pws']['list_url'], silent = True)
    htmllst = c.result
    lstsoup = bs4.BeautifulSoup(htmllst, 'html.parser')

    # collect (hsa identifier, pathway name) pairs from the anchors
    for a in lstsoup.find_all('a', href = True):
        m = rehsa.match(a['href'])
        if m:
            hsa_list.append((m.groups(0)[0], a.text))

    prg = progress.Progress(
        len(hsa_list), 'Processing KEGG Pathways', 1, percent = False)

    for hsa, pw in hsa_list:

        prg.step()
        # download the KGML document of this pathway
        c = curl.Curl(urls.urls['kegg_pws']['kgml_url_2'] % hsa,
                      silent = True,
                      req_headers = req_hdrs)
        kgml = c.result
        kgmlsoup = bs4.BeautifulSoup(kgml, 'html.parser')
        entries = {}

        # map KGML entry IDs to lists of gene symbols; '...' is a
        # truncation marker in the graphics name attribute
        for ent in kgmlsoup.find_all('entry'):
            gr = ent.find('graphics')
            if gr and 'name' in gr.attrs:
                entries[ent.attrs['id']] = [
                    n.strip()
                    for n in gr.attrs['name'].replace('...', '').split(',')
                ]

        # translate each entry's gene symbols to UniProt IDs
        uentries = dict([(eid, common.uniq_list(
            common.flat_list([
                mapping.map_name(
                    gn, 'genesymbol', 'uniprot', strict = True)
                for gn in gns
            ]))) for eid, gns in iteritems(entries)])

        # each <relation> is a directed interaction between two entries;
        # keep it only if both sides mapped to UniProt and the relation
        # carries a named subtype (only the FIRST subtype is used here)
        for rel in kgmlsoup.find_all('relation'):
            st = rel.find('subtype')
            if (
                rel.attrs['entry1'] in uentries and
                rel.attrs['entry2'] in uentries and
                st and
                'name' in st.attrs
            ):
                # all-against-all pairs of the two entries' proteins
                for u1 in uentries[rel.attrs['entry1']]:
                    for u2 in uentries[rel.attrs['entry2']]:
                        interactions.append((u1, u2, st.attrs['name'], pw))

    prg.terminate()

    return common.uniq_list(interactions)
def kegg_interactions():
    """
    Downloads and processes KEGG Pathways.
    Returns list of interactions.
    """

    # subtype names used to derive the sign of an interaction
    positive_terms = {'activation', 'expression'}
    negative_terms = {'inhibition', 'repression'}
    # subtypes implying transcriptional regulation
    transc_terms = {'expression', 'repression'}
    # subtypes describing a molecular mechanism
    mechanism_terms = {
        'phosphorylation',
        'binding/association',
        'dissociation',
        'ubiquitination',
        'dephosphorylation',
        'glycosylation',
        'state change',
        'methylation',
    }
    # NOTE(review): this set is defined but never used below;
    # `is_direct` is computed from the literal 'indirect effect'
    direct_terms = {'indirect effect'}

    KeggInteraction = collections.namedtuple(
        'KeggInteraction',
        [
            'id_a',
            'id_b',
            'effect',
            'pathway',
            'mechanism',
            'is_direct',
            'transcriptional',
        ],
    )

    # matches the `hsa<digits>` pathway identifier inside an href
    rehsa = re.compile(r'.*(hsa[0-9]+).*')
    # KEGG's KGML endpoint expects a Referer header; requests without
    # it are presumably rejected — confirm against the service
    req_hdrs = [
        'Referer: http://www.genome.jp/kegg-bin/show_pathway'
        '?map=hsa04710&show_description=show'
    ]
    hsa_list = []
    interactions = []

    # fetch the pathway listing page
    c = curl.Curl(urls.urls['kegg_pws']['list_url'], silent=True)
    htmllst = c.result
    lstsoup = bs4.BeautifulSoup(htmllst, 'html.parser')

    # collect (hsa identifier, pathway name) pairs from the anchors
    for a in lstsoup.find_all('a', href=True):
        m = rehsa.match(a['href'])
        if m:
            hsa_list.append((m.groups(0)[0], a.text))

    prg = progress.Progress(len(hsa_list), 'Processing KEGG Pathways', 1,
                            percent=False)

    for hsa, pw in hsa_list:

        prg.step()
        # download the KGML document of this pathway
        c = curl.Curl(urls.urls['kegg_pws']['kgml_url_2'] % hsa,
                      silent=True, req_headers=req_hdrs)
        kgml = c.result
        kgmlsoup = bs4.BeautifulSoup(kgml, 'html.parser')
        entries = {}

        # map KGML entry IDs to lists of gene symbols; '...' is a
        # truncation marker in the graphics name attribute
        for ent in kgmlsoup.find_all('entry'):
            gr = ent.find('graphics')
            if gr and 'name' in gr.attrs:
                entries[ent.attrs['id']] = [
                    n.strip()
                    for n in gr.attrs['name'].replace('...', '').split(',')
                ]

        # translate each entry's gene symbols to UniProt IDs
        uentries = dict([(eid, common.uniq_list(
            common.flat_list([
                mapping.map_name(gn, 'genesymbol', 'uniprot', strict=True)
                for gn in gns
            ]))) for eid, gns in iteritems(entries)])

        for rel in kgmlsoup.find_all('relation'):
            # unlike the older implementation, ALL subtype annotations
            # of the relation are collected here
            subtypes = {st.attrs['name'] for st in rel.find_all('subtype')}

            if (rel.attrs['entry1'] in uentries and
                    rel.attrs['entry2'] in uentries and
                    subtypes):

                is_direct = 'indirect effect' not in subtypes
                # negative terms take precedence over positive ones
                effect = ('inhibition' if negative_terms & subtypes
                          else 'activation' if positive_terms & subtypes
                          else 'unknown')
                # NOTE(review): set intersection order is not
                # deterministic, so the joined mechanism string may
                # vary in term order between runs
                mechanism = ';'.join(mechanism_terms & subtypes)
                transcriptional = bool(transc_terms & subtypes)

                # all-against-all pairs of the two entries' proteins
                for u1 in uentries[rel.attrs['entry1']]:
                    for u2 in uentries[rel.attrs['entry2']]:
                        interactions.append(
                            KeggInteraction(
                                id_a=u1,
                                id_b=u2,
                                effect=effect,
                                pathway=pw,
                                mechanism=mechanism,
                                is_direct=is_direct,
                                transcriptional=transcriptional,
                            ))

    prg.terminate()

    return common.uniq_list(interactions)
def get_pubmed_data(pp, cachefile=None, htp_threshold=20):
    """
    For one PyPath object, obtains metadata for all PubMed IDs
    through NCBI E-utils.

    :param pp:
        ``pypath.PyPath`` object
    :param str cachefile:
        Path of the pickle file caching previously downloaded PubMed
        records; if ``None``, the ``pubmed_cache`` setting is used.
    :param int htp_threshold:
        The number of interactions for one reference
        above the study considered to be high-throughput.
        ``None`` disables the high-throughput filtering.
    """

    if cachefile is None:
        cachefile = settings.get('pubmed_cache')

    if htp_threshold is not None:
        pp.htp_stats()

    pubmeds = common.uniq_list(
        common.flat_list(
            [[r.pmid for r in e['references']] for e in pp.graph.es]
        )
    )

    if htp_threshold is not None:
        # drop references above the high-throughput threshold
        pubmeds = set(pubmeds) - pp.htp[htp_threshold]['htrefs']

    # non-numeric references are not valid PubMed IDs
    notpmid = [i for i in pubmeds if not i.isdigit()]
    sys.stdout.write('\t:: Number of non PubMed ID references: %u\n' %
                     len(notpmid))

    pmdata = {}

    if os.path.exists(cachefile):
        sys.stdout.write('\t:: Loading data previously downloaded '
                         'from PubMed, from file `%s`\n' % cachefile)
        # context manager closes the handle even if unpickling fails
        # (the original leaked the file object)
        with open(cachefile, 'rb') as fp:
            pmdata = pickle.load(fp)

    missing = list(set(pubmeds) - set(pmdata.keys()))
    sys.stdout.write('\t:: Downloading data from PubMed about %s papers\n' %
                     len(missing))
    cached_pubmeds_len = len(pmdata)
    pmdata_new = pubmed_input.get_pubmeds(missing)
    pmdata.update(pmdata_new)

    sys.stdout.write('\t:: Saving PubMed data to file `%s`\n' % cachefile)

    # rewrite the cache only if anything new has been downloaded
    if len(pmdata) > cached_pubmeds_len:
        with open(cachefile, 'wb') as fp:
            pickle.dump(pmdata, fp)

    # use a set for the membership tests below: `pubmeds` may still be
    # a list here (when htp_threshold is None), making `in` O(n)
    pubmeds = set(pubmeds)
    pmdata = {pmid: rec for pmid, rec in pmdata.items() if pmid in pubmeds}

    points = []
    earliest = []

    for e in pp.graph.es:

        for s, rs in iteritems(e['refs_by_source']):

            # PMIDs with usable metadata, excluding high-throughput refs
            pms = [
                r.pmid for r in rs
                if (htp_threshold is None or
                    r.pmid not in pp.htp[htp_threshold]['htrefs']) and
                r.pmid in pmdata and 'pubdate' in pmdata[r.pmid]
            ]

            if len(pms) > 0:
                # first four characters of `pubdate` are the year
                yrs = [int(pmdata[pm]['pubdate'][:4]) for pm in pms]
                earliest.append((s, 0, min(yrs), '', e.index))
                for pm in pms:
                    points.append((s, pm, int(pmdata[pm]['pubdate'][:4]),
                                   pmdata[pm]['source'], e.index))

    points = common.uniq_list(points)
    earliest = common.uniq_list(earliest)
    points = pd.DataFrame.from_records(points)
    earliest = pd.DataFrame.from_records(earliest)
    points.columns = ['database', 'pmid', 'year', 'journal', 'eid']
    earliest.columns = ['database', 'none', 'year', 'none', 'eid']

    return points, earliest
def write_set(self, id_list, setname, id_type, map_ids=True):
    """
    Stores the identifiers in `id_list` as a set under `setname`
    in `self.sets`.

    If `map_ids` is True, each identifier is first translated by
    `self.mapper` from `self.ids[id_type]` to `self.target_id` and
    the union of all translations is stored; otherwise the
    identifiers are stored unchanged.
    """
    if map_ids:
        translated = common.flat_list(
            self.mapper.map_name(name, self.ids[id_type], self.target_id)
            for name in id_list
        )
        self.sets[setname] = set(common.uniq_list(translated))
    else:
        self.sets[setname] = set(id_list)
def trip_process(
        exclude_methods=('Inference', 'Speculation'),
        predictions=False,
        species='Human',
        strict=False,
    ):
    """
    Downloads TRIP data by calling `pypath.dataio.take_a_trip()` and
    further processes it.
    Returns dict of dict with TRIP data.

    @exclude_methods : tuple
        Interaction detection methods to be discarded.
        (The former mutable list default has been replaced by a tuple;
        any iterable of method names is accepted.)
    @predictions : bool
        Whether to include predicted interactions.
    @species : str
        Organism name, e.g. `Human`.
    @strict : bool
        Whether include interactions with species not used as a bait
        or not specified.
    """

    nd = 'Not determined'
    # accepted species annotations; in non-strict mode unspecified
    # species and 'Not used as a bait' are accepted too
    spec = set() if strict else {
        'Not specified',
        'Not used as a bait',
        '',
    }
    spec.add(species)
    # frozen copy: O(1) membership tests, caller's iterable untouched
    exclude_methods = frozenset(exclude_methods)
    result = {}
    data = take_a_trip()

    # iterate over all protein pairs occurring in any of the tables
    for uniprots in common.uniq_list(
            common.flat_list([v.keys() for v in data.values()])):

        refs = set()
        mets = set()
        tiss = set()
        reg = set()
        eff = set()

        # records from the 'sc' table: reference, method, tissue
        if uniprots in data['sc']:
            for sc in data['sc'][uniprots]:
                if (sc[4] in spec and sc[6] in spec and
                        (predictions or sc[9] != 'Prediction') and
                        sc[3] not in exclude_methods):
                    refs.add(sc[10])
                    mets.add(sc[3])
                    tiss.add(sc[7])

        # records from the 'vtc' table: reference and method only
        if uniprots in data['vtc']:
            for vtc in data['vtc'][uniprots]:
                if (vtc[4] in spec and vtc[7] in spec and
                        vtc[3] not in exclude_methods):
                    refs.add(vtc[10])
                    mets.add(vtc[3])

        # records from the 'vvc' table: up to two tissue fields
        if uniprots in data['vvc']:
            for vvc in data['vvc'][uniprots]:
                if (vvc[6] in spec and vvc[8] in spec and
                        vvc[3] not in exclude_methods):
                    refs.add(vvc[10])
                    mets.add(vvc[3])
                    if len(vvc[4]) > 0:
                        tiss.add(vvc[4])
                    if len(vvc[5]) > 0:
                        tiss.add(vvc[5])

        # records from the 'cc' table: interacting region pairs
        if uniprots in data['cc']:
            for cc in data['cc'][uniprots]:
                if (cc[4] in spec and cc[6] in spec and
                        cc[3] not in exclude_methods):
                    refs.add(cc[10])
                    mets.add(cc[3])
                    if ((cc[5] != nd and len(cc[5]) > 0) or
                            (cc[7] != nd and len(cc[7]) > 0)):
                        reg.add((cc[5], cc[7]))

        # records from the 'fc' table: effects; note these are NOT
        # filtered by species or excluded methods (as in the original)
        if uniprots in data['fc']:
            for fc in data['fc'][uniprots]:
                mets.add(fc[3])
                refs.add(fc[7])
                if len(fc[5]) > 0:
                    eff.add(fc[5])
                if len(fc[6]) > 0:
                    eff.add(fc[6])

        # keep only pairs supported by at least one reference
        if len(refs) > 0:
            result[uniprots] = {
                'refs': refs,
                'methods': mets,
                'tissues': tiss,
                'effect': eff,
                'regions': reg,
            }

    return result