def _read_records(path, parse):
    """Yield ``parse(line)`` for every data line of the text file *path*.

    Lines starting with ``#`` and lines containing only whitespace are
    skipped; the trailing newline is removed before the line is parsed.
    """
    # Default text mode: the legacy 'rU' flag is rejected by Python 3.11+,
    # and universal newlines are already the default for text files.
    with open(path) as handle:
        for line in handle:
            if line.startswith('#') or line.strip() == '':
                continue
            yield parse(line.rstrip('\n'))


def main(infile, outfile, annotations, descriptions, categories):
    """Write one tab-separated line per SUPERFAMILY hit of every query.

    Args:
        infile: Passed to ``InterproscanResult``.
        outfile: Passed to ``outhandler``; the resulting handle is written to.
        annotations: Path of a text file parsed line by line with
            ``annot_line``; each record is merged with its category record
            and indexed by ``record['id']``.
        descriptions: Path of a text file parsed with ``desc_line``; used as
            a fallback for ids absent from *annotations*, with the three
            category columns left empty.
        categories: Path of a text file parsed with ``cat_line`` and indexed
            by ``record['catid']``.

    Raises:
        KeyError: If an annotation refers to a category id missing from
            *categories*, or a SUPERFAMILY id is found in neither
            *annotations* nor *descriptions*.

    Output columns: seqid, category, short_catname, long_catname, id, name.
    """
    cats = {}
    for record in _read_records(categories, cat_line):
        cats[record['catid']] = record

    annots = {}
    for record in _read_records(annotations, annot_line):
        # Copy the category fields onto the annotation record itself.
        record.update(cats[record['catid']])
        annots[record['id']] = record

    descs = {}
    for record in _read_records(descriptions, desc_line):
        # Description-only entries have no category information.
        record['category'] = ''
        record['short_catname'] = ''
        record['long_catname'] = ''
        descs[record['id']] = record

    template = (
        '{seqid}\t{category}\t{short_catname}\t{long_catname}\t{id}\t{name}\n'
    )
    prefix = 'SSF'

    ips = InterproscanResult(infile)
    with outhandler(outfile) as handle:
        for query, analyses in ips.items():
            if 'SUPERFAMILY' not in analyses:
                continue

            sfids = set()
            for record in analyses['SUPERFAMILY']:
                acc = record.accession
                # Remove the literal prefix only; str.lstrip('SSF') would
                # strip any run of leading 'S'/'F' characters instead.
                if acc.startswith(prefix):
                    acc = acc[len(prefix):]
                sfids.add(acc)

            # Sorted so the output order is reproducible between runs.
            for acc in sorted(sfids):
                result = annots[acc] if acc in annots else descs[acc]
                handle.write(template.format(seqid=query, **result))
def main(infile, outfile, pantherfile):
    """Write the named PANTHER families/subfamilies hit by each query.

    For every query in ``InterproscanResult(infile)`` that has a 'PANTHER'
    analysis, the accession of each hit is looked up in
    ``External2GO(pantherfile, fmt='panther')`` to obtain its name.
    Subfamily accessions (those containing ':') are reported first; family
    accessions follow, unless the family was already covered by one of the
    reported subfamilies. Hits whose name is one of the "NOT NAMED"
    placeholders and repeated accessions are dropped.

    Each kept hit is written to the ``outhandler(outfile)`` handle as
    ``seqid<TAB>accession<TAB>name``.
    """
    placeholders = frozenset(('FAMILY NOT NAMED', 'SUBFAMILY NOT NAMED'))
    row = '{seqid}\t{ids}\t{names}\n'

    lookup = External2GO(pantherfile, fmt='panther')
    scan = InterproscanResult(infile)

    with outhandler(outfile) as out:
        for seqid, per_analysis in scan.items():
            if 'PANTHER' not in per_analysis:
                continue

            accessions = [hit.accession for hit in per_analysis['PANTHER']]
            # Resolve every name up front, before anything is written.
            labels = [lookup[acc].name for acc in accessions]
            hits = list(zip(accessions, labels))

            covered = set()
            kept = []

            # First pass: subfamilies ("FAMILY:SUBFAMILY" accessions).
            for acc, label in hits:
                if ':' not in acc:
                    continue
                if label in placeholders or acc in covered:
                    continue
                # Mark the parent family so it is not reported again below.
                covered.add(acc.split(':')[0])
                covered.add(acc)
                kept.append((acc, label))

            # Second pass: families not yet covered by a subfamily.
            for acc, label in hits:
                if ':' in acc:
                    continue
                if label in placeholders or acc in covered:
                    continue
                covered.add(acc)
                kept.append((acc, label))

            for acc, label in kept:
                out.write(row.format(seqid=seqid, ids=acc, names=label))
def main(
    infile,
    outfile,
    obofile,
    outfmt='long',
    pantherfile=None,
    pfamfile=None,
    smartfile=None,
    interprofile=None,
    prositefile=None,
    printsfile=None,
    prodomfile=None,
    tigrfamfile=None,
    pirsffile=None,
    hamapfile=None,
    domainfile=None,
    datadir=None,
):
    """Write the GO terms associated with each query's InterProScan hits.

    Every ``*file`` argument that is not ``None`` is loaded with
    ``External2GO`` and registered for the analysis name(s) it serves. For
    each query in ``InterproscanResult(infile)``, the accession of every
    record belonging to a registered analysis is looked up in the matching
    database, and the GO ids found there are resolved through
    ``GODag(obofile)`` into (id, name, namespace) triples. Queries with no
    GO term produce no output.

    Args:
        infile: Passed to ``InterproscanResult``.
        outfile: Passed to ``outhandler``; the resulting handle is written to.
        obofile: Ontology file passed to ``GODag`` (joined with *datadir*).
        outfmt: ``'long'`` writes one ``seqid<TAB>go<TAB>term<TAB>domain``
            line per term; ``'association'`` writes one
            ``seqid<TAB>go1;go2;...`` line per query. Any other value
            writes nothing.
        pantherfile: Mapping for 'PANTHER' (loaded with ``fmt='panther'``).
        pfamfile: Mapping for 'Pfam'.
        smartfile: Mapping for 'SMART'.
        interprofile: Mapping for 'IPR'.
        prositefile: Mapping for both 'ProSitePatterns' and
            'ProSiteProfiles'.
        printsfile: Mapping for 'PRINTS'.
        prodomfile: Mapping for 'ProDom'.
        tigrfamfile: Mapping for 'TIGRFAM'.
        pirsffile: Mapping for 'PIRSF'.
        hamapfile: Mapping for 'Hamap'.
        domainfile: Mapping for 'SUPERFAMILY' (loaded with
            ``fmt='superfamily'``).
        datadir: Directory prepended to *obofile* and to every mapping
            file; ``None`` means the paths are used as given.
    """
    if datadir is None:
        datadir = ''

    # (mapping file, analysis names it serves, extra External2GO keywords)
    sources = (
        (pantherfile, ('PANTHER',), {'fmt': 'panther'}),
        (pfamfile, ('Pfam',), {}),
        (smartfile, ('SMART',), {}),
        (interprofile, ('IPR',), {}),
        (prositefile, ('ProSitePatterns', 'ProSiteProfiles'), {}),
        (printsfile, ('PRINTS',), {}),
        (prodomfile, ('ProDom',), {}),
        (tigrfamfile, ('TIGRFAM',), {}),
        (pirsffile, ('PIRSF',), {}),
        (hamapfile, ('Hamap',), {}),
        (domainfile, ('SUPERFAMILY',), {'fmt': 'superfamily'}),
    )

    dbs = {}
    for mapfile, analysis_names, options in sources:
        if mapfile is None:
            continue
        # Load each file once, even when it serves several analyses.
        db = External2GO(pjoin(datadir, mapfile), **options)
        for analysis_name in analysis_names:
            dbs[analysis_name] = db

    ips = InterproscanResult(infile)
    godag = GODag(pjoin(datadir, obofile))

    with outhandler(outfile) as handle:
        for query, analyses in ips.items():
            ontologies = set()
            for analysis, records in analyses.items():
                # No mapping was supplied for this analysis: skip it whole.
                if analysis not in dbs:
                    continue
                db = dbs[analysis]
                for record in records:
                    acc = record.accession
                    if acc not in db:
                        continue
                    for annotation in db[acc].ontologies:
                        go = annotation.id
                        # NOTE(review): a GO id absent from the DAG raises
                        # here, as in the original code.
                        node = godag[go]
                        domain = node.namespace.replace('_', ' ')
                        ontologies.add((go, node.name, domain))

            if not ontologies:
                continue

            # Sorted so the output is reproducible between runs; plain set
            # iteration order varies with string hash randomisation.
            ordered = sorted(ontologies)

            if outfmt == 'long':
                template = "{seqid}\t{go}\t{term}\t{domain}\n"
                for go, term, domain in ordered:
                    handle.write(template.format(
                        seqid=query,
                        go=go,
                        term=term,
                        domain=domain,
                    ))
            elif outfmt == 'association':
                template = "{seqid}\t{gos}\n"
                gos = [go for go, _term, _domain in ordered]
                handle.write(template.format(
                    seqid=query,
                    gos=';'.join(gos),
                ))