def process_phylome(phyid, n, species_list, dbs, step, verbose):
    """Print one2one groups and their consistency score for a phylome.

    For every seed id of phylome `phyid` that has not already been emitted
    as a member of an earlier group, `process_tree` is asked for the groups
    and the tree; one tab-separated line is written to stdout per seed that
    yields groups, ending with the score returned by `get_consistency`.

    phyid        -- phylome id passed to the PhylomeDB connection
    n            -- number of "one2oneN" columns written in the header
    species_list -- forwarded unchanged to process_tree / get_consistency
    dbs          -- if truthy, external ids are fetched with get_external
                    and appended to each protein id, separated by "|"
    step         -- progress is reported on stderr roughly every `step` seeds
    verbose      -- if truthy, timestamped status messages go to stderr
    """
    if verbose:
        stamp = datetime.ctime(datetime.now())
        sys.stderr.write("[%s] Connecting to PhylomeDB...\n" % stamp)
    conn = _getConnection()

    # first element of the returned value holds the seed ids
    seed_ids = conn.get_phylome_seed_ids(phyid)[0]
    total = len(seed_ids)
    if verbose:
        stamp = datetime.ctime(datetime.now())
        sys.stderr.write(
            "[%s] Processing %s seeds from phylome_%s...\n"
            % (stamp, total, phyid)
        )

    # header: "#one2one1<TAB>one2one2<TAB>...consistency score"
    columns = ["one2one%s\t" % (idx + 1,) for idx in range(n)]
    sys.stdout.write("#" + "".join(columns) + "consistency score\n")

    next_report = skipped = positives = 0
    processed = set()
    for count, seedid in enumerate(seed_ids, 1):
        # seed already reported as a member of a previous group
        if seedid in processed:
            skipped += 1
            continue

        # groups look like A1-B1, A2-B2 and so on
        groups, tree = process_tree(conn, phyid, n, species_list, seedid)
        if not groups:
            continue

        line = ""
        for group in groups:
            for protid in group:
                processed.add(protid)
                extids = get_external(conn, protid, dbs) if dbs else []
                line += "|".join([protid] + extids) + ","
            # swap the trailing "," of the group for the column separator
            line = line[:-1] + "\t"

        # consistency across collateral trees; also returns the updated set
        score, processed = get_consistency(
            conn, tree, phyid, seedid, species_list, n, processed
        )
        sys.stdout.write(line + "%.3f\n" % score)
        positives += 1

        # progress line (overwritten in place thanks to "\r")
        if count > next_report:
            next_report += step
            sys.stderr.write(" %s / %s\t%s \r" % (count, total, positives))

    if verbose:
        stamp = datetime.ctime(datetime.now())
        sys.stderr.write(
            "[%s] Processed %s seed proteins (duplicated skipped: %s ). %s homologous groups printed.\n"
            % (stamp, total, skipped, positives)
        )
def main():
    """Dump the proteomes of a phylome as FASTA.

    Command line options:
      -p  phylome id (mandatory; the parser reports an error if missing or 0)
      -s  write one file per proteome ("phylome_<id>.<proteome>.fasta")
          instead of a single "phylome_<id>.fasta"
      -v  switches verbose OFF (verbose defaults to True)

    Each sequence returned by get_seqs_in_genome is written as
    ">id\\nsequence\\n".
    """
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage, version="%prog 1.0")
    parser.add_option("-p", dest="phyid", default=0, type=int,
                      help="define phylome id [mandatory]")
    parser.add_option("-s", dest="split", default=False, action="store_true",
                      help="split fasta for ids from every line")
    parser.add_option("-v", dest="verbose", default=True, action="store_false")

    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("%s\n" % (str(o), ))

    if not o.phyid:
        parser.error("Specify mandatory parameters!")

    # connect
    sys.stderr.write("Connecting...\n")
    p = _getConnection()

    sys.stderr.write("Fetching proteomes...\n")
    # one common output unless a file per proteome was requested
    common_out = None
    if not o.split:
        common_out = open("phylome_%s.fasta" % o.phyid, "w")
    try:
        # use the requested phylome (the id used to be hard-coded to 120)
        proteomes = p.get_proteomes_in_phylome(o.phyid)['proteomes']
        for proteome, pdata in proteomes.items():
            # proteome ids are "<species code>.<version>"
            spcode, ver = proteome.split(".")
            taxid = pdata['taxid']
            sys.stderr.write(" %s (%s) \r" % (spcode, proteome))
            if o.split:
                out = open("phylome_%s.%s.fasta" % (o.phyid, proteome), "w")
            else:
                out = common_out
            try:
                seqs = p.get_seqs_in_genome(taxid, ver, filter_isoforms=True)
                for seqid, data in seqs.items():
                    out.write(">%s\n%s\n" % (seqid, data['seq']))
            finally:
                # per-proteome files are finished here; the common file
                # stays open until every proteome has been written
                if o.split:
                    out.close()
    finally:
        if common_out is not None:
            common_out.close()
def phylome2orthogroups(out, phyid, species, one2one, fraction, step, verbose):
    """Report orthogroups for given phylome and set of species.

    Iterates over the seed ids of phylome `phyid`, fetches the best tree of
    every seed not yet reported in an earlier orthogroup and writes each
    orthogroup returned by `tree2orthogroups` to `out` as one tab-separated
    line.

    out      -- object with a write() method receiving the orthogroup lines
    phyid    -- phylome id
    species, one2one, fraction -- forwarded unchanged to tree2orthogroups
    step     -- progress is reported for seed number 1, step+1, 2*step+1, ...
                (values <= 0 disable progress reporting)
    verbose  -- if truthy, status and progress are written to stderr
    """
    # get phylomeDB connection
    p = _getConnection()
    # first element of the returned value holds the seed ids
    seedids = p.get_phylome_seed_ids(phyid)[0]
    if verbose:
        sys.stderr.write("Processing trees...\n")
    processed = set()
    trees = rooted = ogroups = 0
    for i, seedid in enumerate(seedids, 1):
        # already a member of a previously reported orthogroup
        if seedid in processed:
            continue
        # get tree
        trees_dict = p.get_best_tree(seedid, phyid)
        if not trees_dict:
            continue
        t = trees_dict['tree']
        if not t:
            continue
        trees += 1
        # set species naming function
        t.set_species_naming_function(_get_spcode)
        # NOTE: rooting via root_tree(t, seedid, phyid) is currently disabled,
        # so this guard repeats the check above and `rooted` mirrors `trees`.
        if not t:
            continue
        rooted += 1
        # get orthogroup and report
        orthogroup = tree2orthogroups(t, seedid, species, one2one, fraction,
                                      verbose)
        if not orthogroup:
            continue
        ogroups += 1
        out.write("\t".join(orthogroup) + "\n")
        # members must not be used as seeds again
        processed.update(orthogroup)
        # (i - 1) % step == 0 matches the former i % step == 1 for step > 1,
        # but also works for step == 1 and cannot divide by zero.
        if verbose and step > 0 and (i - 1) % step == 0:
            sys.stderr.write(" %s / %s %s %s %s\r"
                             % (i, len(seedids), trees, rooted, ogroups))
def phylome2orthogroups(out, phyid, species, one2one, fraction, step, verbose):
    """Report orthogroups for given phylome and set of species.

    For each seed id of phylome `phyid` that was not already written as a
    member of an earlier orthogroup, the best tree is fetched and handed to
    `tree2orthogroups`; every non-empty result is written to `out` as a
    single tab-separated line.

    out      -- object with a write() method receiving the orthogroup lines
    phyid    -- phylome id
    species, one2one, fraction -- forwarded unchanged to tree2orthogroups
    step     -- progress is reported for seed number 1, step+1, 2*step+1, ...
                (values <= 0 disable progress reporting)
    verbose  -- if truthy, status and progress are written to stderr
    """
    # get phylomeDB connection
    p = _getConnection()
    # get seeds (first element of the returned value)
    seedids = p.get_phylome_seed_ids(phyid)[0]
    if verbose:
        sys.stderr.write("Processing trees...\n")
    processed = set()
    trees = rooted = ogroups = 0
    for i, seedid in enumerate(seedids, 1):
        # skip seeds already reported inside another orthogroup
        if seedid in processed:
            continue
        # get tree
        trees_dict = p.get_best_tree(seedid, phyid)
        if not trees_dict:
            continue
        t = trees_dict['tree']
        if not t:
            continue
        trees += 1
        # set species naming function
        t.set_species_naming_function(_get_spcode)
        # NOTE: rooting via root_tree(t, seedid, phyid) is currently disabled,
        # so this guard repeats the check above and `rooted` mirrors `trees`.
        if not t:
            continue
        rooted += 1
        # get orthogroup and report
        orthogroup = tree2orthogroups(t, seedid, species, one2one, fraction,
                                      verbose)
        if not orthogroup:
            continue
        ogroups += 1
        out.write("\t".join(orthogroup) + "\n")
        # add to processed
        processed.update(orthogroup)
        # print info; (i - 1) % step == 0 equals the former i % step == 1 for
        # step > 1, also fires for step == 1 and never divides by zero
        if verbose and step > 0 and (i - 1) % step == 0:
            sys.stderr.write(" %s / %s %s %s %s\r"
                             % (i, len(seedids), trees, rooted, ogroups))
def main():
    """Write the sequences of all proteomes of a phylome to FASTA file(s).

    Command line options:
      -p  phylome id (mandatory; the parser reports an error if missing or 0)
      -s  write one file per proteome ("phylome_<id>.<proteome>.fasta")
          instead of a single "phylome_<id>.fasta"
      -v  switches verbose OFF (verbose defaults to True)

    Each sequence returned by get_seqs_in_genome is written as
    ">id\\nsequence\\n".
    """
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage, version="%prog 1.0")
    parser.add_option("-p", dest="phyid", default=0, type=int,
                      help="define phylome id [mandatory]")
    parser.add_option("-s", dest="split", default=False, action="store_true",
                      help="split fasta for ids from every line")
    parser.add_option("-v", dest="verbose", default=True, action="store_false")

    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("%s\n" % (str(o), ))

    if not o.phyid:
        parser.error("Specify mandatory parameters!")

    # connect
    sys.stderr.write("Connecting...\n")
    p = _getConnection()

    sys.stderr.write("Fetching proteomes...\n")
    # open common output unless one file per proteome was requested
    common_out = None
    if not o.split:
        common_out = open("phylome_%s.fasta" % o.phyid, "w")
    try:
        # query the phylome given with -p (the id used to be hard-coded to 120)
        proteomes = p.get_proteomes_in_phylome(o.phyid)['proteomes']
        for proteome, pdata in proteomes.items():
            # proteome ids are "<species code>.<version>"
            spcode, ver = proteome.split(".")
            taxid = pdata['taxid']
            sys.stderr.write(" %s (%s) \r" % (spcode, proteome))
            # open output for each proteome if requested
            if o.split:
                out = open("phylome_%s.%s.fasta" % (o.phyid, proteome), "w")
            else:
                out = common_out
            try:
                seqs = p.get_seqs_in_genome(taxid, ver, filter_isoforms=True)
                for seqid, data in seqs.items():
                    out.write(">%s\n%s\n" % (seqid, data['seq']))
            finally:
                # a per-proteome file is complete at this point
                if o.split:
                    out.close()
    finally:
        if common_out is not None:
            common_out.close()
def profile(handle, out, phyid, protid2phylmeFn, spCode, speciesInRows,
            annotationFn, verbose):
    """Generate orthologous gene profile.

    Reads FASTA records from `handle`, looks up the orthologs of every
    protein in phylome `phyid` and writes a tab-separated table with the
    number of orthologs per species to `out`. Species are ordered by the
    mean of the scores collected in `code2score` by `get_orthologs`.

    handle          -- FASTA input; record ids look like "protid|gene_..."
                       (the gene part is optional)
    out             -- object with a write() method receiving the table
    phyid           -- phylome id
    protid2phylmeFn, spCode -- forwarded to get_protid2phyid
    speciesInRows   -- falsy: one row per protein, one column per species;
                       truthy: transposed, one row per species followed by
                       "#Protid", "#SeedID" and "#Annotation" rows
    annotationFn    -- optional annotation file handed to load_annotation
    verbose         -- if truthy, warn on stderr about proteins with fewer
                       than two orthologs
    """
    # get phylomeDB connection
    p = _getConnection()

    # get protids and gene names, e.g. orf19.3038|TPS2
    protids = []
    genes = []
    for r in SeqIO.parse(handle, 'fasta'):
        fields = r.id.split('|')
        gene = protid = fields[0]
        # get gene name if present (text before the first "_")
        if len(fields) > 1:
            gene = fields[1].split('_')[0]
        protids.append(protid)
        genes.append(gene)

    # load annotation
    prot2ann = {}
    if annotationFn:
        prot2ann = load_annotation(annotationFn)

    # get species info and define empty profiles
    code2name = get_species_in_phylome(phyid, p)
    code2profile = {}
    code2score = {}
    for code in code2name:
        code2profile[code] = [0] * len(protids)
        code2score[code] = []

    # get phylomedb ids and fill profiles
    protid2seedid = {}
    k = 0
    protid2phyid = get_protid2phyid(protid2phylmeFn, protids, spCode, p, phyid)
    for i, protid in enumerate(protids, 1):
        sys.stderr.write(" %s / %s %s     \r" % (i, len(protids), protid))
        if protid not in protid2phyid:
            continue
        phyprot = protid2phyid[protid]
        orthologs, code2score, seedid = get_orthologs(phyprot, phyid, p,
                                                      code2score)
        protid2seedid[protid] = seedid
        for o in orthologs:
            code2profile[_get_spcode(o)][i - 1] += 1
        if len(orthologs) > 1:
            k += 1
        elif verbose:
            sys.stderr.write("[WARNING] Only %s orthologs found for %s (%s)!\n"
                             % (len(orthologs), protid, phyprot))
    # len(protids) rather than the loop variable, which is unbound when the
    # input holds no records
    sys.stderr.write("%s proteins; %s with orthologs\n" % (len(protids), k))

    # the species order is fixed once all scores are collected
    ordered_codes = sorted(code2name, key=lambda x: np.mean(code2score[x]))
    # short label "<first letter>.<second word>" of code2name[code][1]
    # (presumably the scientific name of the species)
    short_names = ['%s.%s' % (code2name[code][1][0],
                              code2name[code][1].split()[1])
                   for code in ordered_codes]
    # seed id per protein; empty if unknown or falsy
    seedids = ['%s' % (protid2seedid.get(protid) or "",) for protid in protids]

    if not speciesInRows:
        # header
        lines = ['\t'.join(['#Protid', 'Gene', 'SeedID'] + short_names
                           + ['Annotation'])]
        # data
        for j, (protid, gene) in enumerate(zip(protids, genes)):
            # gene column stays empty if no gene name was given
            cells = [protid, '' if gene == protid else gene, seedids[j]]
            cells.extend('%s' % (code2profile[code][j],)
                         for code in ordered_codes)
            if protid in prot2ann:
                cells.append('%s' % (prot2ann[protid],))
            lines.append('\t'.join(cells))
    else:
        lines = ['\t'.join(['#Species'] + genes)]
        for code, name in zip(ordered_codes, short_names):
            counts = ['%s' % (count,) for count in code2profile[code]]
            lines.append('\t'.join([name] + counts))
        # every trailing row holds exactly one cell per protein and starts
        # on its own line, so columns stay aligned with the header
        lines.append('\t'.join(
            ['#Protid'] + [protid if gene != protid else ''
                           for protid, gene in zip(protids, genes)]))
        lines.append('\t'.join(['#SeedID'] + seedids))
        lines.append('\t'.join(
            ['#Annotation'] + ['%s' % (prot2ann[protid],)
                               if protid in prot2ann else ''
                               for protid in protids]))
    out.write('\n'.join(lines))
def process_phylome(phyid, species_list=None, one2one=True,
                    collpase_inparalogs=False, missingSpeciesTh=0.10,
                    step=100):
    """Generate (or load cached) orthogroups of a phylome.

    If not species_list, all species of given phylome are taken. The seed
    species of the phylome is always added. Orthogroups covering less than
    1 - missingSpeciesTh of the species are dropped. Results are cached in
    "phylome<phyid>_orthogroups_<no. of species>_<missingSpeciesTh>.txt" in
    the current directory (one tab-separated orthogroup per line); if that
    file exists it is loaded instead of querying the trees again.

    phyid               -- phylome id
    species_list        -- species codes to consider; the caller's list is
                           not modified
    one2one, collpase_inparalogs -- forwarded to get_orthogroups
    missingSpeciesTh    -- tolerated fraction of missing species
    step                -- progress is printed roughly every `step` trees

    Returns (all_orthogroups, orthoFpath, species_list). Groups loaded from
    the cache are lists of strings.
    """
    sys.stdout.write("Generating orthogroups...\n")
    p = _getConnection()

    # get some necessary info
    if not species_list:
        proteomes_in_phylome = p.get_proteomes_in_phylome(phyid)['proteomes']
        # proteome ids are "<species code>.<version>"
        species_list = [proteomeID.split('.')[0]
                        for proteomeID in proteomes_in_phylome]
    else:
        # work on a copy so the seed species is not appended to the
        # caller's list
        species_list = list(species_list)
    sys.stdout.write(" for %s species: %s\n"
                     % (len(species_list), ", ".join(species_list)))

    # make sure seed species is in orthogroups
    seed_sp = p.get_phylome_info(phyid)['seed_proteome'].split('.')[0]
    if seed_sp not in species_list:
        species_list.append(seed_sp)

    orthoFpath = 'phylome%s_orthogroups_%s_%s.txt' % (
        phyid, len(species_list), missingSpeciesTh)
    if os.path.isfile(orthoFpath):
        sys.stdout.write(" Loading orthologous groups from file: %s\n"
                         % orthoFpath)
        with open(orthoFpath) as cached:
            all_orthogroups = [line.strip().split('\t') for line in cached]
        return all_orthogroups, orthoFpath, species_list

    all_orthogroups = []
    phylome_seedids = p.get_phylome_seed_ids(phyid)[0]  # loading seedids
    trees = not_included = i = pI = low_species_cov = 0
    pt = datetime.now()
    try:
        with open(orthoFpath, 'w') as outFile:
            for seedid in phylome_seedids:
                trees_dict = p.get_best_tree(seedid, phyid)
                t = trees_dict['tree'] if trees_dict else None
                if not t:
                    # seed without a usable tree; reported as "skipped"
                    not_included += 1
                    continue
                trees += 1
                # process orthogroups
                orthogroups = get_orthogroups(t, seedid, phyid, species_list,
                                              one2one, collpase_inparalogs)
                for og in orthogroups:
                    og_species = set(_get_spcode(o) for o in og)
                    species_coverage = len(og_species) * 1.0 / len(species_list)
                    # not enough species in orthogroup
                    if species_coverage < 1 - missingSpeciesTh:
                        low_species_cov += 1
                        continue
                    outFile.write("\t".join("%s" % (o,) for o in og) + '\n')
                    all_orthogroups.append(og)
                    i += 1
                # progress, with the time spent since the previous report
                if trees > pI:
                    pI += step
                    sys.stdout.write(" %s %s %s\t%s\r"
                                     % (trees, i, seedid, datetime.now() - pt))
                    pt = datetime.now()
    except BaseException:
        # an incomplete file would be loaded as a valid cache by the next
        # call, so remove it before propagating the error
        if os.path.isfile(orthoFpath):
            os.remove(orthoFpath)
        raise

    sys.stdout.write("\n")
    sys.stdout.write(
        " Processed %s trees (skipped: %s ) for %s seeds. %s one2one orthologous groups and %s with species coverage < %s.\n"
        % (trees, not_included, len(phylome_seedids), i, low_species_cov,
           1 - missingSpeciesTh))
    return all_orthogroups, orthoFpath, species_list