Example #1
0
def process_phylome(phyid, n, species_list, dbs, step, verbose):
    """Print a tab-separated table of homologous groups for one phylome.

    The header and one line per reported seed are written to stdout;
    log and progress messages are written to stderr.

    phyid        -- phylome id; its seed ids are fetched from PhylomeDB
    n            -- number of "one2oneN" header columns; also forwarded to
                    process_tree() and get_consistency()
    species_list -- only forwarded to process_tree() and get_consistency()
                    here (original note: if not species_list, all species
                    of given phylome are taken)
    dbs          -- if true, ids returned by get_external() are appended to
                    every protein id, separated by "|"
    step         -- increment of the progress-report threshold
    verbose      -- if true, timestamped start/end messages are logged
    """
    if verbose:
        sys.stderr.write("[%s] Connecting to PhylomeDB...\n" % datetime.now().ctime())

    conn = _getConnection()

    seeds = conn.get_phylome_seed_ids(phyid)[0]
    if verbose:
        sys.stderr.write("[%s] Processing %s seeds from phylome_%s...\n" % (datetime.now().ctime(), len(seeds), phyid))

    # header: "#one2one1<TAB>...one2oneN<TAB>consistency score"
    columns = ["one2one%s\t" % (col + 1,) for col in range(n)]
    sys.stdout.write("#" + "".join(columns) + "consistency score\n")

    next_report = skipped = positives = 0
    processed = set()
    for count, seedid in enumerate(seeds, 1):
        # ids already collected in `processed` are counted and not repeated
        if seedid in processed:
            skipped += 1
            continue

        # list of groups plus the tree they were derived from
        groups, tree = process_tree(conn, phyid, n, species_list, seedid)
        if not groups:
            continue

        fields = ""
        for group in groups:
            members = ""
            for protid in group:
                processed.add(protid)
                extids = get_external(conn, protid, dbs) if dbs else []
                members += "|".join([protid] + extids) + ","
            # drop the last character (the trailing comma) and close the column
            fields = (fields + members)[:-1] + "\t"

        # consistency across collateral trees; also hands back the processed set
        cs, processed = get_consistency(conn, tree, phyid, seedid, species_list, n, processed)
        sys.stdout.write(fields + "%.3f\n" % cs)
        positives += 1

        # progress is only reported for seeds that produced a line
        if count > next_report:
            next_report += step
            sys.stderr.write("  %s / %s\t%s    \r" % (count, len(seeds), positives))

    if verbose:
        sys.stderr.write("[%s] Processed %s seed proteins (duplicated skipped: %s ). %s homologous groups printed.\n" % (datetime.now().ctime(), len(seeds), skipped, positives))
Example #2
0
def process_phylome(phyid, n, species_list, dbs, step, verbose):
    """Print a tab-separated table of homologous groups for one phylome.

    The header and one line per reported seed are written to stdout;
    log and progress messages are written to stderr.

    phyid        -- phylome id; its seed ids are fetched from PhylomeDB
    n            -- number of "one2oneN" header columns; also forwarded to
                    process_tree() and get_consistency()
    species_list -- only forwarded to process_tree() and get_consistency()
                    here (original note: if not species_list, all species
                    of given phylome are taken)
    dbs          -- if true, ids returned by get_external() are appended to
                    every protein id, separated by "|"
    step         -- increment of the progress-report threshold
    verbose      -- if true, timestamped start/end messages are logged
    """
    if verbose:
        sys.stderr.write("[%s] Connecting to PhylomeDB...\n" % datetime.now().ctime())

    conn = _getConnection()

    seeds = conn.get_phylome_seed_ids(phyid)[0]
    if verbose:
        sys.stderr.write("[%s] Processing %s seeds from phylome_%s...\n" % (datetime.now().ctime(), len(seeds), phyid))

    # header: "#one2one1<TAB>...one2oneN<TAB>consistency score"
    columns = ["one2one%s\t" % (col + 1,) for col in range(n)]
    sys.stdout.write("#" + "".join(columns) + "consistency score\n")

    next_report = skipped = positives = 0
    processed = set()
    for count, seedid in enumerate(seeds, 1):
        # ids already collected in `processed` are counted and not repeated
        if seedid in processed:
            skipped += 1
            continue

        # list of groups plus the tree they were derived from
        groups, tree = process_tree(conn, phyid, n, species_list, seedid)
        if not groups:
            continue

        fields = ""
        for group in groups:
            members = ""
            for protid in group:
                processed.add(protid)
                extids = get_external(conn, protid, dbs) if dbs else []
                members += "|".join([protid] + extids) + ","
            # drop the last character (the trailing comma) and close the column
            fields = (fields + members)[:-1] + "\t"

        # consistency across collateral trees; also hands back the processed set
        cs, processed = get_consistency(conn, tree, phyid, seedid, species_list, n, processed)
        sys.stdout.write(fields + "%.3f\n" % cs)
        positives += 1

        # progress is only reported for seeds that produced a line
        if count > next_report:
            next_report += step
            sys.stderr.write("  %s / %s\t%s    \r" % (count, len(seeds), positives))

    if verbose:
        sys.stderr.write("[%s] Processed %s seed proteins (duplicated skipped: %s ). %s homologous groups printed.\n" % (datetime.now().ctime(), len(seeds), skipped, positives))
Example #3
0
def main():
    """Dump the proteomes of one phylome as FASTA.

    Command-line options:
      -p  phylome id (mandatory; 0 is treated as missing)
      -s  write one file per proteome ("phylome_<id>.<proteome>.fasta")
          instead of a single "phylome_<id>.fasta"
      -v  switch OFF verbose output (verbose is on by default)

    Progress messages are written to stderr.
    """
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage,
                          version="%prog 1.0")  #allow_interspersed_args=True

    parser.add_option("-p",
                      dest="phyid",
                      default=0,
                      type=int,
                      help="define phylome id                 [mandatory]")
    parser.add_option("-s",
                      dest="split",
                      default=False,
                      action="store_true",
                      help="split fasta for ids from every line")
    parser.add_option("-v", dest="verbose", default=True, action="store_false")

    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("%s\n" % (str(o), ))

    if not o.phyid:
        parser.error("Specify mandatory parameters!")

    #connect
    sys.stderr.write("Connecting...\n")
    p = _getConnection()

    #open common output
    sys.stderr.write("Fetching proteomes...\n")
    common_out = None
    if not o.split:
        common_out = open("phylome_%s.fasta" % o.phyid, "w")
    try:
        # query the requested phylome (this used to be hard-coded to 120,
        # ignoring the mandatory -p option)
        proteomes = p.get_proteomes_in_phylome(o.phyid)['proteomes']
        for proteome, pdata in proteomes.iteritems():
            spcode, ver = proteome.split(".")
            taxid = pdata['taxid']
            sys.stderr.write(" %s (%s)       \r" % (spcode, proteome))
            #open output for each proteome if requested
            if o.split:
                out = open("phylome_%s.%s.fasta" % (o.phyid, proteome), "w")
            else:
                out = common_out
            try:
                seqs = p.get_seqs_in_genome(taxid, ver, filter_isoforms=True)
                for protid, data in seqs.iteritems():
                    out.write(">%s\n%s\n" % (protid, data['seq']))
            finally:
                # per-proteome files are closed as soon as they are complete
                if o.split:
                    out.close()
    finally:
        if common_out is not None:
            common_out.close()
def phylome2orthogroups(out, phyid, species, one2one, fraction, step, verbose):
    """Write one tab-separated orthogroup per line to `out`.

    Seeds of phylome `phyid` are visited in order; a seed that already
    appeared in a reported orthogroup is not processed again.
    `species`, `one2one` and `fraction` are forwarded to tree2orthogroups().
    With `verbose`, progress is written to stderr whenever the 1-based
    seed index modulo `step` equals 1.
    """
    conn = _getConnection()
    seedids = conn.get_phylome_seed_ids(phyid)[0]
    if verbose:
        sys.stderr.write("Processing trees...\n")

    seen = set()
    n_trees = n_rooted = n_groups = 0
    for idx, seedid in enumerate(seedids, 1):
        if seedid in seen:
            continue

        # best tree for this seed; skip seeds without one
        best = conn.get_best_tree(seedid, phyid)
        if not best:
            continue
        tree = best['tree']
        if not tree:
            continue
        n_trees += 1

        tree.set_species_naming_function(_get_spcode)

        # rooting (root_tree) is disabled; the guard it needed is kept
        if not tree:
            continue
        n_rooted += 1

        members = tree2orthogroups(tree, seedid, species, one2one, fraction,
                                   verbose)
        if not members:
            continue
        n_groups += 1
        out.write("\t".join(members) + "\n")

        # remember every reported id so it is not used as a seed later
        seen.update(members)

        if verbose and idx % step == 1:
            sys.stderr.write(" %s / %s %s %s %s\r" %
                             (idx, len(seedids), n_trees, n_rooted, n_groups))
def phylome2orthogroups(out, phyid, species, one2one, fraction, step, verbose):
    """Write one tab-separated orthogroup per line to `out`.

    Seeds of phylome `phyid` are visited in order; a seed that already
    appeared in a reported orthogroup is not processed again.
    `species`, `one2one` and `fraction` are forwarded to tree2orthogroups().
    With `verbose`, progress is written to stderr whenever the 1-based
    seed index modulo `step` equals 1.
    """
    conn = _getConnection()
    seedids = conn.get_phylome_seed_ids(phyid)[0]
    if verbose:
        sys.stderr.write("Processing trees...\n")

    seen = set()
    n_trees = n_rooted = n_groups = 0
    for idx, seedid in enumerate(seedids, 1):
        if seedid in seen:
            continue

        # best tree for this seed; skip seeds without one
        best = conn.get_best_tree(seedid, phyid)
        if not best:
            continue
        tree = best['tree']
        if not tree:
            continue
        n_trees += 1

        tree.set_species_naming_function(_get_spcode)

        # rooting (root_tree) is disabled; the guard it needed is kept
        if not tree:
            continue
        n_rooted += 1

        members = tree2orthogroups(tree, seedid, species, one2one, fraction, verbose)
        if not members:
            continue
        n_groups += 1
        out.write("\t".join(members) + "\n")

        # remember every reported id so it is not used as a seed later
        seen.update(members)

        if verbose and idx % step == 1:
            sys.stderr.write(" %s / %s %s %s %s\r" % (idx, len(seedids), n_trees, n_rooted, n_groups))
Example #6
0
def main():
    """Dump the proteomes of one phylome as FASTA.

    Command-line options:
      -p  phylome id (mandatory; 0 is treated as missing)
      -s  write one file per proteome ("phylome_<id>.<proteome>.fasta")
          instead of a single "phylome_<id>.fasta"
      -v  switch OFF verbose output (verbose is on by default)

    Progress messages are written to stderr.
    """
    usage = "usage: %prog [options]"
    parser = OptionParser( usage=usage,version="%prog 1.0" ) #allow_interspersed_args=True

    parser.add_option("-p", dest="phyid", default=0, type=int,
                      help="define phylome id                 [mandatory]")
    parser.add_option("-s", dest="split",  default=False, action="store_true",
                      help="split fasta for ids from every line")
    parser.add_option("-v", dest="verbose",  default=True, action="store_false")

    ( o, args ) = parser.parse_args()
    if o.verbose:
        sys.stderr.write( "%s\n" % ( str(o), ) )

    if not o.phyid:
        parser.error( "Specify mandatory parameters!" )

    #connect
    sys.stderr.write("Connecting...\n")
    p = _getConnection()

    #open common output
    sys.stderr.write("Fetching proteomes...\n")
    common_out = None
    if not o.split:
        common_out = open( "phylome_%s.fasta" % o.phyid,"w" )
    try:
        # query the requested phylome (this used to be hard-coded to 120,
        # ignoring the mandatory -p option)
        proteomes = p.get_proteomes_in_phylome(o.phyid)['proteomes']
        for proteome,pdata in proteomes.iteritems():
            spcode,ver = proteome.split(".")
            taxid = pdata['taxid']
            sys.stderr.write(" %s (%s)       \r" % (spcode,proteome) )
            #open output for each proteome if requested
            if o.split:
                out = open( "phylome_%s.%s.fasta" % ( o.phyid,proteome ),"w" )
            else:
                out = common_out
            try:
                seqs = p.get_seqs_in_genome(taxid,ver,filter_isoforms=True)
                for protid,data in seqs.iteritems():
                    out.write( ">%s\n%s\n" % (protid,data['seq']) )
            finally:
                # per-proteome files are closed as soon as they are complete
                if o.split:
                    out.close()
    finally:
        if common_out is not None:
            common_out.close()
Example #7
0
def profile(handle, out, phyid, protid2phylmeFn, spCode, speciesInRows,
            annotationFn, verbose):
    """Generate orthologous gene profile.

    For every protein in the FASTA `handle`, the orthologs returned by
    get_orthologs() are counted per species code, and the resulting
    tab-separated table is written to `out` in a single write() call.

    handle          -- FASTA input for SeqIO.parse(); record ids are split
                       on "|": first field is the protein id, the optional
                       second field (up to the first "_") is the gene name
    out             -- object with a write() method
    phyid           -- phylome id
    protid2phylmeFn -- forwarded to get_protid2phyid()
    spCode          -- forwarded to get_protid2phyid()
    speciesInRows   -- if true, one row per species and one column per
                       protein; otherwise one row per protein
    annotationFn    -- if set, passed to load_annotation() to get
                       per-protein annotations
    verbose         -- if true, warn on stderr about proteins for which at
                       most one ortholog was found
    """
    #get phylomeDB connection
    p = _getConnection()

    #get protids and gene names
    protids = []
    genes = []
    for r in SeqIO.parse(handle, 'fasta'):
        fields = r.id.split('|')  #orf19.3038|TPS2
        gene = protid = fields[0]
        protids.append(protid)
        #get gene name if present
        if len(fields) > 1:
            gene = fields[1].split('_')[0]
        genes.append(gene)

    #load annotation
    prot2ann = {}
    if annotationFn:
        prot2ann = load_annotation(annotationFn)

    #get species info
    code2name = get_species_in_phylome(phyid, p)

    #define empty profiles
    code2profile = {}
    code2score = {}
    for code in code2name:
        code2profile[code] = [0] * len(protids)
        code2score[code] = []

    #get phylomedb ids
    protid2seedid = {}
    k = 0
    protid2phyid = get_protid2phyid(protid2phylmeFn, protids, spCode, p, phyid)
    for i, protid in enumerate(protids, 1):
        sys.stderr.write(" %s / %s %s   \r"%(i, len(protids), protid))
        if protid not in protid2phyid:
            continue
        phyprot = protid2phyid[protid]
        orthologs, code2score, seedid = get_orthologs(phyprot, phyid, p, code2score)
        protid2seedid[protid] = seedid
        #fill profiles
        for o in orthologs:
            code2profile[_get_spcode(o)][i-1] += 1
        if len(orthologs) > 1:
            k += 1
        elif verbose:
            sys.stderr.write("[WARNING] Only %s orthologs found for %s (%s)!\n"%(len(orthologs), protid, phyprot))
    #write info; len(protids) instead of the loop variable, which is
    #undefined when the input contains no sequences
    sys.stderr.write("%s proteins; %s with orthologs\n"%(len(protids), k))

    #species order (by mean score) and short names, computed once
    codes = sorted(code2name, key=lambda x: np.mean(code2score[x]))
    names = ['%s.%s' % (code2name[code][1][0], code2name[code][1].split()[1])
             for code in codes]

    ###print summary
    if not speciesInRows:
        #header
        lines = ['\t'.join(['#Protid', 'Gene', 'SeedID'] + names + ['Annotation'])]
        #data
        for j, (protid, gene) in enumerate(zip(protids, genes)):
            if gene == protid:
                gene = ''
            seedid = protid2seedid.get(protid) or ""
            cells = ['%s\t%s\t%s' % (protid, gene, seedid)]
            cells.extend('%s' % code2profile[code][j] for code in codes)
            if protid in prot2ann:
                cells.append('%s' % prot2ann[protid])
            lines.append('\t'.join(cells))
    else:
        header = ['#Species']
        protid_cells = ['#Protid']
        ann_cells = ['#Annotation']
        for protid, gene in zip(protids, genes):
            header.append('%s' % gene)
            # NOTE(review): the #Protid row gets two cells per protein
            # (protein id and seed id), so it does not line up with the
            # one-cell-per-protein rows; kept as in the original - confirm
            # whether seed ids were meant to go on their own row.
            protid_cells.append('%s' % protid if gene != protid else '')
            protid_cells.append('%s' % (protid2seedid.get(protid) or ""))
            ann_cells.append('%s' % prot2ann[protid] if protid in prot2ann else '')
        lines = ['\t'.join(header)]
        for code, name in zip(codes, names):
            lines.append('\t'.join([name] + ['%s' % count for count in code2profile[code]]))
        # each footer row goes on its own line (they used to be glued to
        # the end of the last species row)
        lines.append('\t'.join(protid_cells))
        lines.append('\t'.join(ann_cells))
    out.write('\n'.join(lines))
Example #8
0
def process_phylome( phyid,species_list=None,one2one=True,collpase_inparalogs=False,missingSpeciesTh=0.10,step=100 ):
    """Collect orthogroups of a phylome and cache them in a text file.

    If not species_list, all species of given phylome are taken. The seed
    species of the phylome is appended to species_list when missing; a
    non-empty list passed by the caller is modified in place.

    Orthogroups are written, one tab-separated line each, to
    'phylome<phyid>_orthogroups_<number of species>_<missingSpeciesTh>.txt'
    in the current directory. If that file already exists it is loaded
    instead and PhylomeDB trees are not processed.

    phyid               -- phylome id
    species_list        -- species codes to consider (see above)
    one2one             -- forwarded to get_orthogroups()
    collpase_inparalogs -- forwarded to get_orthogroups()
    missingSpeciesTh    -- orthogroups covering less than
                           1-missingSpeciesTh of the species are dropped
    step                -- increment of the progress-report threshold

    Returns (all_orthogroups, orthoFpath, species_list).
    """
    print("Generating orthogroups...")
    all_orthogroups = []
    p = _getConnection()
    if not species_list:
        proteomes_in_phylome = p.get_proteomes_in_phylome(phyid)['proteomes']
        species_list = [proteomeID.split('.')[0] for proteomeID in proteomes_in_phylome]

    print(" for %s species: %s" % ( len( species_list ),", ".join( species_list ) ))
    #make sure seed species is in orthogroups
    seed_sp = p.get_phylome_info(phyid)['seed_proteome'].split('.')[0]
    if seed_sp not in species_list:
        species_list.append( seed_sp )

    orthoFpath = 'phylome%s_orthogroups_%s_%s.txt' % ( phyid,len(species_list),missingSpeciesTh )
    # NOTE(review): an interrupted earlier run leaves a partial file here,
    # which is then loaded as if it were complete.
    if os.path.isfile( orthoFpath ):
        print(" Loading orthologous groups from file: %s" % orthoFpath)
        with open(orthoFpath) as cached:
            for line in cached:
                all_orthogroups.append( line.strip().split('\t') )
        return all_orthogroups,orthoFpath,species_list

    phylome_seedids = p.get_phylome_seed_ids(phyid)[0] #loading seedids
    # NOTE(review): not_included is reported as "skipped" below but is never
    # incremented - confirm what it was meant to count.
    trees = not_included = i = pI = low_species_cov = 0
    pt = datetime.now()
    # the context manager closes the output even if processing fails
    with open( orthoFpath,'w' ) as outFile:
        for seedid in phylome_seedids:
            trees_dict = p.get_best_tree( seedid,phyid )
            if not trees_dict:
                continue
            t = trees_dict['tree']
            if not t:
                continue
            trees += 1
            #process orthogroups
            orthogroups = get_orthogroups( t,seedid,phyid,species_list,one2one,collpase_inparalogs )
            for og in orthogroups:
                #keep only the first member of every species on the output line
                seen_species = set()
                cells = []
                for o in og:
                    spCode = _get_spcode(o)
                    if spCode not in seen_species:
                        seen_species.add(spCode)
                        cells.append("%s" % o)
                species_coverage = len(seen_species)*1.0/len(species_list)
                #if not enough species in orthogroup
                if species_coverage < 1-missingSpeciesTh:
                    low_species_cov += 1
                else:
                    outFile.write( "\t".join(cells) + '\n' ) #save in orthogroup file
                    all_orthogroups.append(og) #and add orthogroup to list
                    i += 1

            if trees > pI:
                pI += step
                sys.stdout.write( "   %s %s %s\t%s\r" % ( trees,i,seedid,datetime.now()-pt ) )
                pt = datetime.now()
    print("")
    print(" Processed %s trees (skipped: %s ) for %s seeds. %s one2one orthologous groups and %s with species coverage < %s." % ( trees,not_included,len(phylome_seedids),i,low_species_cov,1-missingSpeciesTh ))
    return all_orthogroups,orthoFpath,species_list
Example #9
0
def profile(handle, out, phyid, protid2phylmeFn, spCode, speciesInRows,
            annotationFn, verbose):
    """Generate orthologous gene profile.

    For every protein in the FASTA `handle`, the orthologs returned by
    get_orthologs() are counted per species code, and the resulting
    tab-separated table is written to `out` in a single write() call.

    handle          -- FASTA input for SeqIO.parse(); record ids are split
                       on "|": first field is the protein id, the optional
                       second field (up to the first "_") is the gene name
    out             -- object with a write() method
    phyid           -- phylome id
    protid2phylmeFn -- forwarded to get_protid2phyid()
    spCode          -- forwarded to get_protid2phyid()
    speciesInRows   -- if true, one row per species and one column per
                       protein; otherwise one row per protein
    annotationFn    -- if set, passed to load_annotation() to get
                       per-protein annotations
    verbose         -- if true, warn on stderr about proteins for which at
                       most one ortholog was found
    """
    #get phylomeDB connection
    p = _getConnection()

    #get protids and gene names
    protids = []
    genes = []
    for r in SeqIO.parse(handle, 'fasta'):
        fields = r.id.split('|')  #orf19.3038|TPS2
        gene = protid = fields[0]
        protids.append(protid)
        #get gene name if present
        if len(fields) > 1:
            gene = fields[1].split('_')[0]
        genes.append(gene)

    #load annotation
    prot2ann = {}
    if annotationFn:
        prot2ann = load_annotation(annotationFn)

    #get species info
    code2name = get_species_in_phylome(phyid, p)

    #define empty profiles
    code2profile = {}
    code2score = {}
    for code in code2name:
        code2profile[code] = [0] * len(protids)
        code2score[code] = []

    #get phylomedb ids
    protid2seedid = {}
    k = 0
    protid2phyid = get_protid2phyid(protid2phylmeFn, protids, spCode, p, phyid)
    for i, protid in enumerate(protids, 1):
        sys.stderr.write(" %s / %s %s   \r"%(i, len(protids), protid))
        if protid not in protid2phyid:
            continue
        phyprot = protid2phyid[protid]
        orthologs, code2score, seedid = get_orthologs(phyprot, phyid, p, code2score)
        protid2seedid[protid] = seedid
        #fill profiles
        for o in orthologs:
            code2profile[_get_spcode(o)][i-1] += 1
        if len(orthologs) > 1:
            k += 1
        elif verbose:
            sys.stderr.write("[WARNING] Only %s orthologs found for %s (%s)!\n"%(len(orthologs), protid, phyprot))
    #write info; len(protids) instead of the loop variable, which is
    #undefined when the input contains no sequences
    sys.stderr.write("%s proteins; %s with orthologs\n"%(len(protids), k))

    #species order (by mean score) and short names, computed once
    codes = sorted(code2name, key=lambda x: np.mean(code2score[x]))
    names = ['%s.%s' % (code2name[code][1][0], code2name[code][1].split()[1])
             for code in codes]

    ###print summary
    if not speciesInRows:
        #header
        lines = ['\t'.join(['#Protid', 'Gene', 'SeedID'] + names + ['Annotation'])]
        #data
        for j, (protid, gene) in enumerate(zip(protids, genes)):
            if gene == protid:
                gene = ''
            seedid = protid2seedid.get(protid) or ""
            cells = ['%s\t%s\t%s' % (protid, gene, seedid)]
            cells.extend('%s' % code2profile[code][j] for code in codes)
            if protid in prot2ann:
                cells.append('%s' % prot2ann[protid])
            lines.append('\t'.join(cells))
    else:
        header = ['#Species']
        protid_cells = ['#Protid']
        ann_cells = ['#Annotation']
        for protid, gene in zip(protids, genes):
            header.append('%s' % gene)
            # NOTE(review): the #Protid row gets two cells per protein
            # (protein id and seed id), so it does not line up with the
            # one-cell-per-protein rows; kept as in the original - confirm
            # whether seed ids were meant to go on their own row.
            protid_cells.append('%s' % protid if gene != protid else '')
            protid_cells.append('%s' % (protid2seedid.get(protid) or ""))
            ann_cells.append('%s' % prot2ann[protid] if protid in prot2ann else '')
        lines = ['\t'.join(header)]
        for code, name in zip(codes, names):
            lines.append('\t'.join([name] + ['%s' % count for count in code2profile[code]]))
        # each footer row goes on its own line (they used to be glued to
        # the end of the last species row)
        lines.append('\t'.join(protid_cells))
        lines.append('\t'.join(ann_cells))
    out.write('\n'.join(lines))
Example #10
0
def process_phylome( phyid,species_list=None,one2one=True,collpase_inparalogs=False,missingSpeciesTh=0.10,step=100 ):
    """Collect orthogroups of a phylome and cache them in a text file.

    If not species_list, all species of given phylome are taken. The seed
    species of the phylome is appended to species_list when missing; a
    non-empty list passed by the caller is modified in place.

    Orthogroups are written, one tab-separated line each, to
    'phylome<phyid>_orthogroups_<number of species>_<missingSpeciesTh>.txt'
    in the current directory. If that file already exists it is loaded
    instead and PhylomeDB trees are not processed.

    phyid               -- phylome id
    species_list        -- species codes to consider (see above)
    one2one             -- forwarded to get_orthogroups()
    collpase_inparalogs -- forwarded to get_orthogroups()
    missingSpeciesTh    -- orthogroups covering less than
                           1-missingSpeciesTh of the species are dropped
    step                -- increment of the progress-report threshold

    Returns (all_orthogroups, orthoFpath, species_list).
    """
    print("Generating orthogroups...")
    all_orthogroups = []
    p = _getConnection()
    if not species_list:
        proteomes_in_phylome = p.get_proteomes_in_phylome(phyid)['proteomes']
        species_list = [proteomeID.split('.')[0] for proteomeID in proteomes_in_phylome]

    print(" for %s species: %s" % ( len( species_list ),", ".join( species_list ) ))
    #make sure seed species is in orthogroups
    seed_sp = p.get_phylome_info(phyid)['seed_proteome'].split('.')[0]
    if seed_sp not in species_list:
        species_list.append( seed_sp )

    orthoFpath = 'phylome%s_orthogroups_%s_%s.txt' % ( phyid,len(species_list),missingSpeciesTh )
    # NOTE(review): an interrupted earlier run leaves a partial file here,
    # which is then loaded as if it were complete.
    if os.path.isfile( orthoFpath ):
        print(" Loading orthologous groups from file: %s" % orthoFpath)
        with open(orthoFpath) as cached:
            for line in cached:
                all_orthogroups.append( line.strip().split('\t') )
        return all_orthogroups,orthoFpath,species_list

    phylome_seedids = p.get_phylome_seed_ids(phyid)[0] #loading seedids
    # NOTE(review): not_included is reported as "skipped" below but is never
    # incremented - confirm what it was meant to count.
    trees = not_included = i = pI = low_species_cov = 0
    pt = datetime.now()
    # the context manager closes the output even if processing fails
    with open( orthoFpath,'w' ) as outFile:
        for seedid in phylome_seedids:
            trees_dict = p.get_best_tree( seedid,phyid )
            if not trees_dict:
                continue
            t = trees_dict['tree']
            if not t:
                continue
            trees += 1
            #process orthogroups
            orthogroups = get_orthogroups( t,seedid,phyid,species_list,one2one,collpase_inparalogs )
            for og in orthogroups:
                #keep only the first member of every species on the output line
                seen_species = set()
                cells = []
                for o in og:
                    spCode = _get_spcode(o)
                    if spCode not in seen_species:
                        seen_species.add(spCode)
                        cells.append("%s" % o)
                species_coverage = len(seen_species)*1.0/len(species_list)
                #if not enough species in orthogroup
                if species_coverage < 1-missingSpeciesTh:
                    low_species_cov += 1
                else:
                    outFile.write( "\t".join(cells) + '\n' ) #save in orthogroup file
                    all_orthogroups.append(og) #and add orthogroup to list
                    i += 1

            if trees > pI:
                pI += step
                sys.stdout.write( "   %s %s %s\t%s\r" % ( trees,i,seedid,datetime.now()-pt ) )
                pt = datetime.now()
    print("")
    print(" Processed %s trees (skipped: %s ) for %s seeds. %s one2one orthologous groups and %s with species coverage < %s." % ( trees,not_included,len(phylome_seedids),i,low_species_cov,1-missingSpeciesTh ))
    return all_orthogroups,orthoFpath,species_list