def write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight,
                             results_dictionary, orf_dictionary, contig,
                             candidate_orf_pos, orfid, compact_output):
    """Write the GFF line of one ORF annotated from ``candidatedbname``.

    The line is made of the shortened contig name, the seven fields
    source, feature, start, end, score, strand and frame, and one attribute
    column holding ID, locus_tag, contig_length, orf_length, partial,
    sourcedb, annotvalue, ec and product (joined with ';').

    Nothing is written when ``candidatedbname`` is not a key of
    ``results_dictionary``; the ORF record is still read in that case, so a
    malformed record is reported either way.

    Args:
        outputgff_file: handle passed to ``fprintf`` as the output target.
        candidatedbname: database name used for ``sourcedb`` and as the key
            into ``results_dictionary``.
        dbname_weight: not used by this function.
        results_dictionary: ``results_dictionary[dbname][orfid]`` must give
            a mapping with the keys 'value', 'ec' and 'product'.
        orf_dictionary: ``orf_dictionary[contig][candidate_orf_pos]`` must
            give the ORF record (a mapping of GFF fields and attributes).
        contig: key of the contig in ``orf_dictionary``.
        candidate_orf_pos: index/key of the ORF within its contig.
        orfid: key of the ORF in ``results_dictionary[candidatedbname]``.
        compact_output: not used by this function; ids are always passed
            through the shortening helpers.

    On any error a message is sent through ``eprintf``, the traceback is
    printed and ``exit_process()`` is called.
    """
    try:
        orf = orf_dictionary[contig][candidate_orf_pos]

        columns = [ShortenContigId(orf['seqname'])]
        for field in ('source', 'feature', 'start', 'end', 'score', 'strand',
                      'frame'):
            columns.append(str(orf[field]))

        try:
            attributes = [
                "ID=" + ShortenORFId(orf['id']),
                "locus_tag=" + ShortenORFId(orf['locus_tag']),
            ]
        except Exception:
            # Best effort: fall back to the unshortened identifiers.
            attributes = [
                "ID=" + orf['id'],
                "locus_tag=" + orf['locus_tag'],
            ]

        attributes.append("contig_length=" + orf['contig_length'])
        attributes.append("orf_length=" + orf['orf_length'])
        attributes.append("partial=" + orf['partial'])
        attributes.append("sourcedb=" + candidatedbname)

        # Only ORFs of a database that has results are written out.
        if candidatedbname not in results_dictionary:
            return

        hit = results_dictionary[candidatedbname][orfid]
        attributes.append("annotvalue=" + str(hit['value']))
        attributes.append("ec=" + str(hit['ec']))
        attributes.append("product=" + hit['product'])

        columns.append(";".join(attributes))
        fprintf(outputgff_file, "%s\n", "\t".join(columns))
    except Exception:
        eprintf("ERROR : Failure to annotate in contig %s\n", contig)
        # print_exc() writes the traceback itself and returns None, so its
        # result must not be printed.
        traceback.print_exc(10)
        exit_process()
# Example no. 2
# 0
def insert_orf_into_dict(line, contig_dict, shortenorfid=False):
    """Parse one tab-separated GFF line and append it to ``contig_dict``.

    The line is split on tabs and every column is stripped of surrounding
    whitespace. Lines that do not have exactly nine columns are ignored.
    Otherwise a record dict is built with the keys 'seqname', 'source',
    'feature', 'start' (int), 'end' (int), 'score' (float when it parses,
    else the raw text), 'strand' and 'frame'; the ninth column is handed to
    ``split_attributes`` together with the record. The record is appended to
    ``contig_dict[seqname]``, creating the list on first use.

    Args:
        line: one line of text with tab-separated columns.
        contig_dict: mapping of sequence name to list of records; modified
            in place.
        shortenorfid: when true, the sequence name is passed through
            ``ShortenContigId``; if that call fails the full name is kept.

    Returns:
        None.

    Raises:
        ValueError: if the start or end column is not an integer.
    """
    fields = [field.strip() for field in line.split('\t')]

    # Anything that is not a nine-column record is silently skipped.
    if len(fields) != 9:
        return

    seqname = fields[0]
    if shortenorfid:
        try:
            seqname = ShortenContigId(fields[0])
        except Exception:
            # Best effort: keep the original name when it cannot be shortened.
            seqname = fields[0]

    attributes = {}
    attributes['seqname'] = seqname  # duplicates the key used in contig_dict
    attributes['source'] = fields[1]
    attributes['feature'] = fields[2]
    attributes['start'] = int(fields[3])
    attributes['end'] = int(fields[4])

    try:
        attributes['score'] = float(fields[5])
    except ValueError:
        # Non-numeric scores are stored verbatim.
        attributes['score'] = fields[5]

    attributes['strand'] = fields[6]
    attributes['frame'] = fields[7]

    split_attributes(fields[8], attributes)

    contig_dict.setdefault(seqname, []).append(attributes)
def create_annotation(results_dictionary, dbname, annotated_gff, output_dir, Taxons, orfsPicked, orfToContig, lca, compact_output=False, sample_name=""):
    """Append the functional and taxonomic table rows for the picked ORFs.

    ORFs are read from ``annotated_gff`` through ``GffFileParser``. For every
    ORF whose short id (``getShortORFId``) is in ``orfsPicked`` one
    tab-separated row is appended to
    ``output_dir + PATHDELIM + sample_name +
    ".functional_and_taxonomic_table.txt"`` with the columns: ORF id,
    orf_length, start, end, sequence name, contig_length, strand, ec,
    taxonomy and product.

    Args:
        results_dictionary: not used by this function.
        dbname: not used by this function.
        annotated_gff: argument handed to ``GffFileParser``.
        output_dir: directory of the table; created when it does not exist.
        Taxons: mapping of short ORF id to a taxonomy value.
        orfsPicked: container of the short ORF ids to report.
        orfToContig: mapping updated in place with short ORF id -> contig
            for every reported ORF.
        lca: object providing ``get_supported_taxon`` and
            ``get_preferred_taxonomy``.
        compact_output: when true, the ORF id and sequence name are passed
            through ``ShortenORFId`` / ``ShortenContigId`` before writing.
        sample_name: prefix of the output file name.
    """
    if not path.exists(output_dir):
        makedirs(output_dir)

    gffreader = GffFileParser(annotated_gff)

    output_table_name = output_dir + PATHDELIM + sample_name + ".functional_and_taxonomic_table.txt"
    output_table_file = open(output_table_name, 'a')

    # try/finally guarantees the handle is released even when a record is
    # malformed or a helper raises half way through the file.
    try:
        for contig in gffreader:
            for orf in gffreader.orf_dictionary[contig]:
                shortORFId = getShortORFId(orf['id'])
                if shortORFId not in orfsPicked:
                    continue

                orfToContig[shortORFId] = contig

                if shortORFId in Taxons:
                    taxonomy_id = lca.get_supported_taxon(Taxons[shortORFId], return_id=True)
                    # Fall back to the raw taxonomy when no preferred one
                    # is returned.
                    taxonomy = lca.get_preferred_taxonomy(taxonomy_id) or Taxons[shortORFId]
                else:
                    taxonomy = 'root'

                product = orf['product']  # leave product as it is
                orf_id = orf['id']
                seqname = orf['seqname']
                if compact_output:
                    orf_id = ShortenORFId(orf_id)
                    seqname = ShortenContigId(seqname)

                # Collect the whole row first so that a missing key cannot
                # leave a partially written line behind.
                middle_columns = [
                    orf['orf_length'],
                    orf['start'],
                    orf['end'],
                    seqname,
                    orf['contig_length'],
                    orf['strand'],
                    orf['ec'],
                    taxonomy,
                ]

                fprintf(output_table_file, "%s", orf_id)
                for value in middle_columns:
                    fprintf(output_table_file, "\t%s", value)
                fprintf(output_table_file, "\t%s\n", product)
    finally:
        output_table_file.close()
def print_orf_table(results, orfToContig, output_dir, outputfile, compact_output=False):
    """Write a tab-separated table of per-database annotations for each ORF.

    ``results[dbname][orfname]`` is iterated as a sequence of hits, each a
    mapping with the keys 'query' and 'product'. For every query only the
    first hit listed per database is kept. Depending on which of 'cog',
    'kegg', 'cazy', 'metacyc' or 'seed' occurs (case-insensitively) in the
    database name, the stripped product is converted with ``cog_id``,
    ``kegg_id``, ``cazy_id`` or ``seed_id``, or stored unchanged.

    The table has the columns ORF id, contig id, COG, KEGG, SEED, CAZY,
    METACYC, REFSEQ, followed by one column per remaining database in
    sorted name order. A header line is written before the first row; no
    header is written when there are no ORFs. Missing values are written as
    empty strings.

    The name and number of ORF entries of every database are printed to
    standard output.

    Args:
        results: mapping dbname -> orfname -> hits.
        orfToContig: mapping of ``orfname`` to the contig name.
        output_dir: directory created when it does not exist.
        outputfile: handle passed to ``fprintf`` as the output target.
        compact_output: when true, the contig name is passed through
            ``ShortenContigId`` before writing.
    """
    if not path.exists(output_dir):
        makedirs(output_dir)

    std_dbnames = ['cog', 'kegg', 'seed', 'cazy', 'metacyc', 'refseq']

    # Pass 1: collect the retained hit of every (ORF, database) pair.
    orf_dict = {}
    for dbname in results.keys():
        print("%s %s" % (dbname, len(results[dbname].keys())))

        # How a product is converted depends only on the database name, so
        # classify the database once instead of once per hit.
        db_kind = None
        for key in ('cog', 'kegg', 'cazy', 'metacyc', 'seed'):
            if re.search(key, dbname, re.I):
                db_kind = key
                break

        for orfname in results[dbname]:
            for orf in results[dbname][orfname]:
                hits = orf_dict.setdefault(orf['query'], {})
                if dbname in hits:  # a hit was already kept for this database
                    continue

                hits['contig'] = orfToContig[orfname]
                product = orf['product'].strip()

                if db_kind == 'cog':
                    hits[dbname] = cog_id(product)
                elif db_kind == 'kegg':
                    hits[dbname] = kegg_id(product)
                elif db_kind == 'cazy':
                    hits[dbname] = cazy_id(product)
                elif db_kind == 'seed':
                    hits[dbname] = seed_id(product)
                else:
                    # metacyc and unrecognised databases keep the product text
                    hits[dbname] = product

    # Pass 2: map every column name to the database that fills it.
    database_maps = {}
    for dbname in results.keys():
        for key in ('cog', 'kegg', 'cazy', 'seed', 'metacyc', 'refseq'):
            if re.search(key, dbname, re.I):
                database_maps[key] = dbname
                break
        else:
            database_maps[dbname] = dbname

    # Copy the standard names: the column list grows below and must not
    # change the list the non-standard check is made against.
    dbnames = list(std_dbnames)
    headers = ["#  ORF_ID", "CONTIG_ID"]
    for std_dbname in std_dbnames:
        headers.append(std_dbname.upper())

    for dbname in sorted(results.keys()):
        is_standard = False
        for std_dbname in std_dbnames:
            if re.search(std_dbname, dbname, re.I):
                is_standard = True
                break
        if not is_standard:
            dbnames.append(dbname)
            headers.append(dbname)

    # Pass 3: emit the header once, then one row per ORF.
    addHeader = True
    for orfn in orf_dict:
        hits = orf_dict[orfn]

        contigName = hits['contig']
        if compact_output:
            contigName = ShortenContigId(contigName)

        row = [orfn, contigName]
        for dbname in dbnames:
            if dbname in database_maps and database_maps[dbname] in hits:
                row.append(hits[database_maps[dbname]])
            else:
                row.append("")

        if addHeader:
            fprintf(outputfile, "# %s\n", '\t'.join(headers))
            addHeader = False

        fprintf(outputfile, "%s\n", '\t'.join(row))
def print_orf_table(results, orfToContig, output_dir, outputfile, compact_output=False):
    """Write a fixed eight-column, tab-separated annotation table per ORF.

    ``results[dbname][orfname]`` is iterated as a sequence of hits, each a
    mapping with the keys 'query' and 'product'. For every query only the
    first hit listed per database is kept. Depending on which of 'cog',
    'kegg', 'cazy', 'metacyc' or 'seed' occurs (case-insensitively) in the
    database name, the stripped product is converted with ``cog_id``,
    ``kegg_id``, ``cazy_id`` or ``seed_id``, or stored unchanged.

    Each output row holds: ORF id, contig id, COG, KEGG, SEED, CAZY,
    MetaCyc and RefSeq values; a missing value is an empty string. No
    header line is written.

    The name and number of ORF entries of every database are printed to
    standard output.

    Args:
        results: mapping dbname -> orfname -> hits.
        orfToContig: mapping of ``orfname`` to the contig name.
        output_dir: directory created when it does not exist.
        outputfile: handle passed to ``fprintf`` as the output target.
        compact_output: when true, the contig name is passed through
            ``ShortenContigId`` before writing.
    """
    if not path.exists(output_dir):
        makedirs(output_dir)

    # Pass 1: collect the retained hit of every (ORF, database) pair.
    orf_dict = {}
    for dbname in results.keys():
        print("%s %s" % (dbname, len(results[dbname].keys())))

        # How a product is converted depends only on the database name, so
        # classify the database once instead of once per hit.
        db_kind = None
        for key in ('cog', 'kegg', 'cazy', 'metacyc', 'seed'):
            if re.search(key, dbname, re.I):
                db_kind = key
                break

        for orfname in results[dbname]:
            for orf in results[dbname][orfname]:
                hits = orf_dict.setdefault(orf['query'], {})
                if dbname in hits:  # a hit was already kept for this database
                    continue

                hits['contig'] = orfToContig[orfname]
                product = orf['product'].strip()

                if db_kind == 'cog':
                    hits[dbname] = cog_id(product)
                elif db_kind == 'kegg':
                    hits[dbname] = kegg_id(product)
                elif db_kind == 'cazy':
                    hits[dbname] = cazy_id(product)
                elif db_kind == 'seed':
                    hits[dbname] = seed_id(product)
                else:
                    # metacyc and unrecognised databases keep the product text
                    hits[dbname] = product

    # Pass 2: find which database feeds each column. A database name is
    # tested against every keyword, so it may feed more than one column.
    database_maps = {}
    for dbname in results.keys():
        for key in ('cog', 'kegg', 'cazy', 'seed', 'metacyc', 'refseq'):
            if re.search(key, dbname, re.I):
                database_maps[key] = dbname

    # Pass 3: one row per ORF, columns in the fixed output order.
    column_order = ('cog', 'kegg', 'seed', 'cazy', 'metacyc', 'refseq')
    for orfn in orf_dict:
        hits = orf_dict[orfn]

        contigName = hits['contig']
        if compact_output:
            contigName = ShortenContigId(contigName)

        row = [orfn, contigName]
        for key in column_order:
            if key in database_maps and database_maps[key] in hits:
                row.append(hits[database_maps[key]])
            else:
                row.append("")

        fprintf(outputfile, "%s\n", '\t'.join(row))