def write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight,
                             results_dictionary, orf_dictionary, contig,
                             candidate_orf_pos, orfid, compact_output):
    """Write one annotated GFF line for an ORF to ``outputgff_file``.

    The line is built from ``orf_dictionary[contig][candidate_orf_pos]``:
    the ``seqname`` followed by the seven standard columns, then an attribute
    column holding ID, locus_tag, contig_length, orf_length, partial,
    sourcedb, annotvalue, ec and product.

    The line is only written when ``candidatedbname`` is a key of
    ``results_dictionary``; otherwise nothing is emitted. The ORF record is
    still read in that case, so a malformed record is reported either way.

    ``dbname_weight`` and ``compact_output`` are accepted for interface
    compatibility but are not used by this function.

    On any failure the error is reported with ``eprintf``, a traceback is
    printed, ``insert_error(errorcode)`` is called and ``exit_process()``
    is invoked.
    """
    global errorcode
    try:
        fields = [
            'source', 'feature', 'start', 'end', 'score', 'strand', 'frame'
        ]
        orf = orf_dictionary[contig][candidate_orf_pos]

        columns = [orf['seqname']]
        columns.extend(str(orf[field]) for field in fields)

        # Prefer shortened identifiers; fall back to the raw ones if the
        # shortening helper cannot handle them.
        try:
            attributes = "ID=" + ShortenORFId(orf['id'])
            attributes += ";" + "locus_tag=" + ShortenORFId(orf['locus_tag'])
        except Exception:
            attributes = "ID=" + orf['id']
            attributes += ";" + "locus_tag=" + orf['locus_tag']

        # str() keeps this consistent with the rRNA/tRNA writer and avoids a
        # TypeError when the lengths are stored as numbers.
        attributes += ";" + "contig_length=" + str(orf['contig_length'])
        attributes += ";" + "orf_length=" + str(orf['orf_length'])
        attributes += ";" + "partial=" + str(orf['partial'])
        attributes += ";" + "sourcedb=" + candidatedbname

        # Only ORFs with a hit in the candidate database are written, so the
        # "hypothetical protein" placeholder line is never needed.
        if candidatedbname in results_dictionary:
            hit = results_dictionary[candidatedbname][orfid]
            attributes += ";" + "annotvalue=" + str(hit['value'])
            attributes += ";" + "ec=" + str(hit['ec'])
            attributes += ";" + "product=" + hit['product']

            columns.append(attributes)
            fprintf(outputgff_file, "%s\n", "\t".join(columns))
    except Exception:
        eprintf("ERROR : Failure to annotate in contig %s\n", contig)
        # print_exc() writes the traceback itself and returns None, so its
        # result must not be printed.
        traceback.print_exc(10)
        insert_error(errorcode)
        exit_process()
Esempio n. 2
0
def write_16S_tRNA_gene_info(rRNA_dictionary, outputgff_file, tag):
    """Write one GFF line per entry of ``rRNA_dictionary`` to ``outputgff_file``.

    Each line starts with the entry's ``id``, followed by the seven standard
    columns and an attribute column. The ID and locus_tag attributes are the
    shortened ``seqname`` with ``tag`` appended; ``ec`` is left empty.
    """
    gff_columns = ('source', 'feature', 'start', 'end', 'score', 'strand',
                   'frame')

    for rna_key in rRNA_dictionary:
        record = rRNA_dictionary[rna_key]

        cells = [record['id']]
        cells.extend(str(record[name]) for name in gff_columns)

        tagged_name = ShortenORFId(record['seqname']) + tag
        attribute_text = ";".join([
            "ID=" + tagged_name,
            "locus_tag=" + tagged_name,
            "orf_length=" + str(record['orf_length']),
            "contig_length=" + str(record['contig_length']),
            "ec=",
            "product=" + record['product'],
        ])
        cells.append(attribute_text)

        fprintf(outputgff_file, "%s\n", "\t".join(cells))
Esempio n. 3
0
def write_refscores(refscore_file, refscores, compact_output=False):
    """Write each ``refscores`` item as a tab-separated "id<TAB>score" line.

    When ``compact_output`` is true the key is passed through
    ``ShortenORFId`` before being written; otherwise it is written as is.
    """
    for raw_id, score in refscores.items():
        label = ShortenORFId(raw_id) if compact_output else raw_id
        fprintf(refscore_file, "%s\t%s\n", label, score)
    def next(self):
        """Parse the line at ``self.i`` into ``self.data`` and return it.

        The line is split on tabs and the columns are looked up through
        ``self.fieldmap``. The query id is shortened with ``ShortenORFId``
        and every '=' in the product is replaced by a space.

        Note that the same ``self.data`` dict is updated and returned on
        every call.

        Raises StopIteration once ``self.i`` reaches ``self.size``. If a line
        cannot be parsed, the offending line and the current ``self.data``
        are printed and the process exits.
        """
        if self.i >= self.size:
            raise StopIteration()

        try:
            fields = [x.strip() for x in self.lines[self.i].split('\t')]

            self.data['query'] = ShortenORFId(fields[self.fieldmap['query']])
            self.data['q_length'] = int(fields[self.fieldmap['q_length']])
            self.data['bitscore'] = float(fields[self.fieldmap['bitscore']])
            self.data['bsr'] = float(fields[self.fieldmap['bsr']])
            self.data['expect'] = float(fields[self.fieldmap['expect']])
            self.data['identity'] = float(fields[self.fieldmap['identity']])
            self.data['ec'] = fields[self.fieldmap['ec']]
            self.data['product'] = re.sub(r'=', ' ',
                                          fields[self.fieldmap['product']])
            self.i = self.i + 1
            return self.data
        except Exception:
            print(self.lines[self.i])
            # The original referenced an undefined name ``data`` here, which
            # raised NameError instead of printing the diagnostic and exiting.
            print(self.data)
            # NOTE(review): exit status 0 on a parse failure is kept for
            # backward compatibility - confirm whether callers rely on it.
            sys.exit(0)
def insert_attribute(attributes, attribStr):
    """Store a "key=value" string in ``attributes`` under the lower-cased key.

    Key and value are stripped of surrounding whitespace. The value of an
    ``id`` key is passed through ``ShortenORFId`` first. Strings that do not
    split into exactly two parts on '=' are ignored.
    """
    pieces = re.split('=', attribStr)
    if len(pieces) != 2:
        return

    name = pieces[0].strip().lower()
    content = pieces[1].strip()
    attributes[name] = ShortenORFId(content) if name == 'id' else content
Esempio n. 6
0
 def create_refBitScores(self):
    """Fill ``self.refBitScores`` from the two-column file ``self.refscore_file``.

    Each tab-separated line with exactly two columns contributes one entry,
    keyed by the shortened ORF id from the first column. The score in the
    second column is converted with
    ``int((self.Lambda * score - self.lnk) / self.ln2)``; if that
    conversion fails the entry is set to 1. Other lines are skipped.
    """
    # ``with`` guarantees the file is closed even if a line fails to process.
    with open(self.refscore_file, 'r') as refscorefile:
        for line in refscorefile:
            words = [x.strip() for x in line.split('\t')]
            if len(words) != 2:
                continue

            orfid = ShortenORFId(words[0])
            try:
                self.refBitScores[orfid] = int(
                    (self.Lambda * float(words[1]) - self.lnk) / self.ln2)
            except Exception:
                # Fall back to a score of 1 when the value cannot be converted.
                self.refBitScores[orfid] = 1
    def next(self):
        """Advance to the next buffered line and return ``self.data`` or None.

        The buffer is refilled whenever ``self.i`` is a multiple of
        ``self.Size``. A line is skipped (None is returned and ``self.i`` is
        advanced) when it does not have 12 tab-separated columns, when its
        query already reached ``self.opts.limit`` accepted hits, or when
        ``self.isWithinCutoffs`` rejects it. Otherwise the hit count of the
        query is incremented and ``self.data`` is returned.

        Once the position in the buffer is past ``self.size`` the BLAST
        output file is closed and StopIteration is raised.
        """
        if self.i % self.Size == 0:
            self.refillBuffer()

        slot = self.i % self.Size
        if slot >= self.size:
            self.blastoutputfile.close()
            raise StopIteration()

        columns = [tok.strip() for tok in self.lines[slot].rstrip().split('\t')]

        if len(columns) != 12:
            self.i += 1
            return None

        # shorten the ORF id
        columns[0] = ShortenORFId(columns[0])
        if self.needToPermute:
            self.permuteForLAST(columns)

        self.hits_counts.setdefault(columns[0], 0)

        limit_reached = self.hits_counts[columns[0]] >= self.opts.limit
        if (limit_reached or len(columns) != 12
                or not self.isWithinCutoffs(columns, self.data, self.opts,
                                            self.annot_map,
                                            self.refBitScores)):
            self.i += 1
            return None

        self.hits_counts[columns[0]] += 1
        self.i += 1

        try:
            return self.data
        except:
            return None
def create_annotation(results_dictionary, dbname,  annotated_gff,  output_dir, Taxons, orfsPicked, orfToContig, lca, compact_output= False, sample_name = ""):
    """Append one row per picked ORF to the functional and taxonomic table.

    The table is ``<output_dir>/<sample_name>.functional_and_taxonomic_table.txt``
    (joined with ``PATHDELIM``) and is opened in append mode; ``output_dir``
    is created if it does not exist. ORFs are read from ``annotated_gff``
    through ``GffFileParser``; only those whose short id is in ``orfsPicked``
    are written.

    Row columns, tab separated: ORF id, orf_length, start, end, seqname,
    contig_length, strand, ec, taxonomy, product. With ``compact_output`` the
    ORF id and seqname are shortened.

    Taxonomy is the preferred taxonomy reported by ``lca`` for the ORF's
    entry in ``Taxons``, falling back to that entry itself, or 'root' when
    the ORF has no entry.

    Side effect: ``orfToContig`` is updated with short ORF id -> contig for
    every picked ORF.

    ``results_dictionary`` and ``dbname`` are not used by this function.
    """
    if not path.exists(output_dir):
        makedirs(output_dir)

    gffreader = GffFileParser(annotated_gff)

    output_table_name = output_dir + PATHDELIM + sample_name + ".functional_and_taxonomic_table.txt"

    row_format = "%s" + "\t%s" * 9 + "\n"

    # ``with`` closes the table even when a record raises part-way through.
    with open(output_table_name, 'a') as output_table_file:
        for contig in gffreader:
            for orf in gffreader.orf_dictionary[contig]:
                shortORFId = getShortORFId(orf['id'])
                if shortORFId not in orfsPicked:
                    continue

                orfToContig[shortORFId] = contig

                if shortORFId in Taxons:
                    taxonomy_id = lca.get_supported_taxon(Taxons[shortORFId],
                                                          return_id=True)
                    preferred_taxonomy = lca.get_preferred_taxonomy(taxonomy_id)
                    taxonomy = preferred_taxonomy or Taxons[shortORFId]
                else:
                    taxonomy = 'root'

                product = orf['product']  # leave product as it is
                orf_id = orf['id']
                seqname = orf['seqname']
                if compact_output:
                    orf_id = ShortenORFId(orf_id)
                    seqname = ShortenContigId(seqname)

                # Write the row in a single call so a missing key cannot
                # leave a truncated row in the table.
                fprintf(output_table_file, row_format,
                        orf_id, orf['orf_length'], orf['start'], orf['end'],
                        seqname, orf['contig_length'], orf['strand'],
                        orf['ec'], taxonomy, product)
def write_ptinput_files(output_dir_name, contig_dict, sample_name,
                        nucleotide_seq_dict, protein_seq_dict, compact_output):
    """Write the Pathway Tools input files for one sample.

    ``output_dir_name`` is removed and recreated. For every annotation in
    ``contig_dict`` whose product is not 'hypothetical protein', an entry is
    written with ``write_to_pf_file`` and ``append_genetic_elements_file``.
    Annotations that repeat a product already seen are additionally listed
    in ``reduced.txt`` as "<short id><TAB><short id of the first hit>".
    For every contig with a sequence in ``nucleotide_seq_dict`` the wrapped
    sequence is written with ``write_input_sequence_file``. Finally the
    organism file is written from a sanitized ``sample_name``.

    ``reduced.txt`` and ``genetic-elements.dat`` are written under temporary
    names and renamed once complete.

    ``protein_seq_dict`` and ``compact_output`` are accepted for interface
    compatibility but do not influence the output.

    Raises whatever exception prevented the output folder or files from
    being created, after printing a diagnostic. Exits the process if an
    annotation has no usable ``product``.
    """
    genetic_elements_file = None
    try:
        removeDir(output_dir_name)
        makedirs(output_dir_name)
        genetic_elements_file = open(
            output_dir_name + "/.tmp.genetic-elements.dat", 'w')
        reducedpffile = open(output_dir_name + "/tmp.reduced.txt", 'w')
    except Exception:
        print("cannot create the pathway tools files")
        print("perhaps there is already a folder " + output_dir_name)
        traceback.print_exc(file=sys.stdout)
        if genetic_elements_file is not None:
            genetic_elements_file.close()
        # Nothing below can work without both output files; the original
        # carried on and failed later with an unrelated NameError.
        raise

    # product -> {'n': short id of the first hit, 'ec': its EC number}
    first_hits = {}

    try:
        # iterate over every contig sequence
        for key in contig_dict:
            for attrib in contig_dict[key]:
                orf_id = attrib['id']
                shortid = ""
                if attrib['feature'] == 'CDS':
                    shortid = 'O_' + ShortenORFId(orf_id)
                if attrib['feature'] == 'tRNA':
                    shortid = 'O_' + ShortentRNAId(orf_id)

                try:
                    product = attrib['product']
                except Exception:
                    print(attrib)
                    sys.exit(0)

                if product == 'hypothetical protein':
                    continue

                ec = attrib['ec']
                if product not in first_hits:
                    first_hits[product] = {'n': shortid, 'ec': ec}
                # NOTE(review): this tests the EC number against the dict's
                # keys ('n', 'ec'), not against the stored EC value, so it is
                # effectively always true when an EC is present. Kept as in
                # the original - confirm the intended comparison.
                elif ec and ec not in first_hits[product]:
                    first_hits[product]['ec'] = ec
                    first_hits[product]['n'] = shortid
                else:
                    fprintf(reducedpffile, "%s\t%s\n", shortid,
                            first_hits[product]['n'])

                # create the pf file
                write_to_pf_file(output_dir_name, shortid, attrib)

                # append to the gen elements file
                append_genetic_elements_file(genetic_elements_file,
                                             output_dir_name, shortid)

            # write the sequence now only once per contig
            try:
                contig_seq = nucleotide_seq_dict[key]
            except Exception:
                printf(
                    "ERROR: Contig %s missing file in \"preprocessed\" folder for sample\n",
                    key)
                continue

            fastaStr = wrap("", 0, 62, contig_seq)

            # NOTE(review): ``shortid`` is whatever the last annotation of
            # the inner loop left behind (or is undefined if no annotation
            # has been seen yet). Kept as in the original - confirm that the
            # sequence file is meant to be keyed this way.
            write_input_sequence_file(output_dir_name, shortid, fastaStr)
    finally:
        # Close (and thereby flush) both files before they are renamed; the
        # original never closed the reduced file.
        reducedpffile.close()
        genetic_elements_file.close()

    rename(output_dir_name + "/tmp.reduced.txt",
           output_dir_name + "/reduced.txt")

    # Niels: removing annotated.gff from sample_name
    sample_name = re.sub(".annot.gff", '', sample_name)
    sample_name = re.sub('.*/', '', sample_name)
    sample_name = re.sub(r'[\\].', '', sample_name)

    # Niels: trim sample_name to less than 35 characters
    # as it causes PGDB creation to fail
    if len(sample_name) > 35:
        sample_name = sample_name[0:35]

    # Prefix names that are empty or do not start with a letter; an empty
    # name used to raise IndexError here.
    if not sample_name or not sample_name[0].isalpha():
        sample_name = 'E' + sample_name

    write_organisms_dat_file(output_dir_name, sample_name)

    rename(output_dir_name + "/.tmp.genetic-elements.dat",
           output_dir_name + "/genetic-elements.dat")