def process_parsed_blastoutput(dbname, blastparser, cutoffs,
                               annotation_results, pickorfs):
    """Collect annotations for the picked ORFs from a parsed B/LAST output.

    Stops and rewinds the parser as soon as it reaches an ORF that is not
    in pickorfs, so the caller can resume from that position later.
    isWithinCutoffs and getShortORFId come from the surrounding codebase.
    """
    fields = [
        'target', 'q_length', 'bitscore', 'bsr', 'expect', 'identity', 'ec',
        'query', 'product'
    ]
    for data in blastparser:
        if data is not None and isWithinCutoffs(data, cutoffs):
            annotation = {}
            shortORFId = None
            for field in fields:
                if field in data:
                    if field == 'query':
                        # store the short ORF id in place of the full query name
                        shortORFId = getShortORFId(data[field])
                        annotation[field] = shortORFId
                    else:
                        annotation[field] = data[field]

            if shortORFId not in pickorfs:
                blastparser.rewind()
                return None

            annotation['dbname'] = dbname
            if shortORFId not in annotation_results:
                annotation_results[shortORFId] = []

            annotation_results[shortORFId].append(annotation)

    return None
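# isWithinCutoffs is defined elsewhere in this codebase; the sketch below is
# only an illustration of what such a filter could look like, assuming the
# cutoffs object carries thresholds under hypothetical attribute names
# (min_score, max_evalue, min_identity, min_length). The dict keys match the
# fields collected above.
def isWithinCutoffs_sketch(data, cutoffs):
    return (data['bitscore'] >= cutoffs.min_score
            and data['expect'] <= cutoffs.max_evalue
            and data['identity'] >= cutoffs.min_identity
            and data['q_length'] >= cutoffs.min_length)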
def create_annotation(results_dictionary, dbname, annotated_gff, output_dir,
                      Taxons, orfsPicked, orfToContig, lca):
    """Write one functional and taxonomic table row per picked ORF.

    fprintf, GffFileParser, and getShortORFId come from the surrounding
    codebase; path and makedirs are the usual os helpers.
    """
    if not path.exists(output_dir):
        makedirs(output_dir)

    gffreader = GffFileParser(annotated_gff)
    output_table_file = open(output_dir + '/functional_and_taxonomic_table.txt', 'a')

    for contig in gffreader:
        for orf in gffreader.orf_dictionary[contig]:
            shortORFId = getShortORFId(orf['id'])
            if shortORFId not in orfsPicked:
                continue

            orfToContig[shortORFId] = contig

            # prefer the lca-supported taxonomy for the ORF, if one exists
            if shortORFId in Taxons:
                taxonomy_id = lca.get_supported_taxon(Taxons[shortORFId],
                                                      return_id=True)
                preferred_taxonomy = lca.get_preferred_taxonomy(taxonomy_id)
                if preferred_taxonomy:
                    taxonomy = preferred_taxonomy
                else:
                    taxonomy = Taxons[shortORFId]
            else:
                taxonomy = 'root'

            product = orf['product']  # leave the product annotation as-is

            # one tab-separated row per ORF
            fprintf(output_table_file, "%s", orf['id'])
            fprintf(output_table_file, "\t%s", orf['orf_length'])
            fprintf(output_table_file, "\t%s", orf['start'])
            fprintf(output_table_file, "\t%s", orf['end'])
            fprintf(output_table_file, "\t%s", orf['seqname'])
            fprintf(output_table_file, "\t%s", orf['contig_length'])
            fprintf(output_table_file, "\t%s", orf['strand'])
            fprintf(output_table_file, "\t%s", orf['ec'])
            fprintf(output_table_file, "\t%s", taxonomy)
            fprintf(output_table_file, "\t%s\n", product)

    output_table_file.close()
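# The rows written by create_annotation have no header line; the column names
# below are inferred from the fprintf calls above, not authoritative. A
# minimal sketch for reading the table back:
FUNC_TAX_COLUMNS = ['id', 'orf_length', 'start', 'end', 'seqname',
                    'contig_length', 'strand', 'ec', 'taxonomy', 'product']

def read_functional_and_taxonomic_table(filename):
    """Yield one dict per tab-separated row of the table."""
    with open(filename) as handle:
        for line in handle:
            fields = line.rstrip('\n').split('\t')
            if len(fields) == len(FUNC_TAX_COLUMNS):
                yield dict(zip(FUNC_TAX_COLUMNS, fields))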
    def compute_min_support_tree(self, annotate_gff_file, pickorfs, dbname='refseq'):
        """Tally taxon support counts for the picked ORFs and record each
        ORF's LCA taxonomy in pickorfs."""
        self.tax_dbname = dbname
        gffreader = GffFileParser(annotate_gff_file)
        try:
            for contig in gffreader:
                for orf in gffreader.orf_dictionary[contig]:
                    shortORFId = getShortORFId(orf['id'])
                    if shortORFId not in pickorfs:
                        continue

                    species = []
                    if self.tax_dbname in self.results_dictionary:
                        if shortORFId in self.results_dictionary[self.tax_dbname]:
                            # find the top bitscore among hits that clear the
                            # minimum LCA score
                            top_score = 0
                            for hit in self.results_dictionary[self.tax_dbname][shortORFId]:
                                if hit['bitscore'] >= self.lca_min_score and hit['bitscore'] >= top_score:
                                    top_score = hit['bitscore']

                            # collect species from hits whose bitscore lies
                            # within lca_top_percent of the top bitscore
                            for hit in self.results_dictionary[self.tax_dbname][shortORFId]:
                                if (100 - self.lca_top_percent) * top_score / 100 < hit['bitscore']:
                                    names = self.get_species(hit)
                                    if names:
                                        species.append(names)

                    taxonomy = self.getTaxonomy(species)
                    self.update_taxon_support_count(taxonomy)
                    pickorfs[shortORFId] = taxonomy

        except Exception:
            import traceback
            traceback.print_exc()
            print("ERROR : Cannot read annotated gff file %s" % annotate_gff_file)
def get_list_of_queries(annotated_gff):
    """Return the short ORF ids of every ORF in the annotated gff file."""
    orfList = {}
    gffreader = GffFileParser(annotated_gff)
    for contig in gffreader:
        for orf in gffreader.orf_dictionary[contig]:
            orfid = getShortORFId(orf['id'])
            orfList[orfid] = 1

    return list(orfList.keys())
# A later variant of process_parsed_blastoutput: instead of rewinding the
# parser at the first unpicked ORF, it skips hits whose ORF is not in
# pickorfs and processes the whole file.
def process_parsed_blastoutput(dbname, blastparser, cutoffs, annotation_results, pickorfs):
    fields = ['target', 'q_length', 'bitscore', 'bsr', 'expect', 'identity',
              'ec', 'query', 'product']

    try:
        for data in blastparser:
            if data is not None and isWithinCutoffs(data, cutoffs):
                annotation = {}
                shortORFId = None
                for field in fields:
                    if field in data:
                        if field == 'query':
                            shortORFId = getShortORFId(data[field])
                            annotation[field] = shortORFId
                        else:
                            annotation[field] = data[field]

                if shortORFId not in pickorfs:
                    continue

                annotation['dbname'] = dbname
                if shortORFId not in annotation_results:
                    annotation_results[shortORFId] = []

                annotation_results[shortORFId].append(annotation)
    except Exception:
        import traceback
        traceback.print_exc()

    return None
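# A minimal driver sketch, assuming a parsed B/LAST tabular file on disk.
# BlastOutputTsvParser, isWithinCutoffs, and the cutoffs object come from the
# surrounding codebase; the function name and argument order here are
# hypothetical.
def collect_annotations_sketch(dbname, parsed_file, cutoffs, pickorfs):
    annotation_results = {}
    blastparser = BlastOutputTsvParser(dbname, parsed_file)
    process_parsed_blastoutput(dbname, blastparser, cutoffs,
                               annotation_results, pickorfs)
    return annotation_results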
def merge_sorted_parsed_files(dbname,
                              filenames,
                              outputfilename,
                              orfRanks,
                              verbose=False,
                              errorlogger=None):
    """K-way merge of parsed per-split B/LAST outputs into one file, ordered
    by the rank of each line's short ORF id in orfRanks."""
    readerhandles = []

    if verbose:
        eprintf("Processing database  : %s\n", dbname)

    if len(filenames) == 0:
        eprintf("WARNING : Cannot find any B/LAST output file for database : %s\n",
                dbname)
        exit_process()

    try:
        for i in range(len(filenames)):
            readerhandles.append(BlastOutputTsvParser(dbname, filenames[i]))
    except OSError:
        eprintf("ERROR: Cannot read sequence file : %s\n", filenames[i])
        exit_process()

    # set error and warning parameters
    for readerhandle in readerhandles:
        readerhandle.setMaxErrorsLimit(5)
        readerhandle.setErrorAndWarningLogger(errorlogger)
        readerhandle.setSTEP_NAME('PARSE BLAST')

    try:
        outputfile = open(outputfilename, 'w')
        fieldmapHeaderLine = readerhandles[0].getHeaderLine()
        fprintf(outputfile, "%s\n", fieldmapHeaderLine)
    except OSError:
        eprintf("ERROR: Cannot create sequence file : %s\n", outputfilename)
        exit_process()

    # prime the heap with the first line from each reader
    values = []
    for i in range(len(filenames)):
        iterate = iter(readerhandles[i])
        try:
            next(iterate)
            line = readerhandles[i].getProcessedLine()
            fields = [x.strip() for x in line.split('\t')]
            shortORFId = getShortORFId(fields[0])
            values.append((i, orfRanks[shortORFId], line))
        except Exception:
            outputfile.close()
            return

    S = len(filenames)
    BuildHeap(S, values)

    # repeatedly emit the smallest-ranked line, then refill the heap root
    # from the same reader
    while S > 0:
        try:
            iterate = iter(readerhandles[values[0][0]])
            line = readerhandles[values[0][0]].getProcessedLine()
            fprintf(outputfile, "%s\n", line)
            next(iterate)

            line = readerhandles[values[0][0]].getProcessedLine()
            fields = [x.strip() for x in line.split('\t')]
            shortORFId = getShortORFId(fields[0])
            values[0] = (values[0][0], orfRanks[shortORFId], line)
        except Exception:
            # this reader is exhausted; shrink the heap
            values[0] = values[S - 1]
            S = S - 1

        if S > 0:
            Heapify(values, 0, S)

    outputfile.close()
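# BuildHeap and Heapify come from elsewhere in the codebase. A minimal sketch
# of the contract merge_sorted_parsed_files appears to rely on -- a min-heap
# of (reader_index, orf_rank, line) tuples keyed on orf_rank -- is given
# below; this is an assumption about their semantics, not the original code.
def Heapify_sketch(values, i, S):
    # sift values[i] down until the min-heap property holds for the first S slots
    while True:
        smallest = i
        for child in (2 * i + 1, 2 * i + 2):
            if child < S and values[child][1] < values[smallest][1]:
                smallest = child
        if smallest == i:
            return
        values[i], values[smallest] = values[smallest], values[i]
        i = smallest

def BuildHeap_sketch(S, values):
    # heapify bottom-up over the first S entries
    for i in range(S // 2 - 1, -1, -1):
        Heapify_sketch(values, i, S)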