def __init__(self, dbname,  blastoutput):
        self.dbname = dbname
        self.blastoutput = blastoutput
        self.i=1
        self.data = {}
        self.fieldmap={}
        self.seq_beg_pattern = re.compile("#")

        try:
           self.blastoutputfile = open( blastoutput,'r')
           self.lines=self.blastoutputfile.readlines()
           self.blastoutputfile.close()
           self.size = len(self.lines)
           if not self.seq_beg_pattern.search(self.lines[0]) :
              exit_process("First line must have field header names and begin with \"#\"")
           header = self.lines[0].replace('#','',1)
           fields = [ x.strip()  for x in header.rstrip().split('\t')]
           k = 0 
           for x in fields:
             self.fieldmap[x] = k 
             k+=1
           eprintf("\nProcessing database : %s\n", dbname)
           
        except AttributeError:
           eprintf("Cannot read the map file for database :%s\n", dbname)
           exit_process()
def process_blastout_file(blast_file, database, table, errorlogger=None):
    try:
        blastfile = open(blast_file, 'r')
    except IOError:
        eprintf("ERROR : Cannot write read file " + blast_file + " !")
        if errorlogger != None:
            errorlogger.write(
                "STATS_rRNA\tERROR\tCannot write read blast output file " +
                blast_file + " for database " + database)
        exit_process()

    blastLines = blastfile.readlines()
    blastfile.close()

    for line in blastLines:
        line = line.strip()
        fields = re.split('\t', line)
        if len(fields) < 12:
            continue
        fields[0] = str(fields[0].strip())
        fields[1] = str(fields[1].strip())
        fields[2] = float(fields[2].strip())
        fields[6] = int(fields[6].strip())
        fields[7] = int(fields[7].strip())
        fields[10] = float(fields[10].strip())
        fields[11] = float(fields[11].strip())
        table[str(fields[0].strip())] = [
            fields[2], fields[10], fields[11], fields[1], fields[6], fields[7]
        ]
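# Hypothetical usage sketch for process_blastout_file (not part of the
# pipeline): the file name and database label below are made up, and eprintf
# is assumed to be the pipeline's stderr printf helper. Each line of the
# tabular B/LAST output must carry at least 12 tab-separated columns, and the
# table ends up keyed by query id with
# [identity, evalue, bitscore, target, q_start, q_end].
def _example_process_blastout_file():
    hit_table = {}
    process_blastout_file("sample.refseq.LASTout", "refseq", hit_table)
    for query in hit_table:
        identity, evalue, bitscore, target, q_start, q_end = hit_table[query]
        eprintf("%s -> %s identity=%.1f evalue=%g\n", query, target, identity, evalue)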
def write_run_parameters_file(fileName, parameters):
    try:
       paramFile = open(fileName, 'w')
    except IOError:
       eprintf("Cannot write run parameters to file %s!\n", fileName)
       exit_process("Cannot write run parameters to file %s" %(fileName) )

#       16s_rRNA      {'min_identity': '40', 'max_evalue': '0.000001', 'min_bitscore': '06', 'refdbs': 'silva_104_rep_set,greengenes_db_DW'}
    paramFile.write("\nRun Date : " + str(date.today()) + " \n")

    paramFile.write("\n\nNucleotide Quality Control parameters[s.n")
    paramFile.write( "  min length" + "\t" + str(parameters['quality_control']['min_length']) + "\n")

    paramFile.write("\n\nORF prediction parameters[s.n")
    paramFile.write( "  min length" + "\t" + str(parameters['orf_prediction']['min_length']) + "\n")
    paramFile.write( "  algorithm" + "\t" + str(parameters['orf_prediction']['algorithm']) + "\n")


    paramFile.write("\n\nAmino acid quality control and annotation parameters[s.n")
    paramFile.write( "  min bit score" + "\t" + str(parameters['annotation']['min_score']) + "\n")
    paramFile.write( "  min seq length" + "\t" + str(parameters['annotation']['min_length']) + "\n")
    paramFile.write( "  annotation reference dbs" + "\t" + str(parameters['annotation']['dbs']) + "\n")
    paramFile.write( "  min BSR" + "\t" + str(parameters['annotation']['min_bsr']) + "\n")
    paramFile.write( "  max evalue" + "\t" + str(parameters['annotation']['max_evalue']) + "\n")

    paramFile.write("\n\nPathway Tools parameters[s.n")
    paramFile.write( "  taxonomic pruning " + "\t" + str(parameters['ptools_settings']['taxonomic_pruning']) + "\n")

    paramFile.write("\n\nrRNA search/match parameters[s.n")
    paramFile.write( "  min identity" + "\t" + str(parameters['rRNA']['min_identity']) + "\n")
    paramFile.write( "  max evalue" + "\t" + str(parameters['rRNA']['max_evalue']) + "\n")
    paramFile.write( "  rRNA reference dbs" + "\t" + str(parameters['rRNA']['refdbs']) + "\n")

    paramFile.close()
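# Hypothetical sketch (not from the pipeline) of the nested parameters
# dictionary that write_run_parameters_file expects; the values below are
# illustrative, but the keys match the ones the function reads above.
_example_run_parameters = {
    'quality_control': {'min_length': '180'},
    'orf_prediction':  {'min_length': '60', 'algorithm': 'prodigal'},
    'annotation':      {'min_score': '20', 'min_length': '60',
                        'dbs': 'refseq,metacyc', 'min_bsr': '0.4',
                        'max_evalue': '0.000001'},
    'ptools_settings': {'taxonomic_pruning': 'no'},
    'rRNA':            {'min_identity': '40', 'max_evalue': '0.000001',
                        'refdbs': 'silva_104_rep_set,greengenes_db_DW'}
}
# write_run_parameters_file("run_parameters.txt", _example_run_parameters)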
def process_rRNA_16S_stats(rRNA_16S_file, rRNA_16S_dictionary):
     try:
        taxonomy_file = open(rRNA_16S_file, 'r')
     except IOError:
        eprintf("Cannot read file %s!\n", rRNA_16S_file)
        exit_process()

     tax_lines = taxonomy_file.readlines()
     similarity_pattern = re.compile("similarity")
     evalue_pattern = re.compile("evalue")
     bitscore_pattern = re.compile("bitscore")
     taxonomy_pattern = re.compile("taxonomy")
     headerScanned = False
     for line in tax_lines:
         if headerScanned == False:
            if similarity_pattern.search(line) and evalue_pattern.search(line) and bitscore_pattern.search(line) and  taxonomy_pattern.search(line):
                headerScanned = True
            continue
         fields = [ x.strip() for x in line.split('\t') ]
         if len(fields) >=6:
           if fields[1]!='-':
              rRNA_16S_dictionary[fields[0]] =  [ fields[1], fields[2], fields[5] ]
           else:
              if len(fields) >=12:
                 if fields[7]!='-':
                     rRNA_16S_dictionary[fields[0]] =  [ fields[7], fields[8], fields[11] ]

     taxonomy_file.close()
def write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight, results_dictionary, orf_dictionary, contig, candidate_orf_pos,  orfid):
   try:
      fields = [  'source', 'feature', 'start', 'end', 'score', 'strand', 'frame' ]


      output_line= orf_dictionary[contig][candidate_orf_pos]['seqname']

      for field in fields:
        # printf("\t%s", orf_dictionary[contig][candidate_orf_pos][field])
         output_line += "\t"+ str(orf_dictionary[contig][candidate_orf_pos][field])

      attributes = "ID="+orf_dictionary[contig][candidate_orf_pos]['id']
      attributes += ";" + "locus_tag="+orf_dictionary[contig][candidate_orf_pos]['locus_tag']
      attributes += ";" + "contig_length="+orf_dictionary[contig][candidate_orf_pos]['contig_length']
      attributes += ";" + "orf_length="+orf_dictionary[contig][candidate_orf_pos]['orf_length']
      attributes += ";" + "partial="+orf_dictionary[contig][candidate_orf_pos]['partial']
      attributes += ";" + "sourcedb="+candidatedbname
     
      if candidatedbname in results_dictionary:
         attributes += ";" + "annotvalue="+str(results_dictionary[candidatedbname][orfid]['value'])
         attributes += ";" + "ec="+str(results_dictionary[candidatedbname][orfid]['ec'])
         attributes += ";" + "product="+results_dictionary[candidatedbname][orfid]['product']
      else:
         attributes += ";" + "annotvalue="+str('0')
         attributes += ";" + "ec="+str('')
         attributes += ";" + "product="+'hypothetical protein'

      output_line += '\t' + attributes
      fprintf(outputgff_file, "%s\n", output_line);
   except:
      eprintf("ERROR : Failure to annotate in contig %s\n", contig)
      #print orf_dictionary[contig]
      print traceback.print_exc(10)
      exit_process()
def process_tRNA_stats(tRNA_stats_file, tRNA_dictionary):
    try:
        tRNA_file = open(tRNA_stats_file, 'r')
    except IOError:
        eprintf("Cannot read file %s!\n", tRNA_stats_file)
        exit_process()
    tRNA_lines = tRNA_file.readlines()

    sequence_name_pattern = re.compile("sequence name", re.I)
    number_pattern = re.compile("number", re.I)

    headerScanned = False
    for line in tRNA_lines:
        if number_pattern.search(line):
            continue
        if headerScanned == False:
            if sequence_name_pattern.search(line):
                headerScanned = True
            continue
        fields = [x.strip() for x in line.split('\t')]
        if len(fields) >= 6:
            name = get_sequence_number(fields[0])
            tRNA_dictionary[name] = [
                fields[3], fields[4], fields[5], fields[1]
            ]
def checkMissingParam_values(params, choices, logger=None):
    reqdCategoryParams = {
        'annotation': {
            'dbs': False
        },
        'orf_prediction': {},
        'rRNA': {},
        'metapaths_steps': {}
    }

    success = True
    for category in choices:
        for parameter in choices[category]:
            if (not params[category][parameter]) and\
               ( (category in reqdCategoryParams) and\
                  (parameter in reqdCategoryParams[category]) and   reqdCategoryParams[category][parameter]) :
                print(category, parameter)
                print(reqdCategoryParams)
                print(reqdCategoryParams[category])
                eprintf('ERROR: Empty parameter %s of type %s\n' %
                        (parameter, category))
                eprintf('Please select at least one database for %s\n' %
                        (category))
                if logger != None:
                    logger.write('ERROR\tEmpty parameter %s of type %s\n' %
                                 (parameter, category))
                    logger.write(
                        'Please select at least one database for %s\n' %
                        (category))
                success = False

    return success
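# Hypothetical usage sketch for checkMissingParam_values (both dictionaries
# are illustrative): 'choices' lists the parameters to inspect per category
# and 'params' carries their current values. The call returns False only when
# a parameter that reqdCategoryParams flags as required (value True) is empty;
# with 'dbs' currently set to False above, nothing is enforced.
def _example_check_missing_params():
    choices = {'annotation': {'dbs': ['refseq', 'metacyc']}}
    params = {'annotation': {'dbs': ''}}   # deliberately empty
    return checkMissingParam_values(params, choices)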
def check_arguments(opts, args):

    return True   # NOTE: the detailed argument checks below are currently disabled

    if len(opts.input_blastout) == 0:
         eprintf("There should be at least one blastoutput file\n")  
         return False

    if len(opts.database_name) == 0:
         eprintf("There should be at least one database name\n")  
         return False

    if len(opts.weight_db) == 0:
         eprint("There should be at least one weight\n")  
         return False

    if len(opts.input_blastout) != len(opts.database_name) or\
         len(opts.input_blastout) !=  len(opts.weight_db) :
         eprint("The num of database names, blastoutputs and database map file should be equal\n")
         return False

    if opts.output_gff == None:
       eprintf("Must specify the output gff file\n")
       return False

    if opts.output_comparative_annotation == None:
       eprintf("Must specify the output tables for comparative annotation\n")
       return False

    if opts.input_gff == None:
       eprintf("Must specify the input gff file\n")
       return False

    return True
def process_tRNA_stats(tRNA_stats_file, tRNA_dictionary, shortenorfid=False):
     counter_tRNA={}
     try:
        tRNA_file = open(tRNA_stats_file, 'r')
     except IOError:
        eprintf("Cannot read file %s!\n", tRNA_stats_file)
        exit_process()
     tRNA_lines = tRNA_file.readlines()

     sequence_name_pattern = re.compile("sequence name", re.I)
     number_pattern = re.compile("number", re.I)

     headerScanned = False
     for line in tRNA_lines:
         if number_pattern.search(line):
            continue
         if headerScanned == False:
            if sequence_name_pattern.search(line):
                headerScanned = True
            continue
         fields = [ x.strip() for x in line.split('\t') ]
         if len(fields) >=6:

              if shortenorfid:
                 name = get_sequence_number(fields[0])
              else:
                 name = fields[0]
              if not name in counter_tRNA:
                 counter_tRNA[name] =0

              _name = name + "_" + str(counter_tRNA[name])
              counter_tRNA[name] = counter_tRNA[name] +1

              tRNA_dictionary[_name] =  [ fields[3], fields[4], fields[5], fields[1] ]
def create_query_dictionary(blastoutputfile, query_dictionary, algorithm, errorlogger= None ):
       seq_beg_pattern = re.compile("^#")

       try:
          blastoutfh = open( blastoutputfile,'r')
       except:
          print "ERROR : cannot open B/LAST output file " + blastoutputfile + " to parse "
          return
  
       try:
          for line in blastoutfh:
             if not seq_beg_pattern.search(line):
                 words = line.rstrip().split('\t')
                 if len(words) != 12: 
                     continue
   
                 if algorithm =='BLAST': 
                    query_dictionary[words[1]] = 1
   
                 if algorithm =='LAST': 
                    query_dictionary[words[1]]= 1
          blastoutfh.close()
       except:
          eprintf("\nERROR : while reading  B/LAST output file " + blastoutputfile + " to parse " +\
                  "        : make sure B/LAST ing was done for the particular database")

          if errorlogger:
             errorlogger.write("\nERROR : while reading  B/LAST output file %s to parse\n" %(blastoutputfile))
             errorlogger.write("      : make sure B/LAST ing was done for the particular database\n")
          pass 
def add_tRNA_genes(tRNA_dictionary, tRNA_gff_dictionary, contig_lengths):
    for tRNA in tRNA_dictionary:
        start = int(tRNA_dictionary[tRNA][0])
        end = int(tRNA_dictionary[tRNA][1])
        if start > end:
            start = int(tRNA_dictionary[tRNA][1])
            end = int(tRNA_dictionary[tRNA][0])

        try:
            orf_length = end - start
        except:
            orf_length = 0

        contig_name = re.sub(r'_\d+$', '', tRNA)

        if contig_name in contig_lengths:
            contig_length = contig_lengths[contig_name]
        else:
            contig_length = 0

        if start > end or contig_length < end:
            eprintf("trna {}   {}  {}  {} {}\n".format(tRNA, start, end,
                                                       end - start,
                                                       contig_length))
            end = contig_length
            eprintf("trna {}   {}  {}  {} {}\n".format(tRNA, start, end,
                                                       end - start,
                                                       contig_length))

        dict = { 'id':ContigID(tRNA), 'seqname': tRNA, 'start':start, 'end':end,\
                 'strand':tRNA_dictionary[tRNA][2], 'score':" ", 'orf_length':str(orf_length),\
                 'contig_length':str(contig_length),\
                 'feature':'tRNA', 'source':'trnaScan-1.4', 'frame':0, 'product':'tRNA-' + tRNA_dictionary[tRNA][3], 'ec':'' }
        tRNA_gff_dictionary[tRNA] = dict.copy()
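# Hypothetical usage sketch for add_tRNA_genes (illustrative data only):
# tRNA_dictionary maps a tRNA name to [start, end, strand, amino acid] as
# produced by process_tRNA_stats above, and contig_lengths maps the parent
# contig name to its length. ContigID is assumed to come from the pipeline's
# utility module.
def _example_add_tRNA_genes():
    tRNA_dictionary = {'contig_7_0': ['120', '195', '+', 'Ala']}
    contig_lengths = {'contig_7': 5000}
    tRNA_gff_dictionary = {}
    add_tRNA_genes(tRNA_dictionary, tRNA_gff_dictionary, contig_lengths)
    return tRNA_gff_dictionary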
def runMicrobeCensus(microbeCensusExec, microbeCensusOutput, sample_name,
                     readFiles, rpkmFolder):

    num_threads = int(multiprocessing.cpu_count() * 0.8)
    if num_threads < 1:
        num_threads = 1
    status = True

    readfiles = [','.join(read) for read in readFiles]

    if len(readFiles) == 2:
        command_frags = [
            microbeCensusExec, ','.join(readfiles),
            microbeCensusOutput + ".tmp"
        ]

        result = getstatusoutput(' '.join(command_frags))
        print ' '.join(command_frags)

        if result[0] == 0:
            pass
            rename(microbeCensusOutput + ".tmp", microbeCensusOutput)
        else:
            eprintf(
                "ERROR:\tError while running MicrobeCensus on read  files %s\n",
                readFiles)
            status = False
    else:
        eprintf(
            "ERROR:\tThe number of read files for MicrobeCensus must be at most 3. Found %d:%s\n",
            len(readFiles), ','.join(readFiles))
        status = False

    return status
    def __init__(self, dbname, blastoutput):
        self.dbname = dbname
        self.blastoutput = blastoutput
        self.i = 1
        self.data = {}
        self.fieldmap = {}
        self.seq_beg_pattern = re.compile("#")

        try:
            self.blastoutputfile = open(blastoutput, 'r')
            self.lines = self.blastoutputfile.readlines()
            self.blastoutputfile.close()
            self.size = len(self.lines)
            if not self.seq_beg_pattern.search(self.lines[0]):
                exit_process(
                    "First line must have field header names and begin with \"#\""
                )
            header = self.lines[0].replace('#', '', 1)
            fields = [x.strip() for x in header.rstrip().split('\t')]
            k = 0
            for x in fields:
                self.fieldmap[x] = k
                k += 1
            eprintf("\nProcessing database : %s\n", dbname)

        except AttributeError:
            eprintf("Cannot read the map file for database :%s\n", dbname)
            exit_process()
def process_rRNA_16S_stats(rRNA_16S_file, rRNA_16S_dictionary):
    try:
        taxonomy_file = open(rRNA_16S_file, 'r')
    except IOError:
        eprintf("Cannot read file %s!\n", rRNA_16S_file)
        exit_process()

    tax_lines = taxonomy_file.readlines()
    similarity_pattern = re.compile("similarity")
    evalue_pattern = re.compile("evalue")
    bitscore_pattern = re.compile("bitscore")
    taxonomy_pattern = re.compile("taxonomy")
    headerScanned = False
    for line in tax_lines:
        if headerScanned == False:
            if similarity_pattern.search(line) and evalue_pattern.search(
                    line) and bitscore_pattern.search(
                        line) and taxonomy_pattern.search(line):
                headerScanned = True
            continue
        fields = [x.strip() for x in line.split('\t')]
        if len(fields) >= 6:
            if fields[1] != '-':
                rRNA_16S_dictionary[fields[0]] = [
                    fields[1], fields[2], fields[5]
                ]
            else:
                if len(fields) >= 12:
                    if fields[7] != '-':
                        rRNA_16S_dictionary[fields[0]] = [
                            fields[7], fields[8], fields[11]
                        ]

    taxonomy_file.close()
def read_map_file(dbname_map_filename, field_to_description, hierarchical_map) :
    try:
       map_file = open(dbname_map_filename, 'r')

       map_filelines = map_file.readlines()
    except:
       eprintf("ERROR: Cannot open file %s\n", dbname_map_filename)
       exit_process()

    tempfields = [ '', '', '', '', '', '', '' ]
    for line in map_filelines:
       pos = beginning_valid_field(line)
       if pos==-1:
          continue

       fields = [ x.strip() for x in line.split('\t') ]

       tempfields[pos] = fields[pos]
       if len(fields) > pos + 1:
          field_to_description[fields[pos]] = fields[pos+1]
       else:
          field_to_description[fields[pos]] = fields[pos]

       i=0
       temp_hierarchical_map = hierarchical_map
       while i < pos :
          temp_hierarchical_map = temp_hierarchical_map[ tempfields[i] ]
          i+=1

       temp_hierarchical_map[ tempfields[i] ] = {}
    fill_hierarchy_with_zeroes(hierarchical_map)
def runMicrobeCensus(microbeCensusExec, microbeCensusOutput,  sample_name, readFiles, rpkmFolder) :

    num_threads =  int(multiprocessing.cpu_count()*0.8)
    if num_threads < 1:
       num_threads = 1
    status = True

    readfiles= [ ','.join(read) for read in readFiles ]

    
    if len(readFiles) == 2:
       command_frags = [microbeCensusExec, ','.join(readfiles), microbeCensusOutput + ".tmp"]

       result = getstatusoutput(' '.join(command_frags))
       print ' '.join(command_frags)

       if result[0]==0:
          pass
          rename(microbeCensusOutput+".tmp", microbeCensusOutput)
       else:
          eprintf("ERROR:\tError while running MicrobeCensus on read  files %s\n", readFiles)
          status = False
    else:
          eprintf("ERROR:\tThe number of read files for MicrobeCensus must be at most 3. Found %d:%s\n", len(readFiles), ','.join(readFiles))
          status = False
          
    return status
def read_map_file(dbname_map_filename, field_to_description, hierarchical_map):
    try:
        map_file = open(dbname_map_filename, 'r')

        map_filelines = map_file.readlines()
    except:
        eprintf("ERROR: Cannot open file %s\n", dbname_map_filename)
        exit_process()

    tempfields = ['', '', '', '', '', '', '']
    for line in map_filelines:
        pos = beginning_valid_field(line)
        if pos == -1:
            continue

        fields = [x.strip() for x in line.split('\t')]

        tempfields[pos] = fields[pos]
        if len(fields) > pos + 1:
            field_to_description[fields[pos]] = fields[pos + 1]
        else:
            field_to_description[fields[pos]] = fields[pos]

        i = 0
        temp_hierarchical_map = hierarchical_map
        while i < pos:
            temp_hierarchical_map = temp_hierarchical_map[tempfields[i]]
            i += 1

        temp_hierarchical_map[tempfields[i]] = {}
    fill_hierarchy_with_zeroes(hierarchical_map)
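# Hypothetical sketch of the tab-indented hierarchy file that read_map_file
# consumes. This assumes beginning_valid_field returns the index of the first
# non-empty tab-separated column (i.e. the depth of the entry); the file name
# and entries below are illustrative only.
#
#   CategoryA<tab>Top-level description          depth 0
#   <tab>SubcatA1<tab>Second-level description   depth 1, child of CategoryA
#   <tab><tab>FuncA1a<tab>Leaf description       depth 2, child of SubcatA1
def _example_read_map_file():
    field_to_description = {}
    hierarchical_map = {}
    read_map_file("functional_categories.txt", field_to_description, hierarchical_map)
    return field_to_description, hierarchical_map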
def process_blastout_file(blast_file, database, table, errorlogger = None):
     try:
        blastfile = open(blast_file, 'r')
     except IOError:
        eprintf("ERROR : Cannot write read file " + blast_file + " !" )
        if errorlogger!=None:
          errorlogger.write("STATS_rRNA\tERROR\tCannot write read blast output file " + blast_file + " for database " + database )
        exit_process()

     blastLines = blastfile.readlines()
     blastfile.close()

     for line in blastLines:
        line = line.strip() 
        fields = re.split('\t', line)
        if len(fields) < 12:
           continue
        fields[0] =  str(fields[0].strip())
        fields[1] =  str(fields[1].strip())
        fields[2] =  float(fields[2].strip())
        fields[6] =  int(fields[6].strip())
        fields[7] =  int(fields[7].strip())
        fields[10] = float(fields[10].strip())
        fields[11] = float(fields[11].strip())
        table[str(fields[0].strip())] = [fields[2], fields [10], fields[11], fields[1], fields[6], fields[7]] 
def  checkMissingParam_values(params, choices, logger = None):
     reqdCategoryParams = { 
                            'annotation': {'dbs': False}, 
                            'orf_prediction':{}, 
                            'rRNA':{},
                            'metapaths_steps':{}
                         }

     success  = True
     for category in choices:
       for parameter in choices[category]:
         if (not params[category][parameter]) and\
            ( (category in reqdCategoryParams) and\
               (parameter in reqdCategoryParams[category]) and   reqdCategoryParams[category][parameter]) :
            print category, parameter
            print reqdCategoryParams
            print reqdCategoryParams[category]
            eprintf('ERROR: Empty parameter %s of type %s\n'  %(parameter, category))
            eprintf('Please select at least one database for %s\n'  %(category))
            if logger!=None:
               logger.write('ERROR\tEmpty parameter %s of type %s\n'  %(parameter, category))
               logger.write('Please select at least one database for %s\n'  %(category))
            success = False

     return success
def write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight,
                             results_dictionary, orf_dictionary, contig,
                             candidate_orf_pos, orfid, compact_output):
    global errorcode
    try:
        fields = [
            'source', 'feature', 'start', 'end', 'score', 'strand', 'frame'
        ]

        output_line = orf_dictionary[contig][candidate_orf_pos]['seqname']

        #if compact_output:
        #output_line = ShortenContigId(output_line)

        for field in fields:
            output_line += "\t" + str(
                orf_dictionary[contig][candidate_orf_pos][field])

        #if compact_output:
        try:
            attributes = "ID=" + ShortenORFId(
                orf_dictionary[contig][candidate_orf_pos]['id'])
            attributes += ";" + "locus_tag=" + ShortenORFId(
                orf_dictionary[contig][candidate_orf_pos]['locus_tag'])
        except:
            attributes = "ID=" + orf_dictionary[contig][candidate_orf_pos]['id']
            attributes += ";" + "locus_tag=" + orf_dictionary[contig][
                candidate_orf_pos]['locus_tag']

        attributes += ";" + "contig_length=" + orf_dictionary[contig][
            candidate_orf_pos]['contig_length']
        attributes += ";" + "orf_length=" + orf_dictionary[contig][
            candidate_orf_pos]['orf_length']
        attributes += ";" + "partial=" + orf_dictionary[contig][
            candidate_orf_pos]['partial']
        attributes += ";" + "sourcedb=" + candidatedbname

        if candidatedbname in results_dictionary:
            attributes += ";" + "annotvalue=" + str(
                results_dictionary[candidatedbname][orfid]['value'])
            attributes += ";" + "ec=" + str(
                results_dictionary[candidatedbname][orfid]['ec'])
            attributes += ";" + "product=" + results_dictionary[
                candidatedbname][orfid]['product']
        else:
            attributes += ";" + "annotvalue=" + str('0')
            attributes += ";" + "ec=" + str('')
            attributes += ";" + "product=" + 'hypothetical protein'

        output_line += '\t' + attributes

        if candidatedbname in results_dictionary:
            fprintf(outputgff_file, "%s\n", output_line)
    except:
        eprintf("ERROR : Failure to annotate in contig %s\n", contig)
        #print orf_dictionary[contig]
        print traceback.print_exc(10)
        insert_error(errorcode)
        exit_process()
    def next(self):
        if self.i % self.SIZE == 0:
            self.refillBuffer()
            if len(self.lines)==0:
                raise StopIteration()

        if self.i % self.SIZE < self.size:
            fields = [ x.strip()  for x in self.lines[self.i % self.SIZE].split('\t')]
            try:
                self.data = {}
                self.data['query'] = fields[self.fieldmap['query']]
                self.data['q_length'] = int(fields[self.fieldmap['q_length']])
                self.data['bitscore'] = float(fields[self.fieldmap['bitscore']])
                self.data['bsr'] = float(fields[self.fieldmap['bsr']])
                self.data['target'] = fields[self.fieldmap['target']]
                self.data['aln_length'] = float(fields[self.fieldmap['aln_length']])
                self.data['expect'] = float(fields[self.fieldmap['expect']])
                self.data['identity'] = float(fields[self.fieldmap['identity']])
                self.data['ec'] = fields[self.fieldmap['ec']]
                self.data['product'] = re.sub(r'=',' ',fields[self.fieldmap['product']])
                self.lineToProcess = self.lines[self.i % self.SIZE]
            except:
                self.ERROR_COUNT += 1
                if self.MAX_READ_ERRORS_ALLOWED > self.ERROR_COUNT:
                    eprintf("%s\tWARNING\till-formatted line \"%s\" \t %s\n", self.STEP_NAME,  self.lines[self.i % self.SIZE], self.blastoutput)
                    if self.error_and_warning_logger != None:
                        self.error_and_warning_logger.write("%s\tWARNING\till-formatted line :\"%s\" \t source : %s\n" %(self.STEP_NAME,  re.sub(r'\t', '<tab>', self.lines[self.i % self.SIZE]) , self.blastoutput))
                    self.i = self.i + 1
                    self.next()
                else:
                    if self.error_and_warning_logger != None:
                        self.error_and_warning_logger.write("%s\tERROR\tThe number of lines in file %s exceeded the max tolerance %d\n" %(self.blastoutput,  self.MAX_READ_ERRORS_ALLOWED) )
                    exit_process()


                #              print "<<<<<<-------"
                #              print 'self size ' + str(self.size)
                #              print 'line ' + self.lines[self.i % self.SIZE]
                #              print 'num fields ' + str(len(fields))
                #              fields = [ x  for x in self.lines[self.i % self.SIZE].split('\t')]
                #              for field in fields:
                #                 print field
                #              print 'next line ' + self.lines[(self.i + 1) % self.SIZE]
                #              print ' field map ' + str(self.fieldmap)
                #              print 'index ' + str(self.i)
                #              print 'data ' + str(self.data)
                #              print 'fields ' + str(fields)
                #              print ' while processing file ' + self.blastoutput
                #              print ">>>>>>-------"
                #              import traceback
                #              print traceback.print_exc()

            self.i = self.i + 1
            return self.data
        else:
            self.lineToProcess = None
            self.blastoutputfile.close()
            raise StopIteration()
def process_rRNA_16S_stats(dbname,
                           rRNA_16S_file,
                           orf_read_rpkgs,
                           opts,
                           shortenorfid=False):
    print "Processing rRNA database : ", dbname
    counter_rRNA = {}
    if not doesFileExist(rRNA_16S_file):
        return
    try:
        taxonomy_file = open(rRNA_16S_file, 'r')
    except IOError:
        eprintf("Cannot read file %s!\n", rRNA_16S_file)
        exit_process()

    tax_lines = taxonomy_file.readlines()
    similarity_pattern = re.compile("similarity")
    evalue_pattern = re.compile("evalue")
    bitscore_pattern = re.compile("bitscore")
    taxonomy_pattern = re.compile("taxonomy")
    headerScanned = False

    seencounter = {}
    for line in tax_lines:
        if headerScanned == False:
            if similarity_pattern.search(line) and evalue_pattern.search(
                    line) and bitscore_pattern.search(
                        line) and taxonomy_pattern.search(line):
                headerScanned = True
            continue
        fields = [x.strip() for x in line.split('\t')]
        if len(fields) >= 6:
            if not fields[0] in seencounter:
                seencounter[fields[0]] = 0
            else:
                seencounter[fields[0]] += 1

            _name = fields[0] + "_" + str(seencounter[fields[0]]) + "_rRNA"

            if not fields[6] in counter_rRNA:
                counter_rRNA[fields[6]] = 0.0

            name = ShortenrRNAId(_name)
            if name in orf_read_rpkgs:
                counter_rRNA[fields[6]] += orf_read_rpkgs[name]
            else:
                counter_rRNA[fields[6]] += 0

    taxonomy_file.close()
    with open(
            opts.outputdir + PATHDELIM + opts.sample_name + "." + dbname +
            ".read_rpkgs.txt", 'w') as fout:
        fprintf(fout, "# Gene\tCounts\n")
        for name in counter_rRNA:
            fprintf(fout, "%s\t%0.2f\n", name, counter_rRNA[name])

    return len(counter_rRNA)
def environment_variables_defined():
    variables = ['METAPATHWAYS_DB']
    status =True
    for variable in variables:
      if not variable in os.environ:
         eprintf("%-10s:Environment variable %s not defined! Please set %s as \'export %s=<value>\'\n" %('ERROR', variable, variable,variable))
         if variable in ['METAPATHWAYS_DB']:
           status=False
    
    return status
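# Hypothetical usage sketch: abort early when the required environment
# variable is missing; exit_process is assumed to be the pipeline's exit helper.
def _example_check_environment():
    if not environment_variables_defined():
        exit_process("METAPATHWAYS_DB must be set before running the pipeline")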
    def next(self):
        if self.i % self.SIZE == 0:
            self.refillBuffer()
            if len(self.lines) == 0:
                raise StopIteration()

        if self.i % self.SIZE < self.size:
            fields = [
                x.strip() for x in self.lines[self.i % self.SIZE].split('\t')
            ]
            try:
                self.data = {}
                self.data['query'] = fields[self.fieldmap['query']]
                self.data['q_length'] = int(fields[self.fieldmap['q_length']])
                self.data['bitscore'] = float(
                    fields[self.fieldmap['bitscore']])
                self.data['bsr'] = float(fields[self.fieldmap['bsr']])
                self.data['target'] = fields[self.fieldmap['target']]
                self.data['aln_length'] = float(
                    fields[self.fieldmap['aln_length']])
                self.data['expect'] = float(fields[self.fieldmap['expect']])
                self.data['identity'] = float(
                    fields[self.fieldmap['identity']])
                self.data['ec'] = fields[self.fieldmap['ec']]
                self.data['product'] = re.sub(r'=', ' ',
                                              fields[self.fieldmap['product']])
                self.lineToProcess = self.lines[self.i % self.SIZE]
            except:
                self.ERROR_COUNT += 1
                if self.MAX_READ_ERRORS_ALLOWED > self.ERROR_COUNT:
                    eprintf("%s\tWARNING\till-formatted line \"%s\" \t %s\n",
                            self.STEP_NAME, self.lines[self.i % self.SIZE],
                            self.blastoutput)
                    if self.error_and_warning_logger != None:
                        self.error_and_warning_logger.write(
                            "%s\tWARNING\till-formatted line :\"%s\" \t source : %s\n"
                            % (self.STEP_NAME,
                               re.sub(r'\t', '<tab>',
                                      self.lines[self.i % self.SIZE]),
                               self.blastoutput))
                    self.i = self.i + 1
                    self.next()
                else:
                    if self.error_and_warning_logger != None:
                        self.error_and_warning_logger.write(
                            "%s\tERROR\tThe number of ill-formatted lines in file %s exceeded the max tolerance %d\n"
                            % (self.STEP_NAME, self.blastoutput,
                               self.MAX_READ_ERRORS_ALLOWED))
                    exit_process()

            self.i = self.i + 1
            return self.data
        else:
            self.lineToProcess = None
            self.blastoutputfile.close()
            raise StopIteration()
def get_pipeline_steps(steps_log_file):
    try:
       logfile = open(steps_log_file, 'r')
    except IOError:
       eprintf("Did not find %s!\n", logfile) 
       eprintf("Try running in \'complete\' run-type\n")
    else:
       lines = logfile.readlines()

    pipeline_steps = None
    return pipeline_steps
def get_pipeline_steps(steps_log_file):
    try:
        logfile = open(steps_log_file, 'r')
    except IOError:
        eprintf("Did not find %s!\n", logfile)
        eprintf("Try running in \'complete\' run-type\n")
    else:
        lines = logfile.readlines()

    pipeline_steps = None
    return pipeline_steps
def report_missing_filenames(input_output_list, sample_subset, logger=None):
    foundFiles = {}
    for samplePath in input_output_list.keys():
       sampleName =  path.basename(input_output_list[samplePath]) 
       foundFiles[sampleName] =True

    for sample_in_subset in sample_subset:
       if not sample_in_subset in foundFiles:
          eprintf("ERROR\tCannot find input file for sample %s\n!", sample_in_subset)
          if logger:
             logger.printf("ERROR\tCannot file input for sample %s!\n", sample_in_subset)
def check_arguments(opts, args):

    if opts.blastdir == None:
        eprintf("The blast_results folder must be specified\n")
        return False

    if opts.sample_name == None:
        eprintf("There should be at least one sample name\n")
        return False

    return True
    def __init__(self, dbname,  blastoutput, database_mapfile, refscore_file, opts, errorlogger =None):
        self.Size = 10000
        self.dbname = dbname
        self.ln2 = 0.69314718055994530941
        self.lnk = math.log(opts.k)
        self.Lambda = opts.Lambda
        self.blastoutput = blastoutput
        self.database_mapfile =database_mapfile
        self.refscore_file = refscore_file
        self.annot_map = {} 
        self.i=0
        self.opts = opts
        self.hits_counts = {}
        self.data = {}
        self.refscores = {}
        self.refBitScores = {}
        self.needToPermute = False;

        self.MAX_READ_ERRORS_ALLOWED = 100
        self.ERROR_COUNT = 0
        self.STEP_NAME = 'PARSE_BLAST'
        self.error_and_warning_logger = errorlogger 


        #print "trying to open blastoutput file " + blastoutput
        query_dictionary = {}
        create_query_dictionary(self.blastoutput, query_dictionary, self.opts.algorithm, errorlogger =  errorlogger) 
        try:
            self.blastoutputfile = open(self.blastoutput,'r')
        except:
            eprintf("\nERROR : cannot open B/LAST output file " + blastoutput + " to parse "+\
                      "      : make sure \"B/LAST\"ing was done for the particular database" )

            if self.error_and_warning_logger:
               self.error_and_warning_logger.write("ERROR : cannot open B/LAST output file %s %s to parse \n" +\
                                             "      : make sure \"B/LAST\"ing was done for "+\
                                             "the particular database" %(blastoutput) )
            exit_process( "Cannot open B/LAST output file " + blastoutput )

        try:
            self.create_refBitScores()
        except:
            print traceback.print_exc(10)
            exit_process( "Error while reading from  B/LAST refscore file " + self.refscore_file )

        try:
           create_dictionary(database_mapfile, self.annot_map, query_dictionary)
           query_dictionary = {}
        except AttributeError:
           eprintf("Cannot read the map file for database : %s\n" % (dbname))
           if errorlogger!= None:
              errorlogger.write("PARSE_BLAST\tERROR\tCannot read the map file %s for database : %s\tDelete the formatted files for the database in the \"formatted\" folder\n" %(database_mapfile, dbname))

           exit_process("Cannot read the map file for database  " + dbname)
def  checkParam_values(allcategorychoices, parameters, runlogger = None):
     for category in allcategorychoices:
        for choice in allcategorychoices[category]:
           if choice in parameters: 

             if not parameters[choice] in allcategorychoices[category][choice]:
                 if runlogger != None:
                    runlogger.write('ERROR\tIncorrect setting in your parameter file')
                    runlogger.write('for step %s as %s' %(choice, parameters[choice]))
                 eprintf("ERROR: Incorrect setting in your parameter file" +\
                         "       for step %s as %s", choice, parameters[choice])
                 exit_process()
 def __init__(self, gff_filename):
     self.Size = 10000
     self.i=0
     self.orf_dictionary = {}
     self.gff_beg_pattern = re.compile("^#")
     self.lines= []
     self.size=0
     try:
        self.gff_file = open( gff_filename,'r')
     except AttributeError:
        eprintf("Cannot read the map file for database : %s\n", dbname)
        exit_process()
 def __init__(self, gff_filename):
     self.Size = 10000
     self.i = 0
     self.orf_dictionary = {}
     self.gff_beg_pattern = re.compile("^#")
     self.lines = []
     self.size = 0
     try:
         self.gff_file = open(gff_filename, 'r')
     except AttributeError:
         eprintf("Cannot read the map file for database : %s\n", dbname)
         exit_process()
def main(argv, errorlogger = None, runcommand = None, runstatslogger = None):
    global parser

    options, args = parser.parse_args(argv)

    if options.algorithm == 'BLAST':
       _execute_BLAST(options)
    elif options.algorithm == 'LAST':
        _execute_LAST(options)
    else:
        eprintf("ERROR\tUnrecognized algorithm name for FUNC_SEARCH\n")
        if errorlogger:
            errorlogger.printf("ERROR\tUnrecognized algorithm name for FUNC_SEARCH\n")
        exit_process("ERROR\tUnrecognized algorithm name for FUNC_SEARCH\n")
def halt_on_invalid_input(input_output_list, filetypes, sample_subset):

    for samplePath in input_output_list.keys():
       sampleName =  path.basename(input_output_list[samplePath]) 

       ''' in the selected list'''
       if not sampleName in sample_subset:
          continue

       if filetypes[samplePath][0]=='UNKNOWN':
          eprintf("ERROR\tIncorrect input sample %s. Check for bad characters or format\n!", samplePath)
          return False

    return True
def get_ORF_annotations_hits(sample_name, folder_path):
    results = []
    # for the LAST algorithm
    regPattern = re.compile(r'.annot.gff$', re.IGNORECASE)
    input_dir = folder_path +  PATHDELIM + 'results' + PATHDELIM + 'annotation_table' 
    file_name = input_dir + PATHDELIM +  'ORF_annotation_table.txt'

    eprintf("\nCounting number of ORFs for mapping to functional classification ...")
    count  =  get_number_of_uncommented_lines(file_name)
    eprintf("done\n")
    results.append( ('Total orfs count for functional classification', count ) )
    if results==[]:
       return None
    return results
def check_for_error_in_input_file_name(shortname, globalerrorlogger=None):

    """  creates a list of  input output pairs if input is  an input dir """
    clean = True
    if not re.search(r'^[a-zA-Z]',shortname):
         eprintf("ERROR\tSample name %s must begin with an alphabet!\n",shortname)
         if globalerrorlogger:
            globalerrorlogger.printf("ERROR\tSample name %s must begin with an alphabet!\n",shortname)
         clean = False

    if re.search(r'[.]',shortname):
         eprintf("ERROR\tSample name %s contains a '.' in its name!\n",shortname)
         if globalerrorlogger:
            globalerrorlogger.printf("ERROR\tSample name %s contains a '.' in its name!\n",shortname)
         clean = False

    if len(shortname)<2:
         eprintf("ERROR\tSample name %s is too short!\n",shortname)
         if globalerrorlogger:
             globalerrorlogger.printf("ERROR\tSample name %s is too short1\n",shortname)
         clean = False

    if clean:
         return clean

    errmessage = """ Sample names before the  suffixes .fasta, .fas, .fna, .faa or .gbk, must  consist only of alphabets, digits and _; and should consist of at least two characters """
    eprintf("ERROR\t%s\n",errmessage)
    if globalerrorlogger:
        globalerrorlogger.printf("ERROR\t%s\n",errmessage)
        exit_process("ERROR\t" + errmessage + "Exiting!" + "\n")
    return False
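# Hypothetical usage sketch for check_for_error_in_input_file_name (sample
# names are illustrative): a valid short name starts with a letter, contains
# no '.', and is at least two characters long. No logger is passed here, so
# failures only print and return False.
def _example_check_sample_names():
    for shortname in ['sample_01', '1sample', 'a.b', 'x']:
        if check_for_error_in_input_file_name(shortname):
            eprintf("OK  : %s\n", shortname)
        else:
            eprintf("BAD : %s\n", shortname)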
def remove_unspecified_samples(input_output_list, sample_subset,  globalerrorlogger = None):
   """ keep only the samples that are specified  before processing  """

   shortened_names = {}
   input_sample_list = input_output_list.keys()
   for sample_name in input_sample_list:
      short_sample_name = derive_sample_name(sample_name) 
    #  print short_sample_name, len(short_sample_name)
      if len(short_sample_name) > 35:
         eprintf("ERROR\tSample name %s must not be longer than 35 characters!\n",short_sample_name)
         if globalerrorlogger:
             globalerrorlogger.printf("ERROR\tSample name %s must not be longer than 35 characters!\n",short_sample_name)
      if not derive_sample_name(sample_name) in sample_subset and  sample_subset:
         del input_output_list[sample_name]
def check_for_error_in_input_file_name(shortname, globalerrorlogger=None):

    """  creates a list of  input output pairs if input is  an input dir """
    clean = True
    if not re.search(r'^[a-zA-Z]',shortname):
         eprintf("ERROR\tSample name %s must begin with an alphabet!\n",shortname)
         if globalerrorlogger:
            globalerrorlogger.printf("ERROR\tSample name %s must begin with an alphabet!\tConsider prefixing an alphabet to the front\n",shortname)
         clean = False

    if re.search(r'[.]',shortname):
         eprintf("ERROR\tSample name %s contains a '.' in its name!\n",shortname)
         if globalerrorlogger:
            globalerrorlogger.printf("ERROR\tSample name %s contains a '.' in its name!\n",shortname)
         clean = False

    if len(shortname)<2:
         eprintf("ERROR\tSample name %s is too short!\n",shortname)
         if globalerrorlogger:
             globalerrorlogger.printf("ERROR\tSample name %s is too short1\n",shortname)
         clean = False

    if clean:
         return clean

    errmessage = """Sample names before the  suffixes .fasta, .fas, .fna, .faa or .gbk, must  consist only of alphabets, digits and _; and should consist of at least two characters """
    eprintf("ERROR\t%s\n",errmessage)
    if globalerrorlogger:
        globalerrorlogger.printf("ERROR\t%s\n",errmessage)
    #    exit_process(errmessage + "Exiting!" + "\n", logger=globalerrorlogger)
    return False
def checkParam_values(allcategorychoices, parameters, runlogger=None):
    for category in allcategorychoices:
        for choice in allcategorychoices[category]:
            if choice in parameters:

                if not parameters[choice] in allcategorychoices[category][
                        choice]:
                    if runlogger != None:
                        runlogger.write(
                            'ERROR\tIncorrect setting in your parameter file')
                        runlogger.write('for step %s as %s' %
                                        (choice, parameters[choice]))
                    eprintf("ERROR: Incorrect setting in your parameter file" +\
                            "       for step %s as %s", choice, parameters[choice])
                    exit_process()
def process_rRNA_16S_stats(rRNA_16S_file,
                           rRNA_16S_dictionary,
                           shortenorfid=False):
    counter_rRNA = {}
    if not doesFileExist(rRNA_16S_file):
        return
    try:
        taxonomy_file = open(rRNA_16S_file, 'r')
    except IOError:
        eprintf("Cannot read file %s!\n", rRNA_16S_file)
        exit_process()

    tax_lines = taxonomy_file.readlines()
    similarity_pattern = re.compile("similarity")
    evalue_pattern = re.compile("evalue")
    bitscore_pattern = re.compile("bitscore")
    taxonomy_pattern = re.compile("taxonomy")
    headerScanned = False
    for line in tax_lines:
        if headerScanned == False:
            if similarity_pattern.search(line) and evalue_pattern.search(
                    line) and bitscore_pattern.search(
                        line) and taxonomy_pattern.search(line):
                headerScanned = True
            continue
        fields = [x.strip() for x in line.split('\t')]
        if len(fields) >= 6:

            if shortenorfid:
                name = get_sequence_number(fields[0])
            else:
                name = fields[0]

            if not name in counter_rRNA:
                counter_rRNA[name] = 0

            _name = name + "_" + str(counter_rRNA[name])
            counter_rRNA[name] = counter_rRNA[name] + 1

            if fields[1] != '-':
                rRNA_16S_dictionary[_name] = [fields[1], fields[2], fields[5]]
            else:
                if len(fields) >= 12:
                    if fields[7] != '-':
                        rRNA_16S_dictionary[_name] = [
                            fields[7], fields[8], fields[11]
                        ]

    taxonomy_file.close()
def get_functional_taxonomic_hits(sample_name, folder_path):
    results = []
    # for the LAST algorithm
    regPattern = re.compile(r'.annot.gff$', re.IGNORECASE)
    input_dir = folder_path +  PATHDELIM + 'results' + PATHDELIM + 'annotation_table' 
    file_name = input_dir + PATHDELIM +  'functional_and_taxonomic_table.txt'

    eprintf("\nCounting number of functionally and taxonomically ORFs ...")
    count  =  get_number_of_uncommented_lines(file_name)
    eprintf("done\n")
    results.append( ('Total number of taxonomically and taxonmically annotated ORFs', count ) )

    if results==[]:
       return None
    return results
def get_BLAST_LAST_parsed_hits(sample_name, folder_path):
    results = []
    # for the LAST algorithm

    regPattern = re.compile(r'.LASTout.parsed.txt$', re.IGNORECASE)
    input_dir = folder_path + PATHDELIM + 'blast_results'
    files = [
        re.sub(r'.*\/', '', f)
        for f in glob(input_dir + PATHDELIM + sample_name + '*')
        if regPattern.search(f)
    ]
    regPattern = re.compile(r'[.](.*)[.]LASTout.parsed.txt$', re.IGNORECASE)

    for file in files:
        result = regPattern.search(file)
        if result:
            database = result.group(1)
            file_name = input_dir + PATHDELIM + sample_name + '.' + result.group(
                1) + '.LASTout.parsed.txt'
            eprintf("\nParse LAST hits for : %s...", database)
            count = get_number_of_uncommented_lines(file_name)
            results.append(('Total number of selected hits in ' + database +
                            ' with LAST ', count))

    # now for the BLAST algorithm
    regPattern = re.compile(r'.BLASTout.parsed.txt')
    input_dir = folder_path + PATHDELIM + 'blast_results'
    files = [
        re.sub(r'.*\/', '', f)
        for f in glob(input_dir + PATHDELIM + sample_name + '*')
        if regPattern.search(f)
    ]
    regPattern = re.compile(r'[.](.*)[.]BLASTout')

    for file in files:
        result = regPattern.search(file)
        if result:
            database = result.group(1)
            file_name = input_dir + PATHDELIM + sample_name + '.' + result.group(
                1) + '.BLASTout.parsed.txt'
            eprintf("\nParse BLAST hits for : %s...", database)
            count = get_number_of_uncommented_lines(file_name)
            results.append(('Total number of selected hits in ' + database +
                            ' with BLAST ', count))

    if results == []:
        return None
    return results
def formatted_db_exists(dbname, suffixes):
    for suffix in suffixes:
       allfileList = glob(dbname + '*.' + suffix) 
       fileList = []
       tempFilePattern = re.compile(r''+ dbname + '\d*.' + suffix +'$');

       for aFile in allfileList:
           searchResult =  tempFilePattern.search(aFile)
           if searchResult:
             fileList.append(aFile)

       if len(fileList)==0 :
          eprintf("ERROR :  if formatted correctely then expected the files with pattern %s\n", dbname + suffix)
          return False

    return True
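# Hypothetical usage sketch for formatted_db_exists (path and suffixes are
# illustrative): a BLAST-formatted protein database is expected to have
# .phr/.pin/.psq index files sitting next to the FASTA file.
def _example_check_formatted_db():
    dbname = "/databases/functional/refseq"    # made-up path
    if not formatted_db_exists(dbname, ['phr', 'pin', 'psq']):
        eprintf("ERROR\tDatabase %s does not appear to be formatted\n", dbname)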
 def permuteForLAST(self, words):
     try :
        temp = copy(words)
        words[0] = temp[6] # query
        words[1] = temp[1] # target
        words[2] = 100.0 # percent id
        words[3] = temp[3]  #aln length
        words[6] = temp[2]
        words[7] = int(temp[2]) + int(temp[3]) - 1
        words[10] = 0.0   # evalue
        words[11] = temp[0]
     except:
        eprintf("ERROR : Invalid B/LAST output file %s \n" % (self.blastoutput))
        if self.error_and_warning_logger:   
            self.error_and_warning_logger.write("ERROR : Invalid B/LAST output file" %(self.blastoutput))
        exit_process( "ERROR : Invalid B/LAST output file %s " % (self.blastoutput))
def get_ORF_annotations_hits(sample_name, folder_path):
    results = []
    # for the LAST algorithm
    regPattern = re.compile(r'.annot.gff$', re.IGNORECASE)
    input_dir = folder_path + PATHDELIM + 'results' + PATHDELIM + 'annotation_table'
    file_name = input_dir + PATHDELIM + 'ORF_annotation_table.txt'

    eprintf(
        "\nCounting number of ORFs for mapping to functional classification ..."
    )
    count = get_number_of_uncommented_lines(file_name)
    eprintf("done\n")
    results.append(('Total orfs count for functional classification', count))
    if results == []:
        return None
    return results
def write_run_parameters_file(fileName, parameters):
    try:
        paramFile = open(fileName, 'w')
    except IOError:
        eprintf("Cannot write run parameters to file %s!\n", fileName)
        exit_process("Cannot write run parameters to file %s" % (fileName))

#       16s_rRNA      {'min_identity': '40', 'max_evalue': '0.000001', 'min_bitscore': '06', 'refdbs': 'silva_104_rep_set,greengenes_db_DW'}
    paramFile.write("\nRun Date : " + str(date.today()) + " \n")

    paramFile.write("\n\nNucleotide Quality Control parameters[s.n")
    paramFile.write("  min length" + "\t" +
                    str(parameters['quality_control']['min_length']) + "\n")

    paramFile.write("\n\nORF prediction parameters[s.n")
    paramFile.write("  min length" + "\t" +
                    str(parameters['orf_prediction']['min_length']) + "\n")
    paramFile.write("  algorithm" + "\t" +
                    str(parameters['orf_prediction']['algorithm']) + "\n")

    paramFile.write(
        "\n\nAmino acid quality control and annotation parameters[s.n")
    paramFile.write("  min bit score" + "\t" +
                    str(parameters['annotation']['min_score']) + "\n")
    paramFile.write("  min seq length" + "\t" +
                    str(parameters['annotation']['min_length']) + "\n")
    paramFile.write("  annotation reference dbs" + "\t" +
                    str(parameters['annotation']['dbs']) + "\n")
    paramFile.write("  min BSR" + "\t" +
                    str(parameters['annotation']['min_bsr']) + "\n")
    paramFile.write("  max evalue" + "\t" +
                    str(parameters['annotation']['max_evalue']) + "\n")

    paramFile.write("\n\nPathway Tools parameters[s.n")
    paramFile.write("  taxonomic pruning " + "\t" +
                    str(parameters['ptools_settings']['taxonomic_pruning']) +
                    "\n")

    paramFile.write("\n\nrRNA search/match parameters[s.n")
    paramFile.write("  min identity" + "\t" +
                    str(parameters['rRNA']['min_identity']) + "\n")
    paramFile.write("  max evalue" + "\t" +
                    str(parameters['rRNA']['max_evalue']) + "\n")
    paramFile.write("  rRNA reference dbs" + "\t" +
                    str(parameters['rRNA']['refdbs']) + "\n")

    paramFile.close()
def add_refscore_to_file(blast_table_out, refscore_file, allNames):
    infile = open( blast_table_out,'r')

    refscores = {}
    lines = infile.readlines()
    for line in lines:
       line=line.rstrip()
       fields = line.split('\t')
       if len(fields) != 12:
          eprintf("ERROR: Error in line \n%s\n of the blastout file %s" %(line, blast_table_out))
          exit_process("ERROR: Error in line \n%s\n of the blastout file %s" %(line, blast_table_out))

    for key, value in refscores.iteritems():
       allNames[key] = True
       fprintf(refscore_file, "%s\t%s\n",key, value)

    infile.close()
def process_gff_file(gff_file_name, orf_dictionary):
    try:
        gfffile = open(gff_file_name, 'r')
    except IOError:
        eprintf("Cannot read file %s!\n", gff_file_name)

    gff_lines = gfffile.readlines()
    gff_beg_pattern = re.compile("^#")
    gfffile.close()

    count = 0
    for line in gff_lines:
        line = line.strip()
        if gff_beg_pattern.search(line):
            continue
        insert_orf_into_dict(line, orf_dictionary)
        count += 1
def get_functional_taxonomic_hits(sample_name, folder_path):
    results = []
    # for the LAST algorithm
    regPattern = re.compile(r'.annot.gff$', re.IGNORECASE)
    input_dir = folder_path + PATHDELIM + 'results' + PATHDELIM + 'annotation_table'
    file_name = input_dir + PATHDELIM + 'functional_and_taxonomic_table.txt'

    eprintf("\nCounting number of functionally and taxonomically ORFs ...")
    count = get_number_of_uncommented_lines(file_name)
    eprintf("done\n")
    results.append(
        ('Total number of taxonomically and taxonmically annotated ORFs',
         count))

    if results == []:
        return None
    return results
def process_gff_file(gff_file_name, orf_dictionary):
     try:
        gfffile = open(gff_file_name, 'r')
     except IOError:
        eprintf("Cannot read file %s!\n", gff_file_name)

     gff_lines = gfffile.readlines()
     gff_beg_pattern = re.compile("^#")
     gfffile.close()
     
     count = 0
     for line in gff_lines:
        line = line.strip() 
        if gff_beg_pattern.search(line):
          continue
        insert_orf_into_dict(line, orf_dictionary)
        count += 1
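# Hypothetical usage sketch for process_gff_file (file name is made up):
# orf_dictionary is populated in place from the non-comment GFF lines by the
# pipeline's insert_orf_into_dict helper.
def _example_process_gff():
    orf_dictionary = {}
    process_gff_file("sample.unannot.gff", orf_dictionary)
    return orf_dictionary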
def process_rRNA_16S_stats(rRNA_16S_file, rRNA_16S_dictionary, shortenorfid=False):
     counter_rRNA={}
     if not doesFileExist(rRNA_16S_file):
         return
     try:
        taxonomy_file = open(rRNA_16S_file, 'r')
     except IOError:
        eprintf("Cannot read file %s!\n", rRNA_16S_file)
        exit_process()

     tax_lines = taxonomy_file.readlines()
     similarity_pattern = re.compile("similarity")
     evalue_pattern = re.compile("evalue")
     bitscore_pattern = re.compile("bitscore")
     taxonomy_pattern = re.compile("taxonomy")
     headerScanned = False
     for line in tax_lines:
         if headerScanned == False:
            if similarity_pattern.search(line) and evalue_pattern.search(line) and bitscore_pattern.search(line) and  taxonomy_pattern.search(line):
                headerScanned = True
            continue
         fields = [ x.strip() for x in line.split('\t') ]
         if len(fields) >=6:

           if shortenorfid:
              name = get_sequence_number(fields[0])
           else:
              name = fields[0]
            
           if not name in counter_rRNA:
              counter_rRNA[name] =0

           _name = name + "_" + str(counter_rRNA[name])
           counter_rRNA[name] = counter_rRNA[name]  + 1

           
           if fields[1]!='-':
              rRNA_16S_dictionary[_name] =  [ fields[1], fields[2], fields[5] ]
           else:
              if len(fields) >=12:
                 if fields[7]!='-':
                     rRNA_16S_dictionary[_name] =  [ fields[7], fields[8], fields[11] ]

     taxonomy_file.close()
def get_annotation_hits(sample_name, folder_path):
    results = []
    # for the LAST algorithm
    regPattern = re.compile(r'.annot.gff$', re.IGNORECASE)
    input_dir = folder_path +  PATHDELIM + 'genbank' 
    files = [ re.sub(r'.*\/','',f) for f in glob(input_dir + PATHDELIM + sample_name + '*')  if regPattern.search(f) ]
    regPattern = re.compile(r'(.*)[.]annot.gff$', re.IGNORECASE)
    
    for file in files:
      result = regPattern.search(file)
      if result:
         file_name = input_dir + PATHDELIM + sample_name +    '.annot.gff'
         eprintf("\nCounting number of annotations...")
         count  =  get_number_of_uncommented_lines(file_name)
         eprintf("done\n")
         results.append( ('Total number of valid annotations', count ) )
    if results==[]:
       return None
    return results