Example #1
def main(argv):
    (opts, args) = parser.parse_args()
    if check_arguments(opts, args):
        print usage
        sys.exit(0)

    input_folder = opts.input_folder
    output_file = opts.output_file

    filePATTERN = re.compile(r'.*COG[0-9]*.*\.fa')
    cogSeqMatchesPATTERN = re.compile(
        r'[a-zA-Z]*_(.*)__[0-9]*__*(COG[0-9]*).*\.fa')
    list = []
    for file in listdir(input_folder):
        if filePATTERN.match(file):
            hits = cogSeqMatchesPATTERN.search(file)
            if hits:
                list.append((hits.group(1), hits.group(2)))

    try:
        outputfile = open(output_file, 'w')
    except:
        print "Cannot open file to MLTreeMap hits"
        sys.exit(0)

    fprintf(outputfile, "Sequences\tCOG\n")
    for seq, cog in list:
        fprintf(outputfile, "%s\t%s\n", seq, cog)

    outputfile.close()
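# Note: fprintf(...) as used throughout these examples is not a Python builtin;
# it is a small helper from the surrounding project. A minimal sketch of what it
# is assumed to do (printf-style formatting written to an open file handle):
def fprintf(fileobj, fmt, *args):
    # Write fmt % args to the file; with no args, write fmt unchanged.
    if args:
        fileobj.write(fmt % args)
    else:
        fileobj.write(fmt)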
def write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight, results_dictionary, orf_dictionary, contig, candidate_orf_pos,  orfid):
      fields = [  'source', 'feature', 'start', 'end', 'score', 'strand', 'frame' ]


      output_line= orf_dictionary[contig][candidate_orf_pos]['seqname']

      for field in fields:
        # printf("\t%s", orf_dictionary[contig][candidate_orf_pos][field])
         output_line += "\t"+ str(orf_dictionary[contig][candidate_orf_pos][field])

      attributes = "ID="+orf_dictionary[contig][candidate_orf_pos]['id']
      attributes += ";" + "locus_tag="+orf_dictionary[contig][candidate_orf_pos]['locus_tag']
      attributes += ";" + "contig_length="+orf_dictionary[contig][candidate_orf_pos]['contig_length']
      attributes += ";" + "orf_length="+orf_dictionary[contig][candidate_orf_pos]['orf_length']
      attributes += ";" + "partial="+orf_dictionary[contig][candidate_orf_pos]['partial']
      attributes += ";" + "sourcedb="+candidatedbname
     
      if candidatedbname in results_dictionary:
         attributes += ";" + "annotvalue="+str(results_dictionary[candidatedbname][orfid]['value'])
         attributes += ";" + "ec="+str(results_dictionary[candidatedbname][orfid]['ec'])
         attributes += ";" + "product="+results_dictionary[candidatedbname][orfid]['product']
      else:
         attributes += ";" + "annotvalue="+str('0')
         attributes += ";" + "ec="+str('')
         attributes += ";" + "product="+'hypothetical protein'

      output_line += '\t' + attributes
      fprintf(outputgff_file, "%s\n", output_line);
def main(argv): 
    (opts, args) = parser.parse_args()
    if check_arguments(opts, args):
       print usage
       sys.exit(0)

    input_folder = opts.input_folder
    output_file = opts.output_file

    filePATTERN = re.compile(r'.*COG[0-9]*.*\.fa')
    cogSeqMatchesPATTERN = re.compile(r'[a-zA-Z]*_(.*)__[0-9]*__*(COG[0-9]*).*\.fa')
    list= []
    for file in  listdir(input_folder):
      if filePATTERN.match(file):
         hits =  cogSeqMatchesPATTERN.search( file) 
         if hits:
             list.append( (hits.group(1), hits.group(2)) )
         

    try:
        outputfile  = open(output_file, 'w')
    except:
        print "Cannot open file to MLTreeMap hits"
        sys.exit(0)




    fprintf(outputfile, "Sequences\tCOG\n")
    for seq, cog in list:
        fprintf(outputfile, "%s\t%s\n",seq, cog)

    outputfile.close()
def print_orf_table(results, refseq2peg,  output_dir):
    if not path.exists(output_dir):
       makedirs(output_dir)
   
    outputfile = open( output_dir +'/ORF_annotation_table.txt', 'w')

    orf_dict = {}
    for dbname in ['refseq', 'cog', 'kegg' ]:
      for seqname in results[dbname]:
         for orf in results[dbname][seqname]:
           if not orf['query'] in orf_dict:
               orf_dict[orf['query']] = {}
           if dbname =='cog': 
              cog =  cog_id(orf['product'])
              orf_dict[orf['query']][dbname] = cog

           if dbname =='kegg': 
              kegg =  kegg_id(orf['product'])
              orf_dict[orf['query']][dbname] = kegg

           if dbname=='refseq':
              refseq =  refseq_id(orf['target'])
              if refseq in refseq2peg:
                 refseq = refseq2peg[refseq]
              else:
                 refseq = ""

              orf_dict[orf['query']][dbname] = refseq

           orf_dict[orf['query']]['contig'] = seqname

    for orfn in orf_dict:
       if 'cog' in orf_dict[orfn]:
          cogFn = orf_dict[orfn]['cog']
       else:
          cogFn = ""

       if 'kegg' in orf_dict[orfn]:
          keggFn = orf_dict[orfn]['kegg']
       else:
          keggFn = ""

       if 'metacyc' in orf_dict[orfn]:
          metacycPwy = orf_dict[orfn]['metacyc']
       else:
          metacycPwy = ""

       if 'refseq' in orf_dict[orfn]:
          refseqFn = orf_dict[orfn]['refseq']
       else:
          refseqFn = ""

       fprintf(outputfile, "%s\n", orfn + "\t" + orf_dict[orfn]['contig'] + '\t' + cogFn + '\t' + keggFn +'\t' + refseqFn + '\t' + metacycPwy)


    outputfile.close() 
def copy_faa_gff_orf_prediction(source_files, target_files):
    for source, target in zip(source_files, target_files):
        #print source + ' ' + target
        sourcefile = open(source, 'r')
        targetfile = open(target, 'w')
        sourcelines = sourcefile.readlines()
        for line in sourcelines:
            fprintf(targetfile, "%s\n", line.strip())

        sourcefile.close()
        targetfile.close()
def print_orf_table(results, output_dir):
    if not path.exists(output_dir):
       makedirs(output_dir)
   
    outputfile = open( output_dir +'/ORF_annotation_table.txt', 'w')

    orf_dict = {}
    for dbname in results.iterkeys():
      for seqname in results[dbname]:
         for orf in results[dbname][seqname]:
           if not orf['query'] in orf_dict:
               orf_dict[orf['query']] = {}

           if dbname =='cog': 
              cog =  cog_id(orf['product'])
              orf_dict[orf['query']][dbname] = cog

           if dbname =='kegg': 
              kegg =  kegg_id(orf['product'])
              orf_dict[orf['query']][dbname] = kegg

           if dbname=='seed':
              seed =  orf['product']
              orf_dict[orf['query']][dbname] = re.sub(r'\[.*\]','', seed).strip()
             

           orf_dict[orf['query']]['contig'] = seqname

    for orfn in orf_dict:
       if 'cog' in orf_dict[orfn]:
          cogFn = orf_dict[orfn]['cog']
       else:
          cogFn = ""

       if 'kegg' in orf_dict[orfn]:
          keggFn = orf_dict[orfn]['kegg']
       else:
          keggFn = ""

       if 'metacyc' in orf_dict[orfn]:
          metacycPwy = orf_dict[orfn]['metacyc']
       else:
          metacycPwy = ""

       if 'seed' in orf_dict[orfn]:
          seedFn = orf_dict[orfn]['seed']
       else:
          seedFn = ""

       fprintf(outputfile, "%s\n", orfn + "\t" + orf_dict[orfn]['contig'] + '\t' + cogFn + '\t' + keggFn +'\t' + seedFn + '\t' + metacycPwy)

    outputfile.close() 
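# Both print_orf_table variants above expect `results` to be nested as
# dbname -> sequence name -> list of hit dicts carrying 'query' and 'product'
# (plus 'target' for refseq). A hypothetical illustration of that shape
# (the values below are made up, not real annotations):
results = {
    'cog':  {'contig_1': [{'query': 'contig_1_orf1', 'product': 'COG0001 some COG function'}]},
    'kegg': {'contig_1': [{'query': 'contig_1_orf1', 'product': 'K00001 some KEGG function'}]},
    'seed': {'contig_1': [{'query': 'contig_1_orf1', 'product': 'hypothetical protein [Bacteria]'}]},
}
# print_orf_table(results, 'output_dir')  # would write output_dir/ORF_annotation_table.txt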
def print_counts_at_level(hierarchical_map, field_to_description,  depth, level, outputfile): 
    
    if type(hierarchical_map) is type(0):
       return hierarchical_map
    count = 0
    for key in hierarchical_map:  
       tempcount = print_counts_at_level(hierarchical_map[key],field_to_description, depth+1, level, outputfile)
       if depth==level:
          if key in field_to_description:
              fprintf(outputfile, "%s\n", key + '\t' + field_to_description[key] + '\t' +  str(tempcount) )
          else:
              fprintf(outputfile, "%s\n", key + '\t' + ' ' + '\t' + str(tempcount))
       count+=tempcount
    return count
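# A hypothetical call to the function above: leaves of the nested map are
# integer counts, and only keys sitting at the requested level are printed.
hierarchical_map = {
    'Metabolism': {'Glycolysis': 3, 'TCA cycle': 2},
    'Genetic Information Processing': {'Ribosome': 5},
}
field_to_description = {'Metabolism': 'Metabolic pathways'}
with open('counts_at_level_0.txt', 'w') as out:
    total = print_counts_at_level(hierarchical_map, field_to_description, 0, 0, out)
# total is 10; the file gets one row per depth-0 key with its summed subtree count.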
def write_16S_tRNA_gene_info(rRNA_dictionary, outputgff_file, tag):
    fields = ['source', 'feature', 'start', 'end', 'score', 'strand', 'frame']
    for rRNA in rRNA_dictionary:
        output_line = rRNA_dictionary[rRNA]['seqname']
        for field in fields:
            output_line += "\t" + str(rRNA_dictionary[rRNA][field])

        attributes = "ID=" + rRNA_dictionary[rRNA]['seqname'] + tag
        attributes += ";" + "locus_tag=" + rRNA_dictionary[rRNA][
            'seqname'] + tag
        attributes += ";" + "ec="
        attributes += ";" + "product=" + rRNA_dictionary[rRNA]['product']
        output_line += '\t' + attributes
        fprintf(outputgff_file, "%s\n", output_line)
def  write_16S_tRNA_gene_info(rRNA_dictionary, outputgff_file, tag):
      fields = [  'source', 'feature', 'start', 'end', 'score', 'strand', 'frame' ]
      for rRNA in rRNA_dictionary:
          output_line= rRNA_dictionary[rRNA]['seqname']
          for field in fields:
             output_line += "\t"+ str(rRNA_dictionary[rRNA][field])

          attributes = "ID="+rRNA_dictionary[rRNA]['seqname'] + tag
          attributes += ";" + "locus_tag="+rRNA_dictionary[rRNA]['seqname'] + tag
          attributes += ";" + "orf_length=0"
          attributes += ";" + "contig_length=0"
          attributes += ";" + "ec="
          attributes += ";" + "product="+rRNA_dictionary[rRNA]['product']
          output_line += '\t' + attributes
          fprintf(outputgff_file, "%s\n", output_line);
Example #10
def add_refscore_to_file(blast_table_out, refscore_file, allNames):
    infile = open(blast_table_out, 'r')

    refscores = {}
    lines = infile.readlines()
    for line in lines:
        line = line.rstrip()
        fields = line.split('\t')
        if len(fields) != 12:
            print 'Error in the blastout file'
            sys.exit(1)

    # Note: refscores is never populated above, so this loop writes no rows.
    for key, value in refscores.iteritems():
        allNames[key] = True
        fprintf(refscore_file, "%s\t%s\n", key, value)

    infile.close()
def add_refscore_to_file(blast_table_out, refscore_file, allNames):
    infile = open( blast_table_out,'r')

    refscores = {}
    lines = infile.readlines()
    for line in lines:
       line=line.rstrip()
       fields = line.split('\t')
       if len(fields) != 12:
          print 'Error in the blastout file'
          sys.exit(1)

    # Note: refscores is never populated above, so this loop writes no rows.
    for key, value in refscores.iteritems():
       allNames[key] = True
       fprintf(refscore_file, "%s\t%s\n",key, value)

    infile.close()
def print_counts_at_level(hierarchical_map, field_to_description, depth, level,
                          outputfile):

    if type(hierarchical_map) is type(0):
        return hierarchical_map
    count = 0
    for key in hierarchical_map:
        tempcount = print_counts_at_level(hierarchical_map[key],
                                          field_to_description, depth + 1,
                                          level, outputfile)
        if depth == level:
            if key in field_to_description:
                fprintf(
                    outputfile, "%s\n", key + '\t' +
                    field_to_description[key] + '\t' + str(tempcount))
            else:
                fprintf(outputfile, "%s\n",
                        key + '\t' + ' ' + '\t' + str(tempcount))
        count += tempcount
    return count
def process_gbk_file(input_gbk, output_gbk, headers, gff_dictionary):

    tag = re.sub(r'[.]gbk', '', input_gbk)
    tag = re.sub(r'.*/', '', tag)

    output_gbk_file = open(output_gbk, 'w')
    serializer = genbank.GenBankRecordSerializer()
    with open(input_gbk, 'r') as genbank_file:
        out_list = []
        count = 0
        for record in genbank.GenBankRecordParser(genbank_file.read()):
            count += 1

            record.locus = tag + str(count)
            if count % 1000 == 0:
                print 'Count = ' + str(count)

            if headers and 'REFERENCES' in headers:
                record.references_ = headers['REFERENCES']

            i = 0
            for feature in record.features:
                if feature.type == "CDS":
                    if feature.locus_tag in gff_dictionary:
                        record.features[i].product = 'aaaaa ' + gff_dictionary[
                            feature.locus_tag]['product']
                i += 1

            #record.locus = "hello"

            out_list.append(serializer.serialize(record))
            if count % 1000 == 0:
                output_str = '\n'.join(out_list)
                out_list = []
                fprintf(output_gbk_file, '%s\n', output_str)

        output_str = '\n'.join(out_list)
        fprintf(output_gbk_file, '%s\n', output_str)

        output_gbk_file.close()
def process_file(genbank_filename, output_fna, output_faa, output_gff):

    with open(genbank_filename, 'r') as genbank_file:
        outputfnafile = open(output_fna, 'w')
        outputfaafile = open(output_faa, 'w')
        outputgfffile = open(output_gff, 'w')
        for record in genbank.GenBankRecordParser(genbank_file.read()):
            fprintf(outputfnafile, ">%s\n%s\n", record.locus, record.sequence)
            for feature in record.features:
                if feature.type == 'CDS':
                    fprintf(outputfaafile, ">%s\n%s\n", feature.locus_tag,
                            feature.translation)

                    start, end, strand = feature.coordinates
                    gff_Str = record.locus
                    gff_Str += '\t' + 'Genbank file'
                    gff_Str += '\t' + 'CDS'
                    gff_Str += '\t' + start
                    gff_Str += '\t' + end
                    gff_Str += '\t' + '0'
                    gff_Str += '\t' + strand
                    gff_Str += '\t' + '0'

                    gff_Str += '\tID=' + feature.locus_tag
                    gff_Str += ';locus_tag=' + feature.locus_tag
                    if feature.product:
                        gff_Str += ';product=' + feature.product

                    fprintf(outputgfffile, "%s\n", gff_Str)

        outputfnafile.close()
        outputgfffile.close()
        outputfaafile.close()
def process_file(genbank_filename, output_fna, output_faa, output_gff):
   
   with open(genbank_filename, 'r') as genbank_file:
      outputfnafile = open(output_fna,'w')
      outputfaafile = open(output_faa,'w')
      outputgfffile = open(output_gff,'w')
      for record in genbank.GenBankRecordParser(genbank_file.read()):
        fprintf(outputfnafile, ">%s\n%s\n", record.locus, record.sequence)
        for feature in record.features:
           if feature.type=='CDS':
              fprintf(outputfaafile, ">%s\n%s\n",  feature.locus_tag, feature.translation)

              start, end, strand = feature.coordinates 
              gff_Str = record.locus 
              gff_Str += '\t' + 'Genbank file' 
              gff_Str += '\t' + 'CDS' 
              gff_Str += '\t' + start 
              gff_Str += '\t' + end 
              gff_Str += '\t' + '0'
              gff_Str += '\t' + strand
              gff_Str += '\t' + '0'
 
              gff_Str +=  '\tID=' + feature.locus_tag
              gff_Str +=  ';locus_tag=' + feature.locus_tag
              if feature.product:  
                 gff_Str +=  ';product=' + feature.product

              fprintf(outputgfffile, "%s\n",  gff_Str)
            

      outputfnafile.close()
      outputgfffile.close()
      outputfaafile.close()
def process_gbk_file(input_gbk, output_gbk, headers, gff_dictionary):

    tag = re.sub(r"[.]gbk", "", input_gbk)
    tag = re.sub(r".*/", "", tag)

    output_gbk_file = open(output_gbk, "w")
    serializer = genbank.GenBankRecordSerializer()
    with open(input_gbk, "r") as genbank_file:
        out_list = []
        count = 0
        for record in genbank.GenBankRecordParser(genbank_file.read()):
            count += 1

            record.locus = tag + str(count)
            if count % 1000 == 0:
                print "Count = " + str(count)

            if headers and "REFERENCES" in headers:
                record.references_ = headers["REFERENCES"]

            i = 0
            for feature in record.features:
                if feature.type == "CDS":
                    if feature.locus_tag in gff_dictionary:
                        record.features[i].product = "aaaaa " + gff_dictionary[feature.locus_tag]["product"]
                i += 1

            # record.locus = "hello"

            out_list.append(serializer.serialize(record))
            if count % 1000 == 0:
                output_str = "\n".join(out_list)
                out_list = []
                fprintf(output_gbk_file, "%s\n", output_str)

        output_str = "\n".join(out_list)
        fprintf(output_gbk_file, "%s\n", output_str)

        output_gbk_file.close()
def write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight,
                             results_dictionary, orf_dictionary, contig,
                             candidate_orf_pos, orfid):
    fields = ['source', 'feature', 'start', 'end', 'score', 'strand', 'frame']

    #      print contig
    #      print orf_dictionary[contig]

    #      print results_dictionary

    output_line = orf_dictionary[contig][candidate_orf_pos]['seqname']

    for field in fields:
        # printf("\t%s", orf_dictionary[contig][candidate_orf_pos][field])
        output_line += "\t" + str(
            orf_dictionary[contig][candidate_orf_pos][field])

    attributes = "ID=" + orf_dictionary[contig][candidate_orf_pos]['id']
    attributes += ";" + "locus_tag=" + orf_dictionary[contig][
        candidate_orf_pos]['locus_tag']
    attributes += ";" + "partial=" + orf_dictionary[contig][candidate_orf_pos][
        'partial']
    attributes += ";" + "sourcedb=" + candidatedbname

    if candidatedbname in results_dictionary:
        attributes += ";" + "annotvalue=" + str(
            results_dictionary[candidatedbname][orfid]['value'])
        attributes += ";" + "ec=" + str(
            results_dictionary[candidatedbname][orfid]['ec'])
        attributes += ";" + "product=" + results_dictionary[candidatedbname][
            orfid]['product']
    else:
        attributes += ";" + "annotvalue=" + str('0')
        attributes += ";" + "ec=" + str('')
        attributes += ";" + "product=" + 'hypothetical protein'

    output_line += '\t' + attributes
    fprintf(outputgff_file, "%s\n", output_line)
def add_last_refscore_to_file(blast_table_out, refscore_file, allNames):
    commentPATTERN = re.compile(r'^#')

    infile = open(blast_table_out, 'r')
    refscores = {}
    lines = infile.readlines()
    for line in lines:
        if commentPATTERN.match(line):
            continue
        line = line.rstrip()
        fields = line.split('\t')
        if len(fields) != 12:
            print 'Error in the blastout file'
            sys.exit(1)
        if fields[6].rstrip() == fields[1].rstrip():
            #    fprintf(refscore_file, "%s\t%s\n",fields[0], fields[11])
            refscores[fields[1]] = fields[0]

    for key, value in refscores.iteritems():
        allNames[key] = True
        fprintf(refscore_file, "%s\t%s\n", key, value)

    infile.close()
def add_last_refscore_to_file(blast_table_out, refscore_file, allNames):
    commentPATTERN = re.compile(r'^#')

    infile = open( blast_table_out,'r')
    refscores = {}
    lines = infile.readlines()
    for line in lines:
       if commentPATTERN.match(line):
          continue
       line=line.rstrip()
       fields = line.split('\t')
       if len(fields) != 12:
          print 'Error in the blastout file'
          sys.exit(1)
       if fields[6].rstrip()==fields[1].rstrip():
      #    fprintf(refscore_file, "%s\t%s\n",fields[0], fields[11])
          refscores[fields[1]]=fields[0]

    for key, value in refscores.iteritems():
       allNames[key] = True
       fprintf(refscore_file, "%s\t%s\n",key, value)

    infile.close()
def print_counts_at_level(hierarchical_map, field_to_description,  depth, level, outputfile, printKey=True, header=None): 
    
    if type(hierarchical_map) is type(0):
       return hierarchical_map
    if header:
       fprintf(outputfile, "%s\n",header )

    count = 0
    for key in hierarchical_map:  
       tempcount = print_counts_at_level(hierarchical_map[key],field_to_description, depth+1, level, outputfile, printKey=printKey)
       if depth==level:
          if key in field_to_description:
              if printKey:
                 fprintf(outputfile, "%s\n", key + '\t' + field_to_description[key] + '\t' +  str(tempcount) )
              else:
                 fprintf(outputfile, "%s\n",  field_to_description[key] + '\t' +  str(tempcount) )
          else:
              if printKey:
                 print "True 2"
                 fprintf(outputfile, "%s\n", key + '\t' + ' ' + '\t' + str(tempcount))
              else:
                 fprintf(outputfile, "%s\n", key +  '\t' + str(tempcount))
       count+=tempcount
    return count
def process_blastoutput(dbname, blastoutput,  mapfile, refscore_file, opts):
    blastparser =  BlastOutputParser(dbname, blastoutput, mapfile, refscore_file, opts)

    fields = ['q_length', 'bitscore', 'bsr', 'expect', 'aln_length', 'identity', 'ec' ]
    if opts.taxonomy:
       fields.append('taxonomy')
    fields.append('product')

    output_blastoutput_parsed = blastoutput + '.parsed.txt'
    outputfile = open(output_blastoutput_parsed, 'w') 

    fprintf(outputfile, "#%s",'query')
    for field in fields:
         fprintf(outputfile,"\t%s",field)
    fprintf(outputfile, "\n")

    for data in blastparser:
        if not data:
          continue
        try:
          fprintf(outputfile, "%s",data['query'])
        except:
           print data
           sys.exit()
        for field in fields:
           fprintf(outputfile, "\t%s",data[field])
        fprintf(outputfile, "\n")

    outputfile.close()


#    add_refscore_to_file(blastoutput,refscore_file, allNames)
    return None
def create_annotation(results_dictionary, annotated_gff,  output_dir, ncbi_taxonomy_tree_file, min_score, top_percent, min_support):
    meganTree = None
    lca = None
    if 'refseq' in results_dictionary:
        lca = LCAComputation(ncbi_taxonomy_tree_file)
        lca.setParameters(min_score, top_percent, min_support)
        meganTree = MeganTree(lca)

    if not path.exists(output_dir):
       makedirs(output_dir)

   
    orf_dictionary={}
    #process_gff_file(annotated_gff, orf_dictionary)
    gffreader = GffFileParser(annotated_gff)
    output_table_file = open(output_dir + '/functional_and_taxonomic_table.txt', 'w')
    fprintf(output_table_file, "ORF_ID\tORF_length\tstart\tend\tContig_Name\tContig_length\tstrand\tec\ttaxonomy\tproduct\n")

    count = 0
    for contig in  gffreader:
       for orf in  gffreader.orf_dictionary[contig]:
          taxonomy = None
          if count%10000==0 :
             pass 
          species = []
          if 'refseq' in results_dictionary:
            if orf['id'] in results_dictionary['refseq']:
                for hit in results_dictionary['refseq'][orf['id']]:
                   if hit['bitscore'] >= min_score:
                      names = get_species(hit)
                      if names:
                        species.append(names) 
                      #print '---------------------------'
          #         else:
          #              print "hit " + hit['query']  + ' ' + hit['dbname'] + ' ' + str(hit['bitscore'] )
          if lca: 
            taxonomy=lca.getTaxonomy(species)
          fprintf(output_table_file, "%s", orf['id'])
          fprintf(output_table_file, "\t%s", orf['orf_length'])
          fprintf(output_table_file, "\t%s", orf['start'])
          fprintf(output_table_file, "\t%s", orf['end'])
          fprintf(output_table_file, "\t%s", orf['seqname'])
          fprintf(output_table_file, "\t%s", orf['contig_length'])
          fprintf(output_table_file, "\t%s", orf['strand'])
          fprintf(output_table_file, "\t%s", orf['ec'])
          # fprintf(output_table_file, "\t%s", str(species))
          fprintf(output_table_file, "\t%s", taxonomy)
          fprintf(output_table_file, "\t%s\n", orf['product'])
          if meganTree and taxonomy != '':
              meganTree.insertTaxon(taxonomy)
              # print 'inserted taxon of taxonomy : ', taxonomy
          #print meganTree.getChildToParentMap()
    output_table_file.close()
    # print meganTree.getParentToChildrenMap()

    if meganTree:
        print output_dir + '/megan_tree.tre'
        megan_tree_file = open(output_dir + '/megan_tree.tre', 'w')
        #print meganTree.printTree('1')
        # exit()
        fprintf(megan_tree_file,  "%s;", meganTree.printTree('1'))
        # print 'wrote out megan_tree_file'
        megan_tree_file.close()
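# GffFileParser above comes from the surrounding project. The callers rely on
# two behaviours only: iterating the parser yields contig names, and
# parser.orf_dictionary[contig] is a list of ORF dicts (keys such as 'id',
# 'start', 'end', 'seqname', 'strand', 'ec', 'product'). A stand-in with just
# that interface, for illustration only (the real class parses the GFF file):
class GffFileParserStub(object):
    def __init__(self, orf_dictionary):
        self.orf_dictionary = orf_dictionary

    def __iter__(self):
        return iter(self.orf_dictionary)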
def main(argv): 
    (opts, args) = parser.parse_args()

    if not valid_arguments(opts, args):
       print usage
       sys.exit(0)

    min_length = opts.min_length
    inputfile = open(opts.input_fasta,'r')
    outfile = open(opts.output_fasta, 'w') 
    logfile = open(opts.log_file, 'w') 
     

    if opts.map_file:
       mapfile = open(opts.map_file, 'w') 
    else:
       mapfile = None

    
    sample_name = opts.input_fasta
    # re.sub takes flags as a keyword argument; the 4th positional arg is count.
    sample_name = re.sub(r'^.*/', '', sample_name, flags=re.I)
    sample_name = re.sub(r'^.*\\', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fasta$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fna$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.faa$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fas$', '', sample_name, flags=re.I)

    BEFORE = 'BEFORE'
    AFTER = 'AFTER'
    NUMSEQ = "Number of sequences :"   
    NUMSEQ_SHORTER = "Number of sequences shorter than "
    AV_LENGTH= "Average length of sequences:"
    MIN_LENGTH= "Minimum length of sequences:"
    MAX_LENGTH= "Maximum length of sequences:" 

    stats = { 
              MIN_LENGTH: { 'BEFORE':10000000, 'AFTER':10000000 },  
              MAX_LENGTH: { 'BEFORE': 0, 'AFTER':0 },  
              NUMSEQ : { 'BEFORE' :0, 'AFTER':0},   
              NUMSEQ_SHORTER : { 'BEFORE':0, 'AFTER':0 },
              AV_LENGTH : { 'BEFORE':0, 'AFTER':0 },
            }  

    length_distribution = {}
    length_cumulative_distribution = {}

    for i in range(0,31):
        length_distribution[i]= 0
        length_cumulative_distribution[i]= 0

    seq_count = 0
    allNames= dict()
    outputStr = ""
    outputLines = []
    for record in read_fasta_records(inputfile):
        seqname = record.name
        seq = record.sequence
        length = len(seq) 
        
        index = int(len(seq) / 50);
        if index >= 30:
            index = 30
    #print length($seq) ."\t".$index."\n";
        length_distribution[index] += 1

        if length < stats[MIN_LENGTH][BEFORE] :
            stats[MIN_LENGTH][BEFORE] = length

        if length > stats[MAX_LENGTH][BEFORE] : 
            stats[MAX_LENGTH][BEFORE] = length

        if length < min_length:
            stats[NUMSEQ_SHORTER][BEFORE] += 1

        stats[AV_LENGTH][BEFORE]  =  stats[AV_LENGTH][BEFORE] + length

        seqvalue = filter_sequence(seq)

        stats[NUMSEQ][BEFORE] += 1
        
        seqlen = len(seqvalue)
        if seqlen>= min_length :
           
           stats[NUMSEQ][AFTER] += 1
           stats[AV_LENGTH][AFTER]  =  stats[AV_LENGTH][AFTER] + seqlen
           if mapfile==None:
              fprintf(outfile, "%s\n", seqname)
           else:
               fprintf(outfile, ">%s\n",  sample_name + '_' + str(seq_count) )
               key = re.sub(r'^>','',seqname)
               fprintf(mapfile, "%s\n", sample_name+ '_' + str(seq_count) + '\t' + key)
               seq_count += 1

           fprintf(outfile, "%s\n",seqvalue)

           if  seqlen < stats[MIN_LENGTH][AFTER] :
               stats[MIN_LENGTH][AFTER] = seqlen
             
           if  seqlen > stats[MAX_LENGTH][AFTER] :
               stats[MAX_LENGTH][AFTER] = seqlen

    if stats[NUMSEQ][BEFORE] > 0 :
      stats[AV_LENGTH][BEFORE]  = stats[AV_LENGTH][BEFORE]/stats[NUMSEQ][BEFORE]
    else:
      stats[AV_LENGTH][BEFORE]  = 0
    if stats[NUMSEQ][AFTER] > 0 :
       stats[AV_LENGTH][AFTER]  = stats[AV_LENGTH][AFTER]/stats[NUMSEQ][AFTER]
    else :
       stats[AV_LENGTH][AFTER]  = 0

    outfile.close()
    inputfile.close()
    if mapfile != None:
       mapfile.close()


    fprintf(logfile, "   %s\n", " \tBEFORE\tAFTER");
    fprintf(logfile, "   %s\n", NUMSEQ +'\t' + str(stats[NUMSEQ][BEFORE]) + '\t' + str(stats[NUMSEQ][AFTER]));
    fprintf(logfile, "   %s\n", NUMSEQ_SHORTER + str(MIN_LENGTH) + ':\t'+ str(stats[NUMSEQ_SHORTER][BEFORE]) + '\t' + str(stats[NUMSEQ_SHORTER][AFTER]))
    fprintf(logfile, "   %s\n", AV_LENGTH +'\t' + str(stats[AV_LENGTH][BEFORE]) + '\t'+ str(stats[AV_LENGTH][AFTER] ))
    fprintf(logfile, "   %s\n", MIN_LENGTH + '\t' + str(stats[MIN_LENGTH][BEFORE]) +'\t'+ str(stats[MIN_LENGTH][AFTER]))
    fprintf(logfile, "   %s\n", MAX_LENGTH +'\t'+ str(stats[MAX_LENGTH][BEFORE]) + '\t' +  str(stats[MAX_LENGTH][AFTER]))

    fprintf(logfile, "\n\n");
    fprintf(logfile, "   READ_LENGTH_RANGE\tFREQUENCY\t\tMIN_LENGTH\tCUMULATIVE_FREQUENCY\n");
    fprintf(logfile, "   -----------------\t---------\t\t----------\t--------------------\n");

    i = 30
    length_cumulative_distribution[i] = length_distribution[i]
    i -= 1
    while i >= 0:
       length_cumulative_distribution[i] = length_cumulative_distribution[i+1] + length_distribution[i]
       i -= 1

    for i in range(0,31):
       fprintf(logfile, "   %s\n", str(i*50) + '-' + str((i+1)*50) + '\t' +\
                str(length_distribution[i]) +'\t\t\t' + str( (i+1)*50) + '\t' + str(length_cumulative_distribution[i]) )

    logfile.close()
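# read_fasta_records(...) used above is another project helper. A minimal
# sketch, assuming it yields records whose .name keeps the leading '>' (the
# caller strips it) and whose .sequence is the concatenated sequence lines:
from collections import namedtuple

FastaRecord = namedtuple('FastaRecord', ['name', 'sequence'])

def read_fasta_records(fileobj):
    name, seq = None, []
    for line in fileobj:
        line = line.strip()
        if line.startswith('>'):
            if name is not None:
                yield FastaRecord(name, ''.join(seq))
            name, seq = line, []
        elif line:
            seq.append(line)
    if name is not None:
        yield FastaRecord(name, ''.join(seq))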
def create_annotation(results_dictionary, annotated_gff,  output_dir):

    file = 'blastDB/ncbi_taxonomy_tree.txt'
    lca = LCAComputation(file)

    if not path.exists(output_dir):
       makedirs(output_dir)

   
    orf_dictionary={}
    #process_gff_file(annotated_gff, orf_dictionary)

    gffreader = GffFileParser(annotated_gff)

    output_table_file = open(output_dir + '/functional_and_taxonomic_table.txt', 'w')
  
    fprintf(output_table_file, "id\tseqname\tstart\tend\tstrand\tec\ttaxonomy\tproduct\n")

    count = 0
    for contig in  gffreader:
       for orf in  gffreader.orf_dictionary[contig]:
          #print orf
          if count%10000==0 :
             # print "fandt " + str(count)
             pass 

          species = []
          if 'refseq' in results_dictionary:
            if orf['id']  in results_dictionary['refseq']:
                for hit in   results_dictionary['refseq'][orf['id']]:
                   names = get_species(hit)
                   if names:
                      species.append(names) 
                      #print species
                      #print '---------------------------'

          taxonomy=lca.getTaxonomy(species)
          fprintf(output_table_file, "%s", orf['id'])
          fprintf(output_table_file, "\t%s", orf['seqname'])
          fprintf(output_table_file, "\t%s", orf['start'])
          fprintf(output_table_file, "\t%s", orf['end'])
          fprintf(output_table_file, "\t%s", orf['strand'])
          fprintf(output_table_file, "\t%s", orf['ec'])
          #fprintf(output_table_file, "\t%s", str(species))
          fprintf(output_table_file, "\t%s", taxonomy)
          fprintf(output_table_file, "\t%s\n", orf['product'])
                      
    output_table_file.close()
def create_annotation(dbname_weight, results_dictionary, input_gff,  rRNA_16S_stats_files, tRNA_stats_files,  output_gff, output_comparative_annotation):
    orf_dictionary={}
#    process_gff_file(input_gff, orf_dictionary)
    gffreader = GffFileParser(input_gff)

    output_gff_tmp = output_gff + ".tmp"
    outputgff_file = open( output_gff_tmp, 'w')
    output_comp_annot_file1 = open( output_comparative_annotation + '.1.txt', 'w')
    output_comp_annot_file2 = open( output_comparative_annotation + '.2.txt', 'w')

    output_comp_annot_file1_Str = 'orf_id\tref dbname\tEC\tproduct\tvalue'
    fprintf(output_comp_annot_file1,'%s\n', output_comp_annot_file1_Str)

    output_comp_annot_file2_Str = 'orf_id'
    dbnames = dbname_weight.keys()
    for dbname in dbnames:
         weight = dbname_weight[dbname]
         output_comp_annot_file2_Str += '\t{0}(EC) \t{0}(product)\t{0}(value)'.format(dbname)
    fprintf(output_comp_annot_file2,'%s\n', output_comp_annot_file2_Str)
       

#    gffreader = GffReader(input_gff)
    for contig in  gffreader:
       count = 0
       for orf in  gffreader.orf_dictionary[contig]:
         #print orf
         value = 0.0001
         success =False
         output_comp_annot_file1_Str = ''
         output_comp_annot_file2_Str = ''
         for dbname in dbnames:
            weight = dbname_weight[dbname]
            value = 0
            if orf['id'] in results_dictionary[dbname]:
                if value < results_dictionary[dbname][orf['id']]['value']:
                    value = results_dictionary[dbname][orf['id']]['value']
                    candidatedbname=dbname
                    success =True
                    candidate_orf_pos = count 

                    if output_comp_annot_file1_Str:
                        output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format('', dbname,\
                               results_dictionary[dbname][orf['id']]['ec'],\
                               results_dictionary[dbname][orf['id']]['product'],\
                               str(results_dictionary[dbname][orf['id']]['value']*float(weight)))
                    else:
                        output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format(orf['id'], dbname,\
                               results_dictionary[dbname][orf['id']]['ec'],\
                               results_dictionary[dbname][orf['id']]['product'],\
                               str(results_dictionary[dbname][orf['id']]['value']*float(weight)))


                    if output_comp_annot_file2_Str:
                        output_comp_annot_file2_Str += '\t{0}\t{1}\t{2}'.format(\
                               results_dictionary[dbname][orf['id']]['ec'],\
                               results_dictionary[dbname][orf['id']]['product'],\
                               str(results_dictionary[dbname][orf['id']]['value']*float(weight)))
                    else:
                        output_comp_annot_file2_Str += '{0}\t{1}\t{2}\t{3}'.format(orf['id'], 
                               results_dictionary[dbname][orf['id']]['ec'],\
                               results_dictionary[dbname][orf['id']]['product'],\
                               str(results_dictionary[dbname][orf['id']]['value']*float(weight)))

            else: 
                if not output_comp_annot_file1_Str:
                   output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format(orf['id'], '','','','')

                if output_comp_annot_file2_Str:
                   output_comp_annot_file2_Str += '\t{0}\t{1}\t{2}'.format('', '','')
                else:
                   output_comp_annot_file2_Str += '{0}\t{1}\t{2}\t{3}'.format(orf['id'], '','','')


         if success:  # there was a database hit
            fprintf(output_comp_annot_file1,'%s\n', output_comp_annot_file1_Str)
            fprintf(output_comp_annot_file2,'%s\n', output_comp_annot_file2_Str)
            write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight, results_dictionary, gffreader.orf_dictionary, contig, candidate_orf_pos,  orf['id']) 
         else:   # if it was not  a hit then it is a hypothetical protein
            #print gffreader.orf_dictionary
            write_annotation_for_orf(outputgff_file, 'None', '0', results_dictionary, gffreader.orf_dictionary, contig, count, orf['id']) 
         
         count +=1  #move to the next orf

       #del orf_dictionary[contig]   
    output_comp_annot_file1.close()
    output_comp_annot_file2.close()

    # now deal with the rRNA sequences  if there is rRNA stats file
    if len(rRNA_16S_stats_files) > 0 :
       rRNA_16S_dictionary={} 
       for rRNA_16S_stats_file in rRNA_16S_stats_files:
          process_rRNA_16S_stats(rRNA_16S_stats_file, rRNA_16S_dictionary)

       rRNA_dictionary = {}
       add_16S_genes(rRNA_16S_dictionary, rRNA_dictionary) 
       #print rRNA_dictionary
       write_16S_tRNA_gene_info(rRNA_dictionary, outputgff_file, '_rRNA')

    # now deal with the tRNA sequences  if there is tRNA stats file
    if len(tRNA_stats_files) > 0 :
       tRNA_dictionary={} 
       for tRNA_stats_file in tRNA_stats_files:
          process_tRNA_stats(tRNA_stats_file, tRNA_dictionary)

       tRNA_gff_dictionary = {}
       add_tRNA_genes(tRNA_dictionary, tRNA_gff_dictionary) 
       write_16S_tRNA_gene_info(tRNA_gff_dictionary, outputgff_file, '_tRNA')
       #print tRNA_dictionary


    outputgff_file.close()     
    rename(output_gff_tmp, output_gff)
def create_annotation(results_dictionary, annotated_gff, output_dir):

    file = 'blastDB/ncbi_taxonomy_tree.txt'
    lca = LCAComputation(file)

    if not path.exists(output_dir):
        makedirs(output_dir)

    orf_dictionary = {}
    #process_gff_file(annotated_gff, orf_dictionary)

    gffreader = GffFileParser(annotated_gff)

    output_table_file = open(
        output_dir + '/functional_and_taxonomic_table.txt', 'w')

    fprintf(output_table_file,
            "id\tseqname\tstart\tend\tstrand\tec\ttaxonomy\tproduct\n")

    count = 0
    for contig in gffreader:
        for orf in gffreader.orf_dictionary[contig]:
            #print orf
            if count % 10000 == 0:
                # print "fandt " + str(count)
                pass

            species = []
            if 'refseq' in results_dictionary:
                if orf['id'] in results_dictionary['refseq']:
                    for hit in results_dictionary['refseq'][orf['id']]:
                        names = get_species(hit)
                        if names:
                            species.append(names)
                            #print species
                            #print '---------------------------'

            taxonomy = lca.getTaxonomy(species)
            fprintf(output_table_file, "%s", orf['id'])
            fprintf(output_table_file, "\t%s", orf['seqname'])
            fprintf(output_table_file, "\t%s", orf['start'])
            fprintf(output_table_file, "\t%s", orf['end'])
            fprintf(output_table_file, "\t%s", orf['strand'])
            fprintf(output_table_file, "\t%s", orf['ec'])
            #fprintf(output_table_file, "\t%s", str(species))
            fprintf(output_table_file, "\t%s", taxonomy)
            fprintf(output_table_file, "\t%s\n", orf['product'])

    output_table_file.close()
Example #27
def process_blastoutput(dbname, blastoutput, mapfile, refscore_file, opts):
    blastparser = BlastOutputParser(dbname, blastoutput, mapfile,
                                    refscore_file, opts)

    fields = [
        'q_length', 'bitscore', 'bsr', 'expect', 'aln_length', 'identity', 'ec'
    ]
    if opts.taxonomy:
        fields.append('taxonomy')
    fields.append('product')

    output_blastoutput_parsed = blastoutput + '.parsed.txt'
    outputfile = open(output_blastoutput_parsed, 'w')

    fprintf(outputfile, "#%s", 'query')
    for field in fields:
        fprintf(outputfile, "\t%s", field)
    fprintf(outputfile, "\n")

    for data in blastparser:
        if not data:
            continue
        try:
            fprintf(outputfile, "%s", data['query'])
        except:
            print data
            sys.exit()
        for field in fields:
            fprintf(outputfile, "\t%s", data[field])
        fprintf(outputfile, "\n")

    outputfile.close()

    #    add_refscore_to_file(blastoutput,refscore_file, allNames)
    return None
def main(argv):
    (opts, args) = parser.parse_args()
    if check_arguments(opts, args):
        print usage
        sys.exit(0)

    input_fasta = opts.input_fasta
    output_file = opts.output_file
    blast_executable = opts.blast_executable
    formatdb_executable = opts.formatdb_executable
    algorithm = opts.algorithm

    # input file to blast with itself to compute refscore
    infile = open(input_fasta, 'r')

    #this file has the refscores of the entire file
    outfile = open(output_file, 'w')

    count = 0

    allNames = dict()
    for record in read_fasta_records(infile):
        if count % SIZE == 0:
            if count > 0:
                seq_subset_file.close()
                compute_refscores(formatdb_executable, blast_executable,
                                  seq_subset_file, outfile, allNames,
                                  algorithm)

                # now remove the old file
                if algorithm == 'BLAST':
                    remove_blast_index_files(seq_subset_file.name)

                if algorithm == 'LAST':
                    remove_last_index_files(seq_subset_file.name)

                remove(seq_subset_file.name)

            seq_subset_file = open(
                output_file + '.tmp.' + str(count) + '.fasta', 'w')
        allNames[record.name.replace(">", "")] = False
        fprintf(seq_subset_file, "%s\n", record.name)
        fprintf(seq_subset_file, "%s\n", record.sequence)

        count = count + 1

    #print str(count) + "   "  + "going to blast last sequence "
    if (count) % SIZE != 0:
        #print str(count) + "   "  + "last sequence "
        seq_subset_file.close()
        compute_refscores(formatdb_executable, blast_executable,
                          seq_subset_file, outfile, allNames, algorithm)
        remove(seq_subset_file.name)
        if algorithm == 'BLAST':
            remove_blast_index_files(seq_subset_file.name)
        if algorithm == 'LAST':
            remove_last_index_files(seq_subset_file.name)

    #print count
    for key in allNames:
        if allNames[key] == False:
            fprintf(outfile, "%s\t%s\n", key, 1000000)

    outfile.close()
def process_blastoutput(dbname, blastoutput,  mapfile, refscore_file, opts):
    blastparser =  BlastOutputParser(dbname, blastoutput, mapfile, refscore_file, opts)

    fields = ['target','q_length', 'bitscore', 'bsr', 'expect', 'aln_length', 'identity', 'ec' ]
    if opts.taxonomy:
       fields.append('taxonomy')
    fields.append('product')

    output_blastoutput_parsed = blastoutput + '.parsed.txt'
    
    # temporary file is used to deal with incomplete processing of the file
    output_blastoutput_parsed_tmp =  output_blastoutput_parsed + ".tmp"
    outputfile = open(output_blastoutput_parsed_tmp, 'w') 

    # write the headers out
    fprintf(outputfile, "#%s",'query')
    for field in fields:
         fprintf(outputfile,"\t%s",field)
    fprintf(outputfile, "\n")

    for data in blastparser:
        if not data:
          continue
        try:
          fprintf(outputfile, "%s",data['query'])
        except:
           print 'data is : ', data, '\n'
           sys.exit()
        for field in fields:
           fprintf(outputfile, "\t%s",data[field])
        fprintf(outputfile, "\n")

    outputfile.close()
    rename(output_blastoutput_parsed_tmp, output_blastoutput_parsed)


    return None
def create_annotation(dbname_weight, results_dictionary, input_gff,
                      rRNA_16S_stats_files, tRNA_stats_files, output_gff,
                      output_comparative_annotation):
    orf_dictionary = {}
    #    process_gff_file(input_gff, orf_dictionary)
    gffreader = GffFileParser(input_gff)

    outputgff_file = open(output_gff, 'w')
    output_comp_annot_file1 = open(output_comparative_annotation + '.1.txt',
                                   'w')
    output_comp_annot_file2 = open(output_comparative_annotation + '.2.txt',
                                   'w')

    output_comp_annot_file1_Str = 'orf_id\tref dbname\tEC\tproduct\tvalue'
    fprintf(output_comp_annot_file1, '%s\n', output_comp_annot_file1_Str)

    output_comp_annot_file2_Str = 'orf_id'
    dbnames = dbname_weight.keys()
    for dbname in dbnames:
        weight = dbname_weight[dbname]
        output_comp_annot_file2_Str += '\t{0}(EC) \t{0}(product)\t{0}(value)'.format(
            dbname)
    fprintf(output_comp_annot_file2, '%s\n', output_comp_annot_file2_Str)

    #    gffreader = GffReader(input_gff)
    for contig in gffreader:
        count = 0
        for orf in gffreader.orf_dictionary[contig]:
            #print orf
            value = 0.0001
            success = False
            output_comp_annot_file1_Str = ''
            output_comp_annot_file2_Str = ''
            for dbname in dbnames:
                weight = dbname_weight[dbname]
                value = 0
                if orf['id'] in results_dictionary[dbname]:
                    if value < results_dictionary[dbname][orf['id']]['value']:
                        value = results_dictionary[dbname][orf['id']]['value']
                        candidatedbname = dbname
                        success = True
                        candidate_orf_pos = count

                        if output_comp_annot_file1_Str:
                            output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format('', dbname,\
                                   results_dictionary[dbname][orf['id']]['ec'],\
                                   results_dictionary[dbname][orf['id']]['product'],\
                                   str(results_dictionary[dbname][orf['id']]['value']*float(weight)))
                        else:
                            output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format(orf['id'], dbname,\
                                   results_dictionary[dbname][orf['id']]['ec'],\
                                   results_dictionary[dbname][orf['id']]['product'],\
                                   str(results_dictionary[dbname][orf['id']]['value']*float(weight)))

                        if output_comp_annot_file2_Str:
                            output_comp_annot_file2_Str += '\t{0}\t{1}\t{2}'.format(\
                                   results_dictionary[dbname][orf['id']]['ec'],\
                                   results_dictionary[dbname][orf['id']]['product'],\
                                   str(results_dictionary[dbname][orf['id']]['value']*float(weight)))
                        else:
                            output_comp_annot_file2_Str += '{0}\t{1}\t{2}\t{3}'.format(orf['id'],
                                   results_dictionary[dbname][orf['id']]['ec'],\
                                   results_dictionary[dbname][orf['id']]['product'],\
                                   str(results_dictionary[dbname][orf['id']]['value']*float(weight)))

                else:
                    if not output_comp_annot_file1_Str:
                        output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
                            orf['id'], '', '', '', '')

                    if output_comp_annot_file2_Str:
                        output_comp_annot_file2_Str += '\t{0}\t{1}\t{2}'.format(
                            '', '', '')
                    else:
                        output_comp_annot_file2_Str += '{0}\t{1}\t{2}\t{3}'.format(
                            orf['id'], '', '', '')

            if success:  # there was a database hit
                fprintf(output_comp_annot_file1, '%s\n',
                        output_comp_annot_file1_Str)
                fprintf(output_comp_annot_file2, '%s\n',
                        output_comp_annot_file2_Str)
                write_annotation_for_orf(outputgff_file, candidatedbname,
                                         dbname_weight, results_dictionary,
                                         gffreader.orf_dictionary, contig,
                                         candidate_orf_pos, orf['id'])
            else:  # if it was not  a hit then it is a hypothetical protein
                #print gffreader.orf_dictionary
                write_annotation_for_orf(outputgff_file, 'None', '0',
                                         results_dictionary,
                                         gffreader.orf_dictionary, contig,
                                         count, orf['id'])

            count += 1  #move to the next orf

        #del orf_dictionary[contig]

    output_comp_annot_file1.close()
    output_comp_annot_file2.close()

    # now deal with the rRNA sequences  if there is rRNA stats file
    if len(rRNA_16S_stats_files) > 0:
        rRNA_16S_dictionary = {}
        for rRNA_16S_stats_file in rRNA_16S_stats_files:
            process_rRNA_16S_stats(rRNA_16S_stats_file, rRNA_16S_dictionary)

        rRNA_dictionary = {}
        add_16S_genes(rRNA_16S_dictionary, rRNA_dictionary)
        #print rRNA_dictionary
        write_16S_tRNA_gene_info(rRNA_dictionary, outputgff_file, '_rRNA')

    # now deal with the tRNA sequences  if there is tRNA stats file
    if len(tRNA_stats_files) > 0:
        tRNA_dictionary = {}
        for tRNA_stats_file in tRNA_stats_files:
            process_tRNA_stats(tRNA_stats_file, tRNA_dictionary)

        tRNA_gff_dictionary = {}
        add_tRNA_genes(tRNA_dictionary, tRNA_gff_dictionary)
        write_16S_tRNA_gene_info(tRNA_gff_dictionary, outputgff_file, '_tRNA')
        #print tRNA_dictionary

    outputgff_file.close()
def main(argv): 
    global parser
    (opts, args) = parser.parse_args(argv)
    if check_arguments(opts, args):
       print usage
       sys.exit(0)

    input_fasta = opts.input_fasta
    output_file = opts.output_file
    blast_executable = opts.blast_executable
    formatdb_executable = opts.formatdb_executable
    algorithm = opts.algorithm
 
    # input file to blast with itself to compute refscore
    infile = open(input_fasta,'r')
   
    #this file has the refscores of the entire file
    outfile = open(output_file, 'w') 

    count = 0

    allNames= dict()
    for record in read_fasta_records(infile):
        if count % SIZE == 0:
            if count > 0:
              seq_subset_file.close()
              compute_refscores(formatdb_executable, blast_executable,seq_subset_file, outfile, allNames, algorithm);

              # now remove the old file
              if algorithm == 'BLAST' :
                 remove_blast_index_files(seq_subset_file.name)

              if algorithm == 'LAST' :
                 remove_last_index_files(seq_subset_file.name)

              remove(seq_subset_file.name)

            seq_subset_file = open(output_file +'.tmp.'+ str(count) +'.fasta','w')
        allNames[record.name.replace(">","")] = False;    
        fprintf(seq_subset_file, "%s\n", record.name)
        fprintf(seq_subset_file, "%s\n", record.sequence)

        count = count + 1

    #print str(count) + "   "  + "going to blast last sequence "
    if (count) % SIZE != 0:
       #print str(count) + "   "  + "last sequence "
       seq_subset_file.close()
       compute_refscores(formatdb_executable, blast_executable,seq_subset_file, outfile, allNames, algorithm);
       remove(seq_subset_file.name)
       if algorithm == 'BLAST' :
          remove_blast_index_files(seq_subset_file.name)
       if algorithm == 'LAST' :
          remove_last_index_files(seq_subset_file.name)


    #print count
    for key in allNames:
        if allNames[key] ==False:
           fprintf(outfile, "%s\t%s\n",key, 1000000)

    outfile.close()
def create_annotation(results_dictionary, annotated_gff,  output_dir, ncbi_taxonomy_tree_file):

    lca = LCAComputation(ncbi_taxonomy_tree_file)

    if not path.exists(output_dir):
       makedirs(output_dir)

   
    orf_dictionary={}
    #process_gff_file(annotated_gff, orf_dictionary)

    gffreader = GffFileParser(annotated_gff)

    output_table_file = open(output_dir + '/functional_and_taxonomic_table.txt', 'w')
  
    fprintf(output_table_file, "ORF_ID\tORF_length\tstart\tend\tContig_Name\tContig_length\tstrand\tec\ttaxonomy\tproduct\n")

    meganTree = MeganTree(lca)
    count = 0
    for contig in  gffreader:
       for orf in  gffreader.orf_dictionary[contig]:
          if count%10000==0 :
             # print "fandt " + str(count)
             pass 

          species = []
          if 'refseq' in results_dictionary:
            if orf['id']  in results_dictionary['refseq']:
                for hit in   results_dictionary['refseq'][orf['id']]:
                   names = get_species(hit)
                   if names:
                      species.append(names) 
                      #print species
                      #print '---------------------------'

          taxonomy=lca.getTaxonomy(species)
          fprintf(output_table_file, "%s", orf['id'])
          fprintf(output_table_file, "\t%s", orf['orf_length'])
          fprintf(output_table_file, "\t%s", orf['start'])
          fprintf(output_table_file, "\t%s", orf['end'])
          fprintf(output_table_file, "\t%s", orf['seqname'])
          fprintf(output_table_file, "\t%s", orf['contig_length'])
          fprintf(output_table_file, "\t%s", orf['strand'])
          fprintf(output_table_file, "\t%s", orf['ec'])
          #fprintf(output_table_file, "\t%s", str(species))
          fprintf(output_table_file, "\t%s", taxonomy)
          fprintf(output_table_file, "\t%s\n", orf['product'])
          meganTree.insertTaxon(taxonomy)
          #print meganTree.getChildToParentMap()
                      
    output_table_file.close()
    #print meganTree.getChildToParentMap()
   # print meganTree.getParentToChildrenMap()

    megan_tree_file = open(output_dir + '/megan_tree.tre', 'w')
    fprintf(megan_tree_file,  "%s;", meganTree.printTree('1'))
    megan_tree_file.close()