def main(argv, errorlogger=None, runstatslogger=None):
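    """Compute functional and taxonomic annotations for the ORFs in the input
    GFF: sort the parsed BLAST/LAST results per database, run an LCA-based
    taxonomy computation with a MEGAN-style min-support correction, and write
    the annotation tables into the output directory."""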
    global parser
    (opts, args) = parser.parse_args(argv)
    global opts_global
    opts_global = opts
    if not check_arguments(opts, args):
        print usage
        sys.exit(0)

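    # map each standard database's short name to its functional map file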
    db_to_map_Maps = {
        'cog': opts.input_cog_maps,
        'seed': opts.input_seed_maps,
        'kegg': opts.input_kegg_maps,
        'cazy': opts.input_cazy_maps
    }

    results_dictionary = {}
    dbname_weight = {}

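    # create the output folder and write the header row of the combined
    # functional and taxonomic table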
    checkOrCreateFolder(opts.output_dir)
    output_table_file = open(
        opts.output_dir + PATHDELIM + 'functional_and_taxonomic_table.txt',
        'w')
    fprintf(
        output_table_file,
        "ORF_ID\tORF_length\tstart\tend\tContig_Name\tContig_length\tstrand\tec\ttaxonomy\tproduct\n"
    )
    output_table_file.close()

    #    print "memory used  = %s" %(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss /1000000)
    listOfOrfs = get_list_of_queries(opts.input_annotated_gff)
    listOfOrfs.sort()

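    # if a BLAST/LAST results directory and sample name were given, discover
    # the per-database result files; otherwise use the lists supplied directly
    # on the command line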
    if opts.blastdir is not None and opts.sample_name is not None:
        try:
            database_names, input_blastouts, weight_dbs = getBlastFileNames(
                opts)
        except:
            traceback.print_exc(10)
    else:
        database_names = opts.database_name
        input_blastouts = opts.input_blastout
        weight_dbs = opts.weight_db

##### create the sorted, parsed BLAST/LAST result files consumed by the parsers below
    for dbname, blastoutput in zip(database_names, input_blastouts):
        create_sorted_parse_blast_files(dbname,
                                        blastoutput,
                                        listOfOrfs,
                                        verbose=opts.verbose,
                                        errorlogger=errorlogger)
#####

    # process the ORFs in blocks of size _stride
    lca = LCAComputation(opts.ncbi_taxonomy_map, opts.ncbi_megan_map)
    lca.setParameters(opts.lca_min_score, opts.lca_top_percent,
                      opts.lca_min_support)

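    # open a TSV parser over each sorted result file and cap the number of
    # parse errors it will tolerate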
    blastParsers = {}
    for dbname, blastoutput in zip(database_names, input_blastouts):
        blastParsers[dbname] = BlastOutputTsvParser(dbname,
                                                    blastoutput + '.tmp')
        blastParsers[dbname].setMaxErrorsLimit(5)
        blastParsers[dbname].setErrorAndWarningLogger(errorlogger)

    # this part of the code computes the occurrence of each taxon; these
    # counts are used in a later stage to evaluate the min support, as in
    # the MEGAN software

    start = 0
    Length = len(listOfOrfs)
    _stride = 100000
    Taxons = {}
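    # first pass: walk the ORFs in blocks, seed each picked ORF with 'root',
    # and record the taxon assigned to it by the min-support computation on
    # the RefSeq results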
    while start < Length:
        pickorfs = {}
        last = min(Length, start + _stride)
        for i in range(start, last):
            pickorfs[listOfOrfs[i]] = 'root'
        start = last
        #print 'Num of Min support orfs ' + str(start)

        results_dictionary = {}
        for dbname, blastoutput in zip(database_names, input_blastouts):
            results = re.search(r'refseq', dbname, re.I)
            if results:
                #if True:
                try:
                    results_dictionary[dbname] = {}
                    process_parsed_blastoutput(dbname, blastParsers[dbname],
                                               opts,
                                               results_dictionary[dbname],
                                               pickorfs)
                    #print results_dictionary[dbname].keys()[1:5]
                    lca.set_results_dictionary(results_dictionary)
                    lca.compute_min_support_tree(opts.input_annotated_gff,
                                                 pickorfs,
                                                 dbname=dbname)
                    for key, taxon in pickorfs.iteritems():
                        Taxons[key] = taxon
                except:
                    eprintf("ERROR: while training for min support tree %s\n",
                            dbname)
                    traceback.print_exc()

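    # re-open the parsers so the sorted '.tmp' files are read again from the
    # beginning for the final annotation pass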
    blastParsers = {}
    for dbname, blastoutput in zip(database_names, input_blastouts):
        blastParsers[dbname] = BlastOutputTsvParser(dbname,
                                                    blastoutput + '.tmp')

    # this loop determines the final taxonomy of each ORF,
    # taking the min support into consideration
    filePermTypes = {}
    start = 0
    outputfile = open(
        opts.output_dir + PATHDELIM + 'ORF_annotation_table.txt', 'w')

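    # map the canonical short names (seed/cog/kegg/cazy) to the database
    # names actually supplied, matching case-insensitively on the name prefix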
    short_to_long_dbnames = {}
    for dbname in database_names:
        results = re.search(r'^seed', dbname, re.IGNORECASE)
        if results:
            short_to_long_dbnames['seed'] = dbname

        results = re.search(r'^cog', dbname, re.IGNORECASE)
        if results:
            short_to_long_dbnames['cog'] = dbname

        results = re.search(r'^kegg', dbname, re.IGNORECASE)
        if results:
            short_to_long_dbnames['kegg'] = dbname

        results = re.search(r'^cazy', dbname, re.IGNORECASE)
        if results:
            short_to_long_dbnames['cazy'] = dbname

    standard_dbs = ['cog', 'seed', 'kegg', 'cazy']
    standard_db_maps = [
        opts.input_cog_maps, opts.input_seed_maps, opts.input_kegg_maps,
        opts.input_cazy_maps
    ]
    field_to_description = {}
    hierarchical_map = {}

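    # set up and load the field-description and hierarchy maps for each
    # standard functional database that was supplied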
    for db in standard_dbs:
        if db in short_to_long_dbnames:
            field_to_description[db] = {}
            hierarchical_map[db] = {}

    for dbname in standard_dbs:
        if dbname in short_to_long_dbnames:
            read_map_file(db_to_map_Maps[dbname],
                          field_to_description[dbname],
                          hierarchical_map[dbname])

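    # second pass: annotate each block of ORFs against every database, using
    # the taxa in Taxons computed above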
    while start < Length:
        pickorfs = {}
        last = min(Length, start + _stride)
        for i in range(start, last):
            pickorfs[listOfOrfs[i]] = True
        start = last
        gc.collect()
        eprintf(
            "\nMemory used  = %s MB\n",
            str(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000000))
        results_dictionary = {}
        for dbname, blastoutput in zip(database_names, input_blastouts):
            try:
                results_dictionary[dbname] = {}
                eprintf("Processing database : %s...", dbname)
                process_parsed_blastoutput(dbname, blastParsers[dbname], opts,
                                           results_dictionary[dbname],
                                           pickorfs)
                eprintf("done\n")
            except:
                traceback.print_exc()
                eprintf("ERROR: %s\n", dbname)
                pass
        # print dbname + ' ' + str(len(results_dictionary[dbname]))

        eprintf("Num orfs processed  : %s\n", str(start))

        # create the annotations now
        orfToContig = {}

        create_annotation(results_dictionary, database_names,
                          opts.input_annotated_gff, opts.output_dir, Taxons,
                          pickorfs, orfToContig, lca)

        for std_dbname, db_map_filename in zip(standard_dbs, standard_db_maps):
            if std_dbname in short_to_long_dbnames:
                create_table(
                    results_dictionary[short_to_long_dbnames[std_dbname]],
                    std_dbname, opts.output_dir, hierarchical_map,
                    field_to_description)


#             create_table(results_dictionary[dbname], opts.input_kegg_maps, 'kegg', opts.output_dir, filePermType)

        print_orf_table(results_dictionary, orfToContig, opts.output_dir,
                        outputfile)

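    # once all blocks are processed, write the hierarchical functional
    # summary tables for the standard databases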
    for std_dbname, db_map_filename in zip(standard_dbs, standard_db_maps):
        if std_dbname in short_to_long_dbnames:
            print_kegg_cog_tables(std_dbname,
                                  opts.output_dir,
                                  hierarchical_map,
                                  field_to_description,
                                  filePermType='w')

    outputfile.close()
    # now remove the temporary files
    for dbname, blastoutput in zip(database_names, input_blastouts):
        try:
            remove(blastoutput + '.tmp')
        except:
            pass