def ExtractPathway_WTD(options):
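    """Extract each pathway's ORFs and reactions from the sample PGDB and
    compute the weighted taxonomic distance (WTD) between the pathway's
    observed ORF taxonomy (an LCA over its ORFs) and MetaCyc's expected
    taxonomic range(s), then write the per-pathway table to options.table_out."""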
    # Extract pathways and WTD
    # place to store list of expected taxonomic range(s)
    printf('INFO\tEntering the WTD calculations!\n')
    serialized_metacyc_taxa_ranges = "/tmp/metacyc_pwy_taxa_range.pk"
    serialized_metacyc_taxa_ranges_tmp = "/tmp/metacyc_pwy_taxa_range.pk.tmp"
    try:
        if options.wtd and not path.isfile(serialized_metacyc_taxa_ranges):
            # get MetaCyc's expected taxonomic range(s) and serialize for later use in /tmp
            printf('INFO\tGetting MetaCyc Expected Taxonomic Range(s)\n')
            pythonCyc = startPathwayTools('meta', options.ptoolsExec, True)

            pwys = pythonCyc.getAllPathways()

            pwy_taxa_range = {}  # hash from pwy to expected taxonomic range(s)
            pwy_taxa_range_pk = open(serialized_metacyc_taxa_ranges_tmp, "w")

            # get expected taxonomic ranges for each pathway
            for pwy in pwys:
                my_expected_taxonomic_range = pythonCyc.getExpectedTaxonomicRange(
                    pwy)
                pwy_taxa_range[pwy] = my_expected_taxonomic_range

            # write the pathway
            pickle.dump(pwy_taxa_range, pwy_taxa_range_pk)
            pwy_taxa_range_pk.close()
            StopPathwayTools()
            rename(serialized_metacyc_taxa_ranges_tmp,
                   serialized_metacyc_taxa_ranges)
        else:
            # read expected taxonomic range from serialized file
            with open(serialized_metacyc_taxa_ranges, "r") as expected_taxa_in:
                pwy_taxa_range = pickle.load(expected_taxa_in)

        # create mapping of preferred NCBI to MEGAN taxonomy
        megan_map = {}
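        # each line of the MEGAN map file is expected to be tab-separated,
        # mapping an NCBI taxon to its preferred MEGAN name; a hypothetical
        # row: "562<TAB>Escherichia coli"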
        if options.ncbi_megan_map:
            with open(options.ncbi_megan_map) as megan_map_file:
                for line in megan_map_file:
                    fields = line.split("\t")
                    fields = map(str.strip, fields)
                    megan_map[fields[0]] = fields[1]

        # get ORF to taxa map from annotation_table
        printf("INFO\tGetting ORF to Taxa Map from AnnotationTable\n")
        orf_lca = {}
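        # fields[0] is the ORF id and fields[8] the taxonomy column of the
        # functional_and_taxonomic_table (see the header written in main);
        # hypothetical row (abridged): "O_1_0<TAB>...<TAB>Escherichia coli (562)"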
        with open(options.annotation_table) as f:
            for line in f:
                fields = line.split("\t")
                orf_lca[fields[0].strip()] = fields[8].strip()

        # get pathway ORFs and Rxns
        pwy_to_orfs = {}
        pwy_to_long = {}
        pwy_to_rxns = {}
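        # pwy_to_rxns[pwy] holds [num reactions, num covered reactions],
        # consumed below as the NUM_REACTIONS/NUM_COVERED_REACTIONS columns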
        try:
            pythonCyc = startPathwayTools(options.sample_name.lower(),
                                          options.ptoolsExec, True)
            pwys = pythonCyc.getAllPathways()

            for pwy in pwys:
                genes = pythonCyc.getPathwayORFs(pwy)
                rxns = pythonCyc.getPathwayReactionInfo(pwy)
                pwy_to_orfs[pwy] = genes
                pwy_to_long[pwy] = cleanup(
                    pythonCyc.get_slot_value(pwy, "common-name"))
                pwy_to_rxns[pwy] = rxns
            StopPathwayTools()

        except:
            print """
            Problem connecting to Pathway Tools. Check the /tmp/ptools-socket file.
            """
    except:
        print """
        Problem calculating WTD via Pathway Tools. Check the /tmp/ptools-socket file.
        """

    # get LCA per pathway
    pwy_lca = {}
    # load NCBI taxonomy map
    printf("INFO\tLoading NCBI Taxonomy Map\n")
    lca = LCAComputation([options.ncbi_tree])

    for pwy in pwy_to_orfs:
        orfs = pwy_to_orfs[pwy]
        taxa_ids = []
        for orf in orfs:
            if orf in orf_lca:
                # strip the taxon id out of the annotation, e.g. "Name (562)"
                res = re.search(r"(.+?)\(([0-9]+?)\)", orf_lca[orf])
                if res:
                    taxa_annotation = res.group(1)
                    taxon_id = res.group(2)
                else:
                    taxon_id = lca.get_a_Valid_ID([orf_lca[orf]])
                taxa_ids.append(taxon_id)
        pwy_lca_id = lca.get_lca(taxa_ids, True)
        # print "In run_pathologic"
        # print pwy_lca_id
        # print pwy_lca_id
        lca.clear_cells(taxa_ids)

        pwy_lca[pwy] = [pwy_lca_id, lca.translateIdToName(pwy_lca_id)]

    # calculate weighted taxonomic distance
    pwy_to_wtd = {}
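    # for each pathway, compare the observed LCA against every expected taxon
    # and report the maximum distance, i.e. the expected taxon closest to the
    # observed one; judging from the C_neg bookkeeping below, distances are
    # signed, with negative values penalizing observed taxa that fall outside
    # the expected range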
    for pwy in pwy_lca:

        C = []  # list of distances
        C_taxa = []  # list of parallel observed-expected taxa pairs
        C_pos = []  # list of non-negative distances
        C_pos_taxa = []  # list of parallel observed-expected taxa pairs
        C_neg = []  # list of negative distances
        C_neg_taxa = []  # list of parallel observed-expected taxa pairs

        if pwy in pwy_taxa_range and len(pwy_taxa_range[pwy]) > 0:
            for expected in pwy_taxa_range[pwy]:
                dist = lca.wtd(expected[0], pwy_lca[pwy][0])
                if dist is not None:
                    # valid distance; add to the respective lists
                    C.append(dist)  # add distance
                    C_taxa.append([expected[0], pwy_lca[pwy][0]])
                    if dist >= 0:
                        C_pos.append(dist)  # add to non-negative list
                        C_pos_taxa.append([expected[0], pwy_lca[pwy][0]])
                    else:
                        C_neg.append(dist)  # add to negative list
                        C_neg_taxa.append([expected[0], pwy_lca[pwy][0]])
                else:
                    print "Not a valid distance"
                    continue
        else:
            # no expected taxonomy, set to root
            min_taxa = "1"
            dist = lca.wtd(min_taxa, pwy_lca[pwy][0])
            # add distance respective lists
            C.append(dist)  # add distance
            C_taxa.append([min_taxa, pwy_lca[pwy][0]])
            if dist >= 0:
                C_pos.append(dist)  # add to non-negative list
                C_pos_taxa.append([min_taxa, pwy_lca[pwy][0]])
            else:
                C_neg.append(dist)  # add to negative list
                C_neg_taxa.append([min_taxa, pwy_lca[pwy][0]])

        # skip pathways for which no valid distance was computed
        if not C:
            continue

        # find index with max distance (closest to expected taxonomy)
        max_index, max_dist = max(enumerate(C), key=operator.itemgetter(1))
        max_taxa = C_taxa[max_index]

        # remap to preferred names
        observed = get_preferred_taxa_name(max_taxa[1], megan_map,
                                           lca.id_to_name)
        expected = get_preferred_taxa_name(max_taxa[0], megan_map,
                                           lca.id_to_name)

        pwy_to_wtd[pwy] = [max_dist, observed, expected]

    # write out pathway table
    table_out_tmp = options.table_out + ".tmp"
    try:
        out = open(table_out_tmp, "w")
    except IOError:
        print "Had problems opening file: " + options.table_out
        return

    # write appropriate header
    if options.wtd:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tWTD\tOBSERVED\tEXPECTED\tORFS\n"
    else:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tORFS\n"
    out.write(header)
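    # example output row under options.wtd (hypothetical values):
    # sample1<TAB>PWY-101<TAB>some pathway<TAB>4<TAB>3<TAB>12<TAB>-0.5<TAB>Escherichia coli<TAB>Bacteria<TAB>[O_1,O_2]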

    sample = options.sample_name  # sample name
    for pwy in pwy_to_orfs:
        # generate output line
        line = []
        line.append(sample)  # sample name
        line.append(pwy)  # pathway name
        line.append(pwy_to_long[pwy])  # pathway longname
        line.append(pwy_to_rxns[pwy][0])  # pathway num reactions
        line.append(pwy_to_rxns[pwy][1])  # pathway covered reactions
        line.append(len(pwy_to_orfs[pwy]))  # num orfs
        if options.wtd:
            if pwy in pwy_to_wtd:
                line.append(pwy_to_wtd[pwy][0])  # wtd
                line.append(pwy_to_wtd[pwy][1])  # wtd observed taxa
                line.append(pwy_to_wtd[pwy][2])  # wtd expected taxa
            else:
                line.extend(["NA", "NA", "NA"])  # no valid WTD for this pathway
        line.append("[" + ",".join(pwy_to_orfs[pwy]) + "]")  # list of ORFs

        line = map(str, line)  # cast all to string

        out.write("\t".join(line) + "\n")  # write out line
    try:
        out.close()  # close file
        rename(table_out_tmp, options.table_out)
    except:
        print "Had problems closing file: " + options.table_out
def main(argv, errorlogger=None, runstatslogger=None):
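    """Parse the per-database BLAST outputs for all ORFs, compute MEGAN-style
    LCA taxonomies (with min-support training on RefSeq hits) in strides of
    _stride ORFs, and write the functional and taxonomic annotation tables."""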
    global parser
    (opts, args) = parser.parse_args(argv)
    global opts_global
    opts_global = opts
    if not check_arguments(opts, args):
        print usage
        sys.exit(0)

    db_to_map_Maps = {
        'cog': opts.input_cog_maps,
        'seed': opts.input_seed_maps,
        'kegg': opts.input_kegg_maps,
        'cazy': opts.input_cazy_maps
    }

    results_dictionary = {}
    dbname_weight = {}

    checkOrCreateFolder(opts.output_dir)
    output_table_file = open(
        opts.output_dir + PATHDELIM + 'functional_and_taxonomic_table.txt',
        'w')
    fprintf(
        output_table_file,
        "ORF_ID\tORF_length\tstart\tend\tContig_Name\tContig_length\tstrand\tec\ttaxonomy\tproduct\n"
    )
    output_table_file.close()

    #    print "memory used  = %s" %(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss /1000000)
    listOfOrfs = get_list_of_queries(opts.input_annotated_gff)
    listOfOrfs.sort()

    if opts.blastdir != None and opts.sample_name != None:
        try:
            database_names, input_blastouts, weight_dbs = getBlastFileNames(
                opts)
        except:
            traceback.print_exc(10)
    else:
        database_names = opts.database_name
        input_blastouts = opts.input_blastout
        weight_dbs = opts.weight_db

    # create sorted, parsed BLAST result files, one per database
    for dbname, blastoutput in zip(database_names, input_blastouts):
        create_sorted_parse_blast_files(dbname,
                                        blastoutput,
                                        listOfOrfs,
                                        verbose=opts.verbose,
                                        errorlogger=errorlogger)

# process in blocks of size _stride
    lca = LCAComputation(opts.ncbi_taxonomy_map, opts.ncbi_megan_map)
    lca.setParameters(opts.lca_min_score, opts.lca_top_percent,
                      opts.lca_min_support)
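    # min score, top percent and min support are the MEGAN-style LCA cutoffs
    # (assumed semantics of LCAComputation.setParameters): hits below the
    # score or outside the top percent are ignored, and taxa with fewer than
    # min support ORFs are pushed up the tree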

    blastParsers = {}
    for dbname, blastoutput in zip(database_names, input_blastouts):
        blastParsers[dbname] = BlastOutputTsvParser(dbname,
                                                    blastoutput + '.tmp')
        blastParsers[dbname].setMaxErrorsLimit(5)
        blastParsers[dbname].setErrorAndWarningLogger(errorlogger)

    # this part of the code computes the occurrence of each taxon, which is
    # used in a later stage to evaluate the min support, as in the MEGAN
    # software

    start = 0
    Length = len(listOfOrfs)
    _stride = 100000
    Taxons = {}
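    # Taxons accumulates, for every ORF, the taxon chosen during this
    # min-support pass; it is handed to create_annotation in the second pass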
    while start < Length:
        pickorfs = {}
        last = min(Length, start + _stride)
        for i in range(start, last):
            pickorfs[listOfOrfs[i]] = 'root'
        start = last

        results_dictionary = {}
        for dbname, blastoutput in zip(database_names, input_blastouts):
            results = re.search(r'refseq', dbname, re.I)
            if results:
                try:
                    results_dictionary[dbname] = {}
                    process_parsed_blastoutput(dbname, blastParsers[dbname],
                                               opts,
                                               results_dictionary[dbname],
                                               pickorfs)
                    lca.set_results_dictionary(results_dictionary)
                    lca.compute_min_support_tree(opts.input_annotated_gff,
                                                 pickorfs,
                                                 dbname=dbname)
                    for key, taxon in pickorfs.iteritems():
                        Taxons[key] = taxon
                except:
                    eprintf("ERROR: while training for min support tree %s\n",
                            dbname)
                    traceback.print_exc()

    blastParsers = {}
    for dbname, blastoutput in zip(database_names, input_blastouts):
        blastParsers[dbname] = BlastOutputTsvParser(dbname,
                                                    blastoutput + '.tmp')

    # this loop determines the actual/final taxonomy of each of the ORFs
    # taking into consideration the min support
    filePermTypes = {}
    start = 0
    outputfile = open(opts.output_dir + '/ORF_annotation_table.txt', 'w')

    short_to_long_dbnames = {}
    for dbname in database_names:
        results = re.search(r'^seed', dbname, re.IGNORECASE)
        if results:
            short_to_long_dbnames['seed'] = dbname

        results = re.search(r'^cog', dbname, re.IGNORECASE)
        if results:
            short_to_long_dbnames['cog'] = dbname

        results = re.search(r'^kegg', dbname, re.IGNORECASE)
        if results:
            short_to_long_dbnames['kegg'] = dbname

        results = re.search(r'^cazy', dbname, re.IGNORECASE)
        if results:
            short_to_long_dbnames['cazy'] = dbname

    standard_dbs = ['cog', 'seed', 'kegg', 'cazy']
    standard_db_maps = [
        opts.input_cog_maps, opts.input_seed_maps, opts.input_kegg_maps,
        opts.input_cazy_maps
    ]
    field_to_description = {}
    hierarchical_map = {}

    for db in standard_dbs:
        if db in short_to_long_dbnames:
            field_to_description[db] = {}
            hierarchical_map[db] = {}

    for dbname in standard_dbs:
        if dbname in short_to_long_dbnames:
            try:
                read_map_file(db_to_map_Maps[dbname],
                              field_to_description[dbname],
                              hierarchical_map[dbname])
            except:
                raise

    while start < Length:
        pickorfs = {}
        last = min(Length, start + _stride)
        for i in range(start, last):
            pickorfs[listOfOrfs[i]] = True
        start = last
        gc.collect()
        eprintf(
            "\nMemory used  = %s MB\n",
            str(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000000))
        results_dictionary = {}
        for dbname, blastoutput in zip(database_names, input_blastouts):
            try:
                results_dictionary[dbname] = {}
                eprintf("Processing database : %s...", dbname)
                process_parsed_blastoutput(dbname, blastParsers[dbname], opts,
                                           results_dictionary[dbname],
                                           pickorfs)
                eprintf("done\n")
            except:
                traceback.print_exc()
                eprintf("ERROR: %s\n", dbname)
                pass

        eprintf("Num orfs processed  : %s\n", str(start))

        # create the annotations now
        orfToContig = {}

        create_annotation(results_dictionary, database_names,
                          opts.input_annotated_gff, opts.output_dir, Taxons,
                          pickorfs, orfToContig, lca)

        for std_dbname, db_map_filename in zip(standard_dbs, standard_db_maps):
            if std_dbname in short_to_long_dbnames:
                create_table(
                    results_dictionary[short_to_long_dbnames[std_dbname]],
                    std_dbname, opts.output_dir, hierarchical_map,
                    field_to_description)


        print_orf_table(results_dictionary, orfToContig, opts.output_dir,
                        outputfile)

    for std_dbname, db_map_filename in zip(standard_dbs, standard_db_maps):
        if std_dbname in short_to_long_dbnames:
            print_kegg_cog_tables(std_dbname,
                                  opts.output_dir,
                                  hierarchical_map,
                                  field_to_description,
                                  filePermType='w')

    outputfile.close()
    # now remove the temporary files
    for dbname, blastoutput in zip(database_names, input_blastouts):
        try:
            remove(blastoutput + '.tmp')
        except:
            pass
def main(argv, errorlogger=None, runstatslogger=None):
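    """Driver variant of the annotation step: parses each database's BLAST
    output, trains the min-support taxonomy on RefSeq hits, then annotates
    ORFs in strides and writes the functional and taxonomic tables."""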
    global parser
    (opts, args) = parser.parse_args(argv)
    global opts_global
    opts_global = opts
    if not check_arguments(opts, args):
        print usage
        sys.exit(0)


    db_to_map_Maps =  {'cog':opts.input_cog_maps, 'seed':opts.input_seed_maps, 'kegg':opts.input_kegg_maps, 'cazy':opts.input_cazy_maps}


    results_dictionary={}
    dbname_weight={}

    checkOrCreateFolder(opts.output_dir)
    output_table_file = open(opts.output_dir + PATHDELIM +'functional_and_taxonomic_table.txt', 'w')
    fprintf(output_table_file, "ORF_ID\tORF_length\tstart\tend\tContig_Name\tContig_length\tstrand\tec\ttaxonomy\tproduct\n")
    output_table_file.close()

#    print "memory used  = %s" %(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss /1000000)
    listOfOrfs =  get_list_of_queries(opts.input_annotated_gff)
    listOfOrfs.sort()



    # create sorted, parsed BLAST result files, one per database
    for dbname, blastoutput in zip(opts.database_name, opts.input_blastout):
      create_sorted_parse_blast_files(dbname, blastoutput, listOfOrfs, verbose= opts.verbose, errorlogger = errorlogger)

    # process in blocks of size _stride
    lca = LCAComputation(opts.ncbi_taxonomy_map, opts.ncbi_megan_map)
    lca.setParameters(opts.lca_min_score, opts.lca_top_percent, opts.lca_min_support)

    blastParsers={}
    for dbname, blastoutput in zip( opts.database_name, opts.input_blastout):
        blastParsers[dbname] =  BlastOutputTsvParser(dbname, blastoutput + '.tmp')
        blastParsers[dbname].setMaxErrorsLimit(5)
        blastParsers[dbname].setErrorAndWarningLogger(errorlogger)

    # this part of the code computes the occurrence of each taxon, which is
    # used in a later stage to evaluate the min support, as in the MEGAN
    # software

    start = 0
    Length = len(listOfOrfs)
    _stride = 100000
    Taxons = {}
    while start < Length:
       pickorfs= {}
       last =  min(Length, start + _stride)
       for i in range(start, last):
          pickorfs[listOfOrfs[i]]= 'root'
       start = last

       results_dictionary={}
       for dbname, blastoutput in zip( opts.database_name, opts.input_blastout):
          results = re.search(r'refseq', dbname, re.I)
          if results:
            try:
               results_dictionary[dbname]={}
               process_parsed_blastoutput(dbname, blastParsers[dbname], opts, results_dictionary[dbname], pickorfs)
               lca.set_results_dictionary(results_dictionary)
               lca.compute_min_support_tree(opts.input_annotated_gff, pickorfs, dbname = dbname )
               for key, taxon  in pickorfs.iteritems():
                   Taxons[key] = taxon
            except:
               eprintf("ERROR: while training for min support tree %s\n", dbname)
               import traceback
               traceback.print_exc()

    blastParsers={}
    for dbname, blastoutput in zip( opts.database_name, opts.input_blastout):
        blastParsers[dbname] =  BlastOutputTsvParser(dbname, blastoutput + '.tmp')

    # this loop determines the actual/final taxonomy of each of the ORFs 
    # taking into consideration the min support
    filePermTypes= {}
    start = 0
    outputfile = open( opts.output_dir +'/ORF_annotation_table.txt', 'w')


    short_to_long_dbnames = {}
    for dbname in opts.database_name:
      results = re.search(r'^seed', dbname,  re.IGNORECASE)
      if results:
          short_to_long_dbnames['seed'] = dbname

      results = re.search(r'^cog', dbname,  re.IGNORECASE)
      if results:
          short_to_long_dbnames['cog'] = dbname

      results = re.search(r'^kegg', dbname, re.IGNORECASE)
      if results:
          short_to_long_dbnames['kegg'] = dbname

      results = re.search(r'^cazy', dbname, re.IGNORECASE)
      if results:
          short_to_long_dbnames['cazy'] = dbname

    standard_dbs = ['cog', 'seed', 'kegg', 'cazy']
    standard_db_maps = [opts.input_cog_maps, opts.input_seed_maps, opts.input_kegg_maps, opts.input_cazy_maps]
    field_to_description = {}
    hierarchical_map = {}

    for db in standard_dbs:
      if db in short_to_long_dbnames:
        field_to_description[db] = {}
        hierarchical_map[db] = {}

    for dbname in standard_dbs:
       if dbname in short_to_long_dbnames:
          try:
            read_map_file(db_to_map_Maps[dbname], field_to_description[dbname], hierarchical_map[dbname])
          except:
            raise

    while start < Length:
       pickorfs= {}
       last =  min(Length, start + _stride)
       for  i in range(start, last):
          pickorfs[listOfOrfs[i]]= True
       start = last
       gc.collect()
       eprintf("\nMemory used  = %s MB\n", str(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1000000))
       results_dictionary={}
       for dbname, blastoutput in zip( opts.database_name, opts.input_blastout):
            try:
               results_dictionary[dbname]={}
               eprintf("Processing database %s...", dbname)
               process_parsed_blastoutput(dbname, blastParsers[dbname], opts, results_dictionary[dbname], pickorfs)
               eprintf("done\n")
            except:
               import traceback
               traceback.print_exc()
               eprintf("ERROR: %s\n", dbname)
               pass

       eprintf("Num orfs processed  : %s\n", str(start))

       # create the annotations now
       orfToContig = {}

       create_annotation(results_dictionary, opts.database_name,  opts.input_annotated_gff, opts.output_dir, Taxons, pickorfs, orfToContig, lca)

       for std_dbname, db_map_filename in zip(standard_dbs, standard_db_maps):
         if std_dbname in short_to_long_dbnames:
              create_table(results_dictionary[short_to_long_dbnames[std_dbname]], std_dbname,  opts.output_dir, hierarchical_map, field_to_description)


       print_orf_table(results_dictionary, orfToContig, opts.output_dir, outputfile)

    for std_dbname, db_map_filename in zip(standard_dbs, standard_db_maps):
       if std_dbname in short_to_long_dbnames:
          print_kegg_cog_tables(std_dbname, opts.output_dir, hierarchical_map, field_to_description,  filePermType = 'w')

    outputfile.close()
    # now remove the temporary files
    for dbname, blastoutput in zip( opts.database_name, opts.input_blastout):
        try:
           remove( blastoutput + '.tmp')
        except:
           pass
def main(argv):
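    """Compute the weighted taxonomic distance (WTD) for every pathway in the
    PGDB named by opts.pgdb_name via Pathway Tools, and write the pathway
    table (with WTD/OBSERVED/EXPECTED columns when opts.wtd is set) to
    opts.table_out."""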
    global parser
    (opts, args) = parser.parse_args()

    if not check_arguments(opts, args):
        print usage
        sys.exit(0)

    # place to store list of expected taxonomic range(s)
    serialized_metacyc_taxa_ranges = "/tmp/metacyc_pwy_taxa_range.pk"

    if opts.wtd and not os.path.isfile(serialized_metacyc_taxa_ranges):
        # get MetaCyc's expected taxonomic range(s) and serialize for later use in /tmp
        try:
            print "Getting MetaCyc Expected Taxonomic Range(s)"

            # connect to Pathway Tools
            cyc = PythonCyc()
            cyc.setOrganism("meta")
            cyc.setPToolsExec(opts.pathway_tools)
            cyc.startPathwayTools()

            pwys = cyc.getAllPathways()

            pwy_taxa_range = {}  # hash from pwy to expected taxonomic range(s)
            pwy_taxa_range_pk = open(serialized_metacyc_taxa_ranges, "w")

            # get expected taxonomic ranges for each pathway
            for pwy in pwys:
                my_expected_taxonomic_range = cyc.getExpectedTaxonomicRange(pwy)
                pwy_taxa_range[pwy] = my_expected_taxonomic_range

            # write the pathway
            pickle.dump(pwy_taxa_range, pwy_taxa_range_pk)
            pwy_taxa_range_pk.close()

            # close Pathway Tools
            cyc.stopPathwayTools()
        except:
            print """
            Problem connecting to Pathway Tools. Check the /tmp/ptools-socket file.
            """
    else:
        # read expected taxonomic range from serialized file
        with open(serialized_metacyc_taxa_ranges, "r") as expected_taxa_in:
            pwy_taxa_range = pickle.load(expected_taxa_in)

    # create mapping of preferred NCBI to MEGAN taxonomy
    megan_map = {}
    if opts.ncbi_megan_map:
        with open(opts.ncbi_megan_map) as megan_map_file:
            for line in megan_map_file:
                fields = line.split("\t")
                fields = map(str.strip, fields)
                megan_map[fields[0]] = fields[1]

    # get ORF to taxa map from annotation_table
    print "Getting ORF to Taxa Map from AnnotationTable"
    orf_lca = {}
    with open(opts.annotation_table) as f:
        for line in f:
            fields = line.split("\t")
            orf_lca[fields[0].strip()] = fields[8].strip()

    # get pathway ORFs and Rxns
    pwy_to_orfs = {}
    pwy_to_long = {}
    pwy_to_rxns = {}
    try:
        cyc = PythonCyc()
        cyc.setOrganism(opts.pgdb_name)
        cyc.setPToolsExec(opts.pathway_tools)
        cyc.startPathwayTools()
        pwys = cyc.getAllPathways()
        for pwy in pwys:
            genes = cyc.getPathwayORFs(pwy)
            rxns = cyc.getPathwayReactionInfo(pwy)
            pwy_to_orfs[pwy] = genes
            pwy_to_long[pwy] = cleanup(cyc.get_slot_value(pwy, "common-name"))
            pwy_to_rxns[pwy] = rxns

        cyc.stopPathwayTools()
    except:
        print """
        Problem connecting to Pathway Tools. Check the /tmp/ptools-socket file.
        """

    # get LCA per pathway
    pwy_lca = {}
    # load NCBI taxonomy map
    print "Loading NCBI Taxonomy Map"
    lca = LCAComputation([opts.ncbi_tree])
    lca.setParameters(opts.lca_min_score, opts.lca_top_percent, opts.lca_min_support)

    for pwy in pwy_to_orfs:
        orfs = pwy_to_orfs[pwy]
        taxa_ids = []
        for orf in orfs:
            if orf in orf_lca:
                taxon_id = lca.get_a_Valid_ID([orf_lca[orf]])
                taxa_ids.append(taxon_id)
        pwy_lca_id = lca.get_lca(taxa_ids, True)
        lca.clear_cells(taxa_ids)

        pwy_lca[pwy] = [pwy_lca_id, lca.translateIdToName(pwy_lca_id)]

    # calculate weighted taxonomic distance
    pwy_to_wtd = {}
    for pwy in pwy_lca:

        C = []  # list of distances
        C_taxa = []  # list of parallel observed-expected taxa pairs
        C_pos = []  # list of non-negative distances
        C_pos_taxa = []  # list of parallel observed-expected taxa pairs
        C_neg = []  # list of negative distances
        C_neg_taxa = []  # list of parallel observed-expected taxa pairs
        if pwy in pwy_taxa_range:
            if len(pwy_taxa_range[pwy]) > 0:
                for expected in pwy_taxa_range[pwy]:
                    dist = lca.wtd(expected[0], pwy_lca[pwy][0])
                    if dist is not None:
                        # valid distance; add to the respective lists
                        C.append(dist)  # add distance
                        C_taxa.append([expected[0], pwy_lca[pwy][0]])
                        if dist >= 0:
                            C_pos.append(dist)  # add to non-negative list
                            C_pos_taxa.append([expected[0], pwy_lca[pwy][0]])
                        else:
                            C_neg.append(dist)  # add to negative list
                            C_neg_taxa.append([expected[0], pwy_lca[pwy][0]])
                    else:
                        print "Not a valid distance"
                        continue
            else:
                # no expected taxonomy, set to root
                min_taxa = "1"
                dist = lca.wtd(min_taxa, pwy_lca[pwy][0])
                # add distance respective lists
                C.append(dist)  # add distance
                C_taxa.append([min_taxa, pwy_lca[pwy][0]])
                if dist >= 0:
                    C_pos.append(dist)  # add to non-negative list
                    C_pos_taxa.append([min_taxa, pwy_lca[pwy][0]])
                else:
                    C_neg.append(dist)  # add to negative list
                    C_neg_taxa.append([min_taxa, pwy_lca[pwy][0]])

            # skip pathways for which no valid distance was computed
            if not C:
                continue

            # find index with max distance (closest to expected taxonomy)
            max_index, max_dist = max(enumerate(C), key=operator.itemgetter(1))
            max_taxa = C_taxa[max_index]

            # remap to preferred names
            observed = get_preferred_taxa_name(max_taxa[1], megan_map, lca.id_to_name)
            expected = get_preferred_taxa_name(max_taxa[0], megan_map, lca.id_to_name)

            pwy_to_wtd[pwy] = [max_dist, observed, expected]

    # write out pathway table
    try:
        out = open(opts.table_out, "w")
    except IOError:
        print "Had problems opening file: " + opts.table_out
        sys.exit(1)

    # write appropriate header
    if opts.wtd:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tWTD\tOBSERVED\tEXPECTED\tORFS\n"
    else:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tORFS\n"
    out.write(header)

    sample = opts.pgdb_name  # sample name
    for pwy in pwy_to_orfs:
        # generate output line
        line = []
        line.append(sample)  # sample name
        line.append(pwy)  # pathway name
        line.append(pwy_to_long[pwy])  # pathway longname
        line.append(pwy_to_rxns[pwy][0])  # pathway num reactions
        line.append(pwy_to_rxns[pwy][1])  # pathway covered reactions
        line.append(len(pwy_to_orfs[pwy]))  # num orfs
        if opts.wtd:
            if pwy in pwy_to_wtd:
                line.append(pwy_to_wtd[pwy][0])  # wtd
                line.append(pwy_to_wtd[pwy][1])  # wtd observed taxa
                line.append(pwy_to_wtd[pwy][2])  # wtd expected taxa
            else:
                line.append("NA")
                line.append("NA")
                line.append("NA")
        line.append("[" + ",".join(pwy_to_orfs[pwy]) + "]")  # list of ORFs

        line = map(str, line)  # cast all to string

        out.write("\t".join(line) + "\n")  # write out line
    try:
        out.close()  # close file
    except:
        print "Had problems closing file: " + opts.table_out
def ExtractPathway_WTD(options):
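    """Variant of ExtractPathway_WTD with per-pathway progress printing and
    insert_error reporting: extracts pathway ORFs and reactions from the
    sample PGDB, computes the WTD against MetaCyc's expected taxonomic
    ranges, and writes the pathway table to options.table_out."""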
    # Extract pathways and WTD
    # place to store list of expected taxonomic range(s)
    printf('\n')
    printf('INFO\tEntering the WTD calculations!\n')
    serialized_metacyc_taxa_ranges = "/tmp/metacyc_pwy_taxa_range.pk"

    try:
        if options.wtd and not path.isfile(serialized_metacyc_taxa_ranges):
            # get MetaCyc's expected taxonomic range(s) and serialize for later use in /tmp
            # try:
            printf('INFO\tGetting MetaCyc Expected Taxonomic Range(s)\n')
            pythonCyc = startPathwayTools('meta', options.ptoolsExec, True)

            pwys = pythonCyc.getAllPathways()

            pwy_taxa_range = {} # hash from pwy to expected taxonomic range(s)
            pwy_taxa_range_pk = open(serialized_metacyc_taxa_ranges, "w")

            # get expected taxonomic ranges for each pathway
            for pwy in pwys:
                printf(" " + pwy) 
                my_expected_taxonomic_range = pythonCyc.getExpectedTaxonomicRange(pwy)
                pwy_taxa_range[pwy] = my_expected_taxonomic_range

            # write the pathway
            pickle.dump(pwy_taxa_range, pwy_taxa_range_pk)
            pwy_taxa_range_pk.close()
            StopPathwayTools()

        # read expected taxonomic range from serialized file
        with open(serialized_metacyc_taxa_ranges, "r") as expected_taxa_in:
            pwy_taxa_range = pickle.load(expected_taxa_in)

        # create mapping of preferred NCBI to MEGAN taxonomy
        megan_map = {}
        if options.ncbi_megan_map:
            with open(options.ncbi_megan_map) as megan_map_file:
                for line in megan_map_file:
                    fields = line.split("\t")
                    fields = map(str.strip, fields)
                    megan_map[ fields[0] ] = fields[1]

        # get ORF to taxa map from annotation_table
        printf("INFO\tGetting ORF to Taxa Map from AnnotationTable\n")
        orf_lca = {}
        with open(options.annotation_table) as f:
            for line in f:
                fields = line.split("\t")
                orf_lca[fields[0].strip()] = fields[8].strip()

        # get pathway ORFs and Rxns
        pwy_to_orfs = {}
        pwy_to_long = {}
        pwy_to_rxns = {}
        try:
            pythonCyc = startPathwayTools(options.sample_name.lower(), options.ptoolsExec, True)
            pwys = pythonCyc.getAllPathways()

            for pwy in pwys:
                printf(" " + pwy)
                genes = pythonCyc.getPathwayORFs(pwy)
                rxns = pythonCyc.getPathwayReactionInfo(pwy)
                pwy_to_orfs[pwy] = genes
                pwy_to_long[pwy] = cleanup(pythonCyc.get_slot_value(pwy, "common-name"))
                pwy_to_rxns[pwy] = rxns
            StopPathwayTools()

        except:
            insert_error(9)
            print """
            Problem connecting to Pathway Tools. Check the /tmp/ptools-socket file.
            """
    except:
        print """
        Problem calculating WTD via Pathway Tools. Check the /tmp/ptools-socket file.
        """
        insert_error(9)

    # get LCA per pathway
    pwy_lca = {}
    # load NCBI taxonomy map
    printf("\nINFO\tLoading NCBI Taxonomy Map\n")
    lca = LCAComputation([options.ncbi_tree])

    for pwy in pwy_to_orfs:
        orfs = pwy_to_orfs[pwy]
        taxa_ids = []
        for orf in orfs:
            if orf in orf_lca:
                # strip the taxon id out of the annotation, e.g. "Name (562)"
                res = re.search(r"(.+?)\(([0-9]+?)\)", orf_lca[orf])
                if res:
                    taxa_annotation = res.group(1)
                    taxon_id = res.group(2)
                else:
                    taxon_id = lca.get_a_Valid_ID([orf_lca[orf]])
                taxa_ids.append(taxon_id)
        pwy_lca_id = lca.get_lca(taxa_ids, True)
        # print "In run_pathologic"
        # print pwy_lca_id
        # print pwy_lca_id
        lca.clear_cells(taxa_ids)

        pwy_lca[pwy] = [pwy_lca_id, lca.translateIdToName(pwy_lca_id)]

    # calculate weighted taxonomic distance
    pwy_to_wtd = {}
    printf("INFO\tCalculating WTD\n")

    for pwy in pwy_lca:

        C = [] # list of distances
        C_taxa = [] # list of parallel observed-expected taxa pairs
        C_pos = [] # list of non-negative distances
        C_pos_taxa = [] # list of parallel observed-expected taxa pairs
        C_neg = [] # list of negative distances
        C_neg_taxa = [] # list of parallel observed-expected taxa pairs

        if pwy in pwy_taxa_range and len(pwy_taxa_range[pwy]) > 0:
            for expected in pwy_taxa_range[pwy]:
                dist = lca.wtd(expected[0], pwy_lca[pwy][0])
                if dist is not None:
                    # valid distance; add to the respective lists
                    C.append(dist) # add distance
                    C_taxa.append([ expected[0], pwy_lca[pwy][0] ])
                    if dist >= 0:
                        C_pos.append(dist)  # add to non-negative list
                        C_pos_taxa.append([ expected[0], pwy_lca[pwy][0] ])
                    else:
                        C_neg.append(dist)  # add to negative list
                        C_neg_taxa.append([ expected[0], pwy_lca[pwy][0] ])
                else:
                    print "Not a valid distance"
                    continue
        else:
            # no expected taxonomy, set to root
            min_taxa = "1"
            dist = lca.wtd(min_taxa, pwy_lca[pwy][0])
            # add distance respective lists
            C.append(dist) # add distance
            C_taxa.append([ min_taxa, pwy_lca[pwy][0] ])
            if dist >= 0:
                C_pos.append(dist)  # add to non-negative list
                C_pos_taxa.append([ min_taxa, pwy_lca[pwy][0] ])
            else:
                C_neg.append(dist)  # add to negative list
                C_neg_taxa.append([ min_taxa, pwy_lca[pwy][0] ])

        # skip pathways for which no valid distance was computed
        if not C:
            continue

        # find index with max distance (closest to expected taxonomy)
        max_index, max_dist = max(enumerate(C), key=operator.itemgetter(1))
        max_taxa = C_taxa[max_index]

        # remap to preferred names
        observed = get_preferred_taxa_name(max_taxa[1], megan_map, lca.id_to_name)
        expected = get_preferred_taxa_name(max_taxa[0], megan_map, lca.id_to_name)

        pwy_to_wtd[pwy] = [ max_dist, observed, expected ]

    # write out pathway table
    table_out_tmp = options.table_out + ".tmp"
    try:
        out = open(table_out_tmp, "w")
    except IOError:
        print "Had problems opening file: " + options.table_out
        insert_error(9)
        return

    # write appropriate header
    if options.wtd:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tWTD\tOBSERVED\tEXPECTED\tORFS\n"
    else:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tORFS\n"
    out.write(header)

    sample = options.sample_name # sample name
    for pwy in pwy_to_orfs:
        # generate output line
        line = []
        line.append(sample) # sample name
        line.append(pwy) # pathway name
        line.append(pwy_to_long[pwy]) # pathway longname
        line.append(pwy_to_rxns[pwy][0]) # pathway num reactions
        line.append(pwy_to_rxns[pwy][1]) # pathway covered reactions
        line.append(len(pwy_to_orfs[pwy])) # num orfs
        if options.wtd:
            if pwy in pwy_to_wtd:
                line.append(pwy_to_wtd[pwy][0]) # wtd
                line.append(pwy_to_wtd[pwy][1]) # wtd observed taxa
                line.append(pwy_to_wtd[pwy][2]) # wtd expected taxa
            else:
                line.extend(["NA", "NA", "NA"]) # no valid WTD for this pathway
        line.append("[" + ",".join(pwy_to_orfs[pwy]) + "]") # list of ORFs

        line = map(str, line) # cast all to string

        out.write("\t".join(line) + "\n") # write out line
    try:
        out.close() # close file
        rename(table_out_tmp, options.table_out)
    except:
        print "Had problems closing file: " + options.table_out
        insert_error(9)
def main(argv):
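    """Compute the weighted taxonomic distance (WTD) for every pathway in the
    PGDB named by opts.pgdb_name via Pathway Tools and write the pathway
    table to opts.table_out."""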
    global parser
    (opts, args) = parser.parse_args()

    if not check_arguments(opts, args):
        print(usage)
        sys.exit(0)

    # place to store list of expected taxonomic range(s)
    serialized_metacyc_taxa_ranges = "/tmp/metacyc_pwy_taxa_range.pk"

    if opts.wtd and not os.path.isfile(serialized_metacyc_taxa_ranges):
        # get MetaCyc's expected taxonomic range(s) and serialize for later use in /tmp
        try:
            print('Getting MetaCyc Expected Taxonomic Range(s)')

            # connect to Pathway Tools
            cyc = PythonCyc()
            cyc.setOrganism('meta')
            cyc.setPToolsExec(opts.pathway_tools)
            cyc.startPathwayTools()

            pwys = cyc.getAllPathways()

            pwy_taxa_range = {} # hash from pwy to expected taxonomic range(s)
            pwy_taxa_range_pk = open(serialized_metacyc_taxa_ranges, "w")

            # get expected taxonomic ranges for each pathway
            for pwy in pwys:
                my_expected_taxonomic_range = cyc.getExpectedTaxonomicRange(pwy)
                pwy_taxa_range[pwy] = my_expected_taxonomic_range

            # write the pathway
            pickle.dump(pwy_taxa_range, pwy_taxa_range_pk)
            pwy_taxa_range_pk.close()

            # close Pathway Tools
            cyc.stopPathwayTools()
        except:
            print( """
            Problem connecting to Pathway Tools. Check the /tmp/ptools-socket file.
            """)
    else:
        # read expected taxonomic range from serialized file
        with open(serialized_metacyc_taxa_ranges, "r") as expected_taxa_in:
            pwy_taxa_range = pickle.load(expected_taxa_in)

    # create mapping of preferred NCBI to MEGAN taxonomy
    megan_map = {}
    if opts.ncbi_megan_map:
        with open(opts.ncbi_megan_map) as megan_map_file:
            for line in megan_map_file:
                fields = line.split("\t")
                fields = [field.strip() for field in fields]  # map() is lazy under Python 3, so indexing below would fail
                megan_map[ fields[0] ] = fields[1]

    # get ORF to taxa map from annotation_table
    print("Getting ORF to Taxa Map from AnnotationTable")
    orf_lca = {}
    with open(opts.annotation_table) as f:
        for line in f:
            fields = line.split("\t")
            orf_lca[fields[0].strip()] = fields[8].strip()

    # get pathway ORFs and Rxns
    pwy_to_orfs = {}
    pwy_to_long = {}
    pwy_to_rxns = {}
    try:
        cyc = PythonCyc()
        cyc.setOrganism(opts.pgdb_name)
        cyc.setPToolsExec(opts.pathway_tools)
        cyc.startPathwayTools()
        pwys = cyc.getAllPathways()
        for pwy in pwys:
            genes = cyc.getPathwayORFs(pwy)
            rxns = cyc.getPathwayReactionInfo(pwy)
            pwy_to_orfs[pwy] = genes
            pwy_to_long[pwy] = cleanup(cyc.get_slot_value(pwy, "common-name"))
            pwy_to_rxns[pwy] = rxns

        cyc.stopPathwayTools()
    except:
        print("""
        Problem connecting to Pathway Tools. Check the /tmp/ptools-socket file.
        """)

    # get LCA per pathway
    pwy_lca = {}
    # load NCBI taxonomy map
    print("Loading NCBI Taxonomy Map")
    lca = LCAComputation([ opts.ncbi_tree ])
    lca.setParameters(opts.lca_min_score, opts.lca_top_percent, opts.lca_min_support)

    for pwy in pwy_to_orfs:
        orfs = pwy_to_orfs[pwy]
        taxa_ids = []
        for orf in orfs:
            if orf in orf_lca:
                taxon_id = lca.get_a_Valid_ID([orf_lca[orf]])
                taxa_ids.append(taxon_id)
        pwy_lca_id = lca.get_lca(taxa_ids, True)
        lca.clear_cells(taxa_ids)

        pwy_lca[pwy] = [pwy_lca_id, lca.translateIdToName(pwy_lca_id)]

    # calculate weighted taxonomic distance
    pwy_to_wtd = {}
    for pwy in pwy_lca:

        C = [] # list of distances
        C_taxa = [] # list of parallel observed-expected taxa pairs
        C_pos = [] # list of non-negative distances
        C_pos_taxa = [] # list of parallel observed-expected taxa pairs
        C_neg = [] # list of negative distances
        C_neg_taxa = [] # list of parallel observed-expected taxa pairs
        if pwy in pwy_taxa_range:
            if len(pwy_taxa_range[pwy]) > 0:
                for expected in pwy_taxa_range[pwy]:
                    dist = lca.wtd(expected[0], pwy_lca[pwy][0])
                    if dist is not None:
                        # valid distance; add to the respective lists
                        C.append(dist) # add distance
                        C_taxa.append([ expected[0], pwy_lca[pwy][0] ])
                        if dist >= 0:
                            C_pos.append(dist)  # add to non-negative list
                            C_pos_taxa.append([ expected[0], pwy_lca[pwy][0] ])
                        else:
                            C_neg.append(dist)  # add to negative list
                            C_neg_taxa.append([ expected[0], pwy_lca[pwy][0] ])
                    else:
                        print("Not a valid distance")
                        continue
            else:
                # no expected taxonomy, set to root
                min_taxa = "1"
                dist = lca.wtd(min_taxa, pwy_lca[pwy][0])
                # add distance respective lists
                C.append(dist) # add distance
                C_taxa.append([ min_taxa, pwy_lca[pwy][0] ])
                if dist >= 0:
                    C_pos.append(dist)  # add to non-negative list
                    C_pos_taxa.append([ min_taxa, pwy_lca[pwy][0] ])
                else:
                    C_neg.append(dist)  # add to negative list
                    C_neg_taxa.append([ min_taxa, pwy_lca[pwy][0] ])

            # skip pathways for which no valid distance was computed
            if not C:
                continue

            # find index with max distance (closest to expected taxonomy)
            max_index, max_dist = max(enumerate(C), key=operator.itemgetter(1))
            max_taxa = C_taxa[max_index]

            # remap to preferred names
            observed = get_preferred_taxa_name(max_taxa[1], megan_map, lca.id_to_name)
            expected = get_preferred_taxa_name(max_taxa[0], megan_map, lca.id_to_name)

            pwy_to_wtd[pwy] = [ max_dist, observed, expected ]

    # write out pathway table
    try:
        out = open(opts.table_out, "w")
    except IOError:
        print("Had problems opening file: " + opts.table_out)
        sys.exit(1)

    # write appropriate header
    if opts.wtd:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tWTD\tOBSERVED\tEXPECTED\tORFS\n"
    else:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tORFS\n"
    out.write(header)

    sample = opts.pgdb_name # sample name
    for pwy in pwy_to_orfs:
        # generate output line
        line = []
        line.append(sample) # sample name
        line.append(pwy) # pathway name
        line.append(pwy_to_long[pwy]) # pathway longname
        line.append(pwy_to_rxns[pwy][0]) # pathway num reactions
        line.append(pwy_to_rxns[pwy][1]) # pathway covered reactions
        line.append(len(pwy_to_orfs[pwy])) # num orfs
        if opts.wtd:
            if pwy in pwy_to_wtd:
                line.append(pwy_to_wtd[pwy][0]) # wtd
                line.append(pwy_to_wtd[pwy][1]) # wtd observed taxa
                line.append(pwy_to_wtd[pwy][2]) # wtd expected taxa
            else:
                line.append("NA")
                line.append("NA")
                line.append("NA")
        line.append("[" + ",".join(pwy_to_orfs[pwy]) + "]") # list of ORFs

        line = map(str, line) # cast all to string

        out.write("\t".join(line) + "\n") # write out line
    try:
        out.close() # close file
    except:
        print("Had problems closing file: " + opts.table_out)