Example #1
def compute_fasta_stats(formats, input_file, seqtype, priority):
    MIN_LENGTH = 'MIN_LENGTH'
    MAX_LENGTH = 'MAX_LENGTH'
    NUMSEQ = 'NUMSEQ'
    TOTAL_LENGTH = 'TOTAL_LENGTH'
    AVG_LENGTH = 'AVG_LENGTH'

    stats = {
        MIN_LENGTH: 0,
        MAX_LENGTH: 0,
        NUMSEQ: 0,
        TOTAL_LENGTH: 0,
        AVG_LENGTH: 0
    }

    """ min length """
    _MAX = 1000000000000
    stats[MAX_LENGTH] = -(_MAX)
    stats[MIN_LENGTH]= _MAX

    fastareader = FastaReader(input_file)

    """ process one fasta sequence at a time """
    lengths_str=""
    for record in fastareader:
        seqname = record.name
        seq = record.sequence
        length = len(seq)
        
        stats[NUMSEQ] += 1
        
        stats[TOTAL_LENGTH] += length

        if stats[MIN_LENGTH] > length:
           stats[MIN_LENGTH] = length

        if stats[MAX_LENGTH] < length:
           stats[MAX_LENGTH] = length

    if stats[NUMSEQ] > 0:
        stats[AVG_LENGTH] = stats[TOTAL_LENGTH] / stats[NUMSEQ]
    else:
        stats[AVG_LENGTH] = 0
        stats[MIN_LENGTH] = 0
        stats[MAX_LENGTH] = 0

    #     printf("%s\tNumber of sequences in input file BEFORE QC (%s)\t%s\n" %(str(priority), opts.seqtype,  str(stats[NUMSEQ][BEFORE])) )

    #     printf("%s\tNumber of sequences AFTER QC (%s)\t%s\n" %(str(priority + 5), opts.seqtype, str(stats[NUMSEQ][AFTER])))
    printf(formats % (str(priority + 5), str(stats[NUMSEQ])))
    printf("%s\t-min length\t%s\n" % (str(priority + 6), str(stats[MIN_LENGTH])))
    printf("%s\t-avg length\t%s\n" % (str(priority + 7), str(int(stats[AVG_LENGTH]))))
    printf("%s\t-max length\t%s\n" % (str(priority + 8), str(stats[MAX_LENGTH])))
    printf("%s\t-total base pairs (bp)\t%s\n" % (str(priority + 9), str(stats[TOTAL_LENGTH])))
Example #2
def main(argv, errorlogger=None, runcommand=None, runstatslogger=None):
    global parser

    options, args = parser.parse_args(argv)

    # is there a pathwaytools executable installed (check disabled here by the leading False)
    if False and not path.exists(options.ptoolsExec):
        eprintf("ERROR\tPathwayTools executable %s not found!\n",
                options.ptoolsExec)
        if errorlogger:
            errorlogger.printf(
                "ERROR\tPathwayTools executable %s not found!\n",
                options.ptoolsExec)
        exit_process("ERROR\tPathwayTools executable %s not found!\n" %
                     (options.ptoolsExec))

    # command to build the ePGDB
    command = "%s " % (options.ptoolsExec)
    command += " -api"

    pythonCyc = startPathwayTools(options.sample_name.lower(),
                                  options.ptoolsExec, True)
    #resultLines = pythonCyc.getReactionListLines()
    resultLines = pythonCyc.getFlatFiles()
    StopPathwayTools()
    try:
        if False:  # reaction-list extraction currently disabled
            pythonCyc = startPathwayTools(options.sample_name.lower(),
                                          options.ptoolsExec, True)
            pythonCyc.setDebug()  # disable pathway debug statements
            printf("INFO\tExtracting the reaction list from ePGDB " +
                   options.sample_name + "\n")
            resultLines = pythonCyc.getReactionListLines()
            #pythonCyc.stopPathwayTools()
            reaction_list_file = open(options.reactions_list + ".tmp", 'w')
            for line in resultLines:
                fprintf(reaction_list_file, "%s\n", line.strip())
            reaction_list_file.close()
            StopPathwayTools()

    except:
        traceback.print_exc(10)
        eprintf("ERROR\tFailed to run extract pathways for %s : \n" %
                (options.sample_name))
        eprintf(
            "INFO\tKill any other PathwayTools instance running on the machine and try again"
        )
        if errorlogger:
            errorlogger.write(
                "ERROR\tFailed to run extract pathways for %s : " %
                (options.sample_name))
            errorlogger.write(
                "INFO\tKill any other PathwayTools instance running on the machine and try again\n"
            )
        StopPathwayTools()
def dry_run_status(commands):
    for command in commands:
        printf("%s", command[0])
        if command[4] == True:
            printf("%s", " Required")
        else:
            printf("%s", " Not Required")
    printf("\n")
def ExtractPathway_WTD(options):
    # Extract pathways and WTD
    # place to store list of expected taxonomic range(s)
    printf('INFO\tEntering the WTD calculations!\n')
    serialized_metacyc_taxa_ranges = "/tmp/metacyc_pwy_taxa_range.pk"
    serialized_metacyc_taxa_ranges_tmp = "/tmp/metacyc_pwy_taxa_range.pk.tmp"
    try:
        if options.wtd and not path.isfile(serialized_metacyc_taxa_ranges):
            # get MetaCyc's expected taxonomic range(s) and serialize for later use in /tmp
            # try:
            printf('INFO\tGetting MetaCyc Expected Taxonomic Range(s)\n')
            pythonCyc = startPathwayTools('meta', options.ptoolsExec, True)

            pwys = pythonCyc.getAllPathways()

            pwy_taxa_range = {}  # hash from pwy to expected taxonomic range(s)
            pwy_taxa_range_pk = open(serialized_metacyc_taxa_ranges_tmp, "w")

            # get expected taxonomic ranges for each pathway
            for pwy in pwys:
                # printf(" " + pwy)
                my_expected_taxonomic_range = pythonCyc.getExpectedTaxonomicRange(
                    pwy)
                pwy_taxa_range[pwy] = my_expected_taxonomic_range
            # printf(" " + pwy)

            # write the pathway
            pickle.dump(pwy_taxa_range, pwy_taxa_range_pk)
            pwy_taxa_range_pk.close()
            StopPathwayTools()
            rename(serialized_metacyc_taxa_ranges_tmp,
                   serialized_metacyc_taxa_ranges)
        else:
            # read expected taxonomic range from serialized file
            expected_taxa_in = open(serialized_metacyc_taxa_ranges, "r")
            pwy_taxa_range = pickle.load(expected_taxa_in)

        # create mapping of preferred NCBI to MEGAN taxonomy
        megan_map = {}
        if options.ncbi_megan_map:
            with open(options.ncbi_megan_map) as megan_map_file:
                for line in megan_map_file:
                    fields = line.split("\t")
                    fields = map(str.strip, fields)
                    megan_map[fields[0]] = fields[1]
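                    # assumed layout: a two-column TSV mapping an NCBI taxon id/name
                    # to its MEGAN-preferred name (inferred from this usage)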

        # get ORF to taxa map from annotation_table
        printf("INFO\tGetting ORF to Taxa Map from AnnotationTable\n")
        orf_lca = {}
        with open(options.annotation_table) as f:
            for line in f:
                fields = line.split("\t")
                orf_lca[fields[0].strip()] = fields[8].strip()
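                # column positions assumed from this usage: column 0 is the ORF id,
                # column 8 the LCA taxonomy string (e.g. "Bacteria (2)")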

        # get pathway ORFs and Rxns
        pwy_to_orfs = {}
        pwy_to_long = {}
        pwy_to_rxns = {}
        try:
            pythonCyc = startPathwayTools(options.sample_name.lower(),
                                          options.ptoolsExec, True)
            pwys = pythonCyc.getAllPathways()

            for pwy in pwys:
                # printf(" " + pwy)
                genes = pythonCyc.getPathwayORFs(pwy)
                rxns = pythonCyc.getPathwayReactionInfo(pwy)
                pwy_to_orfs[pwy] = genes
                pwy_to_long[pwy] = cleanup(
                    pythonCyc.get_slot_value(pwy, "common-name"))
                pwy_to_rxns[pwy] = rxns
            # printf("\n")
            StopPathwayTools()

        except:
            print """
            Problem connecting to Pathway Tools. Check the /tmp/ptools-socket file.
            """
    except:
        print """
        Problem calculating WTD via Pathway Tools. Check the /tmp/ptools-socket file.
        """

    # get LCA per pathway
    pwy_lca = {}
    # load NCBI taxonomy map
    printf("INFO\tLoading NCBI Taxonomy Map\n")
    lca = LCAComputation([options.ncbi_tree], )

    for pwy in pwy_to_orfs:
        orfs = pwy_to_orfs[pwy]
        taxa_ids = []
        for orf in orfs:
            if orf in orf_lca:
                # could strip out id here
                res = re.search("(.+?)\(([0-9]+?)\)", orf_lca[orf])
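                # illustrative parse: an annotation like "Escherichia coli (562)" gives
                # group(1) = "Escherichia coli" and group(2) = "562"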
                if res:
                    taxa_annotation = res.group(1)
                    id = res.group(2)
                else:
                    id = lca.get_a_Valid_ID([orf_lca[orf]])
                taxa_ids.append(id)
        pwy_lca_id = lca.get_lca(taxa_ids, True)
        # print "In run_pathologic"
        # print pwy_lca_id
        # print pwy_lca_id
        lca.clear_cells(taxa_ids)

        pwy_lca[pwy] = [pwy_lca_id, lca.translateIdToName(pwy_lca_id)]

    # calculate weighted taxonomic distance
    pwy_to_wtd = {}
    for pwy in pwy_lca:

        C = []  # list of distances
        C_taxa = []  # list of parallel observed-expected taxa pairs
        C_pos = []  # list of non-negative distances
        C_pos_taxa = []  # list of parallel observed-expected taxa pairs
        C_neg = []  # list of negative distances
        C_neg_taxa = []  # list of parallel observed-expected taxa pairs

        if pwy in pwy_taxa_range and len(pwy_taxa_range[pwy]) > 0:
            for expected in pwy_taxa_range[pwy]:
                dist = lca.wtd(expected[0], pwy_lca[pwy][0])
                if dist or dist == 0:
                    # valid distance
                    # add distance respective lists
                    C.append(dist)  # add distance
                    C_taxa.append([expected[0], pwy_lca[pwy][0]])
                    if dist >= 0:
                        C_pos.append(dist)  # add to non-negative list
                        C_pos_taxa.append([expected[0], pwy_lca[pwy][0]])
                    else:
                        C_neg.append(dist)  # add to negative list
                        C_neg_taxa.append([expected[0], pwy_lca[pwy][0]])
                else:
                    print "Not a valid distance"
                    continue
        else:
            # no expected taxonomy, set to root
            min_taxa = "1"
            dist = lca.wtd(min_taxa, pwy_lca[pwy][0])
            # add distance respective lists
            C.append(dist)  # add distance
            C_taxa.append([min_taxa, pwy_lca[pwy][0]])
            if dist >= 0:
                C_pos.append(dist)  # add to non-negative list
                C_pos_taxa.append([min_taxa, pwy_lca[pwy][0]])
            else:
                C_neg.append(dist)  # add to negative list
                C_neg_taxa.append([min_taxa, pwy_lca[pwy][0]])

        # find index with max distance (closest to expected taxonomy)
        max_index, max_dist = max(enumerate(C), key=operator.itemgetter(1))
        max_taxa = C_taxa[max_index]
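        # e.g. with made-up distances C = [-2.5, 0.0, -0.5], max_index is 1 and
        # max_dist is 0.0: the observed/expected pair closest to the expected taxonomy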

        # remap to preferred names
        observed = get_preferred_taxa_name(max_taxa[1], megan_map,
                                           lca.id_to_name)
        expected = get_preferred_taxa_name(max_taxa[0], megan_map,
                                           lca.id_to_name)

        pwy_to_wtd[pwy] = [max_dist, observed, expected]

    # write out pathway table
    table_out_tmp = options.table_out + ".tmp"
    try:
        out = open(table_out_tmp, "w")
    except:
        print "Had problems opening file: " + options.table_out

    # write appropriate header
    if options.wtd:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tWTD\tOBSERVED\tEXPECTED\tORFS\n"
    else:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tORFS\n"
    out.write(header)

    sample = options.sample_name  # sample name
    for pwy in pwy_to_orfs:
        # generate output line
        line = []
        line.append(sample)  # sample name
        line.append(pwy)  # pathway name
        line.append(pwy_to_long[pwy])  # pathway longname
        line.append(pwy_to_rxns[pwy][0])  # pathway num reactions
        line.append(pwy_to_rxns[pwy][1])  # pathway covered reactions
        line.append(len(pwy_to_orfs[pwy]))  # num orfs
        if options.wtd:
            line.append(pwy_to_wtd[pwy][0])  # wtd
            line.append(pwy_to_wtd[pwy][1])  # wtd observed taxa
            line.append(pwy_to_wtd[pwy][2])  # wtd expected taxa
        line.append("[" + ",".join(pwy_to_orfs[pwy]) + "]")  # list of ORFs

        line = map(str, line)  # cast all to string

        out.write("\t".join(line) + "\n")  # write out line
    try:
        out.close()  # close file
        rename(table_out_tmp, options.table_out)
    except:
        print "Had problems closing file: " + options.table_out
def main(argv, errorlogger=None, runcommand=None, runstatslogger=None):
    global parser

    options, args = parser.parse_args(argv)
    if options.inputfolder == None:
        parser.error('ERROR\tInput folder for Pathologic not found')
    else:
        # required files to be able to build ePGDB
        files = [
            options.inputfolder + PATHDELIM + '0.pf',
            # options.inputfolder + PATHDELIM + '0.fasta',
            options.inputfolder + PATHDELIM + 'genetic-elements.dat',
            options.inputfolder + PATHDELIM + 'organism-params.dat'
        ]

        if files_exist(files, errorlogger=errorlogger):
            exit_process(
                "ERROR\tCannot find all inputs for Pathologic in folder %s : "
                % (options.inputfolder))

    # is there a pathwaytools executable installed
    if not path.exists(options.ptoolsExec):
        eprintf("ERROR\tPathwayTools executable %s not found!\n",
                options.ptoolsExec)
        if errorlogger:
            errorlogger.printf(
                "ERROR\tPathwayTools executable %s not found!\n",
                options.ptoolsExec)
        exit_process("ERROR\tPathwayTools executable %s not found!\n" %
                     (options.ptoolsExec))

    # command to build the ePGDB
    command = "%s -patho %s" % (options.ptoolsExec, options.inputfolder)
    if options.no_taxonomic_pruning:
        command += " -no-taxonomic-pruning "

    if options.no_web_cel_overview:
        command += " -no-web-cel-overview"

    command += " -api"

    status = 0
    fix_pgdb_input_files(options.pgdbdir, pgdbs=[])

    if not path.exists(options.pgdbdir):
        status = runPathologicCommand(runcommand=command)
        fix_pgdb_input_files(options.pgdbdir, pgdbs=[])
    if status != 0:
        eprintf("ERROR\tFailed to run Pathologic on input %s : \n" %
                (options.inputfolder))
        eprintf(
            "INFO\tKill any other PathwayTools instance running on the machine and try again\n"
        )
        if errorlogger:
            errorlogger.write(
                "ERROR\tFailed to run Pathologic on input %s : " %
                (options.inputfolder))
            errorlogger.write(
                "INFO\tKill any other PathwayTools instance running on the machine and try again"
            )
            errorlogger.write("     : " + command)
        exit_process("ERROR\tFailed to run Pathologic on input %s : " %
                     (options.inputfolder))

    if not path.exists(options.reactions_list):
        try:
            pythonCyc = startPathwayTools(options.sample_name.lower(),
                                          options.ptoolsExec, True)
            pythonCyc.setDebug()  # disable pathway debug statements
            printf("INFO\tExtracting the reaction list from ePGDB " +
                   options.sample_name + "\n")
            resultLines = pythonCyc.getReactionListLines()
            #pythonCyc.stopPathwayTools()
            reaction_list_file = open(options.reactions_list + ".tmp", 'w')
            for line in resultLines:
                fprintf(reaction_list_file, "%s\n", line.strip())
            reaction_list_file.close()
            rename(options.reactions_list + ".tmp", options.reactions_list)

            StopPathwayTools()

        except:
            traceback.print_exc(10)
            eprintf("ERROR\tFailed to run extract pathways for %s : \n" %
                    (options.sample_name))
            eprintf(
                "INFO\tKill any other PathwayTools instance running on the machine and try again"
            )
            if errorlogger:
                errorlogger.write(
                    "ERROR\tFailed to run extract pathways for %s : " %
                    (options.sample_name))
                errorlogger.write(
                    "INFO\tKill any other PathwayTools instance running on the machine and try again\n"
                )
            StopPathwayTools()

    if not path.exists(options.table_out):
        ExtractPathway_WTD(options)
def ExtractPathway_WTD(options):
    # Extract pathways and WTD
    # place to store list of expected taxonomic range(s)
    printf('\n')
    printf('INFO\tEntering the WTD calculations!\n')
    serialized_metacyc_taxa_ranges = "/tmp/metacyc_pwy_taxa_range.pk"

    try:
        #print  options.wtd,  not path.isfile(serialized_metacyc_taxa_ranges),  serialized_metacyc_taxa_ranges
        if options.wtd and not path.isfile(serialized_metacyc_taxa_ranges):
            # get MetaCyc's expected taxonomic range(s) and serialize for later use in /tmp
            # try:
            printf('INFO\tGetting MetaCyc Expected Taxonomic Range(s)\n')
            pythonCyc = startPathwayTools('meta', options.ptoolsExec, True)

            pwys = pythonCyc.getAllPathways()

            pwy_taxa_range = {} # hash from pwy to expected taxonomic range(s)
            pwy_taxa_range_pk = open(serialized_metacyc_taxa_ranges ,"w")

            # get expected taxonomic ranges for each pathway
            for pwy in pwys:
                printf(" " + pwy) 
                my_expected_taxonomic_range = pythonCyc.getExpectedTaxonomicRange(pwy)
                pwy_taxa_range[pwy] = my_expected_taxonomic_range
            # printf(" " + pwy)

            # write the pathway
            pickle.dump(pwy_taxa_range, pwy_taxa_range_pk)
            pwy_taxa_range_pk.close()
            StopPathwayTools()

        # read expected taxonomic range from serialized file
        expected_taxa_in = open(serialized_metacyc_taxa_ranges, "r")
        pwy_taxa_range = pickle.load(expected_taxa_in)

        # create mapping of preferred NCBI to MEGAN taxonomy
        megan_map = {}
        if options.ncbi_megan_map:
            with open(options.ncbi_megan_map) as megan_map_file:
                for line in megan_map_file:
                    fields = line.split("\t")
                    fields = map(str.strip, fields)
                    megan_map[ fields[0] ] = fields[1]

        # get ORF to taxa map from annotation_table
        printf("INFO\tGetting ORF to Taxa Map from AnnotationTable\n")
        orf_lca = {}
        with open(options.annotation_table) as f:
            for line in f:
                fields = line.split("\t")
                orf_lca[fields[0].strip()] = fields[8].strip()

        # get pathway ORFs and Rxns
        pwy_to_orfs = {}
        pwy_to_long = {}
        pwy_to_rxns = {}
        try:
            pythonCyc = startPathwayTools(options.sample_name.lower(), options.ptoolsExec, True)
            pwys = pythonCyc.getAllPathways()

            for pwy in pwys:
                printf(" " + pwy)
                genes = pythonCyc.getPathwayORFs(pwy)
                rxns = pythonCyc.getPathwayReactionInfo(pwy)
                pwy_to_orfs[pwy] = genes
                pwy_to_long[pwy] = cleanup(pythonCyc.get_slot_value(pwy, "common-name"))
                pwy_to_rxns[pwy] = rxns
            # printf("\n")
            StopPathwayTools()

        except:
            insert_error(9)
            print """
            Problem connecting to Pathway Tools. Check the /tmp/ptools-socket file.
            """
    except:
        print """
        Problem calculating WTD via Pathway Tools. Check the /tmp/ptools-socket file.
        """
        insert_error(9)

    # get LCA per pathway
    pwy_lca = {}
    # load NCBI taxonomy map
    printf("\nINFO\tLoading NCBI Taxonomy Map\n")
    lca = LCAComputation([ options.ncbi_tree ], )

    for pwy in pwy_to_orfs:
        orfs = pwy_to_orfs[pwy]
        taxa_ids = []
        for orf in orfs:
            if orf in orf_lca:
                # could strip out id here
                res = re.search("(.+?)\(([0-9]+?)\)",  orf_lca[orf] )
                if res:
                    taxa_annotation = res.group(1)
                    id = res.group(2)
                else:
                    id = lca.get_a_Valid_ID([ orf_lca[orf] ])
                taxa_ids.append(id)
        pwy_lca_id = lca.get_lca(taxa_ids, True)
        # print "In run_pathologic"
        # print pwy_lca_id
        # print pwy_lca_id
        lca.clear_cells(taxa_ids)

        pwy_lca[pwy] = [pwy_lca_id, lca.translateIdToName(pwy_lca_id)]

    # calculate weighted taxonomic distance
    pwy_to_wtd = {}
    printf("INFO\tCalculating WTD\n")

    for pwy in pwy_lca:

        C = [] # list of distances
        C_taxa = [] # list of parallel observed-expected taxa pairs
        C_pos = [] # list of non-negative distances
        C_pos_taxa = [] # list of parallel observed-expected taxa pairs
        C_neg = [] # list of negative distances
        C_neg_taxa = [] # list of parallel observed-expected taxa pairs

        if pwy in pwy_taxa_range and  len(pwy_taxa_range[pwy]) :
            for expected in pwy_taxa_range[pwy]:
                dist = lca.wtd(expected[0], pwy_lca[pwy][0])
                if dist or dist == 0:
                    # valid distance
                    # add distance respective lists
                    C.append(dist) # add distance
                    C_taxa.append([ expected[0], pwy_lca[pwy][0] ])
                    if dist >= 0:
                        C_pos.append(dist)  # add to non-negative list
                        C_pos_taxa.append([ expected[0], pwy_lca[pwy][0] ])
                    else:
                        C_neg.append(dist)  # add to negative list
                        C_neg_taxa.append([ expected[0], pwy_lca[pwy][0] ])
                else:
                    print "Not a valid distance"
                    continue
        else:
            # no expected taxonomy, set to root
            min_taxa = "1"
            dist = lca.wtd(min_taxa, pwy_lca[pwy][0])
            # add distance respective lists
            C.append(dist) # add distance
            C_taxa.append([ min_taxa, pwy_lca[pwy][0] ])
            if dist >= 0:
                C_pos.append(dist)  # add to non-negative list
                C_pos_taxa.append([ min_taxa, pwy_lca[pwy][0] ])
            else:
                C_neg.append(dist)  # add to negative list
                C_neg_taxa.append([ min_taxa, pwy_lca[pwy][0] ])

        # find index with max distance (closest to expected taxonomy)
        max_index, max_dist = max(enumerate(C), key=operator.itemgetter(1))
        max_taxa = C_taxa[max_index]

        # remap to preferred names
        observed = get_preferred_taxa_name(max_taxa[1], megan_map, lca.id_to_name)
        expected = get_preferred_taxa_name(max_taxa[0], megan_map, lca.id_to_name)

        pwy_to_wtd[pwy] = [ max_dist, observed, expected ]

    # write out pathway table
    table_out_tmp  = options.table_out + ".tmp"
    try:
        out = open(table_out_tmp, "w")
    except:
        print "Had problems opening file: " + options.table_out
        insert_error(9)

    # write appropriate header
    if options.wtd:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tWTD\tOBSERVED\tEXPECTED\tORFS\n"
    else:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tORFS\n"
    out.write(header)

    sample = options.sample_name # sample name
    for pwy in pwy_to_orfs:
        # generate output line
        line = []
        line.append(sample) # sample name
        line.append(pwy) # pathway name
        line.append(pwy_to_long[pwy]) # pathway longname
        line.append(pwy_to_rxns[pwy][0]) # pathway num reactions
        line.append(pwy_to_rxns[pwy][1]) # pathway covered reactions
        line.append(len(pwy_to_orfs[pwy])) # num orfs
        if options.wtd:
            line.append(pwy_to_wtd[pwy][0]) # wtd
            line.append(pwy_to_wtd[pwy][1]) # wtd observed taxa
            line.append(pwy_to_wtd[pwy][2]) # wtd expected taxa
        line.append("[" + ",".join(pwy_to_orfs[pwy]) + "]") # list of ORFs

        line = map(str, line) # cast all to string

        out.write("\t".join(line) + "\n") # write out line
    try:
        out.close() # close file
        rename(table_out_tmp, options.table_out)
    except:
        print "Had problems closing file: " + options.table_out
        insert_error(9)
def main(argv, errorlogger = None, runcommand = None, runstatslogger = None):
    global parser

    options, args = parser.parse_args(argv)
    if options.inputfolder is None:
       parser.error('ERROR\tInput folder for Pathologic not found')
    else:
      # required files to be able to build ePGDB
      files = [ 
                #options.inputfolder + PATHDELIM + '0.pf',
                # options.inputfolder + PATHDELIM + '0.fasta',
                options.inputfolder + PATHDELIM + 'genetic-elements.dat',  
                options.inputfolder + PATHDELIM + 'organism-params.dat'
              ]

      if files_exist( files , errorlogger = errorlogger):
        exit_process("ERROR\tCannot find all inputs for Pathologic in folder %s : "  %(options.inputfolder) )

    # is there a pathwaytools executable installed
    if not path.exists(options.ptoolsExec):
       eprintf("ERROR\tPathwayTools executable %s not found!\n", options.ptoolsExec)
       if errorlogger:
          errorlogger.printf("ERROR\tPathwayTools executable %s not found!\n",  options.ptoolsExec)
       exit_process("ERROR\tPathwayTools executable %s not found!\n" %(options.ptoolsExec))


    # command to build the ePGDB
    command = "%s -patho %s"  %(options.ptoolsExec, options.inputfolder)
    if options.no_taxonomic_pruning:
       command += " -no-taxonomic-pruning "

    if options.no_web_cel_overview:
       command += " -no-web-cel-overview"

    command += " -tip"
    command += " -api"

    status =0
    fix_pgdb_input_files(options.pgdbdir, pgdbs = [])


    if not path.exists(options.pgdbdir):
      status  = runPathologicCommand(runcommand = command) 
      fix_pgdb_input_files(options.pgdbdir, pgdbs = [])
    if status!=0:
       eprintf("ERROR\tFailed to run Pathologic on input %s : \n" %(options.inputfolder))
       eprintf("INFO\tKill any other PathwayTools instance running on the machine and try again\n")
       if errorlogger:
          errorlogger.write("ERROR\tFailed to run Pathologic on input %s : " %(options.inputfolder))
          errorlogger.write("INFO\tKill any other PathwayTools instance running on the machine and try again")
          errorlogger.write("     : " + command)
          insert_error(9)
       sys.exit(0)
       #exit_process("ERROR\tFailed to run Pathologic on input %s : "  %(options.inputfolder) )


    if not path.exists(options.reactions_list):
       try:
           pythonCyc = startPathwayTools(options.sample_name.lower(), options.ptoolsExec, True)
           pythonCyc.setDebug() # disable pathway debug statements
           printf("INFO\tExtracting the reaction list from ePGDB " + options.sample_name + "\n")
           resultLines = pythonCyc.getReactionListLines()
           #pythonCyc.stopPathwayTools()
           reaction_list_file = open(options.reactions_list + ".tmp", 'w')
           for line in resultLines:
              fprintf(reaction_list_file,"%s\n",line.strip())
           reaction_list_file.close()
           rename(options.reactions_list + ".tmp", options.reactions_list)
           StopPathwayTools()

       except:
           traceback.print_exc(10)
           eprintf("ERROR\tFailed to run extract pathways for %s : \n" %(options.sample_name))
           eprintf("INFO\tKill any other PathwayTools instance running on the machine and try again")
           if errorlogger:
               errorlogger.write("ERROR\tFailed to run extract pathways for %s : " %(options.sample_name))
               errorlogger.write("INFO\tKill any other PathwayTools instance running on the machine and try again\n")
           insert_error(9)
           StopPathwayTools()

    if not path.exists(options.table_out):
        ExtractPathway_WTD(options)