def _execute_tRNA_Scan(options):
    global errorcode
    args = []

    if options.trna_executable:
        args.append(options.trna_executable)
    if options.trna_i:
        args += ["-i", options.trna_i]
    if options.trna_o:
        args += ["-o", options.trna_o]
    if options.trna_D:
        args += ["-D", options.trna_D]
    if options.trna_T:
        args += ["-T", options.trna_T]
    if options.trna_F:
        args += ["-F", options.trna_F]

    result = getstatusoutput(' '.join(args))
    if result[0] != 0:
        insert_error(errorcode)
    return result

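# Illustrative only: given an options object carrying the trna_* attributes
# used above (the executable name and file names below are made up, not part
# of this module), _execute_tRNA_Scan assembles and runs a command line of
# the shape
#
#   trnascan-1.4 -i sample.fna -o sample.trna.out -D TPCsignal -T Dsignal
#
# i.e. each flag is emitted only when its corresponding option is set.
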
def write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight,
                             results_dictionary, orf_dictionary, contig,
                             candidate_orf_pos, orfid, compact_output):
    global errorcode
    try:
        fields = ['source', 'feature', 'start', 'end', 'score', 'strand', 'frame']
        output_line = orf_dictionary[contig][candidate_orf_pos]['seqname']
        #if compact_output:
        #    output_line = ShortenContigId(output_line)

        for field in fields:
            output_line += "\t" + str(orf_dictionary[contig][candidate_orf_pos][field])

        # prefer the shortened ORF ids; fall back to the raw ids if shortening fails
        try:
            attributes = "ID=" + ShortenORFId(orf_dictionary[contig][candidate_orf_pos]['id'])
            attributes += ";locus_tag=" + ShortenORFId(orf_dictionary[contig][candidate_orf_pos]['locus_tag'])
        except:
            attributes = "ID=" + orf_dictionary[contig][candidate_orf_pos]['id']
            attributes += ";locus_tag=" + orf_dictionary[contig][candidate_orf_pos]['locus_tag']

        attributes += ";contig_length=" + orf_dictionary[contig][candidate_orf_pos]['contig_length']
        attributes += ";orf_length=" + orf_dictionary[contig][candidate_orf_pos]['orf_length']
        attributes += ";partial=" + orf_dictionary[contig][candidate_orf_pos]['partial']
        attributes += ";sourcedb=" + candidatedbname

        if candidatedbname in results_dictionary:
            attributes += ";annotvalue=" + str(results_dictionary[candidatedbname][orfid]['value'])
            attributes += ";ec=" + str(results_dictionary[candidatedbname][orfid]['ec'])
            attributes += ";product=" + results_dictionary[candidatedbname][orfid]['product']
        else:
            attributes += ";annotvalue=0"
            attributes += ";ec="
            attributes += ";product=hypothetical protein"

        output_line += '\t' + attributes

        if candidatedbname in results_dictionary:
            fprintf(outputgff_file, "%s\n", output_line)
    except:
        eprintf("ERROR : Failure to annotate in contig %s\n", contig)
        traceback.print_exc(10)
        insert_error(errorcode)
        exit_process()

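# For reference, write_annotation_for_orf emits a standard 9-column,
# tab-separated GFF row. The values below are a fabricated example, but the
# column order and attribute keys match the code above:
#
#   contig_1  prodigal  CDS  100  400  32.1  +  0  ID=1_0;locus_tag=1_0;contig_length=1200;orf_length=300;partial=00;sourcedb=refseq;annotvalue=52;ec=1.1.1.1;product=alcohol dehydrogenase
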
def MetaPathways_parse_blast(argv, errorlogger=None, runstatslogger=None):
    createParser()
    try:
        main(argv, errorlogger=errorlogger, runstatslogger=runstatslogger)
    except:
        insert_error(5)
        return (0, '')
    return (0, '')

def Multiseq_preprocess(argv, errorlogger=None, runstatslogger=None):
    createParser()
    try:
        main(argv, errorlogger=errorlogger, runstatslogger=runstatslogger)
    except:
        insert_error(1)
        return (1, '')
    return (0, '')

def MetaPathways_filter_input(argv, errorlogger=None, runstatslogger=None):
    global errorcode
    createParser()
    try:
        main(argv, errorlogger=errorlogger, runstatslogger=runstatslogger)
    except:
        insert_error(errorcode)
        return (0, '')
    return (0, '')

def MetaPathways_run_pathologic(argv, extra_command=None, errorlogger=None, runstatslogger=None):
    global errorcode
    if errorlogger != None:
        errorlogger.write("#STEP\tBUILD_PGDB\n")
    createParser()
    try:
        main(argv, errorlogger=errorlogger, runcommand=extra_command, runstatslogger=runstatslogger)
    except:
        insert_error(errorcode)  # fixed: was "error_code", an undefined name
        return (1, 'Error running pathologic')
    return (0, '')

def MetaPathways_annotate_fast(argv, errorlogger=None, runstatslogger=None):
    global errorcode
    createParser()
    if errorlogger:  # guard added: the original wrote to errorlogger unconditionally
        errorlogger.write("#STEP\tANNOTATE_ORFS\n")
    try:
        main(argv, errorlogger=errorlogger, runstatslogger=runstatslogger)
    except:
        insert_error(errorcode)
        return (0, '')
    return (0, '')

def MetaPathways_create_amino_sequences(argv, errorlogger=None, runstatslogger=None):
    global errorcode
    createParser()
    try:
        res = main(argv, errorlogger=errorlogger, runstatslogger=runstatslogger)
    except:
        insert_error(errorcode)
        return (1, '')
    return (res[0], res[1])

def MetaPathways_orf_prediction(argv, extra_command=None, errorlogger=None, runstatslogger=None):
    global errorcode
    if errorlogger != None:
        errorlogger.write("#STEP\tORF_PREDICTION\n")
    createParser()
    try:
        main(argv, errorlogger=errorlogger, runcommand=extra_command, runstatslogger=runstatslogger)
    except:
        insert_error(errorcode)  # fixed typo: was "errrocode"
    return (0, '')

def MetaPathways_refscore(argv, errorlogger=None, runstatslogger=None):
    createParser()
    if errorlogger:
        errorlogger.write("#STEP\tCOMPUTE_REFSCORE\n")
    try:
        main(argv, errorlogger=errorlogger, runstatslogger=runstatslogger)
    except:
        insert_error(15)
        return (0, traceback.format_exc(10))
    return (0, '')

def MetaPathways_func_search(argv, extra_command=None, errorlogger=None, runstatslogger=None):
    if errorlogger != None:
        errorlogger.write("#STEP\tFUNC_SEARCH\n")
    createParser()
    try:
        code = main(argv, errorlogger=errorlogger, runcommand=extra_command, runstatslogger=runstatslogger)
    except:
        insert_error(4)
        return (0, '')
    return (0, '')

def MetaPathways_rRNA_stats_calculator(argv, extra_command=None, errorlogger=None, runstatslogger=None):
    if errorlogger != None:
        errorlogger.write("#STEP\tSTATS_rRNA\n")
    createParser()
    try:
        main(argv, errorlogger=errorlogger, runcommand=extra_command, runstatslogger=runstatslogger)
    except:
        insert_error(6)
        return (0, '')
    return (0, '')

def MetaPathways_tRNA_scan(argv, extra_command=None, errorlogger=None, runstatslogger=None):
    global errorcode
    if errorlogger != None:
        errorlogger.write("#STEP\ttRNA_SCAN\n")
    createParser()
    result = [0, '']
    try:
        result = main(argv, errorlogger=errorlogger, runcommand=extra_command, runstatslogger=runstatslogger)
    except:
        insert_error(errorcode)
        return (result[0], result[1])  # fixed: was "res", an undefined name
    return (result[0], '')

def write_new_file(lines, output_file):
    print("Fixing file " + output_file)
    try:
        outputfile = open(output_file, 'w')
    except IOError:
        print("ERROR : Cannot open output file " + output_file)
        insert_error(9)
        return  # without this, the writes below would hit an unbound file handle

    for line in lines:
        fprintf(outputfile, "%s\n", line)
    outputfile.close()

def read_contig_lengths(contig_map_file, contig_lengths):
    try:
        mapfile = open(contig_map_file, 'r')
    except IOError:
        print("Cannot read file " + contig_map_file + " !")
        insert_error(errorcode)
        return

    mapfile_lines = mapfile.readlines()
    mapfile.close()

    for line in mapfile_lines:
        line = line.strip()
        fields = [x.strip() for x in line.split('\t')]
        if len(fields) != 3:
            # malformed row: clear the dict in place so the caller sees the reset
            # (rebinding with "contig_lengths = {}" left the caller's dict untouched)
            contig_lengths.clear()
            return
        contig_lengths[fields[0]] = int(fields[2])

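# The contig map file read above is expected to have exactly three
# tab-separated columns per line, with the contig name first and its length
# third; a fabricated example (the middle column is not used by this reader):
#
#   contig_1<TAB>some_internal_id<TAB>1200
#   contig_2<TAB>some_internal_id<TAB>850
#
# Any row that does not have exactly three fields empties the dictionary and
# aborts the read.
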
def halt_process(secs=4, verbose=False):
    time.sleep(secs)
    errors = get_error_list()
    if len(errors) > 1:
        insert_error(200)

    if verbose:
        for errorcode in errors.keys():
            eprintf("ERROR:\t%d\t%s\n", errorcode, errors[errorcode])

    if len(errors) > 1:
        errorcode = 200
        _exit(errorcode)
    elif len(errors) == 1:
        # list() keeps this working on Python 3, where dict.keys() is a view
        errorcode = list(errors.keys())[0]
        _exit(errorcode)
    _exit(0)

def main(argv, errorlogger=None, runcommand=None, runstatslogger=None):
    global parser
    options, args = parser.parse_args(argv)

    if options.inputfolder == None:
        parser.error('ERROR\tInput folder for Pathologic not found')
    else:
        # required files to be able to build the ePGDB
        files = [
            options.inputfolder + PATHDELIM + 'genetic-elements.dat',
            options.inputfolder + PATHDELIM + 'organism-params.dat'
        ]
        if files_exist(files, errorlogger=errorlogger):
            exit_process("ERROR\tCannot find all inputs for Pathologic in folder %s : " % (options.inputfolder))

    # is there a PathwayTools executable installed?
    if not path.exists(options.ptoolsExec):
        eprintf("ERROR\tPathwayTools executable %s not found!\n", options.ptoolsExec)
        if errorlogger:
            errorlogger.printf("ERROR\tPathwayTools executable %s not found!\n", options.ptoolsExec)
        exit_process("ERROR\tPathwayTools executable %s not found!\n" % (options.ptoolsExec))

    # command to build the ePGDB
    command = "%s -patho %s" % (options.ptoolsExec, options.inputfolder)
    if options.no_taxonomic_pruning:
        command += " -no-taxonomic-pruning "
    if options.no_web_cel_overview:
        command += " -no-web-cel-overview"
    command += " -tip"
    command += " -api"

    status = 0
    fix_pgdb_input_files(options.pgdbdir, pgdbs=[])
    if not path.exists(options.pgdbdir):
        status = runPathologicCommand(runcommand=command)
        fix_pgdb_input_files(options.pgdbdir, pgdbs=[])

    if status != 0:
        eprintf("ERROR\tFailed to run Pathologic on input %s : \n" % (options.inputfolder))
        eprintf("INFO\tKill any other PathwayTools instance running on the machine and try again\n")
        if errorlogger:
            errorlogger.write("ERROR\tFailed to run Pathologic on input %s : " % (options.inputfolder))
            errorlogger.write("INFO\tKill any other PathwayTools instance running on the machine and try again")
            errorlogger.write(" : " + command)
        insert_error(9)
        sys.exit(0)

    if not path.exists(options.reactions_list):
        try:
            pythonCyc = startPathwayTools(options.sample_name.lower(), options.ptoolsExec, True)
            pythonCyc.setDebug()  # disable pathway debug statements
            printf("INFO\tExtracting the reaction list from ePGDB " + options.sample_name + "\n")
            resultLines = pythonCyc.getReactionListLines()
            reaction_list_file = open(options.reactions_list + ".tmp", 'w')
            for line in resultLines:
                fprintf(reaction_list_file, "%s\n", line.strip())
            reaction_list_file.close()
            rename(options.reactions_list + ".tmp", options.reactions_list)
            StopPathwayTools()
        except:
            traceback.print_exc(10)
            eprintf("ERROR\tFailed to extract pathways for %s : \n" % (options.sample_name))
            eprintf("INFO\tKill any other PathwayTools instance running on the machine and try again")
            if errorlogger:
                errorlogger.write("ERROR\tFailed to extract pathways for %s : " % (options.sample_name))
                errorlogger.write("INFO\tKill any other PathwayTools instance running on the machine and try again\n")
            insert_error(9)
            StopPathwayTools()

    if not path.exists(options.table_out):
        ExtractPathway_WTD(options)

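# The Pathologic invocation built in main() above, with every optional flag
# enabled, has the following shape (the paths are fabricated for
# illustration):
#
#   /opt/ptools/pathway-tools -patho /work/sample/ptools/ -no-taxonomic-pruning -no-web-cel-overview -tip -api
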
def ExtractPathway_WTD(options):
    # Extract pathways and the weighted taxonomic distance (WTD)
    printf('\n')
    printf('INFO\tEntering the WTD calculations!\n')

    # place to store the serialized list of expected taxonomic range(s)
    serialized_metacyc_taxa_ranges = "/tmp/metacyc_pwy_taxa_range.pk"
    try:
        if options.wtd and not path.isfile(serialized_metacyc_taxa_ranges):
            # get MetaCyc's expected taxonomic range(s) and serialize them to /tmp for later use
            printf('INFO\tGetting MetaCyc Expected Taxonomic Range(s)\n')
            pythonCyc = startPathwayTools('meta', options.ptoolsExec, True)
            pwys = pythonCyc.getAllPathways()

            pwy_taxa_range = {}  # map from pathway to its expected taxonomic range(s)
            pwy_taxa_range_pk = open(serialized_metacyc_taxa_ranges, "wb")

            # get the expected taxonomic range for each pathway
            for pwy in pwys:
                printf(" " + pwy)
                my_expected_taxonomic_range = pythonCyc.getExpectedTaxonomicRange(pwy)
                pwy_taxa_range[pwy] = my_expected_taxonomic_range

            pickle.dump(pwy_taxa_range, pwy_taxa_range_pk)
            pwy_taxa_range_pk.close()
            StopPathwayTools()

        # read the expected taxonomic ranges back from the serialized file
        expected_taxa_in = open(serialized_metacyc_taxa_ranges, "rb")
        pwy_taxa_range = pickle.load(expected_taxa_in)

        # create the mapping of preferred NCBI to MEGAN taxonomy
        megan_map = {}
        if options.ncbi_megan_map:
            with open(options.ncbi_megan_map) as megan_map_file:
                for line in megan_map_file:
                    fields = [x.strip() for x in line.split("\t")]
                    megan_map[fields[0]] = fields[1]

        # get the ORF-to-taxa map from the annotation table
        printf("INFO\tGetting ORF to Taxa Map from AnnotationTable\n")
        orf_lca = {}
        with open(options.annotation_table) as f:
            for line in f:
                fields = line.split("\t")
                orf_lca[fields[0].strip()] = fields[8].strip()

        # get pathway ORFs and reactions
        pwy_to_orfs = {}
        pwy_to_long = {}
        pwy_to_rxns = {}
        try:
            pythonCyc = startPathwayTools(options.sample_name.lower(), options.ptoolsExec, True)
            pwys = pythonCyc.getAllPathways()
            for pwy in pwys:
                printf(" " + pwy)
                genes = pythonCyc.getPathwayORFs(pwy)
                rxns = pythonCyc.getPathwayReactionInfo(pwy)
                pwy_to_orfs[pwy] = genes
                pwy_to_long[pwy] = cleanup(pythonCyc.get_slot_value(pwy, "common-name"))
                pwy_to_rxns[pwy] = rxns
            StopPathwayTools()
        except:
            insert_error(9)
            print("""
            Problem connecting to Pathway Tools. Check the /tmp/ptools-socket file.
            """)
    except:
        print("""
        Problem calculating WTD via Pathway Tools. Check the /tmp/ptools-socket file.
        """)
        insert_error(9)

    # get the LCA per pathway
    pwy_lca = {}
    # load the NCBI taxonomy map
    printf("\nINFO\tLoading NCBI Taxonomy Map\n")
    lca = LCAComputation([options.ncbi_tree])

    for pwy in pwy_to_orfs:
        orfs = pwy_to_orfs[pwy]
        taxa_ids = []
        for orf in orfs:
            if orf in orf_lca:
                res = re.search(r"(.+?)\(([0-9]+?)\)", orf_lca[orf])
                if res:
                    taxa_annotation = res.group(1)
                    id = res.group(2)
                else:
                    id = lca.get_a_Valid_ID([orf_lca[orf]])
                taxa_ids.append(id)
        pwy_lca_id = lca.get_lca(taxa_ids, True)
        lca.clear_cells(taxa_ids)
        pwy_lca[pwy] = [pwy_lca_id, lca.translateIdToName(pwy_lca_id)]

    # calculate the weighted taxonomic distance
    pwy_to_wtd = {}
    printf("INFO\tCalculating WTD\n")
    for pwy in pwy_lca:
        C = []           # list of distances
        C_taxa = []      # parallel list of observed-expected taxa pairs
        C_pos = []       # non-negative distances
        C_pos_taxa = []  # parallel observed-expected taxa pairs
        C_neg = []       # negative distances
        C_neg_taxa = []  # parallel observed-expected taxa pairs

        if pwy in pwy_taxa_range and len(pwy_taxa_range[pwy]):
            for expected in pwy_taxa_range[pwy]:
                dist = lca.wtd(expected[0], pwy_lca[pwy][0])
                if dist or dist == 0:  # valid distance
                    # add the distance to the respective lists
                    C.append(dist)
                    C_taxa.append([expected[0], pwy_lca[pwy][0]])
                    if dist >= 0:
                        C_pos.append(dist)
                        C_pos_taxa.append([expected[0], pwy_lca[pwy][0]])
                    else:
                        C_neg.append(dist)
                        C_neg_taxa.append([expected[0], pwy_lca[pwy][0]])
                else:
                    print("Not a valid distance")
                    continue
        else:
            # no expected taxonomy recorded: fall back to the root taxon
            min_taxa = "1"
            dist = lca.wtd(min_taxa, pwy_lca[pwy][0])
            C.append(dist)
            C_taxa.append([min_taxa, pwy_lca[pwy][0]])
            if dist >= 0:
                C_pos.append(dist)
                C_pos_taxa.append([min_taxa, pwy_lca[pwy][0]])
            else:
                C_neg.append(dist)
                C_neg_taxa.append([min_taxa, pwy_lca[pwy][0]])

        # find the index with the maximum distance (closest to the expected taxonomy)
        max_index, max_dist = max(enumerate(C), key=operator.itemgetter(1))
        max_taxa = C_taxa[max_index]

        # remap to preferred names
        observed = get_preferred_taxa_name(max_taxa[1], megan_map, lca.id_to_name)
        expected = get_preferred_taxa_name(max_taxa[0], megan_map, lca.id_to_name)
        pwy_to_wtd[pwy] = [max_dist, observed, expected]

    # write out the pathway table
    table_out_tmp = options.table_out + ".tmp"
    try:
        out = open(table_out_tmp, "w")
    except:
        print("Had problems opening file: " + options.table_out)
        insert_error(9)

    # write the appropriate header
    if options.wtd:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tWTD\tOBSERVED\tEXPECTED\tORFS\n"
    else:
        header = "SAMPLE\tPWY_NAME\tPWY_COMMON_NAME\tNUM_REACTIONS\tNUM_COVERED_REACTIONS\tORF_COUNT\tORFS\n"
    out.write(header)

    sample = options.sample_name
    for pwy in pwy_to_orfs:
        # generate one output line per pathway
        line = []
        line.append(sample)                 # sample name
        line.append(pwy)                    # pathway name
        line.append(pwy_to_long[pwy])       # pathway long name
        line.append(pwy_to_rxns[pwy][0])    # number of reactions
        line.append(pwy_to_rxns[pwy][1])    # number of covered reactions
        line.append(len(pwy_to_orfs[pwy]))  # number of ORFs
        if options.wtd:
            line.append(pwy_to_wtd[pwy][0])  # WTD
            line.append(pwy_to_wtd[pwy][1])  # WTD observed taxon
            line.append(pwy_to_wtd[pwy][2])  # WTD expected taxon
        line.append("[" + ",".join(pwy_to_orfs[pwy]) + "]")  # list of ORFs
        line = [str(x) for x in line]  # cast all fields to string
        out.write("\t".join(line) + "\n")

    try:
        out.close()
        rename(table_out_tmp, options.table_out)
    except:
        print("Had problems closing file: " + options.table_out)
        insert_error(9)

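# With options.wtd set, ExtractPathway_WTD writes rows matching the header
# above; a fabricated example line (tab-separated in the real output):
#
#   sample1  PWY-101  a pathway common name  10  8  12  0.5  Proteobacteria  Bacteria  [orf_1,orf_2]
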
def __init__(self, dbname, blastoutput, database_mapfile, refscore_file, opts, errorlogger=None):
    self.Size = 10000
    self.dbname = dbname
    self.ln2 = 0.69314718055994530941
    self.lnk = math.log(opts.k)
    self.Lambda = opts.Lambda
    self.blastoutput = blastoutput
    self.database_mapfile = database_mapfile
    self.refscore_file = refscore_file
    self.annot_map = {}
    self.i = 0
    self.opts = opts
    self.hits_counts = {}
    self.data = {}
    self.refscores = {}
    self.refBitScores = {}
    self.needToPermute = False
    self.MAX_READ_ERRORS_ALLOWED = 10
    self.ERROR_COUNT = 0
    self.STEP_NAME = 'PARSE_BLAST'
    self.error_and_warning_logger = errorlogger

    query_dictionary = {}
    try:
        create_query_dictionary(self.blastoutput, query_dictionary,
                                self.opts.algorithm, errorlogger=errorlogger)
    except:
        insert_error(5)

    try:
        self.blastoutputfile = open(self.blastoutput, 'r')
    except:
        eprintf("\nERROR : cannot open B/LAST output file " + blastoutput + " to parse " +
                ": make sure \"B/LAST\"ing was done for the particular database")
        if self.error_and_warning_logger:
            # fixed: the original applied "%" only to the last string fragment,
            # so the message was never actually formatted
            self.error_and_warning_logger.write(
                ("ERROR : cannot open B/LAST output file %s to parse : make sure "
                 "\"B/LAST\"ing was done for the particular database") % (blastoutput))
        insert_error(5)
        exit_process("Cannot open B/LAST output file " + blastoutput)

    try:
        self.create_refBitScores()
    except:
        traceback.print_exc(10)
        exit_process("Error while reading from B/LAST refscore file " + self.refscore_file)

    try:
        create_dictionary(database_mapfile, self.annot_map, query_dictionary)
        query_dictionary = {}
    except AttributeError:
        eprintf("Cannot read the map file for database : %s\n" % (dbname))
        if errorlogger != None:
            errorlogger.write("PARSE_BLAST\tERROR\tCannot read the map file %s for database : %s"
                              "\tDelete the formatted files for the database in the \"formatted\" folder\n"
                              % (database_mapfile, dbname))
        exit_process("Cannot read the map file for database " + dbname)

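# A note on the constants initialized above: ln2, lnk (= ln k) and Lambda are
# the ingredients of the standard Karlin-Altschul conversion from a raw
# alignment score S to a bit score,
#
#   S_bits = (Lambda * S - ln k) / ln 2
#
# create_refBitScores is defined elsewhere, so treating these constants as
# serving that formula is an assumption, not something this snippet proves.
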
def main(argv, errorlogger=None, runstatslogger=None):
    global parser
    (opts, args) = parser.parse_args(argv)
    if not check_arguments(opts, args):
        print(usage)
        sys.exit(0)

    results_dictionary = {}
    dbname_weight = {}

    contig_lengths = {}
    read_contig_lengths(opts.contig_map_file, contig_lengths)

    if opts.blastdir != None and opts.sample_name != None:
        try:
            database_names, input_blastouts, weight_dbs = getBlastFileNames(opts)
        except:
            insert_error(errorcode)
    else:
        database_names = opts.database_name
        input_blastouts = opts.input_blastout
        weight_dbs = opts.weight_db

    priority = 6000
    count_annotations = {}

    for dbname, blastoutput, weight in zip(database_names, input_blastouts, weight_dbs):
        results_dictionary[dbname] = {}
        dbname_weight[dbname] = weight
        count = process_parsed_blastoutput(dbname, weight, blastoutput, opts,
                                           results_dictionary[dbname])
        if runstatslogger != None:
            runstatslogger.write("%s\tProtein Annotations from %s\t%s\n"
                                 % (str(priority), dbname, str(count)))
        priority += 1

    for dbname in results_dictionary:
        print(dbname, len(results_dictionary[dbname].keys()))
        for seqname in results_dictionary[dbname]:
            count_annotations[seqname] = True
    count = len(count_annotations)
    if runstatslogger != None:
        runstatslogger.write("%s\tTotal Protein Annotations\t%s\n"
                             % (str(priority), str(count)))

    # create the annotations from the results
    create_annotation(dbname_weight, results_dictionary, opts.input_gff,
                      opts.rRNA_16S, opts.tRNA, opts.output_gff,
                      opts.output_comparative_annotation, contig_lengths,
                      sample_name=opts.sample_name,
                      compact_output=opts.compact_output)

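# The runstatslogger lines emitted above are tab-separated
# (priority, label, count) triples; a fabricated example (the database names
# are illustrative):
#
#   6000	Protein Annotations from refseq	1523
#   6001	Protein Annotations from kegg	1411
#   6002	Total Protein Annotations	1765
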
def main(argv, errorlogger=None, runstatslogger=None):
    # filtering options
    global parser
    options, args = parser.parse_args(argv)

    if not (options.gff_file or options.nucleotide_sequences or options.output_amino
            or options.output_nuc or options.output_gff):
        insert_error(errorcode)
        return (1, '')

    if not options.gff_file:
        parser.error('No gff files are specified')
        insert_error(errorcode)
        return (1, '')

    if not options.nucleotide_sequences:
        parser.error('Nucleotide sequences file must be specified')
        insert_error(errorcode)
        return (1, '')

    if not options.output_amino:
        parser.error('Output amino acid file must be specified')
        insert_error(errorcode)
        return (1, '')

    if not options.output_nuc:
        parser.error('Output nucleotide sequences file must be specified')
        insert_error(errorcode)
        return (1, '')

    if not options.output_gff:
        parser.error('Output gff file must be specified')
        insert_error(errorcode)
        return (1, '')

    if not path.exists(options.gff_file):
        print("gff file does not exist")
        insert_error(errorcode)
        return (1, '')

    if not path.exists(options.nucleotide_sequences):
        print("nucleotide sequences file does not exist")
        insert_error(errorcode)
        return (1, '')

    nucleotide_seq_dict = {}
    process_sequence_file(options.nucleotide_sequences, nucleotide_seq_dict)
    process_gff_file(options.gff_file, options.output_amino, options.output_nuc,
                     options.output_gff, nucleotide_seq_dict)
