import os
import shutil
import logging
from multiprocessing import Process, JoinableQueue


def annotate_gtf_parallel(input_gtf_file, output_gtf_file, gtf_sample_attr,
                          num_processors, tmp_dir):
    # create queue
    input_queue = JoinableQueue(maxsize=num_processors * 3)
    # start worker processes
    procs = []
    worker_gtf_files = []
    for i in xrange(num_processors):
        worker_gtf_file = os.path.join(tmp_dir, "annotate_worker%03d.gtf" % (i))
        worker_gtf_files.append(worker_gtf_file)
        args = (input_queue, worker_gtf_file, gtf_sample_attr)
        p = Process(target=annotate_gtf_worker, args=args)
        p.daemon = True
        p.start()
        procs.append(p)
    # feed loci to the workers
    for lines in parse_loci(open(input_gtf_file)):
        input_queue.put(lines)
    # stop workers: one empty-list sentinel per worker
    for p in procs:
        input_queue.put([])
    # wait for all queued work to finish, then close the queue
    input_queue.join()
    input_queue.close()
    # join worker processes
    for p in procs:
        p.join()
    # merge/sort worker gtf files
    logging.debug("Merging %d worker GTF file(s)" % (num_processors))
    merge_sort_gtf_files(worker_gtf_files, output_gtf_file, tmp_dir=tmp_dir)
    # remove worker gtf files
    for filename in worker_gtf_files:
        if os.path.exists(filename):
            os.remove(filename)
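
# annotate_gtf_worker is defined elsewhere and is not shown in this
# excerpt. The sketch below (hypothetical name _queue_worker_sketch, and
# a pass-through write in place of the real annotation logic) only
# illustrates the JoinableQueue contract the workers must honor: balance
# every get() with task_done() so input_queue.join() can return, and
# treat an empty list as the shutdown sentinel.
def _queue_worker_sketch(input_queue, worker_gtf_file):
    fileh = open(worker_gtf_file, "w")
    while True:
        lines = input_queue.get()
        try:
            if len(lines) == 0:
                # an empty list is the shutdown sentinel
                break
            for line in lines:
                # real workers annotate each feature before writing it
                print >>fileh, line.rstrip()
        finally:
            # acknowledge the item (sentinel included) so join() returns
            input_queue.task_done()
    fileh.close()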

def filter_transcripts(classify_dir, min_prec, min_rec, min_spec,
                       opt_variable, tmp_dir):
    # setup input and output files
    lib_counts_file = os.path.join(classify_dir, LIB_COUNTS_FILE)
    lib_counts_list = list(LibCounts.from_file(lib_counts_file))
    # filter each category
    expr_gtf_files = []
    bkgd_gtf_files = []
    for category_key in CATEGORIES:
        category_str = category_int_to_str[category_key]
        category_lib_counts = [x for x in lib_counts_list
                               if x.category_counts[category_key] > 0]
        library_ids = [x.library_id for x in category_lib_counts]
        cinfo = CategoryInfo.create(library_ids, category_key,
                                    category_str, classify_dir)
        filter_category(category_lib_counts, cinfo, min_prec, min_rec,
                        min_spec, opt_variable, tmp_dir)
        expr_gtf_files.append(cinfo.unann_expr_gtf_file)
        bkgd_gtf_files.append(cinfo.unann_bkgd_gtf_file)
    # only need one set of annotated gtf files
    expr_gtf_files.extend([cinfo.ann_expr_gtf_file, cinfo.ann_bkgd_gtf_file])
    # merge transcripts
    logging.info("Merging filtered transcripts")
    expr_gtf_file = os.path.join(classify_dir, EXPR_GTF_FILE)
    bkgd_gtf_file = os.path.join(classify_dir, BKGD_GTF_FILE)
    merge_sort_gtf_files(expr_gtf_files, expr_gtf_file, tmp_dir=tmp_dir)
    merge_sort_gtf_files(bkgd_gtf_files, bkgd_gtf_file, tmp_dir=tmp_dir)
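
# merge_sort_gtf_files is a project helper not shown in this excerpt. A
# minimal sketch of the behavior its callers rely on, assuming GTF
# records sort by seqname then start coordinate; the name
# _merge_sort_gtf_sketch and the in-memory sort are illustrative only
# (the real helper presumably spills to tmp_dir for large inputs).
def _merge_sort_gtf_sketch(input_files, output_file):
    records = []
    for path in input_files:
        for line in open(path):
            fields = line.rstrip('\n').split('\t')
            # sort key: seqname (column 1) then start coordinate (column 4)
            records.append((fields[0], int(fields[3]), line))
    records.sort()
    fileh = open(output_file, "w")
    for chrom, start, line in records:
        fileh.write(line)
    fileh.close()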

def merge_transcripts(results):
    # read library category statistics
    stats_list = list(CategoryStats.from_file(results.category_stats_file))
    library_ids = []
    expressed_gtf_files = []
    background_gtf_files = []
    for statsobj in stats_list:
        library_id = statsobj.library_id
        library_ids.append(library_id)
        prefix = os.path.join(results.classify_dir, library_id)
        expressed_gtf_files.append(prefix + ".expr.gtf")
        background_gtf_files.append(prefix + ".bkgd.gtf")
    # map library ids to library names
    library_id_map = {}
    for line in open(results.library_id_map):
        fields = line.strip().split('\t')
        library_id_map[fields[0]] = fields[1]
    # make a classification report
    logging.info("Writing classification report")
    fileh = open(results.classify_report_file, 'w')
    header_fields = ["library_id", "library_name", "category",
                     "train.auc", "test.auc", "train.cutoff",
                     "train.tp", "train.fp", "train.fn", "train.tn",
                     "train.sens", "train.spec", "train.balacc",
                     "test.tp", "test.fp", "test.fn", "test.tn",
                     "test.sens", "test.spec", "test.balacc"]
    print >> fileh, '\t'.join(header_fields)
    for library_id in library_ids:
        prefix = os.path.join(results.classify_dir, library_id)
        library_name = library_id_map[library_id]
        # append per-library performance rows for each category
        for category in ("intergenic", "intronic"):
            perf_file = prefix + ".%s.perf.txt" % (category)
            input_fileh = open(perf_file)
            # skip header line
            input_fileh.next()
            for line in input_fileh:
                fields = ([library_id, library_name, category] +
                          line.strip().split('\t'))
                print >> fileh, '\t'.join(fields)
            input_fileh.close()
    fileh.close()
    # add reference gtf file to both sets
    expressed_gtf_files.append(results.ref_gtf_file)
    background_gtf_files.append(results.ref_gtf_file)
    # merge/sort gtf files
    logging.info("Merging and sorting expressed GTF files")
    merge_sort_gtf_files(expressed_gtf_files, results.expressed_gtf_file,
                         tmp_dir=results.tmp_dir)
    logging.info("Merging and sorting background GTF files")
    merge_sort_gtf_files(background_gtf_files, results.background_gtf_file,
                         tmp_dir=results.tmp_dir)
    return 0
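
# A small illustrative reader for the tab-delimited report written
# above; the helper name is hypothetical. Each row comes back as a dict
# keyed by the header fields, e.g. row["test.balacc"].
def _read_classify_report(report_file):
    fileh = open(report_file)
    header = fileh.next().strip().split('\t')
    for line in fileh:
        yield dict(zip(header, line.strip().split('\t')))
    fileh.close()
# usage: [row["test.balacc"] for row in _read_classify_report(path)]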

def run_parallel(config):
    """
    runs assembly in parallel and merges output from child processes

    config: RunConfig object
    """
    # create temp directory
    tmp_dir = os.path.join(config.output_dir, "tmp")
    if not os.path.exists(tmp_dir):
        logging.debug("Creating tmp directory '%s'" % (tmp_dir))
        os.makedirs(tmp_dir)
    # create queue
    input_queue = JoinableQueue(maxsize=config.num_processors * 3)
    # shared memory values
    locus_id_value_obj = LockValue(1)
    gene_id_value_obj = LockValue(1)
    tss_id_value_obj = LockValue(1)
    t_id_value_obj = LockValue(1)
    # start worker processes
    procs = []
    worker_prefixes = []
    for i in xrange(config.num_processors):
        worker_prefix = os.path.join(tmp_dir, "worker%03d" % (i))
        worker_prefixes.append(worker_prefix)
        args = (input_queue, locus_id_value_obj, gene_id_value_obj,
                tss_id_value_obj, t_id_value_obj, worker_prefix, config)
        p = Process(target=assembly_worker, args=args)
        p.daemon = True
        p.start()
        procs.append(p)
    # parse gtf file
    for lines in parse_loci(open(config.gtf_input_file)):
        input_queue.put(lines)
    # stop workers
    for p in procs:
        input_queue.put([])
    # close queue
    input_queue.join()
    input_queue.close()
    # join worker processes
    for p in procs:
        p.join()
    # merge gtf files
    if config.create_gtf:
        logging.info("Merging %d worker GTF files" % (config.num_processors))
        worker_gtf_files = [prefix + ".gtf" for prefix in worker_prefixes]
        output_gtf_file = os.path.join(config.output_dir, "assembly.gtf")
        merge_sort_gtf_files(worker_gtf_files, output_gtf_file,
                             tmp_dir=tmp_dir)
        # remove worker gtf files
        for filename in worker_gtf_files:
            if os.path.exists(filename):
                os.remove(filename)
    # merge bed files
    if config.create_bed:
        logging.info("Merging %d worker BED files" % (config.num_processors))
        worker_bed_files = [p + ".bed" for p in worker_prefixes]
        output_bed_file = os.path.join(config.output_dir, "assembly.bed")
        merge_sort_files(worker_bed_files, output_bed_file,
                         sort_func=sort_bed, tmp_dir=tmp_dir)
        # write bed file track description line
        track_name = os.path.basename(config.output_dir)
        track_line = ' '.join(['track name="%s"' % (track_name),
                               'description="%s"' % (track_name),
                               'visibility=pack',
                               'useScore=1'])
        track_file = os.path.join(config.output_dir,
                                  "assembly.bed.ucsc_track")
        fileh = open(track_file, "w")
        print >>fileh, track_line
        fileh.close()
    # merge bedgraph files
    if config.create_bedgraph:
        logging.info("Merging %d worker bedGraph files" %
                     (config.num_processors))
        for strand in xrange(0, 3):
            strand_name = STRAND_NAMES[strand]
            bgfiles = ['%s_%s.bedgraph' % (p, strand_name)
                       for p in worker_prefixes]
            output_file = os.path.join(config.output_dir,
                                       "assembly_%s.bedgraph" % strand_name)
            merge_sort_files(bgfiles, output_file, sort_func=sort_bed,
                             tmp_dir=tmp_dir)
            track_name = '%s_%s' % (os.path.basename(config.output_dir),
                                    strand_name)
            track_line = ' '.join(['track type=bedGraph',
                                   'name="%s"' % (track_name),
                                   'description="%s"' % (track_name),
                                   'visibility=full',
                                   'color=%s' % (STRAND_COLORS[strand]),
                                   'autoScale=on',
                                   'alwaysZero=on',
                                   'maxHeightPixels=64:64:11'])
            track_file = os.path.join(config.output_dir,
                                      "assembly_%s.bedgraph.ucsc_track" %
                                      strand_name)
            fileh = open(track_file, "w")
            print >>fileh, track_line
            fileh.close()
    # cleanup
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    logging.info("Done")
    return 0
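
# LockValue is defined elsewhere in the package; the only contract
# visible here is that LockValue(1) creates a counter starting at 1,
# shared across worker processes so that locus/gene/tss/transcript ids
# stay globally unique. A minimal sketch of such a process-safe counter;
# the class name and the next() method are assumed interfaces.
import multiprocessing

class _LockValueSketch(object):
    def __init__(self, initval=0):
        # 'L' = unsigned long, allocated in shared memory
        self.val = multiprocessing.Value('L', initval)

    def next(self):
        # atomically return the current id and advance the counter
        with self.val.get_lock():
            v = self.val.value
            self.val.value = v + 1
            return v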