def annotate_gtf_parallel(input_gtf_file, output_gtf_file, gtf_sample_attr,
                          num_processors, tmp_dir):
    """Annotate a GTF file using a pool of worker processes.

    input_gtf_file: path of GTF file to annotate
    output_gtf_file: path where the merged, sorted annotated GTF is written
    gtf_sample_attr: attribute name forwarded to each annotate_gtf_worker
    num_processors: number of worker processes to spawn
    tmp_dir: directory for intermediate per-worker GTF files
    """
    # bounded queue keeps the parser from running arbitrarily far
    # ahead of the workers
    input_queue = JoinableQueue(maxsize=num_processors * 3)
    # start worker processes, each writing its own temporary GTF file
    procs = []
    worker_gtf_files = []
    for i in xrange(num_processors):
        worker_gtf_file = os.path.join(tmp_dir, "annotate_worker%03d.gtf" % (i))
        worker_gtf_files.append(worker_gtf_file)
        args = (input_queue, worker_gtf_file, gtf_sample_attr)
        p = Process(target=annotate_gtf_worker, args=args)
        p.daemon = True
        p.start()
        procs.append(p)
    # feed loci to the workers; 'with' guarantees the input file handle
    # is closed (the original left it open)
    with open(input_gtf_file) as fileh:
        for lines in parse_loci(fileh):
            input_queue.put(lines)
    # one empty sentinel per worker signals shutdown
    for p in procs:
        input_queue.put([])
    # wait until every queued locus has been processed, then close queue
    input_queue.join()
    input_queue.close()
    # join worker processes
    for p in procs:
        p.join()
    # merge/sort worker gtf files into the final output
    logging.debug("Merging %d worker GTF file(s)" % (num_processors))
    merge_sort_gtf_files(worker_gtf_files, output_gtf_file, tmp_dir=tmp_dir)
    # remove worker gtf files
    for filename in worker_gtf_files:
        if os.path.exists(filename):
            os.remove(filename)
def annotate_gtf(gtf_file, bed_dbs):
    """Annotate GTF features with overlap hits from reference BED databases.

    gtf_file: path of GTF file to annotate
    bed_dbs: iterable of (name, filename) pairs; each BED file is loaded
        into per-seqid interval trees, and each db name becomes a GTF
        attribute holding the comma-separated matching entries
    Annotated features are written to stdout.
    """
    # load each reference BED database as a (name, {seqid: tree}) pair
    bed_trees = []
    for name, filename in bed_dbs:
        logging.debug("Loading BED db '%s' file '%s'" % (name, filename))
        trees = build_interval_tree_from_bed(filename)
        bed_trees.append((name, trees))
    # parse gtf file locus-by-locus and annotate; 'with' guarantees the
    # file handle is closed (the original left it open)
    logging.debug("Annotating GTF")
    with open(gtf_file) as fileh:
        for lines in parse_loci(fileh):
            features = []
            transcripts = []
            # transcript_id -> db name -> set of matching BED values
            transcript_matches = collections.defaultdict(
                lambda: collections.defaultdict(lambda: set()))
            for line in lines:
                f = GTFFeature.from_string(line)
                features.append(f)
                t_id = f.attrs['transcript_id']
                if f.feature_type == 'transcript':
                    transcripts.append(f)
                elif f.feature_type == 'exon':
                    for dbname, dbtrees in bed_trees:
                        # intersect this exon with same-strand db features
                        # NOTE(review): assumes every GTF seqid has a tree
                        # in each BED db; a missing seqid raises KeyError
                        # here -- confirm upstream guarantees this
                        hits = dbtrees[f.seqid].find(f.start, f.end)
                        matches = set(hit.value for hit in hits
                                      if hit.strand == f.strand)
                        f.attrs[dbname] = ','.join(sorted(matches))
                        # accumulate matches at the transcript level
                        transcript_matches[t_id][dbname].update(matches)
            # a transcript's annotation is the union of its exons' matches
            for f in transcripts:
                t_id = f.attrs['transcript_id']
                for dbname, dbtrees in bed_trees:
                    matches = transcript_matches[t_id][dbname]
                    f.attrs[dbname] = ','.join(sorted(matches))
            # write annotated features to stdout
            for f in features:
                print(str(f))
    logging.debug("Done")
def annotate_gtf_parallel(input_gtf_file, output_gtf_file, gtf_sample_attr,
                          num_processors, tmp_dir):
    """Annotate a GTF file using a pool of worker processes.

    input_gtf_file: path of GTF file to annotate
    output_gtf_file: path where the merged, sorted annotated GTF is written
    gtf_sample_attr: attribute name forwarded to each annotate_gtf_worker
    num_processors: number of worker processes to spawn
    tmp_dir: directory for intermediate per-worker GTF files
    """
    # bounded queue prevents the parser from outrunning the workers
    input_queue = JoinableQueue(maxsize=num_processors * 3)
    # launch one daemon worker per processor, each with its own temp file
    procs = []
    worker_gtf_files = []
    for i in xrange(num_processors):
        worker_gtf_file = os.path.join(tmp_dir, "annotate_worker%03d.gtf" % (i))
        worker_gtf_files.append(worker_gtf_file)
        args = (input_queue, worker_gtf_file, gtf_sample_attr)
        p = Process(target=annotate_gtf_worker, args=args)
        p.daemon = True
        p.start()
        procs.append(p)
    # stream loci into the queue; 'with' closes the input handle, which
    # the original code leaked
    with open(input_gtf_file) as fileh:
        for lines in parse_loci(fileh):
            input_queue.put(lines)
    # an empty list is the per-worker shutdown sentinel
    for p in procs:
        input_queue.put([])
    # drain the queue, close it, then reap the workers
    input_queue.join()
    input_queue.close()
    for p in procs:
        p.join()
    # merge/sort per-worker output into the final GTF
    logging.debug("Merging %d worker GTF file(s)" % (num_processors))
    merge_sort_gtf_files(worker_gtf_files, output_gtf_file, tmp_dir=tmp_dir)
    # delete intermediate worker files
    for filename in worker_gtf_files:
        if os.path.exists(filename):
            os.remove(filename)
def run_parallel(config):
    """Run assembly in parallel and merge output from child processes.

    config: RunConfig object (supplies output_dir, num_processors,
        gtf_input_file, and the create_gtf/create_bed/create_bedgraph
        output flags)
    Returns 0 on completion.
    """
    # create temp directory for all worker output
    tmp_dir = os.path.join(config.output_dir, "tmp")
    if not os.path.exists(tmp_dir):
        logging.debug("Creating tmp directory '%s'" % (tmp_dir))
        os.makedirs(tmp_dir)
    # bounded queue keeps the parser from outrunning the workers
    input_queue = JoinableQueue(maxsize=config.num_processors * 3)
    # shared counters so workers assign globally unique identifiers
    locus_id_value_obj = LockValue(1)
    gene_id_value_obj = LockValue(1)
    tss_id_value_obj = LockValue(1)
    t_id_value_obj = LockValue(1)
    # start worker processes
    procs = []
    worker_prefixes = []
    for i in xrange(config.num_processors):
        worker_prefix = os.path.join(tmp_dir, "worker%03d" % (i))
        worker_prefixes.append(worker_prefix)
        args = (input_queue, locus_id_value_obj, gene_id_value_obj,
                tss_id_value_obj, t_id_value_obj, worker_prefix, config)
        p = Process(target=assembly_worker, args=args)
        p.daemon = True
        p.start()
        procs.append(p)
    # parse gtf file and feed loci to workers; 'with' guarantees the
    # input handle is closed (the original left it open)
    with open(config.gtf_input_file) as fileh:
        for lines in parse_loci(fileh):
            input_queue.put(lines)
    # one empty sentinel per worker signals shutdown
    for p in procs:
        input_queue.put([])
    # wait for all queued loci to be processed, then close the queue
    input_queue.join()
    input_queue.close()
    # join worker processes
    for p in procs:
        p.join()
    # merge gtf files
    if config.create_gtf:
        logging.info("Merging %d worker GTF files" % (config.num_processors))
        worker_gtf_files = [prefix + ".gtf" for prefix in worker_prefixes]
        output_gtf_file = os.path.join(config.output_dir, "assembly.gtf")
        merge_sort_gtf_files(worker_gtf_files, output_gtf_file,
                             tmp_dir=tmp_dir)
        # remove worker gtf files
        for filename in worker_gtf_files:
            if os.path.exists(filename):
                os.remove(filename)
    # merge bed files
    if config.create_bed:
        logging.info("Merging %d worker BED files" % (config.num_processors))
        worker_bed_files = [prefix + ".bed" for prefix in worker_prefixes]
        output_bed_file = os.path.join(config.output_dir, "assembly.bed")
        merge_sort_files(worker_bed_files, output_bed_file,
                         sort_func=sort_bed, tmp_dir=tmp_dir)
        # write bed file track description line; write() with an explicit
        # newline is equivalent to 'print >>' and 'with' guarantees the
        # handle is closed even on error
        track_name = os.path.basename(config.output_dir)
        track_line = ' '.join(['track name="%s"' % (track_name),
                               'description="%s"' % (track_name),
                               'visibility=pack',
                               'useScore=1'])
        track_file = os.path.join(config.output_dir, "assembly.bed.ucsc_track")
        with open(track_file, "w") as fileh:
            fileh.write(track_line + "\n")
    # merge bedgraph files, one per strand (0..2 index STRAND_NAMES)
    if config.create_bedgraph:
        logging.info("Merging %d worker bedGraph files" %
                     (config.num_processors))
        for strand in xrange(0, 3):
            strand_name = STRAND_NAMES[strand]
            bgfiles = ['%s_%s.bedgraph' % (prefix, strand_name)
                       for prefix in worker_prefixes]
            output_file = os.path.join(config.output_dir,
                                       "assembly_%s.bedgraph" % strand_name)
            merge_sort_files(bgfiles, output_file, sort_func=sort_bed,
                             tmp_dir=tmp_dir)
            # write UCSC track description line for this strand
            track_name = '%s_%s' % (os.path.basename(config.output_dir),
                                    strand_name)
            track_line = ' '.join(['track type=bedGraph',
                                   'name="%s"' % (track_name),
                                   'description="%s"' % (track_name),
                                   'visibility=full',
                                   'color=%s' % (STRAND_COLORS[strand]),
                                   'autoScale=on',
                                   'alwaysZero=on',
                                   'maxHeightPixels=64:64:11'])
            track_file = os.path.join(
                config.output_dir,
                "assembly_%s.bedgraph.ucsc_track" % strand_name)
            with open(track_file, "w") as fileh:
                fileh.write(track_line + "\n")
    # cleanup: intermediate worker bed/bedgraph files live in tmp_dir,
    # so removing the tree removes them too
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    logging.info("Done")
    return 0
def run_parallel(config):
    """Run assembly in parallel and merge output from child processes.

    config: RunConfig object (supplies output_dir, num_processors,
        gtf_input_file, and the create_gtf/create_bed/create_bedgraph
        output flags)
    Returns 0 on completion.
    """
    # create temp directory for all worker output
    tmp_dir = os.path.join(config.output_dir, "tmp")
    if not os.path.exists(tmp_dir):
        logging.debug("Creating tmp directory '%s'" % (tmp_dir))
        os.makedirs(tmp_dir)
    # bounded queue prevents the parser from outrunning the workers
    input_queue = JoinableQueue(maxsize=config.num_processors * 3)
    # shared counters so workers assign globally unique identifiers
    locus_id_value_obj = LockValue(1)
    gene_id_value_obj = LockValue(1)
    tss_id_value_obj = LockValue(1)
    t_id_value_obj = LockValue(1)
    # launch one daemon worker per processor
    procs = []
    worker_prefixes = []
    for i in xrange(config.num_processors):
        worker_prefix = os.path.join(tmp_dir, "worker%03d" % (i))
        worker_prefixes.append(worker_prefix)
        args = (
            input_queue,
            locus_id_value_obj,
            gene_id_value_obj,
            tss_id_value_obj,
            t_id_value_obj,
            worker_prefix,
            config,
        )
        p = Process(target=assembly_worker, args=args)
        p.daemon = True
        p.start()
        procs.append(p)
    # stream loci into the queue; 'with' closes the input handle, which
    # the original code leaked
    with open(config.gtf_input_file) as fileh:
        for lines in parse_loci(fileh):
            input_queue.put(lines)
    # an empty list is the per-worker shutdown sentinel
    for p in procs:
        input_queue.put([])
    # drain the queue, close it, then reap the workers
    input_queue.join()
    input_queue.close()
    for p in procs:
        p.join()
    # merge gtf files
    if config.create_gtf:
        logging.info("Merging %d worker GTF files" % (config.num_processors))
        worker_gtf_files = [prefix + ".gtf" for prefix in worker_prefixes]
        output_gtf_file = os.path.join(config.output_dir, "assembly.gtf")
        merge_sort_gtf_files(worker_gtf_files, output_gtf_file,
                             tmp_dir=tmp_dir)
        # delete intermediate worker gtf files
        for filename in worker_gtf_files:
            if os.path.exists(filename):
                os.remove(filename)
    # merge bed files
    if config.create_bed:
        logging.info("Merging %d worker BED files" % (config.num_processors))
        worker_bed_files = [prefix + ".bed" for prefix in worker_prefixes]
        output_bed_file = os.path.join(config.output_dir, "assembly.bed")
        merge_sort_files(worker_bed_files, output_bed_file,
                         sort_func=sort_bed, tmp_dir=tmp_dir)
        # write bed file track description line; write() with an explicit
        # newline matches what 'print >>' emitted, and 'with' guarantees
        # the handle closes even on error
        track_name = os.path.basename(config.output_dir)
        track_line = " ".join(['track name="%s"' % (track_name),
                               'description="%s"' % (track_name),
                               "visibility=pack",
                               "useScore=1"])
        track_file = os.path.join(config.output_dir, "assembly.bed.ucsc_track")
        with open(track_file, "w") as fileh:
            fileh.write(track_line + "\n")
    # merge bedgraph files, one per strand (0..2 index STRAND_NAMES)
    if config.create_bedgraph:
        logging.info("Merging %d worker bedGraph files" %
                     (config.num_processors))
        for strand in xrange(0, 3):
            strand_name = STRAND_NAMES[strand]
            bgfiles = ["%s_%s.bedgraph" % (prefix, strand_name)
                       for prefix in worker_prefixes]
            output_file = os.path.join(config.output_dir,
                                       "assembly_%s.bedgraph" % strand_name)
            merge_sort_files(bgfiles, output_file, sort_func=sort_bed,
                             tmp_dir=tmp_dir)
            # write UCSC track description line for this strand
            track_name = "%s_%s" % (os.path.basename(config.output_dir),
                                    strand_name)
            track_line = " ".join([
                "track type=bedGraph",
                'name="%s"' % (track_name),
                'description="%s"' % (track_name),
                "visibility=full",
                "color=%s" % (STRAND_COLORS[strand]),
                "autoScale=on",
                "alwaysZero=on",
                "maxHeightPixels=64:64:11",
            ])
            track_file = os.path.join(
                config.output_dir,
                "assembly_%s.bedgraph.ucsc_track" % strand_name)
            with open(track_file, "w") as fileh:
                fileh.write(track_line + "\n")
    # cleanup: intermediate worker bed/bedgraph files live in tmp_dir,
    # so removing the tree removes them too
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    logging.info("Done")
    return 0