import argparse
import logging
import os
import shutil
from collections import Counter
from multiprocessing import JoinableQueue, Process

# FastaFile is assumed to be pysam's indexed-fasta reader. The remaining names
# used below (Transfrag, Strand, Sample, Results, TacoError, aggregate_sample,
# dna_reverse_complement, sort_bed, merge_bed, merge_files) are TACO-internal
# and imported elsewhere in the package.
from pysam import FastaFile


def main():
    logging.basicConfig(level=logging.DEBUG)
    parser = argparse.ArgumentParser()
    parser.add_argument('genome_fasta_file')
    parser.add_argument('bed_file')
    args = parser.parse_args()
    # check args
    if not os.path.exists(args.genome_fasta_file):
        parser.error('genome fasta file %s not found' %
                     args.genome_fasta_file)
    if not os.path.exists(args.bed_file):
        parser.error('bed file %s not found' % args.bed_file)
    logging.info('genome fasta file: %s' % args.genome_fasta_file)
    logging.info('bed file: %s' % args.bed_file)
    # process bed file to get junctions
    logging.info('Reading Junctions')
    splice_juncs = set()
    fasta_fh = FastaFile(args.genome_fasta_file)
    with open(args.bed_file) as bed_fh:
        for line in bed_fh:
            t = Transfrag.from_bed(line)
            # skip transfrags on contigs absent from the fasta index
            if t.chrom not in fasta_fh:
                continue
            for start, end in t.iterintrons():
                splice_juncs.add((t.chrom, start, end, t.strand))
    logging.info('Read %d Junctions' % (len(splice_juncs)))
    logging.info('Profiling Splice Motifs')
    motif_counter = Counter()
    for chrom, start, end, strand in splice_juncs:
        # donor dinucleotide (first two intron bases) plus acceptor
        # dinucleotide (last two intron bases)
        s = fasta_fh.fetch(chrom, start, start + 2)
        s += fasta_fh.fetch(chrom, end - 2, end)
        if strand == Strand.NEG:
            s = dna_reverse_complement(s)
        motif_counter[s] += 1
    fasta_fh.close()
    # report statistics
    total = sum(motif_counter.values())
    print '\t'.join(['motif', 'count', 'frac'])
    for motif, count in motif_counter.most_common():
        print '\t'.join([motif, str(count), str(float(count) / total)])
    logging.info('Done')
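# For illustration only: a minimal reverse-complement sketch, assuming the
# project-internal dna_reverse_complement behaves like a standard DNA reverse
# complement. The underscored name is hypothetical and avoids clashing with
# the real helper; this is not TACO's implementation.
import string

_RC_TABLE = string.maketrans('ACGTNacgtn', 'TGCANtgcan')


def _dna_reverse_complement_sketch(seq):
    # complement each base, then reverse; e.g. 'CTAC' -> 'GTAG', so a
    # minus-strand junction sampled as 'CTAC' on the plus strand is
    # reported as the canonical 'GTAG' motif
    return seq.translate(_RC_TABLE)[::-1]


if __name__ == '__main__':
    main()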
def aggregate_parallel(samples, args, results):
    '''
    Process and aggregate GTF input files

    samples: list of Sample objects
    args: from Argparse module. command-line arguments to configure the
          assembly process
    results: Results object containing input and output filenames
    '''
    logging.info('Aggregating in parallel using %d processes' %
                 (args.num_processes))

    if args.filter_splice_juncs and args.ref_genome_fasta_file:
        # test opening FastaFile
        logging.info('Indexing reference genome fasta file (if necessary)')
        fasta_fh = FastaFile(args.ref_genome_fasta_file)
        fasta_fh.close()

    # create queue
    input_queue = JoinableQueue(maxsize=args.num_processes * 2)
    # start worker processes
    procs = []
    worker_results = []
    for i in xrange(args.num_processes):
        worker_id = 'aggregate_worker%03d' % i
        worker_dir = os.path.join(results.tmp_dir, worker_id)
        if not os.path.exists(worker_dir):
            os.makedirs(worker_dir)
        worker_results.append(Results(worker_dir))
        p = Process(target=aggregate_worker,
                    args=(input_queue, args, worker_dir))
        p.start()
        procs.append(p)
    # reference gtf
    if args.ref_gtf_file is not None:
        logging.debug('Reference: %s' % args.ref_gtf_file)
        input_queue.put(Sample(args.ref_gtf_file, Sample.REF_ID))
    # parse samples
    for sample in samples:
        input_queue.put(sample)
    # one 'None' sentinel per worker signals the end of the input
    for p in procs:
        input_queue.put(None)
    # close input queue
    input_queue.join()
    input_queue.close()
    # join worker processes
    for p in procs:
        p.join()

    # merge output files
    logging.info('Merging aggregated files')
    logging.debug('\tmerging bed files')
    retcode = merge_bed(
        input_files=[r.transfrags_bed_file for r in worker_results],
        output_file=results.transfrags_bed_file,
        num_processes=args.num_processes,
        tmp_dir=results.tmp_dir)
    if retcode != 0:
        raise TacoError('Error running linux merge')

    logging.debug('\tmerging filtered bed files')
    retcode = merge_bed(
        input_files=[r.transfrags_filtered_bed_file for r in worker_results],
        output_file=results.transfrags_filtered_bed_file,
        num_processes=args.num_processes,
        tmp_dir=results.tmp_dir)
    if retcode != 0:
        raise TacoError('Error running linux merge')

    logging.debug('\tmerging sample stats')

    def sort_key_field0(line):
        fields = line.split('\t', 1)
        return fields[0]

    stats_header = ['sample_id', 'num_transfrags', 'filtered_length',
                    'filtered_expr', 'filtered_splice\n']
    stats_header = '\t'.join(stats_header)
    merge_files(input_files=[r.sample_stats_file for r in worker_results],
                output_file=results.sample_stats_file,
                key=sort_key_field0,
                header=stats_header)

    # cleanup worker data
    logging.info('Removing temporary files')

    def shutil_error_callback(func, path, excinfo):
        logging.error('Error removing tmp files path=%s message=%s' %
                      (path, excinfo))
    for r in worker_results:
        shutil.rmtree(r.output_dir, onerror=shutil_error_callback)

    logging.info('Aggregate done')
    return 0
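# merge_bed and merge_files above are project-internal helpers. Purely as an
# illustration of the contract assumed here (interleaving per-worker files
# that are each already sorted, with an optional header line), a k-way merge
# can be built on heapq.merge. Python 2's heapq.merge takes no 'key'
# argument, so each line is decorated with its sort key first. The function
# name is hypothetical; this is a sketch, not TACO's code.
import heapq


def _merge_sorted_files_sketch(input_files, output_file, key, header=None):
    def keyed_lines(path):
        # decorate each line with its sort key so tuples compare correctly
        with open(path) as fh:
            for line in fh:
                yield key(line), line

    with open(output_file, 'w') as out_fh:
        if header is not None:
            out_fh.write(header)
        # heapq.merge lazily interleaves the pre-sorted iterators
        for _, line in heapq.merge(*[keyed_lines(f) for f in input_files]):
            out_fh.write(line)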
def aggregate_worker(input_queue, args, output_dir):
    results = Results(output_dir)
    # create temp directories
    tmp_dir = os.path.join(results.output_dir, 'tmp')
    if not os.path.exists(tmp_dir):
        logging.debug('\tcreating tmp dir %s' % (tmp_dir))
        os.makedirs(tmp_dir)
    # create set of unsorted results
    tmp_results = Results(tmp_dir)
    # setup genome fasta file
    genome_fasta_fh = None
    if args.filter_splice_juncs and args.ref_genome_fasta_file:
        genome_fasta_fh = FastaFile(args.ref_genome_fasta_file)
    # setup output files
    bed_fh = open(tmp_results.transfrags_bed_file, 'w')
    filtered_bed_fh = open(tmp_results.transfrags_filtered_bed_file, 'w')
    stats_fh = open(results.sample_stats_file, 'w')
    # process samples via input queue
    while True:
        sample = input_queue.get()
        if sample is None:
            break
        aggregate_sample(sample,
                         gtf_expr_attr=args.gtf_expr_attr,
                         is_ref=(sample._id == Sample.REF_ID),
                         min_length=args.filter_min_length,
                         min_expr=args.filter_min_expr,
                         filter_splice_juncs=args.filter_splice_juncs,
                         add_splice_motif=args.add_splice_motif,
                         genome_fasta_fh=genome_fasta_fh,
                         bed_fh=bed_fh,
                         filtered_bed_fh=filtered_bed_fh,
                         stats_fh=stats_fh)
        input_queue.task_done()
    # acknowledge the 'None' sentinel so the parent's queue.join() can return
    input_queue.task_done()
    # cleanup and close files
    bed_fh.close()
    filtered_bed_fh.close()
    stats_fh.close()
    if genome_fasta_fh:
        genome_fasta_fh.close()
    # sort output files
    logging.debug('Sorting aggregated files: "%s"' % (output_dir))
    # sort bed file
    logging.debug('\ttransfrags bed file: %s' % (results.output_dir))
    retcode = sort_bed(tmp_results.transfrags_bed_file,
                       results.transfrags_bed_file,
                       num_processes=1,
                       tmp_dir=tmp_dir)
    if retcode != 0:
        raise TacoError('Error running linux sort')
    os.remove(tmp_results.transfrags_bed_file)
    # sort filtered bed file
    logging.debug('\tfiltered transfrags bed file: %s' %
                  (results.output_dir))
    retcode = sort_bed(tmp_results.transfrags_filtered_bed_file,
                       results.transfrags_filtered_bed_file,
                       num_processes=1,
                       tmp_dir=tmp_dir)
    # verify the sort succeeded before removing the unsorted input
    if retcode != 0:
        raise TacoError('Error running linux sort')
    os.remove(tmp_results.transfrags_filtered_bed_file)
    # remove temporary directories
    logging.debug('\t%s cleaning up' % (results.output_dir))
    os.rmdir(tmp_dir)
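# aggregate_parallel and aggregate_worker together follow a standard
# multiprocessing producer/consumer pattern: the parent puts work items on a
# JoinableQueue, appends one None sentinel per worker, and blocks in
# queue.join() until every get() has been matched by a task_done(), sentinels
# included. A minimal standalone sketch of the same pattern (hypothetical
# names, not TACO code):


def _echo_worker_sketch(queue):
    while True:
        item = queue.get()
        if item is None:
            # acknowledge the sentinel so queue.join() can return
            queue.task_done()
            break
        print 'processing %s' % item
        queue.task_done()


def _run_pattern_sketch(num_workers=2):
    queue = JoinableQueue(maxsize=num_workers * 2)
    workers = [Process(target=_echo_worker_sketch, args=(queue,))
               for _ in xrange(num_workers)]
    for w in workers:
        w.start()
    for item in ('a', 'b', 'c'):
        queue.put(item)
    for _ in workers:
        queue.put(None)  # one sentinel per worker
    queue.join()  # blocks until every item and sentinel is acknowledged
    for w in workers:
        w.join()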