def aggregate(samples, ref_gtf_file, gtf_expr_attr, tmp_dir, output_gtf_file,
              stats_file):
    '''
    Aggregate/merge individual sample GTF files

    samples: list of Sample objects to merge
    ref_gtf_file: optional reference GTF merged under Sample.REF_ID
    gtf_expr_attr: GTF attribute name holding the expression value
    tmp_dir: directory for the intermediate unsorted GTF
    output_gtf_file: final sorted merged GTF
    stats_file: per-sample statistics output

    Raises TacoError if the final sort fails (the partial output file
    is removed first).
    '''
    # all transfrags are appended to one unsorted temporary GTF, then sorted
    tmp_file = os.path.join(tmp_dir, 'transcripts.unsorted.gtf')
    # aggregate ref gtf
    if ref_gtf_file is not None:
        # lazy %-args instead of eager string formatting (logging best practice)
        logging.debug('Reference: %s', ref_gtf_file)
        caggregate(ref_gtf_file, str(Sample.REF_ID), gtf_expr_attr,
                   tmp_file, stats_file, str(True))
    # aggregate sample gtfs
    for sample in samples:
        logging.debug('Sample: %s %s', sample._id, sample.gtf_file)
        caggregate(sample.gtf_file, str(sample._id), gtf_expr_attr,
                   tmp_file, stats_file, str(False))
    # sort merged gtf
    logging.info("Sorting GTF")
    retcode = sort_gtf(tmp_file, output_gtf_file, tmp_dir=tmp_dir)
    if retcode != 0:
        logging.error("Error sorting GTF")
        # do not leave a partial/corrupt output file behind
        if os.path.exists(output_gtf_file):
            os.remove(output_gtf_file)
        raise TacoError('Error sorting GTF')
    os.remove(tmp_file)
def get_expr_data(self, start, end, strand):
    """Return the per-base expression slice for [start, end) on `strand`.

    `start`/`end` are genomic coordinates; they are converted to
    locus-relative offsets before slicing `self.expr_data`. Raises
    TacoError when the query falls outside the locus bounds.
    """
    if start < self.start or end > self.end:
        raise TacoError('query %d-%d outside locus bounds %d-%d' %
                        (start, end, self.start, self.end))
    offset_start = start - self.start
    offset_end = end - self.start
    return self.expr_data[strand, offset_start:offset_end]
def _add_transfrag(self, t): if t.chrom != self.chrom: raise TacoError('chrom mismatch') if t.start < self.start: raise TacoError('transfrag start < locus start') if t.end > self.end: raise TacoError('transfrag end < locus end') self.strands.add(t.strand) self.strand_transfrags[t.strand].append(t) for exon in t.exons: astart = exon.start - self.start aend = exon.end - self.start if t.is_ref and self.guided_strand: self.strand_data[t.strand, astart:aend] = True else: self.expr_data[t.strand, astart:aend] += t.expr self.strand_data[t.strand, astart:aend] = True
def get_expr_data(self, start=None, end=None):
    """Return expression data over [start, end); defaults to the full locus.

    Genomic coordinates are converted to locus-relative offsets before
    slicing. Raises TacoError when the requested interval exceeds the
    locus bounds.
    """
    start = self.start if start is None else start
    end = self.end if end is None else end
    if start < self.start or end > self.end:
        raise TacoError('query %d-%d outside locus bounds %d-%d' %
                        (start, end, self.start, self.end))
    return self.expr_data[start - self.start:end - self.start]
def _add_transfrags(self, transfrags):
    """Initialize locus state from an iterable of transfrags.

    All transfrags must share one chromosome and one strand (TacoError
    otherwise). Locus start/end grow to span every transfrag. Reference
    transfrags contribute their tx start/stop sites and are kept apart
    in `ref_transfrags`; non-reference transfrags are indexed in the
    interval trees and collected in `transfrags`.
    """
    self.transfrags = []
    self.ref_transfrags = []
    self.chrom = None
    self.start = None
    self.end = None
    self.strand = None
    self.ref_start_sites = set()
    self.ref_stop_sites = set()
    self.tree_starts = IntervalTree()
    self.tree_ends = IntervalTree()
    for t in transfrags:
        # every transfrag must agree on chromosome ...
        if self.chrom is None:
            self.chrom = t.chrom
        elif self.chrom != t.chrom:
            raise TacoError('chrom mismatch')
        # ... and on strand
        if self.strand is None:
            self.strand = t.strand
        elif self.strand != t.strand:
            raise TacoError('strand mismatch')
        # grow the locus bounds to cover this transfrag
        self.start = t.start if self.start is None else min(t.start, self.start)
        self.end = t.end if self.end is None else max(t.end, self.end)
        if t.is_ref:
            self.ref_start_sites.add(t.txstart)
            self.ref_stop_sites.add(t.txstop)
            self.ref_transfrags.append(t)
        else:
            self._add_to_interval_trees(t)
            self.transfrags.append(t)
    # convert the site sets to sorted lists
    self.ref_start_sites = sorted(self.ref_start_sites)
    self.ref_stop_sites = sorted(self.ref_stop_sites)
def aggregate(samples, ref_gtf_file, gtf_expr_attr, tmp_dir, output_gtf_file,
              stats_file):
    '''
    Aggregate/merge individual sample GTF files

    samples: list of Sample objects to merge
    ref_gtf_file: optional reference GTF merged under Sample.REF_ID
    gtf_expr_attr: GTF attribute name holding the expression value
    tmp_dir: directory for the intermediate unsorted GTF
    output_gtf_file: final sorted merged GTF
    stats_file: per-sample statistics output (tab-delimited, with header)

    Raises TacoError if the final sort fails (the partial output file
    is removed first).
    '''
    tmp_file = os.path.join(tmp_dir, 'transcripts.unsorted.gtf')
    # 'with' guarantees both handles are closed even if aggregation fails
    # (original leaked them on error); write() replaces the Python-2-only
    # 'print >>' statement with version-neutral, behaviorally identical code
    with open(tmp_file, 'w') as tmp_fileh, open(stats_file, 'w') as stats_fileh:
        # stats file has header
        fields = ['sample_id', 'num_transfrags', 'expr_quantiles',
                  'length_quantiles', 'num_exon_quantiles']
        stats_fileh.write('\t'.join(fields) + '\n')
        # aggregate ref gtf
        if ref_gtf_file is not None:
            sample = Sample(ref_gtf_file, Sample.REF_ID)
            sample._id = Sample.REF_ID
            logging.debug('Reference: %s', ref_gtf_file)
            _aggregate_gtf(sample.gtf_file, sample._id, gtf_expr_attr,
                           tmp_fileh, stats_fileh, is_ref=True)
        # aggregate sample gtfs
        for sample in samples:
            logging.debug('Sample: %s %s', sample._id, sample.gtf_file)
            _aggregate_gtf(sample.gtf_file, sample._id, gtf_expr_attr,
                           tmp_fileh, stats_fileh)
    # sort merged gtf
    logging.info("Sorting GTF")
    retcode = sort_gtf(tmp_file, output_gtf_file, tmp_dir=tmp_dir)
    if retcode != 0:
        logging.error("Error sorting GTF")
        # do not leave a partial/corrupt output file behind
        if os.path.exists(output_gtf_file):
            os.remove(output_gtf_file)
        raise TacoError('Error sorting GTF')
    os.remove(tmp_file)
def _find_boundaries(transfrags): chrom = None start = None end = None strands = set() for t in transfrags: if chrom is None: chrom = t.chrom elif chrom != t.chrom: raise TacoError('chrom mismatch') strands.add(t.strand) if start is None: start = t.start else: start = min(t.start, start) if end is None: end = t.end else: end = max(t.end, end) return chrom, start, end, strands
def aggregate_parallel(samples, args, results):
    '''
    Process and aggregate GTF input files

    samples: list of Sample objects
    args: from Argparse module. command-line arguments to configure the
    assembly process
    results: Results object containing input and output filenames
    '''
    logging.info('Aggregating in parallel using %d processes' %
                 (args.num_processes))
    if args.filter_splice_juncs and args.ref_genome_fasta_file:
        # test opening FastaFile (forces the index to be built up front)
        logging.info('Indexing reference genome fasta file (if necessary)')
        fasta_fh = FastaFile(args.ref_genome_fasta_file)
        fasta_fh.close()
    # bounded queue so producers cannot race far ahead of the workers
    input_queue = JoinableQueue(maxsize=args.num_processes * 2)
    # start one worker process per requested process, each with its own
    # output directory / Results bundle
    workers = []
    worker_results = []
    for worker_num in xrange(args.num_processes):
        wdir = os.path.join(results.tmp_dir,
                            'aggregate_worker%03d' % worker_num)
        if not os.path.exists(wdir):
            os.makedirs(wdir)
        worker_results.append(Results(wdir))
        proc = Process(target=aggregate_worker,
                       args=(input_queue, args, wdir))
        proc.start()
        workers.append(proc)
    # enqueue the reference gtf first, if any
    if args.ref_gtf_file is not None:
        logging.debug('Reference: %s' % args.ref_gtf_file)
        input_queue.put(Sample(args.ref_gtf_file, Sample.REF_ID))
    # enqueue all samples
    for sample in samples:
        input_queue.put(sample)
    # one None sentinel per worker tells it to stop
    for proc in workers:
        input_queue.put(None)
    # wait until every queued item has been processed, then close
    input_queue.join()
    input_queue.close()
    # join worker processes
    for proc in workers:
        proc.join()
    # merge the per-worker output files into the final results
    logging.info('Merging aggregated files')
    logging.debug('\tmerging bed files')
    retcode = merge_bed(
        input_files=[r.transfrags_bed_file for r in worker_results],
        output_file=results.transfrags_bed_file,
        num_processes=args.num_processes,
        tmp_dir=results.tmp_dir)
    if retcode != 0:
        raise TacoError('Error running linux merge')
    logging.debug('\tmerging filtered bed files')
    retcode = merge_bed(
        input_files=[r.transfrags_filtered_bed_file for r in worker_results],
        output_file=results.transfrags_filtered_bed_file,
        num_processes=args.num_processes,
        tmp_dir=results.tmp_dir)
    if retcode != 0:
        raise TacoError('Error running linux merge')
    logging.debug('\tmerging sample stats')

    def sort_key_field0(line):
        # merge-sort stats lines by the first tab-separated field (sample_id)
        return line.split('\t', 1)[0]

    stats_header = '\t'.join(['sample_id', 'num_transfrags',
                              'filtered_length', 'filtered_expr',
                              'filtered_splice\n'])
    merge_files(input_files=[r.sample_stats_file for r in worker_results],
                output_file=results.sample_stats_file,
                key=sort_key_field0,
                header=stats_header)
    # cleanup worker data
    logging.info('Removing temporary files')

    def shutil_error_callback(func, path, excinfo):
        # log but do not abort when a tmp file cannot be removed
        logging.error('Error removing tmp files path=%s message=%s' %
                      (path, excinfo))

    for r in worker_results:
        shutil.rmtree(r.output_dir, onerror=shutil_error_callback)
    logging.info('Aggregate done')
    return 0
def aggregate_worker(input_queue, args, output_dir):
    """Worker process: aggregate Sample objects pulled from `input_queue`.

    Consumes samples until a None sentinel arrives, writing unsorted
    per-transfrag BED output and per-sample stats under `output_dir`,
    then sorts both BED files and removes the temporary directory.
    Raises TacoError if either sort fails.
    """
    results = Results(output_dir)
    # create temp directory for the unsorted intermediate files
    tmp_dir = os.path.join(results.output_dir, 'tmp')
    if not os.path.exists(tmp_dir):
        logging.debug('\tcreating tmp dir %s' % (tmp_dir))
        os.makedirs(tmp_dir)
    # create set of unsorted results
    tmp_results = Results(tmp_dir)
    # setup genome fasta file (only needed for splice junction filtering)
    genome_fasta_fh = None
    if args.filter_splice_juncs and args.ref_genome_fasta_file:
        genome_fasta_fh = FastaFile(args.ref_genome_fasta_file)
    # setup output files
    bed_fh = open(tmp_results.transfrags_bed_file, 'w')
    filtered_bed_fh = open(tmp_results.transfrags_filtered_bed_file, 'w')
    stats_fh = open(results.sample_stats_file, 'w')
    # process samples via input queue until the None sentinel
    while True:
        sample = input_queue.get()
        if sample is None:
            break
        aggregate_sample(sample,
                         gtf_expr_attr=args.gtf_expr_attr,
                         is_ref=(sample._id == Sample.REF_ID),
                         min_length=args.filter_min_length,
                         min_expr=args.filter_min_expr,
                         filter_splice_juncs=args.filter_splice_juncs,
                         add_splice_motif=args.add_splice_motif,
                         genome_fasta_fh=genome_fasta_fh,
                         bed_fh=bed_fh,
                         filtered_bed_fh=filtered_bed_fh,
                         stats_fh=stats_fh)
        input_queue.task_done()
    # acknowledge the None sentinel so the parent's queue.join() returns
    input_queue.task_done()
    # cleanup and close files
    bed_fh.close()
    filtered_bed_fh.close()
    stats_fh.close()
    if genome_fasta_fh:
        genome_fasta_fh.close()
    # sort output files
    logging.debug('Sorting aggregated files: "%s"' % (output_dir))
    # sort bed file
    logging.debug('\ttransfrags bed file: %s' % (results.output_dir))
    retcode = sort_bed(tmp_results.transfrags_bed_file,
                       results.transfrags_bed_file,
                       num_processes=1,
                       tmp_dir=tmp_dir)
    if retcode != 0:
        raise TacoError('Error running linux sort')
    os.remove(tmp_results.transfrags_bed_file)
    # sort filtered bed file
    logging.debug('\tfiltered transfrags bed file: %s' % (results.output_dir))
    retcode = sort_bed(tmp_results.transfrags_filtered_bed_file,
                       results.transfrags_filtered_bed_file,
                       num_processes=1,
                       tmp_dir=tmp_dir)
    # bug fix: check the sort return code BEFORE deleting the unsorted
    # temp file; the original removed it first, so a failed sort destroyed
    # the only copy of the data (and matches the first sort's pattern)
    if retcode != 0:
        raise TacoError('Error running linux sort')
    os.remove(tmp_results.transfrags_filtered_bed_file)
    # remove temporary directories
    logging.debug('\t%s cleaning up' % (results.output_dir))
    os.rmdir(tmp_dir)