Example 1
def aggregate(samples, ref_gtf_file, gtf_expr_attr, tmp_dir, output_gtf_file,
              stats_file):
    '''
    Aggregate/merge individual sample GTF files
    '''
    # setup output files
    tmp_file = os.path.join(tmp_dir, 'transcripts.unsorted.gtf')

    # aggregate ref gtf
    if ref_gtf_file is not None:
        logging.debug('Reference: %s' % ref_gtf_file)
        caggregate(ref_gtf_file, str(Sample.REF_ID), gtf_expr_attr, tmp_file,
                   stats_file, str(True))

    # aggregate sample gtfs
    for sample in samples:
        logging.debug('Sample: %s %s' % (sample._id, sample.gtf_file))
        caggregate(sample.gtf_file, str(sample._id), gtf_expr_attr, tmp_file,
                   stats_file, str(False))

    # sort merged gtf
    logging.info("Sorting GTF")
    retcode = sort_gtf(tmp_file, output_gtf_file, tmp_dir=tmp_dir)
    if retcode != 0:
        logging.error("Error sorting GTF")
        if os.path.exists(output_gtf_file):
            os.remove(output_gtf_file)
        raise TacoError('Error sorting GTF')
    os.remove(tmp_file)
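
This snippet depends on a sort_gtf helper defined elsewhere in TACO. A minimal sketch of such a helper, assuming GNU sort is available; the name and signature mirror the call above, but the body is illustrative rather than TACO's actual code:

import subprocess


def sort_gtf(input_file, output_file, tmp_dir=None):
    # sort a GTF by chromosome (column 1) and start coordinate (column 4);
    # return the subprocess exit code, as the caller above expects
    args = ['sort', '-k1,1', '-k4,4n']
    if tmp_dir is not None:
        args.extend(['-T', tmp_dir])
    args.append(input_file)
    with open(output_file, 'w') as fh:
        return subprocess.call(args, stdout=fh)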
Example 2
def get_expr_data(self, start, end, strand):
    if start < self.start or end > self.end:
        m = ('query %d-%d outside locus bounds %d-%d' %
             (start, end, self.start, self.end))
        raise TacoError(m)
    astart = start - self.start
    aend = end - self.start
    return self.expr_data[strand, astart:aend]
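
get_expr_data translates genomic coordinates into offsets relative to the locus start before slicing. A toy illustration of that arithmetic, assuming expr_data is a NumPy array with one row per strand (shapes and values are invented for demonstration):

import numpy as np

locus_start, locus_end = 1000, 1010
expr_data = np.zeros((3, locus_end - locus_start))  # one row per strand
expr_data[0, 2:5] = 7.0

# a query for 1002-1005 on strand 0 maps to array columns 2:5
start, end, strand = 1002, 1005, 0
astart = start - locus_start
aend = end - locus_start
assert list(expr_data[strand, astart:aend]) == [7.0, 7.0, 7.0]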
Example 3
def _add_transfrag(self, t):
    if t.chrom != self.chrom:
        raise TacoError('chrom mismatch')
    if t.start < self.start:
        raise TacoError('transfrag start < locus start')
    if t.end > self.end:
        raise TacoError('transfrag end > locus end')
    self.strands.add(t.strand)
    self.strand_transfrags[t.strand].append(t)
    for exon in t.exons:
        astart = exon.start - self.start
        aend = exon.end - self.start
        if t.is_ref and self.guided_strand:
            self.strand_data[t.strand, astart:aend] = True
        else:
            self.expr_data[t.strand, astart:aend] += t.expr
            self.strand_data[t.strand, astart:aend] = True
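
_add_transfrag accumulates per-base expression and marks covered bases with a boolean mask, so overlapping exons from different transfrags simply add up. A toy version of the accumulation step, again assuming NumPy arrays shaped (strands, locus length) with invented values:

import numpy as np

locus_start = 100
expr_data = np.zeros((3, 50))
strand_data = np.zeros((3, 50), dtype=bool)

# an exon at 110-120 on strand 0 with expression 2.5
astart, aend = 110 - locus_start, 120 - locus_start
expr_data[0, astart:aend] += 2.5
strand_data[0, astart:aend] = True
# a second overlapping exon adds its expression on top
expr_data[0, astart:aend] += 1.5  # bases 10..19 now hold 4.0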
Example 4
def get_expr_data(self, start=None, end=None):
    if start is None:
        start = self.start
    if end is None:
        end = self.end
    if start < self.start or end > self.end:
        m = ('query %d-%d outside locus bounds %d-%d' %
             (start, end, self.start, self.end))
        raise TacoError(m)
    astart = start - self.start
    aend = end - self.start
    return self.expr_data[astart:aend]
Example 5
def _add_transfrags(self, transfrags):
    self.transfrags = []
    self.ref_transfrags = []
    self.chrom = None
    self.start = None
    self.end = None
    self.strand = None
    self.ref_start_sites = set()
    self.ref_stop_sites = set()
    self.tree_starts = IntervalTree()
    self.tree_ends = IntervalTree()
    for t in transfrags:
        if self.chrom is None:
            self.chrom = t.chrom
        elif self.chrom != t.chrom:
            raise TacoError('chrom mismatch')
        if self.strand is None:
            self.strand = t.strand
        elif self.strand != t.strand:
            raise TacoError('strand mismatch')
        if self.start is None:
            self.start = t.start
        else:
            self.start = min(t.start, self.start)
        if self.end is None:
            self.end = t.end
        else:
            self.end = max(t.end, self.end)
        if t.is_ref:
            self.ref_start_sites.add(t.txstart)
            self.ref_stop_sites.add(t.txstop)
            self.ref_transfrags.append(t)
        else:
            self._add_to_interval_trees(t)
            self.transfrags.append(t)
    self.ref_start_sites = sorted(self.ref_start_sites)
    self.ref_stop_sites = sorted(self.ref_stop_sites)
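
The _add_to_interval_trees helper called here is not shown. One plausible sketch, assuming the intervaltree package and that tree_starts/tree_ends index transfrags by their boundary coordinates; the indexing scheme is an assumption, not TACO's confirmed implementation:

from intervaltree import IntervalTree

def _add_to_interval_trees(self, t):
    # hypothetical: index each transfrag by its start and end so that
    # boundary lookups over a genomic window are efficient
    self.tree_starts.addi(t.start, t.start + 1, t)
    self.tree_ends.addi(t.end - 1, t.end, t)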
Example 6
def aggregate(samples, ref_gtf_file, gtf_expr_attr, tmp_dir, output_gtf_file,
              stats_file):
    '''
    Aggregate/merge individual sample GTF files
    '''
    # setup output files
    tmp_file = os.path.join(tmp_dir, 'transcripts.unsorted.gtf')
    tmp_fileh = open(tmp_file, 'w')
    stats_fileh = open(stats_file, 'w')
    # stats file has header
    fields = [
        'sample_id', 'num_transfrags', 'expr_quantiles', 'length_quantiles',
        'num_exon_quantiles'
    ]
    stats_fileh.write('\t'.join(fields) + '\n')
    # aggregate ref gtf
    if ref_gtf_file is not None:
        sample = Sample(ref_gtf_file, Sample.REF_ID)
        sample._id = Sample.REF_ID
        logging.debug('Reference: %s' % ref_gtf_file)
        _aggregate_gtf(sample.gtf_file,
                       sample._id,
                       gtf_expr_attr,
                       tmp_fileh,
                       stats_fileh,
                       is_ref=True)

    # aggregate sample gtfs
    for sample in samples:
        logging.debug('Sample: %s %s' % (sample._id, sample.gtf_file))
        _aggregate_gtf(sample.gtf_file, sample._id, gtf_expr_attr, tmp_fileh,
                       stats_fileh)
    tmp_fileh.close()
    stats_fileh.close()

    # sort merged gtf
    logging.info("Sorting GTF")
    retcode = sort_gtf(tmp_file, output_gtf_file, tmp_dir=tmp_dir)
    if retcode != 0:
        logging.error("Error sorting GTF")
        if os.path.exists(output_gtf_file):
            os.remove(output_gtf_file)
        raise TacoError('Error sorting GTF')
    os.remove(tmp_file)
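
The stats header suggests that _aggregate_gtf summarizes each sample with quantiles of expression, transcript length, and exon count. A hypothetical helper for formatting such a column (quantile_string and its output format are assumptions, not TACO's actual code):

import numpy as np

def quantile_string(values, q=(0, 25, 50, 75, 100)):
    # summarize a distribution as comma-separated quantiles
    return ','.join('%g' % x for x in np.percentile(values, q))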
Example 7
def _find_boundaries(transfrags):
    chrom = None
    start = None
    end = None
    strands = set()
    for t in transfrags:
        if chrom is None:
            chrom = t.chrom
        elif chrom != t.chrom:
            raise TacoError('chrom mismatch')
        strands.add(t.strand)
        if start is None:
            start = t.start
        else:
            start = min(t.start, start)
        if end is None:
            end = t.end
        else:
            end = max(t.end, end)
    return chrom, start, end, strands
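
A quick usage sketch with stand-in transfrag objects; the namedtuple is illustrative, as real TACO transfrags carry many more attributes:

from collections import namedtuple

Transfrag = namedtuple('Transfrag', ['chrom', 'start', 'end', 'strand'])
transfrags = [
    Transfrag('chr1', 100, 500, '+'),
    Transfrag('chr1', 300, 900, '+'),
    Transfrag('chr1', 50, 400, '-'),
]
chrom, start, end, strands = _find_boundaries(transfrags)
# chrom == 'chr1', start == 50, end == 900, strands == {'+', '-'}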
Example 8
def aggregate_parallel(samples, args, results):
    '''
    Process and aggregate GTF input files

    samples: list of Sample objects
    args: from Argparse module. command-line arguments to configure the
          assembly process
    results: Results object containing input and output filenames
    '''
    logging.info('Aggregating in parallel using %d processes' %
                 (args.num_processes))

    if args.filter_splice_juncs and args.ref_genome_fasta_file:
        # test opening FastaFile
        logging.info('Indexing reference genome fasta file (if necessary)')
        fasta_fh = FastaFile(args.ref_genome_fasta_file)
        fasta_fh.close()

    # create queue
    input_queue = JoinableQueue(maxsize=args.num_processes * 2)
    # start worker processes
    procs = []
    worker_results = []
    for i in range(args.num_processes):
        worker_id = 'aggregate_worker%03d' % i
        worker_dir = os.path.join(results.tmp_dir, worker_id)
        if not os.path.exists(worker_dir):
            os.makedirs(worker_dir)
        worker_results.append(Results(worker_dir))
        p = Process(target=aggregate_worker,
                    args=(input_queue, args, worker_dir))
        p.start()
        procs.append(p)

    # reference gtf
    if args.ref_gtf_file is not None:
        logging.debug('Reference: %s' % args.ref_gtf_file)
        input_queue.put(Sample(args.ref_gtf_file, Sample.REF_ID))
    # parse samples
    for sample in samples:
        input_queue.put(sample)
    for p in procs:
        input_queue.put(None)
    # close input queue
    input_queue.join()
    input_queue.close()
    # join worker processes
    for p in procs:
        p.join()

    # merge output files
    logging.info('Merging aggregated files')
    logging.debug('\tmerging bed files')
    retcode = merge_bed(
        input_files=[r.transfrags_bed_file for r in worker_results],
        output_file=results.transfrags_bed_file,
        num_processes=args.num_processes,
        tmp_dir=results.tmp_dir)
    if retcode != 0:
        raise TacoError('Error running linux merge')

    logging.debug('\tmerging filtered bed files')
    retcode = merge_bed(
        input_files=[r.transfrags_filtered_bed_file for r in worker_results],
        output_file=results.transfrags_filtered_bed_file,
        num_processes=args.num_processes,
        tmp_dir=results.tmp_dir)
    if retcode != 0:
        raise TacoError('Error running linux merge')

    logging.debug('\tmerging sample stats')

    def sort_key_field0(line):
        fields = line.split('\t', 1)
        return fields[0]

    stats_header = '\t'.join([
        'sample_id', 'num_transfrags', 'filtered_length', 'filtered_expr',
        'filtered_splice'
    ]) + '\n'
    merge_files(input_files=[r.sample_stats_file for r in worker_results],
                output_file=results.sample_stats_file,
                key=sort_key_field0,
                header=stats_header)
    # cleanup worker data
    logging.info('Removing temporary files')

    def shutil_error_callback(func, path, excinfo):
        logging.error('Error removing tmp files path=%s message=%s' %
                      (path, excinfo))

    for r in worker_results:
        shutil.rmtree(r.output_dir, onerror=shutil_error_callback)
    logging.info('Aggregate done')
    return 0
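
The function follows the standard multiprocessing producer/consumer pattern: one None sentinel is enqueued per worker so that every process drains the queue and exits cleanly. A minimal self-contained sketch of the same pattern, with generic names rather than TACO's:

from multiprocessing import JoinableQueue, Process

def worker(queue):
    while True:
        item = queue.get()
        if item is None:       # sentinel: no more work
            queue.task_done()
            break
        # ... process item ...
        queue.task_done()

if __name__ == '__main__':
    queue = JoinableQueue(maxsize=8)
    procs = [Process(target=worker, args=(queue,)) for _ in range(4)]
    for p in procs:
        p.start()
    for item in ['a', 'b', 'c']:
        queue.put(item)
    for _ in procs:
        queue.put(None)        # one sentinel per worker
    queue.join()               # blocks until every task_done() is called
    for p in procs:
        p.join()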
Example 9
def aggregate_worker(input_queue, args, output_dir):
    results = Results(output_dir)
    # create temp directories
    tmp_dir = os.path.join(results.output_dir, 'tmp')
    if not os.path.exists(tmp_dir):
        logging.debug('\tcreating tmp dir %s' % (tmp_dir))
        os.makedirs(tmp_dir)
    # create set of unsorted results
    tmp_results = Results(tmp_dir)
    # setup genome fasta file
    genome_fasta_fh = None
    if args.filter_splice_juncs and args.ref_genome_fasta_file:
        genome_fasta_fh = FastaFile(args.ref_genome_fasta_file)
    # setup output files
    bed_fh = open(tmp_results.transfrags_bed_file, 'w')
    filtered_bed_fh = open(tmp_results.transfrags_filtered_bed_file, 'w')
    stats_fh = open(results.sample_stats_file, 'w')
    # process samples via input queue
    while True:
        sample = input_queue.get()
        if sample is None:
            break
        aggregate_sample(sample,
                         gtf_expr_attr=args.gtf_expr_attr,
                         is_ref=(sample._id == Sample.REF_ID),
                         min_length=args.filter_min_length,
                         min_expr=args.filter_min_expr,
                         filter_splice_juncs=args.filter_splice_juncs,
                         add_splice_motif=args.add_splice_motif,
                         genome_fasta_fh=genome_fasta_fh,
                         bed_fh=bed_fh,
                         filtered_bed_fh=filtered_bed_fh,
                         stats_fh=stats_fh)
        input_queue.task_done()
    input_queue.task_done()  # account for the final None sentinel
    # cleanup and close files
    bed_fh.close()
    filtered_bed_fh.close()
    stats_fh.close()
    if genome_fasta_fh:
        genome_fasta_fh.close()

    # sort output files
    logging.debug('Sorting aggregated files: "%s"' % (output_dir))
    # sort bed file
    logging.debug('\ttransfrags bed file: %s' % (results.output_dir))
    retcode = sort_bed(tmp_results.transfrags_bed_file,
                       results.transfrags_bed_file,
                       num_processes=1,
                       tmp_dir=tmp_dir)
    if retcode != 0:
        raise TacoError('Error running linux sort')
    os.remove(tmp_results.transfrags_bed_file)
    # sort filtered bed file
    logging.debug('\tfiltered transfrags bed file: %s' % (results.output_dir))
    retcode = sort_bed(tmp_results.transfrags_filtered_bed_file,
                       results.transfrags_filtered_bed_file,
                       num_processes=1,
                       tmp_dir=tmp_dir)
    if retcode != 0:
        raise TacoError('Error running linux sort')
    os.remove(tmp_results.transfrags_filtered_bed_file)
    # remove temporary directories
    logging.debug('\t%s cleaning up' % (results.output_dir))
    os.rmdir(tmp_dir)
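
Like sort_gtf in Example 1, the sort_bed helper used here is defined elsewhere. A minimal sketch under the same assumptions (GNU sort; mapping the num_processes argument to sort's --parallel flag is a guess at the intent, not TACO's confirmed behavior):

import subprocess

def sort_bed(input_file, output_file, num_processes=1, tmp_dir=None):
    # BED files sort by chromosome (column 1) and start coordinate (column 2)
    args = ['sort', '-k1,1', '-k2,2n']
    if tmp_dir is not None:
        args.extend(['-T', tmp_dir])
    if num_processes > 1:
        args.append('--parallel=%d' % num_processes)
    args.append(input_file)
    with open(output_file, 'w') as fh:
        return subprocess.call(args, stdout=fh)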