Exemple #1
0
def main():
    '''
    Profile the splice motifs of the introns found in a BED file

    Command-line arguments:
    genome_fasta_file: path to the genome fasta file
    bed_file: path to the BED file of transcripts

    Collects the unique (chrom, start, end, strand) introns of every BED
    record whose chromosome is present in the fasta file, builds a motif
    for each intron from the two bases fetched at its start plus the two
    bases fetched at its end (passed through dna_reverse_complement for
    negative strand introns), and prints a tab-delimited table of
    motif, count and fraction to standard output, most common first.
    '''
    logging.basicConfig(level=logging.DEBUG)
    parser = argparse.ArgumentParser()
    parser.add_argument('genome_fasta_file')
    parser.add_argument('bed_file')
    args = parser.parse_args()

    # check args
    if not os.path.exists(args.genome_fasta_file):
        parser.error('genome fasta file %s not found' % args.genome_fasta_file)
    if not os.path.exists(args.bed_file):
        parser.error('bed file %s not found' % args.bed_file)
    logging.info('genome fasta file: %s', args.genome_fasta_file)
    logging.info('bed file: %s', args.bed_file)

    # process bed file to get junctions
    logging.info('Reading Junctions')
    splice_juncs = set()
    motif_counter = Counter()
    fasta_fh = FastaFile(args.genome_fasta_file)
    # try/finally guarantees the fasta handle is released even when parsing
    # the BED file or fetching sequence raises
    try:
        with open(args.bed_file) as bed_fh:
            for line in bed_fh:
                t = Transfrag.from_bed(line)
                # skip records on chromosomes absent from the fasta file
                if t.chrom not in fasta_fh:
                    continue
                for start, end in t.iterintrons():
                    # the set collapses introns shared by several records
                    splice_juncs.add((t.chrom, start, end, t.strand))
        logging.info('Read %d Junctions', len(splice_juncs))

        logging.info('Profiling Splice Motifs')
        for chrom, start, end, strand in splice_juncs:
            # two bases at the intron start followed by two at the intron end
            s = fasta_fh.fetch(chrom, start, start + 2)
            s += fasta_fh.fetch(chrom, end - 2, end)
            if strand == Strand.NEG:
                s = dna_reverse_complement(s)
            motif_counter[s] += 1
    finally:
        fasta_fh.close()

    # report statistics
    total = sum(motif_counter.values())
    # single-argument print(...) behaves the same as a statement (Python 2)
    # and as a function call (Python 3)
    print('\t'.join(['motif', 'count', 'frac']))
    for motif, count in motif_counter.most_common():
        print('\t'.join([motif, str(count), str(float(count) / total)]))
    logging.info('Done')
Exemple #2
0
def aggregate_parallel(samples, args, results):
    '''
    Process and aggregate GTF input files using worker processes

    samples: list of Sample objects
    args: from Argparse module. command-line arguments to configure the
          assembly process
    results: Results object containing input and output filenames

    Starts args.num_processes aggregate_worker processes, each writing to
    its own directory under results.tmp_dir, feeds them the optional
    reference GTF followed by the samples, merges the per-worker output
    into the files named by 'results', and removes the worker directories.

    Returns 0 on success. Raises TacoError when a worker process exits
    with a nonzero exit code or when merging the bed files fails.
    '''
    logging.info('Aggregating in parallel using %d processes',
                 args.num_processes)

    if args.filter_splice_juncs and args.ref_genome_fasta_file:
        # test opening FastaFile in the parent before any worker opens it
        logging.info('Indexing reference genome fasta file (if necessary)')
        fasta_fh = FastaFile(args.ref_genome_fasta_file)
        fasta_fh.close()

    # create bounded queue so put() blocks while the workers are busy
    input_queue = JoinableQueue(maxsize=args.num_processes * 2)
    # start worker processes
    procs = []
    worker_ids = []
    worker_results = []
    for i in range(args.num_processes):
        worker_id = 'aggregate_worker%03d' % i
        worker_dir = os.path.join(results.tmp_dir, worker_id)
        if not os.path.exists(worker_dir):
            os.makedirs(worker_dir)
        worker_ids.append(worker_id)
        worker_results.append(Results(worker_dir))
        p = Process(target=aggregate_worker,
                    args=(input_queue, args, worker_dir))
        p.start()
        procs.append(p)

    # reference gtf
    if args.ref_gtf_file is not None:
        logging.debug('Reference: %s', args.ref_gtf_file)
        input_queue.put(Sample(args.ref_gtf_file, Sample.REF_ID))
    # parse samples
    for sample in samples:
        input_queue.put(sample)
    # one None sentinel per worker tells it to stop reading the queue
    for p in procs:
        input_queue.put(None)
    # close input queue
    input_queue.join()
    input_queue.close()
    # join worker processes
    for p in procs:
        p.join()
    # a worker that raised (e.g. while sorting its output) exits nonzero;
    # merging its incomplete files would silently produce bad results
    failed = [worker_id for worker_id, p in zip(worker_ids, procs)
              if p.exitcode != 0]
    if failed:
        raise TacoError('Aggregate worker process failed: %s' %
                        ', '.join(failed))

    # merge output files
    logging.info('Merging aggregated files')
    logging.debug('\tmerging bed files')
    retcode = merge_bed(
        input_files=[r.transfrags_bed_file for r in worker_results],
        output_file=results.transfrags_bed_file,
        num_processes=args.num_processes,
        tmp_dir=results.tmp_dir)
    if retcode != 0:
        raise TacoError('Error running linux merge')

    logging.debug('\tmerging filtered bed files')
    retcode = merge_bed(
        input_files=[r.transfrags_filtered_bed_file for r in worker_results],
        output_file=results.transfrags_filtered_bed_file,
        num_processes=args.num_processes,
        tmp_dir=results.tmp_dir)
    if retcode != 0:
        raise TacoError('Error running linux merge')

    logging.debug('\tmerging sample stats')

    def sort_key_field0(line):
        # key is the first tab-delimited field of the line
        return line.split('\t', 1)[0]

    # the trailing newline on the last column terminates the header line
    stats_header = '\t'.join(['sample_id', 'num_transfrags',
                              'filtered_length', 'filtered_expr',
                              'filtered_splice\n'])
    merge_files(input_files=[r.sample_stats_file for r in worker_results],
                output_file=results.sample_stats_file,
                key=sort_key_field0,
                header=stats_header)

    # cleanup worker data
    logging.info('Removing temporary files')

    def shutil_error_callback(func, path, excinfo):
        # log and keep going so one undeletable path does not abort cleanup
        logging.error('Error removing tmp files path=%s message=%s',
                      path, excinfo)

    for r in worker_results:
        shutil.rmtree(r.output_dir, onerror=shutil_error_callback)
    logging.info('Aggregate done')
    return 0
Exemple #3
0
def aggregate_worker(input_queue, args, output_dir):
    '''
    Worker process: aggregate samples taken from a queue, then sort output

    input_queue: joinable queue yielding Sample objects; a None item tells
                 the worker to stop reading
    args: from Argparse module. command-line arguments to configure the
          aggregation (filter thresholds, gtf expression attribute,
          reference genome fasta file)
    output_dir: directory that receives this worker's output files

    Unsorted bed output is written to a 'tmp' directory under output_dir,
    sorted into output_dir with sort_bed, and the 'tmp' directory is then
    removed. Sample stats are written directly into output_dir.

    Raises TacoError if sort_bed returns a nonzero code.
    '''
    results = Results(output_dir)
    # create temp directories
    tmp_dir = os.path.join(results.output_dir, 'tmp')
    if not os.path.exists(tmp_dir):
        logging.debug('\tcreating tmp dir %s', tmp_dir)
        os.makedirs(tmp_dir)
    # create set of unsorted results
    tmp_results = Results(tmp_dir)
    # setup genome fasta file
    genome_fasta_fh = None
    if args.filter_splice_juncs and args.ref_genome_fasta_file:
        genome_fasta_fh = FastaFile(args.ref_genome_fasta_file)
    # try/finally and 'with' release every handle even when
    # aggregate_sample raises
    try:
        with open(tmp_results.transfrags_bed_file, 'w') as bed_fh, \
                open(tmp_results.transfrags_filtered_bed_file, 'w') \
                as filtered_bed_fh, \
                open(results.sample_stats_file, 'w') as stats_fh:
            # process samples via input queue
            while True:
                sample = input_queue.get()
                if sample is None:
                    break
                aggregate_sample(sample,
                                 gtf_expr_attr=args.gtf_expr_attr,
                                 is_ref=(sample._id == Sample.REF_ID),
                                 min_length=args.filter_min_length,
                                 min_expr=args.filter_min_expr,
                                 filter_splice_juncs=args.filter_splice_juncs,
                                 add_splice_motif=args.add_splice_motif,
                                 genome_fasta_fh=genome_fasta_fh,
                                 bed_fh=bed_fh,
                                 filtered_bed_fh=filtered_bed_fh,
                                 stats_fh=stats_fh)
                input_queue.task_done()
            # acknowledge the None sentinel so input_queue.join() can return
            input_queue.task_done()
    finally:
        if genome_fasta_fh is not None:
            genome_fasta_fh.close()

    # sort output files
    logging.debug('Sorting aggregated files: "%s"', output_dir)
    sort_jobs = [
        ('\ttransfrags bed file: %s',
         tmp_results.transfrags_bed_file,
         results.transfrags_bed_file),
        ('\tfiltered transfrags bed file: %s',
         tmp_results.transfrags_filtered_bed_file,
         results.transfrags_filtered_bed_file),
    ]
    for message, unsorted_file, sorted_file in sort_jobs:
        logging.debug(message, results.output_dir)
        retcode = sort_bed(unsorted_file,
                           sorted_file,
                           num_processes=1,
                           tmp_dir=tmp_dir)
        # check the result before deleting the unsorted input so a failed
        # sort leaves its input on disk (same order for both files)
        if retcode != 0:
            raise TacoError('Error running linux sort')
        os.remove(unsorted_file)
    # remove temporary directories
    logging.debug('\t%s cleaning up', results.output_dir)
    os.rmdir(tmp_dir)
Exemple #4
0
def aggregate_parallel(samples, args, results):
    '''
    Process and aggregate GTF input files

    samples: list of Sample objects
    args: from Argparse module. command-line arguments to configure the
          assembly process
    results: Results object containing input and output filenames

    The work is spread over args.num_processes aggregate_worker processes
    that read Sample objects from a shared queue. Every worker writes into
    its own directory below results.tmp_dir; once all workers have exited
    their bed and sample stats files are merged into the files named by
    'results' and the worker directories are deleted.

    Returns 0 on success. Raises TacoError if any worker process exits
    with a nonzero exit code or if merging the bed files fails.
    '''
    logging.info('Aggregating in parallel using %d processes',
                 args.num_processes)

    if args.filter_splice_juncs and args.ref_genome_fasta_file:
        # test opening FastaFile
        logging.info('Indexing reference genome fasta file (if necessary)')
        fasta_fh = FastaFile(args.ref_genome_fasta_file)
        fasta_fh.close()

    # create queue (bounded: put() blocks once it holds 2 items per worker)
    input_queue = JoinableQueue(maxsize=args.num_processes * 2)
    # start worker processes
    procs = []
    worker_ids = []
    worker_results = []
    for i in range(args.num_processes):
        worker_id = 'aggregate_worker%03d' % i
        worker_dir = os.path.join(results.tmp_dir, worker_id)
        if not os.path.exists(worker_dir):
            os.makedirs(worker_dir)
        worker_ids.append(worker_id)
        worker_results.append(Results(worker_dir))
        p = Process(target=aggregate_worker,
                    args=(input_queue, args, worker_dir))
        p.start()
        procs.append(p)

    # reference gtf
    if args.ref_gtf_file is not None:
        logging.debug('Reference: %s', args.ref_gtf_file)
        input_queue.put(Sample(args.ref_gtf_file, Sample.REF_ID))
    # parse samples
    for sample in samples:
        input_queue.put(sample)
    # each worker stops after reading one None, so send one per worker
    for p in procs:
        input_queue.put(None)
    # close input queue
    input_queue.join()
    input_queue.close()
    # join worker processes and remember the ones that did not exit cleanly
    failed_workers = []
    for worker_id, p in zip(worker_ids, procs):
        p.join()
        if p.exitcode != 0:
            failed_workers.append(worker_id)
    # do not merge when a worker failed: its output files may be missing
    # or incomplete
    if failed_workers:
        raise TacoError('Aggregate worker process failed: %s' %
                        ', '.join(failed_workers))

    # merge output files
    logging.info('Merging aggregated files')
    logging.debug('\tmerging bed files')
    retcode = merge_bed(
        input_files=[r.transfrags_bed_file for r in worker_results],
        output_file=results.transfrags_bed_file,
        num_processes=args.num_processes,
        tmp_dir=results.tmp_dir)
    if retcode != 0:
        raise TacoError('Error running linux merge')

    logging.debug('\tmerging filtered bed files')
    retcode = merge_bed(
        input_files=[r.transfrags_filtered_bed_file for r in worker_results],
        output_file=results.transfrags_filtered_bed_file,
        num_processes=args.num_processes,
        tmp_dir=results.tmp_dir)
    if retcode != 0:
        raise TacoError('Error running linux merge')

    logging.debug('\tmerging sample stats')

    def sort_key_field0(line):
        # merge key: first tab-delimited field of the line
        fields = line.split('\t', 1)
        return fields[0]

    # last column carries the newline that ends the header line
    stats_header = [
        'sample_id', 'num_transfrags', 'filtered_length', 'filtered_expr',
        'filtered_splice\n'
    ]
    stats_header = '\t'.join(stats_header)
    merge_files(input_files=[r.sample_stats_file for r in worker_results],
                output_file=results.sample_stats_file,
                key=sort_key_field0,
                header=stats_header)
    # cleanup worker data
    logging.info('Removing temporary files')

    def shutil_error_callback(func, path, excinfo):
        # report the failure and let rmtree continue with remaining paths
        logging.error('Error removing tmp files path=%s message=%s',
                      path, excinfo)

    for r in worker_results:
        shutil.rmtree(r.output_dir, onerror=shutil_error_callback)
    logging.info('Aggregate done')
    return 0
Exemple #5
0
def aggregate_worker(input_queue, args, output_dir):
    '''
    Aggregate the samples read from a queue and sort the resulting files

    input_queue: joinable queue of Sample objects, ended by a None item
    args: from Argparse module. command-line arguments to configure the
          aggregation process
    output_dir: directory where this worker writes its output

    Transfrags are first written unsorted into output_dir/tmp, then
    sorted into output_dir with sort_bed; the unsorted files and the tmp
    directory are removed afterwards. Sample stats go straight into
    output_dir.

    Raises TacoError if sort_bed returns a nonzero code.
    '''
    results = Results(output_dir)
    # create temp directories
    tmp_dir = os.path.join(results.output_dir, 'tmp')
    if not os.path.exists(tmp_dir):
        logging.debug('\tcreating tmp dir %s', tmp_dir)
        os.makedirs(tmp_dir)
    # create set of unsorted results
    tmp_results = Results(tmp_dir)
    # setup genome fasta file
    genome_fasta_fh = None
    if args.filter_splice_juncs and args.ref_genome_fasta_file:
        genome_fasta_fh = FastaFile(args.ref_genome_fasta_file)
    try:
        # the 'with' block closes all three output files on any exit path
        with open(tmp_results.transfrags_bed_file, 'w') as bed_fh, \
                open(tmp_results.transfrags_filtered_bed_file, 'w') \
                as filtered_bed_fh, \
                open(results.sample_stats_file, 'w') as stats_fh:
            # process samples via input queue
            while True:
                sample = input_queue.get()
                if sample is None:
                    # the sentinel also counts as a queue task
                    input_queue.task_done()
                    break
                aggregate_sample(sample,
                                 gtf_expr_attr=args.gtf_expr_attr,
                                 is_ref=(sample._id == Sample.REF_ID),
                                 min_length=args.filter_min_length,
                                 min_expr=args.filter_min_expr,
                                 filter_splice_juncs=args.filter_splice_juncs,
                                 add_splice_motif=args.add_splice_motif,
                                 genome_fasta_fh=genome_fasta_fh,
                                 bed_fh=bed_fh,
                                 filtered_bed_fh=filtered_bed_fh,
                                 stats_fh=stats_fh)
                input_queue.task_done()
    finally:
        # release the fasta handle even when aggregate_sample raises
        if genome_fasta_fh is not None:
            genome_fasta_fh.close()

    # sort output files
    logging.debug('Sorting aggregated files: "%s"', output_dir)
    # sort bed file
    logging.debug('\ttransfrags bed file: %s', results.output_dir)
    retcode = sort_bed(tmp_results.transfrags_bed_file,
                       results.transfrags_bed_file,
                       num_processes=1,
                       tmp_dir=tmp_dir)
    if retcode != 0:
        raise TacoError('Error running linux sort')
    os.remove(tmp_results.transfrags_bed_file)
    # sort filtered bed file
    logging.debug('\tfiltered transfrags bed file: %s', results.output_dir)
    retcode = sort_bed(tmp_results.transfrags_filtered_bed_file,
                       results.transfrags_filtered_bed_file,
                       num_processes=1,
                       tmp_dir=tmp_dir)
    # check before removing, as for the unfiltered file above, so a failed
    # sort does not delete its unsorted input
    if retcode != 0:
        raise TacoError('Error running linux sort')
    os.remove(tmp_results.transfrags_filtered_bed_file)
    # remove temporary directories
    logging.debug('\t%s cleaning up', results.output_dir)
    os.rmdir(tmp_dir)