import os
import shutil
import logging
from multiprocessing import Process, JoinableQueue


def annotate_gtf_parallel(input_gtf_file, output_gtf_file, gtf_sample_attr,
                          num_processors, tmp_dir):
    # create queue
    input_queue = JoinableQueue(maxsize=num_processors * 3)
    # start worker processes
    procs = []
    worker_gtf_files = []
    for i in xrange(num_processors):
        worker_gtf_file = os.path.join(tmp_dir, "annotate_worker%03d.gtf" % (i))
        worker_gtf_files.append(worker_gtf_file)
        args = (input_queue, worker_gtf_file, gtf_sample_attr)
        p = Process(target=annotate_gtf_worker, args=args)
        p.daemon = True
        p.start()
        procs.append(p)
    # feed loci to the workers
    for lines in parse_loci(open(input_gtf_file)):
        input_queue.put(lines)
    # stop workers: one empty-list sentinel per worker
    for p in procs:
        input_queue.put([])
    # wait for all queued work to finish, then close the queue
    input_queue.join()
    input_queue.close()
    # join worker processes
    for p in procs:
        p.join()
    # merge/sort worker gtf files
    logging.debug("Merging %d worker GTF file(s)" % (num_processors))
    merge_sort_gtf_files(worker_gtf_files, output_gtf_file, tmp_dir=tmp_dir)
    # remove worker gtf files
    for filename in worker_gtf_files:
        if os.path.exists(filename):
            os.remove(filename)
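
# annotate_gtf_worker is defined elsewhere and is not shown in this
# excerpt. The sketch below (hypothetical name _queue_worker_sketch, and
# a pass-through write in place of the real annotation logic) only
# illustrates the JoinableQueue contract the workers must honor: balance
# every get() with task_done() so input_queue.join() can return, and
# treat an empty list as the shutdown sentinel.
def _queue_worker_sketch(input_queue, worker_gtf_file):
    fileh = open(worker_gtf_file, "w")
    while True:
        lines = input_queue.get()
        try:
            if len(lines) == 0:
                # an empty list is the shutdown sentinel
                break
            for line in lines:
                # real workers annotate each feature before writing it
                print >>fileh, line.rstrip()
        finally:
            # acknowledge the item (sentinel included) so join() returns
            input_queue.task_done()
    fileh.close()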

def filter_transcripts(classify_dir, min_prec, min_rec, min_spec,
                       opt_variable, tmp_dir):
    # setup input and output files
    lib_counts_file = os.path.join(classify_dir, LIB_COUNTS_FILE)
    lib_counts_list = list(LibCounts.from_file(lib_counts_file))
    # filter each category
    expr_gtf_files = []
    bkgd_gtf_files = []
    for category_key in CATEGORIES:
        category_str = category_int_to_str[category_key]
        category_lib_counts = [x for x in lib_counts_list
                               if x.category_counts[category_key] > 0]
        library_ids = [x.library_id for x in category_lib_counts]
        cinfo = CategoryInfo.create(library_ids, category_key,
                                    category_str, classify_dir)
        filter_category(category_lib_counts, cinfo, min_prec, min_rec,
                        min_spec, opt_variable, tmp_dir)
        expr_gtf_files.append(cinfo.unann_expr_gtf_file)
        bkgd_gtf_files.append(cinfo.unann_bkgd_gtf_file)
    # only need one set of annotated gtf files
    expr_gtf_files.extend([cinfo.ann_expr_gtf_file, cinfo.ann_bkgd_gtf_file])
    # merge transcripts
    logging.info("Merging filtered transcripts")
    expr_gtf_file = os.path.join(classify_dir, EXPR_GTF_FILE)
    bkgd_gtf_file = os.path.join(classify_dir, BKGD_GTF_FILE)
    merge_sort_gtf_files(expr_gtf_files, expr_gtf_file, tmp_dir=tmp_dir)
    merge_sort_gtf_files(bkgd_gtf_files, bkgd_gtf_file, tmp_dir=tmp_dir)
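
# merge_sort_gtf_files is a project helper not shown in this excerpt. A
# minimal sketch of the behavior its callers rely on, assuming GTF
# records sort by seqname then start coordinate; the name
# _merge_sort_gtf_sketch and the in-memory sort are illustrative only
# (the real helper presumably spills to tmp_dir for large inputs).
def _merge_sort_gtf_sketch(input_files, output_file):
    records = []
    for path in input_files:
        for line in open(path):
            fields = line.rstrip('\n').split('\t')
            # sort key: seqname (column 1) then start coordinate (column 4)
            records.append((fields[0], int(fields[3]), line))
    records.sort()
    fileh = open(output_file, "w")
    for chrom, start, line in records:
        fileh.write(line)
    fileh.close()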

def merge_transcripts(results):
    # read library category statistics
    stats_list = list(CategoryStats.from_file(results.category_stats_file))
    library_ids = []
    expressed_gtf_files = []
    background_gtf_files = []
    for statsobj in stats_list:
        library_id = statsobj.library_id
        library_ids.append(library_id)
        prefix = os.path.join(results.classify_dir, library_id)
        expressed_gtf_files.append(prefix + ".expr.gtf")
        background_gtf_files.append(prefix + ".bkgd.gtf")
    # map library ids to library names
    library_id_map = {}
    for line in open(results.library_id_map):
        fields = line.strip().split('\t')
        library_id_map[fields[0]] = fields[1]
    # make a classification report
    logging.info("Writing classification report")
    fileh = open(results.classify_report_file, 'w')
    header_fields = ["library_id", "library_name", "category",
                     "train.auc", "test.auc", "train.cutoff",
                     "train.tp", "train.fp", "train.fn", "train.tn",
                     "train.sens", "train.spec", "train.balacc",
                     "test.tp", "test.fp", "test.fn", "test.tn",
                     "test.sens", "test.spec", "test.balacc"]
    print >> fileh, '\t'.join(header_fields)
    for library_id in library_ids:
        prefix = os.path.join(results.classify_dir, library_id)
        library_name = library_id_map[library_id]
        # append per-library performance rows for each category
        for category in ("intergenic", "intronic"):
            perf_file = prefix + ".%s.perf.txt" % (category)
            input_fileh = open(perf_file)
            # skip header line
            input_fileh.next()
            for line in input_fileh:
                fields = ([library_id, library_name, category] +
                          line.strip().split('\t'))
                print >> fileh, '\t'.join(fields)
            input_fileh.close()
    fileh.close()
    # add reference gtf file to both sets
    expressed_gtf_files.append(results.ref_gtf_file)
    background_gtf_files.append(results.ref_gtf_file)
    # merge/sort gtf files
    logging.info("Merging and sorting expressed GTF files")
    merge_sort_gtf_files(expressed_gtf_files, results.expressed_gtf_file,
                         tmp_dir=results.tmp_dir)
    logging.info("Merging and sorting background GTF files")
    merge_sort_gtf_files(background_gtf_files, results.background_gtf_file,
                         tmp_dir=results.tmp_dir)
    return 0
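
# A small illustrative reader for the tab-delimited report written
# above; the helper name is hypothetical. Each row comes back as a dict
# keyed by the header fields, e.g. row["test.balacc"].
def _read_classify_report(report_file):
    fileh = open(report_file)
    header = fileh.next().strip().split('\t')
    for line in fileh:
        yield dict(zip(header, line.strip().split('\t')))
    fileh.close()
# usage: [row["test.balacc"] for row in _read_classify_report(path)]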

def run_parallel(config):
    """
    runs assembly in parallel and merges output from child processes

    config: RunConfig object
    """
    # create temp directory
    tmp_dir = os.path.join(config.output_dir, "tmp")
    if not os.path.exists(tmp_dir):
        logging.debug("Creating tmp directory '%s'" % (tmp_dir))
        os.makedirs(tmp_dir)
    # create queue
    input_queue = JoinableQueue(maxsize=config.num_processors * 3)
    # shared memory values
    locus_id_value_obj = LockValue(1)
    gene_id_value_obj = LockValue(1)
    tss_id_value_obj = LockValue(1)
    t_id_value_obj = LockValue(1)
    # start worker processes
    procs = []
    worker_prefixes = []
    for i in xrange(config.num_processors):
        worker_prefix = os.path.join(tmp_dir, "worker%03d" % (i))
        worker_prefixes.append(worker_prefix)
        args = (input_queue, locus_id_value_obj, gene_id_value_obj,
                tss_id_value_obj, t_id_value_obj, worker_prefix, config)
        p = Process(target=assembly_worker, args=args)
        p.daemon = True
        p.start()
        procs.append(p)
    # parse gtf file
    for lines in parse_loci(open(config.gtf_input_file)):
        input_queue.put(lines)
    # stop workers
    for p in procs:
        input_queue.put([])
    # close queue
    input_queue.join()
    input_queue.close()
    # join worker processes
    for p in procs:
        p.join()
    # merge gtf files
    if config.create_gtf:
        logging.info("Merging %d worker GTF files" % (config.num_processors))
        worker_gtf_files = [prefix + ".gtf" for prefix in worker_prefixes]
        output_gtf_file = os.path.join(config.output_dir, "assembly.gtf")
        merge_sort_gtf_files(worker_gtf_files, output_gtf_file,
                             tmp_dir=tmp_dir)
        # remove worker gtf files
        for filename in worker_gtf_files:
            if os.path.exists(filename):
                os.remove(filename)
    # merge bed files
    if config.create_bed:
        logging.info("Merging %d worker BED files" % (config.num_processors))
        worker_bed_files = [p + ".bed" for p in worker_prefixes]
        output_bed_file = os.path.join(config.output_dir, "assembly.bed")
        merge_sort_files(worker_bed_files, output_bed_file,
                         sort_func=sort_bed, tmp_dir=tmp_dir)
        # write bed file track description line
        track_name = os.path.basename(config.output_dir)
        track_line = ' '.join(['track name="%s"' % (track_name),
                               'description="%s"' % (track_name),
                               'visibility=pack',
                               'useScore=1'])
        track_file = os.path.join(config.output_dir,
                                  "assembly.bed.ucsc_track")
        fileh = open(track_file, "w")
        print >>fileh, track_line
        fileh.close()
    # merge bedgraph files
    if config.create_bedgraph:
        logging.info("Merging %d worker bedGraph files" %
                     (config.num_processors))
        for strand in xrange(0, 3):
            strand_name = STRAND_NAMES[strand]
            bgfiles = ['%s_%s.bedgraph' % (p, strand_name)
                       for p in worker_prefixes]
            output_file = os.path.join(config.output_dir,
                                       "assembly_%s.bedgraph" % strand_name)
            merge_sort_files(bgfiles, output_file, sort_func=sort_bed,
                             tmp_dir=tmp_dir)
            track_name = '%s_%s' % (os.path.basename(config.output_dir),
                                    strand_name)
            track_line = ' '.join(['track type=bedGraph',
                                   'name="%s"' % (track_name),
                                   'description="%s"' % (track_name),
                                   'visibility=full',
                                   'color=%s' % (STRAND_COLORS[strand]),
                                   'autoScale=on',
                                   'alwaysZero=on',
                                   'maxHeightPixels=64:64:11'])
            track_file = os.path.join(config.output_dir,
                                      "assembly_%s.bedgraph.ucsc_track" %
                                      strand_name)
            fileh = open(track_file, "w")
            print >>fileh, track_line
            fileh.close()
    # cleanup
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    logging.info("Done")
    return 0
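
# LockValue is defined elsewhere in the package; the only contract
# visible here is that LockValue(1) creates a counter starting at 1,
# shared across worker processes so that locus/gene/tss/transcript ids
# stay globally unique. A minimal sketch of such a process-safe counter;
# the class name and the next() method are assumed interfaces.
import multiprocessing

class _LockValueSketch(object):
    def __init__(self, initval=0):
        # 'L' = unsigned long, allocated in shared memory
        self.val = multiprocessing.Value('L', initval)

    def next(self):
        # atomically return the current id and advance the counter
        with self.val.get_lock():
            v = self.val.value
            self.val.value = v + 1
            return v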