Example #1
def main():
    # setup logging
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    logging.info("AssemblyLine %s" % (assemblyline.__version__))
    logging.info("----------------------------------")
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument('--tmp-dir',
                        dest="tmp_dir",
                        default=None,
                        help="directory for sort to store temp files")
    parser.add_argument("input_gtf_file")
    parser.add_argument("output_gtf_file")
    args = parser.parse_args()
    # check command line
    if not os.path.exists(args.input_gtf_file):
        parser.error("input gtf file %s not found" % (args.input_gtf_file))
    logging.info("Parameters:")
    logging.info("input gtf file:  %s" % (args.input_gtf_file))
    logging.info("output gtf file: %s" % (args.output_gtf_file))
    logging.info("Sorting")
    sort_gtf(args.input_gtf_file, args.output_gtf_file, tmp_dir=args.tmp_dir)
    logging.info("Done")
    return 0
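The main above delegates the actual work to sort_gtf, an AssemblyLine helper whose implementation is not shown on this page. As a rough sketch of what such a helper can look like (an assumption, not the project's actual code): GTF is tab-delimited with the chromosome in column 1 and the start coordinate in column 4, so GNU sort can do the heavy lifting, with -T honoring the --tmp-dir option:

import subprocess

def sort_gtf_sketch(input_gtf_file, output_gtf_file, tmp_dir=None):
    # GTF is tab-delimited: chromosome in column 1, start in column 4.
    # Sort lexicographically by chromosome, then numerically by start.
    cmd = ['sort', '-k1,1', '-k4,4n', input_gtf_file, '-o', output_gtf_file]
    if tmp_dir is not None:
        cmd.extend(['-T', tmp_dir])  # where sort keeps its temporary files
    return subprocess.call(cmd)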
Example #2
def main():
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", action="store_true", 
                        dest="verbose", default=False)
    parser.add_argument("ref_gtf_file")
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    # set logging level
    if args.verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(level=level,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # check command line parameters
    if not os.path.exists(args.ref_gtf_file):
        parser.error("GTF file %s not found" % (args.ref_gtf_file))
    if not os.path.exists(args.gtf_file):
        parser.error("GTF file %s not found" % (args.gtf_file))
    logging.info("AssemblyLine %s" % (assemblyline.__version__))
    logging.info("----------------------------------")   
    # show parameters
    logging.info("Parameters:")
    logging.info("verbose logging:       %s" % (args.verbose))
    logging.info("ref gtf file:          %s" % (args.ref_gtf_file))
    logging.info("assembly gtf file:     %s" % (args.gtf_file))
    # find CDS regions
    if not os.path.exists('tmp.srt.gtf'):
        with open('tmp.gtf', 'w') as outfileh:
            logging.info("Reading CDS regions from reference GTF")
            for f in get_cds_features(args.ref_gtf_file):
                print >>outfileh, str(f)
            logging.info("Reading transcripts from assembly GTF")
            i = 0
            for f in GTFFeature.parse(open(args.gtf_file)):
                print >>outfileh, str(f)
                i += 1
                if i % 100000 == 0:
                    logging.debug("Parsed %d transcripts" % (i))
        logging.info("Sorting GTF file")
        sort_gtf('tmp.gtf', 'tmp.srt.gtf')
    for locus_transcripts in parse_gtf(open('tmp.srt.gtf')):
        locus_chrom = locus_transcripts[0].chrom
        locus_start = locus_transcripts[0].start
        locus_end = max(t.end for t in locus_transcripts)
        logging.debug("[LOCUS] %s:%d-%d %d transcripts" % 
                      (locus_chrom, locus_start, locus_end, 
                       len(locus_transcripts)))
        for start, end, strand, m, t, c in categorize(locus_transcripts):
            fields = [locus_chrom, str(start), str(end), '%s|%s|%s' % (m,t,c), '0', strand_int_to_str(strand)]
            print '\t'.join(fields)    
    return 0
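Each categorized region is printed as a BED6 line (chrom, start, end, name, score, strand), with the name packing the m|t|c values. strand_int_to_str is a project helper not shown here; a plausible sketch, assuming the common 0/1 integer encoding for the +/- strands:

# Hypothetical sketch of strand_int_to_str; the integer-to-symbol
# mapping used by AssemblyLine is assumed here.
POS_STRAND, NEG_STRAND = 0, 1

def strand_int_to_str(strand):
    if strand == POS_STRAND:
        return '+'
    if strand == NEG_STRAND:
        return '-'
    return '.'  # unstranded / unknown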
Example #3
def compare_assembly(ref_gtf_file, test_gtf_file, output_dir, 
                     gtf_score_attr, tmp_dir):
    # output files
    compare_file = os.path.join(output_dir, "compare_transcripts.txt")
    global_stats_file = os.path.join(output_dir, "global_stats.txt")
    tmp_gtf_file = os.path.join(output_dir, "tmp.gtf")
    tmp_sorted_gtf_file = os.path.splitext(tmp_gtf_file)[0] + ".srt.gtf"
    # merge and sort ref/test gtf files
    logging.info("Merging reference and test GTF files")
    # make temporary file to store merged ref/test gtf files
    outfh = open(tmp_gtf_file, "w")
    logging.info("Adding reference GTF file")
    add_gtf_file(ref_gtf_file, outfh, is_ref=True, sample_id=None)
    logging.info("Adding test GTF file")
    add_gtf_file(test_gtf_file, outfh, is_ref=False, sample_id='assembly')
    outfh.close()
    logging.info("Sorting merged GTF file")
    sort_gtf(tmp_gtf_file, tmp_sorted_gtf_file, tmp_dir=tmp_dir)
    os.remove(tmp_gtf_file)
    # compare assemblies
    logging.info("Comparing assemblies")
    cmp_fh = open(compare_file, "w")
    print >>cmp_fh, '\t'.join(map(str, MatchStats.header_fields()))
    stats_obj = GlobalStats()
    for locus_transcripts in parse_gtf(open(tmp_sorted_gtf_file)):
        locus_chrom = locus_transcripts[0].chrom
        locus_start = locus_transcripts[0].start
        locus_end = max(t.end for t in locus_transcripts)
        logging.debug("[LOCUS] %s:%d-%d %d transcripts" % 
                      (locus_chrom, locus_start, locus_end, 
                       len(locus_transcripts)))    
        # score transcripts
        for t in locus_transcripts:
            if gtf_score_attr is None:
                t.score = 0.0
            else:
                t.score = float(t.attrs.get(gtf_score_attr, 0.0))
        # run comparison
        for mobj in compare_locus(locus_transcripts):
            print >>cmp_fh, str(mobj)
        # measure global stats
        locus_stats_obj = gather_global_stats(locus_transcripts)
        stats_obj = stats_obj + locus_stats_obj        
    # cleanup
    cmp_fh.close()
    logging.info("Printing report")    
    f = open(global_stats_file, "w")
    print >>f, stats_obj.report()
    os.remove(tmp_sorted_gtf_file)
    logging.info("Done")    
Example #4
def main():
    # setup logging
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    logging.info("AssemblyLine %s" % (assemblyline.__version__))
    logging.info("----------------------------------")
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument('--min-transcript-length', type=int, 
                        dest="min_transcript_length",
                        metavar="N",
                        default=config.MIN_TRANSCRIPT_LENGTH,
                        help="Skip ab initio transcripts equal to or below "
                        "this length [default=%(default)s]")
    parser.add_argument("--gtf-score-attr", dest="gtf_score_attr", 
                        default="FPKM", metavar="ATTR",
                        help="GTF attribute field containing transcript "
                        "expression [default='%(default)s']")
    parser.add_argument('-o', '--output-dir', dest="output_dir", 
                        default="assemblyline_out",
                        help="directory to store assemblyline results and "
                        "intermediate files [default=%(default)s]")
    parser.add_argument("--random-test-frac", dest="random_test_frac", 
                        default=0.1, metavar="FRAC", type=float,
                        help="if no user-defined tests are specified "
                        "using '--tests' randomly designate a fraction "
                        "of reference transcripts as test data for use "
                        "in classification [default=%(default)s]")
    parser.add_argument("--tests", dest="test_file", default=None,
                        help="(optional) text file containing "
                        "reference 'gene_id' attributes "
                        "(one per line) that define test cases "
                        "to use for validation purposes")
    parser.add_argument('ref_gtf_file')
    parser.add_argument('library_table_file')
    args = parser.parse_args()
    # check command line parameters
    if not os.path.exists(args.library_table_file):
        parser.error("library table file %s not found" % (args.library_table_file))
    if args.min_transcript_length < 0:
        parser.error("min_transcript_length < 0")
    if not os.path.exists(args.ref_gtf_file):
        parser.error("reference GTF file %s not found" % (args.ref_gtf_file))
    if (args.test_file is not None) and (not os.path.exists(args.test_file)):
        parser.error("test file %s not found" % (args.test_file))
    if (args.random_test_frac < 0):
        parser.error("cannot set --random-test-frac < 0")
    # show parameters
    logging.info("Parameters:")
    logging.info("min transcript length: %d" % (args.min_transcript_length))
    logging.info("gtf score attr:        %s" % (args.gtf_score_attr))
    logging.info("output directory:      %s" % (args.output_dir))
    logging.info("reference GTF file:    %s" % (args.ref_gtf_file))
    logging.info("test file:             %s" % (args.test_file))
    logging.info("library table file:    %s" % (args.library_table_file))
    logging.info("----------------------------------")
    # setup results
    results = config.AssemblylineResults(args.output_dir)
    # create output directory
    if not os.path.exists(results.run_dir):
        logging.debug("Creating output directory '%s'" % (results.run_dir))
        os.makedirs(results.run_dir)
    if not os.path.exists(results.tmp_dir):
        logging.info("Creating tmp directory '%s'" % (results.tmp_dir))
        os.makedirs(results.tmp_dir)
    # parse sample table
    logging.info("Parsing library table")
    libraries = []
    valid = True
    library_num = 1
    sample_num = 1
    sample_id_map = {}
    library_map_fileh = open(results.library_id_map, 'w')
    sample_map_fileh = open(results.sample_id_map, 'w')
    for library in Library.from_file(args.library_table_file):
        # exclude samples
        if not os.path.exists(library.gtf_file):
            logging.warning("Library '%s' GTF file not found" % (library.library_id))
            valid = False
            continue
        # rename library id
        new_library_id = "L%d" % (library_num)
        print >>library_map_fileh, '\t'.join([new_library_id, library.library_id]) 
        library_num += 1
        library.library_id = new_library_id
        # rename sample id
        if library.sample_id not in sample_id_map:
            new_sample_id = "S%d" % (sample_num)
            print >>sample_map_fileh, '\t'.join([new_sample_id, library.sample_id]) 
            sample_id_map[library.sample_id] = new_sample_id
            sample_num += 1
        else:
            new_sample_id = sample_id_map[library.sample_id]
        library.sample_id = new_sample_id
        libraries.append(library)
    if not valid:
        logging.warning("Invalid libraries in table file")
    library_map_fileh.close()
    sample_map_fileh.close()
    # setup output files
    tmp_file = os.path.join(results.tmp_dir, "transcripts.unsorted.gtf")
    tmpfileh = open(tmp_file, "w")
    dropfileh = open(results.transcripts_dropped_gtf_file, "w")
    statsfileh = open(results.transcript_stats_file, 'w')
    header_fields = ['#library_id']
    header_fields.extend(config.TRANSCRIPT_STATS_FIELDS)
    header_fields.extend([("failed_q%d" % x) for x in config.TRANSCRIPT_SCORE_QUANTILES])
    header_fields.extend([("passed_q%d" % x) for x in config.TRANSCRIPT_SCORE_QUANTILES])
    print >>statsfileh, '\t'.join(header_fields)
    # read test transcripts
    test_gene_ids = set()
    if args.test_file is not None:
        fileh = open(args.test_file)
        test_gene_ids.update(line.strip() for line in fileh)
        fileh.close()
        logging.info("Read %d test genes" % len(test_gene_ids))
    # read reference GTF file and aggregate
    logging.info("Adding reference GTF file")
    add_reference_gtf_file(args.ref_gtf_file, test_gene_ids, 
                           args.random_test_frac, tmpfileh)
    # parse sample table
    logging.info("Adding libraries")
    for library in libraries:
        t_dict = read_gtf_file(library, args.gtf_score_attr)
        logging.debug("Read %s transcripts from file %s" % (len(t_dict), 
                                                            library.gtf_file))
        if len(t_dict) == 0:
            logging.warning("Library %s has no transcripts" % 
                            (library.library_id))
        else:
            filter_transcripts(library.library_id, t_dict, 
                               tmpfileh, dropfileh, statsfileh, 
                               args.min_transcript_length)
    statsfileh.close()
    tmpfileh.close()
    logging.info("Sorting GTF")
    retcode = sort_gtf(tmp_file, results.transcripts_gtf_file, tmp_dir=results.tmp_dir)
    if retcode != 0:
        logging.error("sort GTF failed")
        if os.path.exists(results.transcripts_gtf_file):
            os.remove(results.transcripts_gtf_file)
    os.remove(tmp_file)
    logging.info("Done")
    return retcode
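The --random-test-frac option suggests that, absent a --tests file, add_reference_gtf_file designates each reference gene as a test case with the given probability. A hedged reconstruction of that per-gene decision (is_test_gene is a hypothetical name, not a function from the project):

import random

def is_test_gene(gene_id, test_gene_ids, random_test_frac):
    # Hypothetical: explicitly listed genes are always test cases;
    # otherwise flip a biased coin with probability random_test_frac.
    if test_gene_ids:
        return gene_id in test_gene_ids
    return random.random() < random_test_frac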
Example #7
def compare_assemblies(ref_gtf_file, test_gtf_file, output_dir):
    # output files
    if not os.path.exists(output_dir):
        logging.info('Creating output dir: %s' % (output_dir))
        os.makedirs(output_dir)
    # merge step
    merged_gtf_file = os.path.join(output_dir, "merged.gtf")
    merged_sorted_gtf_file = os.path.splitext(merged_gtf_file)[0] + ".srt.gtf"
    merge_done_file = os.path.join(output_dir, 'merged.done')
    sort_done_file = os.path.join(output_dir, 'sort.done')
    if not os.path.exists(merge_done_file):
        # merge and sort ref/test gtf files
        logging.info("Merging reference and test GTF files")
        # make temporary file to store merged ref/test gtf files
        with open(merged_gtf_file, "w") as fileh:
            logging.info("Adding reference GTF file")
            add_gtf_file(ref_gtf_file, fileh, is_ref=True)
            logging.info("Adding test GTF file")
            add_gtf_file(test_gtf_file, fileh, is_ref=False)
        open(merge_done_file, 'w').close()
    if not os.path.exists(sort_done_file):
        logging.info("Sorting merged GTF file")
        # create temp directory
        tmp_dir = os.path.join(output_dir, 'tmp')
        if not os.path.exists(tmp_dir):
            logging.debug("Creating tmp directory '%s'" % (tmp_dir))
            os.makedirs(tmp_dir)
        sort_gtf(merged_gtf_file, merged_sorted_gtf_file, tmp_dir=tmp_dir)
        # cleanup
        shutil.rmtree(tmp_dir)
        open(sort_done_file, 'w').close()
    # compare assemblies
    overlapping_gtf_file = os.path.join(output_dir, 'overlapping.gtf')
    intergenic_tmp_gtf_file = os.path.join(output_dir, 'intergenic.tmp.gtf')
    overlapping_file = os.path.join(output_dir, 'overlapping.tsv')
    overlapping_consensus_file = os.path.join(output_dir,
                                              'overlapping.consensus.tsv')
    overlapping_done_file = os.path.join(output_dir, 'overlapping.done')
    stats_file = os.path.join(output_dir, 'stats.txt')
    stats_obj = GlobalStats()
    num_intergenic = 0
    if not os.path.exists(overlapping_done_file):
        logging.info("Comparing assemblies")
        gtf_fileh = open(overlapping_gtf_file, 'w')
        tmp_gtf_fileh = open(intergenic_tmp_gtf_file, 'w')
        overlapping_fileh = open(overlapping_file, 'w')
        overlapping_consensus_fileh = open(overlapping_consensus_file, 'w')
        for locus_transcripts in parse_gtf(open(merged_sorted_gtf_file)):
            locus_chrom = locus_transcripts[0].chrom
            locus_start = locus_transcripts[0].start
            locus_end = max(t.end for t in locus_transcripts)
            logging.debug(
                "[LOCUS] %s:%d-%d %d transcripts" %
                (locus_chrom, locus_start, locus_end, len(locus_transcripts)))
            for t, match_stats in compare_locus(locus_transcripts):
                if len(match_stats) == 0:
                    # write intergenic transcripts to analyze separately
                    t.attrs['category'] = Category.to_str(Category.INTERGENIC)
                    for f in t.to_gtf_features(source='assembly'):
                        print >> tmp_gtf_fileh, str(f)
                    num_intergenic += 1
                else:
                    # get consensus match information
                    consensus_match = MatchStats.consensus(match_stats)
                    assert consensus_match is not None
                    t.attrs['category'] = consensus_match.category
                    # add gtf attributes and write
                    for f in t.to_gtf_features(source='assembly'):
                        consensus_match.add_gtf_attributes(f)
                        print >> gtf_fileh, str(f)
                    # tab-delimited text output
                    print >> overlapping_consensus_fileh, str(consensus_match)
                    for ms in match_stats:
                        print >> overlapping_fileh, str(ms)
            # compute global statistics
            stats_obj.compute(locus_transcripts)
        logging.info("Reporting global statistics")
        with open(stats_file, 'w') as f:
            print >> f, stats_obj.report()
        gtf_fileh.close()
        tmp_gtf_fileh.close()
        overlapping_fileh.close()
        overlapping_consensus_fileh.close()
        open(overlapping_done_file, 'w').close()
    # resolve intergenic transcripts
    intergenic_gtf_file = os.path.join(output_dir, 'intergenic.gtf')
    intergenic_file = os.path.join(output_dir, 'intergenic.tsv')
    intergenic_best_file = os.path.join(output_dir, 'intergenic.best.tsv')
    intergenic_done_file = os.path.join(output_dir, 'intergenic.done')
    if not os.path.exists(intergenic_done_file):
        logging.info("Building interval index")
        locus_trees = build_locus_trees(merged_sorted_gtf_file)
        logging.info('Finding nearest matches to intergenic transcripts')
        gtf_fileh = open(intergenic_gtf_file, 'w')
        intergenic_fileh = open(intergenic_file, 'w')
        intergenic_best_fileh = open(intergenic_best_file, 'w')
        for locus_transcripts in parse_gtf(open(intergenic_tmp_gtf_file)):
            for t in locus_transcripts:
                # find nearest transcripts
                nearest_transcripts = find_nearest_transcripts(
                    t.chrom, t.start, t.end, t.strand, locus_trees)
                match_stats = []
                best_match = None
                if len(nearest_transcripts) == 0:
                    best_match = MatchStats.from_transcript(t)
                    best_match.category = Category.to_str(Category.INTERGENIC)
                    match_stats.append(best_match)
                else:
                    for ref, category, dist in nearest_transcripts:
                        # create a match object
                        ms = MatchStats.from_transcript(t, ref)
                        ms.shared_same_strand_bp = 0
                        ms.shared_opp_strand_bp = 0
                        ms.shared_introns = 0
                        ms.shared_splicing = False
                        ms.category = Category.to_str(category)
                        ms.distance = dist
                        match_stats.append(ms)
                    # choose the consensus match
                    best_match = MatchStats.choose_best(match_stats)
                # add gtf attributes and write
                for f in t.to_gtf_features(source='assembly'):
                    best_match.add_gtf_attributes(f)
                    print >> gtf_fileh, str(f)
                # write tab-delimited data
                print >> intergenic_best_fileh, str(best_match)
                for ms in match_stats:
                    print >> intergenic_fileh, str(ms)
        gtf_fileh.close()
        intergenic_fileh.close()
        intergenic_best_fileh.close()
        open(intergenic_done_file, 'w').close()
    # merge overlapping and intergenic results
    logging.info('Merging results')
    metadata_file = os.path.join(output_dir, 'metadata.txt')
    metadata_consensus_file = os.path.join(output_dir,
                                           'metadata.consensus.txt')
    assembly_gtf_file = os.path.join(output_dir, 'assembly.cmp.gtf')
    combine_done_file = os.path.join(output_dir, 'done')
    if not os.path.exists(combine_done_file):
        filenames = [overlapping_file, intergenic_file]
        with open(metadata_file, 'w') as outfile:
            print >> outfile, '\t'.join(MatchStats.header_fields())
            for fname in filenames:
                with open(fname) as infile:
                    for line in infile:
                        outfile.write(line)
        filenames = [overlapping_consensus_file, intergenic_best_file]
        with open(metadata_consensus_file, 'w') as outfile:
            print >> outfile, '\t'.join(MatchStats.header_fields())
            for fname in filenames:
                with open(fname) as infile:
                    for line in infile:
                        outfile.write(line)
        filenames = [intergenic_gtf_file, overlapping_gtf_file]
        with open(assembly_gtf_file, 'w') as outfile:
            for fname in filenames:
                with open(fname) as infile:
                    for line in infile:
                        outfile.write(line)
        open(combine_done_file, 'w').close()
    # cleanup
    logging.info("Done")
Example #8
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    parser.add_argument('excl_file')
    parser.add_argument('chrom_sizes')
    parser.add_argument("output_prefix")
    args = parser.parse_args()
    prefix = args.output_prefix
    excl_file = args.excl_file
    chrom_sizes_file = args.chrom_sizes
    gtf_file = args.gtf_file
    # check command line parameters
    if which('bedtools') is None:
        parser.error('bedtools binary not found in PATH')
    if not os.path.exists(chrom_sizes_file):
        parser.error('chrom sizes file %s not found' % (chrom_sizes_file))
    gene_intervals_file = prefix + '.gene_intervals.bed'
    gene_intervals_shuffled_file = prefix + '.gene_intervals.shuffle.bed'
    shuffled_gtf_file = prefix + '.shuffle.gtf'
    sorted_shuffled_gtf_file = prefix + '.shuffle.srt.gtf'
    logging.info('Parsing GTF file')
    with open(gene_intervals_file, 'w') as f:
        for locus_transcripts in parse_gtf(open(gtf_file)):
            # find borders of locus
            locus_chrom = locus_transcripts[0].chrom
            locus_start = min(t.start for t in locus_transcripts)
            locus_end = max(t.end for t in locus_transcripts)
            logging.debug("[LOCUS] %s:%d-%d %d transcripts" % 
                          (locus_chrom, locus_start, locus_end, 
                           len(locus_transcripts)))
            for g in get_gene_intervals(locus_transcripts):
                print >>f, '\t'.join(map(str, [g.chrom, g.start, g.end, g.gene_id]))    
    # randomly shuffle genes
    logging.info("Shuffling genes")
    # use a new name so the argparse namespace 'args' is not shadowed
    shuffle_args = ['bedtools', 'shuffle',
                    '-excl', excl_file,
                    '-i', gene_intervals_file,
                    '-g', chrom_sizes_file]
    with open(gene_intervals_shuffled_file, 'w') as fileh:
        subprocess.call(shuffle_args, stdout=fileh)
    # read new gene positions
    logging.info("Reading shuffled gene intervals")
    shuffle_gene_map = {}
    with open(gene_intervals_shuffled_file) as fileh:
        for line in fileh:
            fields = line.strip().split('\t')
            chrom = fields[0]
            start = int(fields[1])
            end = int(fields[2])
            gene_id = fields[3]
            shuffle_gene_map[gene_id] = (chrom, start, end)
    # reposition transcripts
    logging.info("Repositioning transcripts")
    with open(shuffled_gtf_file, 'w') as fileh:
        for locus_transcripts in parse_gtf(open(gtf_file)):
            # get original positions
            orig_gene_map = {}
            for g in get_gene_intervals(locus_transcripts):
                orig_gene_map[g.gene_id] = (g.chrom, g.start, g.end)
            for t in locus_transcripts:
                gene_id = t.attrs['gene_id']
                orig_chrom, orig_start, orig_end = orig_gene_map[gene_id]
                if gene_id not in shuffle_gene_map:
                    logging.warning('Gene %s [%s:%d-%d] could not be shuffled' %
                                    (gene_id, orig_chrom, orig_start, orig_end))
                    continue
                new_chrom, new_start, new_end = shuffle_gene_map[gene_id]
                # reposition transcript
                t.chrom = new_chrom
                t.start = new_start + (t.start - orig_start)
                t.end = new_start + (t.end - orig_start)
                for e in t.exons:
                    e.start = new_start + (e.start - orig_start)
                    e.end = new_start + (e.end - orig_start)
                
                # write the repositioned transcript in BED format to stdout
                fields = write_bed(t.chrom, t.attrs['transcript_id'],
                                   t.strand, 1000, t.exons)
                print '\t'.join(fields)
                # also write GTF features so the sort step below has input
                for f in t.to_gtf_features(source='shuffle'):
                    print >>fileh, str(f)
    logging.info("Sorting GTF file")
    sort_gtf(shuffled_gtf_file, sorted_shuffled_gtf_file)
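Repositioning preserves each feature's offset within its gene: every new coordinate is new_start + (old coordinate - orig_start). A quick sanity check of that arithmetic:

# a gene shuffled from start 1000 to start 50000 carries an exon
# at 1200-1350 along to 50200-50350
orig_start, new_start = 1000, 50000
exon_start, exon_end = 1200, 1350
assert new_start + (exon_start - orig_start) == 50200
assert new_start + (exon_end - orig_start) == 50350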