def main():
    # setup logging
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    logging.info("AssemblyLine %s" % (assemblyline.__version__))
    logging.info("----------------------------------")
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument('--tmp-dir', dest="tmp_dir", default=None,
                        help="directory for sort to store temp files")
    parser.add_argument("input_gtf_file")
    parser.add_argument("output_gtf_file")
    args = parser.parse_args()
    # check command line
    if not os.path.exists(args.input_gtf_file):
        parser.error("input gtf file %s not found" % (args.input_gtf_file))
    logging.info("Parameters:")
    logging.info("input gtf file:  %s" % (args.input_gtf_file))
    logging.info("output gtf file: %s" % (args.output_gtf_file))
    logging.info("Sorting")
    sort_gtf(args.input_gtf_file, args.output_gtf_file, tmp_dir=args.tmp_dir)
    logging.info("Done")
    return 0
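
# The sort_gtf helper called above is not shown in this section. A minimal
# sketch, assuming it shells out to GNU sort with chromosome/start keys and
# honors the tmp_dir option via sort's -T flag; the function name and body
# here are illustrative, not the actual AssemblyLine implementation:
def sort_gtf_sketch(input_gtf_file, output_gtf_file, tmp_dir=None):
    import subprocess
    # sort by chromosome (field 1), then start coordinate (field 4, numeric)
    args = ["sort", "-k1,1", "-k4,4n", input_gtf_file]
    if tmp_dir is not None:
        args.extend(["-T", tmp_dir])
    with open(output_gtf_file, "w") as outfh:
        return subprocess.call(args, stdout=outfh)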
def main():
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", action="store_true",
                        dest="verbose", default=False)
    parser.add_argument("ref_gtf_file")
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    # set logging level
    if args.verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # check command line parameters
    if not os.path.exists(args.ref_gtf_file):
        parser.error("GTF file %s not found" % (args.ref_gtf_file))
    if not os.path.exists(args.gtf_file):
        parser.error("GTF file %s not found" % (args.gtf_file))
    logging.info("AssemblyLine %s" % (assemblyline.__version__))
    logging.info("----------------------------------")
    # show parameters
    logging.info("Parameters:")
    logging.info("verbose logging:   %s" % (args.verbose))
    logging.info("ref gtf file:      %s" % (args.ref_gtf_file))
    logging.info("assembly gtf file: %s" % (args.gtf_file))
    # find CDS regions
    if not os.path.exists('tmp.srt.gtf'):
        with open('tmp.gtf', 'w') as outfileh:
            logging.info("Reading CDS regions from reference GTF")
            for f in get_cds_features(args.ref_gtf_file):
                print >>outfileh, str(f)
            logging.info("Reading transcripts from assembly GTF")
            i = 0
            for f in GTFFeature.parse(open(args.gtf_file)):
                print >>outfileh, str(f)
                i += 1
                if i % 100000 == 0:
                    logging.debug("Parsed %d transcripts" % (i))
        logging.info("Sorting GTF file")
        sort_gtf('tmp.gtf', 'tmp.srt.gtf')
    for locus_transcripts in parse_gtf(open('tmp.srt.gtf')):
        locus_chrom = locus_transcripts[0].chrom
        locus_start = locus_transcripts[0].start
        locus_end = max(t.end for t in locus_transcripts)
        logging.debug("[LOCUS] %s:%d-%d %d transcripts" %
                      (locus_chrom, locus_start, locus_end,
                       len(locus_transcripts)))
        for start, end, strand, m, t, c in categorize(locus_transcripts):
            fields = [locus_chrom, str(start), str(end),
                      '%s|%s|%s' % (m, t, c), '0',
                      strand_int_to_str(strand)]
            print '\t'.join(fields)
    return 0
def compare_assembly(ref_gtf_file, test_gtf_file, output_dir,
                     gtf_score_attr, tmp_dir):
    # output files
    compare_file = os.path.join(output_dir, "compare_transcripts.txt")
    global_stats_file = os.path.join(output_dir, "global_stats.txt")
    tmp_gtf_file = os.path.join(output_dir, "tmp.gtf")
    tmp_sorted_gtf_file = os.path.splitext(tmp_gtf_file)[0] + ".srt.gtf"
    # merge and sort ref/test gtf files
    logging.info("Merging reference and test GTF files")
    # make temporary file to store merged ref/test gtf files
    outfh = open(tmp_gtf_file, "w")
    logging.info("Adding reference GTF file")
    add_gtf_file(ref_gtf_file, outfh, is_ref=True, sample_id=None)
    logging.info("Adding test GTF file")
    add_gtf_file(test_gtf_file, outfh, is_ref=False, sample_id='assembly')
    outfh.close()
    logging.info("Sorting merged GTF file")
    sort_gtf(tmp_gtf_file, tmp_sorted_gtf_file, tmp_dir=tmp_dir)
    os.remove(tmp_gtf_file)
    # compare assemblies
    logging.info("Comparing assemblies")
    cmp_fh = open(compare_file, "w")
    print >>cmp_fh, '\t'.join(map(str, MatchStats.header_fields()))
    stats_obj = GlobalStats()
    for locus_transcripts in parse_gtf(open(tmp_sorted_gtf_file)):
        locus_chrom = locus_transcripts[0].chrom
        locus_start = locus_transcripts[0].start
        locus_end = max(t.end for t in locus_transcripts)
        logging.debug("[LOCUS] %s:%d-%d %d transcripts" %
                      (locus_chrom, locus_start, locus_end,
                       len(locus_transcripts)))
        # score transcripts
        for t in locus_transcripts:
            if gtf_score_attr is None:
                t.score = 0.0
            else:
                t.score = float(t.attrs.get(gtf_score_attr, 0.0))
        # run comparison
        for mobj in compare_locus(locus_transcripts):
            print >>cmp_fh, str(mobj)
        # measure global stats
        locus_stats_obj = gather_global_stats(locus_transcripts)
        stats_obj = stats_obj + locus_stats_obj
    # cleanup
    cmp_fh.close()
    logging.info("Printing report")
    f = open(global_stats_file, "w")
    print >>f, stats_obj.report()
    f.close()
    os.remove(tmp_sorted_gtf_file)
    logging.info("Done")
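
# add_gtf_file is referenced above but not defined in this section. Judging
# from its call sites, it tags features so reference and test transcripts can
# be told apart after merging. A minimal sketch under that assumption -- the
# attribute names ('ref', 'sample_id') are illustrative guesses:
def add_gtf_file_sketch(gtf_file, outfh, is_ref, sample_id=None):
    for f in GTFFeature.parse(open(gtf_file)):
        # mark provenance so downstream comparison can separate ref vs. test
        f.attrs['ref'] = '1' if is_ref else '0'
        if sample_id is not None:
            f.attrs['sample_id'] = sample_id
        print >>outfh, str(f)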
def main():
    # setup logging
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    logging.info("AssemblyLine %s" % (assemblyline.__version__))
    logging.info("----------------------------------")
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument('--min-transcript-length', type=int,
                        dest="min_transcript_length", metavar="N",
                        default=config.MIN_TRANSCRIPT_LENGTH,
                        help="Skip ab initio transcripts equal to or below "
                        "this length [default=%(default)s]")
    parser.add_argument("--gtf-score-attr", dest="gtf_score_attr",
                        default="FPKM", metavar="ATTR",
                        help="GTF attribute field containing transcript "
                        "expression [default='%(default)s']")
    parser.add_argument('-o', '--output-dir', dest="output_dir",
                        default="assemblyline_out",
                        help="directory to store assemblyline results and "
                        "intermediate files [default=%(default)s]")
    parser.add_argument("--random-test-frac", dest="random_test_frac",
                        default=0.1, metavar="FRAC", type=float,
                        help="if no user-defined tests are specified "
                        "using '--tests' randomly designate a fraction "
                        "of reference transcripts as test data for use "
                        "in classification [default=%(default)s]")
    parser.add_argument("--tests", dest="test_file", default=None,
                        help="(optional) text file containing "
                        "reference 'gene_id' attributes "
                        "(one per line) that define test cases "
                        "to use for validation purposes")
    parser.add_argument('ref_gtf_file')
    parser.add_argument('library_table_file')
    args = parser.parse_args()
    # check command line parameters
    if not os.path.exists(args.library_table_file):
        parser.error("library table file %s not found" %
                     (args.library_table_file))
    if args.min_transcript_length < 0:
        parser.error("min_transcript_length < 0")
    if not os.path.exists(args.ref_gtf_file):
        parser.error("reference GTF file %s not found" % (args.ref_gtf_file))
    if (args.test_file is not None) and (not os.path.exists(args.test_file)):
        parser.error("test file %s not found" % (args.test_file))
    if args.random_test_frac < 0:
        parser.error("cannot set --random-test-frac < 0")
    # show parameters
    logging.info("Parameters:")
    logging.info("min transcript length: %d" % (args.min_transcript_length))
    logging.info("gtf score attr:        %s" % (args.gtf_score_attr))
    logging.info("output directory:      %s" % (args.output_dir))
    logging.info("reference GTF file:    %s" % (args.ref_gtf_file))
    logging.info("test file:             %s" % (args.test_file))
    logging.info("library table file:    %s" % (args.library_table_file))
    logging.info("----------------------------------")
    # setup results
    results = config.AssemblylineResults(args.output_dir)
    # create output directory
    if not os.path.exists(results.run_dir):
        logging.debug("Creating output directory '%s'" % (results.run_dir))
        os.makedirs(results.run_dir)
    if not os.path.exists(results.tmp_dir):
        logging.info("Creating tmp directory '%s'" % (results.tmp_dir))
        os.makedirs(results.tmp_dir)
    # parse sample table
    logging.info("Parsing library table")
    libraries = []
    valid = True
    library_num = 1
    sample_num = 1
    sample_id_map = {}
    library_map_fileh = open(results.library_id_map, 'w')
    sample_map_fileh = open(results.sample_id_map, 'w')
    for library in Library.from_file(args.library_table_file):
        # exclude samples whose GTF file is missing
        if not os.path.exists(library.gtf_file):
            logging.warning("Library '%s' GTF file not found" %
                            (library.library_id))
            continue
        # rename library id
        new_library_id = "L%d" % (library_num)
        print >>library_map_fileh, '\t'.join([new_library_id,
                                              library.library_id])
        library_num += 1
        library.library_id = new_library_id
        # rename sample id
        if library.sample_id not in sample_id_map:
            new_sample_id = "S%d" % (sample_num)
            print >>sample_map_fileh, '\t'.join([new_sample_id,
                                                 library.sample_id])
            sample_id_map[library.sample_id] = new_sample_id
            sample_num += 1
        else:
            new_sample_id = sample_id_map[library.sample_id]
        library.sample_id = new_sample_id
        libraries.append(library)
    if not valid:
        logging.warning("Invalid libraries in table file")
    library_map_fileh.close()
    sample_map_fileh.close()
    # setup output files
    tmp_file = os.path.join(results.tmp_dir, "transcripts.unsorted.gtf")
    tmpfileh = open(tmp_file, "w")
    dropfileh = open(results.transcripts_dropped_gtf_file, "w")
    statsfileh = open(results.transcript_stats_file, 'w')
    header_fields = ['#library_id']
    header_fields.extend(config.TRANSCRIPT_STATS_FIELDS)
    header_fields.extend([("failed_q%d" % x)
                          for x in config.TRANSCRIPT_SCORE_QUANTILES])
    header_fields.extend([("passed_q%d" % x)
                          for x in config.TRANSCRIPT_SCORE_QUANTILES])
    print >>statsfileh, '\t'.join(header_fields)
    # read test transcripts
    test_gene_ids = set()
    if args.test_file is not None:
        fileh = open(args.test_file)
        test_gene_ids.update(line.strip() for line in fileh)
        fileh.close()
        logging.info("Read %d test genes" % len(test_gene_ids))
    # read reference GTF file and aggregate
    logging.info("Adding reference GTF file")
    add_reference_gtf_file(args.ref_gtf_file, test_gene_ids,
                           args.random_test_frac, tmpfileh)
    # parse sample table
    logging.info("Adding libraries")
    for library in libraries:
        t_dict = read_gtf_file(library, args.gtf_score_attr)
        logging.debug("Read %d transcripts from file %s" %
                      (len(t_dict), library.gtf_file))
        if len(t_dict) == 0:
            logging.warning("Library %s has no transcripts" %
                            (library.library_id))
        else:
            filter_transcripts(library.library_id, t_dict, tmpfileh,
                               dropfileh, statsfileh,
                               args.min_transcript_length)
    statsfileh.close()
    tmpfileh.close()
    dropfileh.close()
    logging.info("Sorting GTF")
    retcode = sort_gtf(tmp_file, results.transcripts_gtf_file,
                       tmp_dir=results.tmp_dir)
    if retcode != 0:
        logging.error("sort GTF failed")
        if os.path.exists(results.transcripts_gtf_file):
            os.remove(results.transcripts_gtf_file)
    os.remove(tmp_file)
    logging.info("Done")
    return retcode
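
# Library.from_file is not shown in this section. The calls above imply each
# Library carries at least library_id, sample_id, and gtf_file. A minimal
# sketch assuming a tab-delimited table with those three columns in that
# order (the real parser and column layout may differ):
class LibrarySketch(object):
    def __init__(self, library_id, sample_id, gtf_file):
        self.library_id = library_id
        self.sample_id = sample_id
        self.gtf_file = gtf_file

    @staticmethod
    def from_file(filename):
        with open(filename) as fileh:
            for line in fileh:
                # skip blank lines and comment/header lines
                if not line.strip() or line.startswith('#'):
                    continue
                fields = line.strip().split('\t')
                yield LibrarySketch(fields[0], fields[1], fields[2])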
def compare_assemblies(ref_gtf_file, test_gtf_file, output_dir):
    # output files
    if not os.path.exists(output_dir):
        logging.info('Creating output dir: %s' % (output_dir))
        os.makedirs(output_dir)
    # merge step
    merged_gtf_file = os.path.join(output_dir, "merged.gtf")
    merged_sorted_gtf_file = os.path.splitext(merged_gtf_file)[0] + ".srt.gtf"
    merge_done_file = os.path.join(output_dir, 'merged.done')
    sort_done_file = os.path.join(output_dir, 'sort.done')
    if not os.path.exists(merge_done_file):
        # merge and sort ref/test gtf files
        logging.info("Merging reference and test GTF files")
        # make temporary file to store merged ref/test gtf files
        with open(merged_gtf_file, "w") as fileh:
            logging.info("Adding reference GTF file")
            add_gtf_file(ref_gtf_file, fileh, is_ref=True)
            logging.info("Adding test GTF file")
            add_gtf_file(test_gtf_file, fileh, is_ref=False)
        open(merge_done_file, 'w').close()
    if not os.path.exists(sort_done_file):
        logging.info("Sorting merged GTF file")
        # create temp directory
        tmp_dir = os.path.join(output_dir, 'tmp')
        if not os.path.exists(tmp_dir):
            logging.debug("Creating tmp directory '%s'" % (tmp_dir))
            os.makedirs(tmp_dir)
        sort_gtf(merged_gtf_file, merged_sorted_gtf_file, tmp_dir=tmp_dir)
        # cleanup
        shutil.rmtree(tmp_dir)
        open(sort_done_file, 'w').close()
    # compare assemblies
    overlapping_gtf_file = os.path.join(output_dir, 'overlapping.gtf')
    intergenic_tmp_gtf_file = os.path.join(output_dir, 'intergenic.tmp.gtf')
    overlapping_file = os.path.join(output_dir, 'overlapping.tsv')
    overlapping_consensus_file = os.path.join(output_dir,
                                              'overlapping.consensus.tsv')
    overlapping_done_file = os.path.join(output_dir, 'overlapping.done')
    stats_file = os.path.join(output_dir, 'stats.txt')
    stats_obj = GlobalStats()
    num_intergenic = 0
    if not os.path.exists(overlapping_done_file):
        logging.info("Comparing assemblies")
        gtf_fileh = open(overlapping_gtf_file, 'w')
        tmp_gtf_fileh = open(intergenic_tmp_gtf_file, 'w')
        overlapping_fileh = open(overlapping_file, 'w')
        overlapping_consensus_fileh = open(overlapping_consensus_file, 'w')
        for locus_transcripts in parse_gtf(open(merged_sorted_gtf_file)):
            locus_chrom = locus_transcripts[0].chrom
            locus_start = locus_transcripts[0].start
            locus_end = max(t.end for t in locus_transcripts)
            logging.debug("[LOCUS] %s:%d-%d %d transcripts" %
                          (locus_chrom, locus_start, locus_end,
                           len(locus_transcripts)))
            for t, match_stats in compare_locus(locus_transcripts):
                if len(match_stats) == 0:
                    # write intergenic transcripts to analyze separately
                    t.attrs['category'] = Category.to_str(Category.INTERGENIC)
                    for f in t.to_gtf_features(source='assembly'):
                        print >>tmp_gtf_fileh, str(f)
                    num_intergenic += 1
                else:
                    # get consensus match information
                    consensus_match = MatchStats.consensus(match_stats)
                    assert consensus_match is not None
                    t.attrs['category'] = consensus_match.category
                    # add gtf attributes and write
                    for f in t.to_gtf_features(source='assembly'):
                        consensus_match.add_gtf_attributes(f)
                        print >>gtf_fileh, str(f)
                    # tab-delimited text output
                    print >>overlapping_consensus_fileh, str(consensus_match)
                    for ms in match_stats:
                        print >>overlapping_fileh, str(ms)
            # compute global statistics
            stats_obj.compute(locus_transcripts)
        logging.info("Reporting global statistics")
        with open(stats_file, 'w') as f:
            print >>f, stats_obj.report()
        gtf_fileh.close()
        tmp_gtf_fileh.close()
        overlapping_fileh.close()
        overlapping_consensus_fileh.close()
        open(overlapping_done_file, 'w').close()
    # resolve intergenic transcripts
    intergenic_gtf_file = os.path.join(output_dir, 'intergenic.gtf')
    intergenic_file = os.path.join(output_dir, 'intergenic.tsv')
    intergenic_best_file = os.path.join(output_dir, 'intergenic.best.tsv')
    intergenic_done_file = os.path.join(output_dir, 'intergenic.done')
    if not os.path.exists(intergenic_done_file):
        logging.info("Building interval index")
        locus_trees = build_locus_trees(merged_sorted_gtf_file)
        logging.info('Finding nearest matches to intergenic transcripts')
        gtf_fileh = open(intergenic_gtf_file, 'w')
        intergenic_fileh = open(intergenic_file, 'w')
        intergenic_best_fileh = open(intergenic_best_file, 'w')
        for locus_transcripts in parse_gtf(open(intergenic_tmp_gtf_file)):
            for t in locus_transcripts:
                # find nearest transcripts
                nearest_transcripts = find_nearest_transcripts(
                    t.chrom, t.start, t.end, t.strand, locus_trees)
                match_stats = []
                best_match = None
                if len(nearest_transcripts) == 0:
                    best_match = MatchStats.from_transcript(t)
                    best_match.category = Category.to_str(Category.INTERGENIC)
                    match_stats.append(best_match)
                else:
                    for ref, category, dist in nearest_transcripts:
                        # create a match object
                        ms = MatchStats.from_transcript(t, ref)
                        ms.shared_same_strand_bp = 0
                        ms.shared_opp_strand_bp = 0
                        ms.shared_introns = 0
                        ms.shared_splicing = False
                        ms.category = Category.to_str(category)
                        ms.distance = dist
                        match_stats.append(ms)
                    # choose the best match
                    best_match = MatchStats.choose_best(match_stats)
                # add gtf attributes and write
                for f in t.to_gtf_features(source='assembly'):
                    best_match.add_gtf_attributes(f)
                    print >>gtf_fileh, str(f)
                # write tab-delimited data
                print >>intergenic_best_fileh, str(best_match)
                for ms in match_stats:
                    print >>intergenic_fileh, str(ms)
        gtf_fileh.close()
        intergenic_fileh.close()
        intergenic_best_fileh.close()
        open(intergenic_done_file, 'w').close()
    # merge overlapping and intergenic results
    logging.info('Merging results')
    metadata_file = os.path.join(output_dir, 'metadata.txt')
    metadata_consensus_file = os.path.join(output_dir,
                                           'metadata.consensus.txt')
    assembly_gtf_file = os.path.join(output_dir, 'assembly.cmp.gtf')
    combine_done_file = os.path.join(output_dir, 'done')
    if not os.path.exists(combine_done_file):
        filenames = [overlapping_file, intergenic_file]
        with open(metadata_file, 'w') as outfile:
            print >>outfile, '\t'.join(MatchStats.header_fields())
            for fname in filenames:
                with open(fname) as infile:
                    for line in infile:
                        outfile.write(line)
        filenames = [overlapping_consensus_file, intergenic_best_file]
        with open(metadata_consensus_file, 'w') as outfile:
            print >>outfile, '\t'.join(MatchStats.header_fields())
            for fname in filenames:
                with open(fname) as infile:
                    for line in infile:
                        outfile.write(line)
        filenames = [intergenic_gtf_file, overlapping_gtf_file]
        with open(assembly_gtf_file, 'w') as outfile:
            for fname in filenames:
                with open(fname) as infile:
                    for line in infile:
                        outfile.write(line)
        open(combine_done_file, 'w').close()
    # cleanup
    logging.info("Done")
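
# The *.done sentinel files in compare_assemblies implement a simple
# checkpoint protocol: each expensive step runs only if its sentinel is
# absent, and an empty sentinel is created once the step finishes, so reruns
# skip completed work. A minimal generic sketch of the same pattern (the
# helper name is illustrative, not part of AssemblyLine):
def run_checkpointed_step(done_file, step_func):
    if os.path.exists(done_file):
        # step already completed on a previous run
        return
    step_func()
    # create the empty sentinel only after the step completes successfully
    open(done_file, 'w').close()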
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    parser.add_argument('excl_file')
    parser.add_argument('chrom_sizes')
    parser.add_argument("output_prefix")
    args = parser.parse_args()
    prefix = args.output_prefix
    excl_file = args.excl_file
    chrom_sizes_file = args.chrom_sizes
    gtf_file = args.gtf_file
    # check command line parameters
    if which('bedtools') is None:
        parser.error('bedtools binary not found in PATH')
    if not os.path.exists(chrom_sizes_file):
        parser.error('chrom sizes file %s not found' % (chrom_sizes_file))
    gene_intervals_file = prefix + '.gene_intervals.bed'
    gene_intervals_shuffled_file = prefix + '.gene_intervals.shuffle.bed'
    shuffled_gtf_file = prefix + '.shuffle.gtf'
    sorted_shuffled_gtf_file = prefix + '.shuffle.srt.gtf'
    logging.info('Parsing GTF file')
    with open(gene_intervals_file, 'w') as f:
        for locus_transcripts in parse_gtf(open(gtf_file)):
            # find borders of locus
            locus_chrom = locus_transcripts[0].chrom
            locus_start = min(t.start for t in locus_transcripts)
            locus_end = max(t.end for t in locus_transcripts)
            logging.debug("[LOCUS] %s:%d-%d %d transcripts" %
                          (locus_chrom, locus_start, locus_end,
                           len(locus_transcripts)))
            for g in get_gene_intervals(locus_transcripts):
                print >>f, '\t'.join(map(str, [g.chrom, g.start, g.end,
                                               g.gene_id]))
    # randomly shuffle genes
    logging.info("Shuffling genes")
    # use a separate name for the bedtools command line so the argparse
    # namespace 'args' is not shadowed
    bedtools_args = ['bedtools', 'shuffle',
                     '-excl', excl_file,
                     '-i', gene_intervals_file,
                     '-g', chrom_sizes_file]
    with open(gene_intervals_shuffled_file, 'w') as fileh:
        subprocess.call(bedtools_args, stdout=fileh)
    # read new gene positions
    logging.info("Reading shuffled gene intervals")
    shuffle_gene_map = {}
    with open(gene_intervals_shuffled_file) as fileh:
        for line in fileh:
            fields = line.strip().split('\t')
            chrom = fields[0]
            start = int(fields[1])
            end = int(fields[2])
            gene_id = fields[3]
            shuffle_gene_map[gene_id] = (chrom, start, end)
    # reposition transcripts
    logging.info("Repositioning transcripts")
    with open(shuffled_gtf_file, 'w') as fileh:
        for locus_transcripts in parse_gtf(open(gtf_file)):
            # get original positions
            orig_gene_map = {}
            for g in get_gene_intervals(locus_transcripts):
                orig_gene_map[g.gene_id] = (g.chrom, g.start, g.end)
            for t in locus_transcripts:
                gene_id = t.attrs['gene_id']
                orig_chrom, orig_start, orig_end = orig_gene_map[gene_id]
                if gene_id not in shuffle_gene_map:
                    logging.warning('Gene %s [%s:%d-%d] could not be '
                                    'shuffled' % (gene_id, orig_chrom,
                                                  orig_start, orig_end))
                    continue
                new_chrom, new_start, new_end = shuffle_gene_map[gene_id]
                # reposition transcript
                t.chrom = new_chrom
                t.start = new_start + (t.start - orig_start)
                t.end = new_start + (t.end - orig_start)
                for e in t.exons:
                    e.start = new_start + (e.start - orig_start)
                    e.end = new_start + (e.end - orig_start)
                fields = write_bed(t.chrom, t.attrs['transcript_id'],
                                   t.strand, 1000, t.exons)
                print '\t'.join(fields)
                #for f in t.to_gtf_features(source='shuffle'):
                #    print >>fileh, str(f)
    logging.info("Sorting GTF file")
    sort_gtf(shuffled_gtf_file, sorted_shuffled_gtf_file)
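
# write_bed is referenced above but not defined in this section. Its call
# site suggests it renders a transcript's exons as the fields of one BED12
# record. A minimal sketch under that assumption -- the real helper's
# argument order and field formatting may differ:
def write_bed_sketch(chrom, name, strand, score, exons):
    tx_start = exons[0].start
    tx_end = exons[-1].end
    block_sizes = ','.join(str(e.end - e.start) for e in exons)
    block_starts = ','.join(str(e.start - tx_start) for e in exons)
    return [chrom, str(tx_start), str(tx_end), name, str(score), strand,
            str(tx_start), str(tx_end),  # thickStart/thickEnd (no CDS info)
            '0',                         # itemRgb
            str(len(exons)), block_sizes, block_starts]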