def main():
    """Command-line entry point: convert transcriptome-space alignments
    to genome-space coordinates.

    Returns the exit code of transcriptome_to_genome().
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--library-type", dest="library_type",
                        default=LibraryTypes.FR_UNSTRANDED)
    parser.add_argument("--input-sam", dest="input_sam",
                        action="store_true", default=False)
    parser.add_argument("--output-sam", dest="output_sam",
                        action="store_true", default=False)
    parser.add_argument("genome_index")
    parser.add_argument("transcript_feature_file")
    parser.add_argument("input_sam_file")
    parser.add_argument("output_sam_file")
    args = parser.parse_args()
    # read transcript features
    logging.debug("Reading transcript features")
    # FIX: close the feature file explicitly instead of leaking the
    # handle (the original relied on garbage collection to close it)
    fileh = open(args.transcript_feature_file)
    try:
        transcripts = list(TranscriptFeature.parse(fileh))
    finally:
        fileh.close()
    return transcriptome_to_genome(args.genome_index, transcripts,
                                   args.input_sam_file, args.output_sam_file,
                                   args.library_type, args.input_sam,
                                   args.output_sam)
def main():
    """Command-line entry point: classify aligned fragments as
    concordant/discordant via find_discordant_fragments().

    Returns the exit code of find_discordant_fragments().
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--max-fragment-length', dest="max_fragment_length",
                        type=int, default=config.DEFAULT_MAX_FRAG_LENGTH)
    parser.add_argument('--library', dest="library_type",
                        default=LibraryTypes.FR_UNSTRANDED)
    # BUG FIX: 'type=int' was missing, so a value supplied on the command
    # line remained a string; in Python 2 'str > int' comparisons are
    # always True, which would silently disable the multihit threshold
    parser.add_argument('--max-multihits', dest="max_multihits", type=int,
                        default=config.DEFAULT_MAX_MULTIHITS)
    parser.add_argument("transcript_file")
    parser.add_argument("input_bam_file")
    parser.add_argument("paired_bam_file")
    parser.add_argument("discordant_bam_file")
    parser.add_argument("unpaired_bam_file")
    parser.add_argument("unmapped_bam_file")
    parser.add_argument("multimap_bam_file")
    parser.add_argument("unresolved_bam_file")
    args = parser.parse_args()
    # read transcript features
    logging.debug("Reading transcript features")
    transcripts = list(TranscriptFeature.parse(open(args.transcript_file)))
    return find_discordant_fragments(transcripts,
                                     args.input_bam_file,
                                     args.paired_bam_file,
                                     args.discordant_bam_file,
                                     args.unpaired_bam_file,
                                     args.unmapped_bam_file,
                                     args.multimap_bam_file,
                                     args.unresolved_bam_file,
                                     max_isize=args.max_fragment_length,
                                     max_multihits=args.max_multihits,
                                     library_type=args.library_type)
def discordant_reads_to_bedpe(index_dir, input_bam_file, output_file):
    """Convert discordant read pairs from a BAM file into BEDPE text.

    Each output line holds the 5' and 3' transcript intervals, read name,
    score, strands, and the serialized DiscordantRead data for both mates.
    """
    # open BAM alignment file
    bamfh = pysam.Samfile(input_bam_file, "rb")
    # lookup table mapping BAM tid -> transcript feature
    logging.debug("Reading transcript features")
    transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
    outfh = open(output_file, "w")
    logging.debug("Converting BAM to BEDPE format")
    for r5p, r3p in parse_gene_discordant_reads(bamfh):
        # capture the pertinent read information in lightweight
        # DiscordantRead objects (a custom format, not SAM)
        dr5p = DiscordantRead.from_read(r5p)
        dr3p = DiscordantRead.from_read(r3p)
        # transcript annotation for each mate
        tx5p = tid_tx_map[r5p.rname]
        tx3p = tid_tx_map[r3p.rname]
        # assemble the BEDPE record in one literal
        fields = [tx5p.tx_id, r5p.pos, r5p.aend,
                  tx3p.tx_id, r3p.pos, r3p.aend,
                  r5p.qname,                # read name
                  0,                        # score
                  tx5p.strand, tx3p.strand, # strand 1, strand 2
                  '|'.join(map(str, dr5p.to_list())),
                  '|'.join(map(str, dr3p.to_list()))]
        outfh.write('\t'.join(map(str, fields)) + '\n')
    outfh.close()
    bamfh.close()
def main():
    """CLI wrapper around transcriptome_to_genome()."""
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    # optional flags
    parser.add_argument("--library-type", dest="library_type",
                        default=LibraryTypes.FR_UNSTRANDED)
    parser.add_argument("--input-sam", dest="input_sam",
                        action="store_true", default=False)
    parser.add_argument("--output-sam", dest="output_sam",
                        action="store_true", default=False)
    # positional arguments
    for arg_name in ("genome_index", "transcript_feature_file",
                     "input_sam_file", "output_sam_file"):
        parser.add_argument(arg_name)
    args = parser.parse_args()
    # load the transcript features needed for coordinate conversion
    logging.debug("Reading transcript features")
    feature_iter = TranscriptFeature.parse(open(args.transcript_feature_file))
    transcripts = list(feature_iter)
    return transcriptome_to_genome(args.genome_index, transcripts,
                                   args.input_sam_file,
                                   args.output_sam_file,
                                   args.library_type,
                                   args.input_sam,
                                   args.output_sam)
def filter_multihits(transcript_file, input_bam_file, output_bam_file,
                     max_multihits=1):
    """Annotate each paired-end fragment with its genomic multimapping
    count and replace any mate exceeding max_multihits with a single
    unmapped placeholder read.

    Returns config.JOB_SUCCESS.
    """
    logging.debug("Reading transcript features")
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    # parse and convert sam -> bam
    inbamfh = pysam.Samfile(input_bam_file, "rb")
    outbamfh = pysam.Samfile(output_bam_file, "wb", template=inbamfh)
    # build a transcript to genome coordinate map
    tid_tx_genome_map = build_tid_transcript_genome_map(outbamfh, transcripts)
    num_frags = 0
    logging.debug("Annotating and filtering multihits")
    for pe_reads in parse_pe_reads(inbamfh):
        # count genomic hits for each mate
        mate_num_hits = []
        for reads in pe_reads:
            num_hits = annotate_multihits(reads, tid_tx_genome_map)
            mate_num_hits.append(num_hits)
        new_pe_reads = [[], []]
        # the two mates are handled symmetrically
        for mate in (0, 1):
            other = 1 - mate
            if mate_num_hits[mate] > max_multihits:
                # replace this mate with one unmapped placeholder read
                r = copy_read(pe_reads[mate][0])
                r.is_unmapped = True
                r.is_proper_pair = False
                r.is_secondary = False
                r.rname = -1
                r.pos = 0
                if mate_num_hits[other] > max_multihits:
                    # other mate is filtered too; mark it unmapped
                    r.mate_is_unmapped = True
                    r.mrnm = -1
                    r.mpos = 0
                new_pe_reads[mate] = [r]
            else:
                new_pe_reads[mate] = pe_reads[mate]
        # BUG FIX: write the filtered reads; the original iterated
        # 'pe_reads' here, so 'new_pe_reads' was built and then discarded
        for reads in new_pe_reads:
            for r in reads:
                outbamfh.write(r)
        num_frags += 1
    logging.debug("Found %d fragments" % (num_frags))
    inbamfh.close()
    outbamfh.close()
    return config.JOB_SUCCESS
def filter_multihits(transcript_file, input_bam_file, output_bam_file,
                     max_multihits=1):
    """Annotate multimapping counts on paired-end reads and mask mates
    that exceed max_multihits by writing an unmapped placeholder read.

    Returns config.JOB_SUCCESS.
    """
    logging.debug("Reading transcript features")
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    # parse and convert sam -> bam
    inbamfh = pysam.Samfile(input_bam_file, "rb")
    outbamfh = pysam.Samfile(output_bam_file, "wb", template=inbamfh)
    # build a transcript to genome coordinate map
    tid_tx_genome_map = build_tid_transcript_genome_map(outbamfh, transcripts)
    num_frags = 0
    logging.debug("Annotating and filtering multihits")
    for pe_reads in parse_pe_reads(inbamfh):
        mate_num_hits = []
        for reads in pe_reads:
            num_hits = annotate_multihits(reads, tid_tx_genome_map)
            mate_num_hits.append(num_hits)
        new_pe_reads = [[], []]
        if mate_num_hits[0] > max_multihits:
            # mate 0 has too many hits: keep one unmapped placeholder
            r = copy_read(pe_reads[0][0])
            r.is_unmapped = True
            r.is_proper_pair = False
            r.is_secondary = False
            r.rname = -1
            r.pos = 0
            if mate_num_hits[1] > max_multihits:
                r.mate_is_unmapped = True
                r.mrnm = -1
                r.mpos = 0
            new_pe_reads[0] = [r]
        else:
            new_pe_reads[0] = pe_reads[0]
        if mate_num_hits[1] > max_multihits:
            # mate 1 has too many hits: keep one unmapped placeholder
            r = copy_read(pe_reads[1][0])
            r.is_unmapped = True
            r.is_proper_pair = False
            r.is_secondary = False
            r.rname = -1
            r.pos = 0
            if mate_num_hits[0] > max_multihits:
                r.mate_is_unmapped = True
                r.mrnm = -1
                r.mpos = 0
            new_pe_reads[1] = [r]
        else:
            new_pe_reads[1] = pe_reads[1]
        # BUG FIX: originally iterated 'pe_reads' here, so the filtered
        # 'new_pe_reads' lists were never written to the output BAM
        for reads in new_pe_reads:
            for r in reads:
                outbamfh.write(r)
        num_frags += 1
    logging.debug("Found %d fragments" % (num_frags))
    inbamfh.close()
    outbamfh.close()
    return config.JOB_SUCCESS
def find_discordant_fragments(input_bam_file, paired_bam_file,
                              unmapped_bam_file, index_dir, max_isize,
                              library_type):
    """
    parses BAM file and categorizes reads into several groups:
    - concordant
    - discordant within gene (splicing isoforms)
    - discordant between different genes (chimeras)
    """
    logging.info("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s" % (input_bam_file))
    logging.debug("\tMax insert size: '%d'" % (max_isize))
    logging.debug("\tLibrary type: '%s'" % (library_type))
    logging.debug("\tGene paired file: %s" % (paired_bam_file))
    logging.debug("\tUnmapped file: %s" % (unmapped_bam_file))
    # input and output BAM handles
    bamfh = pysam.Samfile(input_bam_file, "rb")
    genefh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
    unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
    # load transcript features
    logging.debug("Reading transcript features")
    transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    logging.debug("Building transcript lookup tables")
    # bam tid index -> transcript object
    tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
    # transcript -> genome coordinate map
    tid_tx_genome_map = build_tid_transcript_genome_map(bamfh, transcripts)
    logging.info("Parsing reads")
    for pe_reads in parse_pe_reads(bamfh):
        # tag reads with hit index / multimap info; the call also reports
        # unmapped reads. NOTE: once a mate is flagged unmapped the 'or'
        # short-circuits and the remaining mate is not annotated.
        any_unmapped = False
        for reads in pe_reads:
            any_unmapped = (any_unmapped or
                            annotate_multihits(bamfh, reads,
                                               tid_tx_genome_map))
        if any_unmapped:
            # route unmapped fragments to the unmapped BAM and move on
            write_pe_reads(unmappedfh, pe_reads)
            continue
        # enumerate read pairing combinations, discarding invalid ones
        gene_pairs, unpaired_reads = classify_read_pairs(pe_reads,
                                                         max_isize,
                                                         library_type,
                                                         tid_tx_map)
        if len(gene_pairs) > 0:
            write_pairs(genefh, gene_pairs)
        # TODO: do something with unpaired discordant reads?
    genefh.close()
    unmappedfh.close()
    bamfh.close()
    logging.info("Finished pairing reads")
    return config.JOB_SUCCESS
def find_discordant_fragments(input_bam_file, paired_bam_file,
                              unmapped_bam_file, index_dir,
                              max_isize, library_type):
    """
    parses BAM file and categorizes reads into several groups:
    - concordant
    - discordant within gene (splicing isoforms)
    - discordant between different genes (chimeras)
    """
    logging.info("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s" % (input_bam_file))
    logging.debug("\tMax insert size: '%d'" % (max_isize))
    logging.debug("\tLibrary type: '%s'" % (library_type))
    logging.debug("\tGene paired file: %s" % (paired_bam_file))
    logging.debug("\tUnmapped file: %s" % (unmapped_bam_file))
    # open the input alignments and the two output streams
    bamfh = pysam.Samfile(input_bam_file, "rb")
    genefh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
    unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
    # read transcript features from the index directory
    logging.debug("Reading transcript features")
    transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    logging.debug("Building transcript lookup tables")
    # lookup table from bam tid index to transcript object
    tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
    # transcript to genome coordinate map
    tid_tx_genome_map = build_tid_transcript_genome_map(bamfh, transcripts)
    logging.info("Parsing reads")
    for pe_reads in parse_pe_reads(bamfh):
        # annotate hit index and multimap tags; also detects unmapped
        # reads. Annotation stops at the first mate reported unmapped,
        # mirroring the short-circuit of the original 'or' expression.
        any_unmapped = False
        for reads in pe_reads:
            if not any_unmapped:
                any_unmapped = annotate_multihits(bamfh, reads,
                                                  tid_tx_genome_map)
        if any_unmapped:
            # write to output as discordant reads and continue to
            # the next fragment
            write_pe_reads(unmappedfh, pe_reads)
            continue
        # examine all read pairing combinations and rule out the
        # invalid pairings
        gene_pairs, unpaired_reads = classify_read_pairs(pe_reads,
                                                         max_isize,
                                                         library_type,
                                                         tid_tx_map)
        if len(gene_pairs) > 0:
            write_pairs(genefh, gene_pairs)
        # TODO: do something with unpaired discordant reads?
    genefh.close()
    unmappedfh.close()
    bamfh.close()
    logging.info("Finished pairing reads")
    return config.JOB_SUCCESS
def filter_highest_coverage_isoforms(index_dir, input_file, output_file):
    """Keep only chimeras whose isoform has the highest coverage.

    Reads chimeras from input_file, selects the best isoform per chimera
    via get_highest_coverage_isoforms(), and writes the kept records to
    output_file. Returns config.JOB_SUCCESS.
    """
    # read transcripts
    logging.debug("Reading transcripts")
    transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    # find highest coverage chimeras among isoforms
    kept_chimeras = get_highest_coverage_isoforms(input_file, transcripts)
    num_filtered_chimeras = 0
    # FIX: ensure the output file is closed even if parsing raises
    f = open(output_file, "w")
    try:
        for c in Chimera.parse(open(input_file)):
            if c.name in kept_chimeras:
                num_filtered_chimeras += 1
                f.write('\t'.join(map(str, c.to_list())) + '\n')
    finally:
        f.close()
    logging.debug("\tAfter choosing best isoform: %d" %
                  num_filtered_chimeras)
    return config.JOB_SUCCESS
def filter_highest_coverage_isoforms(index_dir, input_file, output_file):
    """Write only the chimeras chosen as highest-coverage isoforms."""
    # load transcript features from the index directory
    logging.debug("Reading transcripts")
    transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    # names of the chimeras to keep (best isoform per group)
    kept_chimeras = get_highest_coverage_isoforms(input_file, transcripts)
    num_filtered_chimeras = 0
    outfh = open(output_file, "w")
    for chimera in Chimera.parse(open(input_file)):
        if chimera.name not in kept_chimeras:
            continue
        num_filtered_chimeras += 1
        outfh.write('\t'.join(map(str, chimera.to_list())) + '\n')
    outfh.close()
    logging.debug("\tAfter choosing best isoform: %d" %
                  num_filtered_chimeras)
    return config.JOB_SUCCESS
def main():
    """CLI wrapper around write_output(); returns its exit code."""
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--ann", dest="annotation_source",
                        default="ensembl")
    # positional arguments
    for arg_name in ("transcript_file", "cluster_shelve_file",
                     "cluster_pair_file", "read_name_file", "output_file"):
        parser.add_argument(arg_name)
    args = parser.parse_args()
    # load transcript features
    logging.debug("Reading transcript features")
    transcripts = list(TranscriptFeature.parse(open(args.transcript_file)))
    # delegate to the main worker function
    return write_output(transcripts,
                        args.cluster_shelve_file,
                        args.cluster_pair_file,
                        args.read_name_file,
                        args.output_file,
                        args.annotation_source)
def main():
    """Parse command-line options and run write_output()."""
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--ann", dest="annotation_source",
                        default="ensembl")
    parser.add_argument("transcript_file")
    parser.add_argument("cluster_shelve_file")
    parser.add_argument("cluster_pair_file")
    parser.add_argument("read_name_file")
    parser.add_argument("output_file")
    args = parser.parse_args()
    # read transcript features
    logging.debug("Reading transcript features")
    feature_iter = TranscriptFeature.parse(open(args.transcript_file))
    transcripts = list(feature_iter)
    # run main function and propagate its return code
    retcode = write_output(transcripts, args.cluster_shelve_file,
                           args.cluster_pair_file, args.read_name_file,
                           args.output_file, args.annotation_source)
    return retcode
def transcript_features_to_fasta(transcript_feature_file, reference_seq_file):
    """Generator yielding (feature, fasta_record) for each transcript
    whose exon sequences can all be fetched from the reference.

    Transcripts with missing exons or no valid (A/T/G/C) bases are
    skipped with a warning; negative-strand sequences are
    reverse-complemented.
    """
    ref_fa = pysam.Fastafile(reference_seq_file)
    total = 0
    used = 0
    for tx in TranscriptFeature.parse(open(transcript_feature_file)):
        total += 1
        exon_seqs = []
        for exon_start, exon_end in tx.exons:
            exon_seq = ref_fa.fetch(tx.chrom, exon_start, exon_end)
            if (not exon_seq) or (len(exon_seq) < (exon_end - exon_start)):
                logging.warning(
                    "transcript id %d exon %s:%d-%d not found in reference" %
                    (tx.tx_id, tx.chrom, exon_start, exon_end))
                break
            exon_seqs.append(exon_seq)
        else:
            # no exon was missing (for/else: loop finished without break)
            used += 1
            seq = ''.join(exon_seqs)
            # skip sequences consisting only of 'N's
            base_counts = collections.Counter(seq)
            valid_bases = sum(base_counts[b] for b in "ATGCatgc")
            if valid_bases == 0:
                logging.warning(
                    "transcript %d at pos %s:%d-%d lacks valid bases" %
                    (tx.tx_id, tx.chrom, tx.tx_start, tx.tx_end))
                continue
            # reverse complement negative stranded sequences
            if tx.strand == '-':
                seq = DNA_reverse_complement(seq)
            # wrap the sequence onto multiple lines
            seqlines = split_seq(seq, BASES_PER_LINE)
            fa_record = (">%d range=%s:%d-%d strand=%s\n%s" %
                         (tx.tx_id, tx.chrom, tx.tx_start, tx.tx_end,
                          tx.strand, seqlines))
            yield tx, fa_record
    logging.info("Used %d/%d gene features" % (used, total))
    ref_fa.close()
def transcript_features_to_fasta(transcript_feature_file, reference_seq_file):
    """Yield (feature, fasta_record) pairs for transcripts fully
    resolvable against the reference sequence file."""
    ref_fa = pysam.Fastafile(reference_seq_file)

    def fetch_exon_seqs(feature):
        # return the list of exon sequences, or None if any exon is
        # missing or truncated in the reference
        seqs = []
        for start, end in feature.exons:
            s = ref_fa.fetch(feature.chrom, start, end)
            if (not s) or (len(s) < (end - start)):
                logging.warning(
                    "transcript id %d exon %s:%d-%d not found in reference" %
                    (feature.tx_id, feature.chrom, start, end))
                return None
            seqs.append(s)
        return seqs

    total = 0
    used = 0
    for g in TranscriptFeature.parse(open(transcript_feature_file)):
        total += 1
        exon_seqs = fetch_exon_seqs(g)
        if exon_seqs is None:
            continue
        used += 1
        seq = ''.join(exon_seqs)
        # reject sequences containing only 'N's
        counts = collections.Counter(seq)
        num_valid = sum(counts[base]
                        for base in ("A", "T", "G", "C",
                                     "a", "t", "g", "c"))
        if num_valid == 0:
            logging.warning(
                "transcript %d at pos %s:%d-%d lacks valid bases" %
                (g.tx_id, g.chrom, g.tx_start, g.tx_end))
            continue
        # negative strand -> reverse complement
        if g.strand == '-':
            seq = DNA_reverse_complement(seq)
        # break seq onto multiple lines and emit the fasta record
        seqlines = split_seq(seq, BASES_PER_LINE)
        yield g, (">%d range=%s:%d-%d strand=%s\n%s" %
                  (g.tx_id, g.chrom, g.tx_start, g.tx_end,
                   g.strand, seqlines))
    logging.info("Used %d/%d gene features" % (used, total))
    ref_fa.close()
def main():
    """CLI entry point: simulate paired-end reads from transcript
    expression levels and a chimera description file."""
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--frag-size-mean", dest="frag_size_mean",
                        type=float, default=DEFAULT_FRAG_SIZE_MEAN)
    parser.add_argument("--frag-size-sd", dest="frag_size_sd",
                        type=float, default=DEFAULT_FRAG_SIZE_SD)
    parser.add_argument("--rlen", dest="read_length",
                        type=int, default=DEFAULT_READ_LENGTH)
    parser.add_argument("--stranded", dest="stranded",
                        action="store_true", default=False)
    parser.add_argument("-n", dest="num_frags",
                        type=int, default=DEFAULT_NUM_FRAGS)
    parser.add_argument("index_dir")
    parser.add_argument("transcript_exprs_file")
    parser.add_argument("chimera_file")
    parser.add_argument("output_prefix")
    args = parser.parse_args()
    # resolve index files
    transcript_file = os.path.join(args.index_dir,
                                   config.TRANSCRIPT_FEATURE_FILE)
    genome_fasta_file = os.path.join(args.index_dir,
                                     config.GENOME_FASTA_FILE)
    fastafh = pysam.Fastafile(genome_fasta_file)
    logging.info("Reading transcripts")
    # map transcript id (as string) -> transcript feature
    transcript_dict = dict(
        (str(t.tx_id), t)
        for t in TranscriptFeature.parse(open(transcript_file)))
    logging.info("Generating simulated reads")
    generate_simulated_reads(fastafh, transcript_dict,
                             args.transcript_exprs_file,
                             args.chimera_file,
                             args.output_prefix,
                             frag_size_mean=args.frag_size_mean,
                             frag_size_sd=args.frag_size_sd,
                             num_frags=args.num_frags,
                             read_length=args.read_length,
                             stranded=args.stranded)
    fastafh.close()
def main():
    """Command-line front end for find_discordant_fragments().

    Returns the worker function's exit code.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--max-fragment-length', dest="max_fragment_length",
                        type=int, default=config.DEFAULT_MAX_FRAG_LENGTH)
    parser.add_argument('--library', dest="library_type",
                        default=LibraryTypes.FR_UNSTRANDED)
    # BUG FIX: added 'type=int'; without it a command-line value is kept
    # as a string, and Python 2's 'str > int' ordering would make every
    # downstream threshold comparison silently succeed
    parser.add_argument('--max-multihits', dest="max_multihits", type=int,
                        default=config.DEFAULT_MAX_MULTIHITS)
    parser.add_argument("transcript_file")
    parser.add_argument("input_bam_file")
    parser.add_argument("paired_bam_file")
    parser.add_argument("discordant_bam_file")
    parser.add_argument("unpaired_bam_file")
    parser.add_argument("unmapped_bam_file")
    parser.add_argument("multimap_bam_file")
    parser.add_argument("unresolved_bam_file")
    args = parser.parse_args()
    # read transcript features
    logging.debug("Reading transcript features")
    transcripts = list(TranscriptFeature.parse(open(args.transcript_file)))
    return find_discordant_fragments(transcripts,
                                     args.input_bam_file,
                                     args.paired_bam_file,
                                     args.discordant_bam_file,
                                     args.unpaired_bam_file,
                                     args.unmapped_bam_file,
                                     args.multimap_bam_file,
                                     args.unresolved_bam_file,
                                     max_isize=args.max_fragment_length,
                                     max_multihits=args.max_multihits,
                                     library_type=args.library_type)
def run_chimerascan(runconfig): """ main function for running the chimerascan pipeline """ # print a welcome message title_string = "Running chimerascan version %s" % (__version__) logging.info(title_string) logging.info("-" * len(title_string)) # validate run configuration config_passed = runconfig.check_config() if not config_passed: logging.error("Invalid run configuration, aborting.") return config.JOB_ERROR # create output dir if it does not exist if not os.path.exists(runconfig.output_dir): os.makedirs(runconfig.output_dir) logging.info("Created output directory: %s" % (runconfig.output_dir)) # create log dir if it does not exist log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR) if not os.path.exists(log_dir): os.makedirs(log_dir) logging.debug("Created directory for log files: %s" % (log_dir)) # create tmp dir if it does not exist tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR) if not os.path.exists(tmp_dir): os.makedirs(tmp_dir) logging.debug("Created directory for tmp files: %s" % (tmp_dir)) # write the run config to a file xmlstring = runconfig.to_xml() runconfig_xml_file = os.path.join(runconfig.output_dir, config.RUNCONFIG_XML_FILE) logging.info("Writing run configuration to XML file: %s" % (runconfig_xml_file)) fh = open(runconfig_xml_file, "w") print >> fh, xmlstring fh.close() # mask biotypes and references mask_biotypes = set() if runconfig.mask_biotypes_file: logging.info("Reading biotypes mask file") mask_biotypes.update( [line.strip() for line in open(runconfig.mask_biotypes_file)]) logging.info("\tread biotypes: %s" % (','.join(sorted(mask_biotypes)))) mask_rnames = set() if runconfig.mask_rnames_file: logging.info("Reading references mask file") mask_rnames.update( [line.strip() for line in open(runconfig.mask_rnames_file)]) logging.info("\tread references: %s" % (','.join(sorted(mask_rnames)))) # read transcripts logging.info("Reading transcript features") transcript_file = os.path.join(runconfig.index_dir, 
config.TRANSCRIPT_FEATURE_FILE) transcripts = list(TranscriptFeature.parse(open(transcript_file))) logging.info("\tread %d transcripts" % (len(transcripts))) # setup alignment indexes genome_index = os.path.join(runconfig.index_dir, config.GENOME_INDEX) transcriptome_index = os.path.join(runconfig.index_dir, config.TRANSCRIPTOME_INDEX) max_transcriptome_hits_file = os.path.join(runconfig.index_dir, config.MAX_MULTIMAPPING_FILE) max_transcriptome_hits = int( open(max_transcriptome_hits_file).next().strip()) # detect read length original_read_length = detect_read_length(runconfig.fastq_files[0]) # minimum fragment length cannot be smaller than the trimmed read length trimmed_read_length = (original_read_length - runconfig.trim5 - runconfig.trim3) min_fragment_length = max(runconfig.min_fragment_length, trimmed_read_length) # # Process and inspect the FASTQ files, performing several alterations # to the reads: # # 1) rename them from long string to numbers to save space throughout # the pipeline. 
also store mapping from read numbers to full names # in a separate file # 2) ensure the "/1" and "/2" suffixes exist to denote paired reads # 3) convert quality scores to sanger format # converted_fastq_files = [ os.path.join(tmp_dir, fq) for fq in config.CONVERTED_FASTQ_FILES ] read_name_file = os.path.join(tmp_dir, config.READ_NAME_TXT_FILE) msg = "Processing FASTQ files" skip = all( up_to_date(cfq, fq) for cfq, fq in zip(converted_fastq_files, runconfig.fastq_files)) skip = skip and up_to_date(read_name_file, runconfig.fastq_files[0]) if skip: logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) converted_fastq_prefix = \ os.path.join(tmp_dir, config.CONVERTED_FASTQ_PREFIX) try: retcode = process_input_reads(runconfig.fastq_files, converted_fastq_prefix, quals=runconfig.quals, trim5=runconfig.trim5, trim3=runconfig.trim3) if retcode != config.JOB_SUCCESS: logging.error("%s step failed" % (msg)) return config.JOB_ERROR except Exception as e: logging.info("Cleaning up after error %s" % (str(e))) for fq in converted_fastq_files: if os.path.isfile(fq): os.remove(fq) # # Transcriptome alignment step # # Align to transcriptome in paired-end mode, trying to resolve as many # reads as possible. 
# transcriptome_bam_file = os.path.join(tmp_dir, config.TRANSCRIPTOME_BAM_FILE) transcriptome_unaligned_path = os.path.join( tmp_dir, config.TRANSCRIPTOME_UNALIGNED_PATH) transcriptome_unaligned_fastq_files = tuple( os.path.join(tmp_dir, fq) for fq in config.TRANSCRIPTOME_UNALIGNED_FASTQ_FILES) msg = "Aligning paired-end reads to transcriptome" if (all( up_to_date(transcriptome_bam_file, fq) for fq in converted_fastq_files) and all( up_to_date(a, b) for a, b in zip(transcriptome_unaligned_fastq_files, converted_fastq_files))): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) log_file = os.path.join(log_dir, config.TRANSCRIPTOME_LOG_FILE) retcode = bowtie2_align_transcriptome_pe( transcriptome_index=transcriptome_index, genome_index=genome_index, transcript_file=transcript_file, fastq_files=converted_fastq_files, unaligned_path=transcriptome_unaligned_path, bam_file=transcriptome_bam_file, log_file=log_file, library_type=runconfig.library_type, min_fragment_length=min_fragment_length, max_fragment_length=runconfig.max_fragment_length, max_transcriptome_hits=max_transcriptome_hits, num_processors=runconfig.num_processors) # cleanup if job failed if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(transcriptome_bam_file): os.remove(transcriptome_bam_file) for f in transcriptome_unaligned_fastq_files: if os.path.exists(f): os.remove(f) return config.JOB_ERROR # # Sort transcriptome reads by position # msg = "Sorting transcriptome reads" sorted_transcriptome_bam_file = os.path.join( runconfig.output_dir, config.SORTED_TRANSCRIPTOME_BAM_FILE) if (up_to_date(sorted_transcriptome_bam_file, transcriptome_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) sorted_aligned_bam_prefix = os.path.splitext( sorted_transcriptome_bam_file)[0] pysam.sort("-m", str(int(1e9)), transcriptome_bam_file, sorted_aligned_bam_prefix) # # Index BAM file # msg = "Indexing BAM file" sorted_transcriptome_bam_index_file = 
sorted_transcriptome_bam_file + ".bai" if (up_to_date(sorted_transcriptome_bam_index_file, sorted_transcriptome_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) pysam.index(sorted_transcriptome_bam_file) # # Get insert size distribution # isize_dist_file = os.path.join(runconfig.output_dir, config.ISIZE_DIST_FILE) msg = "Profiling insert size distribution" if up_to_date(isize_dist_file, transcriptome_bam_file): logging.info("[SKIPPED] %s" % msg) isize_dist = InsertSizeDistribution.from_file( open(isize_dist_file, "r")) else: logging.info(msg) bamfh = pysam.Samfile(sorted_transcriptome_bam_file, "rb") isize_dist = InsertSizeDistribution.from_genome_bam( bamfh, transcripts, min_isize=min_fragment_length, max_isize=runconfig.max_fragment_length, max_samples=config.ISIZE_MAX_SAMPLES) bamfh.close() # if not enough samples, use a normal distribution instead # of the empirical distribution if isize_dist.n < config.ISIZE_MIN_SAMPLES: logging.warning("Not enough fragments to sample insert size " "distribution empirically. 
Using mean=%d " "stdev=%f instead" % (runconfig.isize_mean, runconfig.isize_stdev)) isize_dist = InsertSizeDistribution.from_random( runconfig.isize_mean, runconfig.isize_stdev, min_isize=runconfig.min_fragment_length, max_isize=runconfig.max_fragment_length, samples=config.ISIZE_MAX_SAMPLES) isize_dist.to_file(open(isize_dist_file, "w")) # # Determine ideal segment length automatically # # log insert size statistics logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" % (isize_dist.n, isize_dist.mean(), isize_dist.std(), isize_dist.isize_at_percentile(50.0), isize_dist.mode())) # choose a segment length to optimize mapping optimal_isize = isize_dist.isize_at_percentile( DEFAULT_FRAG_SIZE_SENSITIVITY) logging.info("Determining soft-clipped segment length") logging.debug("\tInsert size at %f percent of distribution is %d" % (DEFAULT_FRAG_SIZE_SENSITIVITY, optimal_isize)) optimal_segment_length = int(round(optimal_isize / 3.0)) logging.debug("\tOptimal segment length is %d/3.0 = %d" % (optimal_isize, optimal_segment_length)) segment_length = min(optimal_segment_length, trimmed_read_length) segment_length = max(config.MIN_SEGMENT_LENGTH, segment_length) logging.debug( "\tAfter adjusting for min %d and read length %d, final segment length is %d" % (config.MIN_SEGMENT_LENGTH, trimmed_read_length, segment_length)) if runconfig.segment_length is not None: logging.debug( "\tOverriding auto segment length and using segment length of %d" % (runconfig.segment_length)) segment_length = runconfig.segment_length # # Genome alignment step # # Align any unaligned transcriptome reads to genome in paired-end mode. # Resolve as many reads as possible. 
# genome_bam_file = os.path.join(tmp_dir, config.GENOME_BAM_FILE) genome_unaligned_path = os.path.join(tmp_dir, config.GENOME_UNALIGNED_PATH) genome_unaligned_fastq_files = tuple( os.path.join(tmp_dir, fq) for fq in config.GENOME_UNALIGNED_FASTQ_FILES) msg = "Realigning unaligned paired-end reads to genome" if (all(up_to_date(genome_bam_file, fq) for fq in converted_fastq_files) and all( up_to_date(a, b) for a, b in zip(genome_unaligned_fastq_files, converted_fastq_files))): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) log_file = os.path.join(log_dir, config.GENOME_LOG_FILE) retcode = bowtie2_align_pe( index=genome_index, fastq_files=transcriptome_unaligned_fastq_files, unaligned_path=genome_unaligned_path, bam_file=genome_bam_file, log_file=log_file, library_type=runconfig.library_type, min_fragment_length=min_fragment_length, max_fragment_length=runconfig.max_fragment_length, max_hits=max_transcriptome_hits, num_processors=runconfig.num_processors) # cleanup if job failed if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(genome_bam_file): os.remove(genome_bam_file) for f in genome_unaligned_fastq_files: if os.path.exists(f): os.remove(f) return config.JOB_ERROR # # Realignment step # # trim and realign all the initially unaligned reads in order to # increase sensitivity to detect reads spanning fusion junctions # realigned_bam_file = os.path.join(tmp_dir, config.REALIGNED_BAM_FILE) realigned_log_file = os.path.join(log_dir, config.REALIGNED_LOG_FILE) msg = "Trimming and realigning initially unmapped reads" if (all( up_to_date(realigned_bam_file, fq) for fq in genome_unaligned_fastq_files) and up_to_date(realigned_bam_file, isize_dist_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) retcode = bowtie2_align_pe_sr(index=transcriptome_index, transcript_file=transcript_file, fastq_files=genome_unaligned_fastq_files, bam_file=realigned_bam_file, log_file=realigned_log_file, tmp_dir=tmp_dir, 
segment_length=segment_length, max_hits=max_transcriptome_hits, num_processors=runconfig.num_processors) if retcode != config.JOB_SUCCESS: if os.path.exists(realigned_bam_file): os.remove(realigned_bam_file) return config.JOB_ERROR # # Find discordant reads # # iterate through realigned reads and divide them into groups of # concordant, discordant within a gene (isoforms), discordant # between different genes, and discordant in the genome # paired_bam_file = os.path.join(tmp_dir, config.PAIRED_BAM_FILE) discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE) unpaired_bam_file = os.path.join(tmp_dir, config.UNPAIRED_BAM_FILE) unmapped_bam_file = os.path.join(tmp_dir, config.UNMAPPED_BAM_FILE) multimap_bam_file = os.path.join(tmp_dir, config.MULTIMAP_BAM_FILE) unresolved_bam_file = os.path.join(tmp_dir, config.UNRESOLVED_BAM_FILE) output_files = (paired_bam_file, discordant_bam_file, unpaired_bam_file, unmapped_bam_file, multimap_bam_file, unresolved_bam_file) msg = "Classifying concordant and discordant read pairs" if (all(up_to_date(f, realigned_bam_file) for f in output_files)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) retcode = find_discordant_fragments( transcripts=transcripts, input_bam_file=realigned_bam_file, paired_bam_file=paired_bam_file, discordant_bam_file=discordant_bam_file, unpaired_bam_file=unpaired_bam_file, unmapped_bam_file=unmapped_bam_file, multimap_bam_file=multimap_bam_file, unresolved_bam_file=unresolved_bam_file, max_isize=runconfig.max_fragment_length, max_multihits=runconfig.max_multihits, library_type=runconfig.library_type) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) for f in output_files: if os.path.exists(f): os.remove(f) return config.JOB_ERROR # # Convert discordant transcriptome reads to genome coordinates # discordant_genome_bam_file = os.path.join( tmp_dir, config.DISCORDANT_GENOME_BAM_FILE) msg = "Converting discordant transcriptome hits to genomic coordinates" if 
(up_to_date(discordant_genome_bam_file, discordant_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) discordant_genome_sam_file = os.path.join( tmp_dir, config.DISCORDANT_GENOME_SAM_FILE) retcode = transcriptome_to_genome( genome_index, transcripts, input_file=discordant_bam_file, output_file=discordant_genome_sam_file, library_type=runconfig.library_type, input_sam=False, output_sam=True) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(discordant_genome_sam_file): os.remove(discordant_genome_sam_file) return config.JOB_ERROR retcode = sam_to_bam(discordant_genome_sam_file, discordant_genome_bam_file) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(discordant_genome_bam_file): os.remove(discordant_genome_bam_file) return config.JOB_ERROR if os.path.exists(discordant_genome_sam_file): os.remove(discordant_genome_sam_file) # # Sort discordant reads by position # msg = "Sorting discordant BAM file" sorted_discordant_genome_bam_file = os.path.join( tmp_dir, config.SORTED_DISCORDANT_GENOME_BAM_FILE) if (up_to_date(sorted_discordant_genome_bam_file, discordant_genome_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) bam_prefix = os.path.splitext(sorted_discordant_genome_bam_file)[0] pysam.sort("-m", str(int(1e9)), discordant_genome_bam_file, bam_prefix) # # Index BAM file # msg = "Indexing discordant BAM file" sorted_discordant_bam_index_file = sorted_discordant_genome_bam_file + ".bai" if (up_to_date(sorted_discordant_bam_index_file, sorted_discordant_genome_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) pysam.index(sorted_discordant_genome_bam_file) # # Convert unpaired transcriptome reads to genome coordinates # unpaired_genome_bam_file = os.path.join(tmp_dir, config.UNPAIRED_GENOME_BAM_FILE) msg = "Converting unpaired transcriptome hits to genomic coordinates" if (up_to_date(unpaired_genome_bam_file, 
unpaired_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) unpaired_genome_sam_file = os.path.join( tmp_dir, config.UNPAIRED_GENOME_SAM_FILE) retcode = transcriptome_to_genome(genome_index, transcripts, input_file=unpaired_bam_file, output_file=unpaired_genome_sam_file, library_type=runconfig.library_type, input_sam=False, output_sam=True) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(unpaired_genome_sam_file): os.remove(unpaired_genome_sam_file) return config.JOB_ERROR retcode = sam_to_bam(unpaired_genome_sam_file, unpaired_genome_bam_file) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(unpaired_genome_bam_file): os.remove(unpaired_genome_bam_file) return config.JOB_ERROR if os.path.exists(unpaired_genome_sam_file): os.remove(unpaired_genome_sam_file) # # Sort unpaired reads by position # msg = "Sorting unpaired BAM file" sorted_unpaired_genome_bam_file = os.path.join( tmp_dir, config.SORTED_UNPAIRED_GENOME_BAM_FILE) if (up_to_date(sorted_unpaired_genome_bam_file, unpaired_genome_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) bam_prefix = os.path.splitext(sorted_unpaired_genome_bam_file)[0] pysam.sort("-m", str(int(1e9)), unpaired_genome_bam_file, bam_prefix) # # Index BAM file # msg = "Indexing unpaired BAM file" sorted_unpaired_bam_index_file = sorted_unpaired_genome_bam_file + ".bai" if (up_to_date(sorted_unpaired_bam_index_file, sorted_unpaired_genome_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) pysam.index(sorted_unpaired_genome_bam_file) # # Cluster discordant reads into chimera candidates # cluster_file = os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_FILE) cluster_shelve_file = \ os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_SHELVE_FILE) sorted_discordant_genome_cluster_bam_file = \ os.path.join(runconfig.output_dir, config.SORTED_DISCORDANT_GENOME_CLUSTER_BAM_FILE) input_files = 
(sorted_discordant_genome_bam_file, sorted_unpaired_genome_bam_file) output_files = (cluster_file, cluster_shelve_file, sorted_discordant_genome_cluster_bam_file) msg = "Clustering discordant reads" skip = True for input_file in input_files: for output_file in output_files: skip = skip and up_to_date(output_file, input_file) if skip: logging.info("[SKIPPED] %s" % (msg)) else: logging.debug(msg) retcode = cluster_discordant_reads( discordant_bam_file=sorted_discordant_genome_bam_file, unpaired_bam_file=sorted_unpaired_genome_bam_file, concordant_bam_file=sorted_transcriptome_bam_file, output_bam_file=sorted_discordant_genome_cluster_bam_file, cluster_file=cluster_file, cluster_shelve_file=cluster_shelve_file) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) for f in output_files: if os.path.exists(f): os.remove(f) # # Pair discordant clusters # cluster_pair_file = \ os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_PAIR_FILE) msg = "Pairing discordant clusters" output_files = (cluster_pair_file, ) if up_to_date(cluster_pair_file, sorted_discordant_genome_cluster_bam_file): logging.info("[SKIPPED] %s" % (msg)) else: logging.debug(msg) retcode = pair_discordant_clusters( discordant_bam_file=sorted_discordant_genome_cluster_bam_file, cluster_pair_file=cluster_pair_file, tmp_dir=tmp_dir) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) for f in output_files: if os.path.exists(f): os.remove(f) # # Perform realignment across putative fusion breakpoints # breakpoint_bam_file = os.path.join(tmp_dir, config.BREAKPOINT_BAM_FILE) msg = "Realigning to find breakpoint-spanning reads" input_files = (sorted_discordant_genome_bam_file, sorted_unpaired_genome_bam_file, cluster_shelve_file, cluster_pair_file) output_files = (breakpoint_bam_file, ) skip = True for inp in input_files: for outp in output_files: if not up_to_date(outp, inp): skip = False if skip: logging.info("[SKIPPED] %s" % (msg)) else: logging.debug(msg) retcode = 
realign_across_breakpoints( index_dir=runconfig.index_dir, discordant_bam_file=sorted_discordant_genome_bam_file, unpaired_bam_file=sorted_unpaired_genome_bam_file, cluster_shelve_file=cluster_shelve_file, cluster_pair_file=cluster_pair_file, breakpoint_bam_file=breakpoint_bam_file, log_dir=log_dir, tmp_dir=tmp_dir, num_processors=runconfig.num_processors, local_anchor_length=runconfig.local_anchor_length, local_multihits=runconfig.local_multihits) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) for f in output_files: if os.path.exists(f): os.remove(f) # # Nominate breakpoint spanning reads (split reads) # spanning_sam_file = os.path.join(tmp_dir, config.SPANNING_SAM_FILE) spanning_bam_file = os.path.join(tmp_dir, config.SPANNING_BAM_FILE) spanning_cluster_pair_file = os.path.join( tmp_dir, config.SPANNING_CLUSTER_PAIR_FILE) msg = "Processing breakpoint-spanning alignments" input_files = (breakpoint_bam_file, cluster_shelve_file, cluster_pair_file) output_files = (spanning_bam_file, spanning_cluster_pair_file) skip = True for inp in input_files: for outp in output_files: if not up_to_date(outp, inp): skip = False if skip: logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) retcode = process_spanning_alignments( cluster_shelve_file=cluster_shelve_file, cluster_pair_file=cluster_pair_file, bam_file=breakpoint_bam_file, output_sam_file=spanning_sam_file, output_cluster_pair_file=spanning_cluster_pair_file, local_anchor_length=runconfig.local_anchor_length) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) for f in output_files: if os.path.exists(f): os.remove(f) retcode = sam_to_bam(spanning_sam_file, spanning_bam_file) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(spanning_bam_file): os.remove(spanning_bam_file) return config.JOB_ERROR if os.path.exists(spanning_sam_file): os.remove(spanning_sam_file) # # Sort unpaired reads by position # msg = "Sorting spanning BAM file" 
sorted_spanning_bam_file = os.path.join(runconfig.output_dir, config.SORTED_SPANNING_BAM_FILE) if (up_to_date(sorted_spanning_bam_file, spanning_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) bam_prefix = os.path.splitext(sorted_spanning_bam_file)[0] pysam.sort("-m", str(int(1e9)), spanning_bam_file, bam_prefix) # # Index BAM file # msg = "Indexing spanning BAM file" sorted_spanning_bam_index_file = sorted_spanning_bam_file + ".bai" if (up_to_date(sorted_spanning_bam_index_file, sorted_spanning_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) pysam.index(sorted_spanning_bam_file) # # Write chimera file # unfiltered_chimera_bedpe_file = os.path.join( runconfig.output_dir, config.UNFILTERED_CHIMERA_BEDPE_FILE) msg = "Writing unfiltered chimeras to file %s" % ( unfiltered_chimera_bedpe_file) if (up_to_date(unfiltered_chimera_bedpe_file, spanning_cluster_pair_file) and up_to_date(unfiltered_chimera_bedpe_file, cluster_shelve_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) retcode = write_output(transcripts, cluster_shelve_file=cluster_shelve_file, cluster_pair_file=spanning_cluster_pair_file, read_name_file=read_name_file, output_file=unfiltered_chimera_bedpe_file, annotation_source="ensembl") if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(unfiltered_chimera_bedpe_file): os.remove(unfiltered_chimera_bedpe_file) # # Filter chimeras # chimera_bedpe_file = os.path.join(runconfig.output_dir, config.CHIMERA_BEDPE_FILE) msg = "Filtering chimeras" if (up_to_date(chimera_bedpe_file, unfiltered_chimera_bedpe_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) retcode = filter_chimeras( input_file=unfiltered_chimera_bedpe_file, output_file=chimera_bedpe_file, filter_num_frags=runconfig.filter_num_frags, filter_allele_fraction=runconfig.filter_allele_fraction, mask_biotypes=mask_biotypes, mask_rnames=mask_rnames) if retcode != config.JOB_SUCCESS: 
logging.error("[FAILED] %s" % (msg)) if os.path.exists(chimera_bedpe_file): os.remove(chimera_bedpe_file) # # Cleanup # if not runconfig.keep_tmp: logging.info("Cleaning up temporary files") shutil.rmtree(tmp_dir) # # Done # logging.info("Finished run.") return config.JOB_SUCCESS
def build_transcriptome_annotation(gtf_files, output_file):
    """Build a merged transcriptome annotation file from one or more GTF files.

    Each entry of `gtf_files` is a string of the form "filename" or
    "filename,source"; the optional source tag is attached to every
    transcript parsed from that file.  Transcripts are clustered by
    position (via `cluster_transcripts`), transcripts within a cluster
    that share the same intron chain are merged into a single
    TranscriptFeature, and the merged features are written to
    `output_file`, one per line, sorted by chromosome then tx_start.

    Fixes vs. previous revision:
      - the "Found %d transcript clusters" log message over-counted by
        one (`cur_cluster_id` is post-incremented after the last cluster)
      - GTF input file handles are now closed explicitly instead of
        being leaked
    """
    # read gtf files and store transcripts
    transcripts = []
    for itm in gtf_files:
        fields = itm.split(",")
        filename, source = fields[0], None
        if len(fields) > 1:
            source = fields[1]
        logging.info("Reading gene features from %s (source=%s)" %
                     (filename, source))
        fh = open(filename)
        try:
            for t in TranscriptFeature.from_gtf(fh, source=source):
                transcripts.append(t)
        finally:
            fh.close()
    logging.debug("\tRead %d annotations from %d files" %
                  (len(transcripts), len(gtf_files)))
    # cluster transcripts by chromosome/strand/position
    logging.info("Determining transcript clusters")
    cur_transcript_id = 1
    cur_cluster_id = 1
    # chrom -> (intron chain, cluster id) -> list of transcripts; transcripts
    # with identical intron chains within a cluster will be merged below
    chrom_transcript_clusters = \
        collections.defaultdict(lambda: collections.defaultdict(lambda: []))
    for cluster in cluster_transcripts(transcripts):
        for t in cluster:
            t.tx_id = cur_transcript_id
            t.cluster_id = cur_cluster_id
            chrom_transcript_clusters[t.chrom][(t.introns, t.cluster_id)].append(t)
            cur_transcript_id += 1
        cur_cluster_id += 1
    # cur_cluster_id is one past the last assigned id, so subtract one
    logging.info("Found %d transcript clusters" % (cur_cluster_id - 1))
    # merge genes in transcript clusters
    logging.info("Merging transcripts")
    cur_transcript_id = 1
    outfh = open(output_file, "w")
    try:
        for chrom in sorted(chrom_transcript_clusters):
            transcript_clusters = chrom_transcript_clusters[chrom]
            new_transcripts = []
            for cluster in transcript_clusters.itervalues():
                # merge all transcripts sharing an intron chain into one
                # feature spanning the union of their start/end coordinates
                t = TranscriptFeature()
                t.chrom = chrom
                t.tx_start = min(x.tx_start for x in cluster)
                t.tx_end = max(x.tx_end for x in cluster)
                t.cluster_id = cluster[0].cluster_id
                t.strand = cluster[0].strand
                t.exon_count = cluster[0].exon_count
                t.exons = list(cluster[0].exons)
                # widen first/last exon to the merged transcript boundaries
                t.exons[0] = (t.tx_start, t.exons[0][1])
                t.exons[-1] = (t.exons[-1][0], t.tx_end)
                t.gene_biotype = "na"
                for x in cluster:
                    # keep any non-"na" biotype seen in the cluster
                    if x.gene_biotype != "na":
                        t.gene_biotype = x.gene_biotype
                    t.tx_names.extend(x.tx_names)
                    t.gene_names.extend(x.gene_names)
                    t.annotation_sources.extend(x.annotation_sources)
                new_transcripts.append(t)
            new_transcripts.sort(key=operator.attrgetter("tx_start"))
            for t in new_transcripts:
                # renumber merged transcripts consecutively in output order
                t.tx_id = cur_transcript_id
                cur_transcript_id += 1
                print >>outfh, str(t)
    finally:
        outfh.close()
    logging.info("Wrote gene annotation file")
def nominate_chimeras(index_dir, isize_dist_file, input_file, output_file,
                      trim_bp, max_read_length, homology_mismatches):
    """Nominate chimera (gene fusion) candidates from discordant read pairs.

    Reads discordant fragments grouped by 5'/3' transcript pair from
    `input_file` (BEDPE-like format), chooses the most probable exon
    junction breakpoint(s) for each fragment given the insert size
    distribution, bins fragments by breakpoint, and writes one candidate
    chimera line per (transcript pair, breakpoint) to `output_file` in a
    BEDPE-derived format.

    :param index_dir: chimerascan index directory containing the transcript
        feature file and transcriptome FASTA
    :param isize_dist_file: insert size distribution file produced earlier
        in the pipeline
    :param input_file: discordant fragments grouped by transcript pair
    :param output_file: destination for candidate chimera records
    :param trim_bp: number of bases to trim when choosing breakpoints
        (passed through to choose_best_breakpoints)
    :param max_read_length: length of breakpoint flanking sequence to
        extract on each side
    :param homology_mismatches: mismatch tolerance used when computing
        breakpoint homology (passed through to extract_breakpoint_sequence)
    :returns: config.JOB_SUCCESS
    """
    # read insert size distribution
    isize_dist = InsertSizeDistribution.from_file(open(isize_dist_file))
    # build a lookup table to get genomic intervals from transcripts
    logging.debug("Reading transcript information")
    transcript_feature_file = os.path.join(index_dir,
                                           config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_feature_file)))
    tx_id_map = build_transcript_map(transcripts)
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.TRANSCRIPTOME_FASTA_FILE)
    ref_fa = pysam.Fastafile(ref_fasta_file)
    # keep track of mapping from breakpoint sequence to breakpoint id
    # this requires storing all breakpoint sequences in memory which is
    # potentially expensive. TODO: investigate whether this should be
    # moved to a separate sort-update-sort procedure
    breakpoint_seq_name_map = {}
    breakpoint_num = 1
    # group discordant read pairs by gene
    logging.debug("Parsing discordant reads")
    chimera_num = 1
    outfh = open(output_file, "w")
    for tx_id_5p, tx_id_3p, frags in \
            parse_discordant_bedpe_by_transcript_pair(open(input_file)):
        # get gene information
        tx5p = tx_id_map[tx_id_5p]
        tx3p = tx_id_map[tx_id_3p]
        # bin fragments into putative breakpoints
        breakpoint_dict = collections.defaultdict(lambda: [])
        for dr5p, dr3p in frags:
            # given the insert size find the highest probability
            # exon junction breakpoint between the two transcripts
            isize_prob, breakpoints = \
                choose_best_breakpoints(dr5p, dr3p, tx5p, tx3p,
                                        trim_bp, isize_dist)
            # a fragment may support several equally-likely breakpoints;
            # it is counted once per breakpoint it supports
            for breakpoint in breakpoints:
                breakpoint_dict[breakpoint].append((dr5p, dr3p))
        # iterate through breakpoints and build chimera candidates
        # NOTE: `frags` is deliberately rebound here to the per-breakpoint
        # fragment list, shadowing the outer loop variable (outer iteration
        # over `frags` has already completed at this point)
        for breakpoint, frags in breakpoint_dict.iteritems():
            exon_num_5p, tx_end_5p, exon_num_3p, tx_start_3p = breakpoint
            breakpoint_seq_5p, breakpoint_seq_3p, homology_left, homology_right = \
                extract_breakpoint_sequence(tx_id_5p, tx_end_5p,
                                            tx_id_3p, tx_start_3p,
                                            ref_fa, max_read_length,
                                            homology_mismatches)
            # total exonic length of the 3' transcript (used as end2 below)
            tx3p_length = sum((end - start) for start, end in tx3p.exons)
            # get unique breakpoint id based on sequence
            breakpoint_seq = breakpoint_seq_5p + breakpoint_seq_3p
            if breakpoint_seq in breakpoint_seq_name_map:
                breakpoint_name = breakpoint_seq_name_map[breakpoint_seq]
            else:
                breakpoint_name = "B%07d" % (breakpoint_num)
                breakpoint_seq_name_map[breakpoint_seq] = breakpoint_name
                breakpoint_num += 1
            # write gene, breakpoint, and raw reads to a file and follow the
            # BEDPE format
            # gene names are whitespace-normalized (spaces -> "_"),
            # de-duplicated, and sorted for a stable output representation
            gene_names_5p = ",".join(sorted(set(["_".join(x.split())
                                                 for x in tx5p.gene_names])))
            gene_names_3p = ",".join(sorted(set(["_".join(x.split())
                                                 for x in tx3p.gene_names])))
            fields = [tx5p.tx_id, 0, tx_end_5p,  # chrom1, start1, end1
                      tx3p.tx_id, tx_start_3p, tx3p_length,  # chrom2, start2, end2
                      "C%07d" % (chimera_num),  # name
                      1.0,  # pvalue (placeholder constant)
                      tx5p.strand, tx3p.strand,  # strand1, strand2
                      gene_names_5p, gene_names_3p,  # gene names
                      # exon interval information
                      '%d-%d' % (0, exon_num_5p),
                      '%d-%d' % (exon_num_3p, len(tx3p.exons)),
                      # breakpoint information
                      breakpoint_name,
                      breakpoint_seq_5p, breakpoint_seq_3p,
                      homology_left, homology_right,
                      # fragments
                      frags_to_encomp_string(frags),
                      # spanning reads (none nominated at this stage)
                      None]
            print >>outfh, '\t'.join(map(str, fields))
            chimera_num += 1
    outfh.close()
    ref_fa.close()
    return config.JOB_SUCCESS
def write_output(input_file, bam_file, output_file, index_dir):
    """Collapse chimera isoforms into clusters and write the final BEDPE file.

    Groups chimera records from `input_file` (via get_chimera_groups),
    picks the best-covered isoform of each group as its representative,
    converts transcript coordinates to genomic coordinates, computes
    isoform fractions against the wild-type alignments in `bam_file`,
    and writes one tab-separated line per cluster to `output_file`,
    sorted (descending) by unique alignment positions, spanning frags,
    then total frags.

    :param input_file: chimera records to be clustered
    :param bam_file: position-sorted BAM used to count wild-type
        (non-chimeric) fragments for isoform-fraction calculation
    :param output_file: destination BEDPE-style text file (with header)
    :param index_dir: chimerascan index directory containing the
        transcript feature file
    :returns: config.JOB_SUCCESS
    """
    # read transcripts
    logging.debug("Reading transcripts")
    transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    # build a lookup table to get genome coordinates from transcript
    # coordinates
    transcript_genome_map = build_transcript_genome_map(transcripts)
    tx_id_map = build_transcript_map(transcripts)
    genome_tx_trees = build_genome_transcript_trees(transcripts)
    # open BAM file for checking wild-type isoform
    bamfh = pysam.Samfile(bam_file, "rb")
    # group chimera isoforms together
    lines = []
    chimera_clusters = 0
    for key, chimeras in get_chimera_groups(input_file, tx_id_map):
        # collect the union of transcript intervals / gene names / chimera
        # names across all isoforms in this cluster
        txs5p = set()
        txs3p = set()
        genes5p = set()
        genes3p = set()
        names = set()
        for c in chimeras:
            txs5p.add("%s:%d-%d" % (c.tx_name_5p, c.tx_start_5p,
                                    c.tx_end_5p - 1))
            txs3p.add("%s:%d-%d" % (c.tx_name_3p, c.tx_start_3p,
                                    c.tx_end_3p - 1))
            genes5p.add(c.gene_name_5p)
            genes3p.add(c.gene_name_3p)
            names.add(c.name)
        # use the isoform with the best coverage as the cluster representative
        c = get_best_coverage_chimera(chimeras)
        # get chimera type and distance between genes
        chimera_type, distance = get_chimera_type(tx_id_map[c.tx_name_5p],
                                                  tx_id_map[c.tx_name_3p],
                                                  genome_tx_trees)
        # get genomic positions of chimera
        chrom5p, strand5p, start5p = \
            transcript_to_genome_pos(c.tx_name_5p, c.tx_start_5p,
                                     transcript_genome_map)
        chrom5p, strand5p, end5p = \
            transcript_to_genome_pos(c.tx_name_5p, c.tx_end_5p - 1,
                                     transcript_genome_map)
        # on the reverse strand (strand == 1) transcript order is reversed
        # relative to the genome, so swap start/end
        if strand5p == 1:
            start5p, end5p = end5p, start5p
        chrom3p, strand3p, start3p = \
            transcript_to_genome_pos(c.tx_name_3p, c.tx_start_3p,
                                     transcript_genome_map)
        chrom3p, strand3p, end3p = \
            transcript_to_genome_pos(c.tx_name_3p, c.tx_end_3p - 1,
                                     transcript_genome_map)
        if strand3p == 1:
            start3p, end3p = end3p, start3p
        # get breakpoint spanning sequences
        # (de-duplicated by read sequence; emitted as inline FASTA pairs)
        spanning_seqs = set()
        spanning_fasta_lines = []
        for dr in c.get_spanning_reads():
            if dr.seq in spanning_seqs:
                continue
            spanning_seqs.add(dr.seq)
            spanning_fasta_lines.extend([">%s/%d;pos=%d;strand=%s" %
                                         (dr.qname, dr.readnum + 1, dr.pos,
                                          "-" if dr.is_reverse else "+"),
                                         dr.seq])
        # get isoform fraction
        num_wt_frags_5p, num_wt_frags_3p = get_wildtype_frags(c, bamfh)
        num_chimeric_frags = c.get_num_frags()
        frac5p = float(num_chimeric_frags) / (num_chimeric_frags +
                                              num_wt_frags_5p)
        frac3p = float(num_chimeric_frags) / (num_chimeric_frags +
                                              num_wt_frags_3p)
        # setup fields of BEDPE file
        # (column order must match the header written below; sort below
        # depends on indices 16, 17, 18)
        fields = [chrom5p, start5p, end5p,
                  chrom3p, start3p, end3p,
                  "CLUSTER%d" % (chimera_clusters),
                  c.get_num_frags(),
                  "+" if (strand5p == 0) else "-",
                  "+" if (strand3p == 0) else "-",
                  ','.join(txs5p), ','.join(txs3p),
                  ','.join(genes5p), ','.join(genes3p),
                  chimera_type, distance,
                  c.get_num_frags(),
                  c.get_num_spanning_frags(),
                  c.get_num_unique_positions(),
                  frac5p, frac3p,
                  ','.join(spanning_fasta_lines),
                  ','.join(names)]
        lines.append(fields)
        chimera_clusters += 1
    bamfh.close()
    logging.debug("Clustered chimeras: %d" % (chimera_clusters))
    # sort by (unique positions, spanning frags, total frags), best first
    lines = sorted(lines, key=operator.itemgetter(18, 17, 16), reverse=True)
    f = open(output_file, "w")
    print >>f, '\t'.join(['#chrom5p', 'start5p', 'end5p',
                          'chrom3p', 'start3p', 'end3p',
                          'chimera_cluster_id', 'score',
                          'strand5p', 'strand3p',
                          'transcript_ids_5p', 'transcript_ids_3p',
                          'genes5p', 'genes3p',
                          'type', 'distance',
                          'total_frags', 'spanning_frags',
                          'unique_alignment_positions',
                          'isoform_fraction_5p', 'isoform_fraction_3p',
                          'breakpoint_spanning_reads', 'chimera_ids'])
    for fields in lines:
        print >>f, '\t'.join(map(str, fields))
    f.close()
    return config.JOB_SUCCESS
def run_chimerascan(runconfig): """ main function for running the chimerascan pipeline """ # print a welcome message title_string = "Running chimerascan version %s" % (__version__) logging.info(title_string) logging.info("-" * len(title_string)) # validate run configuration config_passed = runconfig.check_config() if not config_passed: logging.error("Invalid run configuration, aborting.") return config.JOB_ERROR # create output dir if it does not exist if not os.path.exists(runconfig.output_dir): os.makedirs(runconfig.output_dir) logging.info("Created output directory: %s" % (runconfig.output_dir)) # create log dir if it does not exist log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR) if not os.path.exists(log_dir): os.makedirs(log_dir) logging.debug("Created directory for log files: %s" % (log_dir)) # create tmp dir if it does not exist tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR) if not os.path.exists(tmp_dir): os.makedirs(tmp_dir) logging.debug("Created directory for tmp files: %s" % (tmp_dir)) # write the run config to a file xmlstring = runconfig.to_xml() runconfig_xml_file = os.path.join(runconfig.output_dir, config.RUNCONFIG_XML_FILE) logging.info("Writing run configuration to XML file: %s" % (runconfig_xml_file)) fh = open(runconfig_xml_file, "w") print >>fh, xmlstring fh.close() # mask biotypes and references mask_biotypes = set() if runconfig.mask_biotypes_file: logging.info("Reading biotypes mask file") mask_biotypes.update([line.strip() for line in open(runconfig.mask_biotypes_file)]) logging.info("\tread biotypes: %s" % (','.join(sorted(mask_biotypes)))) mask_rnames = set() if runconfig.mask_rnames_file: logging.info("Reading references mask file") mask_rnames.update([line.strip() for line in open(runconfig.mask_rnames_file)]) logging.info("\tread references: %s" % (','.join(sorted(mask_rnames)))) # read transcripts logging.info("Reading transcript features") transcript_file = os.path.join(runconfig.index_dir, 
config.TRANSCRIPT_FEATURE_FILE) transcripts = list(TranscriptFeature.parse(open(transcript_file))) logging.info("\tread %d transcripts" % (len(transcripts))) # setup alignment indexes genome_index = os.path.join(runconfig.index_dir, config.GENOME_INDEX) transcriptome_index = os.path.join(runconfig.index_dir, config.TRANSCRIPTOME_INDEX) max_transcriptome_hits_file = os.path.join(runconfig.index_dir, config.MAX_MULTIMAPPING_FILE) max_transcriptome_hits = int(open(max_transcriptome_hits_file).next().strip()) # detect read length original_read_length = detect_read_length(runconfig.fastq_files[0]) # minimum fragment length cannot be smaller than the trimmed read length trimmed_read_length = (original_read_length - runconfig.trim5 - runconfig.trim3) min_fragment_length = max(runconfig.min_fragment_length, trimmed_read_length) # # Process and inspect the FASTQ files, performing several alterations # to the reads: # # 1) rename them from long string to numbers to save space throughout # the pipeline. 
also store mapping from read numbers to full names # in a separate file # 2) ensure the "/1" and "/2" suffixes exist to denote paired reads # 3) convert quality scores to sanger format # converted_fastq_files = [os.path.join(tmp_dir, fq) for fq in config.CONVERTED_FASTQ_FILES] read_name_file = os.path.join(tmp_dir, config.READ_NAME_TXT_FILE) msg = "Processing FASTQ files" skip = all(up_to_date(cfq, fq) for cfq,fq in zip(converted_fastq_files, runconfig.fastq_files)) skip = skip and up_to_date(read_name_file, runconfig.fastq_files[0]) if skip: logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) converted_fastq_prefix = \ os.path.join(tmp_dir, config.CONVERTED_FASTQ_PREFIX) try: retcode = process_input_reads(runconfig.fastq_files, converted_fastq_prefix, quals=runconfig.quals, trim5=runconfig.trim5, trim3=runconfig.trim3) if retcode != config.JOB_SUCCESS: logging.error("%s step failed" % (msg)) return config.JOB_ERROR except Exception as e: logging.info("Cleaning up after error %s" % (str(e))) for fq in converted_fastq_files: if os.path.isfile(fq): os.remove(fq) # # Transcriptome alignment step # # Align to transcriptome in paired-end mode, trying to resolve as many # reads as possible. 
# transcriptome_bam_file = os.path.join(tmp_dir, config.TRANSCRIPTOME_BAM_FILE) transcriptome_unaligned_path = os.path.join(tmp_dir, config.TRANSCRIPTOME_UNALIGNED_PATH) transcriptome_unaligned_fastq_files = tuple(os.path.join(tmp_dir, fq) for fq in config.TRANSCRIPTOME_UNALIGNED_FASTQ_FILES) msg = "Aligning paired-end reads to transcriptome" if (all(up_to_date(transcriptome_bam_file, fq) for fq in converted_fastq_files) and all(up_to_date(a,b) for a,b in zip(transcriptome_unaligned_fastq_files, converted_fastq_files))): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) log_file = os.path.join(log_dir, config.TRANSCRIPTOME_LOG_FILE) retcode = bowtie2_align_transcriptome_pe(transcriptome_index=transcriptome_index, genome_index=genome_index, transcript_file=transcript_file, fastq_files=converted_fastq_files, unaligned_path=transcriptome_unaligned_path, bam_file=transcriptome_bam_file, log_file=log_file, library_type=runconfig.library_type, min_fragment_length=min_fragment_length, max_fragment_length=runconfig.max_fragment_length, max_transcriptome_hits=max_transcriptome_hits, num_processors=runconfig.num_processors) # cleanup if job failed if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(transcriptome_bam_file): os.remove(transcriptome_bam_file) for f in transcriptome_unaligned_fastq_files: if os.path.exists(f): os.remove(f) return config.JOB_ERROR # # Sort transcriptome reads by position # msg = "Sorting transcriptome reads" sorted_transcriptome_bam_file = os.path.join(runconfig.output_dir, config.SORTED_TRANSCRIPTOME_BAM_FILE) if (up_to_date(sorted_transcriptome_bam_file, transcriptome_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) sorted_aligned_bam_prefix = os.path.splitext(sorted_transcriptome_bam_file)[0] pysam.sort("-m", str(int(1e9)), transcriptome_bam_file, sorted_aligned_bam_prefix) # # Index BAM file # msg = "Indexing BAM file" sorted_transcriptome_bam_index_file = 
sorted_transcriptome_bam_file + ".bai" if (up_to_date(sorted_transcriptome_bam_index_file, sorted_transcriptome_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) pysam.index(sorted_transcriptome_bam_file) # # Get insert size distribution # isize_dist_file = os.path.join(runconfig.output_dir, config.ISIZE_DIST_FILE) msg = "Profiling insert size distribution" if up_to_date(isize_dist_file, transcriptome_bam_file): logging.info("[SKIPPED] %s" % msg) isize_dist = InsertSizeDistribution.from_file(open(isize_dist_file, "r")) else: logging.info(msg) bamfh = pysam.Samfile(sorted_transcriptome_bam_file, "rb") isize_dist = InsertSizeDistribution.from_genome_bam(bamfh, transcripts, min_isize=min_fragment_length, max_isize=runconfig.max_fragment_length, max_samples=config.ISIZE_MAX_SAMPLES) bamfh.close() # if not enough samples, use a normal distribution instead # of the empirical distribution if isize_dist.n < config.ISIZE_MIN_SAMPLES: logging.warning("Not enough fragments to sample insert size " "distribution empirically. 
Using mean=%d " "stdev=%f instead" % (runconfig.isize_mean, runconfig.isize_stdev)) isize_dist = InsertSizeDistribution.from_random(runconfig.isize_mean, runconfig.isize_stdev, min_isize=runconfig.min_fragment_length, max_isize=runconfig.max_fragment_length, samples=config.ISIZE_MAX_SAMPLES) isize_dist.to_file(open(isize_dist_file, "w")) # # Determine ideal segment length automatically # # log insert size statistics logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" % (isize_dist.n, isize_dist.mean(), isize_dist.std(), isize_dist.isize_at_percentile(50.0), isize_dist.mode())) # choose a segment length to optimize mapping optimal_isize = isize_dist.isize_at_percentile(DEFAULT_FRAG_SIZE_SENSITIVITY) logging.info("Determining soft-clipped segment length") logging.debug("\tInsert size at %f percent of distribution is %d" % (DEFAULT_FRAG_SIZE_SENSITIVITY, optimal_isize)) optimal_segment_length = int(round(optimal_isize / 3.0)) logging.debug("\tOptimal segment length is %d/3.0 = %d" % (optimal_isize, optimal_segment_length)) segment_length = min(optimal_segment_length, trimmed_read_length) segment_length = max(config.MIN_SEGMENT_LENGTH, segment_length) logging.debug("\tAfter adjusting for min %d and read length %d, final segment length is %d" % (config.MIN_SEGMENT_LENGTH, trimmed_read_length, segment_length)) if runconfig.segment_length is not None: logging.debug("\tOverriding auto segment length and using segment length of %d" % (runconfig.segment_length)) segment_length = runconfig.segment_length # # Genome alignment step # # Align any unaligned transcriptome reads to genome in paired-end mode. # Resolve as many reads as possible. 
# genome_bam_file = os.path.join(tmp_dir, config.GENOME_BAM_FILE) genome_unaligned_path = os.path.join(tmp_dir, config.GENOME_UNALIGNED_PATH) genome_unaligned_fastq_files = tuple(os.path.join(tmp_dir, fq) for fq in config.GENOME_UNALIGNED_FASTQ_FILES) msg = "Realigning unaligned paired-end reads to genome" if (all(up_to_date(genome_bam_file, fq) for fq in converted_fastq_files) and all(up_to_date(a,b) for a,b in zip(genome_unaligned_fastq_files, converted_fastq_files))): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) log_file = os.path.join(log_dir, config.GENOME_LOG_FILE) retcode = bowtie2_align_pe(index=genome_index, fastq_files=transcriptome_unaligned_fastq_files, unaligned_path=genome_unaligned_path, bam_file=genome_bam_file, log_file=log_file, library_type=runconfig.library_type, min_fragment_length=min_fragment_length, max_fragment_length=runconfig.max_fragment_length, max_hits=max_transcriptome_hits, num_processors=runconfig.num_processors) # cleanup if job failed if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(genome_bam_file): os.remove(genome_bam_file) for f in genome_unaligned_fastq_files: if os.path.exists(f): os.remove(f) return config.JOB_ERROR # # Realignment step # # trim and realign all the initially unaligned reads in order to # increase sensitivity to detect reads spanning fusion junctions # realigned_bam_file = os.path.join(tmp_dir, config.REALIGNED_BAM_FILE) realigned_log_file = os.path.join(log_dir, config.REALIGNED_LOG_FILE) msg = "Trimming and realigning initially unmapped reads" if (all(up_to_date(realigned_bam_file, fq) for fq in genome_unaligned_fastq_files) and up_to_date(realigned_bam_file, isize_dist_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) retcode = bowtie2_align_pe_sr(index=transcriptome_index, transcript_file=transcript_file, fastq_files=genome_unaligned_fastq_files, bam_file=realigned_bam_file, log_file=realigned_log_file, tmp_dir=tmp_dir, 
segment_length=segment_length, max_hits=max_transcriptome_hits, num_processors=runconfig.num_processors) if retcode != config.JOB_SUCCESS: if os.path.exists(realigned_bam_file): os.remove(realigned_bam_file) return config.JOB_ERROR # # Find discordant reads # # iterate through realigned reads and divide them into groups of # concordant, discordant within a gene (isoforms), discordant # between different genes, and discordant in the genome # paired_bam_file = os.path.join(tmp_dir, config.PAIRED_BAM_FILE) discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE) unpaired_bam_file = os.path.join(tmp_dir, config.UNPAIRED_BAM_FILE) unmapped_bam_file = os.path.join(tmp_dir, config.UNMAPPED_BAM_FILE) multimap_bam_file = os.path.join(tmp_dir, config.MULTIMAP_BAM_FILE) unresolved_bam_file = os.path.join(tmp_dir, config.UNRESOLVED_BAM_FILE) output_files = (paired_bam_file, discordant_bam_file, unpaired_bam_file, unmapped_bam_file, multimap_bam_file, unresolved_bam_file) msg = "Classifying concordant and discordant read pairs" if (all(up_to_date(f, realigned_bam_file) for f in output_files)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) retcode = find_discordant_fragments(transcripts=transcripts, input_bam_file=realigned_bam_file, paired_bam_file=paired_bam_file, discordant_bam_file=discordant_bam_file, unpaired_bam_file=unpaired_bam_file, unmapped_bam_file=unmapped_bam_file, multimap_bam_file=multimap_bam_file, unresolved_bam_file=unresolved_bam_file, max_isize=runconfig.max_fragment_length, max_multihits=runconfig.max_multihits, library_type=runconfig.library_type) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) for f in output_files: if os.path.exists(f): os.remove(f) return config.JOB_ERROR # # Convert discordant transcriptome reads to genome coordinates # discordant_genome_bam_file = os.path.join(tmp_dir, config.DISCORDANT_GENOME_BAM_FILE) msg = "Converting discordant transcriptome hits to genomic coordinates" if 
(up_to_date(discordant_genome_bam_file, discordant_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) discordant_genome_sam_file = os.path.join(tmp_dir, config.DISCORDANT_GENOME_SAM_FILE) retcode = transcriptome_to_genome(genome_index, transcripts, input_file=discordant_bam_file, output_file=discordant_genome_sam_file, library_type=runconfig.library_type, input_sam=False, output_sam=True) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(discordant_genome_sam_file): os.remove(discordant_genome_sam_file) return config.JOB_ERROR retcode = sam_to_bam(discordant_genome_sam_file, discordant_genome_bam_file) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(discordant_genome_bam_file): os.remove(discordant_genome_bam_file) return config.JOB_ERROR if os.path.exists(discordant_genome_sam_file): os.remove(discordant_genome_sam_file) # # Sort discordant reads by position # msg = "Sorting discordant BAM file" sorted_discordant_genome_bam_file = os.path.join(tmp_dir, config.SORTED_DISCORDANT_GENOME_BAM_FILE) if (up_to_date(sorted_discordant_genome_bam_file, discordant_genome_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) bam_prefix = os.path.splitext(sorted_discordant_genome_bam_file)[0] pysam.sort("-m", str(int(1e9)), discordant_genome_bam_file, bam_prefix) # # Index BAM file # msg = "Indexing discordant BAM file" sorted_discordant_bam_index_file = sorted_discordant_genome_bam_file + ".bai" if (up_to_date(sorted_discordant_bam_index_file, sorted_discordant_genome_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) pysam.index(sorted_discordant_genome_bam_file) # # Convert unpaired transcriptome reads to genome coordinates # unpaired_genome_bam_file = os.path.join(tmp_dir, config.UNPAIRED_GENOME_BAM_FILE) msg = "Converting unpaired transcriptome hits to genomic coordinates" if (up_to_date(unpaired_genome_bam_file, 
unpaired_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) unpaired_genome_sam_file = os.path.join(tmp_dir, config.UNPAIRED_GENOME_SAM_FILE) retcode = transcriptome_to_genome(genome_index, transcripts, input_file=unpaired_bam_file, output_file=unpaired_genome_sam_file, library_type=runconfig.library_type, input_sam=False, output_sam=True) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(unpaired_genome_sam_file): os.remove(unpaired_genome_sam_file) return config.JOB_ERROR retcode = sam_to_bam(unpaired_genome_sam_file, unpaired_genome_bam_file) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(unpaired_genome_bam_file): os.remove(unpaired_genome_bam_file) return config.JOB_ERROR if os.path.exists(unpaired_genome_sam_file): os.remove(unpaired_genome_sam_file) # # Sort unpaired reads by position # msg = "Sorting unpaired BAM file" sorted_unpaired_genome_bam_file = os.path.join(tmp_dir, config.SORTED_UNPAIRED_GENOME_BAM_FILE) if (up_to_date(sorted_unpaired_genome_bam_file, unpaired_genome_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) bam_prefix = os.path.splitext(sorted_unpaired_genome_bam_file)[0] pysam.sort("-m", str(int(1e9)), unpaired_genome_bam_file, bam_prefix) # # Index BAM file # msg = "Indexing unpaired BAM file" sorted_unpaired_bam_index_file = sorted_unpaired_genome_bam_file + ".bai" if (up_to_date(sorted_unpaired_bam_index_file, sorted_unpaired_genome_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) pysam.index(sorted_unpaired_genome_bam_file) # # Cluster discordant reads into chimera candidates # cluster_file = os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_FILE) cluster_shelve_file = \ os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_SHELVE_FILE) sorted_discordant_genome_cluster_bam_file = \ os.path.join(runconfig.output_dir, config.SORTED_DISCORDANT_GENOME_CLUSTER_BAM_FILE) input_files = 
(sorted_discordant_genome_bam_file, sorted_unpaired_genome_bam_file) output_files = (cluster_file, cluster_shelve_file, sorted_discordant_genome_cluster_bam_file) msg = "Clustering discordant reads" skip = True for input_file in input_files: for output_file in output_files: skip = skip and up_to_date(output_file, input_file) if skip: logging.info("[SKIPPED] %s" % (msg)) else: logging.debug(msg) retcode = cluster_discordant_reads(discordant_bam_file=sorted_discordant_genome_bam_file, unpaired_bam_file=sorted_unpaired_genome_bam_file, concordant_bam_file=sorted_transcriptome_bam_file, output_bam_file=sorted_discordant_genome_cluster_bam_file, cluster_file=cluster_file, cluster_shelve_file=cluster_shelve_file) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) for f in output_files: if os.path.exists(f): os.remove(f) # # Pair discordant clusters # cluster_pair_file = \ os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_PAIR_FILE) msg = "Pairing discordant clusters" output_files = (cluster_pair_file,) if up_to_date(cluster_pair_file, sorted_discordant_genome_cluster_bam_file): logging.info("[SKIPPED] %s" % (msg)) else: logging.debug(msg) retcode = pair_discordant_clusters(discordant_bam_file=sorted_discordant_genome_cluster_bam_file, cluster_pair_file=cluster_pair_file, tmp_dir=tmp_dir) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) for f in output_files: if os.path.exists(f): os.remove(f) # # Perform realignment across putative fusion breakpoints # breakpoint_bam_file = os.path.join(tmp_dir, config.BREAKPOINT_BAM_FILE) msg = "Realigning to find breakpoint-spanning reads" input_files = (sorted_discordant_genome_bam_file, sorted_unpaired_genome_bam_file, cluster_shelve_file, cluster_pair_file) output_files = (breakpoint_bam_file,) skip = True for inp in input_files: for outp in output_files: if not up_to_date(outp, inp): skip = False if skip: logging.info("[SKIPPED] %s" % (msg)) else: logging.debug(msg) retcode = 
realign_across_breakpoints(index_dir=runconfig.index_dir, discordant_bam_file=sorted_discordant_genome_bam_file, unpaired_bam_file=sorted_unpaired_genome_bam_file, cluster_shelve_file=cluster_shelve_file, cluster_pair_file=cluster_pair_file, breakpoint_bam_file=breakpoint_bam_file, log_dir=log_dir, tmp_dir=tmp_dir, num_processors=runconfig.num_processors, local_anchor_length=runconfig.local_anchor_length, local_multihits=runconfig.local_multihits) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) for f in output_files: if os.path.exists(f): os.remove(f) # # Nominate breakpoint spanning reads (split reads) # spanning_sam_file = os.path.join(tmp_dir, config.SPANNING_SAM_FILE) spanning_bam_file = os.path.join(tmp_dir, config.SPANNING_BAM_FILE) spanning_cluster_pair_file = os.path.join(tmp_dir, config.SPANNING_CLUSTER_PAIR_FILE) msg = "Processing breakpoint-spanning alignments" input_files = (breakpoint_bam_file, cluster_shelve_file, cluster_pair_file) output_files = (spanning_bam_file, spanning_cluster_pair_file) skip = True for inp in input_files: for outp in output_files: if not up_to_date(outp, inp): skip = False if skip: logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) retcode = process_spanning_alignments(cluster_shelve_file=cluster_shelve_file, cluster_pair_file=cluster_pair_file, bam_file=breakpoint_bam_file, output_sam_file=spanning_sam_file, output_cluster_pair_file=spanning_cluster_pair_file, local_anchor_length=runconfig.local_anchor_length) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) for f in output_files: if os.path.exists(f): os.remove(f) retcode = sam_to_bam(spanning_sam_file, spanning_bam_file) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(spanning_bam_file): os.remove(spanning_bam_file) return config.JOB_ERROR if os.path.exists(spanning_sam_file): os.remove(spanning_sam_file) # # Sort unpaired reads by position # msg = "Sorting spanning BAM file" 
sorted_spanning_bam_file = os.path.join(runconfig.output_dir, config.SORTED_SPANNING_BAM_FILE) if (up_to_date(sorted_spanning_bam_file, spanning_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) bam_prefix = os.path.splitext(sorted_spanning_bam_file)[0] pysam.sort("-m", str(int(1e9)), spanning_bam_file, bam_prefix) # # Index BAM file # msg = "Indexing spanning BAM file" sorted_spanning_bam_index_file = sorted_spanning_bam_file + ".bai" if (up_to_date(sorted_spanning_bam_index_file, sorted_spanning_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) pysam.index(sorted_spanning_bam_file) # # Write chimera file # unfiltered_chimera_bedpe_file = os.path.join(runconfig.output_dir, config.UNFILTERED_CHIMERA_BEDPE_FILE) msg = "Writing unfiltered chimeras to file %s" % (unfiltered_chimera_bedpe_file) if (up_to_date(unfiltered_chimera_bedpe_file, spanning_cluster_pair_file) and up_to_date(unfiltered_chimera_bedpe_file, cluster_shelve_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) retcode = write_output(transcripts, cluster_shelve_file=cluster_shelve_file, cluster_pair_file=spanning_cluster_pair_file, read_name_file=read_name_file, output_file=unfiltered_chimera_bedpe_file, annotation_source="ensembl") if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(unfiltered_chimera_bedpe_file): os.remove(unfiltered_chimera_bedpe_file) # # Filter chimeras # chimera_bedpe_file = os.path.join(runconfig.output_dir, config.CHIMERA_BEDPE_FILE) msg = "Filtering chimeras" if (up_to_date(chimera_bedpe_file, unfiltered_chimera_bedpe_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) retcode = filter_chimeras(input_file=unfiltered_chimera_bedpe_file, output_file=chimera_bedpe_file, filter_num_frags=runconfig.filter_num_frags, filter_allele_fraction=runconfig.filter_allele_fraction, mask_biotypes=mask_biotypes, mask_rnames=mask_rnames) if retcode != config.JOB_SUCCESS: 
logging.error("[FAILED] %s" % (msg)) if os.path.exists(chimera_bedpe_file): os.remove(chimera_bedpe_file) # # Cleanup # if not runconfig.keep_tmp: logging.info("Cleaning up temporary files") shutil.rmtree(tmp_dir) # # Done # logging.info("Finished run.") return config.JOB_SUCCESS
def nominate_chimeras(index_dir, isize_dist_file, input_file, output_file,
                      trim_bp, max_read_length, homology_mismatches):
    """Nominate chimera (gene fusion) candidates from discordant read pairs.

    Reads discordant fragments grouped by 5'/3' transcript pair from
    'input_file' (BEDPE-like), chooses the most probable exon-junction
    breakpoints for each fragment using the insert size distribution in
    'isize_dist_file', extracts the breakpoint-flanking sequence from the
    transcriptome FASTA under 'index_dir', and writes one BEDPE-format
    chimera candidate line per (transcript pair, breakpoint) to
    'output_file'.

    Returns config.JOB_SUCCESS.
    """
    # read insert size distribution
    isize_dist = InsertSizeDistribution.from_file(open(isize_dist_file))
    # build a lookup table to get genomic intervals from transcripts
    logging.debug("Reading transcript information")
    transcript_feature_file = os.path.join(index_dir,
                                           config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_feature_file)))
    tx_id_map = build_transcript_map(transcripts)
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.TRANSCRIPTOME_FASTA_FILE)
    ref_fa = pysam.Fastafile(ref_fasta_file)
    # keep track of mapping from breakpoint sequence to breakpoint id
    # this requires storing all breakpoint sequences in memory which is
    # potentially expensive. TODO: investigate whether this should be
    # moved to a separate sort-update-sort procedure
    breakpoint_seq_name_map = {}
    breakpoint_num = 1
    # group discordant read pairs by gene
    logging.debug("Parsing discordant reads")
    chimera_num = 1
    outfh = open(output_file, "w")
    for tx_id_5p, tx_id_3p, frags in parse_discordant_bedpe_by_transcript_pair(
        open(input_file)):
        # get gene information
        tx5p = tx_id_map[tx_id_5p]
        tx3p = tx_id_map[tx_id_3p]
        # bin fragments into putative breakpoints
        breakpoint_dict = collections.defaultdict(lambda: [])
        for dr5p, dr3p in frags:
            # given the insert size find the highest probability
            # exon junction breakpoint between the two transcripts
            isize_prob, breakpoints = \
                choose_best_breakpoints(dr5p, dr3p, tx5p, tx3p,
                                        trim_bp, isize_dist)
            # a fragment may support several equally likely breakpoints;
            # credit it to each one
            for breakpoint in breakpoints:
                breakpoint_dict[breakpoint].append((dr5p, dr3p))
        # iterate through breakpoints and build chimera candidates
        # NOTE: 'frags' is rebound here to the per-breakpoint fragment
        # list, shadowing the outer loop variable (already consumed above)
        for breakpoint, frags in breakpoint_dict.iteritems():
            exon_num_5p, tx_end_5p, exon_num_3p, tx_start_3p = breakpoint
            breakpoint_seq_5p, breakpoint_seq_3p, homology_left, homology_right = \
                extract_breakpoint_sequence(tx_id_5p, tx_end_5p,
                                            tx_id_3p, tx_start_3p, ref_fa,
                                            max_read_length,
                                            homology_mismatches)
            # total transcript length of the 3' partner (sum of exon sizes)
            tx3p_length = sum((end - start)
                              for start, end in tx3p.exons)
            # get unique breakpoint id based on sequence
            breakpoint_seq = breakpoint_seq_5p + breakpoint_seq_3p
            if breakpoint_seq in breakpoint_seq_name_map:
                breakpoint_name = breakpoint_seq_name_map[breakpoint_seq]
            else:
                breakpoint_name = "B%07d" % (breakpoint_num)
                breakpoint_seq_name_map[breakpoint_seq] = breakpoint_name
                breakpoint_num += 1
            # write gene, breakpoint, and raw reads to a file and follow the
            # BEDPE format
            # gene names are whitespace-collapsed, de-duplicated, and sorted
            gene_names_5p = ",".join(
                sorted(set(["_".join(x.split()) for x in tx5p.gene_names])))
            gene_names_3p = ",".join(
                sorted(set(["_".join(x.split()) for x in tx3p.gene_names])))
            fields = [tx5p.tx_id, 0, tx_end_5p,  # chrom1, start1, end1
                      tx3p.tx_id, tx_start_3p, tx3p_length,  # chrom2, start2, end2
                      "C%07d" % (chimera_num),  # name
                      1.0,  # pvalue
                      tx5p.strand, tx3p.strand,  # strand1, strand2
                      gene_names_5p, gene_names_3p,  # gene names
                      # exon interval information
                      '%d-%d' % (0, exon_num_5p),
                      '%d-%d' % (exon_num_3p, len(tx3p.exons)),
                      # breakpoint information
                      breakpoint_name,
                      breakpoint_seq_5p, breakpoint_seq_3p,
                      homology_left, homology_right,
                      # fragments
                      frags_to_encomp_string(frags),
                      # spanning reads (filled in by a later pipeline stage)
                      None]
            print >> outfh, '\t'.join(map(str, fields))
            chimera_num += 1
    outfh.close()
    ref_fa.close()
    return config.JOB_SUCCESS
def build_transcriptome_annotation(gtf_files, output_file):
    """Merge transcript annotations from GTF files into one annotation file.

    Each entry of 'gtf_files' is a string "filename" or "filename,source";
    the optional source tag is recorded on every transcript parsed from that
    file.  Transcripts are clustered by position, transcripts within a
    cluster that share an identical intron chain are merged into a single
    TranscriptFeature, and the merged features are written to 'output_file'
    ordered by chromosome and transcript start position with sequential
    transcript ids.
    """
    # read gtf files and store transcripts
    transcripts = []
    for itm in gtf_files:
        # split off the optional ",source" suffix
        fields = itm.split(",")
        filename, source = fields[0], None
        if len(fields) > 1:
            source = fields[1]
        logging.info("Reading gene features from %s (source=%s)" %
                     (filename, source))
        for t in TranscriptFeature.from_gtf(open(filename), source=source):
            transcripts.append(t)
    logging.debug("\tRead %d annotations from %d files" %
                  (len(transcripts), len(gtf_files)))
    # cluster transcripts by chromosome/strand/position
    logging.info("Determining transcript clusters")
    cur_transcript_id = 1
    cur_cluster_id = 1
    # chrom -> (intron chain, cluster id) -> transcripts; keying on the
    # intron chain groups transcripts with identical splicing for merging
    chrom_transcript_clusters = collections.defaultdict(
        lambda: collections.defaultdict(lambda: []))
    for cluster in cluster_transcripts(transcripts):
        for t in cluster:
            t.tx_id = cur_transcript_id
            t.cluster_id = cur_cluster_id
            chrom_transcript_clusters[t.chrom][(t.introns, t.cluster_id)].append(t)
            cur_transcript_id += 1
        cur_cluster_id += 1
    # BUGFIX: cur_cluster_id has been advanced past the last cluster, so the
    # actual number of clusters is one less than its final value
    logging.info("Found %d transcript clusters" % (cur_cluster_id - 1))
    # merge genes in transcript clusters
    logging.info("Merging transcripts")
    outfh = open(output_file, "w")
    cur_transcript_id = 1
    for chrom in sorted(chrom_transcript_clusters):
        transcript_clusters = chrom_transcript_clusters[chrom]
        new_transcripts = []
        for cluster in transcript_clusters.itervalues():
            # build one merged feature spanning every transcript in the group
            t = TranscriptFeature()
            t.chrom = chrom
            t.tx_start = min(x.tx_start for x in cluster)
            t.tx_end = max(x.tx_end for x in cluster)
            t.cluster_id = cluster[0].cluster_id
            t.strand = cluster[0].strand
            t.exon_count = cluster[0].exon_count
            # all members share an intron chain, so only the terminal exons
            # need widening to the merged start/end coordinates
            t.exons = list(cluster[0].exons)
            t.exons[0] = (t.tx_start, t.exons[0][1])
            t.exons[-1] = (t.exons[-1][0], t.tx_end)
            # prefer any non-"na" biotype found among the members
            t.gene_biotype = "na"
            for x in cluster:
                if x.gene_biotype != "na":
                    t.gene_biotype = x.gene_biotype
                t.tx_names.extend(x.tx_names)
                t.gene_names.extend(x.gene_names)
                t.annotation_sources.extend(x.annotation_sources)
            new_transcripts.append(t)
        # assign ids in positional order within the chromosome
        new_transcripts.sort(key=operator.attrgetter("tx_start"))
        for t in new_transcripts:
            t.tx_id = cur_transcript_id
            cur_transcript_id += 1
            print >> outfh, str(t)
    outfh.close()
    logging.info("Wrote gene annotation file")