def main():
    """Command-line entry point: merge read pairs from one BAM into another.

    Expects two positional arguments, ``<in.bam>`` and ``<out.bam>``.  The
    fragment length bounds and the library type are taken from the options
    and passed, together with the opened input/output handles, to
    ``merge_read_pairs``.  Exits through ``parser.error`` when fewer than
    two positional arguments are supplied.
    """
    from optparse import OptionParser
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog [options] <in.bam> <out.bam>")
    parser.add_option('--min-fragment-length', dest="min_fragment_length",
                      type="int", default=50)
    parser.add_option('--max-fragment-length', dest="max_fragment_length",
                      type="int", default=1000)
    parser.add_option('--library', dest="library_type", default="fr")
    options, args = parser.parse_args()
    # fail with a usage message instead of an IndexError traceback
    if len(args) < 2:
        parser.error("expected two arguments: <in.bam> <out.bam>")
    input_bam_file = args[0]
    output_bam_file = args[1]
    logging.info("Merging read pairs")
    logging.debug("Input file: %s" % (input_bam_file))
    logging.debug("Output file: %s" % (output_bam_file))
    logging.debug("Library type: '%s'" % (options.library_type))
    library_type = parse_library_type(options.library_type)
    bamfh = pysam.Samfile(input_bam_file, "rb")
    try:
        outfh = pysam.Samfile(output_bam_file, "wb", template=bamfh)
        try:
            merge_read_pairs(bamfh, outfh,
                             options.min_fragment_length,
                             options.max_fragment_length,
                             library_type)
        finally:
            # close explicitly so the output is finalized even on error
            outfh.close()
    finally:
        bamfh.close()
    logging.info("Paired-end merging completed")
def sam_to_bam(input_fastq_file, input_sam_file, output_bam_file,
               multihits, mode, keep_unmapped=True):
    """Reorder the alignments of a SAM file and write them to a BAM file.

    ``mode`` selects the reordering iterator:

    - ``"pe"``   -> ``fix_pe_alignment_ordering(..., is_paired=True)``
    - ``"pesr"`` -> ``fix_pe_sr_alignment_ordering(...)``
    - ``"sr"``   -> ``fix_pe_alignment_ordering(..., is_paired=False)``

    Each group of reads produced by the iterator is passed to
    ``write_reads_to_bam`` and the (unmapped, multihit) counts it returns
    are accumulated and logged.

    Raises ``ValueError`` for an unrecognized ``mode`` (previously this
    surfaced later as an unbound-variable error after the output file had
    already been created).
    """
    if mode not in ("pe", "pesr", "sr"):
        raise ValueError("unknown mode %r (expected 'pe', 'pesr' or 'sr')"
                         % (mode,))
    num_unmapped = 0
    num_multihits = 0
    num_frags = 0
    samfh = pysam.Samfile(input_sam_file, "r")
    fastqfh = None
    bamfh = None
    try:
        fastqfh = open(input_fastq_file)
        if mode == "pe":
            fix_iter = fix_pe_alignment_ordering(samfh, fastqfh,
                                                 is_paired=True)
        elif mode == "pesr":
            fix_iter = fix_pe_sr_alignment_ordering(samfh, fastqfh)
        else:
            fix_iter = fix_pe_alignment_ordering(samfh, fastqfh,
                                                 is_paired=False)
        bamfh = pysam.Samfile(output_bam_file, "wb", template=samfh)
        for frags in fix_iter:
            num_frags += 1
            for reads in frags:
                un, mh = write_reads_to_bam(reads, bamfh, multihits,
                                            keep_unmapped)
                num_unmapped += un
                num_multihits += mh
    finally:
        # release every handle, including the fastq file the original leaked
        if bamfh is not None:
            bamfh.close()
        if fastqfh is not None:
            fastqfh.close()
        samfh.close()
    logging.debug("Found %d fragments" % (num_frags))
    logging.debug("\t%d unmapped reads" % (num_unmapped))
    logging.debug("\t%d multimapping (>%dX) reads" %
                  (num_multihits, multihits))
def join_segmented_alignments(input_sam_file, input_fastq_file,
                              output_bam_file, is_paired):
    """Join segmented alignments from a SAM file and write them as BAM.

    Alignments are read through ``fix_segmented_alignment_ordering``; for
    each mate, ``find_valid_segment_alignments`` yields a nested structure
    (hits -> splits -> segment hits).  Every segment hit becomes one record
    via ``make_joined_read`` tagged with its partition/split/mapping counts
    and indexes.  Within a split, the number of records whose reference is
    of type ``REF_GENOME`` is attached to each mapped record as the ``NH``
    tag before the records are written.

    All file handles are closed before returning, also on error.
    """
    # progress logging bookkeeping
    debug_count = 0
    debug_every = 1000000
    debug_next = debug_every
    infh = pysam.Samfile(input_sam_file, "r")
    outfh = None
    fastqfh = None
    try:
        outfh = pysam.Samfile(output_bam_file, "wb", template=infh)
        tid_type_map = get_tid_ref_types(outfh)
        # iterate through paired-end alignments
        logging.info("Processing paired alignments")
        fastqfh = open(input_fastq_file)
        align_iter = fix_segmented_alignment_ordering(infh, fastqfh,
                                                      is_paired)
        for segmented_pe_reads in align_iter:
            debug_count += 1
            if debug_count == debug_next:
                debug_next += debug_every
                logging.debug("Processed %d reads" % debug_count)
            for mate, mate_segs in enumerate(segmented_pe_reads):
                # search for segment matches
                joined_hits = find_valid_segment_alignments(mate_segs)
                num_hits = len(joined_hits)
                for hit_index, split_hits in enumerate(joined_hits):
                    num_splits = len(split_hits)
                    for split_index, seg_hits in enumerate(split_hits):
                        num_seg_hits = len(seg_hits)
                        split_reads = []
                        multimaps = 0
                        for seg_index, seg_reads in enumerate(seg_hits):
                            # make SAM record for each segment
                            tags = [
                                (SamTags.RTAG_NUM_PARTITIONS, num_hits),
                                (SamTags.RTAG_PARTITION_IND, hit_index),
                                (SamTags.RTAG_NUM_SPLITS, num_splits),
                                (SamTags.RTAG_SPLIT_IND, split_index),
                                (SamTags.RTAG_NUM_MAPPINGS, num_seg_hits),
                                (SamTags.RTAG_MAPPING_IND, seg_index),
                            ]
                            r = make_joined_read(mate, seg_reads, tags=tags)
                            split_reads.append(r)
                            # TODO: keep track of multimaps using the number
                            # of genome hits as a proxy (this is not perfect,
                            # since splice junction reads could be
                            # multimapping)
                            if tid_type_map[r.rname] == REF_GENOME:
                                multimaps += 1
                        # output reads now that multimappings have been
                        # computed
                        for r in split_reads:
                            if not r.is_unmapped:
                                r.tags = r.tags + [("NH", multimaps)]
                            outfh.write(r)
    finally:
        # the original never closed these handles
        if fastqfh is not None:
            fastqfh.close()
        if outfh is not None:
            outfh.close()
        infh.close()
def transcriptome_to_genome(input_sam_file, output_sam_file,
                            gene_to_genome_map):
    """Translate the reads of a SAM file and write them to a new SAM file.

    The output header and the lookup table both come from
    ``build_translation_table``; the reads themselves are produced by
    ``translate_multihit_reads`` and written in the order they are yielded.
    """
    source = pysam.Samfile(input_sam_file, "r")
    translated_header, table = build_translation_table(source,
                                                       gene_to_genome_map)
    # "wh": text SAM output including the header
    sink = pysam.Samfile(output_sam_file, "wh", header=translated_header)
    translated = translate_multihit_reads(source, table)
    for record in translated:
        sink.write(record)
    sink.close()
    source.close()
def _mark_multihit_unmapped(read, mate_also_unmapped):
    """Return a copy of ``read`` flagged as unmapped (mate too if requested)."""
    r = copy_read(read)
    r.is_unmapped = True
    r.is_proper_pair = False
    r.is_secondary = False
    r.rname = -1
    r.pos = 0
    if mate_also_unmapped:
        r.mate_is_unmapped = True
        r.mrnm = -1
        r.mpos = 0
    return r


def filter_multihits(transcript_file, input_bam_file, output_bam_file,
                     max_multihits=1):
    """Annotate multihits and replace over-limit mates by one unmapped read.

    For every fragment from ``parse_pe_reads`` each mate's reads are passed
    to ``annotate_multihits``.  A mate whose hit count exceeds
    ``max_multihits`` is replaced by a single copy of its first read marked
    as unmapped (and with its mate marked unmapped too when the other mate
    also exceeds the limit); other mates are written unchanged.

    The original built this filtered list but then wrote the unfiltered
    reads, so the filter had no effect; the filtered reads are now the ones
    written.

    Returns ``config.JOB_SUCCESS``.
    """
    logging.debug("Reading transcript features")
    txfh = open(transcript_file)
    try:
        transcripts = list(TranscriptFeature.parse(txfh))
    finally:
        txfh.close()
    inbamfh = pysam.Samfile(input_bam_file, "rb")
    outbamfh = None
    num_frags = 0
    try:
        outbamfh = pysam.Samfile(output_bam_file, "wb", template=inbamfh)
        # build a transcript to genome coordinate map
        tid_tx_genome_map = build_tid_transcript_genome_map(outbamfh,
                                                            transcripts)
        logging.debug("Annotating and filtering multihits")
        for pe_reads in parse_pe_reads(inbamfh):
            mate_num_hits = []
            for reads in pe_reads:
                num_hits = annotate_multihits(reads, tid_tx_genome_map)
                mate_num_hits.append(num_hits)
            over_limit = [mate_num_hits[0] > max_multihits,
                          mate_num_hits[1] > max_multihits]
            new_pe_reads = []
            for mate in (0, 1):
                if over_limit[mate]:
                    r = _mark_multihit_unmapped(pe_reads[mate][0],
                                                over_limit[1 - mate])
                    new_pe_reads.append([r])
                else:
                    new_pe_reads.append(pe_reads[mate])
            # write the filtered reads, not the original ones
            for reads in new_pe_reads:
                for r in reads:
                    outbamfh.write(r)
            num_frags += 1
        logging.debug("Found %d fragments" % (num_frags))
    finally:
        inbamfh.close()
        if outbamfh is not None:
            outbamfh.close()
    return config.JOB_SUCCESS
def sam_to_bam(input_fastq_files, input_sam_file, output_bam_file, quals,
               multihits, pe_sr_mode=False, softclip=True,
               keep_unmapped=True):
    """Reorder SAM alignments against their FASTQ input and write BAM.

    With a single FASTQ file the reads go through
    ``fix_sr_alignment_ordering``; otherwise through
    ``fix_alignment_ordering`` (with ``pe_sr_mode``).  When ``softclip`` is
    set each read is passed to ``soft_pad_read`` together with its FASTQ
    record; quality conversion is requested from the FASTQ parser only when
    ``softclip`` is set and ``quals`` differs from ``SANGER_FORMAT``.

    Unmapped reads whose ``XM`` tag is below ``multihits`` are counted as
    unmapped (and dropped when ``keep_unmapped`` is false); the remaining
    unmapped reads are counted as multihits.

    The FASTQ file handles are now tracked and closed; the original only
    called ``close()`` on the parser iterators.
    """
    samfh = pysam.Samfile(input_sam_file, "r")
    num_unmapped = 0
    num_multihits = 0
    num_frags = 0
    bamfh = pysam.Samfile(output_bam_file, "wb", template=samfh)
    fastqfhs = []
    fqiters = []
    try:
        # setup fastq parsing
        if softclip and (quals != SANGER_FORMAT):
            kwargs = {"convert_quals": True, "qual_format": quals}
        else:
            kwargs = {"convert_quals": False}
        for fq in input_fastq_files:
            fh = open(fq)
            fastqfhs.append(fh)
            fqiters.append(parse_fastq_record(fh, **kwargs))
        # handle single-read and paired-end
        if len(fqiters) == 1:
            reorder_func = fix_sr_alignment_ordering(samfh, fqiters[0])
        else:
            reorder_func = fix_alignment_ordering(samfh, fqiters, pe_sr_mode)
        # iterate through buffer
        for bufitems in reorder_func:
            num_frags += 1
            for bufitem in bufitems:
                for r in bufitem.reads:
                    # softclip uses the fastq record to replace the sequence
                    # and quality scores of the read
                    if softclip:
                        soft_pad_read(bufitem.fqrec, r)
                    # keep statistics of unmapped/multimapped reads and
                    # suppress output if 'keep_unmapped' is False
                    if r.is_unmapped:
                        xm_tag = r.opt('XM')
                        if xm_tag < multihits:
                            num_unmapped += 1
                            if not keep_unmapped:
                                continue
                        else:
                            num_multihits += 1
                    bamfh.write(r)
    finally:
        for fqiter in fqiters:
            fqiter.close()
        for fh in fastqfhs:
            fh.close()
        bamfh.close()
        samfh.close()
    logging.debug("Found %d fragments" % (num_frags))
    logging.debug("\t%d unmapped reads" % (num_unmapped))
    logging.debug("\t%d multimapping (>%dX) reads" %
                  (num_multihits, multihits))
def extend_and_pad_sam(input_fastq_files, input_sam_file, output_sam_file):
    """Open the input/output SAM files and rewrite the tags of a read.

    NOTE(review): as shown, this body uses ``r`` and ``tags`` without
    defining them and never touches ``input_fastq_files`` or the opened
    handles again -- it looks like part of the function is missing.
    Confirm against the original source before relying on it.
    """
    # input SAM opened for reading; output SAM opened for text writing with
    # the input handle passed as template
    infh = pysam.Samfile(input_sam_file, "r")
    outfh = pysam.Samfile(output_sam_file, "w", template=infh)
    # NOTE(review): `r` is not defined in this scope
    tagdict = dict(r.tags)
    # TODO: bug in pysam handling CP tag, fix by forcing to integer
    if "CP" in tagdict:
        tagdict["CP"] = int(tagdict["CP"])
    # add additional tags
    # NOTE(review): `tags` is not defined in this scope
    tagdict.update(tags)
    r.tags = tagdict.items()
def find_discordant_fragments(input_bam_file, paired_bam_file,
                              unmapped_bam_file, index_dir,
                              max_isize, library_type):
    """
    parses BAM file and categorizes reads into several groups:
    - concordant
    - discordant within gene (splicing isoforms)
    - discordant between different genes (chimeras)

    Fragments for which ``annotate_multihits`` reports an unmapped mate are
    written to ``unmapped_bam_file``; for the rest, the pairs returned by
    ``classify_read_pairs`` are written to ``paired_bam_file``.

    ``annotate_multihits`` is now called for every mate.  The original
    combined the calls with ``or``, which short-circuits and therefore
    skipped the second mate whenever the first was reported unmapped.

    Returns ``config.JOB_SUCCESS``.
    """
    logging.info("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s" % (input_bam_file))
    logging.debug("\tMax insert size: '%d'" % (max_isize))
    logging.debug("\tLibrary type: '%s'" % (library_type))
    logging.debug("\tGene paired file: %s" % (paired_bam_file))
    logging.debug("\tUnmapped file: %s" % (unmapped_bam_file))
    # setup input and output files
    bamfh = pysam.Samfile(input_bam_file, "rb")
    genefh = None
    unmappedfh = None
    try:
        genefh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
        unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
        # read transcript features
        logging.debug("Reading transcript features")
        transcript_file = os.path.join(index_dir,
                                       config.TRANSCRIPT_FEATURE_FILE)
        txfh = open(transcript_file)
        try:
            transcripts = list(TranscriptFeature.parse(txfh))
        finally:
            txfh.close()
        logging.debug("Building transcript lookup tables")
        # build a lookup table from bam tid index to transcript object
        tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
        # build a transcript to genome coordinate map
        tid_tx_genome_map = build_tid_transcript_genome_map(bamfh,
                                                            transcripts)
        logging.info("Parsing reads")
        for pe_reads in parse_pe_reads(bamfh):
            # add hit index and multimap information to read tags
            # this function also checks for unmapped reads
            any_unmapped = False
            for reads in pe_reads:
                # call first, combine afterwards, so every mate is annotated
                unmapped = annotate_multihits(bamfh, reads,
                                              tid_tx_genome_map)
                any_unmapped = any_unmapped or unmapped
            if any_unmapped:
                # write to output as discordant reads and continue to
                # next fragment
                write_pe_reads(unmappedfh, pe_reads)
                continue
            # examine all read pairing combinations and rule out invalid
            # pairings
            gene_pairs, unpaired_reads = classify_read_pairs(
                pe_reads, max_isize, library_type, tid_tx_map)
            if len(gene_pairs) > 0:
                write_pairs(genefh, gene_pairs)
            # TODO: do something with unpaired discordant reads?
    finally:
        if genefh is not None:
            genefh.close()
        if unmappedfh is not None:
            unmappedfh.close()
        bamfh.close()
    logging.info("Finished pairing reads")
    return config.JOB_SUCCESS
def process_tophat_alignments(fastq_files, bam_file, gene_file,
                              max_fragment_length, output_fastq_files,
                              output_bam_file, unpaired=False, suffix="/"):
    """Split tophat alignments into unaligned FASTQ and discordant BAM output.

    Each fragment of the BAM file is synchronized with the FASTQ iterators
    through ``synchronize_bam_fastq`` (which is given the output FASTQ
    handles); fragments it reports as unaligned are skipped.  Remaining
    fragments for which ``is_concordant`` returns a false gene-concordance
    flag are written to ``output_bam_file``.  After the BAM is exhausted the
    remaining FASTQ records are copied to the two output FASTQ files.

    All input and output handles are closed before returning; the original
    left every one of them open.

    Returns ``config.JOB_SUCCESS``.
    """
    # index genes
    exon_intervals, exon_trees = build_exon_interval_trees(gene_file)
    bamfh = pysam.Samfile(bam_file, "rb")
    infq = []
    outfq = []
    outbamfh = None
    try:
        if unpaired:
            bam_iter = parse_unpaired_pe_reads(bamfh)
        else:
            bam_iter = parse_pe_reads(bamfh)
        for fq in fastq_files:
            infq.append(open(fq))
        fastq_iters = [parse_fastq(fh) for fh in infq]
        # open output files
        for fq in output_fastq_files:
            outfq.append(open(fq, "w"))
        outbamfh = pysam.Samfile(output_bam_file, "wb", template=bamfh)
        # iterate through fastq files and bam file; the try block spans the
        # whole loop on purpose so a StopIteration raised while
        # synchronizing also ends it, as before
        try:
            while True:
                bam_pe_reads = bam_iter.next()
                # synchronize fastq and bam and write unmapped reads to a
                # file
                is_unaligned = synchronize_bam_fastq(bam_pe_reads,
                                                     fastq_iters,
                                                     outfq, suffix)
                if is_unaligned:
                    continue
                # both mates align here; check whether the alignment is
                # discordant
                tx_concordant, gene_concordant = is_concordant(
                    bamfh, bam_pe_reads, exon_intervals, exon_trees,
                    max_fragment_length)
                if not gene_concordant:
                    for r in bam_pe_reads[0]:
                        outbamfh.write(r)
                    for r in bam_pe_reads[1]:
                        outbamfh.write(r)
        except StopIteration:
            pass
        # finish remaining fastq lines
        try:
            while True:
                fqreads = [it.next() for it in fastq_iters]
                print >> outfq[0], fastq_to_string(fqreads[0])
                print >> outfq[1], fastq_to_string(fqreads[1])
        except StopIteration:
            pass
    finally:
        if outbamfh is not None:
            outbamfh.close()
        for fh in outfq:
            fh.close()
        for fh in infq:
            fh.close()
        bamfh.close()
    return config.JOB_SUCCESS
def realign_genome_reads(input_bam_file, output_bam_file, gene_file):
    """Realign split reads of a BAM file and emit them as SAM on stdout.

    Reads are parsed with ``parse_pe_sam_file`` and walked mate -> split ->
    reads; every group is handed to ``realign_split_reads`` and the records
    it yields are written out.

    NOTE(review): the output handle is opened on ``"-"`` (stdout, text SAM)
    and ``output_bam_file`` is not used -- this matches the original code,
    which looks like a debugging setup; confirm before switching the output
    to ``output_bam_file``.

    Both handles are closed before returning so the output is flushed.
    """
    # build a map of gene name to genome coords
    logging.info("Reading gene index")
    infh = pysam.Samfile(input_bam_file, "rb")
    outfh = None
    try:
        gene_tid_list = get_gene_tids(infh)
        exon_trees = build_exon_trees(infh, gene_file)
        outfh = pysam.Samfile("-", "w", template=infh)
        for pe_reads in parse_pe_sam_file(infh):
            for mate_partitions in pe_reads:
                for splits in mate_partitions:
                    for reads in splits:
                        for r in realign_split_reads(reads, gene_tid_list,
                                                     exon_trees):
                            outfh.write(r)
    finally:
        if outfh is not None:
            outfh.close()
        infh.close()
def discordant_reads_to_bedpe(index_dir, input_bam_file, output_file):
    """Write one tab-separated BEDPE-style line per discordant read pair.

    Pairs come from ``parse_gene_discordant_reads``.  Each line holds the
    transcript id, start and end of both reads, the 5' read name, a zero
    score, both transcript strands and finally the '|'-joined
    ``DiscordantRead`` fields of the 5' and 3' read.
    """
    bamfh = pysam.Samfile(input_bam_file, "rb")
    # lookup table from BAM reference id to transcript
    logging.debug("Reading transcript features")
    feature_path = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    features = list(TranscriptFeature.parse(open(feature_path)))
    tid_tx_map = build_tid_transcript_map(bamfh, features)
    outfh = open(output_file, "w")
    logging.debug("Converting BAM to BEDPE format")
    for read5p, read3p in parse_gene_discordant_reads(bamfh):
        # lightweight custom representation of each read
        disc5p = DiscordantRead.from_read(read5p)
        disc3p = DiscordantRead.from_read(read3p)
        # transcript information for both sides
        gene5p = tid_tx_map[read5p.rname]
        gene3p = tid_tx_map[read3p.rname]
        columns = [
            gene5p.tx_id, read5p.pos, read5p.aend,
            gene3p.tx_id, read3p.pos, read3p.aend,
            read5p.qname,   # read name
            0,              # score
            gene5p.strand,  # strand 1
            gene3p.strand,  # strand 2
        ]
        columns.append('|'.join([str(v) for v in disc5p.to_list()]))
        columns.append('|'.join([str(v) for v in disc3p.to_list()]))
        outfh.write('\t'.join([str(v) for v in columns]) + '\n')
    outfh.close()
    bamfh.close()
def filter_chimeras(input_file, output_file, index_dir, bam_file,
                    weighted_unique_frags, median_isize, max_isize,
                    isoform_fraction, false_pos_file):
    """Filter candidate chimeras and keep the best isoform of each.

    First pass: every chimera parsed from ``input_file`` must pass, in
    order, ``filter_weighted_frags``, ``filter_inner_dist``, the
    false-positive lookup and ``filter_chimeric_isoform_fraction``;
    survivors are written to a temporary file next to ``output_file``.
    Second pass: only chimeras whose name is returned by
    ``get_highest_coverage_isoforms`` are copied to ``output_file``.

    ``false_pos_file`` may be ``None`` or empty, in which case no
    false-positive list is loaded.  (The original tested this with
    ``is not ""``, an identity comparison that is not guaranteed to work
    for strings.)

    The temporary file is removed and all handles are closed even when an
    error occurs.

    Returns ``config.JOB_SUCCESS``.
    """
    logging.debug("Filtering Parameters")
    logging.debug("\tweighted unique fragments: %f" % (weighted_unique_frags))
    logging.debug("\tmedian insert size: %d" % (median_isize))
    logging.debug("\tmax insert size allowed: %d" % (max_isize))
    logging.debug("\tfraction of wild-type isoform: %f" % (isoform_fraction))
    logging.debug("\tfalse positive chimeras file: %s" % (false_pos_file))
    # get false positive chimera list
    if false_pos_file:
        logging.debug("Parsing false positive chimeras")
        false_pos_pairs = read_false_pos_file(false_pos_file)
    else:
        false_pos_pairs = set()
    logging.debug("Checking chimeras")
    num_chimeras = 0
    num_filtered_chimeras = 0
    tmp_file = make_temp(os.path.dirname(output_file), suffix=".txt")
    try:
        # open BAM file for checking wild-type isoform
        bamfh = pysam.Samfile(bam_file, "rb")
        try:
            infh = open(input_file)
            f = open(tmp_file, "w")
            try:
                for c in Chimera.parse(infh):
                    num_chimeras += 1
                    if not filter_weighted_frags(c, weighted_unique_frags):
                        continue
                    if not filter_inner_dist(c, max_isize):
                        continue
                    false_pos_key = (c.partner5p.tx_name, c.partner5p.end,
                                     c.partner3p.tx_name, c.partner3p.start)
                    if false_pos_key in false_pos_pairs:
                        continue
                    if filter_chimeric_isoform_fraction(
                            c, isoform_fraction, median_isize, bamfh):
                        print >> f, '\t'.join(map(str, c.to_list()))
                        num_filtered_chimeras += 1
            finally:
                f.close()
                infh.close()
        finally:
            bamfh.close()
        logging.debug("Total chimeras: %d" % num_chimeras)
        logging.debug("Filtered chimeras: %d" % num_filtered_chimeras)
        # cleanup memory for false positive chimeras
        del false_pos_pairs
        # find highest coverage chimeras among isoforms
        gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
        kept_chimeras = get_highest_coverage_isoforms(tmp_file, gene_file)
        num_filtered_chimeras = 0
        tmpfh = open(tmp_file)
        f = open(output_file, "w")
        try:
            for c in Chimera.parse(tmpfh):
                if c.name in kept_chimeras:
                    num_filtered_chimeras += 1
                    print >> f, '\t'.join(map(str, c.to_list()))
        finally:
            f.close()
            tmpfh.close()
        logging.debug("\tAfter choosing best isoform: %d" %
                      num_filtered_chimeras)
    finally:
        # never leave the temporary file behind
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
    return config.JOB_SUCCESS
def nominate_spanning_reads(chimera_file, unmapped_bam_file,
                            output_fastq_file):
    """Write the reads that should be remapped across breakpoints as FASTQ.

    For every chimera in ``chimera_file`` an encompassing read is emitted
    (once per ``(qname, readnum)``) when the breakpoint lies strictly
    between its ``clipstart`` and ``clipend``; these records get a constant
    ``"I"`` quality string.  Afterwards the first read of both mates of
    every fragment in ``unmapped_bam_file`` is emitted with its own
    qualities.

    Changes from the original: the output FASTQ handle is now closed, and
    3' breakpoints are recorded under the 3' partner's transcript name (the
    original keyed them by the 5' partner's name).

    Returns ``config.JOB_SUCCESS``.
    """
    # find all reads that need to be remapped to see if they span the
    # breakpoint junction
    fqfh = open(output_fastq_file, "w")
    try:
        remap_qnames = set()
        breaks5p = collections.defaultdict(list)
        breaks3p = collections.defaultdict(list)
        chimerafh = open(chimera_file)
        try:
            for c in Chimera.parse(chimerafh):
                end5p = c.partner5p.end
                start3p = c.partner3p.start
                # keep track of all breakpoints
                breaks5p[c.partner5p.tx_name].append(end5p)
                breaks3p[c.partner3p.tx_name].append(start3p)
                for r5p, r3p in c.encomp_read_pairs:
                    # if 5' read overlaps breakpoint then it should be
                    # remapped
                    if r5p.clipstart < end5p < r5p.clipend:
                        key5p = (r5p.qname, r5p.readnum)
                        if key5p not in remap_qnames:
                            remap_qnames.add(key5p)
                            print >> fqfh, to_fastq(r5p.qname, r5p.readnum,
                                                    r5p.seq,
                                                    "I" * len(r5p.seq))
                    # if 3' read overlaps breakpoint then it should be
                    # remapped
                    if r3p.clipstart < start3p < r3p.clipend:
                        key3p = (r3p.qname, r3p.readnum)
                        if key3p not in remap_qnames:
                            remap_qnames.add(key3p)
                            print >> fqfh, to_fastq(r3p.qname, r3p.readnum,
                                                    r3p.seq,
                                                    "I" * len(r3p.seq))
        finally:
            chimerafh.close()
        # sort breakpoint positions within each gene
        for tx_name in breaks5p.keys():
            breaks5p[tx_name] = sorted(breaks5p[tx_name])
        for tx_name in breaks3p.keys():
            breaks3p[tx_name] = sorted(breaks3p[tx_name])
        # check read pairs with one or both unmapped, and remap those
        # as well
        bamfh = pysam.Samfile(unmapped_bam_file, "rb")
        try:
            for pe_reads in parse_pe_reads(bamfh):
                for readnum in xrange(0, 2):
                    print >> fqfh, to_fastq(pe_reads[readnum][0].qname,
                                            readnum,
                                            pe_reads[readnum][0].seq,
                                            pe_reads[readnum][0].qual)
            # TODO: instead of emitting every read, emit unmapped reads
            # directly and mapped reads only when they overlap one of the
            # sorted breakpoints (bisect on breaks5p / breaks3p)
        finally:
            bamfh.close()
    finally:
        fqfh.close()
    return config.JOB_SUCCESS
def bam_to_fastq(bam_file, fastq_files):
    """Dump the reads of a BAM file into FASTQ files.

    Reads flagged ``is_read1`` go to ``fastq_files[0]`` and reads flagged
    ``is_read2`` go to ``fastq_files[1]``.  Reads carrying neither flag are
    written to ``fastq_files[0]``; the original left the file index unbound
    (or stale from the previous read) in that case.

    All handles are closed before returning.
    """
    fqfhs = [open(f, "w") for f in fastq_files]
    bamfh = None
    try:
        bamfh = pysam.Samfile(bam_file, "rb")
        for r in bamfh:
            if r.is_read1:
                i = 0
            elif r.is_read2:
                i = 1
            else:
                # no mate flag set: treat as the first (only) read
                i = 0
            record = "@%s\n%s\n+\n%s" % (r.qname, r.seq, r.qual)
            print >> fqfhs[i], record
    finally:
        if bamfh is not None:
            bamfh.close()
        for fh in fqfhs:
            fh.close()
def extract_tophat_encompassing_reads(index_dir, tophat_bam_file, encompassing_bam_file, max_isize, library_type): gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE) bamfh = pysam.Samfile(tophat_bam_file, "rb") for r in bamfh: if (r.is_unmapped) or (r.mate_is_unmapped): continue if r.rname != r.mrnm: print r.qname, r.rname, r.pos, r.is_reverse, r.mrnm, r.mpos, r.mate_is_reverse bamfh.close()
def sam_stdin_to_bam(output_bam_file, input_fastq_file, multihits,
                     is_paired=True, keep_unmapped=True):
    """Convert SAM records read from stdin into a BAM file.

    Reads are reordered with ``fix_pe_alignment_ordering`` when
    ``is_paired`` is true, otherwise with ``fix_sr_alignment_ordering``.
    Unmapped reads whose ``XM`` tag is below ``multihits`` are counted as
    unmapped and dropped when ``keep_unmapped`` is false; the remaining
    unmapped reads are counted as multihits.

    Changes from the original: the duplicated paired/single loops share one
    body, the multihit counter no longer also counts the genuinely unmapped
    reads (matching the file-based ``sam_to_bam``), and the FASTQ handle is
    closed.
    """
    samfh = pysam.Samfile("-", "r")
    bamfh = pysam.Samfile(output_bam_file, "wb", template=samfh)
    num_unmapped = 0
    num_multihits = 0
    fastqfh = None
    try:
        fastqfh = open(input_fastq_file)
        if is_paired:
            # flatten fragments into their per-mate read lists
            read_groups = (reads
                           for pe_reads in fix_pe_alignment_ordering(
                               samfh, fastqfh, is_paired=is_paired)
                           for reads in pe_reads)
        else:
            read_groups = fix_sr_alignment_ordering(samfh, fastqfh)
        for reads in read_groups:
            for r in reads:
                if r.is_unmapped:
                    xm_tag = r.opt('XM')
                    if xm_tag < multihits:
                        num_unmapped += 1
                        if not keep_unmapped:
                            continue
                    else:
                        num_multihits += 1
                bamfh.write(r)
    finally:
        if fastqfh is not None:
            fastqfh.close()
        bamfh.close()
        samfh.close()
    logging.debug("[SAMTOBAM] Filtered %d unmapped reads" % (num_unmapped))
    logging.debug("[SAMTOBAM] Found %d multimapping (>%d) reads" %
                  (num_multihits, multihits))
    logging.info("[SAMTOBAM] Finished converting SAM -> BAM")
def nominate_unmapped_spanning_reads(unmapped_bam_file, output_fastq_file):
    """Write the unmapped mates of a BAM file to a FASTQ file for remapping.

    For every fragment from ``parse_pe_reads`` and every mate that has at
    least one unmapped read, the first read of that mate is written as a
    FASTQ record (with the mate index as read number).

    Returns ``config.JOB_SUCCESS``.
    """
    fastq_out = open(output_fastq_file, "w")
    bam_in = pysam.Samfile(unmapped_bam_file, "rb")
    for fragment in parse_pe_reads(bam_in):
        for mate_index, mate_reads in enumerate(fragment):
            # look for the first unmapped read of this mate
            found_unmapped = False
            for read in mate_reads:
                if read.is_unmapped:
                    found_unmapped = True
                    break
            if not found_unmapped:
                continue
            print >> fastq_out, to_fastq(fragment[mate_index][0].qname,
                                         mate_index,
                                         fragment[mate_index][0].seq,
                                         fragment[mate_index][0].qual)
    bam_in.close()
    fastq_out.close()
    return config.JOB_SUCCESS
def calc_chimera_pvalues(input_file, bam_file, num_mapped_reads, num_discordant_reads_within_isize_range): # calc discordant reads per million percent_discordant = num_discordant_reads_within_isize_range / float(num_mapped_reads) # open BAM file for checking wild-type isoforms bamfh = pysam.Samfile(bam_file, "rb") for c in Chimera.parse(open(input_file)): # count 5' and 3' reads rname5p = config.GENE_REF_PREFIX + c.tx_name_5p rname3p = config.GENE_REF_PREFIX + c.tx_name_3p num_reads_5p = len(set(r.qname for r in bamfh.fetch(rname5p, c.tx_start_5p, c.tx_end_5p))) num_reads_3p = len(set(r.qname for r in bamfh.fetch(rname3p, c.tx_start_3p, c.tx_end_3p))) # expected number of discordant reads exp_discordant_5p = num_reads_5p * percent_discordant exp_discordant_3p = num_reads_3p * percent_discordant print c.gene_name_5p, c.gene_name_3p, num_reads_5p, num_reads_3p, exp_discordant_5p, exp_discordant_3p bamfh.close()
def nominate_chimeras(index_dir, input_bam_file, output_file, trim_bp):
    """Turn grouped discordant read pairs into tab-separated chimera records.

    Groups come from ``parse_gene_chimeric_reads``; each becomes a chimera
    through ``read_pairs_to_chimera`` and is named ``C`` followed by a
    zero-padded 7-digit running number starting at 0.
    """
    logging.debug("Reading gene information")
    gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
    bamfh = pysam.Samfile(input_bam_file, "rb")
    # lookup tables for getting genomic intervals from transcripts
    tid_tx_map, genome_tx_trees = build_tid_tx_maps(
        bamfh, gene_file, rname_prefix=config.GENE_REF_PREFIX)
    outfh = open(output_file, "w")
    logging.debug("Parsing discordant reads")
    grouped = parse_gene_chimeric_reads(bamfh)
    for number, (tid5p, tid3p, readpairs) in enumerate(grouped):
        chimera_name = "C%07d" % (number)
        chimera = read_pairs_to_chimera(chimera_name, tid5p, tid3p,
                                        readpairs, tid_tx_map,
                                        genome_tx_trees, trim_bp)
        columns = [str(value) for value in chimera.to_list()]
        outfh.write('\t'.join(columns) + '\n')
    outfh.close()
    bamfh.close()
def main():
    """Command-line entry point: profile the insert size distribution of a BAM.

    Takes one positional argument, the input BAM file.  The distribution
    built by ``InsertSizeDistribution.from_bam`` is written to the file
    given with ``-o`` or to stdout, and summary statistics are logged.

    The usage string previously advertised an ``<out.bedpe>`` argument that
    was never read; a missing BAM argument now produces a usage error
    instead of an ``IndexError``.
    """
    from optparse import OptionParser
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog [options] <bam>")
    parser.add_option('-i', '--min-fragment-length',
                      dest="min_fragment_length", type="int", default=0)
    parser.add_option('-I', '--max-fragment-length',
                      dest="max_fragment_length", type="int", default=1000)
    parser.add_option('-n', '--max-samples', dest="max_samples",
                      type="int", default=None)
    parser.add_option('-o', dest="output_file", default=None)
    options, args = parser.parse_args()
    if len(args) < 1:
        parser.error("expected one argument: <bam>")
    input_bam_file = args[0]
    bamfh = pysam.Samfile(input_bam_file, "rb")
    try:
        isizedist = InsertSizeDistribution.from_bam(
            bamfh, options.min_fragment_length,
            options.max_fragment_length, options.max_samples)
    finally:
        bamfh.close()
    if options.output_file is not None:
        f = open(options.output_file, "w")
        try:
            isizedist.to_file(f)
        finally:
            f.close()
    else:
        # stdout is not ours to close
        isizedist.to_file(sys.stdout)
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" %
                 (isizedist.n, isizedist.mean(), isizedist.std(),
                  isizedist.percentile(50.0), isizedist.mode()))
def main():
    """Command-line entry point: find discordant reads in a paired BAM file.

    Takes three positional arguments -- the input BAM file, the gene output
    file and the genome output file -- plus the ``--index`` directory that
    holds the gene feature file.  Everything is forwarded to
    ``find_discordant_reads``.

    The usage string previously listed only two positional arguments while
    three were read, and a missing ``--index`` failed inside
    ``os.path.join``; both cases now produce a usage error.
    """
    from optparse import OptionParser
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog [options] <bam> "
                          "<gene_out.bedpe> <genome_out.bedpe>")
    parser.add_option("--index", dest="index_dir",
                      help="Path to chimerascan index directory")
    parser.add_option('--max-fragment-length', dest="max_fragment_length",
                      type="int", default=1000)
    parser.add_option('--max-indel-size', dest="max_indel_size",
                      type="int", default=100)
    parser.add_option('--library-type', dest="library_type", default="fr")
    parser.add_option('--multihits', type="int", default=1)
    parser.add_option('--padding', type="int", default=0)
    options, args = parser.parse_args()
    if len(args) < 3:
        parser.error("expected three arguments: <bam> <gene_out.bedpe> "
                     "<genome_out.bedpe>")
    if options.index_dir is None:
        parser.error("--index is required")
    input_bam_file = args[0]
    gene_output_file = args[1]
    genome_output_file = args[2]
    gene_feature_file = os.path.join(options.index_dir,
                                     config.GENE_FEATURE_FILE)
    library_type = parse_library_type(options.library_type)
    # open bam file
    bamfh = pysam.Samfile(input_bam_file, "rb")
    try:
        find_discordant_reads(bamfh, gene_output_file, genome_output_file,
                              gene_feature_file,
                              max_indel_size=options.max_indel_size,
                              max_isize=options.max_fragment_length,
                              max_multihits=options.multihits,
                              library_type=library_type,
                              padding=options.padding)
    finally:
        bamfh.close()
def fastq_to_bam(fastq_files, qual_format, bam_file):
    """Store the records of one or two FASTQ files as unaligned BAM reads.

    The FASTQ files are read in lockstep, one record from each per round;
    conversion stops as soon as any of them is exhausted.  Every record
    becomes a ``pysam.AlignedRead`` with reference ids set to -1, the
    quality string converted by the function returned from
    ``get_qual_conversion_func`` and the read1/read2 flags set from the
    position of its file in ``fastq_files``.

    The FASTQ handles are now closed (the original leaked them) and the
    local that shadowed the ``id`` builtin is renamed.
    """
    fastqfhs = []
    bamfh = None
    try:
        for f in fastq_files:
            fastqfhs.append(open(f))
        fqiters = [parse_fastq(fh) for fh in fastqfhs]
        qual_func = get_qual_conversion_func(qual_format)
        # header without reference sequences: all reads are unaligned
        header = {'HD': {'VN': '1.0', 'SO': 'unknown'}}
        bamfh = pysam.Samfile(bam_file, "wb", header=header)
        try:
            while True:
                for i, fqiter in enumerate(fqiters):
                    read_id, seq, qual = fqiter.next()
                    a = pysam.AlignedRead()
                    a.rname = -1
                    a.mrnm = -1
                    a.qname = read_id
                    a.seq = seq
                    a.qual = qual_func(qual)
                    a.is_read1 = (i == 0)
                    a.is_read2 = (i == 1)
                    bamfh.write(a)
        except StopIteration:
            pass
    finally:
        if bamfh is not None:
            bamfh.close()
        for fh in fastqfhs:
            fh.close()
def run_chimerascan(runconfig): # normal run config_passed = runconfig.check_config() if not config_passed: logging.error("Invalid run configuration, aborting.") sys.exit(JOB_ERROR) # create output dir if it does not exist if not os.path.exists(runconfig.output_dir): os.makedirs(runconfig.output_dir) logging.info("Created output directory: %s" % (runconfig.output_dir)) # create log dir if it does not exist log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR) if not os.path.exists(log_dir): os.makedirs(log_dir) logging.debug("Created directory for log files: %s" % (log_dir)) # create tmp dir if it does not exist tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR) if not os.path.exists(tmp_dir): os.makedirs(tmp_dir) logging.debug("Created directory for tmp files: %s" % (tmp_dir)) # write the run config to a file xmlstring = runconfig.to_xml() runconfig_xml_file = os.path.join(runconfig.output_dir, config.RUNCONFIG_XML_FILE) fh = open(runconfig_xml_file, "w") print >> fh, xmlstring fh.close() # gather and parse run parameters library_type = parse_library_type(runconfig.library_type) gene_feature_file = os.path.join(runconfig.index_dir, config.GENE_FEATURE_FILE) bowtie_mode = "-v" if runconfig.bowtie_mode_v else "-n" bowtie_index = os.path.join(runconfig.index_dir, config.ALIGN_INDEX) original_read_length = get_read_length(runconfig.fastq_files[0]) # minimum fragment length cannot be smaller than the trimmed read length trimmed_read_length = original_read_length - runconfig.trim5 - runconfig.trim3 min_fragment_length = max(runconfig.min_fragment_length, trimmed_read_length) # # Initial Bowtie alignment step # # align in paired-end mode, trying to resolve as many reads as possible # this effectively rules out the vast majority of reads as candidate # fusions unaligned_fastq_param = os.path.join(tmp_dir, config.UNALIGNED_FASTQ_PARAM) maxmultimap_fastq_param = os.path.join(tmp_dir, config.MAXMULTIMAP_FASTQ_PARAM) aligned_bam_file = 
os.path.join(runconfig.output_dir, config.ALIGNED_READS_BAM_FILE) aligned_log_file = os.path.join(log_dir, "bowtie_alignment.log") if all(up_to_date(aligned_bam_file, fq) for fq in runconfig.fastq_files): logging.info("[SKIPPED] Alignment results exist") else: logging.info("Aligning full-length reads in paired-end mode") retcode = align_pe_full( runconfig.fastq_files, bowtie_index, aligned_bam_file, unaligned_fastq_param, maxmultimap_fastq_param, min_fragment_length=min_fragment_length, max_fragment_length=runconfig.max_fragment_length, trim5=runconfig.trim5, trim3=runconfig.trim3, library_type=runconfig.library_type, num_processors=runconfig.num_processors, fastq_format=runconfig.fastq_format, multihits=runconfig.multihits, mismatches=runconfig.mismatches, bowtie_bin=runconfig.bowtie_bin, bowtie_mode=bowtie_mode, log_file=aligned_log_file) if retcode != 0: logging.error("Bowtie failed with error code %d" % (retcode)) sys.exit(retcode) # # Get insert size distribution # isize_dist_file = os.path.join(runconfig.output_dir, config.ISIZE_DIST_FILE) isize_dist = InsertSizeDistribution() if up_to_date(isize_dist_file, aligned_bam_file): logging.info("[SKIPPED] Profiling insert size distribution") isize_dist.from_file(open(isize_dist_file, "r")) else: logging.info("Profiling insert size distribution") max_isize_samples = config.ISIZE_MAX_SAMPLES bamfh = pysam.Samfile(aligned_bam_file, "rb") isize_dist.from_bam(bamfh, min_isize=min_fragment_length, max_isize=runconfig.max_fragment_length, max_samples=max_isize_samples) isize_dist.to_file(open(isize_dist_file, "w")) bamfh.close() logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" % (isize_dist.n, isize_dist.mean(), isize_dist.std(), isize_dist.percentile(50.0), isize_dist.mode())) # # Discordant reads alignment step # discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE) discordant_log_file = os.path.join(log_dir, "bowtie_segmented_alignment.log") unaligned_fastq_files = [ 
os.path.join(tmp_dir, fq) for fq in config.UNALIGNED_FASTQ_FILES ] # get the segments used in discordant alignment to know the effective # read length used to align. we used this to set the 'padding' during # spanning read discovery segments = determine_read_segments(original_read_length, segment_length=runconfig.segment_length, segment_trim=True, trim5=runconfig.trim5, trim3=runconfig.trim3) segmented_read_length = segments[-1][1] logging.debug("Segmented alignment will use effective read length of %d" % (segmented_read_length)) if all( up_to_date(discordant_bam_file, fq) for fq in runconfig.fastq_files): logging.info("[SKIPPED] Discordant alignment results exist") else: logging.info("Aligning initially unmapped reads in single read mode") align(unaligned_fastq_files, runconfig.fastq_format, bowtie_index, discordant_bam_file, bowtie_bin=runconfig.bowtie_bin, num_processors=runconfig.num_processors, segment_length=runconfig.segment_length, segment_trim=True, trim5=runconfig.trim5, trim3=runconfig.trim3, multihits=runconfig.multihits, mismatches=runconfig.mismatches, bowtie_mode=bowtie_mode, best_strata=runconfig.best_strata, log_file=discordant_log_file) # # Merge paired-end reads step # paired_bam_file = os.path.join(tmp_dir, config.DISCORDANT_PAIRED_BAM_FILE) if up_to_date(paired_bam_file, discordant_bam_file): logging.info("[SKIPPED] Read pairing results exist") else: logging.info("Pairing aligned reads") bamfh = pysam.Samfile(discordant_bam_file, "rb") paired_bamfh = pysam.Samfile(paired_bam_file, "wb", template=bamfh) merge_read_pairs(bamfh, paired_bamfh, runconfig.min_fragment_length, runconfig.max_fragment_length, library_type) paired_bamfh.close() bamfh.close() # # Find discordant reads step # discordant_gene_bedpe_file = \ os.path.join(tmp_dir, config.DISCORDANT_GENE_BEDPE_FILE) discordant_genome_bedpe_file = \ os.path.join(tmp_dir, config.DISCORDANT_GENOME_BEDPE_FILE) padding = original_read_length - segmented_read_length if 
(up_to_date(discordant_gene_bedpe_file, paired_bam_file) and up_to_date(discordant_genome_bedpe_file, paired_bam_file)): logging.info("[SKIPPED] Finding discordant reads") else: logging.info("Finding discordant reads") bamfh = pysam.Samfile(paired_bam_file, "rb") find_discordant_reads(bamfh, discordant_gene_bedpe_file, discordant_genome_bedpe_file, gene_feature_file, max_indel_size=runconfig.max_indel_size, max_isize=runconfig.max_fragment_length, max_multihits=runconfig.multihits, library_type=library_type, padding=padding) bamfh.close() # # Extract full sequences of the discordant reads # extended_discordant_gene_bedpe_file = \ os.path.join(tmp_dir, config.EXTENDED_DISCORDANT_GENE_BEDPE_FILE) if up_to_date(extended_discordant_gene_bedpe_file, discordant_gene_bedpe_file): logging.info( "[SKIPPED] Retrieving full length sequences for realignment") else: logging.info("Retrieving full length sequences for realignment") extend_sequences(unaligned_fastq_files, discordant_gene_bedpe_file, extended_discordant_gene_bedpe_file) # # Sort discordant reads # sorted_discordant_gene_bedpe_file = os.path.join( tmp_dir, config.SORTED_DISCORDANT_GENE_BEDPE_FILE) if (up_to_date(sorted_discordant_gene_bedpe_file, extended_discordant_gene_bedpe_file)): logging.info("[SKIPPED] Sorting discordant BEDPE file") else: logging.info("Sorting discordant BEDPE file") sort_discordant_reads(extended_discordant_gene_bedpe_file, sorted_discordant_gene_bedpe_file) # # Nominate chimeras step # encompassing_bedpe_file = os.path.join( tmp_dir, config.ENCOMPASSING_CHIMERA_BEDPE_FILE) if (up_to_date(encompassing_bedpe_file, sorted_discordant_gene_bedpe_file)): logging.info("[SKIPPED] Nominating chimeras from discordant reads") else: logging.info("Nominating chimeras from discordant reads") nominate_chimeras(open(sorted_discordant_gene_bedpe_file, "r"), open(encompassing_bedpe_file, "w"), gene_feature_file, trim=config.EXON_JUNCTION_TRIM_BP) # # Filter encompassing chimeras step # 
filtered_encomp_bedpe_file = \ os.path.join(tmp_dir, config.FILTERED_ENCOMPASSING_CHIMERA_BEDPE_FILE) if (up_to_date(filtered_encomp_bedpe_file, encompassing_bedpe_file)): logging.info("[SKIPPED] Filtering encompassing chimeras") else: logging.info("Filtering encompassing chimeras") # max_isize = isize_mean + runconfig.filter_isize_stdevs*isize_std filter_encompassing_chimeras( encompassing_bedpe_file, filtered_encomp_bedpe_file, gene_feature_file, max_multimap=runconfig.filter_max_multimaps, multimap_cov_ratio=runconfig.filter_multimap_ratio, max_isize=-1, strand_pval=runconfig.filter_strand_pval) # # Nominate spanning reads step # spanning_fastq_file = os.path.join(runconfig.output_dir, config.SPANNING_FASTQ_FILE) if all(up_to_date(spanning_fastq_file, f) for f in unaligned_fastq_files): logging.info("[SKIPPED] Preparing junction spanning reads") else: logging.info("Preparing junction spanning reads") outfh = open(spanning_fastq_file, "w") for f in unaligned_fastq_files: shutil.copyfileobj(open(f), outfh) outfh.close() # TODO: skip this step for now, and simply realign all the reads # spanning_fastq_file = os.path.join(runconfig.output_dir, config.SPANNING_FASTQ_FILE) # if (up_to_date(spanning_fastq_file, extended_discordant_bedpe_file) and # up_to_date(spanning_fastq_file, filtered_encomp_bedpe_file)): # logging.info("[SKIPPED] Nominating junction spanning reads") # else: # logging.info("Nominating junction spanning reads") # nominate_spanning_reads(open(extended_discordant_bedpe_file, 'r'), # open(filtered_encomp_bedpe_file, 'r'), # open(spanning_fastq_file, 'w')) # # Extract junction sequences from chimeras file # ref_fasta_file = os.path.join(runconfig.index_dir, config.ALIGN_INDEX + ".fa") junc_fasta_file = os.path.join(tmp_dir, config.JUNC_REF_FASTA_FILE) junc_map_file = os.path.join(tmp_dir, config.JUNC_REF_MAP_FILE) spanning_read_length = get_read_length(spanning_fastq_file) if (up_to_date(junc_fasta_file, filtered_encomp_bedpe_file) and 
up_to_date(junc_map_file, filtered_encomp_bedpe_file)): logging.info("[SKIPPED] Extracting junction read sequences") else: logging.info("Extracting junction read sequences") bedpe_to_junction_fasta(filtered_encomp_bedpe_file, ref_fasta_file, spanning_read_length, open(junc_fasta_file, "w"), open(junc_map_file, "w")) # # Build a bowtie index to align and detect spanning reads # bowtie_spanning_index = os.path.join(tmp_dir, config.JUNC_BOWTIE_INDEX) bowtie_spanning_index_file = os.path.join(tmp_dir, config.JUNC_BOWTIE_INDEX_FILE) if (up_to_date(bowtie_spanning_index_file, junc_fasta_file)): logging.info( "[SKIPPED] Building bowtie index for junction-spanning reads") else: logging.info("Building bowtie index for junction-spanning reads") args = [ runconfig.bowtie_build_bin, junc_fasta_file, bowtie_spanning_index ] f = open(os.path.join(log_dir, "bowtie_build.log"), "w") subprocess.call(args, stdout=f, stderr=f) f.close() # # Align unmapped reads across putative junctions # junc_bam_file = os.path.join(tmp_dir, config.JUNC_READS_BAM_FILE) junc_log_file = os.path.join(log_dir, "bowtie_spanning_alignment.log") if (up_to_date(junc_bam_file, bowtie_spanning_index_file) and up_to_date(junc_bam_file, spanning_fastq_file)): logging.info("[SKIPPED] Aligning junction spanning reads") else: logging.info("Aligning junction spanning reads") retcode = align_sr_full(spanning_fastq_file, bowtie_spanning_index, junc_bam_file, trim5=runconfig.trim5, trim3=runconfig.trim3, num_processors=runconfig.num_processors, fastq_format=runconfig.fastq_format, multihits=runconfig.multihits, mismatches=runconfig.mismatches, bowtie_bin=runconfig.bowtie_bin, bowtie_mode=bowtie_mode, log_file=junc_log_file) if retcode != 0: logging.error("Bowtie failed with error code %d" % (retcode)) sys.exit(retcode) # # Merge spanning and encompassing read information # raw_chimera_bedpe_file = os.path.join(tmp_dir, config.RAW_CHIMERA_BEDPE_FILE) if (up_to_date(raw_chimera_bedpe_file, junc_bam_file) and 
up_to_date(raw_chimera_bedpe_file, junc_map_file)): logging.info( "[SKIPPED] Merging spanning and encompassing read alignments") else: logging.info("Merging spanning and encompassing read alignments") merge_spanning_alignments(junc_bam_file, junc_map_file, raw_chimera_bedpe_file, anchor_min=0, anchor_max=0, anchor_mismatches=0) # # Choose best isoform for each junction # chimera_bedpe_file = os.path.join(tmp_dir, config.CHIMERA_BEDPE_FILE) if (up_to_date(chimera_bedpe_file, raw_chimera_bedpe_file)): logging.info("[SKIPPED] Filtering chimeras") else: logging.info("Filtering chimeras") # get insert size at prob max_isize = isize_dist.percentile(runconfig.filter_isize_percentile) filter_spanning_chimeras(raw_chimera_bedpe_file, chimera_bedpe_file, gene_feature_file, mate_pval=runconfig.filter_strand_pval, max_isize=max_isize) # # Rank chimeras # ranked_chimera_bedpe_file = os.path.join(runconfig.output_dir, config.RANKED_CHIMERA_BEDPE_FILE) if (up_to_date(ranked_chimera_bedpe_file, chimera_bedpe_file)): logging.info("[SKIPPED] Ranking chimeras") else: logging.info("Ranking chimeras") rank_chimeras(chimera_bedpe_file, ranked_chimera_bedpe_file, empirical_prob=runconfig.empirical_prob) # # Cleanup # #shutil.rmtree(tmp_dir) # # Done # logging.info("Finished run. Chimeras written to file %s" % (ranked_chimera_bedpe_file)) return JOB_SUCCESS
def find_discordant_fragments(input_bam_file, gene_paired_bam_file,
                              genome_paired_bam_file, unmapped_bam_file,
                              complex_bam_file, index_dir, max_isize,
                              library_type):
    """
    parses BAM file and categorizes reads into several groups:
    - concordant
    - discordant within gene (splicing isoforms)
    - discordant between different genes (chimeras)
    - discordant genome alignments (unannotated)

    input_bam_file: BAM file that is read fragment by fragment
    gene_paired_bam_file: output BAM receiving the gene pairs
    genome_paired_bam_file: output BAM receiving the genome pairs
    unmapped_bam_file: output BAM receiving every fragment for which
        annotate_multihits() reported an unmapped read
    complex_bam_file: output BAM receiving the unpaired reads of fragments
        that produced neither gene pairs nor genome pairs
    index_dir: directory that contains config.GENE_FEATURE_FILE
    max_isize, library_type: forwarded unchanged to classify_read_pairs()

    All output BAM files reuse the header of the input BAM file.  Every
    file handle opened here is closed again, even when an error interrupts
    processing.
    """
    # arguments are handed to logging un-formatted so that a tuple-valued
    # argument is printed as-is instead of being unpacked by '%'
    logging.info("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s", input_bam_file)
    logging.debug("\tMax insert size: '%d'", max_isize)
    logging.debug("\tLibrary type: '%s'", library_type)
    logging.debug("\tGene paired file: %s", gene_paired_bam_file)
    logging.debug("\tGenome paired file: %s", genome_paired_bam_file)
    logging.debug("\tUnmapped file: %s", unmapped_bam_file)
    logging.debug("\tComplex file: %s", complex_bam_file)
    # setup input and output files
    bamfh = pysam.Samfile(input_bam_file, "rb")
    out_handles = []
    try:
        for path in (gene_paired_bam_file, genome_paired_bam_file,
                     unmapped_bam_file, complex_bam_file):
            out_handles.append(pysam.Samfile(path, "wb", template=bamfh))
        genefh, genomefh, unmappedfh, complexfh = out_handles
        gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
        # build a lookup table to get all the overlapping transcripts given
        # a transcript 'tid'
        # NOTE(review): assumes the builders consume the gene file before
        # returning, since the handle is closed right after -- confirm
        with open(gene_file) as gene_fh:
            tid_tx_cluster_map = build_tid_tx_cluster_map(
                bamfh, gene_fh, rname_prefix=config.GENE_REF_PREFIX)
        # build a lookup table to get genome coordinates from transcript
        # coordinates (the gene file is re-opened to read it from the start)
        with open(gene_file) as gene_fh:
            tid_genome_map = build_tid_to_genome_map(
                bamfh, gene_fh, rname_prefix=config.GENE_REF_PREFIX)
        for pe_reads in parse_pe_reads(bamfh):
            # add hit index and number of multimaps information to read
            # tags.  this function also checks for unmapped reads.  it is
            # called for every mate (no short-circuit 'or') so that all
            # mates get annotated even after an unmapped one was seen
            any_unmapped = False
            for reads in pe_reads:
                if annotate_multihits(bamfh, reads, tid_genome_map):
                    any_unmapped = True
            if any_unmapped:
                # write to the unmapped output and continue to the next
                # fragment
                write_pe_reads(unmappedfh, pe_reads)
                continue
            # examine all read pairing combinations and rule out invalid
            # pairings.  this returns gene pairs and genome pairs
            gene_pairs, genome_pairs, unpaired_reads = \
                classify_read_pairs(pe_reads, max_isize, library_type,
                                    tid_genome_map, tid_tx_cluster_map)
            if len(gene_pairs) > 0 or len(genome_pairs) > 0:
                write_pairs(genefh, gene_pairs)
                write_pairs(genomefh, genome_pairs)
            else:
                write_pe_reads(complexfh, unpaired_reads)
    finally:
        # release the handles no matter how the processing ended
        for fh in out_handles:
            fh.close()
        bamfh.close()
    logging.info("Finished pairing reads")
def discordant_reads_to_breakpoints(index_dir, isize_dist_file,
                                    input_bam_file, output_file, trim_bp,
                                    max_read_length, homology_mismatches):
    """
    writes one tab-delimited line to 'output_file' for every candidate
    breakpoint of every discordant read pair found in 'input_bam_file'

    index_dir: directory that contains config.GENE_FEATURE_FILE and the
        reference fasta file (config.ALIGN_INDEX + ".fa")
    isize_dist_file: file from which the insert size distribution is loaded
    input_bam_file: BAM file with the discordant read pairs
    output_file: path of the text file that is (over)written
    trim_bp: when selecting the best matching exon for each read, we
        account for spurious overlap into adjacent exons by trimming the
        read by 'trim_bp'
    max_read_length: forwarded to extract_breakpoint_sequence()
    homology_mismatches: number of mismatches to tolerate while computing
        homology between chimeric breakpoint sequence and "wildtype"
        sequence

    returns config.JOB_SUCCESS.  every file handle opened here is closed
    again, even when an error interrupts processing.
    """
    # read insert size distribution
    # NOTE(review): assumes from_file() reads the whole file before
    # returning, since the handle is closed right after -- confirm
    with open(isize_dist_file) as isize_fh:
        isize_dist = InsertSizeDistribution.from_file(isize_fh)
    # open BAM alignment file
    bamfh = pysam.Samfile(input_bam_file, "rb")
    ref_fa = None
    outfh = None
    try:
        # build a lookup table to get genomic intervals from transcripts
        logging.debug("Reading gene information")
        gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
        tid_tx_map = build_tid_tx_map(bamfh, gene_file,
                                      rname_prefix=config.GENE_REF_PREFIX)
        # open the reference sequence fasta file
        ref_fasta_file = os.path.join(index_dir, config.ALIGN_INDEX + ".fa")
        ref_fa = pysam.Fastafile(ref_fasta_file)
        # iterate through read pairs
        outfh = open(output_file, "w")
        logging.debug("Parsing discordant reads")
        for r5p, r3p in parse_gene_discordant_reads(bamfh):
            # store pertinent read information in lightweight structure
            # called DiscordantRead object.  this departs from SAM format
            # into a custom read format
            dr5p = DiscordantRead.from_read(r5p)
            dr3p = DiscordantRead.from_read(r3p)
            # the serialized reads are identical for every breakpoint of
            # this pair, so build them once per pair
            read_info_5p = '|'.join(map(str, dr5p.to_list()))
            read_info_3p = '|'.join(map(str, dr3p.to_list()))
            # get gene information
            tx5p = tid_tx_map[r5p.rname]
            tx3p = tid_tx_map[r3p.rname]
            # given the insert size find the highest probability
            # exon junction breakpoint between the two transcripts
            isize_prob, breakpoints = \
                choose_best_breakpoints(r5p, r3p, tx5p, tx3p,
                                        trim_bp, isize_dist)
            # extract the sequence of the breakpoint along with the
            # number of homologous bases at the breakpoint between
            # chimera and wildtype genes
            for bp in breakpoints:
                exon_num_5p, tx_end_5p, exon_num_3p, tx_start_3p = bp
                (breakpoint_seq_5p, breakpoint_seq_3p,
                 homology_left, homology_right) = \
                    extract_breakpoint_sequence(
                        config.GENE_REF_PREFIX + tx5p.tx_name, tx_end_5p,
                        config.GENE_REF_PREFIX + tx3p.tx_name, tx_start_3p,
                        ref_fa, max_read_length, homology_mismatches)
                # write breakpoint information for each read to a file
                # NOTE(review): the 'name' column holds r5p.rname, the same
                # value used above as key into tid_tx_map -- confirm that
                # the read name was not intended here
                fields = [tx5p.tx_name, 0, tx_end_5p,
                          tx3p.tx_name, tx_start_3p, tx3p.tx_end,
                          r5p.rname,  # name
                          isize_prob,  # score
                          tx5p.strand, tx3p.strand,  # strand 1, strand 2
                          # user defined fields
                          exon_num_5p, exon_num_3p,
                          breakpoint_seq_5p, breakpoint_seq_3p,
                          homology_left, homology_right,
                          read_info_5p, read_info_3p]
                outfh.write('\t'.join(map(str, fields)) + '\n')
    finally:
        # cleanup: release whatever was opened, no matter how we got here
        if ref_fa is not None:
            ref_fa.close()
        if outfh is not None:
            outfh.close()
        bamfh.close()
    return config.JOB_SUCCESS