def sam_to_bam(input_fastq_files, input_sam_file, output_bam_file,
               quals, multihits, pe_sr_mode=False, softclip=True,
               keep_unmapped=True):
    """Convert a SAM file to BAM, re-ordering alignments against the FASTQ input.

    The alignments in ``input_sam_file`` are walked in the order produced by
    ``fix_sr_alignment_ordering`` (one FASTQ file) or
    ``fix_alignment_ordering`` (several FASTQ files), and every read that is
    not filtered out is written to ``output_bam_file``.

    Parameters:
        input_fastq_files: sequence of FASTQ paths; one path selects the
            single-read code path, anything else the multi-file path.
        input_sam_file: path of the SAM file to read.
        output_bam_file: path of the BAM file to create; the SAM file is
            used as its template.
        quals: quality-score format identifier.  When ``softclip`` is set and
            this differs from ``SANGER_FORMAT`` the FASTQ parser is asked to
            convert the qualities.
        multihits: threshold compared against each unmapped read's ``XM``
            tag to tell "unmapped" reads from "multimapping" ones.
        pe_sr_mode: forwarded unchanged to ``fix_alignment_ordering``.
        softclip: when true, ``soft_pad_read`` is applied to every read with
            its FASTQ record before filtering.
        keep_unmapped: when false, unmapped reads whose ``XM`` tag is below
            ``multihits`` are not written to the output.

    Returns:
        None.  Counts of fragments, unmapped reads and multimapping reads are
        reported through ``logging.debug``.
    """
    num_unmapped = 0
    num_multihits = 0
    num_frags = 0
    # setup fastq parsing
    if softclip and (quals != SANGER_FORMAT):
        kwargs = {"convert_quals": True, "qual_format": quals}
    else:
        kwargs = {"convert_quals": False}
    samfh = None
    bamfh = None
    fastq_handles = []
    fqiters = []
    try:
        samfh = pysam.Samfile(input_sam_file, "r")
        bamfh = pysam.Samfile(output_bam_file, "wb", template=samfh)
        # Keep the raw file objects: closing the parser alone does not
        # release the underlying file descriptor.
        for fq in input_fastq_files:
            fastq_fh = open(fq)
            fastq_handles.append(fastq_fh)
            fqiters.append(parse_fastq_record(fastq_fh, **kwargs))
        # handle single-read and paired-end
        if len(fqiters) == 1:
            reorder_func = fix_sr_alignment_ordering(samfh, fqiters[0])
        else:
            reorder_func = fix_alignment_ordering(samfh, fqiters, pe_sr_mode)
        # iterate through buffer
        for bufitems in reorder_func:
            num_frags += 1
            for bufitem in bufitems:
                for r in bufitem.reads:
                    # softclip uses the fastq record to replace the sequence
                    # and quality scores of the read
                    if softclip:
                        soft_pad_read(bufitem.fqrec, r)
                    # keep statistics of unmapped/multimapped reads and
                    # suppress output if 'keep_unmapped' is False
                    if r.is_unmapped:
                        # presumably the aligner reports the hit count in
                        # XM -- confirm against the aligner in use
                        xm_tag = r.opt('XM')
                        if xm_tag < multihits:
                            num_unmapped += 1
                            if not keep_unmapped:
                                continue
                        else:
                            num_multihits += 1
                    bamfh.write(r)
    finally:
        # Release every handle even when the conversion fails part-way.
        for fqiter in fqiters:
            fqiter.close()
        for fastq_fh in fastq_handles:
            fastq_fh.close()
        if bamfh is not None:
            bamfh.close()
        if samfh is not None:
            samfh.close()
    logging.debug("Found %d fragments", num_frags)
    logging.debug("\t%d unmapped reads", num_unmapped)
    logging.debug("\t%d multimapping (>%dX) reads", num_multihits, multihits)
def sam_stdin_to_bam(output_bam_file, input_fastq_file, multihits,
                     is_paired=True, keep_unmapped=True):
    """Convert SAM alignments read from "-" into a BAM file.

    The SAM stream is opened with the file name ``"-"`` (pysam's spelling of
    standard input), re-ordered against ``input_fastq_file`` and written to
    ``output_bam_file``.

    Parameters:
        output_bam_file: path of the BAM file to create; the SAM stream is
            used as its template.
        input_fastq_file: path of the FASTQ file handed to the re-ordering
            helper.
        multihits: threshold compared against each unmapped read's ``XM``
            tag to tell "unmapped" reads from "multimapping" ones.
        is_paired: when true, ``fix_pe_alignment_ordering`` is used and each
            item it yields is treated as a collection of read lists;
            otherwise ``fix_sr_alignment_ordering`` is used and each item is
            treated as a read list.
        keep_unmapped: when false, unmapped reads whose ``XM`` tag is below
            ``multihits`` are not written to the output.

    Returns:
        None.  Read counts are reported through ``logging``.
    """
    num_unmapped = 0
    num_multihits = 0
    samfh = None
    bamfh = None
    fqfh = None
    try:
        samfh = pysam.Samfile("-", "r")
        bamfh = pysam.Samfile(output_bam_file, "wb", template=samfh)
        fqfh = open(input_fastq_file)
        # NOTE(review): sam_to_bam in this file consumes
        # fix_sr_alignment_ordering items through ``.reads``/``.fqrec`` and
        # feeds it parsed FASTQ records -- confirm this function still
        # matches the helper's current interface.
        if is_paired:
            # Flatten the per-fragment grouping so both modes share one loop.
            read_groups = (reads
                           for pe_reads in fix_pe_alignment_ordering(
                               samfh, fqfh, is_paired=is_paired)
                           for reads in pe_reads)
        else:
            read_groups = fix_sr_alignment_ordering(samfh, fqfh)
        for reads in read_groups:
            for r in reads:
                if r.is_unmapped:
                    xm_tag = r.opt('XM')
                    if xm_tag < multihits:
                        num_unmapped += 1
                        if not keep_unmapped:
                            continue
                    else:
                        # Only reads at or above the threshold count as
                        # multimapping, consistent with sam_to_bam.
                        num_multihits += 1
                bamfh.write(r)
    finally:
        # Release every handle even when the conversion fails part-way.
        if fqfh is not None:
            fqfh.close()
        if bamfh is not None:
            bamfh.close()
        if samfh is not None:
            samfh.close()
    logging.debug("[SAMTOBAM] Filtered %d unmapped reads", num_unmapped)
    logging.debug("[SAMTOBAM] Found %d multimapping (>%d) reads",
                  num_multihits, multihits)
    logging.info("[SAMTOBAM] Finished converting SAM -> BAM")