def sam_to_bam_pesr(input_sam_file, input_fastq_file, output_bam_file):
    """Copy a SAM file to BAM, re-flagging each read as paired-end.

    Every SAM read is matched against the current FASTQ record by qname;
    the FASTQ iterator is advanced by at most one record per read, so the
    two inputs are expected to be in the same order.

    Args:
        input_sam_file: path of the SAM file to read.
        input_fastq_file: path of the FASTQ file the reads came from.
        output_bam_file: path of the BAM file to write; it is created
            using the input SAM file as its template.

    Returns:
        config.JOB_SUCCESS

    Raises:
        AssertionError: if a read's qname does not match the FASTQ record,
            or the FASTQ record's readnum is neither 1 nor 2.
        StopIteration: if the FASTQ input runs out of records (this
            includes an empty FASTQ file).
    """
    num_reads = 0
    fqfh = None
    bamfh = None
    samfh = pysam.Samfile(input_sam_file, "r")
    try:
        # keep the raw FASTQ handle so it can be closed explicitly below
        fqfh = open(input_fastq_file)
        fqiter = parse_fastq_record(fqfh)
        bamfh = pysam.Samfile(output_bam_file, "wb", template=samfh)
        # get first fastq record
        fqrec = next(fqiter)
        for r in samfh:
            # get next fastq record
            if r.qname != fqrec.qname:
                fqrec = next(fqiter)
            # TODO: eventually remove assert statement
            assert r.qname == fqrec.qname, \
                "SAM read %r does not match FASTQ record %r" % (r.qname,
                                                                fqrec.qname)
            # remove mate number from qname (drops the first character)
            r.qname = r.qname[1:]
            # pad read that has been trimmed
            soft_pad_read(fqrec, r)
            # reset paired-end flags for read
            r.is_paired = True
            if fqrec.readnum == 1:
                r.is_read1 = True
            elif fqrec.readnum == 2:
                r.is_read2 = True
            else:
                assert False, "unexpected readnum %r" % (fqrec.readnum,)
            # TODO: remove is_secondary bit here because this does not
            # make sense in the context of paired-end alignments
            r.is_secondary = False
            bamfh.write(r)
            num_reads += 1
    finally:
        # release every handle even when the loop above raises
        if bamfh is not None:
            bamfh.close()
        if fqfh is not None:
            fqfh.close()
        samfh.close()
    logging.debug("Found %d reads", num_reads)
    return config.JOB_SUCCESS
def sam_to_bam(input_fastq_files, input_sam_file, output_bam_file, quals,
               multihits, pe_sr_mode=False, softclip=True,
               keep_unmapped=True):
    """Write the alignments of a SAM file to a BAM file, fragment by fragment.

    Reads are pulled through fix_sr_alignment_ordering (exactly one FASTQ
    file) or fix_alignment_ordering (any other number), which yield groups
    of buffer items carrying a FASTQ record and its reads.

    NOTE(review): an identical definition of sam_to_bam appears later in
    this file and shadows this one at import time.

    Args:
        input_fastq_files: iterable of FASTQ file paths.
        input_sam_file: path of the SAM file to read.
        output_bam_file: path of the BAM file to write; it is created
            using the input SAM file as its template.
        quals: quality-score format; when softclip is true and this is
            not SANGER_FORMAT the FASTQ parser is asked to convert quals.
        multihits: threshold compared against the 'XM' tag of unmapped
            reads (presumably the aligner's multihit limit -- confirm).
        pe_sr_mode: forwarded to fix_alignment_ordering; ignored when a
            single FASTQ file is given.
        softclip: when true, soft_pad_read is applied to every read.
        keep_unmapped: when false, unmapped reads whose 'XM' tag is below
            multihits are not written. Unmapped reads at or above the
            threshold are counted as multihits and always written.

    Returns:
        None
    """
    num_unmapped = 0
    num_multihits = 0
    num_frags = 0
    fastq_handles = []
    fqiters = []
    bamfh = None
    samfh = pysam.Samfile(input_sam_file, "r")
    try:
        bamfh = pysam.Samfile(output_bam_file, "wb", template=samfh)
        # setup fastq parsing
        if softclip and (quals != SANGER_FORMAT):
            kwargs = {"convert_quals": True, "qual_format": quals}
        else:
            kwargs = {"convert_quals": False}
        # keep the raw file handles so they can be closed explicitly;
        # closing the parser iterators alone does not guarantee that
        for fq in input_fastq_files:
            fqfh = open(fq)
            fastq_handles.append(fqfh)
            fqiters.append(parse_fastq_record(fqfh, **kwargs))
        # handle single-read and paired-end
        if len(fqiters) == 1:
            reorder_func = fix_sr_alignment_ordering(samfh, fqiters[0])
        else:
            reorder_func = fix_alignment_ordering(samfh, fqiters, pe_sr_mode)
        # iterate through buffer
        for bufitems in reorder_func:
            num_frags += 1
            for bufitem in bufitems:
                for r in bufitem.reads:
                    # softclip uses the fastq record to replace the sequence
                    # and quality scores of the read
                    if softclip:
                        soft_pad_read(bufitem.fqrec, r)
                    # keep statistics of unmapped/multimapped reads and
                    # suppress output if 'keep_unmapped' is False
                    if r.is_unmapped:
                        xm_tag = r.opt('XM')
                        if xm_tag < multihits:
                            num_unmapped += 1
                            if not keep_unmapped:
                                continue
                        else:
                            num_multihits += 1
                    bamfh.write(r)
    finally:
        # release every handle even when processing raises
        for fqiter in fqiters:
            fqiter.close()
        for fqfh in fastq_handles:
            fqfh.close()
        if bamfh is not None:
            bamfh.close()
        samfh.close()
    logging.debug("Found %d fragments", num_frags)
    logging.debug("\t%d unmapped reads", num_unmapped)
    logging.debug("\t%d multimapping (>%dX) reads", num_multihits, multihits)
def sam_to_bam(input_fastq_files, input_sam_file, output_bam_file, quals,
               multihits, pe_sr_mode=False, softclip=True,
               keep_unmapped=True):
    """Write the alignments of a SAM file to a BAM file, fragment by fragment.

    Reads are pulled through fix_sr_alignment_ordering (exactly one FASTQ
    file) or fix_alignment_ordering (any other number), which yield groups
    of buffer items carrying a FASTQ record and its reads.

    NOTE(review): an identical definition of sam_to_bam appears earlier in
    this file; this later definition is the one callers get.

    Args:
        input_fastq_files: iterable of FASTQ file paths.
        input_sam_file: path of the SAM file to read.
        output_bam_file: path of the BAM file to write; it is created
            using the input SAM file as its template.
        quals: quality-score format; when softclip is true and this is
            not SANGER_FORMAT the FASTQ parser is asked to convert quals.
        multihits: threshold compared against the 'XM' tag of unmapped
            reads (presumably the aligner's multihit limit -- confirm).
        pe_sr_mode: forwarded to fix_alignment_ordering; ignored when a
            single FASTQ file is given.
        softclip: when true, soft_pad_read is applied to every read.
        keep_unmapped: when false, unmapped reads whose 'XM' tag is below
            multihits are not written. Unmapped reads at or above the
            threshold are counted as multihits and always written.

    Returns:
        None
    """
    num_unmapped = 0
    num_multihits = 0
    num_frags = 0
    fastq_handles = []
    fqiters = []
    bamfh = None
    samfh = pysam.Samfile(input_sam_file, "r")
    try:
        bamfh = pysam.Samfile(output_bam_file, "wb", template=samfh)
        # setup fastq parsing
        if softclip and (quals != SANGER_FORMAT):
            kwargs = {"convert_quals": True, "qual_format": quals}
        else:
            kwargs = {"convert_quals": False}
        # keep the raw file handles so they can be closed explicitly;
        # closing the parser iterators alone does not guarantee that
        for fq in input_fastq_files:
            fqfh = open(fq)
            fastq_handles.append(fqfh)
            fqiters.append(parse_fastq_record(fqfh, **kwargs))
        # handle single-read and paired-end
        if len(fqiters) == 1:
            reorder_func = fix_sr_alignment_ordering(samfh, fqiters[0])
        else:
            reorder_func = fix_alignment_ordering(samfh, fqiters, pe_sr_mode)
        # iterate through buffer
        for bufitems in reorder_func:
            num_frags += 1
            for bufitem in bufitems:
                for r in bufitem.reads:
                    # softclip uses the fastq record to replace the sequence
                    # and quality scores of the read
                    if softclip:
                        soft_pad_read(bufitem.fqrec, r)
                    # keep statistics of unmapped/multimapped reads and
                    # suppress output if 'keep_unmapped' is False
                    if r.is_unmapped:
                        xm_tag = r.opt('XM')
                        if xm_tag < multihits:
                            num_unmapped += 1
                            if not keep_unmapped:
                                continue
                        else:
                            num_multihits += 1
                    bamfh.write(r)
    finally:
        # release every handle even when processing raises
        for fqiter in fqiters:
            fqiter.close()
        for fqfh in fastq_handles:
            fqfh.close()
        if bamfh is not None:
            bamfh.close()
        samfh.close()
    logging.debug("Found %d fragments", num_frags)
    logging.debug("\t%d unmapped reads", num_unmapped)
    logging.debug("\t%d multimapping (>%dX) reads", num_multihits, multihits)