def export_bam(outbam, read1, read2, quiet=False):
    """Write FASTQ reads to ``outbam`` as BAM records.

    ``read1`` (and ``read2`` when given) must provide ``fetch(quiet=...)``,
    yielding objects with ``name``, ``seq`` and ``qual`` attributes.  When
    ``read2`` is given the two inputs are walked in lockstep and each pair is
    written as two records flagged paired/read1 and paired/read2; otherwise
    every read from ``read1`` is written as a single plain record.

    Progress reporting for the second input is always suppressed
    (``quiet=True``), as in the original implementation.
    """
    if read2:
        pairs = itertools.izip(read1.fetch(quiet=quiet),
                               read2.fetch(quiet=True))
    else:
        pairs = ((r1, None) for r1 in read1.fetch(quiet=quiet))

    for r1, r2 in pairs:
        record1 = pysam.AlignedRead()
        record1.qname = r1.name
        record1.seq = r1.seq
        record1.qual = r1.qual

        if r2:
            record1.is_paired = True
            record1.is_read1 = True

            record2 = pysam.AlignedRead()
            # Both mates are written under the first read's name.
            record2.qname = r1.name
            # The mate must carry its own bases and qualities; the previous
            # code copied them from r1, so read 2 data was never written.
            record2.seq = r2.seq
            record2.qual = r2.qual
            record2.is_paired = True
            record2.is_read2 = True

        outbam.write(record1)
        if r2:
            outbam.write(record2)
def read_to_unmapped(read, ref=None):
    '''
    Converts a read from mapped to unmapped.

    Builds and returns a new pysam.AlignedRead carrying the name, sequence
    and qualities of `read`, flagged as unmapped with all position fields
    reset.  If the original read was mapped to the reverse strand, the
    sequence is reverse-complemented and the qualities are reversed.

    Sets the 'ZR' tag to indicate the original ref/pos/cigar (if ref is
    passed); otherwise the new read gets an empty tag list.  Tags of the
    original read are not copied.
    '''
    newread = pysam.AlignedRead()

    # Always bind `tags`: previously it was only defined when `ref` was
    # given, so calling with the default ref=None raised UnboundLocalError.
    tags = []
    if ref:
        tags = [('ZR', '%s:%s:%s' % (ref, read.pos, cigar_tostr(read.cigar)))]

    newread.is_unmapped = True
    newread.mapq = 0
    newread.tlen = 0
    newread.pos = -1
    newread.pnext = -1
    newread.rnext = -1
    newread.tid = -1

    newread.qname = read.qname

    if read.is_paired:
        newread.is_paired = True

    if not read.is_unmapped and read.is_reverse:
        newread.seq = ngsutils.support.revcomp(read.seq)
        newread.qual = read.qual[::-1]
    else:
        newread.seq = read.seq
        newread.qual = read.qual

    newread.tags = tags

    return newread
def makeNewRead(trim, loff, newCigar, read, trim2=None, MIPid=None):
    """Return a trimmed, single-end copy of ``read``.

    trim, trim2 -- slice bounds applied to the sequence and qualities.  When
        ``trim`` is None or positive the slice is ``[trim:trim2]``; otherwise
        (zero or negative) it is ``[trim2:trim]``.
    loff        -- offset added to the original alignment position.
    newCigar    -- CIGAR assigned to the new read.
    MIPid       -- if not None, stored as the 'ZM' tag, replacing any 'ZM'
        tag on the original read.

    The 'NM' and 'MD' tags are always dropped; the pairing flags are cleared
    and the mate fields are left unset.
    """
    a = pysam.AlignedRead()
    a.qname = read.qname
    a.pos = read.pos + loff
    # Test for None first: ordering None against an int is a TypeError on
    # Python 3 and only worked by accident on Python 2.
    if trim is None or trim > 0:
        a.seq = read.seq[trim:trim2]
        a.qual = read.qual[trim:trim2]
    else:
        a.seq = read.seq[trim2:trim]
        a.qual = read.qual[trim2:trim]
    a.flag = read.flag
    # Convert PE to SR
    a.is_paired = False
    a.is_read1 = False
    a.is_read2 = False
    a.rname = read.rname
    a.mapq = read.mapq
    a.cigar = newCigar

    tags = []
    for key, value in read.tags:
        if key in ("NM", "MD"):
            continue
        if key == "ZM" and MIPid is not None:
            # Superseded by the MIPid appended below.
            continue
        tags.append((key, value))
    if MIPid is not None:
        tags.append(("ZM", MIPid))
    a.tags = tags
    return a
def read_gen(seq, qual, MD, cigar=None):
    """Build a pysam.AlignedRead from a sequence, qualities, MD tag and CIGAR.

    The returned read has only ``seq``, ``qual``, a single ('MD', MD) tag and
    ``cigar`` assigned; every other field keeps its pysam default.
    """
    aligned = pysam.AlignedRead()
    aligned.seq = seq
    aligned.qual = qual
    md_tag = ('MD', MD)
    aligned.tags = [md_tag]
    aligned.cigar = cigar
    return aligned
def store(self, qname, N_mismatch, FR, refname, strand, pos, cigar, original_BS, methy, STEVE, rnext = -1, pnext = -1, qual = None, output_genome = None, rrbs = False, my_region_serial = -1, my_region_start = 0, my_region_end = 0):
    """Write one alignment to ``self.f`` in the format given by ``self.format``.

    BS_SEEKER1: writes a tab-separated text line.  The read is first cut to
    the span returned by ``get_read_start_end_and_genome_length(cigar)``
    (removing soft-clipped bases), and the position is written 1-based,
    zero-padded to 10 digits.  With ``rrbs`` the three region fields are
    appended.

    BAM / SAM: builds a ``pysam.AlignedRead`` and writes it.  For strand '-'
    the sequence is reverse-complemented, the CIGAR list is reversed and the
    flag is 0x10.  ``refname`` (and ``rnext`` unless it is -1) are translated
    through ``self.chrom_ids``.  Tags XO/XS/NM/XM/XG are always written;
    YR/YS/YE are added when ``rrbs`` is true.

    Any other value of ``self.format`` writes nothing.
    """
    if self.format == BS_SEEKER1:
        # remove the soft clipped bases from the read
        # this is done for backwards compatibility with the old format
        r_start, r_end, _ = get_read_start_end_and_genome_length(cigar)
        original_BS = original_BS[r_start : r_end]

        if not rrbs:
            self.f.write('%s\t%2d\t%s\t%s%s%s\t%s\t%s\t%s\t%d\n' % (qname, N_mismatch, FR, refname, strand, str(pos+1).zfill(10), output_genome, original_BS, methy, STEVE))
        else:
            self.f.write('%s\t%2d\t%s\t%s%s%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n' % (qname, N_mismatch, FR, refname, strand, str(pos+1).zfill(10), output_genome, original_BS, methy, STEVE, my_region_serial, my_region_start, my_region_end))

    elif self.format == BAM or self.format == SAM:
        # A leftover pdb.set_trace() debugger breakpoint used to sit here and
        # would halt every BAM/SAM write; it has been removed.
        a = pysam.AlignedRead()
        a.qname = qname
        a.seq = original_BS if strand == '+' else reverse_compl_seq(original_BS)
        a.flag = 0x10 if strand == '-' else 0
        a.tid = self.chrom_ids[refname]
        a.pos = pos
        a.mapq = 255
        a.cigar = cigar if strand == '+' else list(reversed(cigar))
        a.rnext = rnext if rnext == -1 else self.chrom_ids[rnext]
        a.pnext = pnext
        a.qual = qual

        tags = (('XO', FR),
                ('XS', STEVE),
                ('NM', N_mismatch),
                ('XM', methy),
                ('XG', output_genome))
        if rrbs:
            tags += (('YR', my_region_serial),
                     ('YS', my_region_start),
                     ('YE', my_region_end))
        a.tags = tags

        self.f.write(a)
def write_read(bam_file, name, seq, qual, tid, pos, mapq=0, cigar=None, reverse=False, paired=False, read1=True, mate_tid=None, mate_pos=None, mate_reverse=False):
    """
    Creates an alignedRead object and writes it to the corresponding bam file.

    If cigar is unspecified, the alignment is defaulted to a perfect
    alignment, i.e. a single match operation spanning ``len(seq)``.

    A ``pos`` of -1 marks the read as unmapped.  ``is_read1`` / ``is_read2``
    are always set from ``read1``.  When ``paired`` is true the read is
    flagged as paired and the mate fields are filled in: ``mate_tid`` and
    ``mate_pos`` are stored whenever they are not None (0 is a valid
    reference index and a valid position), a ``mate_pos`` of -1 marks the
    mate as unmapped, and ``mate_reverse`` sets the mate strand flag.
    """
    r = pysam.AlignedRead()
    r.qname = name
    r.seq = seq
    r.qual = qual
    r.tid = tid
    r.pos = pos
    r.mapq = mapq
    if pos == -1:
        r.is_unmapped = True
    if cigar:
        r.cigar = cigar
    else:
        r.cigar = [(0, len(seq))]
    r.is_reverse = reverse
    r.is_read1 = read1
    r.is_read2 = not read1
    if paired:
        r.is_paired = True
        # Compare against None explicitly: the previous truthiness tests
        # silently dropped a mate on the first reference (tid 0) or at the
        # first base (pos 0).
        if mate_tid is not None:
            r.mrnm = mate_tid
        if mate_pos is not None:
            r.mpos = mate_pos
        if mate_pos == -1:
            r.mate_is_unmapped = True
        r.mate_is_reverse = bool(mate_reverse)
    bam_file.write(r)
def generateSamInfo(self, sinfo, seq, cig, startpos, refname, qual):
    """Build a new unpaired pysam.AlignedRead derived from ``sinfo``.

    The query name and mapping quality come from ``sinfo``; sequence, CIGAR
    string, start position, reference id and qualities come from the
    arguments (spaces are removed from ``qual``).  The flag is 0 and the mate
    fields are cleared.  Of the original tags only RG, X0, AS, XS, YS and YT
    are carried over, and only when ``self.__getTag__`` returns a truthy
    value for them.
    """
    rec = pysam.AlignedRead()
    rec.rname = refname
    rec.qname = sinfo.qname
    rec.seq = seq
    rec.flag = 0
    rec.pos = startpos
    rec.mapq = sinfo.mapq
    rec.cigarstring = cig
    rec.rnext = -1
    rec.pnext = -1
    rec.tlen = 0
    rec.qual = qual.strip().replace(" ", "")

    kept = []
    for key in ("RG", "X0", "AS", "XS", "YS", "YT"):
        value = self.__getTag__(sinfo, key)
        if value:
            kept.append((key, value))
    rec.tags = kept
    return rec
def convert_read(r, transcript_tid_map, library_type):
    """Convert a transcriptome-aligned read to genome coordinates.

    Unmapped reads are returned as a plain copy.  Otherwise the reference,
    position and CIGAR are translated through ``transcript_tid_map``; for
    transcripts on the negative strand the sequence, qualities, strand flag
    and MD tag are flipped as well.  Existing 'XS' and 'NH' tags are dropped
    and a fresh 'XS' strand tag is appended.
    """
    if r.is_unmapped:
        # return copy of original read
        return copy_read(r)

    # copy tags, discarding the ones that are no longer valid
    tagdict = collections.OrderedDict(r.tags)
    for stale in ('XS', 'NH'):
        tagdict.pop(stale, None)

    # transcript reference -> genome reference, strand and exon layout
    genome_tid, negstrand, exons = transcript_tid_map[r.tid]
    # genomic start position of the alignment
    newpos, eindex, testart, toffset = convert_pos(r.pos, negstrand, exons)
    # translate the transcript cigar into a genomic cigar
    newcigar, alen, spliced = convert_cigar(r.cigar, negstrand, exons,
                                            eindex, testart, toffset)

    if not negstrand:
        is_reverse = r.is_reverse
        seq = r.seq
        qual = r.qual
    else:
        # position becomes the left end of the alignment
        newpos = newpos - alen + 1
        is_reverse = not r.is_reverse
        seq = DNA_reverse_complement(r.seq)
        qual = r.qual[::-1] if r.qual is not None else None
        if 'MD' in tagdict:
            tagdict['MD'] = reverse_complement_MD_tag(tagdict['MD'])

    # add XS tag
    tagdict['XS'] = get_read_strand(r.is_read2, is_reverse, negstrand,
                                    library_type)

    # assemble the converted read
    a = pysam.AlignedRead()
    a.qname = r.qname
    a.flag = r.flag
    a.seq = seq
    a.qual = qual
    a.is_reverse = is_reverse
    a.tid = genome_tid
    a.pos = newpos
    a.cigar = newcigar
    a.mapq = r.mapq
    a.rnext = r.rnext
    a.pnext = r.pnext
    a.tlen = r.tlen
    a.tags = tuple(tagdict.items())
    return a
def writeBAMEntry(outfile, chrom, outputDict, readlength):
    """Write the alignment described by ``outputDict`` to ``outfile``.

    The query name is the part of ``outputDict["readID"]`` before the first
    "/".  An entry with a "startL" key is written as a spliced alignment
    (match / skip / match) with an XS tag holding the sense; otherwise it is
    a single match of ``readlength`` bases at ``outputDict["start"]``.  A
    "pairID" of "1" or "2" sets the read1/read2 and proper-pair flags, and a
    "mismatch" entry is turned into an MD tag via ``getMismatches``.

    The record is written repeatedly, starting from a count of
    ``1.0 / weight`` (weight defaults to 1.0) and decrementing by one per
    write while the count is positive.  Returns the number of records
    written.
    """
    record = pysam.AlignedRead()
    record.qname = outputDict["readID"].split("/")[0]
    if outputDict["sense"] == "-":
        record.is_reverse = True

    record.rname = outfile.references.index(chrom)

    tags = []
    if "startL" in outputDict:
        left_start = outputDict["startL"]
        left_stop = outputDict["stopL"]
        right_start = outputDict["startR"]
        right_stop = outputDict["stopR"]
        record.pos = left_start
        record.cigar = [(0, left_stop - left_start + 1),
                        (3, right_start - left_stop - 1),
                        (0, right_stop - right_start + 1)]
        tags.append(("XS", outputDict["sense"]))
    else:
        record.pos = outputDict["start"]
        record.cigar = [(0, readlength)]

    if "pairID" in outputDict:
        mate = outputDict["pairID"]
        if mate == "1":
            record.is_read1 = True
            record.is_proper_pair = True
        elif mate == "2":
            record.is_read2 = True
            record.is_proper_pair = True

    if "mismatch" in outputDict:
        md_value = getMismatches(outputDict["mismatch"])
        if md_value:
            tags.append(("MD", md_value))

    if tags:
        record.tags = tags

    written = 0
    remaining = 1.0 / outputDict.get("weight", 1.0)
    while remaining > 0:
        outfile.write(record)
        remaining -= 1.0
        written += 1

    return written
def unmapped_aligned_read(qname):
    """Return a placeholder unmapped pysam.AlignedRead named ``qname``.

    The flag is 0x4, all reference/position fields are -1, mapq and tlen are
    0, there is no CIGAR, and both sequence and qualities are '*'.
    """
    placeholder = pysam.AlignedRead()
    # Applied in this order, one attribute at a time.
    field_values = (
        ('qname', qname),
        ('flag', 0x4),
        ('rname', -1),
        ('pos', -1),
        ('mapq', 0),
        ('cigar', None),
        ('rnext', -1),
        ('pnext', -1),
        ('tlen', 0),
        ('seq', '*'),
        ('qual', '*'),
    )
    for field, value in field_values:
        setattr(placeholder, field, value)
    return placeholder
def MakeBam(self, ref_name, aln):
    """Build a pysam.AlignedRead for ``self.read`` from the alignment ``aln``.

    Qualities are the ``aln.begin:aln.end`` slice of ``self.sam_quality()``
    joined into one string; the position is ``aln.begin + 1`` and the CIGAR
    comes from ``aln.bam_cigar()``.  The flag is 0, mapq is 255 and the tags
    are fixed to NM=1 / RG="L1".  ``ref_name`` is accepted but not used (the
    reference is never set on the record).
    """
    window = slice(aln.begin, aln.end)
    qualities = ''.join(self.sam_quality()[window])

    record = pysam.AlignedRead()
    record.qname = self.read.name
    record.seq = self.read.seq.tostring()
    record.flag = 0
    record.pos = aln.begin + 1
    record.mapq = 255
    record.cigar = aln.bam_cigar()
    record.qual = qualities
    record.tags = (("NM", 1), ("RG", "L1"))
    return record
def copy_read(r):
    """Return a new pysam.AlignedRead with the same field values as ``r``.

    Copies qname, seq, flag, tid, pos, mapq, cigar, rnext, pnext, isize and
    qual one attribute at a time in that order, then a list copy of the tags.
    """
    duplicate = pysam.AlignedRead()
    copied_fields = ('qname', 'seq', 'flag', 'tid', 'pos', 'mapq', 'cigar',
                     'rnext', 'pnext', 'isize', 'qual')
    for field in copied_fields:
        setattr(duplicate, field, getattr(r, field))
    duplicate.tags = list(r.tags)
    return duplicate
def mapping_from_line(line):
    """Build a pysam.AlignedRead from one SAM text line.

    The line is parsed with ``sam.parse_line``; only the strand, SEQ, QUAL
    and CIGAR fields plus the optional 'Xs', 'Xq' and 'Xw' tags are
    transferred to the new read.
    """
    mapping = pysam.AlignedRead()
    fields = sam.parse_line(line)

    if fields['strand'] == '-':
        mapping.is_reverse = True
    mapping.seq = fields['SEQ']
    mapping.qual = fields['QUAL']
    mapping.cigarstring = fields['CIGAR']

    # This should obviously be made more general.
    for name in ('Xs', 'Xq', 'Xw'):
        if name not in fields:
            continue
        mapping.tags = mapping.tags + [(name, fields[name])]

    return mapping
def doWork(args):
    """ Main wrapper

    Converts the BLAST XML file ``args.blast`` into the SAM file
    ``args.samfile``.  The SAM header is built from ``args.ref`` via
    ``parseReferences``; every HSP of every alignment of every BLAST record
    becomes one SAM record.  When the two entries of ``hsp.frame`` differ,
    the query is reverse-complemented, flagged 0x10, anchored at
    ``hsp.sbjct_end`` and given the reversed CIGAR.

    Both files are closed on exit, including when an error interrupts the
    conversion.
    """
    # make sam header
    header = {'HD': {'VN': '1.0'}}
    headers, header_lookup = parseReferences(args.ref)
    header['SQ'] = headers

    # open outfile
    outfile = pysam.Samfile(args.samfile, "wh", header=header)
    try:
        # The handle must stay open for the whole loop; it is closed by the
        # `with` block afterwards (it used to be leaked).
        with open(args.blast) as blast_handle:
            for blast_record in blastxml.parse(blast_handle):
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        read = pysam.AlignedRead()
                        read.qname = blast_record.query
                        read.flag = 0

                        # drop alignment gaps from the query string
                        dna = hsp.query.replace('-', '')
                        # represented as tuple of 2-tuples
                        cigar = makeCigar(hsp, blast_record.query_letters)

                        # fix up reverse complementing
                        if hsp.frame[1] ^ hsp.frame[0]:
                            read.seq = str(Seq(dna).reverse_complement())
                            read.flag |= 0x10
                            read.pos = hsp.sbjct_end - 1
                            read.cigar = cigar[::-1]
                        else:
                            read.seq = dna
                            read.pos = hsp.sbjct_start - 1
                            read.cigar = cigar

                        # index to list of headers
                        read.rname = header_lookup[alignment.hit_def]
                        read.mapq = 255  # phred scaled probability score
                        read.mrnm = -1  # index of the mate
                        read.mpos = -1  # position of the mate
                        read.tlen = 0  # insert size of the mates
                        outfile.write(read)
    finally:
        outfile.close()
def create_new_read(new_seq, new_qual, read1, read2):
    """Create the unmapped, merged read for the pair ``read1`` / ``read2``.

    The new read is named after ``read1`` with an "M_" prefix (added only if
    missing) and carries ``new_seq`` / ``new_qual``.  It is QC-failed when
    both inputs are, or when ``new_seq`` is shorter than the module-level
    ``min_length`` (which also adds "L" to the ZQ tag).

    Tags are merged as follows: NM and MD from ``read2`` are dropped; a tag
    present on both reads with equal values, or only on one read, is kept;
    conflicting ZQ values are merged into a sorted string of their distinct
    characters; any other conflicting tag is dropped, with a one-time warning
    on stderr per tag name (tracked in the module-level ``notified`` set).
    """
    global notified

    merged = pysam.AlignedRead()
    if read1.qname.startswith("M_"):
        merged.qname = read1.qname
    else:
        merged.qname = "M_" + read1.qname
    merged.seq = new_seq
    merged.qual = new_qual
    merged.is_unmapped = True
    merged.pos = -1
    merged.mpos = -1
    merged.is_qcfail = read1.is_qcfail and read2.is_qcfail

    htags = dict(read1.tags) if read1.tags is not None else {}

    if len(new_seq) < min_length:
        merged.is_qcfail = True
        htags["ZQ"] = htags.get("ZQ", "") + "L"

    seen = set()
    combined = []
    if read2.tags is not None:
        for tag, value in read2.tags:
            seen.add(tag)
            if tag in ("NM", "MD"):
                continue
            if tag not in htags or value == htags[tag]:
                # new tag, or same tag with the same value
                combined.append((tag, value))
            elif tag == "ZQ":
                # union of the QC characters of both reads
                combined.append((tag, "".join(sorted(set(value + htags[tag])))))
            elif tag not in notified:
                sys.stderr.write(
                    "Do not know how to combine %s BAM tags. Information of one of the reads will get lost during merging.\n"
                    % tag)
                notified.add(tag)

    # tags that only the first read carries
    for tag, value in htags.items():
        if tag not in seen:
            combined.append((tag, value))

    merged.tags = combined
    return merged
def _cleanup_record(record):
    """Marks a BAM record as unmapped, clearing relevant fields and/or
    setting fields to match those of the mate (if mapped). An updated
    (possibly new) record is returned.
    """
    if not record.cigar:
        # No CIGAR to get rid of; the record can be updated in place.
        record.mapq = 0
        if record.mate_is_unmapped:
            record.rnext = -1
            record.pnext = -1
        record.tid = record.rnext
        record.pos = record.pnext
        record.tlen = 0
        return record

    # A fresh record has to be built here, since current versions of Pysam
    # offer no way of clearing the CIGAR string of an existing record.
    cleaned = pysam.AlignedRead()
    cleaned.qname = record.qname
    cleaned.flag = record.flag
    cleaned.seq = record.seq
    cleaned.qual = record.qual
    cleaned.tags = record.tags

    if record.mate_is_unmapped:
        cleaned.rnext = -1
        cleaned.pnext = -1
    else:
        cleaned.rnext = record.rnext
        cleaned.pnext = record.pnext
    cleaned.tid = cleaned.rnext

    # .pos is deliberately assigned TWICE: current versions of pysam
    # re-calculate the record's bin BEFORE storing the new position, i.e.
    # from the old pos value, so a single assignment leaves a stale bin.
    cleaned.pos = cleaned.pnext  # Update 1 of 2
    cleaned.pos = cleaned.pnext  # Update 2 of 2

    return cleaned
def makeSAMrec(pos, seqinfo, cig, orient, refname, single = False, grpident = None):
    """Build a pysam.AlignedRead for ``seqinfo`` aligned at ``pos``.

    ``cleanupCigar`` supplies the final CIGAR string and position plus the
    number of bases to remove from the front and back of the read.  For
    orientation '-' the sequence is reverse-complemented and the qualities
    reversed before trimming.  Qualities are only set when ``seqinfo`` has
    letter annotations.  With ``single`` an RG tag "GROUP-<id>" is added,
    using ``grpident`` or, when that is None, the part of ``seqinfo.id``
    before the first "_".  ``refname`` is accepted but not used; the
    reference index is always 0.
    """
    newcig, pos, rmfront, rmback = cleanupCigar(pos, cig, len(seqinfo.seq))
    on_minus = (orient == '-')

    rec = pysam.AlignedRead()
    rec.tid = 0
    rec.rname = 0
    rec.qname = seqinfo.id
    rec.flag = 0x00

    rec.seq = str(seqinfo.seq.reverse_complement() if on_minus else seqinfo.seq)
    if rmback:
        rec.seq = rec.seq[:-rmback]
    if rmfront:
        rec.seq = rec.seq[rmfront:]

    if seqinfo.letter_annotations:
        # last line of the FASTQ rendering is the quality string
        quals = seqinfo.format("fastq").strip().split("\n")[-1]
        if on_minus:
            quals = quals[::-1]
        if rmback:
            quals = quals[:-rmback]
        if rmfront:
            quals = quals[rmfront:]
        rec.qual = quals

    rec.pos = pos
    rec.cigarstring = newcig
    rec.rnext = -1
    rec.pnext = -1
    rec.tlen = 0

    if single:
        group = seqinfo.id.split("_")[0] if grpident is None else grpident
        rec.tags = [("RG", "GROUP-%s" % (group),)]
    return rec
def generateSamInfo(self, refnamestr, sinfo, seq, cig, startpos, refname, qual, readgroupID):
    """Build a new unpaired pysam.AlignedRead derived from ``sinfo``.

    The query name is ``sinfo.qname`` suffixed with "_<refnamestr>"; the
    mapping quality comes from ``sinfo`` and the remaining alignment fields
    from the arguments (spaces are removed from ``qual``).  The RG tag is
    always ``readgroupID``; X0, AS, XS, YS and YT are carried over when
    ``self.__getTag__`` returns a truthy value for them.
    """
    rec = pysam.AlignedRead()
    rec.rname = refname
    rec.qname = "%s_%s" % (sinfo.qname, refnamestr)
    rec.seq = seq
    rec.flag = 0x00
    rec.pos = startpos
    rec.mapq = sinfo.mapq
    rec.cigarstring = cig
    rec.rnext = -1
    rec.pnext = -1
    rec.tlen = 0
    rec.qual = qual.strip().replace(" ", "")

    # DLS: force to use our regroup information instead of what was
    # previously filled in..
    kept = [("RG", readgroupID,)]
    for key in ("X0", "AS", "XS", "YS", "YT"):
        value = self.__getTag__(sinfo, key)
        if value:
            kept.append((key, value,))
    rec.tags = kept
    return rec
def main():
    """Build duplex consensus reads from a BAM file and write them out.

    Parses the command line, then walks ``--infile`` in file order, grouping
    consecutive reads that share an alignment position.  Within a group each
    read is keyed by the text before the first ":" of its query name; a key
    is matched with the key obtained by swapping its first ``--barcode_length``
    characters with the remainder, and the two sequences are passed to
    ``dcs_maker``.  Resulting consensus reads are written to ``--outfile``
    and to two FASTQ files opened with ``fastq_open``; consensus reads whose
    partner never shows up are written next to a placeholder record at the
    end.  Summary counts go to stderr.
    """
    # Parameters to be input.
    parser = ArgumentParser()
    parser.add_argument("--infile",
                        action="store",
                        dest="infile",
                        help="input BAM file",
                        required=True)
    parser.add_argument("--outfile",
                        action="store",
                        dest="outfile",
                        help="output BAM file",
                        required=True)
    parser.add_argument(
        '--Ncutoff',
        type=float,
        default=1.0,
        dest='Ncutoff',
        help="Maximum percentage of Ns allowed in a consensus [1.0]")
    parser.add_argument(
        '--readlength',
        type=int,
        default=84,
        dest='read_length',
        help="Length of the input read that is being used. [84]")
    parser.add_argument(
        '--barcode_length',
        type=int,
        default=12,
        dest='blength',
        help=
        'Length of the duplex tag sequence. Should match the value in tag_to_header.[12]'
    )
    parser.add_argument(
        '--read_out',
        type=int,
        default=1000000,
        dest='rOut',
        help=
        'How often you want to be told what the program is doing. [1000000]')
    parser.add_argument('--gzip-fqs',
                        action="store_true",
                        default=False,
                        dest='gzip_fastqs',
                        help='Output gzipped fastqs [False]')
    o = parser.parse_args()

    # Initialization of all global variables, main input/output files, and main iterator and dictionaries.
    in_bam = pysam.Samfile(o.infile, "rb")  # Open the input BAM file
    out_bam = pysam.Samfile(o.outfile, "wb",
                            template=in_bam)  # Open the output BAM file
    fastq_file1 = fastq_open(o.outfile, o.gzip_fastqs, 'r1')
    fastq_file2 = fastq_open(o.outfile, o.gzip_fastqs, 'r2')

    read_num = 0  # Reads consumed from the input
    duplexes_made = 0  # Successful dcs_maker calls
    uP = 0  # Consensus reads written without a partner
    nC = 0  # Consensus reads rejected by the N filter

    file_done = False  # Initialize end of file bool
    finished = False  # NOTE(review): never read in this function
    read_one = True  # True when first_read still has to be added to read_dict

    bam_entry = in_bam.fetch(until_eof=True)  # Initialize the iterator
    first_read = bam_entry.next()  # Get the first read
    read_dict = {}  # Initialize the read dictionary
    first_tag = first_read.qname.split(":")[0]
    # The first read's qualities and cigar are reused for EVERY output record.
    qual_score = first_read.qual  # Set a dummy quality score
    consensus_dict = {}  # Consensus reads waiting for their partner, by tag
    cig_dum = first_read.cigar  # set a dummy cigar score

    # Start going through the input BAM file, one position at a time.
    for line in bam_entry:
        # Reinitialize first line
        read_num += 1
        if read_one is True and first_read.is_unmapped is False:
            # Entry layout: [flag, rname, pos, mrnm, mpos, isize, seq]
            read_dict[first_tag] = [
                first_read.flag, first_read.rname, first_read.pos,
                first_read.mrnm, first_read.mpos, first_read.isize,
                first_read.seq
            ]
            read_one = False

        # Collect every read at the same position as first_read.  The loop
        # pulls further reads from the same iterator the enclosing `for`
        # uses.
        while line.pos == first_read.pos and file_done is False:
            tag = line.qname.split(":")[0]  # Extract the barcode
            # Add the sequence to the read dictionary
            if line.is_unmapped is False:
                read_dict[tag] = [
                    line.flag, line.rname, line.pos, line.mrnm, line.mpos,
                    line.isize, line.seq
                ]
            # NOTE(review): the bare except treats ANY error from the
            # iterator as end-of-file, not only StopIteration.
            try:  # Keep StopIteration error from happening
                line = bam_entry.next()  # Iterate the line
                read_num += 1
            except:
                file_done = True  # Tell the program that it has reached the end of the file
                read_num += 1
            if read_num % o.rOut == 0:
                sys.stderr.write("%s reads processed\n" % read_num)
        else:
            # Runs once the while condition is false (the loop has no break).
            # Send reads to dcs_maker
            first_read = line  # Store the present line for the next group of lines
            first_tag = first_read.qname.split(":")[0]
            read_one = True
            dict_keys = read_dict.keys()  # NOTE(review): never read afterwards

            for dict_tag in read_dict.keys(
            ):  # Extract sequences to send to the dcs_maker
                # Partner tag: the two halves of the tag swapped.
                switch_tag = dict_tag[o.blength:] + dict_tag[:o.blength]

                # NOTE(review): the bare `except: pass` below hides every
                # error raised in this block -- presumably it is meant for
                # the KeyError raised when switch_tag (or an already deleted
                # dict_tag) is missing from read_dict; confirm and narrow.
                try:
                    consensus = dcs_maker(
                        [read_dict[dict_tag][6], read_dict[switch_tag][6]],
                        o.read_length)
                    duplexes_made += 1
                    # Filter out consensuses with too many Ns in them
                    # NOTE(review): if `consensus` is a str and true division
                    # is not enabled for this module, this is integer
                    # division under Python 2 -- confirm the fraction is
                    # computed as intended.
                    if consensus.count("N") / len(consensus) > o.Ncutoff:
                        nC += 1
                    else:
                        # Write a line to the consensus_dictionary
                        a = pysam.AlignedRead()
                        a.qname = dict_tag
                        a.flag = read_dict[dict_tag][0]

                        if a.is_reverse is True:
                            tmp_seq = Seq(consensus, IUPAC.unambiguous_dna)
                            a.seq = str(tmp_seq.reverse_complement())
                        else:
                            a.seq = consensus

                        a.rname = read_dict[dict_tag][1]
                        a.pos = read_dict[dict_tag][2]
                        a.mapq = 255
                        a.cigar = cig_dum
                        a.mrnm = read_dict[dict_tag][3]
                        a.mpos = read_dict[dict_tag][4]
                        a.isize = read_dict[dict_tag][5]
                        a.qual = qual_score

                        # Write DCSs to output BAM file in read pairs.
                        # The read flagged read1 always goes to FASTQ file 1
                        # and is written to the BAM first.
                        if dict_tag in consensus_dict:
                            if a.is_read1 is True:
                                fastq_file1.write('@:%s\n%s\n+\n%s\n' %
                                                  (a.qname, a.seq, a.qual))
                                out_bam.write(a)
                                fastq_file2.write(
                                    '@:%s\n%s\n+\n%s\n' %
                                    (consensus_dict[dict_tag].qname,
                                     consensus_dict[dict_tag].seq,
                                     consensus_dict[dict_tag].qual))
                                out_bam.write(consensus_dict.pop(dict_tag))
                            else:
                                fastq_file1.write(
                                    '@:%s\n%s\n+\n%s\n' %
                                    (consensus_dict[dict_tag].qname,
                                     consensus_dict[dict_tag].seq,
                                     consensus_dict[dict_tag].qual))
                                out_bam.write(consensus_dict.pop(dict_tag))
                                fastq_file2.write('@:%s\n%s\n+\n%s\n' %
                                                  (a.qname, a.seq, a.qual))
                                out_bam.write(a)
                        else:
                            consensus_dict[dict_tag] = a

                    # Both tags are consumed, so the partner is not processed
                    # a second time later in this loop.
                    del read_dict[dict_tag]
                    del read_dict[switch_tag]

                except:
                    pass

        read_dict = {}  # Reset the read dictionary

    # Close BAM files
    in_bam.close()

    # Write unpaired DCSs
    # Each leftover consensus is written together with a placeholder record
    # (flag 5, sequence of '.' characters) standing in for the missing mate.
    for consTag in consensus_dict.keys():
        a = pysam.AlignedRead()
        a.qname = consTag
        a.flag = 5
        a.seq = '.' * o.read_length
        a.rname = consensus_dict[consTag].rname
        a.pos = consensus_dict[consTag].pos
        a.mapq = 255
        a.cigar = cig_dum
        a.mrnm = consensus_dict[consTag].mrnm
        # NOTE(review): mpos is taken from .pos rather than .mpos -- confirm.
        a.mpos = consensus_dict[consTag].pos
        a.isize = consensus_dict[consTag].isize
        a.qual = qual_score

        if consensus_dict[consTag].is_read1 is False:
            fastq_file1.write('@:%s\n%s\n+\n%s\n' % (a.qname, a.seq, a.qual))
            out_bam.write(a)
            fastq_file2.write(
                '@:%s\n%s\n+\n%s\n' %
                (consensus_dict[consTag].qname, consensus_dict[consTag].seq,
                 consensus_dict[consTag].qual))
            out_bam.write(consensus_dict.pop(consTag))
        else:
            fastq_file1.write(
                '@:%s\n%s\n+\n%s\n' %
                (consensus_dict[consTag].qname, consensus_dict[consTag].seq,
                 consensus_dict[consTag].qual))
            out_bam.write(consensus_dict.pop(consTag))
            fastq_file2.write('@:%s\n%s\n+\n%s\n' % (a.qname, a.seq, a.qual))
            out_bam.write(a)
        uP += 1
    fastq_file1.close()
    fastq_file2.close()
    out_bam.close()

    # Write summary statistics. Duplexes made includes unpaired duplexes
    sys.stderr.write("Summary Statistics: \n")
    sys.stderr.write("Reads Processed: %s\n" % read_num)
    sys.stderr.write("Duplexes Made: %s\n" % duplexes_made)
    sys.stderr.write("Unpaired Duplexes: %s\n" % uP)
    sys.stderr.write("N-clipped Duplexes: %s\n" % nC)
def main():
    """Build single-strand consensus reads from a BAM file of tagged reads.

    Parses the command line, then walks ``--infile`` in file order, grouping
    consecutive reads that share an alignment position.  Reads passing the
    flag / overlap / soft-clip / repetitive-tag filters are collected per tag
    (taken from the query name, suffixed with ":1", ":2" or ":se") and per
    cigar; for each tag one cigar family is selected and sent to
    ``consensusMaker``.  Consensus reads are written to ``--outfile`` in
    pairs where possible; rejected reads go to the ``_NM.bam`` file, reads
    from non-selected cigar families to ``_LCC.bam`` and, with
    ``--read_type d``, unpaired consensus reads to ``_UP.bam``.  Summary
    counts go to stderr and per-tag read counts to ``--tagfile``, which is
    then passed to ``tagStats``.
    """
    #Parameters to be input.
    parser = ArgumentParser()
    parser.add_argument("--infile",
                        action="store",
                        dest="infile",
                        help="input BAM file",
                        required=True)
    # NOTE(review): a default is given although the option is required, and
    # the default is the literal string 'sys.stdout' -- confirm intent.
    parser.add_argument("--tagfile",
                        action="store",
                        dest="tagfile",
                        help="output tagcounts file",
                        default='sys.stdout',
                        required=True)
    parser.add_argument("--outfile",
                        action="store",
                        dest="outfile",
                        help="output BAM file",
                        required=True)
    parser.add_argument(
        "--rep_filt",
        action="store",
        type=int,
        dest='rep_filt',
        help="Remove tags with homomeric runs of nucleotides of length x. [9]",
        default=9)
    parser.add_argument(
        '--minmem',
        type=int,
        default=3,
        dest='minmem',
        help="Minimum number of reads allowed to comprise a consensus. [3] ")
    parser.add_argument(
        '--maxmem',
        type=int,
        default=1000,
        dest='maxmem',
        help="Maximum number of reads allowed to comprise a consensus. [1000]")
    parser.add_argument(
        '--cutoff',
        type=float,
        default=.7,
        dest='cutoff',
        help=
        "Percentage of nucleotides at a given position in a read that must be identical in order for a consensus to be called at that position. [0.7]"
    )
    parser.add_argument(
        '--Ncutoff',
        type=float,
        default=1,
        dest='Ncutoff',
        help=
        "With --filt 'n', maximum fraction of Ns allowed in a consensus [1.0]")
    # NOTE(review): the help text says [80] but the default is 84.
    parser.add_argument(
        '--readlength',
        type=int,
        default=84,
        dest='read_length',
        help="Length of the input read that is being used. [80]")
    parser.add_argument(
        '--read_type',
        type=str,
        action="store",
        dest='read_type',
        default="dpm",
        help=
        "A string specifying which types of read to consider. Read types: n: Neither read 1 or read 2 mapped. m: Either read 1 or read 2 mapped, but not both. p: Both read 1 and read 2 mapped, not a propper pair. d: Both read 1 and read 2 mapped, propper pair. s: Single ended reads\n\t\t['dpm']"
    )
    parser.add_argument('--isize',
                        type=int,
                        default=-1,
                        dest='isize',
                        help="maximum distance between read pairs")
    parser.add_argument(
        '--read_out',
        type=int,
        default=1000000,
        dest='rOut',
        help=
        'How often you want to be told what the program is doing. [1000000]')
    parser.add_argument(
        '--filt',
        type=str,
        default='osn',
        dest='filt',
        help=
        "A string indicating which filters should be implemented. Filters: s: Softclipping filter. o: Overlap filter. n: N filter. ['osn']"
    )
    o = parser.parse_args()

    # Initialization of all global variables, main input/output files, and main iterator and dictionaries.
    # goodFlag collects the SAM flag values accepted for each requested read
    # type ('u' is accepted here although the help text does not list it).
    goodFlag = []
    if 'd' in o.read_type:
        goodFlag.extend((99, 83, 163, 147))
    if 'm' in o.read_type:
        goodFlag.extend((181, 117, 137, 133, 73, 89, 69, 153))
    if 'p' in o.read_type:
        goodFlag.extend((97, 81, 161, 145, 129, 65, 177, 113))
    if 'n' in o.read_type:
        goodFlag.extend((141, 77, 4))
    if 's' in o.read_type:
        goodFlag.extend((0, 16))
    if 'u' in o.read_type:
        goodFlag.extend((103, 167))

    inBam = pysam.Samfile(o.infile, "rb")  # Open the input BAM file
    outBam = pysam.Samfile(o.outfile, "wb",
                           template=inBam)  # Open the output BAM file
    outNC1 = pysam.Samfile(o.outfile.replace(".bam", "_LCC.bam"),
                           "wb",
                           template=inBam)
    nonMap = pysam.Samfile(o.outfile.replace(".bam", "_NM.bam"),
                           "wb",
                           template=inBam)  # File for reads with strange flags
    # The unpaired-consensus file and its counter exist only when read_type
    # is exactly 'd' (the goodFlag test above uses `in` instead).
    if o.read_type == 'd':
        extraBam = pysam.Samfile(o.outfile.replace(".bam", "_UP.bam"),
                                 "wb",
                                 template=inBam)

    readNum = 0  # Reads examined
    nM = 0  # Reads written to the _NM file
    bF = 0  # ... of which: flag not in goodFlag
    oL = 0  # ... of which: overlapping pair
    sC = 0  # ... of which: soft-clipped
    rT = 0  # ... of which: repetitive tag
    nC = 0  # Consensus reads rejected by the N filter

    LCC = 0  # Reads written to the _LCC file
    ConMade = 0  # consensusMaker calls
    if o.read_type == 'd':
        UP = 0

    fileDone = False  # Initialize end of file bool
    finished = False  # NOTE(review): never read in this function
    readOne = False  # True when the pending read starts the next group

    qualScore = 'J' * o.read_length  # Set a dummy quality score

    bamEntry = inBam.fetch(until_eof=True)  # Initialize the iterator
    # Two-slot window: the slot at winPos % 2 holds the current read, the
    # other slot the previous one.
    readWin = [bamEntry.next(), '']  # Get the first read
    winPos = 0

    readDict = {}  # Initialize the read dictionary
    tagDict = defaultdict(lambda: 0)  # Initialize the tag dictionary

    consensusDict = {}  # Consensus reads waiting for their mate, by tag

    #Start going through the input BAM file, one position at a time.
    for line in bamEntry:
        winPos += 1
        readWin[winPos % 2] = line
        # Reinitialize first line
        if readOne == True:
            winPos -= 1
        # Process reads while the current one shares its position with the
        # previous one; readOne forces one pass for the read that ended the
        # previous group.
        while (readWin[winPos % 2].pos == readWin[(winPos - 1) % 2].pos
               and fileDone == False and readOne == False) or readOne == True:
            if readNum % o.rOut == 0:
                sys.stderr.write("Reads processed:" + str(readNum) + "\n")
            try:
                # Tag = text between '|' and '/' of the query name plus a
                # read-number suffix.
                tag = readWin[winPos % 2].qname.split('|')[1].split('/')[0] + (
                    ":1" if readWin[winPos % 2].is_read1 == True else
                    (":2" if readWin[winPos % 2].is_read2 == True else ":se"))
                tagDict[tag] += 1
            except:
                # Report the index of the offending read, then re-raise.
                print readNum
                raise

            # Overlap filter: filters out overlapping reads (with --filt o)
            overlap = False
            if 'o' in o.filt:
                if readWin[winPos % 2].pos < readWin[winPos % 2].mpos and readWin[
                        winPos % 2].mpos < readWin[
                            winPos % 2].pos + o.read_length and int(
                                readWin[winPos % 2].flag) in (83, 99, 147, 163):
                    overlap = True
                elif readWin[winPos % 2].pos > readWin[
                        winPos % 2].mpos and readWin[winPos % 2].pos < readWin[
                            winPos % 2].mpos + o.read_length and int(
                                readWin[winPos % 2].flag) in (83, 99, 147, 163):
                    overlap = True
                elif readWin[winPos % 2].pos == readWin[winPos % 2].mpos and int(
                        readWin[winPos % 2].flag) in (83, 99, 147, 163):
                    overlap = True
            readNum += 1

            # Softclip filter: filters out softclipped reads (with --filt s)
            softClip = False
            if 's' in o.filt:
                if readWin[winPos % 2].cigar != None:
                    for tupple in readWin[winPos % 2].cigar:
                        if tupple[0] == 4:
                            softClip = True

            # Check if the given read is good data
            if int(readWin[winPos % 2].flag
                   ) in goodFlag and overlap == False and softClip == False:
                if ('A' * o.rep_filt in tag) or ('C' * o.rep_filt in tag) or (
                        'G' * o.rep_filt in tag) or ('T' * o.rep_filt in tag):
                    # Check for bad barcodes
                    nM += 1
                    nonMap.write(readWin[winPos % 2])
                    rT += 1
                else:
                    # Add the sequence to the read dictionary
                    # Entry layout: [flag, rname, pos, mrnm, mpos, isize,
                    #   {str(cigar): [count, cigar, seq, seq, ...]}]
                    if tag not in readDict:
                        readDict[tag] = [
                            readWin[winPos % 2].flag,
                            readWin[winPos % 2].rname,
                            readWin[winPos % 2].pos, readWin[winPos % 2].mrnm,
                            readWin[winPos % 2].mpos,
                            readWin[winPos % 2].isize, {
                                str(readWin[winPos % 2].cigar):
                                [0, readWin[winPos % 2].cigar]
                            }
                        ]

                    if str(readWin[winPos % 2].cigar) not in readDict[tag][6]:
                        readDict[tag][6][str(readWin[winPos % 2].cigar)] = [
                            0, readWin[winPos % 2].cigar
                        ]

                    readDict[tag][6][str(readWin[winPos % 2].cigar)].append(
                        readWin[winPos % 2].seq)
                    readDict[tag][6][str(readWin[winPos % 2].cigar)][0] += 1
            else:
                nM += 1
                nonMap.write(readWin[winPos % 2])
                if int(readWin[winPos % 2].flag) not in goodFlag:
                    bF += 1
                elif overlap == True:
                    oL += 1
                elif softClip == True:
                    sC += 1

            winPos += 1
            if readOne == False:
                # NOTE(review): the bare except treats ANY error from the
                # iterator as end-of-file, not only StopIteration.
                try:  # Keep StopIteration error from happening at the end of a file
                    readWin[winPos % 2] = bamEntry.next()  # Iterate the line
                except:
                    fileDone = True  # Tell the program that it has reached the end of the file
            else:
                readOne = False
        else:
            # Runs once the while condition is false (the loop has no break).
            # Send reads to consensusMaker
            readOne = True
            for dictTag in readDict.keys(
            ):  # Extract sequences to send to the consensus maker
                # Cigar string filtering
                cigComp = {}
                for cigStr in readDict[dictTag][6].keys(
                ):  # Determin the most common cigar string
                    cigComp[cigStr] = readDict[dictTag][6][cigStr][0]
                # NOTE(review): max() over a dict compares its KEYS, so this
                # selects the largest cigar string, not the one with the
                # highest count -- confirm against the comment above.
                maxCig = max(cigComp)
                if cigComp[maxCig] >= o.minmem:
                    if cigComp[maxCig] <= o.maxmem:
                        ConMade += 1
                        consensus, fam_size = consensusMaker(
                            readDict[dictTag][6][maxCig][2:], o.cutoff,
                            o.read_length)
                    else:
                        # Family too large: use a random sample of o.maxmem
                        # sequences.
                        ConMade += 1
                        consensus, fam_size = consensusMaker(
                            random.sample(readDict[dictTag][6][maxCig][2:],
                                          o.maxmem), o.cutoff, o.read_length)

                    # Reads from the non-selected cigar families go to the
                    # _LCC file.
                    for cigStr in readDict[dictTag][6].keys():
                        if cigStr != maxCig:
                            # NOTE(review): sequences start at index 2, but
                            # the upper bound is the number of sequences
                            # (length minus 2), so the last two entries of
                            # each family are never written -- confirm.
                            for n in xrange(
                                    2, len(readDict[dictTag][6][cigStr][2:])):
                                a = pysam.AlignedRead()
                                a.qname = dictTag + ':' + str(fam_size)
                                a.flag = readDict[dictTag][0]
                                a.seq = readDict[dictTag][6][cigStr][n]
                                a.rname = readDict[dictTag][1]
                                a.pos = readDict[dictTag][2]
                                a.mapq = 255
                                a.cigar = readDict[dictTag][6][cigStr][1]
                                a.mrnm = readDict[dictTag][3]
                                a.mpos = readDict[dictTag][4]
                                a.isize = readDict[dictTag][5]
                                a.qual = qualScore
                                outNC1.write(a)
                                LCC += 1

                    cigComp = {}

                    # Filter out consensuses with too many Ns in them
                    # NOTE(review): if `consensus` is a str and true division
                    # is not enabled for this module, this is integer
                    # division under Python 2 -- confirm the fraction is
                    # computed as intended.
                    if (consensus.count("N") / len(consensus) <= o.Ncutoff
                            and 'n' in o.filt) or ('n' not in o.filt):
                        # Write a line to the consensusDictionary
                        a = pysam.AlignedRead()
                        a.qname = dictTag + ":" + str(fam_size)
                        a.flag = readDict[dictTag][0]
                        a.seq = consensus
                        a.rname = readDict[dictTag][1]
                        a.pos = readDict[dictTag][2]
                        a.mapq = 255
                        a.cigar = readDict[dictTag][6][maxCig][1]
                        a.mrnm = readDict[dictTag][3]
                        a.mpos = readDict[dictTag][4]
                        a.isize = readDict[dictTag][5]
                        a.qual = qualScore

                        # Write SSCSs to output BAM file in read pairs.
                        # The mate's tag is derived by swapping the digits
                        # "1" and "2" via str.replace, which replaces every
                        # occurrence of the chosen digit in dictTag.
                        altTag = dictTag.replace(
                            ("1" if "1" in dictTag else "2"),
                            ("2" if "1" in dictTag else "1"))

                        if altTag in consensusDict:
                            # Read 1 is always written before read 2.
                            if a.is_read1 == True:
                                outBam.write(a)
                                outBam.write(consensusDict.pop(altTag))
                            else:
                                outBam.write(consensusDict.pop(altTag))
                                outBam.write(a)
                        else:
                            consensusDict[dictTag] = a
                    else:
                        nC += 1
            readDict = {}  # Reset the read dictionary
            # With --isize set, consensus reads whose mate can no longer
            # arrive within that distance are flushed as unpaired.
            if o.read_type == 'd':
                if o.isize != -1:
                    for consTag in consensusDict.keys():
                        if consensusDict[consTag].pos + o.isize < readWin[
                                winPos % 2].pos:
                            extraBam.write(consensusDict.pop(consTag))
                            UP += 1

    # Write unpaired SSCSs
    for consTag in consensusDict.keys():
        if o.read_type == 'd':
            extraBam.write(consensusDict.pop(consTag))
            UP += 1
        else:
            outBam.write(consensusDict.pop(consTag))

    # Close BAM files
    inBam.close()
    outBam.close()
    nonMap.close()
    outNC1.close()

    if o.read_type == 'd':
        extraBam.close()

    # Write summary statistics
    sys.stderr.write("Summary Statistics: \n")
    sys.stderr.write("Reads processed:" + str(readNum) + "\n")
    sys.stderr.write("Bad reads: %s\n" % nM)
    sys.stderr.write("\tReads with Bad Flags: %s\n" % bF)
    sys.stderr.write("\tOverlapping Reads: %s\n" % oL)
    sys.stderr.write("\tSoftclipped Reads: %s\n" % sC)
    sys.stderr.write("\tRepetitive Duplex Tag: %s\n" % rT)
    sys.stderr.write("Reads with Less Common Cigar Strings: %s\n" % LCC)
    sys.stderr.write("Consensuses Made: %s\n" % ConMade)
    #sys.stderr.write("Unpaired Consensuses: %s\n" % UP)
    sys.stderr.write("Consensuses with Too Many Ns: %s\n\n" % nC)

    # Write the tag counts file.
    # One "tag<TAB>count" line per tag, most frequent first.
    tagFile = open(o.tagfile, "w")
    tagFile.write("\n".join([
        "%s\t%d" % (SMI, tagDict[SMI])
        for SMI in sorted(tagDict.keys(),
                          key=lambda x: tagDict[x],
                          reverse=True)
    ]))
    tagFile.close()
    tagStats(o.tagfile)
def _bam_mapping_kind(mappings):
    """Classify a ``find_mappings`` result as 'fail', 'simple' or 'complex'.

    ``None`` is a failed liftover, exactly one mapping is a simple one and
    several mappings are a complex one.  Anything else (e.g. an empty
    result) is rejected with the same error the conversion has always raised
    for unexpected situations.
    """
    if mappings is None:
        return 'fail'
    if len(mappings) == 1:
        return 'simple'
    if len(mappings) > 1:
        return 'complex'
    raise G2GBAMError("Unknown BAM/SAM conversion/parse situation")


def _bam_open_input(file_in):
    """Open ``file_in`` as BAM, falling back to SAM, and require a header."""
    sam_file = None
    try:
        sam_file = pysam.Samfile(file_in, 'rb')
        if len(sam_file.header) > 0:
            return sam_file
    except (IOError, OSError, ValueError):
        # Not readable as BAM; the SAM attempt below reports real problems.
        pass

    if sam_file is not None:
        # Opened as BAM but without a usable header: release it before
        # retrying as SAM so the handle is not leaked.
        sam_file.close()

    sam_file = pysam.Samfile(file_in, 'r')
    if len(sam_file.header) == 0:
        sam_file.close()
        raise G2GBAMError("SAM File has no header information")
    return sam_file


def _bam_build_header(sam_file, chain_file, file_in):
    """Return ``(new_header, name_to_id)`` for the converted output file."""
    new_header = sam_file.header

    # replace 'HD'
    new_header['HD'] = {'VN': 1.0, 'SO': 'coordinate'}

    # replace SQ
    sequences = []
    name_to_id = {}
    for ref_id, ref_name in enumerate(sorted(chain_file.chrom_size_to)):
        # NOTE(review): names come from chrom_size_to but lengths are read
        # from chrom_size_from -- confirm this is intended.
        sequences.append({
            'LN': chain_file.chrom_size_from[ref_name],
            'SN': ref_name
        })
        name_to_id[ref_name] = ref_id
    new_header['SQ'] = sequences

    if 'PG' not in new_header:
        new_header['PG'] = []
    new_header['PG'].append({'ID': 'gtgtools', 'VN': 1.0})

    if 'CO' not in new_header:
        new_header['CO'] = []
    new_header['CO'].append("Original file: {0}".format(file_in))
    new_header['CO'].append("Chain File: {0}".format(chain_file.file_name))

    return new_header, name_to_id


def _bam_log_progress(total, map_statistics, map_statistics_pair):
    """Log a running success/failure tally over single- and paired-end reads."""
    status_success = 0
    status_failed = 0
    for stats in (map_statistics, map_statistics_pair):
        for key, value in stats.items():
            if key.startswith('success'):
                status_success += value
            elif key.startswith('fail'):
                status_failed += value
    LOG.info("Processed {0:,} reads, {1:,} successful, {2:,} failed".format(
        total, status_success, status_failed))


def _bam_convert_single_end(alignment, read_chr, chain_file, name_to_id,
                            new_file, new_file_unmapped, stats):
    """Lift over one single-end alignment and write it to the right file."""
    LOG.debug("SINGLE END ALIGNMENT")
    stats['total'] += 1

    alignment_new = pysam.AlignedRead()
    alignment_new.seq = alignment.seq
    alignment_new.flag = FLAG_NONE
    alignment_new.mapq = alignment.mapq
    alignment_new.qname = alignment.qname
    alignment_new.qual = alignment.qual
    alignment_new.tags = alignment.tags

    read_strand = '-' if alignment.is_reverse else '+'
    mappings = chain_file.find_mappings(read_chr, alignment.pos,
                                        alignment.aend)

    if mappings is None:
        LOG.debug("\tFail due to no mappings")
        new_file_unmapped.write(alignment)
        stats['fail_cannot_map'] += 1
        return

    kind = 'simple' if len(mappings) == 1 else 'complex'
    if kind == 'complex':
        LOG.debug("MAPPINGS: {0}".format(len(mappings)))
        for m in mappings:
            LOG.debug("> {0}".format(m))

    if alignment.is_reverse:
        alignment_new.flag |= FLAG_REVERSE

    alignment_new.tid = name_to_id[mappings[0].to_chr]
    alignment_new.pos = mappings[0].to_start
    if kind == 'simple':
        alignment_new.cigar = alignment.cigar
    else:
        alignment_new.cigar = convert_cigar(
            alignment.cigar, read_chr, chain_file, alignment.seq,
            read_strand, alignment.pos)

    new_file.write(alignment_new)
    LOG.debug("\tSuccess ({0}): {1} {2}".format(
        kind, alignment_new.pos, alignment_new.cigarstring))
    stats['success_' + kind] += 1


def _bam_convert_paired_end(alignment, read_chr, sam_file, chain_file,
                            name_to_id, new_file, new_file_unmapped, stats):
    """Lift over one paired-end alignment (and its mate coordinates)."""
    LOG.debug("PAIRED END ALIGNMENT")
    stats['total'] += 1

    alignment_new = pysam.AlignedRead()
    alignment_new.seq = alignment.seq
    alignment_new.flag = FLAG_PAIRED
    alignment_new.mapq = alignment.mapq
    alignment_new.qname = alignment.qname
    alignment_new.qual = alignment.qual
    alignment_new.tags = alignment.tags

    if alignment.is_read1:
        alignment_new.flag |= FLAG_READ1
    if alignment.is_read2:
        alignment_new.flag |= FLAG_READ2
    if alignment.is_reverse:
        alignment_new.flag |= FLAG_REVERSE
    if alignment.mate_is_reverse:
        alignment_new.flag |= FLAG_MREVERSE

    read1_strand = '-' if alignment.is_reverse else '+'
    read1_mappings = chain_file.find_mappings(read_chr, alignment.pos,
                                              alignment.aend)

    read2_mappings = None
    if alignment.mate_is_unmapped:
        alignment_new.flag |= FLAG_MUNMAP
    else:
        read2_chr = sam_file.getrname(alignment.rnext)
        read2_start = alignment.pnext
        # Only the mate's start is known here, so a 1-base window is lifted.
        read2_end = read2_start + 1
        try:
            read2_mappings = chain_file.find_mappings(read2_chr, read2_start,
                                                      read2_end)
        except Exception:
            # Best effort: a mate that cannot be looked up counts as unmapped.
            read2_mappings = None

    kind1 = _bam_mapping_kind(read1_mappings)
    kind2 = _bam_mapping_kind(read2_mappings)

    if kind1 == 'fail' and kind2 == 'fail':
        LOG.debug("\tFail due to no mappings")
        new_file_unmapped.write(alignment)
        stats['fail_cannot_map'] += 1
        return

    if kind1 == 'fail':
        # Read 1 is lost but its mate survives: keep a placeholder record.
        alignment_new.flag |= FLAG_UNMAP
        alignment_new.pos = 0
        alignment_new.cigarstring = '0M'
    else:
        if kind2 == 'fail':
            alignment_new.flag |= FLAG_MUNMAP
        alignment_new.tid = name_to_id[read1_mappings[0].to_chr]
        alignment_new.pos = read1_mappings[0].to_start
        if kind1 == 'simple':
            alignment_new.cigar = alignment.cigar
        else:
            alignment_new.cigar = convert_cigar(
                alignment.cigar, read_chr, chain_file, alignment.seq,
                read1_strand, alignment.pos)

    if kind2 == 'fail':
        # Mate could not be lifted: point rnext at read 1's own reference.
        alignment_new.rnext = name_to_id[read1_mappings[0].to_chr]
        alignment_new.pnext = 0
    else:
        alignment_new.rnext = name_to_id[read2_mappings[0].to_chr]
        alignment_new.pnext = read2_mappings[0].to_start
    alignment_new.tlen = 0  # CHECK

    LOG.debug("\tPair Success (1:{0},2:{1}): {2} {3}".format(
        kind1, kind2, alignment_new.pos, alignment_new.cigarstring))
    new_file.write(alignment_new)
    stats['success_1_{0}_2_{1}'.format(kind1, kind2)] += 1


def _bam_log_summary(total, total_unmapped, total_fail_qc, map_statistics,
                     map_statistics_pair):
    """Log the end-of-run totals for single- and paired-end conversions."""
    LOG.info(" {:>10} TOTAL ENTRIES".format(total))
    LOG.info(" {:>10} TOTAL UNMAPPED ".format(total_unmapped))
    LOG.info(" {:>10} TOTAL FAIL QC ".format(total_fail_qc))

    if map_statistics['total'] > 0:
        LOG.info("")
        LOG.info("Mapping Summary Single End")
        LOG.info(" {:>10} TOTAL ENTRIES".format(map_statistics['total']))
        LOG.info("")
        LOG.info(" {:>10} TOTAL SUCCESS".format(
            map_statistics['success_simple'] +
            map_statistics['success_complex']))
        LOG.info(" {:>10} Simple".format(map_statistics['success_simple']))
        LOG.info(" {:>10} Complex".format(map_statistics['success_complex']))
        LOG.info("")
        LOG.info(" {:>10} TOTAL FAILURES".format(
            map_statistics['fail_cannot_map']))
        LOG.info(" {:>10} Cannot Map ".format(
            map_statistics['fail_cannot_map']))

    if map_statistics_pair['total'] > 0:
        total_success = 0
        for key, value in map_statistics_pair.items():
            if key.startswith('success'):
                total_success += value

        LOG.info("")
        LOG.info("Mapping Summary Paired End")
        LOG.info(" {:>10} TOTAL ENTRIES".format(map_statistics_pair['total']))
        LOG.info("")
        LOG.info(" {:>10} TOTAL SUCCESS".format(total_success))
        LOG.info(" {:>10} Read 1 Failed, Read 2 Simple".format(
            map_statistics_pair['success_1_fail_2_simple']))
        LOG.info(" {:>10} Read 1 Failed, Read 2 Complex".format(
            map_statistics_pair['success_1_fail_2_complex']))
        LOG.info(" {:>10} Read 1 Simple, Read 2 Failed".format(
            map_statistics_pair['success_1_simple_2_fail']))
        LOG.info(" {:>10} Read 1 Simple, Read 2 Simple".format(
            map_statistics_pair['success_1_simple_2_simple']))
        LOG.info(" {:>10} Read 1 Simple, Read 2 Complex".format(
            map_statistics_pair['success_1_simple_2_complex']))
        LOG.info(" {:>10} Read 1 Complex, Read 2 Failed".format(
            map_statistics_pair['success_1_complex_2_fail']))
        LOG.info(" {:>10} Read 1 Complex, Read 2 Simple".format(
            map_statistics_pair['success_1_complex_2_simple']))
        LOG.info(" {:>10} Read 1 Complex, Read 2 Complex".format(
            map_statistics_pair['success_1_complex_2_complex']))
        LOG.info("")
        LOG.info(" {:>10} TOTAL FAILURES".format(
            map_statistics_pair['fail_cannot_map']))
        LOG.info(" {:>10} Cannot Map".format(
            map_statistics_pair['fail_cannot_map']))


def convert_bam_file(chain_file, file_in, file_out, reverse=False):
    """
    Convert genome coordinates (in BAM/SAM format) between assemblies.
    These coordinates are stored in the :class:`.chain.ChainFile` object.

    Alignments that fail QC, are unmapped, or cannot be lifted over are
    written unchanged to ``<output file>.unmapped``; everything else is
    written to ``file_out`` with converted coordinates.  The output format
    (BAM or SAM) is chosen from the extension of ``file_out``.

    :param chain_file: chain file used for conversion
    :type chain_file: :class:`.chain.ChainFile`
    :param file_in: the input SAM or BAM file (a path, or an already opened
        ``pysam.Samfile``, which is left open for the caller)
    :type file_in: string
    :param file_out: the output SAM or BAM file
    :type file_out: string
    :param reverse: reverse direction of original chain file
    :type reverse: boolean
    :raises G2GBAMError: if the input has no header, the output extension is
        neither ``bam`` nor ``sam``, or a liftover result is unexpected
    """
    if not isinstance(chain_file, ChainFile):
        chain_file = g2g_fu.check_file(chain_file)

    if not isinstance(file_in, pysam.Samfile):
        file_in = g2g_fu.check_file(file_in)

    output_file_name = g2g_fu.check_file(file_out, 'w')
    unmapped_file_name = "{0}.unmapped".format(output_file_name)

    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("INPUT FILE: {0}".format(file_in))
    LOG.info("OUTPUT FILE: {0}".format(output_file_name))
    LOG.info("UNMAPPED FILE: {0}".format(unmapped_file_name))

    if not isinstance(chain_file, ChainFile):
        LOG.info("Parsing chain file...")
        chain_file = ChainFile(chain_file, reverse=reverse)
        LOG.info("Chain file parsed")

    # A caller-supplied Samfile is used as-is; only files opened here are
    # closed here.
    owns_input = not isinstance(file_in, pysam.Samfile)
    if owns_input:
        sam_file = _bam_open_input(file_in)
    else:
        sam_file = file_in

    LOG.info("Converting BAM file")

    new_header, name_to_id = _bam_build_header(sam_file, chain_file, file_in)

    ext = os.path.split(file_out)[1].split('.')[-1].lower()
    if ext == 'bam':
        write_mode = 'wb'
    elif ext == 'sam':
        write_mode = 'wh'
    else:
        raise G2GBAMError(
            "Unable to create new file based upon file extension")

    new_file = pysam.Samfile(file_out, write_mode, header=new_header)
    new_file_unmapped = pysam.Samfile(unmapped_file_name, write_mode,
                                      template=sam_file)

    total = 0
    total_unmapped = 0
    total_fail_qc = 0

    map_statistics = {
        'total': 0,
        'fail_cannot_map': 0,
        'success_simple': 0,
        'success_complex': 0
    }

    map_statistics_pair = {
        'total': 0,
        'fail_cannot_map': 0,
        'success_1_fail_2_simple': 0,
        'success_1_fail_2_complex': 0,
        'success_1_simple_2_fail': 0,
        'success_1_simple_2_simple': 0,
        'success_1_simple_2_complex': 0,
        'success_1_complex_2_fail': 0,
        'success_1_complex_2_simple': 0,
        'success_1_complex_2_complex': 0
    }

    # Fields carried over or recomputed on each new record:
    #   copied as-is : seq, qual, qname, mapq, tags
    #   recomputed   : flag, tid, pos, cigar, rnext, pnext, tlen
    # Read-only pysam properties (aend, alen, positions, qstart/qend, ...)
    # are derived by pysam from the fields above.
    try:
        while True:
            if total and total % 10000 == 0:
                _bam_log_progress(total, map_statistics, map_statistics_pair)

            try:
                alignment = next(sam_file)
            except StopIteration:
                break

            # Reads without a reference have tid -1; do not ask pysam to
            # resolve that to a name.
            if alignment.tid >= 0:
                read_chr = sam_file.getrname(alignment.tid)
            else:
                read_chr = None

            total += 1

            LOG.debug('~' * 80)
            LOG.debug("Converting {0} {1} {2} {3}".format(
                alignment.qname, read_chr, alignment.pos,
                alignment.cigarstring))

            if alignment.is_qcfail:
                LOG.debug("\tFail due to qc of old alignment")
                new_file_unmapped.write(alignment)
                total_fail_qc += 1
            elif alignment.is_unmapped:
                LOG.debug("\tFail due to unmapped old alignment")
                new_file_unmapped.write(alignment)
                total_unmapped += 1
            elif not alignment.is_paired:
                _bam_convert_single_end(alignment, read_chr, chain_file,
                                        name_to_id, new_file,
                                        new_file_unmapped, map_statistics)
            else:
                _bam_convert_paired_end(alignment, read_chr, sam_file,
                                        chain_file, name_to_id, new_file,
                                        new_file_unmapped,
                                        map_statistics_pair)
    finally:
        # Closing flushes the writers; without it the outputs may be
        # incomplete on disk.
        new_file.close()
        new_file_unmapped.close()
        if owns_input:
            sam_file.close()

    LOG.info("All reads processed")
    _bam_log_summary(total, total_unmapped, total_fail_qc, map_statistics,
                     map_statistics_pair)
    LOG.info("")
    LOG.info("BAM File Converted")
def split_combined_mapping(combined_mapping, remove_soft_clipped=True):
    ''' Break a combined mapping back into two non-overlapping mate mappings.

    The combined CIGAR is cut at the skip operation located by
    find_skip_index_in_combined; the part before it becomes the left mate and
    the part after it the right mate.  Which of R1/R2 is "left" depends on
    the strand of the combined mapping.  Returns (R1_mapping, R2_mapping).
    '''
    mates = []
    for mate_attr in ('is_read1', 'is_read2'):
        mate = pysam.AlignedRead()
        setattr(mate, mate_attr, True)
        mate.tid = combined_mapping.tid
        mate.qname = combined_mapping.qname
        mates.append(mate)
    R1_mapping, R2_mapping = mates

    skip_index = find_skip_index_in_combined(combined_mapping)
    whole_cigar = combined_mapping.cigar
    left_cigar = whole_cigar[:skip_index]
    right_cigar = whole_cigar[skip_index + 1:]

    if remove_soft_clipped:
        # Drop a leading clip on the left part and a trailing clip on the
        # right part.
        leading_op, _leading_len = left_cigar[0]
        if leading_op == sam.BAM_CSOFT_CLIP:
            left_cigar = left_cigar[1:]
        trailing_op, _trailing_len = right_cigar[-1]
        if trailing_op == sam.BAM_CSOFT_CLIP:
            right_cigar = right_cigar[:-1]

    combined_md = dict(combined_mapping.tags)['MD']

    left_ref_bases = sam.total_reference_nucs(left_cigar)
    right_ref_bases = sam.total_reference_nucs(right_cigar)

    _skip_op, gap = whole_cigar[skip_index]

    left_pos = combined_mapping.pos
    right_pos = left_pos + left_ref_bases + gap

    left_md = sam.truncate_md_string_up_to(combined_md, left_ref_bases)
    right_md = sam.truncate_md_string_from_beginning(combined_md,
                                                     right_ref_bases)

    left_half = (left_cigar, left_md, left_pos)
    right_half = (right_cigar, right_md, right_pos)

    strand = sam.get_strand(combined_mapping)
    if strand == '+':
        plan = ((R1_mapping, left_half), (R2_mapping, right_half))
        reversed_mate = R2_mapping
    elif strand == '-':
        plan = ((R1_mapping, right_half), (R2_mapping, left_half))
        reversed_mate = R1_mapping
    else:
        plan = ()
        reversed_mate = None

    for mate, (cigar, md, _pos) in plan:
        mate.cigar = cigar
        mate.setTag('MD', md)
    for mate, (_cigar, _md, pos) in plan:
        mate.pos = pos
    if reversed_mate is not None:
        reversed_mate.is_reverse = True

    R1_seq, R1_qual, R2_seq, R2_qual = extract_seqs_from_combined(
        combined_mapping,
        include_overlap=False,
        remove_soft_clipped=remove_soft_clipped,
        flip_if_reverse=False,
    )

    # Leave seq/qual unset on a mate that ended up with no bases.
    for mate, seq, qual in ((R1_mapping, R1_seq, R1_qual),
                            (R2_mapping, R2_seq, R2_qual)):
        if seq != '':
            mate.seq = seq
            mate.qual = qual

    return R1_mapping, R2_mapping
def main():
    """Split a BAM file into read-1, read-2 and unmapped BAM files.

    Writes ``<prefix>.R1.bam``, ``<prefix>.R2.bam`` and ``<prefix>.unmap.bam``.
    Mapped records are rewritten as single-end records: only the reverse,
    secondary, QC-fail and duplicate flag bits are kept and the mate fields
    are left unset.  Unmapped records are copied unchanged.  Records that are
    not flagged as read 1 go to the R2 file.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i",
        "--input-file",
        action="store",
        type="string",
        dest="input_file",
        help=
        "Alignment file in BAM or SAM format. BAM file should be sorted and indexed"
    )
    parser.add_option(
        "-o",
        "--out-prefix",
        action="store",
        type="string",
        dest="output_prefix",
        help=
        "Prefix of output BAM files. \"prefix.R1.bam\" file contains the 1st read, \"prefix.R2.bam\" file contains the 2nd read"
    )
    (options, args) = parser.parse_args()

    # Both options are required: the prefix is concatenated into file names
    # below and would otherwise fail with a TypeError on None.
    if not (options.input_file and options.output_prefix):
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.input_file):
        sys.stderr.write('\n\n' + options.input_file + " does NOT exists" +
                         '\n\n')
        sys.exit(1)

    samfile = pysam.Samfile(options.input_file, 'rb')
    # BAM file receiving records flagged as read 1
    OUT1 = pysam.Samfile(
        options.output_prefix + '.R1.bam', 'wb', template=samfile)
    # BAM file receiving all other mapped records
    OUT2 = pysam.Samfile(
        options.output_prefix + '.R2.bam', 'wb', template=samfile)
    # BAM file receiving unmapped records
    OUT3 = pysam.Samfile(
        options.output_prefix + '.unmap.bam', 'wb', template=samfile)

    total_alignment = 0
    r1_alignment = 0
    r2_alignment = 0
    unmapped = 0

    sys.stderr.write("spliting " + options.input_file + " ...")
    try:
        for old_alignment in samfile:
            total_alignment += 1

            if old_alignment.is_unmapped:
                OUT3.write(old_alignment)
                unmapped += 1
                continue

            new_alignment = pysam.AlignedRead()
            new_alignment.qname = old_alignment.qname  # 1st column. read name.
            new_alignment.tid = old_alignment.tid  # 3rd column. reference id
            new_alignment.pos = old_alignment.pos  # 4th column. 0-based start
            new_alignment.mapq = old_alignment.mapq  # 5th column.
            new_alignment.cigar = old_alignment.cigar  # 6th column.
            # Columns 7-9 (rnext, pnext, tlen) are deliberately not copied:
            # the output records are single-end.
            new_alignment.seq = old_alignment.seq  # 10th column. all bases.
            new_alignment.qual = old_alignment.qual  # 11th column.
            new_alignment.tags = old_alignment.tags  # 12th column onwards

            # 2nd column: rebuild the flag without any pairing bits.
            flag = 0x0000
            if old_alignment.is_reverse:
                flag |= 0x0010
            if old_alignment.is_secondary:
                flag |= 0x0100
            if old_alignment.is_qcfail:
                flag |= 0x0200
            if old_alignment.is_duplicate:
                flag |= 0x0400
            new_alignment.flag = flag

            if old_alignment.is_read1:
                OUT1.write(new_alignment)
                r1_alignment += 1
            else:
                OUT2.write(new_alignment)
                r2_alignment += 1
    finally:
        # Closing flushes the writers; without it the outputs may be
        # incomplete on disk.
        OUT1.close()
        OUT2.close()
        OUT3.close()
        samfile.close()

    sys.stderr.write(" Done\n")

    sys.stdout.write("%-55s%d\n" % ("Total records:", total_alignment))
    sys.stdout.write("%-55s%d\n" % (options.output_prefix + 'Read 1:',
                                    r1_alignment))
    sys.stdout.write("%-55s%d\n" % (options.output_prefix + 'Read 2:',
                                    r2_alignment))
    sys.stdout.write("%-55s%d\n" % (options.output_prefix + 'Unmapped:',
                                    unmapped))
infile = open(filename) # ACTUAL INDEX IDENTIFICATION AND READ SORTING for seqid, seq, qual in read_fastq(infile): seqid = seqid.split()[0] seq2, qual2 = None, None if qual != None and options.qualityoffset != 33: qual = "".join( map(lambda x: chr(ord(x) - options.qualityoffset + 33), qual)) if options.start != None: seq2 = seq[options.start:] seq = seq[:options.start] if qual != None: qual2 = qual[options.start:] qual = qual[:options.start] forward = pysam.AlignedRead() forward.qname = seqid forward.seq = seq if qual != None: forward.qual = qual else: forward.qual = "*" forward.is_unmapped = True forward.pos = -1 forward.mpos = -1 if seq2 != None: forward.is_read1 = True forward.is_paired = True reverse = pysam.AlignedRead() reverse.qname = seqid reverse.is_read2 = True reverse.is_paired = True reverse.seq = seq2
def combine_paired_mappings(R1_mapping, R2_mapping, verbose=False):
    ''' Takes two pysam mappings representing opposite ends of a fragment and
    combines them into one mapping, (ab)using BAM_CREF_SKIP to bridge the gap
    (if any) between them.

    Both inputs are deep-copied first, so all work below happens on copies.

    Where the two mappings overlap, only one mate's bases are kept in the
    combined seq/cigar; the other mate's overlapping bases and qualities are
    stored in the 'Xs' and 'Xq' tags, and 'Xw' records which side ('left' or
    'right') was used.

    Returns the combined pysam.AlignedRead, or False when the pair is given
    up on: either the soft-clipping check below fails, or the two mappings
    disagree in their overlap and neither realignment has fewer mismatches.

    If verbose is true, details of overlap disagreements are logged.

    NOTE(review): if the R1 strand is neither '+' nor '-', left_mapping and
    right_mapping are never assigned and the next statement fails.
    '''
    R1_mapping = copy.deepcopy(R1_mapping)
    R2_mapping = copy.deepcopy(R2_mapping)

    # The mate on the '+' strand is treated as the left (upstream) mapping.
    R1_strand = sam.get_strand(R1_mapping)
    if R1_strand == '+':
        left_mapping, right_mapping = R1_mapping, R2_mapping
    elif R1_strand == '-':
        left_mapping, right_mapping = R2_mapping, R1_mapping

    # Soft-clipping at the 3' end of a read should only happen if this is
    # read-through into soft-clipping at the 5' end of the other read.
    # If there is non-physical soft-clipping in this pair, give up now.
    # Specifically, check if any pairing of read position to ref position
    # isn't the same.
    if (left_mapping.cigar[-1][0] == sam.BAM_CSOFT_CLIP) or \
       (right_mapping.cigar[0][0] == sam.BAM_CSOFT_CLIP):
        left_pairs = left_mapping.get_aligned_pairs(matches_only=True)
        right_pairs = right_mapping.get_aligned_pairs(matches_only=True)
        if left_pairs != right_pairs:
            return False

    # Otherwise, remove all soft-clipping from the mappings, storing the 5'
    # soft-clipped seq and quals from both reads to add back at the end.
    left_clipped = remove_soft_clipping(left_mapping)
    right_clipped = remove_soft_clipping(right_mapping)

    left_md = dict(left_mapping.tags)['MD']
    right_md = dict(right_mapping.tags)['MD']

    # Split the right mapping into the part overlapping the left mapping and
    # the part after it: find the first aligned pair whose reference position
    # is at or beyond the end of the left mapping.
    right_aligned_pairs = sam.cigar_to_aligned_pairs(
        right_mapping.cigar, right_mapping.reference_start)
    right_after_overlap_pair_index = len(right_aligned_pairs)
    for i, (read, ref) in enumerate(right_aligned_pairs):
        if ref != None and ref >= left_mapping.aend:
            right_after_overlap_pair_index = i
            break

    right_overlap_pairs = right_aligned_pairs[:right_after_overlap_pair_index]
    right_after_overlap_pairs = right_aligned_pairs[
        right_after_overlap_pair_index:]

    right_reads_after = [
        read for read, ref in right_after_overlap_pairs
        if read != None and read != 's'
    ]
    right_refs_after = [
        ref for read, ref in right_after_overlap_pairs if ref != None
    ]

    right_overlap_cigar = sam.aligned_pairs_to_cigar(right_overlap_pairs)
    right_after_overlap_cigar = sam.aligned_pairs_to_cigar(
        right_after_overlap_pairs)

    right_after_overlap_md = sam.truncate_md_string_from_beginning(
        right_md, len(right_refs_after))

    right_after_overlap_read_start = len(
        right_mapping.seq) - len(right_reads_after)

    # query_qualities is used for the overlap (it is averaged and re-encoded
    # below); qual is used for the part that is concatenated into the
    # combined qual.
    right_overlap_seq = right_mapping.seq[:right_after_overlap_read_start]
    right_overlap_qual = right_mapping.query_qualities[:
                                                       right_after_overlap_read_start]

    right_after_overlap_seq = right_mapping.seq[
        right_after_overlap_read_start:]
    right_after_overlap_qual = right_mapping.qual[
        right_after_overlap_read_start:]

    # Mirror image for the left mapping: scan backwards for the last aligned
    # pair whose reference position is before the start of the right mapping.
    left_aligned_pairs = sam.cigar_to_aligned_pairs(
        left_mapping.cigar, left_mapping.reference_start)
    left_before_overlap_pair_index = -1
    for i, (read, ref) in list(enumerate(left_aligned_pairs))[::-1]:
        if ref != None and ref < right_mapping.pos:
            left_before_overlap_pair_index = i
            break

    left_overlap_pairs = left_aligned_pairs[left_before_overlap_pair_index + 1:]
    left_before_overlap_pairs = left_aligned_pairs[:
                                                   left_before_overlap_pair_index + 1]

    left_reads_before = [
        read for read, ref in left_before_overlap_pairs
        if read != None and read != 's'
    ]
    left_refs_before = [
        ref for read, ref in left_before_overlap_pairs if ref != None
    ]

    left_overlap_cigar = sam.aligned_pairs_to_cigar(left_overlap_pairs)
    left_before_overlap_cigar = sam.aligned_pairs_to_cigar(
        left_before_overlap_pairs)

    left_before_overlap_md = sam.truncate_md_string_up_to(
        left_md, len(left_refs_before))

    left_overlap_read_start = len(left_reads_before)

    left_overlap_seq = left_mapping.seq[left_overlap_read_start:]
    left_overlap_qual = left_mapping.query_qualities[left_overlap_read_start:]

    left_before_overlap_seq = left_mapping.seq[:left_overlap_read_start]
    left_before_overlap_qual = left_mapping.qual[:left_overlap_read_start]

    if left_overlap_pairs or right_overlap_pairs:
        # The mappings overlap, so there is no gap to bridge; decide which
        # mapping's version of the overlap to keep.
        gap_length = 0

        left_has_splicing = sam.contains_splicing(left_mapping)
        right_has_splicing = sam.contains_splicing(right_mapping)

        if left_overlap_cigar == right_overlap_cigar:
            # If the two mappings agree about the location of indels in their overlap,
            # use the seq from the mapping with the higher average quality in the
            # overlap.
            left_mean_qual = np.mean(left_overlap_qual)
            right_mean_qual = np.mean(right_overlap_qual)

            if left_mean_qual > right_mean_qual:
                use_overlap_from = 'left'
            else:
                use_overlap_from = 'right'
        elif left_has_splicing != right_has_splicing:
            # A temporary(?) heuristic - if one read has splicing and the other
            # doesn't, use the overlap from the one with splicing under the
            # assumption that the other just has a few bases overhanging the
            # splice junction.
            if left_has_splicing:
                use_overlap_from = 'left'
            else:
                use_overlap_from = 'right'
        else:
            # If the two mappings disagree about the location of indels in their overlap,
            # we need a heuristic for picking which mapping we believe reflects the
            # true structure of the input fragment. The most innocuous explanation
            # is that a 'true' indel happened to lie close to the edge of one of the
            # mappings. A more problematic situation is a 'false' indel (that is,
            # produced during cluster generation or sequencing-by-synthesis, NOT
            # template production). Our strategy is: realign the overlapping part of
            # left mapping starting from the left edge of the overlap according to the
            # cigar of the right mapping and realign the overlapping part of the right
            # mapping starting from the right edge of the overlap according to the cigar
            # of the left mapping. Count the number of mismatches produced by each.
            # If the left overlap can accommodate the right cigar with fewer mismatches,
            # use the right cigar and seq. If the right overlap can accommodate the left
            # cigar with fewer mismatches, use the left cigar and seq.

            # The leftmost aligned_pair from the right mapping is guaranteed by the
            # mapping process to not involve a gap.
            _, overlap_ref_start = right_overlap_pairs[0]
            # Similarly, the rightmost aligned_pair from the left mapping can't be a
            # gap.
            _, overlap_ref_end = left_overlap_pairs[-1]

            realigned_left_cigar = sam.truncate_cigar_blocks_up_to(
                right_mapping.cigar, len(left_overlap_seq))
            realigned_right_cigar = sam.truncate_cigar_blocks_from_beginning(
                left_mapping.cigar, len(right_overlap_seq))

            ref_dict = sam.merge_ref_dicts(
                sam.ref_dict_from_mapping(left_mapping),
                sam.ref_dict_from_mapping(right_mapping),
            )

            try:
                left_using_right_mismatches = realigned_mismatches(
                    left_overlap_seq, overlap_ref_start, realigned_left_cigar,
                    ref_dict)
                right_using_left_mismatches = realigned_mismatches_backwards(
                    right_overlap_seq, overlap_ref_end, realigned_right_cigar,
                    ref_dict)
            except (ValueError, TypeError):
                # Dump both mappings for debugging, then let the error
                # propagate.
                print(left_mapping)
                print(right_mapping)
                raise

            if verbose:
                logging.info('disagreements in {0}'.format(left_mapping.qname))
                logging.info('left overlap cigar is {0}'.format(
                    str(left_overlap_cigar)))
                logging.info('right overlap cigar is {0}'.format(
                    str(right_overlap_cigar)))
                logging.info('left_using_right_mismatches - {0}'.format(
                    len(left_using_right_mismatches)))
                logging.info('right_using_left_mismatches - {0}'.format(
                    len(right_using_left_mismatches)))

            if len(left_using_right_mismatches) < len(
                    right_using_left_mismatches):
                use_overlap_from = 'right'
            elif len(right_using_left_mismatches) < len(
                    left_using_right_mismatches):
                use_overlap_from = 'left'
            else:
                # A tie: neither mapping is preferred, so the pair is given
                # up on (this is logged regardless of verbose).
                logging.info('disagreements in {0}'.format(left_mapping.qname))
                logging.info('left overlap cigar is {0}'.format(
                    str(left_overlap_cigar)))
                logging.info('right overlap cigar is {0}'.format(
                    str(right_overlap_cigar)))
                logging.info('left_using_right_mismatches - {0}'.format(
                    len(left_using_right_mismatches)))
                logging.info('right_using_left_mismatches - {0}'.format(
                    len(right_using_left_mismatches)))
                logging.info('ambiguous disagreement')
                return False
    else:
        gap_length = right_mapping.pos - left_mapping.aend
        # It doesn't matter what use_overlap_from is set to; there is no overlap
        use_overlap_from = 'left'

    combined_mapping = pysam.AlignedRead()
    combined_mapping.qname = left_mapping.qname
    combined_mapping.tid = left_mapping.tid
    combined_mapping.mapq = min(left_mapping.mapq, right_mapping.mapq)
    combined_mapping.rnext = -1
    combined_mapping.pnext = -1

    combined_mapping.pos = left_mapping.pos
    if R1_strand == '-':
        combined_mapping.is_reverse = True

    # A reference-skip operation joins the two halves of the combined cigar;
    # its length is 0 when the mappings overlap.
    gap_cigar = [(sam.BAM_CREF_SKIP, gap_length)]

    if use_overlap_from == 'left':
        combined_mapping.seq = left_mapping.seq + right_after_overlap_seq
        combined_mapping.qual = left_mapping.qual + right_after_overlap_qual
        combined_mapping.cigar = left_mapping.cigar + gap_cigar + right_after_overlap_cigar
        combined_md = sam.combine_md_strings(left_md, right_after_overlap_md)
        combined_mapping.setTag('MD', combined_md)

        overlap_seq_tag = right_overlap_seq
        overlap_qual_tag = fastq.encode_sanger(right_overlap_qual)
    elif use_overlap_from == 'right':
        combined_mapping.seq = left_before_overlap_seq + right_mapping.seq
        combined_mapping.qual = left_before_overlap_qual + right_mapping.qual
        combined_mapping.cigar = left_before_overlap_cigar + gap_cigar + right_mapping.cigar
        combined_md = sam.combine_md_strings(left_before_overlap_md, right_md)
        combined_mapping.setTag('MD', combined_md)

        overlap_seq_tag = left_overlap_seq
        overlap_qual_tag = fastq.encode_sanger(left_overlap_qual)

    if len(overlap_seq_tag) > 0:
        # Having empty tags causes problems, so don't create them.
        combined_mapping.setTag('Xs', overlap_seq_tag)
        combined_mapping.setTag('Xq', overlap_qual_tag)
        combined_mapping.setTag('Xw', use_overlap_from)

    # Add back the soft-clipped material that was removed earlier: the left
    # mapping's from its start and the right mapping's from its end.
    qual = combined_mapping.qual
    seq = combined_mapping.seq
    cigar = combined_mapping.cigar

    before = left_clipped['from_start']
    after = right_clipped['from_end']

    combined_mapping.cigar = before['cigar'] + cigar + after['cigar']
    combined_mapping.seq = before['seq'] + seq + after['seq']
    combined_mapping.qual = before['qual'] + qual + after['qual']

    return combined_mapping
def _chain_pair_record(qname, tid, pos, seq, cigar, is_reverse, mate_tid,
                       mate_pos, mate_is_reverse, mate_bit):
    """Build one mate of a chain-alignment pair as a ``pysam.AlignedRead``.

    ``mate_bit`` is the SAM flag marking which mate this is (0x40 for the
    first, 0x80 for the second).  The pairing bits and the mate position are
    only set when ``mate_tid`` is a valid reference id.
    """
    record = pysam.AlignedRead()
    record.qname = qname
    record.tid = tid
    record.pos = pos
    record.mapq = 255
    record.seq = seq
    record.cigar = cigar
    record.tags = [('RG', 'hg38.chain')]
    record.tlen = 0

    # Accumulate the flag so that the strand bit survives adding the pairing
    # bits.
    flag = 0x10 if is_reverse else 0
    record.rnext = mate_tid
    if mate_tid != -1:
        record.pnext = mate_pos
        flag |= 0x1 | 0x2 | mate_bit
        if mate_is_reverse:
            flag |= 0x20
    record.flag = flag
    return record


def write_align_pair(out, dup, r, q):
    """Write a chain alignment block to ``out`` as a pair of BAM records.

    The first record sits on the target sequence (``dup.tName`` at
    ``dup.tStart``) and carries ``q`` as its sequence; the second sits on
    the query sequence (``dup.qName`` at ``dup.qStart``) and carries ``r``,
    passed through ``rc`` when the query strand is not '+'.  Both share one
    read name describing the two intervals and point at each other as mates.

    :param out: open output alignment file (provides ``gettid``/``write``)
    :param dup: alignment block with tName/tStart/tEnd/tStrand,
        qName/qStart/qEnd/qStrand and ``cigar`` attributes
    :param r: sequence stored on the query-side record
    :param q: sequence stored on the target-side record
    """
    qname = '%s:%d-%d(%s)_%s:%d-%d(%s)' % (dup.tName, dup.tStart + 1,
                                           dup.tEnd, dup.tStrand, dup.qName,
                                           dup.qStart + 1, dup.qEnd,
                                           dup.qStrand)
    tid1 = out.gettid(dup.tName)
    tid2 = out.gettid(dup.qName)
    assert tid1 != -1 and tid2 != -1

    target_is_reverse = dup.tStrand == '-'
    query_is_reverse = dup.qStrand == '-'

    if tid1 != -1:
        out.write(
            _chain_pair_record(qname, tid1, dup.tStart, q,
                               dup.cigar.to_pysam_list(), target_is_reverse,
                               tid2, dup.qStart, query_is_reverse, 0x40))

    if tid2 != -1:
        # NOTE(review): reverse() is called on dup.cigar itself, not a copy;
        # confirm whether it mutates the caller's object.
        cigar2 = dup.cigar
        if dup.qStrand != '+':
            cigar2.reverse()
        cigar2 = cigar2.invert()[0]

        out.write(
            _chain_pair_record(qname, tid2, dup.qStart,
                               r if dup.qStrand == '+' else rc(r),
                               cigar2.to_pysam_list(), query_is_reverse, tid1,
                               dup.tStart, target_is_reverse, 0x80))
def _bam_open_input(infile):
    """Open ``infile`` with the pysam mode that matches its suffix.

    Returns ``(file_type, samfile)`` with ``file_type`` one of 'BAM', 'CRAM'
    or 'SAM'. Prints a message to stderr and exits with status 1 when the
    suffix is not recognised or the opened file has an empty header.
    """
    lowered = infile.lower()
    for suffix, file_type, mode in (('.bam', 'BAM', 'rb'),
                                    ('.cram', 'CRAM', 'rc'),
                                    ('.sam', 'SAM', 'r')):
        if not lowered.endswith(suffix):
            continue
        samfile = pysam.Samfile(infile, mode)
        if len(samfile.header) == 0:
            print(file_type + " file has no header section. Exit!",
                  file=sys.stderr)
            sys.exit(1)
        return file_type, samfile
    print(
        "Unknown file type! Input file must have suffix '.bam','.cram', or '.sam'.",
        file=sys.stderr)
    sys.exit(1)


def _bam_new_record(old_alignment):
    """Create a fresh record carrying the name, bases, qualities and tags."""
    new_alignment = pysam.AlignedRead()
    new_alignment.query_name = old_alignment.query_name  # column 1
    new_alignment.query_sequence = old_alignment.query_sequence  # column 10
    new_alignment.query_qualities = old_alignment.query_qualities  # column 11
    new_alignment.set_tags(old_alignment.get_tags())  # columns 12+
    # Copying tags can turn RG:Z into RG:A, which breaks GATK and freebayes,
    # so the RG tag is re-set with its original value type.
    try:
        rg, rgt = old_alignment.get_tag("RG", with_value_type=True)
    except KeyError:
        pass
    else:
        new_alignment.set_tag("RG", str(rg), rgt)
    return new_alignment


def _bam_lift_read(mapping, samfile, old_alignment):
    """Lift the read's own interval; return None when anything fails."""
    try:
        chrom = samfile.get_reference_name(old_alignment.reference_id)
        strand = '-' if old_alignment.is_reverse else '+'
        return map_coordinates(mapping, chrom,
                               old_alignment.reference_start,
                               old_alignment.reference_end, strand)
    except Exception:  # best effort: a failed lookup means "not lifted"
        return None


def _bam_lift_mate(mapping, samfile, old_alignment):
    """Lift the mate's start position (1-bp interval); None on failure."""
    try:
        chrom = samfile.get_reference_name(old_alignment.next_reference_id)
        strand = '-' if old_alignment.mate_is_reverse else '+'
        start = old_alignment.next_reference_start
        return map_coordinates(mapping, chrom, start, start + 1, strand)
    except Exception:  # best effort: a failed lookup means "not lifted"
        return None


def _bam_unplace(new_alignment, old_alignment):
    """Fill columns 3-6 for a read that has no position of its own."""
    new_alignment.reference_id = -1
    new_alignment.reference_start = 0
    new_alignment.mapping_quality = 255
    new_alignment.cigartuples = old_alignment.cigartuples


def _bam_clear_mate(new_alignment):
    """Fill columns 7-9 for a read without a placed mate."""
    new_alignment.next_reference_id = -1
    new_alignment.next_reference_start = 0
    new_alignment.template_length = 0


def _bam_follow_mate(new_alignment, old_alignment, mate_hit, name_to_id, mapq):
    """Place a position-less read at its mate's lifted position.

    Both the read's own position and its mate fields are set to the mate's
    target location, as recommended for the unmapped half of a pair.
    """
    target = mate_hit[1]
    if target[3] == '-':
        new_alignment.flag |= 0x20
    new_alignment.reference_id = name_to_id[target[0]]
    new_alignment.reference_start = target[1]
    new_alignment.mapping_quality = mapq
    new_alignment.cigartuples = old_alignment.cigartuples
    new_alignment.next_reference_id = name_to_id[target[0]]
    new_alignment.next_reference_start = target[1]
    new_alignment.template_length = 0


def _bam_orient(new_alignment, old_alignment, hit):
    """Set the CIGAR and, on a strand flip, reverse bases and qualities."""
    if hit[0][3] != hit[1][3]:  # liftover changed the strand
        new_alignment.cigartuples = old_alignment.cigartuples[::-1]
        new_alignment.query_sequence = revcomp_DNA(
            old_alignment.query_sequence)
        qualities = old_alignment.query_qualities
        # Reads stored without qualities report None; slicing None used to
        # crash every path except single-end unique, which fell back to [].
        new_alignment.query_qualities = (
            [] if qualities is None else qualities[::-1])
    else:
        new_alignment.cigartuples = old_alignment.cigartuples


def _bam_convert_paired(new_alignment, old_alignment, samfile, mapping,
                        name_to_id, IS_size, IS_std, fold):
    """Fill columns 2-9 for a paired read; return its category label.

    Returns one of QF/NN/NU/NM/UN/UU/UM/MN/MU/MM, or None when the liftover
    result has an unexpected length (the record is then not written).
    """
    new_alignment.flag = 0x1
    if old_alignment.is_read1:
        new_alignment.flag |= 0x40
    elif old_alignment.is_read2:
        new_alignment.flag |= 0x80

    if old_alignment.is_qcfail:
        new_alignment.flag |= 0x200
        _bam_unplace(new_alignment, old_alignment)
        _bam_clear_mate(new_alignment)
        return 'QF'

    # ---- R1 originally unmapped ----
    if old_alignment.is_unmapped:
        new_alignment.flag |= 0x4
        _bam_unplace(new_alignment, old_alignment)
        mate_hit = None
        if not old_alignment.mate_is_unmapped:
            mate_hit = _bam_lift_mate(mapping, samfile, old_alignment)
        if mate_hit is None:
            _bam_clear_mate(new_alignment)
            return 'NN'
        if len(mate_hit) == 2:
            _bam_follow_mate(new_alignment, old_alignment, mate_hit,
                             name_to_id, old_alignment.mapping_quality)
            return 'NU'
        new_alignment.flag |= 0x100
        _bam_follow_mate(new_alignment, old_alignment, mate_hit,
                         name_to_id, old_alignment.mapping_quality)
        return 'NM'

    # ---- R1 originally mapped ----
    read1_hit = _bam_lift_read(mapping, samfile, old_alignment)
    mate_hit = None
    if not old_alignment.mate_is_unmapped:
        mate_hit = _bam_lift_mate(mapping, samfile, old_alignment)

    if read1_hit is None:  # R1 failed to lift over
        if mate_hit is None:
            new_alignment.flag |= 0x4
            _bam_unplace(new_alignment, old_alignment)
            _bam_clear_mate(new_alignment)
            return 'NN'
        # NOTE(review): in the two cases below the read is parked at its
        # mate's position but 0x4 is not set, unlike the originally-unmapped
        # path above - confirm whether that is intended.
        if len(mate_hit) == 2:
            _bam_follow_mate(new_alignment, old_alignment, mate_hit,
                             name_to_id, old_alignment.mapping_quality)
            return 'NU'
        new_alignment.flag |= 0x100
        _bam_follow_mate(new_alignment, old_alignment, mate_hit,
                         name_to_id, 255)  # mapq not available
        return 'NM'

    hit_count = len(read1_hit)
    if hit_count == 2:
        unique = True
    elif hit_count > 2 and hit_count % 2 == 0:
        unique = False
    else:
        return None

    target = read1_hit[1]
    if not unique:
        new_alignment.flag |= 0x100
    if target[3] == '-':
        new_alignment.flag |= 0x10
    new_alignment.reference_id = name_to_id[target[0]]
    new_alignment.reference_start = target[1]
    new_alignment.mapping_quality = (
        old_alignment.mapping_quality if unique else 255)
    _bam_orient(new_alignment, old_alignment, read1_hit)
    first = 'U' if unique else 'M'

    if mate_hit is None:  # R2 unmapped before or after conversion
        new_alignment.flag |= 0x8
        new_alignment.next_reference_id = name_to_id[target[0]]
        new_alignment.next_reference_start = target[1]
        new_alignment.template_length = 0
        return first + 'N'

    mate_target = mate_hit[1]
    if mate_target[3] == '-':
        new_alignment.flag |= 0x20
    new_alignment.next_reference_id = name_to_id[mate_target[0]]
    new_alignment.next_reference_start = mate_target[1]

    if len(mate_hit) != 2:  # R2 multiple mapped
        new_alignment.flag |= 0x100
        new_alignment.template_length = 0
        return first + 'M'
    if not unique:
        new_alignment.template_length = 0
        return 'MU'

    new_alignment.template_length = abs(
        new_alignment.reference_start - new_alignment.next_reference_start
    ) + old_alignment.reference_length
    # Proper pair: opposite strands and insert size within fold * stdev.
    if (mate_target[3] != target[3]
            and new_alignment.template_length <= IS_size + fold * IS_std
            and new_alignment.template_length >= IS_size - fold * IS_std):
        new_alignment.flag |= 0x2
    return 'UU'


def _bam_convert_single(new_alignment, old_alignment, samfile, mapping,
                        name_to_id):
    """Fill columns 2-9 for a single-end read; return SN, SU, SM or None."""
    _bam_clear_mate(new_alignment)
    if old_alignment.is_unmapped:
        new_alignment.flag |= 0x4
        _bam_unplace(new_alignment, old_alignment)
        return 'SN'

    new_alignment.flag = 0x0
    read_chr = samfile.get_reference_name(old_alignment.reference_id)
    read_strand = '-' if old_alignment.is_reverse else '+'
    # Unlike the paired path, errors raised here are not swallowed.
    read_maps = map_coordinates(mapping, read_chr,
                                old_alignment.reference_start,
                                old_alignment.reference_end, read_strand)
    if read_maps is None:  # unmapped after liftover; CIGAR is left unset
        new_alignment.flag |= 0x4
        new_alignment.reference_id = -1
        new_alignment.reference_start = 0
        new_alignment.mapping_quality = 255
        return 'SN'

    hit_count = len(read_maps)
    if hit_count == 2:
        label = 'SU'
    elif hit_count > 2 and hit_count % 2 == 0:
        label = 'SM'
        new_alignment.flag |= 0x100
    else:
        return None

    target = read_maps[1]
    if target[3] == '-':
        new_alignment.flag |= 0x10
    _bam_orient(new_alignment, old_alignment, read_maps)
    new_alignment.reference_id = name_to_id[target[0]]
    new_alignment.reference_start = target[1]
    new_alignment.mapping_quality = old_alignment.mapping_quality
    return label


def crossmap_bam_file(mapping,
                      chainfile,
                      infile,
                      outfile_prefix,
                      chrom_size,
                      IS_size=200,
                      IS_std=30.0,
                      fold=3,
                      addtag=True):
    """
    Description
    -----------
    Convert genome coordinates (in BAM/SAM/CRAM format) between assemblies.
    BAM/SAM format: http://samtools.sourceforge.net/

    Every input alignment is rebuilt with lifted-over coordinates and written
    to the output. BAM and CRAM input both produce BAM output; SAM input
    produces SAM. When writing to a file, BAM output is additionally sorted
    and indexed (failures there only log a warning). A summary of the
    per-category counts is printed at the end.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as
        value.

    chainfile : str
        Name of the chain file; only recorded in the output header comments.

    infile : str
        Path of the input file. The suffix ('.bam', '.cram' or '.sam',
        case-insensitive) selects how it is opened.

    outfile_prefix : str or None
        Output prefix. None writes to standard output ('-').

    chrom_size : dict
        Chromosome size of the *target* assembly, used to build the header.

    IS_size : int
        Average insert size of pair-end sequencing.

    IS_std : float
        Standard deviation of insert size.

    fold : float
        A mapped pair is considered a "proper pair" if both ends map to
        different strands and the template length lies within
        fold * IS_std of IS_size.

    addtag : bool
        If True, tag each written alignment with its category (value 0).

        tags for pair-end sequencing include:
            QF: QC failed
            NN: both read1 and read2 unmapped
            NU: read1 unmapped, read2 unique mapped
            NM: read1 unmapped, read2 multiple mapped
            UN: read1 uniquely mapped, read2 unmapped
            UU: both read1 and read2 uniquely mapped
            UM: read1 uniquely mapped, read2 multiple mapped
            MN: read1 multiple mapped, read2 unmapped
            MU: read1 multiple mapped, read2 unique mapped
            MM: both read1 and read2 multiple mapped

        tags for single-end sequencing include:
            SN: unmapped
            SM: multiple mapped
            SU: uniquely mapped

    Notes
    -----
    Calls ``sys.exit(1)`` when the input suffix is unknown or the input has
    no header. A read whose liftover result has an unexpected length is
    silently skipped (counted in the total, written nowhere).
    """
    file_type, samfile = _bam_open_input(infile)
    comments = ['ORIGINAL_' + file_type + '_FILE=' + infile,
                'CHAIN_FILE=' + chainfile]

    sam_ori_header = samfile.header.to_dict()

    # chromosome ID style of the original file: either 'chr1' or '1'
    chrom_style = sam_ori_header['SQ'][0]['SN']

    # rename target chromosomes to match the input's naming style
    target_chrom_sizes = {
        update_chromID(chrom_style, name): size
        for name, size in chrom_size.items()
    }

    (new_header, name_to_id) = sam_header.bam_header_generator(
        orig_header=sam_ori_header,
        chrom_size=target_chrom_sizes,
        prog_name="CrossMap",
        prog_ver=__version__,
        format_ver=1.0,
        sort_type='coordinate',
        co=comments)

    if outfile_prefix is not None:  # write to file
        if file_type == 'SAM':
            out_path, out_mode = outfile_prefix + '.sam', "wh"
        else:  # CRAM input is written back as BAM
            out_path, out_mode = outfile_prefix + '.bam', "wb"
        OUT_FILE = pysam.Samfile(out_path, out_mode, header=new_header)
        printlog(["Liftover " + file_type + " file:", infile, '==>', out_path])
    else:  # write to screen
        out_mode = "w" if file_type == 'SAM' else "wb"
        OUT_FILE = pysam.Samfile('-', out_mode, header=new_header)
        printlog(["Liftover " + file_type + " file:", infile])

    counts = dict.fromkeys(('QF', 'NN', 'NU', 'NM', 'UN', 'UU', 'UM', 'MN',
                            'MU', 'MM', 'SN', 'SM', 'SU'), 0)
    total_item = 0
    try:
        for old_alignment in samfile:
            total_item += 1
            new_alignment = _bam_new_record(old_alignment)
            if old_alignment.is_paired:
                label = _bam_convert_paired(new_alignment, old_alignment,
                                            samfile, mapping, name_to_id,
                                            IS_size, IS_std, fold)
            else:
                label = _bam_convert_single(new_alignment, old_alignment,
                                            samfile, mapping, name_to_id)
            if label is None:
                continue
            counts[label] += 1
            if addtag:
                new_alignment.set_tag(tag=label, value=0)
            OUT_FILE.write(new_alignment)
        printlog(["Done!"])
    finally:
        # Close both handles even when a record fails half-way through.
        OUT_FILE.close()
        samfile.close()

    if outfile_prefix is not None and file_type in ("BAM", "CRAM"):
        unsorted_bam = outfile_prefix + '.bam'
        sorted_bam = outfile_prefix + '.sorted.bam'
        try:
            printlog(['Sort "%s" and save as "%s"' % (unsorted_bam,
                                                      sorted_bam)])
            pysam.sort("-o", sorted_bam, unsorted_bam)
        except Exception:
            printlog(["Warning: ", "output BAM file was NOT sorted"])
        try:
            printlog(['Index "%s" ...' % (sorted_bam)])
            pysam.index(sorted_bam, sorted_bam + '.bai')
        except Exception:
            printlog(["Warning: ", "output BAM file was NOT indexed."])

    print("Total alignments:" + str(total_item))
    print(" QC failed: " + str(counts['QF']))
    if max(counts[key] for key in ('NN', 'NU', 'NM', 'UN', 'UU', 'UM', 'MN',
                                   'MU', 'MM')) > 0:
        print(" Paired-end reads:")
        print("\tR1 unique, R2 unique (UU): " + str(counts['UU']))
        print("\tR1 unique, R2 unmapp (UN): " + str(counts['UN']))
        print("\tR1 unique, R2 multiple (UM): " + str(counts['UM']))
        print("\tR1 multiple, R2 multiple (MM): " + str(counts['MM']))
        print("\tR1 multiple, R2 unique (MU): " + str(counts['MU']))
        print("\tR1 multiple, R2 unmapped (MN): " + str(counts['MN']))
        print("\tR1 unmap, R2 unmap (NN): " + str(counts['NN']))
        print("\tR1 unmap, R2 unique (NU): " + str(counts['NU']))
        print("\tR1 unmap, R2 multiple (NM): " + str(counts['NM']))
    if max(counts['SN'], counts['SU'], counts['SM']) > 0:
        print(" Single-end reads:")
        print("\tUniquley mapped (SU): " + str(counts['SU']))
        print("\tMultiple mapped (SM): " + str(counts['SM']))
        print("\tUnmapped (SN): " + str(counts['SN']))
def store(self, qname, N_mismatch, FR, refname, strand, pos, cigar,
          original_BS, methy, STEVE, rnext=-1, pnext=-1, qual=None,
          output_genome=None, rrbs=False, my_region_serial=None,
          my_region_start=None, my_region_end=None):
    """Write one alignment to ``self.f`` in the format named by ``self.format``.

    BS_SEEKER1 format: one tab-separated text line. The read is first
    trimmed to the span returned by ``get_read_start_end_and_genome_length``
    (soft-clipped bases removed, for backwards compatibility with the old
    format). The three region columns are written only when ``rrbs`` is
    false and all three region values are supplied.

    BAM/SAM format: a ``pysam.AlignedRead``. For '-' strand hits the
    sequence goes through ``reverse_compl_seq`` and the CIGAR is reversed;
    the tags are XO, XS, NM, XM, XG, plus YR/YS/YE when ``rrbs`` is true.

    Any other ``self.format`` value writes nothing.

    Parameters
    ----------
    pos : zero-based position; the text format prints ``pos + 1`` zero-padded
        to 10 digits.
    rnext : -1, or a reference name looked up in ``self.chrom_ids``.
    """
    if self.format == BS_SEEKER1:
        # remove the soft clipped bases from the read
        # this is done for backwards compatibility with the old format
        r_start, r_end, _ = get_read_start_end_and_genome_length(cigar)
        original_BS = original_BS[r_start:r_end]

        region = (my_region_serial, my_region_start, my_region_end)
        # With the default region arguments (None) the long form below would
        # raise TypeError on '%d', so it is used only when every region value
        # is present.
        # NOTE(review): the BAM/SAM branch attaches region tags when rrbs is
        # TRUE, so this rrbs test looks inverted - confirm before changing
        # the output format.
        if rrbs or any(value is None for value in region):
            self.f.write('%s\t%2d\t%s\t%s%s%s\t%s\t%s\t%s\t%d\n' %
                         (qname, N_mismatch, FR, refname, strand,
                          str(pos + 1).zfill(10), output_genome, original_BS,
                          methy, STEVE))
        else:
            self.f.write(
                '%s\t%2d\t%s\t%s%s%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n' %
                (qname, N_mismatch, FR, refname, strand,
                 str(pos + 1).zfill(10), output_genome, original_BS, methy,
                 my_region_serial, my_region_start, my_region_end, STEVE))

    elif self.format == BAM or self.format == SAM:
        forward = strand == '+'

        a = pysam.AlignedRead()
        a.qname = qname
        a.seq = original_BS if forward else reverse_compl_seq(original_BS)
        a.flag = 0 if forward else 0x10
        a.tid = self.chrom_ids[refname]
        a.pos = pos
        a.mapq = 255
        a.cigar = cigar if forward else list(reversed(cigar))
        a.rnext = rnext if rnext == -1 else self.chrom_ids[rnext]
        a.pnext = pnext
        a.qual = qual

        tags = [('XO', FR), ('XS', STEVE), ('NM', N_mismatch),
                ('XM', methy), ('XG', output_genome)]
        if rrbs:
            tags += [('YR', my_region_serial), ('YS', my_region_start),
                     ('YE', my_region_end)]
        a.tags = tuple(tags)

        self.f.write(a)
def mergeChainedAlignedReads(chainedAlignedReads, refSequence, readSequence):
    """Collapse a chain of local alignments into one end-to-end alignment.

    The returned pysam.AlignedRead starts at reference position 0 and its
    CIGAR covers all of refSequence and all of readSequence: reference bases
    not covered by any chained read become deletions (op 2), read bases not
    covered become insertions (op 1), and the match/insert/delete operations
    of each chained read are copied in order. Clipping operations (4, 5) are
    dropped.

    Name, strand and reference id come from the first chained read; the
    stored sequence is reverse-complemented for reverse-strand chains. The
    remaining flag bits, mapq, mate position, insert size, qualities and
    tags are not set here.

    Asserts that all chained reads share one strand, that they are ordered
    and non-overlapping on both reference and read, and that the final CIGAR
    lengths equal the two sequence lengths.
    """
    merged = pysam.AlignedRead()
    head = chainedAlignedReads[0]
    merged.qname = head.qname
    merged.rnext = -1
    merged.pos = 0
    merged.is_reverse = head.is_reverse
    onReverse = merged.is_reverse
    merged.seq = reverseComplement(readSequence) if onReverse else readSequence
    merged.rname = head.rname

    refLength = len(refSequence)
    readLength = len(readSequence)

    ops = []
    refCursor = 0
    # Reverse-strand chains walk read offsets from the other end of the read.
    readCursor = -(readLength - 1) if onReverse else 0

    for segment in chainedAlignedReads:
        assert onReverse == segment.is_reverse

        # Deletion for the reference bases skipped before this segment.
        segmentRefStart = segment.pos
        assert segmentRefStart >= refCursor
        refGap = segmentRefStart - refCursor
        if refGap > 0:
            ops.append((2, refGap))
            refCursor = segmentRefStart

        # Insertion for the read bases skipped before this segment.
        segmentReadStart = getAbsoluteReadOffset(segment, refSequence,
                                                 readSequence)
        assert segmentReadStart >= readCursor
        readGap = segmentReadStart - readCursor
        if readGap > 0:
            ops.append((1, readGap))
            readCursor = segmentReadStart

        # Copy the segment's own operations, leaving out the clipping.
        for op, length in segment.cigar:
            assert op in (0, 1, 2, 4, 5)
            if op not in (0, 1, 2):
                continue
            ops.append((op, length))
            if op != 1:  # match or deletion consumes reference
                refCursor += length
            if op != 2:  # match or insertion consumes read
                readCursor += length

    # Trailing deletion up to the end of the reference.
    assert refCursor <= refLength
    if refCursor < refLength:
        ops.append((2, refLength - refCursor))

    # Trailing insertion up to the end of the read.
    if onReverse:
        assert readCursor <= 1
        if readCursor < 1:
            ops.append((1, 1 - readCursor))
    else:
        assert readCursor <= readLength
        if readCursor < readLength:
            ops.append((1, readLength - readCursor))

    # Check coordinates.
    assert sum(length for op, length in ops if op in (0, 2)) == refLength
    assert sum(length for op, length in ops if op in (0, 1)) == readLength

    merged.cigar = tuple(ops)
    return merged
samwrite=sys.argv[2] samfile=pysam.Samfile(samuse,'rb') reads=pysam.Samfile(samwrite, "wb", template=samfile) def get_bit(byteval,idx): return ((byteval&(1<<idx))!=0); for alignedread in samfile.fetch(): reads.write(alignedread) originalflag=alignedread.flag if(alignedread.tags[0][0] == 'X0'): try: for newrec in alignedread.opt('XA').split(';'): if(newrec != ''): s=newrec.split(',') a = pysam.AlignedRead() a = alignedread a.flag=originalflag if(get_bit(alignedread.flag,4) and int(s[1]) < 0 and get_bit(alignedread.flag,7)): a.flag = int(0x80) elif(get_bit(alignedread.flag,4) and int(s[1]) > 0 and get_bit(alignedread.flag,7)): a.flag= int(0x10) + int(0x80) elif(get_bit(alignedread.flag,4) and int(s[1]) < 0 and get_bit(alignedread.flag,6)): a.flag= int(0x40) elif(get_bit(alignedread.flag,4) and int(s[1]) > 0 and get_bit(alignedread.flag,6)): a.flag=int(0x10) + int(0x40) elif(int(s[1]) > 0 and get_bit(alignedread.flag,7)): a.flag= int(0x80) elif(int(s[1]) < 0 and get_bit(alignedread.flag,7)): a.flag= int(0x10) + int(0x80) elif(int(s[1]) > 0 and get_bit(alignedread.flag,6)):