def filter_func(aln): # False means this read will be filtered out filter_bool = all( q >= quality_threshold for q in pysam.qualitystring_to_array( aln.get_tag(BAM_CONSTANTS["RAW_CELL_BC_QUALITY_TAG"]) )) and all(q >= quality_threshold for q in pysam.qualitystring_to_array( aln.get_tag(BAM_CONSTANTS["UMI_QUALITY_TAG"]))) nonlocal n_filtered n_filtered += not filter_bool return filter_bool
def test_pysam():
    """Smoke-test pysam: write a one-read BAM file, then sort it with samtools.

    Both files are created inside a temporary directory so the test leaves
    nothing behind in the current working directory.
    """
    import os
    import tempfile

    import pysam

    # Header and read adapted from the pysam documentation:
    # https://pysam.readthedocs.io/en/latest/usage.html#creating-bam-cram-sam-files-from-scratch
    header = {
        'HD': {'VN': '1.0'},
        'SQ': [{'LN': 1575, 'SN': 'chr1'},
               {'LN': 1584, 'SN': 'chr2'}],
    }

    with tempfile.TemporaryDirectory() as tmp_dir:
        file_name = os.path.join(tmp_dir, "out.bam")
        sorted_name = os.path.join(tmp_dir, "sorted.bam")

        with pysam.AlignmentFile(file_name, "wb", header=header) as outf:
            a = pysam.AlignedSegment()
            a.query_name = "read_28833_29006_6945"
            a.query_sequence = "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"
            a.flag = 99
            a.reference_id = 0
            a.reference_start = 32
            a.mapping_quality = 20
            a.cigar = ((0, 10), (2, 1), (0, 25))
            a.next_reference_id = 0
            a.next_reference_start = 199
            a.template_length = 167
            a.query_qualities = pysam.qualitystring_to_array(
                "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
            a.tags = (("NM", 1), ("RG", "L1"))
            outf.write(a)

        # Verify output file exists
        assert os.path.isfile(file_name)

        # Call samtools to sort the file.
        # This will fail if the file is not a valid BAM file.
        pysam.sort("-o", sorted_name, file_name)
        assert os.path.isfile(sorted_name)
def convert_to_AlignedSegment(header, sequence, quality, barcode_sequence,
                              umi_sequence):
    """Build an unmapped pysam.AlignedSegment carrying barcode and UMI tags.

    The following tags are set on the returned segment:

        Tag     Value
        "B0"    barcode_sequence
        "B3"    umi_sequence
        "RG"    "0"

    :param header: string with the header information; only the part before
        the first whitespace is used as the read name
    :param sequence: string with the DNA/RNA sequence
    :param quality: string with the base calling quality values
    :param barcode_sequence: string with the barcode sequence
    :param umi_sequence: string with the unique molecular identifier sequence
    :returns: the newly created pysam.AlignedSegment
    """
    segment = pysam.AlignedSegment()

    # Read names must not contain whitespace, so keep the first token only.
    segment.query_name = header.split()[0]
    segment.query_sequence = sequence
    segment.query_qualities = pysam.qualitystring_to_array(quality)

    # Mark the read as unmapped.
    segment.flag |= pysam.FUNMAP

    for tag_name, tag_value in (('B0', barcode_sequence),
                                ('B3', umi_sequence),
                                ('RG', '0')):
        segment.set_tag(tag_name, tag_value)

    return segment
def unmapped_read(fq_read, sam_flag, chromium_tags, trim=None):
    """Turn a FASTQ read into an unaligned pysam.AlignedSegment.

    :param fq_read: object exposing ``name``, ``sequence`` and ``quality``
    :param sam_flag: value stored as the SAM flag
    :param chromium_tags: tags assigned to the segment as-is
    :param trim: when truthy, only the first ``trim`` bases and qualities
        are kept
    :returns: the new segment with all reference/mate fields set to -1
    """
    bases = fq_read.sequence[0:trim] if trim else fq_read.sequence
    quals = fq_read.quality[0:trim] if trim else fq_read.quality

    segment = pysam.AlignedSegment()
    segment.query_name = fq_read.name
    segment.query_sequence = bases
    segment.query_qualities = pysam.qualitystring_to_array(quals)
    segment.flag = sam_flag

    # No reference and no mate position for an unaligned read.
    for field in ("reference_id", "reference_start",
                  "next_reference_id", "next_reference_start"):
        setattr(segment, field, -1)

    segment.tags = chromium_tags
    return segment
def generate_read(self, read_length, query_name, cb, ub):
    """Create a perfectly matching read at a random reference position.

    A chromosome is drawn at random from ``self.chromosome2length``, then a
    start offset is drawn on it; the read copies ``read_length`` bases from
    ``self.chromosome2sequence``.

    :param read_length: number of bases in the read
    :param query_name: name given to the read
    :param cb: value stored in the "CB" tag
    :param ub: value stored in the "UB" tag
    :returns: the populated pysam.AlignedSegment
    """
    chrom_index = np.random.randint(len(self.chromosome2length))
    chrom_name, chrom_size = list(
        self.chromosome2length.items())[chrom_index]
    chrom_seq = self.chromosome2sequence[chrom_name]
    offset = np.random.randint(0, chrom_size - read_length)

    # straight mapping
    read = pysam.AlignedSegment()
    read.query_name = query_name
    read.query_sequence = ''.join(chrom_seq[offset:offset + read_length])
    # flag taken from pysam example, did not analyze
    read.flag = 99
    read.reference_id = chrom_index
    read.reference_start = offset
    read.mapping_quality = 255
    read.cigar = ((0, read_length), )
    # The mate fields (next_reference_id / next_reference_start) are left unset.
    read.template_length = read_length
    read.query_qualities = pysam.qualitystring_to_array("<" * read_length)
    read.tags = (
        ("NM", 1),
        ("RG", "L1"),
        ("NH", 1),
        # normally should also add number of mutations compared to reference
        ("AS", read_length - 2),
        ("CB", cb),
        ("UB", ub),
    )
    return read
def print_as_BAM(linked, header, path):
    """Write every group of linked introns to a BAM file, one record each.

    :param linked: iterable of intron groups; each intron unpacks as
        ``(chrom, start, end, strand)``
    :param header: header handed to pysam.AlignmentFile
    :param path: path of the BAM file to create
    """
    with pysam.AlignmentFile(path, 'wb', header=header) as bam_out:
        for group_index, group in enumerate(linked):
            ordered = sort_by_pos(group)
            n_introns = len(ordered)
            final_index = n_introns - 1

            # Span from the first intron's start to the last intron's end;
            # zero when the group holds a single intron.
            span = ordered[-1][2] - ordered[0][1] + 1 if n_introns > 1 else 0

            # Each intron becomes its own BAM entry.
            for position, (chrom, start, end, _strand) in enumerate(ordered):
                length = end - start + 1
                if position < final_index:
                    next_ref = ordered[position + 1][1]
                else:
                    # The last entry points back at the first one and
                    # carries the negated template length.
                    next_ref = ordered[0][1]
                    span = -span

                record = pysam.AlignedSegment()
                record.query_name = 'linked' + str(group_index)
                record.query_sequence = 'N' * length
                record.flag = 0
                record.reference_id = chrom
                record.reference_start = start
                record.mapping_quality = 60  # 60 = uniquely mapped for HISAT2
                record.cigartuples = [(0, length)]
                record.next_reference_id = chrom
                record.next_reference_start = next_ref
                record.template_length = span
                record.query_qualities = pysam.qualitystring_to_array(
                    '/' * length)
                record.tags = [('XN', next_ref + 1), ('XI', n_introns)]
                bam_out.write(record)
def compose_aln(x):
    """Compose an unaligned alignment from a barcode matching result.

    Parameters
    ----------
    x : tuple or list
        A cell barcode matching result with seven items: read name, read 1
        sequence, read 1 quality, read 2 sequence, read 2 quality, matched
        barcode and distance. The output of 'match_cell_barcodes' function.

    Returns
    -------
    AlignedSegment
        Unaligned read 2 with cell barcode matching result as tags.
    """
    (read_name, read1_seq, _read1_qual,
     read2_seq, read2_qual, barcode, distance) = x

    segment = pysam.AlignedSegment()
    segment.query_name = read_name.split(' ')[0]
    segment.flag = 0x4
    segment.template_length = len(read2_seq)
    segment.query_sequence = read2_seq
    segment.query_qualities = pysam.qualitystring_to_array(read2_qual)
    segment.tags = [
        ('RG', 'fba'),
        ('R1', read1_seq),
        ('CB', barcode),
        ('CM', distance),
    ]
    return segment
def quality_string_to_array(quality_string):
    """Decode a quality string into phred scores.

    Thin wrapper around ``pysam.qualitystring_to_array``.

    :param quality_string: Quality string.
    :returns: Array of scores.
    :rtype: array
    """
    scores = pysam.qualitystring_to_array(quality_string)
    return scores
def test_bamread_get_quals(simple_bam_reads):
    """bamread_get_quals returns stored qualities, or the OQ tag on request."""
    bamread = simple_bam_reads[0]
    bamread.set_tag('OQ', '(' * 17)

    stored = np.array(pysam.qualitystring_to_array('==99=?<*+/5:@A99:'))
    assert np.array_equal(read.bamread_get_quals(bamread), stored)

    # '(' decodes to a quality of 7.
    from_oq = read.bamread_get_quals(bamread, use_oq=True)
    assert np.array_equal(from_oq, np.full(17, 7))
def createBam(chromosome, positions, bamFile, readSize, outFile):
    """Write a BAM file holding one randomly generated read per position.

    A single random sequence, quality string and mapping quality are drawn
    once and shared by all reads; the strand flag and the tag values are
    drawn anew for every read.

    :param chromosome: name of the only reference sequence in the output
    :param positions: iterable of 0-based start positions, one read each
    :param bamFile: BAM file from which the chromosome length is looked up
    :param readSize: length of every generated read
    :param outFile: path of the BAM file to create
    """
    chromInfo = bamReader.getChromosomeInfromationFromBAM(bamFile)
    header = {
        'HD': {'VN': '1.0'},
        'SQ': [{'LN': chromInfo[chromosome], 'SN': chromosome}],
    }

    CIGAR = str(readSize) + 'M'
    DNA = ['A', 'T', 'G', 'C']
    qualityScores = [
        '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'
    ]
    flags = [0, 16]
    MAPQs = [3, 8, 23, 24, 40, 42]

    def random_tags():
        # MD is a string-typed tag in the SAM specification, so the read
        # length is stored as text rather than as an integer.
        return (("XN", random.choice(range(0, 15))),
                ("XM", random.choice(range(0, 20))),
                ("XO", random.choice(range(0, 3))),
                ("XG", random.choice(range(0, 9))),
                ("NM", random.choice(range(0, 15))),
                ("MD", str(readSize)),
                ("YT", "UU"))

    # NOTE(review): the original drew one tag set and one flag here without
    # using them. The draws are kept so a seeded `random` stream is consumed
    # exactly as before -- consider dropping them.
    random_tags()
    seq = ''.join(np.random.choice(DNA, readSize).tolist())
    qual = ''.join(np.random.choice(qualityScores, readSize).tolist())
    random.choice(flags)
    mapq = random.choice(MAPQs)

    with py.AlignmentFile(outFile, "wb", header=header) as outf:
        for counter, pos in enumerate(positions):
            a = py.AlignedSegment()
            a.query_name = "randomlyGenerated_CK_" + str(counter)
            a.query_sequence = seq
            a.flag = random.choice(flags)
            a.reference_id = 0
            a.reference_start = pos
            a.mapping_quality = mapq
            a.cigarstring = CIGAR
            a.next_reference_id = 0
            a.next_reference_start = 0
            a.template_length = 0
            a.query_qualities = py.qualitystring_to_array(qual)
            a.tags = random_tags()
            outf.write(a)
def write_to_bam(self, f):
    """Write this read to the open alignment file ``f``.

    An unsplit read is written as a single record.  A split read is written
    as one record per entry of ``self.splits``; entries whose slice of the
    sequence is empty are skipped, and the written pieces are named with the
    suffixes ``_a``, ``_b``, ... inserted before ``_Count=``.

    :param f: open pysam.AlignmentFile; its ``references`` list is used to
        look up the index of ``self.contig``
    """
    if self.is_split_read == False:  # noqa: E712 - comparison kept as written
        whole = pysam.AlignedSegment()
        whole.query_name = self.name
        whole.query_sequence = self.seq
        whole.flag = 0
        whole.reference_id = f.references.index(self.contig)
        whole.reference_start = self.start_pos
        whole.mapping_quality = 60
        whole.cigar = self.get_cigar()
        whole.query_qualities = pysam.qualitystring_to_array(self.qual)
        whole.tags = (("NM", self.nmtag), ("RG", "L1"))
        f.write(whole)
        return

    written = 0
    for piece in self.splits:
        record = pysam.AlignedSegment()
        if type(piece) is tuple:
            ref_start = piece[0]
            start = piece[0] - self.start_pos
            end = piece[1] - self.start_pos
        else:
            ref_start = piece
            start = piece - self.start_pos
            end = len(self.seq)
        record.reference_start = ref_start
        record.query_sequence = self.seq[start:end]

        # NOTE(review): the source's indentation was lost; this assumes that
        # everything from naming through f.write() was guarded by the
        # "sequence is non-empty" check -- confirm.
        if not record.query_sequence:
            continue

        name_parts = self.name.split('_Count=')
        record.query_name = '{}_{}_Count={}'.format(
            name_parts[0], chr(written + 97), name_parts[1])
        written += 1
        record.flag = 0
        record.reference_id = f.references.index(self.contig)
        record.mapping_quality = 60

        # add 1 for each deletion
        cigar_end = end + self.cigarstring[start:end].count('2')
        record.cigar = tuple(
            (int(op), len(list(run)))
            for op, run in groupby(self.cigarstring[start:cigar_end]))
        record.query_qualities = pysam.qualitystring_to_array(
            self.qual)[start:end]
        record.tags = (("NM", self.nmtag), ("RG", "L1"))
        f.write(record)
def write_to_bam(self, f):
    """Write this read as a single record to the open alignment file ``f``.

    :param f: open pysam.AlignmentFile; its ``references`` list is used to
        look up the index of ``self.contig``
    """
    record = pysam.AlignedSegment()
    fields = (
        ("query_name", self.name),
        ("query_sequence", self.seq),
        ("flag", 0),
        ("reference_id", f.references.index(self.contig)),
        ("reference_start", self.start_pos),
        ("mapping_quality", 60),
        ("cigar", self.get_cigar()),
        ("query_qualities", pysam.qualitystring_to_array(self.qual)),
        ("tags", (("NM", self.nmtag), ("RG", "L1"))),
    )
    # Assigned in this order on purpose: the sequence is set before the
    # qualities.
    for attribute, value in fields:
        setattr(record, attribute, value)
    f.write(record)
def iter_sam_record(self, reference_ids, tags=None):
    """Yield one pysam.AlignedSegment for every record of ``self.iter_raw()``.

    A record is treated as a reverse-strand hit when its subject end lies
    before its subject start; its aligned sequences are then
    reverse-complemented.  Unaligned query ends are represented as hard
    clips.  Every segment gets a fixed mapping quality of 30 and a constant
    base quality of '+'.

    :param reference_ids: mapping from subject name to reference index
    :param tags: unused
    :returns: generator of pysam.AlignedSegment
    """
    mapq = 30
    bq = '+'
    # TODO secondary alignment flag
    for rec in self.iter_raw():
        is_reverse = int(rec.send) < int(rec.sstart)
        if is_reverse:
            qaln = dna_revcomp(rec.qseq)
            saln = dna_revcomp(rec.sseq)
            ref_start = int(rec.send) - 1
            lclip = int(rec.qstart) - 1
            # Was `int(rec.qend) - int(rec.qend)`, which is always 0 and so
            # never emitted this clip; mirrors the forward branch below.
            rclip = int(rec.qlen) - int(rec.qend)
        else:
            qaln = rec.qseq
            saln = rec.sseq
            ref_start = int(rec.sstart) - 1
            lclip = int(rec.qlen) - int(rec.qend)
            rclip = int(rec.qstart) - 1
        # NOTE(review): the forward branch puts the bases after qend on the
        # left and the bases before qstart on the right -- confirm the clip
        # sides are not swapped between the two strands.

        cigar = aln2cigar(qaln, saln)
        qseq = qaln.replace('-', '')
        if lclip > 0:
            cigar.prepend((cigar.H, lclip))
        if rclip > 0:
            cigar.append((cigar.H, rclip))

        try:
            edit = int(rec.mismatches) + int(rec.gaps)
        except (AttributeError, TypeError, ValueError):
            # Mismatch/gap counts missing or not numeric.
            edit = -1

        a = pysam.AlignedSegment()
        a.query_name = rec.qname.encode('ascii')
        a.query_sequence = qseq.encode('ascii')
        a.reference_id = reference_ids[rec.sname]
        a.flag = (16 if is_reverse else 0)
        a.reference_start = ref_start
        a.mapping_quality = mapq
        a.cigar = cigar.values
        a.next_reference_id = -1
        a.next_reference_start = -1
        try:
            a.template_length = int(rec.slen)
        except (AttributeError, TypeError, ValueError, OverflowError):
            # Subject length unavailable or unusable: keep the default.
            pass
        a.query_qualities = pysam.qualitystring_to_array(bq * len(qseq))
        a.tags = [
            ("AS", float(rec.bit_score)),  # alignment score
            ("NM", edit),  # edit distance
            ("ZE", float(rec.evalue)),  # E-value
        ]
        yield a
def _string_to_aligned_segment(line, seq_dict, log_output):
    """Converts SAM record in string format to pysam AlignedRead

    Fields are split on tabs, as the SAM format prescribes, so tag values
    containing spaces stay intact.  A line without any tab is split on
    arbitrary whitespace instead, as earlier versions did for every line.

    Args:
        line: String of SAM record
        seq_dict: Dictionary mapping reference ID to reference ID index
        log_output: Handle for outputting log information

    Returns:
        aligned_segment: pysam AlignedRead class with values from 'line'

    Raises:
        Exception: if an optional field has an unrecognised type code
    """
    stripped = line.strip()
    line = stripped.split("\t") if "\t" in stripped else stripped.split()

    aligned_segment = AlignedRead()
    aligned_segment.query_name = line[0]
    aligned_segment.flag = int(line[1])
    if line[2] != "*":
        aligned_segment.reference_id = seq_dict[line[2]]
    # SAM positions are 1-based, pysam positions are 0-based.
    aligned_segment.reference_start = int(line[3]) - 1
    aligned_segment.mapping_quality = int(line[4])

    # Parse the CIGAR string into (operation, length) tuples.
    cigartuples = []
    pos = ""
    for symbol in line[5]:
        if symbol.isdigit():
            pos += symbol
        elif symbol == "*":
            continue
        else:
            cigartuples.append((_CIGAR_OPERATIONS[symbol], int(pos)))
            pos = ""
    aligned_segment.cigartuples = cigartuples

    # "=" means the mate lies on the same reference as the read itself.
    if line[6] == "=":
        aligned_segment.next_reference_id = seq_dict[line[2]]
    elif line[6] != "*":
        aligned_segment.next_reference_id = seq_dict[line[6]]
    aligned_segment.next_reference_start = int(line[7]) - 1
    aligned_segment.template_length = int(line[8])
    aligned_segment.query_sequence = line[9]
    aligned_segment.query_qualities = qualitystring_to_array(line[10])

    for field in line[11::]:
        tag, tag_type, val = field.split(":", maxsplit=2)
        if tag_type == "i":
            val = int(val)
        elif tag_type == "f":
            val = float(val)
        elif tag_type == "H":
            val = bytearray.fromhex(val)
        elif tag_type == "B":
            items = val.split(",")
            # Per the SAM specification a B array starts with a one-letter
            # element type (e.g. "B:c,1,2,3"); it is not a number.  Arrays
            # without that letter are still read as integers, as before.
            subtype = None
            if items[0] in ("c", "C", "s", "S", "i", "I", "f"):
                subtype = items.pop(0)
            convert = float if subtype == "f" else int
            val = [convert(i) for i in items]
        elif not (tag_type == "A" or tag_type == "Z"):
            err_msg = "Optional Ttag type '{}' not recognised".format(tag_type)
            log_output.write("ERROR: {}\n".format(err_msg))
            raise Exception(err_msg)
        aligned_segment.set_tag(tag, val, value_type=tag_type)

    return aligned_segment
def corrected_reads(self, **kwargs):
    """Yield a consensus-corrected copy of every read in the alignment.

    Within each read, every position that is not among the covarying sites
    returned by ``self.multiple_testing_correction()`` is overwritten with
    the consensus base.  When ``self.end_correction`` is set, the first and
    last ``end_correction`` reference positions are additionally forced to
    consensus.  Each yielded read has a single all-match CIGAR and a
    constant '<' quality string.

    :param kwargs: unused
    :returns: generator of pysam.AlignedSegment
    """
    end_correction = self.end_correction
    nucleotide_counts = self.get_nucleotide_counts()
    self.full_covariation_test()
    covarying_sites = self.multiple_testing_correction()
    if end_correction:
        tail_cutoff = self.reference_length - end_correction

    for read in self.pysam_alignment.fetch():
        sequence, _ = self.read_count_data(read)
        intraread_covarying_sites = covarying_sites[
            (covarying_sites >= read.reference_start) &
            (covarying_sites < read.reference_end)
        ]
        # True marks the positions to overwrite with consensus.  The builtin
        # `bool` replaces the `np.bool` alias, which NumPy has removed.
        mask = np.ones(len(sequence), bool)
        mask[intraread_covarying_sites - read.reference_start] = False
        local_consensus = nucleotide_counts.consensus[
            read.reference_start: read.reference_end
        ]
        sequence[mask] = local_consensus[mask]

        if end_correction:
            if read.reference_start < end_correction:
                query_index = end_correction - read.reference_start
                query_correction = nucleotide_counts.consensus[
                    read.reference_start: end_correction
                ]
                sequence[0: query_index] = query_correction
            if read.reference_end > tail_cutoff:
                correction_length = read.reference_end - tail_cutoff
                query_correction = nucleotide_counts.consensus[
                    tail_cutoff: tail_cutoff + correction_length
                ]
                sequence[-correction_length:] = query_correction

        corrected_read = pysam.AlignedSegment()
        corrected_read.query_name = read.query_name
        corrected_read.query_sequence = ''.join(sequence)
        corrected_read.flag = read.flag
        corrected_read.reference_id = 0
        corrected_read.reference_start = read.reference_start
        corrected_read.mapping_quality = read.mapping_quality
        corrected_read.cigar = [(0, len(sequence))]
        corrected_read.next_reference_id = read.next_reference_id
        corrected_read.next_reference_start = read.next_reference_start
        corrected_read.template_length = read.template_length
        corrected_read.query_qualities = pysam.qualitystring_to_array(
            len(sequence) * '<'
        )
        corrected_read.tags = read.tags

        yield corrected_read
def test_get_aligned_pairs_padding(self):
    """get_aligned_pairs() raises for a CIGAR containing a padding op (6)."""
    read = pysam.AlignedSegment()
    read.query_name = "read_12345"
    read.query_sequence = "ACGT" * 10
    read.flag = 0
    read.reference_id = 0
    read.reference_start = 20
    read.mapping_quality = 20
    read.cigartuples = ((7, 20), (6, 1), (8, 19))
    read.query_qualities = pysam.qualitystring_to_array("1234" * 10)

    # padding is not being handled right now
    with self.assertRaises(NotImplementedError):
        read.get_aligned_pairs()
def test_readdata_from_bamread(simple_bam_reads):
    """ReadData.from_bamread picks up qualities, read group and strand."""
    bamread = simple_bam_reads[0]
    stored_quals = np.array(
        pysam.qualitystring_to_array('==99=?<*+/5:@A99:'))

    # Plain read: stored qualities, no read group.
    record = read.ReadData.from_bamread(bamread)
    assert np.array_equal(record.qual, stored_quals)
    assert record.rg is None

    # With OQ and RG tags set, use_oq switches to the original qualities.
    bamread.set_tag('OQ', '(' * 17)
    bamread.set_tag('RG', 'foo')
    record = read.ReadData.from_bamread(bamread, use_oq=True)
    assert np.array_equal(record.qual, np.array([7] * 17))
    assert record.rg == 'foo'

    # the rg 0 from conftest.py has int 0
    assert read.ReadData.rg_to_int[None] == 0
    assert read.ReadData.rg_to_int['foo'] == 1
    assert read.ReadData.numrgs == 2

    # Reverse-strand read: qualities come back flipped.
    bamread.is_reverse = True
    record = read.ReadData.from_bamread(bamread)
    assert np.array_equal(record.qual, np.flip(stored_quals))
    assert np.array_equal(record.seq, np.array(list('CAGTATCCTTTATCTAA')))

    # Reset the class-level read-group bookkeeping for later tests.
    read.ReadData.rg_to_pu = dict()
    read.ReadData.rg_to_int = dict()
    read.ReadData.numrgs = 0
def test_get_aligned_pairs_match_mismatch(self):
    """'=' (7) and 'X' (8) operations both pair query and reference 1:1."""
    read = pysam.AlignedSegment()
    read.query_name = "read_12345"
    read.query_sequence = "ACGT" * 10
    read.flag = 0
    read.reference_id = 0
    read.reference_start = 20
    read.mapping_quality = 20
    read.cigartuples = ((7, 20), (8, 20))
    read.query_qualities = pysam.qualitystring_to_array("1234" * 10)

    expected = list(zip(range(0, 0 + 40), range(20, 20 + 40)))
    self.assertEqual(read.get_aligned_pairs(), expected)
    self.assertEqual(read.get_aligned_pairs(True), expected)
def build_read(self):
    '''Return a 40-base example read with a deletion and an insertion.'''
    read = pysam.AlignedSegment()
    read.query_name = "read_12345"
    read.query_sequence = "ACGT" * 10
    read.flag = 0
    read.reference_id = 0
    read.reference_start = 20
    read.mapping_quality = 20
    # 10M 1D 9M 1I 20M
    read.cigartuples = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20))
    read.next_reference_id = 0
    read.next_reference_start = 200
    read.template_length = 167
    read.query_qualities = pysam.qualitystring_to_array("1234" * 10)
    return read
def testUpdate2(self):
    '''issue 135: in-place update of sequence and quality scores.

    Assigning a new sequence erases the quality scores, so the qualities
    must be saved beforehand and re-assigned afterwards.
    '''
    # Updating the sequence alone leaves the read without qualities.
    trimmed = self.buildRead()
    trimmed.query_sequence = trimmed.query_sequence[5:10]
    self.assertIsNone(
        pysam.qualities_to_qualitystring(trimmed.query_qualities))

    # Saving the qualities first allows restoring the matching slice.
    restored = self.buildRead()
    saved = pysam.qualities_to_qualitystring(restored.query_qualities)
    restored.query_sequence = restored.query_sequence[5:10]
    restored.query_qualities = pysam.qualitystring_to_array(saved[5:10])
    self.assertEqual(
        pysam.qualities_to_qualitystring(restored.query_qualities),
        saved[5:10])
def buildRead(self):
    '''Return a 40-base example read with a deletion and an insertion.'''
    read = pysam.AlignedSegment()
    read.query_name = "read_12345"
    read.query_sequence = "ACGT" * 10
    read.flag = 0
    read.reference_id = 0
    read.reference_start = 20
    read.mapping_quality = 20
    # 10M 1D 9M 1I 20M
    read.cigartuples = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20))
    read.next_reference_id = 0
    read.next_reference_start = 200
    read.template_length = 167
    read.query_qualities = pysam.qualitystring_to_array("1234" * 10)
    # todo: create tags
    return read
def test_get_aligned_pairs_hard_clipping(self):
    """Hard-clipped bases (op 5) do not appear in the aligned pairs."""
    read = pysam.AlignedSegment()
    read.query_name = "read_12345"
    read.query_sequence = "ACGT" * 10
    read.flag = 0
    read.reference_id = 0
    read.reference_start = 20
    read.mapping_quality = 20
    read.cigartuples = ((5, 2), (0, 35), (5, 3))
    read.query_qualities = pysam.qualitystring_to_array("1234" * 10)

    # No seq, no seq pos: only the 35 matched bases are paired.
    expected = list(zip(range(0, 0 + 35), range(20, 20 + 35)))
    self.assertEqual(read.get_aligned_pairs(), expected)
    self.assertEqual(read.get_aligned_pairs(True), expected)
def testLargeRead(self):
    '''Build (and return) an 800-base example read with one match block.'''
    repeats = 200
    read = pysam.AlignedSegment()
    read.query_name = "read_12345"
    read.query_sequence = "ACGT" * repeats
    read.flag = 0
    read.reference_id = 0
    read.reference_start = 20
    read.mapping_quality = 20
    read.cigartuples = ((0, 4 * repeats), )
    read.next_reference_id = 0
    read.next_reference_start = 200
    read.template_length = 167
    read.query_qualities = pysam.qualitystring_to_array("1234" * repeats)
    return read
def build_read(self):
    '''Return an example read built without header information.

    The read's own reference id is -1.
    '''
    read = pysam.AlignedSegment()
    read.query_name = "read_12345"
    read.query_sequence = "ATGC" * 10
    read.flag = 0
    read.reference_id = -1
    read.reference_start = 20
    read.mapping_quality = 20
    # 10M 1D 9M 1I 20M
    read.cigartuples = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20))
    read.next_reference_id = 0
    read.next_reference_start = 200
    read.template_length = 167
    read.query_qualities = pysam.qualitystring_to_array("1234" * 10)
    # todo: create tags
    return read
def testLargeRead(self):
    '''Build (and return) an 800-base example read with reference id -1.'''
    repeats = 200
    read = pysam.AlignedSegment()
    read.query_name = "read_12345"
    read.query_sequence = "ATGC" * repeats
    read.flag = 0
    read.reference_id = -1
    read.reference_start = 20
    read.mapping_quality = 20
    read.cigartuples = ((0, 4 * repeats), )
    read.next_reference_id = 0
    read.next_reference_start = 200
    read.template_length = 167
    read.query_qualities = pysam.qualitystring_to_array("1234" * repeats)
    return read
def _write_header_to_sam(self, header, sf_header_sam_file):
    """Write ``header`` to a text SAM file together with one placeholder read.

    :param header: header handed to pysam.AlignmentFile
    :param sf_header_sam_file: path of the SAM file to create
    """
    with pysam.AlignmentFile(
            sf_header_sam_file, "w", header=header) as sam_out:
        # write the new header into file
        placeholder = pysam.AlignedSegment()
        fields = (
            ("query_name", "read_28833_29006_6945_tmp"),
            ("query_sequence", "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"),
            ("flag", 99),
            ("reference_id", 0),
            ("reference_start", 32),
            ("mapping_quality", 20),
            ("cigar", ((0, 10), (2, 1), (0, 25))),
            ("next_reference_id", 0),
            ("next_reference_start", 199),
            ("template_length", 167),
            ("query_qualities", pysam.qualitystring_to_array(
                "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")),
        )
        # Assigned in this order on purpose: the sequence is set before the
        # qualities.
        for attribute, value in fields:
            setattr(placeholder, attribute, value)
        sam_out.write(placeholder)
def build_read(self):
    '''Return an example read bound to a two-chromosome header.'''
    reference_names = ["chr1", "chr2"]
    reference_lengths = [10000000, 10000000]
    header = pysam.AlignmentHeader.from_references(reference_names,
                                                   reference_lengths)

    read = pysam.AlignedSegment(header)
    read.query_name = "read_12345"
    read.query_sequence = "ATGC" * 10
    read.flag = 0
    read.reference_id = 0
    read.reference_start = 20
    read.mapping_quality = 20
    # 10M 1D 9M 1I 20M
    read.cigartuples = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20))
    read.next_reference_id = 0
    read.next_reference_start = 200
    read.template_length = 167
    read.query_qualities = pysam.qualitystring_to_array("1234" * 10)
    return read
def build_read(self):
    '''Return an example read bound to a two-chromosome header.'''
    reference_names = ["chr1", "chr2"]
    reference_lengths = [10000000, 10000000]
    header = pysam.AlignmentHeader.from_references(reference_names,
                                                   reference_lengths)

    read = pysam.AlignedSegment(header)
    read.query_name = "read_12345"
    read.query_sequence = "ATGC" * 10
    read.flag = 0
    read.reference_id = 0
    read.reference_start = 20
    read.mapping_quality = 20
    # 10M 1D 9M 1I 20M
    read.cigartuples = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20))
    read.next_reference_id = 0
    read.next_reference_start = 200
    read.template_length = 167
    read.query_qualities = pysam.qualitystring_to_array("1234" * 10)
    return read
def test_get_aligned_pairs_skip(self):
    """A reference skip (op 3) yields (None, refpos) pairs unless filtered."""
    read = pysam.AlignedSegment()
    read.query_name = "read_12345"
    read.query_sequence = "ACGT" * 10
    read.flag = 0
    read.reference_id = 0
    read.reference_start = 20
    read.mapping_quality = 20
    read.cigartuples = ((0, 2), (3, 100), (0, 38))
    read.query_qualities = pysam.qualitystring_to_array("1234" * 10)

    before_skip = [(0, 20), (1, 21)]
    skipped = [(None, ref_pos) for ref_pos in range(22, 22 + 100)]
    after_skip = list(zip(range(2, 2 + 38),
                          range(20 + 2 + 100, 20 + 2 + 100 + 38)))

    self.assertEqual(read.get_aligned_pairs(),
                     before_skip + skipped + after_skip)
    # With matches_only the skipped reference positions are left out.
    self.assertEqual(read.get_aligned_pairs(True),
                     before_skip + after_skip)
def test_get_aligned_pairs_soft_clipping(self):
    """Soft-clipped bases (op 4) pair with None unless filtered out."""
    read = pysam.AlignedSegment()
    read.query_name = "read_12345"
    read.query_sequence = "ACGT" * 10
    read.flag = 0
    read.reference_id = 0
    read.reference_start = 20
    read.mapping_quality = 20
    read.cigartuples = ((4, 2), (0, 35), (4, 3))
    read.query_qualities = pysam.qualitystring_to_array("1234" * 10)

    clipped_head = [(0, None), (1, None)]
    aligned = list(zip(range(2, 2 + 35), range(20, 20 + 35)))
    clipped_tail = [(37, None), (38, None), (39, None)]

    self.assertEqual(read.get_aligned_pairs(),
                     clipped_head + aligned + clipped_tail)
    # With matches_only the soft-clipped ends are left out.
    self.assertEqual(read.get_aligned_pairs(True), aligned)
def test_realign_rc(genome_source):
    """A minus-strand sequence realigns as reverse, with reversed qualities."""
    import warnings

    read = pysam.AlignedSegment()
    read.query_sequence = genome_source.get_seq("chr1", 30, 50, "-")

    alns = genome_source.align(Alignment(read))
    assert len(alns) == 1
    only_hit = alns[0]
    assert only_hit.cigarstring == "21M"
    assert only_hit.reference_start == 30
    assert only_hit.reference_end == 51
    assert only_hit.is_reverse

    qs = "<<<<<<<:<9/,&,22;;<<<"
    read.query_qualities = pysam.qualitystring_to_array(qs)
    alns = genome_source.align(Alignment(read))

    with warnings.catch_warnings():
        # this is a python 2/3 incompatibility I think, where the warning
        # indicates array.tostring() is deprecated but array.tobytes()
        # only exists in py3
        warnings.simplefilter("ignore")
        realigned_quals = pysam.qualities_to_qualitystring(
            alns[0].query_qualities)
        assert realigned_quals == qs[::-1]
def add_snv(read, pos, base):
    '''Return a copy of `read` carrying `base` at reference position `pos`.

    The substituted base gets the quality character 'A'.  The MD tag is
    rebuilt through `md_list` / `md_from_list`, and NM is incremented when
    `md_list` reports the change as new.
    '''
    offset = pos - read.reference_start

    # Add snv to seq and set high base quality score
    new_qual = read.qual[:offset] + 'A' + read.qual[offset + 1:]
    new_seq = read.seq[:offset] + base + read.seq[offset + 1:]

    # Calculate tag MD
    md_entries, is_new = md_list(read.get_tag('MD'), offset, read)
    new_md = md_from_list(md_entries)

    # Calculate tag NM
    new_nm = read.get_tag('NM')
    if is_new:
        new_nm += 1

    # Create modified read
    modified = ps.AlignedSegment()
    modified.query_name = read.query_name
    modified.query_sequence = new_seq
    modified.flag = read.flag
    modified.reference_id = read.reference_id
    modified.reference_start = read.reference_start
    modified.mapping_quality = read.mapping_quality
    modified.cigar = read.cigar
    modified.next_reference_id = read.next_reference_id
    modified.next_reference_start = read.next_reference_start
    modified.template_length = read.template_length
    modified.query_qualities = ps.qualitystring_to_array(new_qual)
    modified.tags = read.get_tags()
    modified.set_tag('MD', new_md, 'Z')
    modified.set_tag('NM', new_nm, 'i')
    return modified
def make_bam_segment(
    self,
    qname=None,
    flag=0,
    rname=0,
    pos=0,
    mapq=20,
    cigar=None,
    rnext=0,
    pnext=0,
    tlen=0,
    seq=None,
    qual=None,
    tags=None,
    **kwargs,
):
    """
    Return pysam.AlignedSegment object.

    The parameters mirror the 11 mandatory SAM fields (qname, flag, rname,
    pos, mapq, cigar, rnext, pnext, tlen, seq, qual); `tags` fills the
    optional 12th field.

    Sensible defaults are used where possible: a random read name, a flag
    from `self.get_flag_value(**kwargs)` when `flag` is falsy, and a random
    sequence and quality string sized from the CIGAR. When creating the
    segment one should at least set the `cigar` field.
    """
    # CIGAR operations that consume query bases: M, I, S, = and X.
    query_consuming = (0, 1, 4, 7, 8)

    if qname is None:
        qname = "read-{}".format(random.randrange(1000, 9999))

    segment = pysam.AlignedSegment()
    segment.query_name = qname
    segment.flag = flag or self.get_flag_value(**kwargs)
    segment.reference_id = rname
    segment.reference_start = pos
    segment.mapping_quality = mapq
    segment.cigar = cigar
    segment.next_reference_id = rnext
    segment.next_reference_start = pnext
    segment.template_length = tlen

    query_length = sum(
        op_length
        for operation, op_length in segment.cigartuples
        if operation in query_consuming
    )

    if seq is None:
        seq = FastaTestCaseMixin.make_fasta_sequence(size=query_length,
                                                     include_n=False)
    segment.query_sequence = seq

    if qual is None:
        qual = pysam.qualitystring_to_array(
            FastqTestCaseMixin.make_quality_scores(size=query_length))
    segment.query_qualities = qual

    if tags is not None:
        segment.tags = tags.items()

    return segment
def make_aligned_segment(data, rnd_seed=None):
    """
    Return pysam.AlignedSegment() object with the data in `data`.

    A SAM record has 11 mandatory tab-separated fields, followed by an
    optional TAGS field for additional info::

        QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL TAGS

    Only some of them are needed here, so `data` is a tuple with the
    following content:

        (qname, flag, refname, pos, mapq, cigar, tags)

        * qname - query name
        * flag - bitwise flag, detailed description in [1]
        * refname - index of the reference sequence in header
        * pos - starting position of read in reference sequence
        * mapq - mapping quality
        * cigar - CIGAR values, detailed description in [1]
        * tags - dict with additional information

    Flag is a bitwise value. For detailed explanation, see [1], but for our
    needs, we only need values 4 (is_unmapped) and 16 (reverse_strand).

    RNEXT, PNEXT and TLEN are all set to 0, since we are only interested in
    single-end reads. SEQ and QUAL are randomly generated, their size is
    determined by the cigar value.

    Example of input parameter `data`:

        data = ('name123', 20, 0, 30, 20, [(0, 90), (2, 5), (0, 5)], {'NH': 5})

    Read more about SAM file specifications here:
    [1] https://samtools.github.io/hts-specs/SAMv1.pdf

    Parameters
    ----------
    data : tuple
        Input data for AlignedSegment.
    rnd_seed : optional
        Seed passed on to the sequence and quality generators.

    Returns
    -------
    pysam.AlignedSegment
        AlignedSegment, filled with given data.

    """
    # pylint: disable=no-member
    qname, flag, refname, pos, mapq, cigar = data[:6]
    tags = data[6]

    segment = pysam.AlignedSegment()
    segment.query_name = qname
    segment.flag = flag
    segment.reference_id = refname
    segment.reference_start = pos
    segment.mapping_quality = mapq
    segment.cigar = cigar
    segment.next_reference_id = 0
    segment.next_reference_start = 0
    segment.template_length = 0

    # Only M, I, S, = and X operations consume query bases.
    query_length = sum(
        op_length
        for operation, op_length in segment.cigar
        if operation in (0, 1, 4, 7, 8)
    )

    segment.query_sequence = make_sequence(
        size=query_length, include_n=True, rnd_seed=rnd_seed)
    segment.query_qualities = pysam.qualitystring_to_array(
        make_quality_scores(size=query_length, rnd_seed=rnd_seed))
    segment.tags = tags.items()

    return segment
def rescue_reads(tasks, results, parser_result):
    """Worker loop that re-aligns unmapped reads against a local target genome.

    Items are taken from ``tasks`` until a ``None`` sentinel arrives.  Each
    item unpacks as ``(unmapped_info, ref_id, start, is_spliced,
    genome_seq)``.  Spliced items are aligned by rebuilding the configured
    aligner's index on the target sequence and running that aligner; all
    other items are aligned with ``blastn``.

    For every primary mapped alignment found, a 12-tuple describing the
    rescued read (positions shifted by ``start``, qualities trimmed to match
    any hard clips) is put on ``results``.  When index building, the aligner
    or blastn fails, every read of the item is put back on ``results`` as
    ``(name, sequence)`` instead.

    :param tasks: queue-like object providing ``get()`` and ``task_done()``
    :param results: queue-like object providing ``put()``
    :param parser_result: options object; it is modified in place to
        configure the index builder and the aligner
    """
    aligner = parser_result.aligner.lower()
    output_dir = parser_result.output_dir

    def give_up(unmapped_info):
        # Hand every read of a failed item back as (name, sequence).
        for unmapped_name in unmapped_info:
            unmapped_seq = unmapped_info[unmapped_name][0]
            results.put((unmapped_name, unmapped_seq))
        tasks.task_done()

    while True:
        item = tasks.get()
        if item is None:
            tasks.task_done()
            break

        unmapped_info, ref_id, start, is_spliced, genome_seq = item
        rescue_tmp_dir = output_dir + "/rescue_tmp"
        random_prefix = random_string(10)
        temp_dir = "%s/%s_temp" % (rescue_tmp_dir, random_prefix)
        random_output_prefix = "%s/%s" % (rescue_tmp_dir, random_prefix)
        target_sam_file = None
        unmapped_read_file, target_genome_file, star_index_num = \
            make_unmapped_read_target_genome(unmapped_info, ref_id, genome_seq,
                                             random_output_prefix, is_spliced)

        if is_spliced:
            # Rebuilds aligner index with target genome file
            if aligner == "star":
                parser_result.builder_extra_args = "--genomeSAindexNbases {star_index_num} " \
                                                   "--outTmpDir {temp_dir}".format(star_index_num=star_index_num,
                                                                                   temp_dir=temp_dir)
            parser_result.genome_file = target_genome_file
            parser_result.output_dir = rescue_tmp_dir
            parser_result.prefix = random_prefix
            parser_result.quiet = True
            parser_result.threads = 1
            try:
                target_genome_index = build_aligner_index.build_index(
                    parser_result)
            except RuntimeError:
                give_up(unmapped_info)
                continue

            # Aligns unmapped read to target genome
            if aligner == "star":
                parser_result.aligner_extra_args = "--outTmpDir %s" % temp_dir
            else:
                parser_result.aligner_extra_args = None
            parser_result.input = [unmapped_read_file]
            parser_result.genome_index = target_genome_index
            if target_genome_index is not None:
                try:
                    target_sam_file = run_aligner.run_aligner(parser_result)
                except RuntimeError:
                    give_up(unmapped_info)
                    continue
        else:
            target_sam_file = "%s.sam" % random_output_prefix
            command = "blastn -query {unmapped_read} -subject {target_genome} -task megablast -perc_identity {identity} " \
                      "-qcov_hsp_perc {coverage} -outfmt \"17 SQ SR\" -out {sam_output} -parse_deflines". \
                format(unmapped_read=unmapped_read_file,
                       target_genome=target_genome_file,
                       identity=parser_result.blast_identity,
                       coverage=parser_result.blast_query_coverage,
                       sam_output=target_sam_file)
            tool_process = Popen(shlex.split(command), stdout=PIPE, stderr=PIPE)
            tool_out, tool_err = tool_process.communicate()
            if tool_process.returncode != 0 or "[Errno" in tool_err.decode(
                    "utf8").strip():
                give_up(unmapped_info)
                continue

        # target_sam_file stays None when no index was available for the
        # spliced aligner; os.path.exists(None) would raise TypeError.
        if target_sam_file is not None and os.path.exists(
                target_sam_file) and os.path.getsize(target_sam_file) != 0:
            # Checks for target genome results
            with pysam.AlignmentFile(target_sam_file) as f:
                for r in f:
                    if r.is_unmapped or r.is_secondary or r.is_supplementary:
                        continue
                    new_start = start + r.reference_start
                    cigarstring = r.cigarstring

                    # Lengths of a leading / trailing hard clip, if present.
                    leading_clip = re.match(r"(\d+)H", cigarstring)
                    first_bp = int(leading_clip.group(1)) if leading_clip else None
                    trailing_clip = re.search(r"(\d+)H$", cigarstring)
                    last_bp = int(trailing_clip.group(1)) if trailing_clip else None

                    unmapped_qual = unmapped_info[r.query_name][1]
                    if r.is_reverse:
                        new_qualities = pysam.qualitystring_to_array(
                            unmapped_qual[::-1])
                    else:
                        new_qualities = pysam.qualitystring_to_array(
                            unmapped_qual)
                    # Hard-clipped bases are absent from the sequence, so
                    # drop their qualities as well.
                    if first_bp is not None:
                        new_qualities = new_qualities[first_bp:]
                    if last_bp is not None:
                        last_bp = len(new_qualities) - last_bp
                        new_qualities = new_qualities[:last_bp]
                    results.put((r.query_name, r.flag, ref_id, new_start,
                                 r.mapping_quality, cigarstring,
                                 r.next_reference_id, r.next_reference_start,
                                 r.template_length, r.query_sequence,
                                 new_qualities, r.tags))

        # Removes useless files and directories
        os.remove(unmapped_read_file)
        os.remove(target_genome_file)
        if os.path.exists("%s.sam" % random_output_prefix):
            os.remove("%s.sam" % random_output_prefix)
        elif os.path.exists("%s.bam" % random_output_prefix):
            os.remove("%s.bam" % random_output_prefix)
        if is_spliced and aligner == "star":
            star_leftovers = (
                (".Aligned.out.sam", os.remove),
                ("_star", shutil.rmtree),
                ("_temp", shutil.rmtree),
                (".Log.final.out", os.remove),
                (".Log.out", os.remove),
                (".Log.progress.out", os.remove),
                (".SJ.out.tab", os.remove),
            )
            for suffix, remover in star_leftovers:
                leftover = random_output_prefix + suffix
                if os.path.exists(leftover):
                    remover(leftover)
        tasks.task_done()
# Create a new read from scratch and fill in its core fields.
a = pysam.AlignedSegment()
# Assign values to each attribute.
a.query_name = "read_28833_29006_6945"
a.query_sequence = "AGCTTAGCTA"
a.flag = 99
a.reference_id = 0
a.reference_start = 32
a.mapping_quality = 20
# NOTE(review): these CIGAR lengths add up to 35 query bases, but the
# sequence and quality strings here hold only 10 -- confirm the intended
# values before writing this record to a file.
a.cigar = ((0, 10), (2, 1), (0, 25))
a.next_reference_id = 0
a.next_reference_start = 199
a.template_length = 167
a.query_qualities = pysam.qualitystring_to_array("<<<<<<<AAA")
a.tags = (("NM", 1), ("RG", "L1"))

# `bam` is not defined in this snippet; presumably it is a path set earlier
# in the file -- verify.
bamfile = pysam.AlignmentFile(bam, "rb")

# Report the pileup depth for every column of the first region.
for pileupcolumn in bamfile.pileup('chr22', 16958180, 16958190):
    print("\ncoverage at base %s = %s" % (pileupcolumn.pos, pileupcolumn.n))

# Print the base each read contributes at every column of the second region.
for pileupcolumn in bamfile.pileup('chr22', 16958160, 16958170):
    print("\nBases at position %s = " % (pileupcolumn.pos))
    for pileupread in pileupcolumn.pileups:
        if not pileupread.is_del and not pileupread.is_refskip:
            # query position is None if is_del or is_refskip is set.
            print("%s" % (pileupread.alignment.query_sequence[pileupread.query_position]))
# Writing SAM/BAM files -- the header is described like a dictionary.
header = {
    'HD': {'VN': '1.0'},
    'SQ': [{'LN': 1575, 'SN': 'chr1'},
           {'LN': 1584, 'SN': 'chr2'}],
}

# The context manager closes the output file once the read has been written.
with pysam.AlignmentFile("out.bam", "wb", header=header) as outf:
    a = pysam.AlignedSegment()
    a.query_name = "read_28833_29006_6945"
    a.query_sequence = "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"
    a.flag = 99
    a.reference_id = 0
    a.reference_start = 32
    a.mapping_quality = 20
    a.cigar = ((0, 10), (2, 1), (0, 25))
    a.next_reference_id = 0
    a.next_reference_start = 199
    a.template_length = 167
    a.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
    a.tags = (("NM", 1), ("RG", "L1"))
    outf.write(a)

# Using samtools commands (csamtools) from Python -- allows for more
# investigation than the shell.
pysam.sort("-o", "output.bam", "ex1.bam")
# Equivalent shell command (kept as a comment; it is not Python):
#   samtools sort -o output.bam ex1.bam

# Pysam works with Tabix indexed files (BED-user defined/GFF/GTF).
# Tabix is a generic indexer for TAB-delimited genome position files and gives
# random access to compressed files.
tbx = pysam.TabixFile("example.bed.gz")  # tbx object of class TabixFile
# Other parsers: pysam.asGTF() or pysam.asTuple().
for row in tbx.fetch("chr11", 100, 200, parser=pysam.asBed()):
    # If the BED or GTF parser is used, fields are accessible by name.
    print("name is", row.name)
def _append_cigar_op(cigar_tuples, operation, length):
    """Append ``(operation, length)`` to *cigar_tuples* unless *length* is zero."""
    if length > 0:
        cigar_tuples.append((operation, length))


def call_consensus(family_bam: str,
                   new_read_name: str = None,
                   temp_sorted_filename: str = None,
                   max_depth: int = 10000,
                   calling_method: str = 'posterior') -> pysam.AlignedSegment:
    """
    call a consensus read from a read family file

    The family file is sorted and indexed into ``temp_sorted_filename``, then
    piled up column by column. Every column is passed to ``call_base``; a
    called base extends the current match segment, while an uncallable column
    or a jump in pileup position extends a reference-skip segment. The new
    read takes its reference id and tags from the first read of the sorted
    file.

    :param family_bam: name of file containing the family reads
    :param new_read_name: name of new read (required despite the default)
    :param temp_sorted_filename: name of temporary file in which to store
        the sorted family reads (required despite the default)
    :param max_depth: max depth parameter for pileup
    :param calling_method: method for calling individual bases
    :return: consensus read
    :raises AssertionError: if ``temp_sorted_filename`` or ``new_read_name``
        is None
    :raises StopIteration: if the sorted family file contains no reads
    """
    assert temp_sorted_filename is not None, "temp_sorted_filename is required"
    assert new_read_name is not None, "new_read_name is required"

    # sort and index the family file
    pysam.sort(family_bam, "-o", temp_sorted_filename)
    pysam.index(temp_sorted_filename)

    # The first read of the sorted file donates reference id and tags.
    with pysam.AlignmentFile(temp_sorted_filename, "rb") as family_file:
        first_read = next(family_file)
        reference_id = first_read.reference_id
        tags = first_read.get_tags(with_value_type=True)

    new_read_sequence_list = []
    new_read_quality_list = []
    new_read_cigar_tuple_list = []

    # Run-length state of the CIGAR segment currently being built.
    cigar_last = BAM_CMATCH
    cigar_last_count = 0

    last_pileup_position = None
    first_pileup_position = None

    with pysam.AlignmentFile(temp_sorted_filename, "rb") as family_file:
        for pileup_column in family_file.pileup(stepper="nofilter",
                                                max_depth=max_depth,
                                                min_base_quality=0):
            pos = pileup_column.pos
            if first_pileup_position is None:
                first_pileup_position = pos
                # Pretend the previous column was adjacent so that the first
                # column never registers as a gap.
                last_pileup_position = pos - 1

            position_delta = pos - last_pileup_position
            if position_delta > 1:
                # We have a gap
                if cigar_last == BAM_CREF_SKIP:
                    # If we are already in a gap extend it
                    cigar_last_count += position_delta - 1
                else:
                    # If we are not in a gap close the previous segment and
                    # start a new one
                    _append_cigar_op(new_read_cigar_tuple_list,
                                     cigar_last, cigar_last_count)
                    cigar_last = BAM_CREF_SKIP
                    cigar_last_count = position_delta - 1

            query_sequences = pileup_column.get_query_sequences()
            query_qualities = pileup_column.get_query_qualities()

            called_base, called_quality, called_cigar = call_base(
                query_sequences=query_sequences,
                query_qualities=query_qualities,
                calling_method=calling_method)

            if called_cigar == BAM_CREF_SKIP:
                # No base could be called and we have a single skip
                if cigar_last == BAM_CREF_SKIP:
                    # If we are already in a skip extend it
                    cigar_last_count += 1
                else:
                    # otherwise close previous segment and start a skip.
                    # The helper drops the still-empty initial match segment,
                    # which would otherwise yield a zero-length operation
                    # when the very first column cannot be called.
                    _append_cigar_op(new_read_cigar_tuple_list,
                                     cigar_last, cigar_last_count)
                    cigar_last = BAM_CREF_SKIP
                    cigar_last_count = 1
            else:
                # we have a base call
                new_read_sequence_list.append(called_base)
                new_read_quality_list.append(called_quality)
                if cigar_last == BAM_CMATCH:
                    cigar_last_count += 1
                else:
                    _append_cigar_op(new_read_cigar_tuple_list,
                                     cigar_last, cigar_last_count)
                    cigar_last = BAM_CMATCH
                    cigar_last_count = 1

            # update the last position for which we got a pileup
            last_pileup_position = pos

    # append the final cigar tuple
    _append_cigar_op(new_read_cigar_tuple_list, cigar_last, cigar_last_count)

    # Construct new AlignedSegment
    quality_string = ''.join(new_read_quality_list)
    quality_array = pysam.qualitystring_to_array(quality_string)

    new_read = pysam.AlignedSegment()
    new_read.query_name = new_read_name
    new_read.query_sequence = ''.join(new_read_sequence_list)
    new_read.flag = 0
    new_read.reference_id = reference_id
    new_read.reference_start = first_pileup_position
    new_read.mapping_quality = 255
    new_read.cigartuples = new_read_cigar_tuple_list
    new_read.query_qualities = quality_array
    new_read.tags = tags

    return new_read
if __name__ == "__main__":
    # Toy reference string and offset; the statements below do not read them.
    genome = "G" * 100
    genome_start = 3

    # (query name, repeated base, reference start) for the two demo reads.
    demo_specs = (("read1", "A", 10), ("read2", "T", 15))

    demo_reads = []
    for read_name, base, ref_start in demo_specs:
        qseq = base * 20
        segment = pysam.AlignedSegment()
        segment.query_name = read_name
        segment.query_sequence = qseq
        segment.flag = 0
        segment.reference_id = 0
        segment.reference_start = ref_start
        segment.mapping_quality = 20
        # A single match operation spanning the whole query.
        segment.cigarstring = "%dM" % len(qseq)
        segment.query_qualities = pysam.qualitystring_to_array("<" * len(qseq))
        segment.tags = (("NM", 1), ("RG", "L1"))
        demo_reads.append(segment)

    a, b = demo_reads

    print("readA: " + str(a))
def consensusMaker(groupedReadsList, cutoff, readLength, flag):
    '''Build a consensus read for one group of reads.

    Each entry of groupedReadsList is indexed as (sequence, quality string).

    * One read: it is returned unchanged.
    * Two reads: at every position the base with the higher quality wins
      (ties go to the first read) and keeps its own quality character.
    * Three or more reads: a simple "majority rules" vote is held per
      position. The first of T, C, G, A whose fraction exceeds cutoff is
      used; if none does, the position is undefined and an 'N' is placed
      there. Every consensus quality is 'J'.

    Disagreements with the consensus are counted in three position windows
    (<= 100, <= 200, beyond) plus a total. For flag 83 or 147 the position
    is measured from the other end of the read (readLength - i).

    Returns (consensusRead, consensusReadQ, groupSize, [errRate, groupSize])
    where errRate holds four percentages: the three windows and the total.
    A window whose denominator is not positive (e.g. readLength <= 200 for
    the last window, or an empty group) reports 0.0 instead of raising
    ZeroDivisionError.
    '''
    nucKeyDict = {0: 'T', 1: 'C', 2: 'G', 3: 'A', 4: 'N'}
    # Column of the count vector for each base; anything else is counted as N.
    nucColumn = {'T': 0, 'C': 1, 'G': 2, 'A': 3}
    dif = [0, 0, 0, 0]  # dif for 0-100 100-200 200-end total
    l1 = 100
    l2 = 200
    l3 = readLength - l2
    groupLen = len(groupedReadsList)

    def windowOf(i):
        # Reads with flag 83/147 are measured from the opposite end.
        m = readLength - i if flag in (83, 147) else i
        if m <= l1:
            return 0
        if m <= l2:
            return 1
        return 2

    def percent(count, positions):
        denominator = positions * groupLen
        if denominator <= 0:
            return 0.0
        return 100 * float(count) / float(denominator)

    if groupLen == 1:
        # if only one read, return itself
        consensusRead = groupedReadsList[0][0]
        consensusReadQ = groupedReadsList[0][1]
    elif groupLen == 2:
        # if two reads, return each site with the higher quality score
        seq1, qual1 = groupedReadsList[0][0], groupedReadsList[0][1]
        seq2, qual2 = groupedReadsList[1][0], groupedReadsList[1][1]
        qArray1 = pysam.qualitystring_to_array(qual1)
        qArray2 = pysam.qualitystring_to_array(qual2)
        bases = []
        quals = []
        for i in range(len(qArray1)):
            if qArray1[i] >= qArray2[i]:
                bases.append(seq1[i])
                quals.append(qual1[i])
            else:
                bases.append(seq2[i])
                quals.append(qual2[i])
            if seq1[i] != seq2[i]:
                dif[windowOf(i)] += 1
                dif[3] += 1
        consensusRead = ''.join(bases)
        consensusReadQ = ''.join(quals)
    else:
        bases = []
        for i in range(readLength):
            # In the order of T, C, G, A, N, Total
            nucIdentityList = [0, 0, 0, 0, 0, 0]
            for read in groupedReadsList:
                try:
                    base = read[0][i]
                except IndexError:
                    # A read shorter than i ends the count for this
                    # position (reads after it are not counted either).
                    break
                nucIdentityList[nucColumn.get(base, 4)] += 1
                nucIdentityList[5] += 1

            total = nucIdentityList[5]
            major = 4
            if total:
                for j in (0, 1, 2, 3):
                    if float(nucIdentityList[j]) / float(total) > cutoff:
                        major = j
                        break
            bases.append(nucKeyDict[major])

            # difference for each point in reads
            mismatches = total - nucIdentityList[major]
            dif[windowOf(i)] += mismatches
            dif[3] += mismatches
        consensusRead = ''.join(bases)
        consensusReadQ = "J" * len(consensusRead)

    # NOTE(review): the second window is divided by l2 (200) although it
    # spans positions 101-200 -- confirm whether l2 - l1 was intended.
    errRate = [
        percent(dif[0], l1),
        percent(dif[1], l2),
        percent(dif[2], l3),
        percent(dif[3], readLength),
    ]
    return consensusRead, consensusReadQ, groupLen, [errRate, groupLen]
def create_bam(sample, files_in1, files_in2, ref_fasta, probes_dict, output, has_trimmed_primers=True, debug=False):
    """
    Create a BAM file with reads placed at their expected locations, adjusted through pairwise alignment to the target sequences.
    This will give reasonable results as long as probes capture the exact target sequences, but will generate alignments with
    many mismatches if there are any discrepancies.

    :param sample: sample name; used as read group ID/SM in the header and as the RG tag of every read
    :param files_in1: list of read-1 FASTQ file names (gzip is used when a read-1 name ends in '.gz')
    :param files_in2: list of read-2 FASTQ file names, same length and order as files_in1
    :param ref_fasta: reference FASTA file, opened through pyfaidx without rebuilding its index
    :param probes_dict: nested mapping indexed as probes_dict[column][probe]; the columns read here are
        'chr', 'strand', 'target_start_0', 'target_end', 'target_length', 'probe_start_0', 'probe_end'
        and 'capture_size'
    :param output: name of the BAM file to write
    :param has_trimmed_primers: if True place reads on the target region, otherwise on the whole probe region
    :param debug: if True print intermediate values and stop early
    :raises Exception: if a probe's chromosome is missing from the reference index or its strand is not '+'/'-'
    :raises AssertionError: if one of the sanity checks on file lists, read names or target length fails
    """
    assert len(files_in1) == len(files_in2)

    tStart = time.time()
    counters = collections.Counter()

    ref_idx = pyfaidx.Faidx(ref_fasta, rebuild=False)

    # Header: one SQ entry per record of the reference index, a single read
    # group named after the sample, and a PG entry for this program.
    bam_header = {
        'HD': {
            'VN': '1.0'
        },
        'SQ': [{
            'LN': record.rlen,
            'SN': name
        } for name, record in ref_idx.index.items()],
        'RG': [{
            'ID': sample,
            'SM': sample
        }],
        'PG': [{
            'ID': __title__,
            'PN': __title__,
            'VN': __version__
        }],
    }
    # Chromosome name -> position in the index, i.e. the same order as the
    # SQ list above; used below as reference_id.
    chr_indices = {
        chrom: index
        for index, chrom in enumerate(ref_idx.index.keys())
    }

    with pysam.AlignmentFile(output, "wb", header=bam_header) as pairedreads:
        for ixfile in range(len(files_in1)):
            file1 = files_in1[ixfile]
            file2 = files_in2[ixfile]
            log.info('Processing %s and %s (#%d)', file1, file2, ixfile)

            counters['files'] += 1
            # The opener is chosen from file1's extension and used for both files.
            opener = gzip.open if file1.endswith('.gz') else open
            with opener(file1, 'rt') as hdl1, opener(file2, 'rt') as hdl2:
                # zip() stops at the shorter input, so trailing records
                # without a partner in the other file are not processed.
                for read_pair in zip(
                        Bio.SeqIO.QualityIO.FastqGeneralIterator(hdl1),
                        Bio.SeqIO.QualityIO.FastqGeneralIterator(hdl2)):
                    counters['pairs_total'] += 1
                    if counters['pairs_total'] % 50000 == 0:
                        log.info(
                            "processed %d pairs - %.f sec elapsed, %.4f sec/pair, %.1f pairs/sec",
                            counters['pairs_total'],
                            time.time() - tStart,
                            (time.time() - tStart) / counters['pairs_total'],
                            counters['pairs_total'] / (time.time() - tStart))

                    if debug and counters['pairs_total'] > 10:
                        print('DEBUG - stopping after ', counters['pairs_total'])
                        break

                    # extract and parse read name
                    # (the FASTQ title is tab-separated: the name first, custom tags after it)
                    read_names_original = [
                        read_pair[0][0].split('\t')[0],
                        read_pair[1][0].split('\t')[0],
                    ]
                    assert len(read_names_original[0]) > 0
                    assert read_names_original[0] == read_names_original[1]
                    read_name, read_probe, read_umi = parse_extended_read_name(
                        read_names_original[0])

                    probe_chr = probes_dict['chr'][read_probe]
                    if not probe_chr in chr_indices:
                        raise Exception(
                            'Probe {} is associated with chromosome {}, but this entry does not exist in the reference fasta file!'
                            .format(read_probe, probe_chr))
                    probe_chr_index = chr_indices[probe_chr]

                    read_lens = [
                        len(read_pair[read_number][1])
                        for read_number in range(2)
                    ]

                    # untested: if we haven't trimmed off the primers then we need to start aligning from the primer start location!
                    if has_trimmed_primers:
                        probe_start = int(
                            probes_dict['target_start_0'][read_probe] + 1)
                        probe_end = int(probes_dict['target_end'][read_probe])
                    else:
                        probe_start = int(
                            probes_dict['probe_start_0'][read_probe] + 1)
                        probe_end = int(probes_dict['probe_end'][read_probe])

                    # Initial placement: the forward read starts at probe_start
                    # and the reverse read is positioned so that it ends at
                    # probe_end. Which of the two reads is the reverse one
                    # depends on the probe strand.
                    if probes_dict['strand'][read_probe] == '+':
                        read_starts = [
                            probe_start, probe_end - read_lens[1] + 1
                        ]
                        read_reverse = [False, True]
                    elif probes_dict['strand'][read_probe] == '-':
                        read_starts = [
                            probe_end - read_lens[0] + 1, probe_start
                        ]
                        read_reverse = [True, False]
                    else:
                        raise Exception(
                            'Unexpected strand for probe {}'.format(
                                read_probe))

                    # NOTE: this SHOULD BE one-based based on documentation
                    # but actually seem to be ZERO-based -- at least the sequence we get for PRRX1-Ex1
                    # starts CGGA but should start GGA; ends TTC but should end TTCT if we just use probe_start and probe_end
                    # they are always in genomic sense
                    probe_target_sequence = str(
                        ref_idx.fetch(probe_chr, probe_start,
                                      probe_end)).upper()

                    # sanity check that we got the right sequence
                    if has_trimmed_primers:
                        assert len(probe_target_sequence) == probes_dict[
                            'target_length'][read_probe]
                    else:
                        assert len(probe_target_sequence
                                   ) == probes_dict['capture_size'][read_probe]

                    if debug:
                        print(read_name, read_probe, read_umi)
                        print(probe_chr, probe_chr_index, probe_start, probe_end)
                        print(read_starts)
                        print(read_reverse)
                        print(probe_target_sequence)
                        print(read_pair)

                    try:
                        # pre-process alignments to make sure the mate starts are actually correct
                        # (both reads are aligned first so that each record
                        # can carry its mate's final start position)
                        read_cigars = []
                        read_sequences = []
                        read_tags_for_pysam = []
                        for read_number in range(2):
                            # copy over our custom tags from FASTQ file
                            # (each tag is split on ':' into name, type, value)
                            read_tags = [
                                tag.split(':') for tag in
                                read_pair[read_number][0].split('\t')[1:]
                            ]
                            read_tags_for_pysam.append(
                                [("RG", sample, "Z")] +
                                [(tag_name, int(tag_value)
                                  if tag_type == 'i' else tag_value, tag_type)
                                 for tag_name, tag_type, tag_value in read_tags])
                            if debug:
                                print(read_tags)
                                print(read_tags_for_pysam[read_number])

                            # figure out sequence
                            # (reverse reads are reverse-complemented)
                            read_sequence = str(
                                Bio.Seq.Seq(read_pair[read_number][1]).
                                reverse_complement()) if read_reverse[
                                    read_number] else read_pair[read_number][1]
                            read_sequences.append(read_sequence)

                            # align read to target sequence -- note both of these are in genomic sense!
                            try:
                                cigar_read_start_offset, cigartuples = align_and_find_cigar(
                                    read_sequence, probe_target_sequence)
                                # store the alignment offset as custom integer tag "so"
                                read_tags_for_pysam[read_number].append(
                                    ("so", cigar_read_start_offset, 'i'))

                                # remember cigar
                                read_cigars.append(cigartuples)
                                # adjust start -- need to use zero-based coords here but probe_start is 1-based
                                read_starts[
                                    read_number] = probe_start - 1 + cigar_read_start_offset
                            except AssertionError:
                                # Run the alignment once more with debug=True,
                                # then re-raise the original assertion error.
                                cigar_read_start_offset, cigartuples = align_and_find_cigar(
                                    read_sequence,
                                    probe_target_sequence,
                                    debug=True)
                                raise

                        if debug:
                            print(read_cigars)
                            print(read_starts)

                        for read_number in range(2):
                            # create aligned segment
                            a = pysam.AlignedSegment()
                            a.mapping_quality = 255 #always best quality
                            a.query_name = read_names_original[0]
                            a.query_sequence = read_sequences[read_number]
                            # qualities are reversed for reverse reads to
                            # match the reverse-complemented sequence
                            a.query_qualities = pysam.qualitystring_to_array(
                                read_pair[read_number][2][::-1]
                                if read_reverse[read_number] else
                                read_pair[read_number][2])
                            a.set_tags(read_tags_for_pysam[read_number])
                            a.cigartuples = read_cigars[read_number]
                            a.reference_id = probe_chr_index
                            a.reference_start = read_starts[read_number]
                            # the mate is the other read of the pair, on the same chromosome
                            a.next_reference_id = probe_chr_index
                            a.next_reference_start = read_starts[1 - read_number]
                            # a.template_length = read_lens[read_number]
                            a.is_paired = True
                            a.is_proper_pair = True
                            a.is_read1 = read_number == 0
                            a.is_read2 = read_number == 1
                            a.is_reverse = read_reverse[read_number]
                            a.mate_is_reverse = read_reverse[1 - read_number]

                            if debug:
                                print(a)

                            pairedreads.write(a)

                        # NOTE(review): nesting reconstructed from flattened
                        # source -- confirm this break leaves the read-pair
                        # loop after the first written pair, not the
                        # per-read loop above.
                        if debug:
                            break
                    # normally we always get an alignment, but apparently sometimes we don't?
                    except AmplimapNoAlignment:
                        counters['no_alignment'] += 1
                        pass

    log.info('%s done - %d pairs in total, %d without alignment', sample, counters['pairs_total'], counters['no_alignment'])
    log.info("BAM file created: %s", output)