def coverage_pbp(file, reference_length=None) -> np.ndarray:
    """
    Reads the SAM file and computes the per-base coverage
    from the aligned blocks of the reads.

    In particular: if a position in the reference is
    consistently a 'delete' compared to the matched reads,
    it will be counted as zero.

    The `reference_length` can be inferred from the mapped reads
    but there can be an undetected residual at the 'right' end
    if it is not specified. If given, it is used as the initial
    length of the result; the result is still extended when
    an aligned block reaches beyond it.

    Only the CIGAR operations 'M' and '=' contribute to the counts.
    The position along the reference is advanced only by the
    operations that consume the reference (M, D, N, =, X);
    the operations I, S, H, P consume no reference bases,
    so they leave the position unchanged.

    Returns an integer array where entry `i` is the number of
    aligned blocks covering index `i`, with `read.pos` used as is
    for the index of the first aligned base.
    """

    from humdum.io import from_sam

    def zeros(n):
        return np.zeros(n, dtype=int)

    counts = zeros(reference_length or 0)

    for read in from_sam(file):
        # NOTE(review): `read.pos` is used directly as an array index,
        # i.e. it is assumed to be 0-based -- confirm against `from_sam`.
        a = read.pos

        for (n, A) in re.findall(r"([0-9]+)([MIDNSHPX=])", read.cigar):
            length = int(n)
            assert (length > 0), "Only expect positive numbers in CIGAR."

            if A not in "MDNX=":
                # I, S, H, P: present in the read (or padding) only,
                # the reference position must not move
                continue

            b = a + length

            # NOTE(review): 'X' (mismatch) is aligned but not counted,
            # as in the original implementation -- confirm this is intended.
            if A in "=M":
                if b > len(counts):
                    # Grow the array on demand to fit this block
                    counts = np.concatenate([counts, zeros(b - len(counts))])
                counts[a:b] += 1

            a = b

    return counts
def test_from_sam(self):
    """
    The last record yielded by `from_sam` for the test file
    must carry the expected read sequence.
    """
    from humdum.io import from_sam

    expected_seq = "CACCATCCAGAACAGTGCCTCTTGCAGAGTCTCCTTGGGAAACTTACCAAGTCTGATGGTAGCAGGGGCATGGGACCATCCTAACTGGGAAGACAAAAAGGCTGAGACCTTCCCAGAGTCACCTT"

    # Materialize all records, then look at the final one
    records = list(from_sam(file))
    final_record = records[-1]

    self.assertEqual(final_record.seq, expected_seq)
def tlen_hist(file, max_tlen=10000):
    """
    Histogram of the `tlen` field over the reads of a SAM file.

    Expects a SAM file `file`.
    Only counts reads with 0 <= tlen <= `max_tlen`
    (negative and larger values are ignored).

    Returns a structure containing the fields
        length
        counts
    where counts[i] is the number of reads with tlen equal to length[i].

    The entries appear in the order in which each tlen value
    is first encountered in the file; they are not sorted.
    If no read qualifies, both fields are empty arrays.
    """

    from humdum.io import from_sam
    from collections import Counter

    histogram = Counter(
        read.tlen
        for read in from_sam(file)
        if (0 <= read.tlen <= max_tlen)
    )

    if histogram:
        # Rows of the transposed table: [0] = lengths, [1] = counts
        tlens_counts = numpy.asarray(list(histogram.items())).T
    else:
        # asarray([]) would be 1-d, and indexing rows [0], [1]
        # below would raise IndexError -- use an explicit 2 x 0 table
        tlens_counts = numpy.empty((2, 0), dtype=int)

    class _:
        length = tlens_counts[0]
        counts = tlens_counts[1]

    return _
def test_sw_on_data_small(self, verbose=0):
    """
    Align the first few reads of the small data set with SmithWaterman
    and require each produced CIGAR to equal the CIGAR in the SAM file.

    A truthy `verbose` additionally prints the alignments.
    """
    fa = Path(__file__).parent / "data_for_tests/data_small/genome.chr22.5K.fa"
    fasta_records = list(from_fasta(fa))
    reference = str(unlist1(fasta_records).seq)

    sam_candidates = list((Path(__file__).parent / "data_for_tests/data_small/").glob("*.sam"))
    in_file = sam_candidates[-1]

    max_reads = 2

    # Pairing with range(...) limits the number of reads examined
    for (read, __) in zip(from_sam(in_file), range(max_reads)):
        read: Read

        query_seq = read.seq
        aligner = SmithWaterman()

        for alignment in aligner(ref=reference, query=query_seq):
            if verbose:
                print(alignment.cigar, ' vs ', read.cigar)
                print(read.mapq, ' vs ', alignment.score)
                for row in alignment.visualize(ref=reference, query=query_seq):
                    print(row)
                print(alignment.matching_subsegments(), ' vs ', read.cigar)

            mismatch_msg = f'{alignment.cigar} is not equal to cigar from sam file {read.cigar}'
            self.assertEqual(alignment.cigar, read.cigar, mismatch_msg)
def test_against_pysam(self):
    """
    Read the test file with both `from_sam` and `from_sam_pysam`
    and compare the CIGAR strings record by record.

    The zero-based index of the last compared pair must be 1169.
    """
    from humdum.io import from_sam, AlignedSegment
    from humdum.io import from_sam_pysam
    import pysam

    record_pairs = zip(from_sam(file), from_sam_pysam(file))

    for (n, (have, want)) in enumerate(record_pairs):
        # Each reader must produce its own record type
        self.assertIsInstance(have, AlignedSegment)
        self.assertIsInstance(want, pysam.AlignedSegment)

        self.assertEqual(have.cigar, want.cigarstring)

    # `n` is the index left over from the final iteration
    self.assertEqual(n, 1169)
def test_on_data_small(self):
    """
    Align the small paired-end data set with AllTheKingsHorses and
    compare the records with those of the reference SAM file, in order.

    The strand and secondary-alignment flags are asserted;
    CIGAR, position and tlen discrepancies are only printed.
    """
    fastq_files = sorted(source_path.glob("*.fq"))
    (read_file1, read_file2) = fastq_files
    genome_file = unlist1(source_path.glob("genome*.fa"))

    sam = AllTheKingsHorses.from_files(fa=genome_file, fq1=read_file1, fq2=read_file2)

    mine: AlignedSegment
    theirs: AlignedSegment

    computed = sam.alignments
    expected = from_sam(unlist1(source_path.glob("*.sam")))

    for (mine, theirs) in zip(computed, expected):
        # See io/sam.py for the explanations
        on_minus_strand = bool(theirs.flag.value & 16)
        self.assertEqual(mine.flag.is_minus_strand, on_minus_strand)
        is_secondary = bool(theirs.flag.value & 256)
        self.assertEqual(mine.flag.is_secondary_alignment, is_secondary)

        same_cigar = (mine.cigar == theirs.cigar)
        same_pos = (mine.pos == theirs.pos)
        same_tlen = (mine.tlen == theirs.tlen)

        if not (same_cigar and same_pos):
            print(F"Read {mine.qname} does not match.")
            print(F"Mine: ", mine.cigar, "at", mine.pos)
            print(F"Theirs:", theirs.cigar, "at", theirs.pos)
            print(F"Read: ", mine.seq)
        else:
            print(F"Read {mine.qname} looks good.")

        if same_tlen:
            continue

        print(F"tlen mismatch: {mine.tlen} (mine) vs {theirs.tlen} (theirs)")