def check_kmer_freq_in_read_pair(read1, read2, minkmer, debugstream=None):
    """
    Check interesting k-mer frequency in each read.

    When calculating offset between a pair of reads, do not use any
    interesting k-mers that occur multiple times in either read.

    Each read's `ikmers` collection is filtered down to the entries whose
    `sequence` matches `minkmer` or `kevlar.revcom(minkmer)` according to
    `kevlar.same_seq`.

    Arguments:
      - read1, read2: read records exposing `ikmers` and `name` attributes
      - minkmer: sequence of the interesting k-mer to look for
      - debugstream: optional file-like object; if truthy, a message is
        printed to it whenever the pair is rejected

    Returns a tuple `(kmer1, kmer2)` holding the single matching entry from
    each read, or `(None, None)` if the k-mer matches more than once in
    either read.

    Raises AssertionError if either read contains no matching k-mer.
    """
    maxkmer = kevlar.revcom(minkmer)
    matches1 = [
        k for k in read1.ikmers
        if kevlar.same_seq(k.sequence, minkmer, maxkmer)
    ]
    matches2 = [
        k for k in read2.ikmers
        if kevlar.same_seq(k.sequence, minkmer, maxkmer)
    ]
    nmatches1 = len(matches1)
    nmatches2 = len(matches2)

    # Both reads must contain the k-mer at least once; check each count so
    # that a read with no match fails here rather than with an IndexError
    # when the first match is extracted below.
    assert nmatches1 > 0 and nmatches2 > 0, (nmatches1, nmatches2)

    if nmatches1 > 1 or nmatches2 > 1:
        # The k-mer is ambiguous in at least one read, so no offset can be
        # derived from it.
        if debugstream:
            message = (
                'stubbornly refusing to calculate offset bewteen {:s} and '
                '{:s}; interesting k-mer {:s} occurs multiple times'.format(
                    read1.name, read2.name, minkmer))
            print('[kevlar::overlap] INFO', message, file=debugstream)
        return None, None

    # Exactly one match per read at this point.
    return matches1[0], matches2[0]
def test_allocate_sketch_graphy(count, smallcount):
    """
    Allocate a graph-style sketch and check k-mer lookup and hashing.

    A sketch is allocated with the given `count` / `smallcount` settings,
    a short sequence is consumed, and the test then verifies that
      - a k-mer occurring once in the sequence is reported with value 1
      - hashing that k-mer and reversing the hash gives back a sequence
        that `kevlar.same_seq` considers equal to the original k-mer
    """
    sequence = 'AATCAACGCTTCTTAATAGGCATAGTGTCTCTGCTGCGCATGGACGTGCCATAGCCACTACT'
    kmer = 'GCATAGTGTCTCTGCTGCGCA'  # substring of `sequence`, occurs once

    sketch = kevlar.sketch.allocate(21, 1e4, 4, count, True, smallcount)
    sketch.consume(sequence)

    # This comparison was previously a bare expression whose result was
    # discarded; assert it so the check is actually enforced.
    assert sketch.get(kmer) == 1

    kmer_hash = sketch.hash(kmer)
    assert kevlar.same_seq(sketch.reverse_hash(kmer_hash), kmer)
def test_pico(picorecord1, picorecord2, picorecord3):
    """
    Pair picorecord1 with picorecord2 and with picorecord3 via ReadPair.

    Both pairings are expected to report an offset of 59 and the same head
    read, and their merged sequences must be equivalent according to
    `kevlar.same_seq`.
    """
    shared_kmer = 'TTTTTTGTTTCCCAAAGTAAGGCTG'
    expected_head = 'seq1_901428_901847_3:0:0_0:0:0_87d/1'

    pairs = []
    for mate in (picorecord2, picorecord3):
        readpair = ReadPair(picorecord1, mate, shared_kmer)
        assert readpair.offset == 59
        assert readpair.head.read.name == expected_head
        print(readpair.mergedseq)
        pairs.append(readpair)

    first, second = pairs
    assert kevlar.same_seq(first.mergedseq, second.mergedseq)
def test_pico_offset(picorecord1, picorecord2, picorecord3):
    """
    Compute offsets for picorecord1 against picorecord2 and picorecord3.

    Both calls to `kevlar.overlap.calc_offset` are expected to give an
    offset of 59 and the same head read name; `swapped` is expected to be
    False for the first pairing and True for the second. The contigs
    produced by `kevlar.assemble.merge_pair` must be equivalent according
    to `kevlar.same_seq`.
    """
    shared_kmer = 'TTTTTTGTTTCCCAAAGTAAGGCTG'
    expected_head = 'seq1_901350_901788_1:0:0_0:0:0_21ca1/2'

    # (second read of the pair, expected value of `swapped`)
    scenarios = (
        (picorecord2, False),
        (picorecord3, True),
    )

    contigs = []
    for mate, expect_swapped in scenarios:
        pair = kevlar.overlap.calc_offset(picorecord1, mate, shared_kmer)
        assert pair.offset == 59
        assert pair.head.name == expected_head
        assert pair.swapped is expect_swapped
        merged = kevlar.assemble.merge_pair(pair)
        print(merged)
        contigs.append(merged)

    contig, newcontig = contigs
    assert kevlar.same_seq(contig, newcontig)
def test_pico_contains(picorecord3, picorecord4):
    """
    Merge picorecord3 with picorecord4 via ReadPair.

    The merged sequence must be equivalent, according to
    `kevlar.same_seq`, to picorecord3's own sequence (presumably because
    picorecord4 lies entirely within picorecord3 -- confirm against the
    fixtures).
    """
    shared_kmer = 'CACTCAGCCTTACTTTGGGAAACAA'
    readpair = ReadPair(picorecord3, picorecord4, shared_kmer)
    print(readpair.mergedseq)
    assert kevlar.same_seq(
        readpair.mergedseq,
        picorecord3.sequence,
    )