def make_call_from_reads(queue, idx, calls, refrfile, ksize=31, delta=50,
                         seedsize=31, maxdiff=None, match=1, mismatch=2,
                         gapopen=5, gapextend=0, min_ikmers=None,
                         refrseqs=None, logstream=sys.stderr):
    """Consume read partitions from a queue and append variant calls.

    Loops forever (there is no return or break): each iteration takes one
    partition of reads off `queue`, assembles it into contigs, localizes the
    contigs against the reference, aligns contigs to the resulting targets,
    and appends every variant call to `calls`. When the queue is empty the
    loop sleeps 3 seconds and polls again.

    Parameters:
        queue: queue of read partitions; must provide `empty`, `get`,
            `qsize` and `task_done`.
        idx: integer worker index, used only in the progress message.
        calls: container with an `append` method that collects the calls.
        refrfile: reference passed through to `localize` and `call`.
        ksize: passed through to `call`.
        delta, maxdiff, refrseqs: passed through to `localize`.
        seedsize: passed to `localize` as its third positional argument.
        match, mismatch, gapopen, gapextend: passed through to `call`.
        min_ikmers: if not None, contigs with fewer than this many
            `annotations` are discarded before localization.
        logstream: stream receiving the progress message; also forwarded
            to the assembler and the localizer.
    """
    while True:
        if queue.empty():
            sleep(3)
            continue
        reads = queue.get()

        # The partition label, if any, is parsed from the first read's name.
        ccmatch = re.search(r'kvcc=(\d+)', reads[0].name)
        cc = ccmatch.group(1) if ccmatch else None
        message = '[kevlar::alac::make_call_from_reads'
        message += ' (thread={:d})]'.format(idx)
        message += ' grabbed partition={} from queue,'.format(cc)
        message += ' queue size now {:d}'.format(queue.qsize())
        # Honor the caller-supplied stream rather than hard-coding stderr.
        print(message, file=logstream)

        # Assemble partitioned reads into contig(s)
        contigs = list(assemble_fml_asm(reads, logstream=logstream))
        if min_ikmers is not None:
            # Apply min ikmer filter if it's set
            contigs = [c for c in contigs if len(c.annotations) >= min_ikmers]
        if not contigs:
            queue.task_done()
            continue

        # Identify the genomic region(s) associated with each contig
        localizer = localize(contigs, refrfile, seedsize, delta=delta,
                             maxdiff=maxdiff, refrseqs=refrseqs,
                             logstream=logstream)
        targets = list(localizer)
        if not targets:
            queue.task_done()
            continue

        # Align contigs to genomic targets to make variant calls
        caller = call(targets, contigs, match, mismatch, gapopen, gapextend,
                      ksize, refrfile)
        for varcall in caller:
            if cc:
                varcall.annotate('PART', cc)
            calls.append(varcall)
        queue.task_done()
def test_call_pico_indel(ccid, varcall):
    """Check the single call made from the 'pico' fixture pair for `ccid`.

    Contigs come from the augmented Fastx file and targets from the gDNA
    Fasta file (read with `khmer.ReadParser`); exactly one call is expected
    and its string form must equal `varcall`.
    """
    prefix = 'pico' + ccid
    contig_path = data_file(prefix + '.contig.augfasta')
    gdna_path = data_file(prefix + '.gdna.fa')

    contig_stream = kevlar.parse_augmented_fastx(kevlar.open(contig_path, 'r'))
    queryseqs = list(contig_stream)
    targetseqs = list(khmer.ReadParser(gdna_path))

    calls = list(call(targetseqs, queryseqs))
    assert len(calls) == 1
    assert str(calls[0]) == varcall
def test_call_pico_indel(ccid, varcall):
    """Check the single call made from the 'pico' fixture pair for `ccid`.

    Contigs come from the augmented Fastx file and targets from the gDNA
    file loaded as reference cutouts; exactly one call is expected and its
    string form must equal `varcall`.
    """
    prefix = 'pico' + ccid
    contig_path = data_file(prefix + '.contig.augfasta')
    gdna_path = data_file(prefix + '.gdna.fa')

    contig_stream = kevlar.parse_augmented_fastx(kevlar.open(contig_path, 'r'))
    queries = list(contig_stream)
    cutout_stream = kevlar.reference.load_refr_cutouts(
        kevlar.open(gdna_path, 'r')
    )
    targets = list(cutout_stream)

    calls = list(call(targets, queries))
    assert len(calls) == 1
    assert str(calls[0]) == varcall
def test_snv_dedup():
    """Expect exactly one call from the 'bee-dupl' fixtures at ksize=27.

    The single call must be on 'linkagegroup5' at position 8174 - 1.
    """
    contig_path = data_file('bee-dupl.contigs.augfasta')
    contigs = list(
        kevlar.parse_augmented_fastx(kevlar.open(contig_path, 'r'))
    )

    gdna_path = data_file('bee-dupl.gdna.fa')
    targets = list(
        kevlar.reference.load_refr_cutouts(kevlar.open(gdna_path, 'r'))
    )

    calls = list(call(targets, contigs, ksize=27))
    assert len(calls) == 1
    only_call = calls[0]
    assert only_call.seqid == 'linkagegroup5'
    assert only_call.position == 8174 - 1
def alac(pstream, refrfile, ksize=31, delta=25, maxdiff=10000, match=1,
         mismatch=2, gapopen=5, gapextend=0, greedy=False,
         logstream=sys.stderr):
    """Assemble, localize, and call variants for each read partition.

    For every partition in `pstream`, the reads are assembled into contigs
    (with `assemble_greedy` when `greedy` is true, otherwise with
    `assemble_fml_asm`), the contigs are localized against `refrfile`, and
    the contigs are aligned to the resulting targets. Variant calls are
    yielded one at a time as they are produced.

    Parameters:
        pstream: iterable of read partitions.
        refrfile: reference passed through to `localize`.
        ksize: passed to `localize` (third positional argument) and `call`.
        delta: passed through to `localize`.
        match, mismatch, gapopen, gapextend: passed through to `call`.
        greedy: selects the assembler.
        logstream: forwarded to the assembler.

    NOTE(review): `maxdiff` is accepted but not used anywhere in this
    function body; confirm whether it should be forwarded to `localize`.
    """
    if greedy:
        assembler = assemble_greedy
    else:
        assembler = assemble_fml_asm

    for reads in pstream:
        contigs = list(assembler(reads, logstream=logstream))
        targets = list(localize(contigs, refrfile, ksize, delta=delta))
        variant_calls = call(targets, contigs, match, mismatch, gapopen,
                             gapextend, ksize)
        for variant in variant_calls:
            yield variant
def test_variant_kmers():
    """Check the `window` attribute of the single 'ssc223' call."""
    # Expected 61-character window. The original marker comment
    # ("variant here") pointed at the variant's column within this string;
    # presumably the central base -- TODO confirm.
    expected_window = (
        'TTATTTTTAACAAAGGAGCAAAGGAGCAAAGGGCAAATACAATGAGGCAAAGATAGTCTCT'
    )
    contig_path = data_file('ssc223.contig.augfasta')
    gdna_path = data_file('ssc223.gdna.fa')

    contig_stream = kevlar.parse_augmented_fastx(kevlar.open(contig_path, 'r'))
    queryseqs = list(contig_stream)
    targetseqs = list(khmer.ReadParser(gdna_path))

    calls = list(call(targetseqs, queryseqs))
    assert len(calls) == 1
    assert calls[0].window == expected_window
def test_perfect_match():
    """Expect one 'PerfectMatch'-filtered call from the 'nodiff' fixtures.

    The call must be on 'chr99' at position 2899377.
    """
    contig_path = data_file('nodiff.contig.fa')
    contigs = list(
        kevlar.parse_augmented_fastx(kevlar.open(contig_path, 'r'))
    )

    gdna_path = data_file('nodiff.gdna.fa')
    targets = list(
        kevlar.reference.load_refr_cutouts(kevlar.open(gdna_path, 'r'))
    )

    calls = list(call(targets, contigs))
    assert len(calls) == 1
    only_call = calls[0]
    assert only_call.seqid == 'chr99'
    assert only_call.position == 2899377
    assert only_call.filterstr == 'PerfectMatch'
def test_funky_cigar_deletion():
    """Expect one deletion call from the 'funkycigar/deletion' fixtures.

    The call must be on 'chr42' at position 53644, with reference allele
    'ATGTCTGTTTTCTTAACCT' and alternate allele 'A'.
    """
    contig_path = data_file('funkycigar/deletion.contig.fa')
    contigs = list(
        kevlar.parse_augmented_fastx(kevlar.open(contig_path, 'r'))
    )

    gdna_path = data_file('funkycigar/deletion.gdna.fa')
    targets = list(
        kevlar.reference.load_refr_cutouts(kevlar.open(gdna_path, 'r'))
    )

    calls = list(call(targets, contigs))
    assert len(calls) == 1
    deletion = calls[0]
    assert deletion.seqid == 'chr42'
    assert deletion.position == 53644
    assert deletion._refr == 'ATGTCTGTTTTCTTAACCT'
    assert deletion._alt == 'A'
def test_funky_cigar(part, coord, window):
    """Check the single call for 'funkycigar' partition number `part`.

    The call must be on sequence '17' at position `coord - 1`, and its
    'ALTWINDOW' attribute must equal `window`. The call's VCF form is
    echoed to stderr for debugging.
    """
    contig_path = data_file(
        'funkycigar/part.cc{:d}.contig.fa.gz'.format(part)
    )
    contigs = list(
        kevlar.parse_augmented_fastx(kevlar.open(contig_path, 'r'))
    )

    gdna_path = data_file('funkycigar/part.cc{:d}.gdna.fa.gz'.format(part))
    targets = list(
        kevlar.reference.load_refr_cutouts(kevlar.open(gdna_path, 'r'))
    )

    calls = list(call(targets, contigs))
    assert len(calls) == 1
    print('DEBUG', calls[0].vcf, file=sys.stderr)
    assert calls[0].seqid == '17'
    assert calls[0].position == coord - 1
    assert calls[0].attribute('ALTWINDOW') == window
def test_call_ssc_isolated_snv(ccid, cigar, varcall):
    """
    Verify that isolated SNVs are called correctly.

    An SNV that lies well apart from other variants yields a distinctive
    alignment signature, visible in the CIGAR string reported by ksw2:
    either "delete-match-delete" or "delete-match-delete-match", where the
    trailing match is very short (and spurious).
    """
    prefix = 'ssc' + ccid
    contig_path = data_file(prefix + '.contig.augfasta')
    gdna_path = data_file(prefix + '.gdna.fa')

    contig_stream = kevlar.parse_augmented_fastx(kevlar.open(contig_path, 'r'))
    queryseqs = list(contig_stream)
    targetseqs = list(khmer.ReadParser(gdna_path))

    calls = list(call(targetseqs, queryseqs))
    assert len(calls) == 1
    assert str(calls[0]) == varcall
def test_multibest_revcom():
    """Expect four A->G calls from the 'multibestrc' fixtures.

    The 1-based coordinates (position + 1) must match the expected list in
    order, and every call must share the same reference allele, alternate
    allele, and window.
    """
    contig_path = data_file('multibestrc.contig.fa')
    contigs = list(
        kevlar.parse_augmented_fastx(kevlar.open(contig_path, 'r'))
    )

    gdna_path = data_file('multibestrc.gdna.fa')
    targets = list(
        kevlar.reference.load_refr_cutouts(kevlar.open(gdna_path, 'r'))
    )

    calls = list(call(targets, contigs))
    assert len(calls) == 4

    coordinates = [vc.position + 1 for vc in calls]
    assert coordinates == [34495786, 34583830, 58088279, 60344854]

    expected_window = ('CCTGAGCCCTCTCAAGTCGGGTCCTGGCCCGGTCTGCCCATGAGGCTGG'
                       'GCCTGAGCCCCA')
    for vc in calls:
        assert vc._refr == 'A'
        assert vc._alt == 'G'
        assert vc.window == expected_window
def test_cigar_filter_regression():
    """Expect two calls from the '14153.cc5463' fixtures.

    Calls are ordered by position; the second must be on sequence '6' and
    match one of two accepted (refr, alt, position) triples.
    """
    contig_path = data_file('14153.cc5463.contig.augfasta.gz')
    contigs = list(
        kevlar.parse_augmented_fastx(kevlar.open(contig_path, 'r'))
    )

    gdna_path = data_file('14153.cc5463.gdna.augfasta.gz')
    targets = list(
        kevlar.reference.load_refr_cutouts(kevlar.open(gdna_path, 'r'))
    )

    calls = list(call(targets, contigs))
    calls.sort(key=lambda vc: vc.position)
    assert len(calls) == 2

    second = calls[1]
    assert second.seqid == '6'

    # Equally valid calls from equally optimal alignments
    accepted = (
        ('AGAAA', 'A', 154734241),
        ('GAAGA', 'G', 154734239),
    )
    observed = (second._refr, second._alt, second.position)
    assert observed in accepted