Exemple #1
0
def make_call_from_reads(queue,
                         idx,
                         calls,
                         refrfile,
                         ksize=31,
                         delta=50,
                         seedsize=31,
                         maxdiff=None,
                         match=1,
                         mismatch=2,
                         gapopen=5,
                         gapextend=0,
                         min_ikmers=None,
                         refrseqs=None,
                         logstream=sys.stderr):
    """Worker loop: pull read partitions off a queue and call variants.

    The loop never returns on its own. Each iteration takes one partition of
    reads from `queue`, assembles it into contigs, localizes the contigs
    against the reference, aligns contigs to the resulting targets, and
    appends every resulting call to `calls`. `queue.task_done()` is invoked
    once per partition that is processed to completion, including partitions
    that yield no contigs or no targets.

    Parameters:
        queue: work queue providing `empty()`, `get()`, `qsize()` and
            `task_done()`; each item is a non-empty sequence of reads whose
            first element has a `name` attribute.
        idx: integer worker index, used only in the progress message.
        calls: output container; calls are added with `calls.append()`.
        refrfile: reference passed through to `localize` and `call`.
        ksize: passed through to `call`.
        delta, maxdiff, refrseqs: passed through to `localize`.
        seedsize: passed to `localize` as its third positional argument.
        match, mismatch, gapopen, gapextend: passed through to `call`.
        min_ikmers: if not None, contigs whose `annotations` has fewer
            entries than this are discarded before localization.
        logstream: stream receiving the progress message; also passed to
            the assembler and to `localize`.
    """
    while True:
        if queue.empty():
            # Nothing to do yet; poll again after a short pause.
            sleep(3)
            continue
        reads = queue.get()

        # The partition ID, if any, is embedded in the first read's name.
        ccmatch = re.search(r'kvcc=(\d+)', reads[0].name)
        cc = ccmatch.group(1) if ccmatch else None
        message = '[kevlar::alac::make_call_from_reads'
        message += ' (thread={:d})]'.format(idx)
        message += ' grabbed partition={} from queue,'.format(cc)
        message += ' queue size now {:d}'.format(queue.qsize())
        # Honor the caller-supplied log stream rather than always writing to
        # sys.stderr, consistent with the assembler and localizer calls below.
        print(message, file=logstream)

        # Assemble partitioned reads into contig(s)
        contigs = list(assemble_fml_asm(reads, logstream=logstream))
        if min_ikmers is not None:
            # Apply min ikmer filter if it's set
            contigs = [c for c in contigs if len(c.annotations) >= min_ikmers]
        if not contigs:
            queue.task_done()
            continue

        # Identify the genomic region(s) associated with each contig
        localizer = localize(contigs,
                             refrfile,
                             seedsize,
                             delta=delta,
                             maxdiff=maxdiff,
                             refrseqs=refrseqs,
                             logstream=logstream)
        targets = list(localizer)
        if not targets:
            queue.task_done()
            continue

        # Align contigs to genomic targets to make variant calls
        caller = call(targets, contigs, match, mismatch, gapopen, gapextend,
                      ksize, refrfile)
        for varcall in caller:
            if cc:
                # Tag the call with the partition it came from.
                varcall.annotate('PART', cc)
            calls.append(varcall)
        queue.task_done()
Exemple #2
0
def test_call_pico_indel(ccid, varcall):
    """Check that one 'pico' partition yields exactly the expected call.

    The contig and gDNA files are selected by `ccid`; the string form of
    the single resulting call must equal `varcall`.
    """
    contig_path = data_file('pico' + ccid + '.contig.augfasta')
    gdna_path = data_file('pico' + ccid + '.gdna.fa')

    # Load query contigs first, then the target sequences.
    contig_handle = kevlar.open(contig_path, 'r')
    queryseqs = list(kevlar.parse_augmented_fastx(contig_handle))
    targetseqs = list(khmer.ReadParser(gdna_path))

    observed = list(call(targetseqs, queryseqs))
    assert len(observed) == 1
    assert str(observed[0]) == varcall
Exemple #3
0
def test_call_pico_indel(ccid, varcall):
    """Check that one 'pico' partition yields exactly the expected call.

    Targets are loaded as reference cutouts from the gDNA file chosen by
    `ccid`; the string form of the single resulting call must equal
    `varcall`.
    """
    contig_path = data_file('pico' + ccid + '.contig.augfasta')
    gdna_path = data_file('pico' + ccid + '.gdna.fa')

    # Load query contigs first, then the reference cutouts.
    contig_handle = kevlar.open(contig_path, 'r')
    queries = list(kevlar.parse_augmented_fastx(contig_handle))
    gdna_handle = kevlar.open(gdna_path, 'r')
    targets = list(kevlar.reference.load_refr_cutouts(gdna_handle))

    observed = list(call(targets, queries))
    assert len(observed) == 1
    assert str(observed[0]) == varcall
Exemple #4
0
def test_snv_dedup():
    """The 'bee-dupl' data set must produce a single call at k=27."""
    contig_handle = kevlar.open(data_file('bee-dupl.contigs.augfasta'), 'r')
    contigs = [ctg for ctg in kevlar.parse_augmented_fastx(contig_handle)]

    gdna_handle = kevlar.open(data_file('bee-dupl.gdna.fa'), 'r')
    targets = [
        cutout for cutout in kevlar.reference.load_refr_cutouts(gdna_handle)
    ]

    observed = [vc for vc in call(targets, contigs, ksize=27)]
    assert len(observed) == 1
    only_call = observed[0]
    assert only_call.seqid == 'linkagegroup5'
    assert only_call.position == 8174 - 1
Exemple #5
0
def alac(pstream, refrfile, ksize=31, delta=25, maxdiff=10000, match=1,
         mismatch=2, gapopen=5, gapextend=0, greedy=False,
         logstream=sys.stderr):
    """Assemble, localize, and call variants for each partition in turn.

    For every partition in `pstream`, the reads are assembled (with the
    greedy assembler when `greedy` is true, otherwise with the fml-asm
    assembler), the contigs are localized against `refrfile` using `ksize`
    and `delta`, and each call produced by aligning contigs to targets is
    yielded. `maxdiff` is accepted but not used in this body.
    """
    if greedy:
        assemble = assemble_greedy
    else:
        assemble = assemble_fml_asm

    for reads in pstream:
        contigs = list(assemble(reads, logstream=logstream))
        targets = list(localize(contigs, refrfile, ksize, delta=delta))
        variants = call(targets, contigs, match, mismatch, gapopen,
                        gapextend, ksize)
        for variant in variants:
            yield variant
Exemple #6
0
def test_variant_kmers():
    """The single ssc223 call must report the expected variant window."""
    contig_path = data_file('ssc223.contig.augfasta')
    gdna_path = data_file('ssc223.gdna.fa')

    contig_handle = kevlar.open(contig_path, 'r')
    queryseqs = list(kevlar.parse_augmented_fastx(contig_handle))
    targetseqs = list(khmer.ReadParser(gdna_path))

    observed = list(call(targetseqs, queryseqs))
    assert len(observed) == 1

    #              variant here---------------|
    expected = 'TTATTTTTAACAAAGGAGCAAAGGAGCAAAGGGCAAATACAATGAGGCAAAGATAGTCTCT'
    assert observed[0].window == expected
Exemple #7
0
def test_perfect_match():
    """A contig identical to the reference gives one 'PerfectMatch' call."""
    contig_handle = kevlar.open(data_file('nodiff.contig.fa'), 'r')
    contigs = [ctg for ctg in kevlar.parse_augmented_fastx(contig_handle)]

    gdna_handle = kevlar.open(data_file('nodiff.gdna.fa'), 'r')
    targets = [
        cutout for cutout in kevlar.reference.load_refr_cutouts(gdna_handle)
    ]

    observed = [vc for vc in call(targets, contigs)]
    assert len(observed) == 1
    only_call = observed[0]
    assert only_call.seqid == 'chr99'
    assert only_call.position == 2899377
    assert only_call.filterstr == 'PerfectMatch'
Exemple #8
0
def test_funky_cigar_deletion():
    """The 'funkycigar' deletion data set gives one specific deletion call."""
    contig_handle = kevlar.open(
        data_file('funkycigar/deletion.contig.fa'), 'r'
    )
    contigs = [ctg for ctg in kevlar.parse_augmented_fastx(contig_handle)]

    gdna_handle = kevlar.open(data_file('funkycigar/deletion.gdna.fa'), 'r')
    targets = [
        cutout for cutout in kevlar.reference.load_refr_cutouts(gdna_handle)
    ]

    observed = [vc for vc in call(targets, contigs)]
    assert len(observed) == 1
    deletion = observed[0]
    assert deletion.seqid == 'chr42'
    assert deletion.position == 53644
    assert deletion._refr == 'ATGTCTGTTTTCTTAACCT'
    assert deletion._alt == 'A'
Exemple #9
0
def test_funky_cigar(part, coord, window):
    """Check the single call made for one 'funkycigar' partition.

    `part` selects the input files; the call must be on sequence '17' at
    position `coord - 1`, with an ALTWINDOW attribute equal to `window`.
    """
    contig_path = data_file(
        'funkycigar/part.cc{:d}.contig.fa.gz'.format(part)
    )
    contig_handle = kevlar.open(contig_path, 'r')
    contigs = [ctg for ctg in kevlar.parse_augmented_fastx(contig_handle)]

    gdna_path = data_file('funkycigar/part.cc{:d}.gdna.fa.gz'.format(part))
    gdna_handle = kevlar.open(gdna_path, 'r')
    targets = [
        cutout for cutout in kevlar.reference.load_refr_cutouts(gdna_handle)
    ]

    observed = [vc for vc in call(targets, contigs)]
    assert len(observed) == 1
    # Emit the VCF form of the call to stderr before checking its fields.
    print('DEBUG', observed[0].vcf, file=sys.stderr)
    assert observed[0].seqid == '17'
    assert observed[0].position == coord - 1
    assert observed[0].attribute('ALTWINDOW') == window
Exemple #10
0
def test_call_ssc_isolated_snv(ccid, cigar, varcall):
    """
    Verify that isolated SNVs are called correctly.

    An SNV that is well separated from other variants produces a distinctive
    alignment signature in the CIGAR string reported by ksw2: either
    "delete-match-delete" or "delete-match-delete-match", where the second
    match is very short (and spurious).

    The `cigar` argument is not referenced in the body of this test.
    """
    contig_path = data_file('ssc' + ccid + '.contig.augfasta')
    gdna_path = data_file('ssc' + ccid + '.gdna.fa')

    contig_handle = kevlar.open(contig_path, 'r')
    queryseqs = list(kevlar.parse_augmented_fastx(contig_handle))
    targetseqs = list(khmer.ReadParser(gdna_path))

    observed = list(call(targetseqs, queryseqs))
    assert len(observed) == 1
    assert str(observed[0]) == varcall
Exemple #11
0
def test_multibest_revcom():
    """The 'multibestrc' data set gives four identical A>G calls.

    The calls must appear at four specific coordinates, in order, and all
    share the same reference allele, alternate allele, and window.
    """
    contig_handle = kevlar.open(data_file('multibestrc.contig.fa'), 'r')
    contigs = [ctg for ctg in kevlar.parse_augmented_fastx(contig_handle)]

    gdna_handle = kevlar.open(data_file('multibestrc.gdna.fa'), 'r')
    targets = [
        cutout for cutout in kevlar.reference.load_refr_cutouts(gdna_handle)
    ]

    observed = [vc for vc in call(targets, contigs)]
    assert len(observed) == 4

    # Compare coordinates in 1-based form.
    coordinates = []
    for vc in observed:
        coordinates.append(vc.position + 1)
    assert coordinates == [34495786, 34583830, 58088279, 60344854]

    for vc in observed:
        assert vc._refr == 'A'
        assert vc._alt == 'G'
        assert vc.window == ('CCTGAGCCCTCTCAAGTCGGGTCCTGGCCCGGTCTGCCCATGAGGCTGG'
                             'GCCTGAGCCCCA')
Exemple #12
0
def test_cigar_filter_regression():
    """Regression check on the two calls made for partition cc5463.

    With calls ordered by position, the second must be on sequence '6' and
    match one of two deletions that are accepted as equally valid.
    """
    contig_handle = kevlar.open(
        data_file('14153.cc5463.contig.augfasta.gz'), 'r'
    )
    contigs = [ctg for ctg in kevlar.parse_augmented_fastx(contig_handle)]

    gdna_handle = kevlar.open(data_file('14153.cc5463.gdna.augfasta.gz'), 'r')
    targets = [
        cutout for cutout in kevlar.reference.load_refr_cutouts(gdna_handle)
    ]

    ordered = list(call(targets, contigs))
    ordered.sort(key=lambda vc: vc.position)
    assert len(ordered) == 2
    second = ordered[1]
    assert second.seqid == '6'

    # Equally valid calls from equally optimal alignments
    acceptable = (
        ('AGAAA', 'A', 154734241),
        ('GAAGA', 'G', 154734239),
    )

    summary = (second._refr, second._alt, second.position)
    assert summary in acceptable