Beispiel #1
0
    def call_variants(self, ksize, mindist=6, logstream=sys.stderr):
        """Attempt to call variants from this contig alignment.

        This is a generator. The calling strategy depends on `self.vartype`:

        - 'snv': SNVs are called over the aligned match block.
        - 'indel': the indel call is yielded first, followed by any SNV calls
          from the left flank and then the right flank of the indel.
        - anything else: a single "no call" flagged with the InscrutableCigar
          filter is yielded.

        `ksize` is handed to the SNV and indel callers. `mindist` is handed
        to the SNV caller: if an SNV call is within `mindist` base pairs of
        the end of the alignment it is ignored. Set to `None` to disable this
        behavior. `logstream` is forwarded to every SNV caller invocation,
        including the flank callers used for indels.

        Calls for which `self.is_passenger()` is true are flagged with the
        PassengerVariant filter; they are still yielded to the caller.
        """
        offset = 0 if self.targetshort else self.offset
        if self.vartype == 'snv':
            caller = self.call_snv(self.match.query,
                                   self.match.target,
                                   offset,
                                   ksize,
                                   mindist,
                                   logstream=logstream)
            for call in caller:
                if self.is_passenger(call):
                    call.filter(vf.PassengerVariant)
                yield call
        elif self.vartype == 'indel':
            indelcaller = self.call_indel(ksize)
            indel = next(indelcaller)
            if self.is_passenger(indel):
                indel.filter(vf.PassengerVariant)
            yield indel

            # Flank callers never emit a "no call" of their own
            # (donocall=False); the indel above already represents the contig.
            leftflankcaller = self.call_snv(self.leftflank.query,
                                            self.leftflank.target,
                                            offset,
                                            ksize,
                                            mindist,
                                            donocall=False,
                                            logstream=logstream)
            # The right flank starts after the left flank and, for deletions
            # only, after the additional indel length as well.
            offset += self.leftflank.length
            if self.indeltype == 'D':
                offset += self.indel.length
            rightflankcaller = self.call_snv(self.rightflank.query,
                                             self.rightflank.target,
                                             offset,
                                             ksize,
                                             mindist,
                                             donocall=False,
                                             logstream=logstream)
            for call in chain(leftflankcaller, rightflankcaller):
                if self.is_passenger(call):
                    call.filter(vf.PassengerVariant)
                yield call
        else:
            nocall = Variant(self.seqid,
                             self.pos,
                             '.',
                             '.',
                             CONTIG=self.varseq,
                             CIGAR=self.cigar,
                             KSW2=str(self.score))
            nocall.filter(vf.InscrutableCigar)
            yield nocall
Beispiel #2
0
    def call_snv(self,
                 qseq,
                 tseq,
                 offset,
                 ksize,
                 mindist=6,
                 donocall=True,
                 logstream=sys.stderr):
        """Generate SNV calls from a pair of aligned, equal-length sequences.

        `qseq` (query) and `tseq` (target) are compared position by position;
        every position at which they differ is a candidate SNV. When
        `mindist` is truthy the candidates are first passed through
        `trim_terminal_snvs` together with `logstream`.

        `offset` is added to each mismatch position before the coordinate is
        translated with `self.cutout.local_to_global`. `ksize` sizes the
        windows reported with each call: up to `ksize - 1` positions on
        either side of the mismatch, clipped to the sequence boundaries.

        If no candidates remain, a single "no call" flagged with the
        PerfectMatch filter is yielded when `donocall` is true; otherwise
        nothing is yielded.
        """
        length = len(qseq)
        assert len(tseq) == length
        mismatches = [
            i for i, (tnucl, qnucl) in enumerate(zip(tseq, qseq))
            if tnucl != qnucl
        ]
        if mindist:
            mismatches = trim_terminal_snvs(
                mismatches, length, mindist, logstream)

        if not mismatches:
            if not donocall:
                return
            perfect = Variant(
                self.seqid,
                self.cutout.local_to_global(offset),
                '.',
                '.',
                CONTIG=qseq,
                CIGAR=self.cigar,
                KSW2=str(self.score),
                IKMERS=str(len(self.contig.annotations)),
            )
            perfect.filter(vf.PerfectMatch)
            yield perfect
            return

        for pos in mismatches:
            # Window covering every k-mer that overlaps the mismatch.
            window = slice(max(pos - ksize + 1, 0), min(pos + ksize, length))
            altwindow = qseq[window]
            refrwindow = tseq[window]

            globalcoord = self.cutout.local_to_global(pos + offset)
            nikmers = n_ikmers_present(self.contig, altwindow)
            yield Variant(
                self.seqid,
                globalcoord,
                tseq[pos].upper(),
                qseq[pos].upper(),
                CONTIG=qseq,
                CIGAR=self.cigar,
                KSW2=str(self.score),
                IKMERS=str(nikmers),
                ALTWINDOW=altwindow,
                REFRWINDOW=refrwindow,
            )
Beispiel #3
0
def test_region():
    """Check the `region` tuple for an SNV, insertion, MNV, and deletion."""
    start = 1033773
    cases = [
        ('A', 'G', 1033774),
        ('A', 'AGTG', 1033774),
        ('AT', 'TG', 1033775),
        ('ATACCG', 'A', 1033779),
    ]
    for refr, alt, end in cases:
        variant = Variant('chr12', start, refr, alt)
        assert variant.region == ('chr12', start, end)
Beispiel #4
0
def test_snv_obj():
    """Check string/VCF rendering and default attributes of SNV objects."""
    plain = Variant('scaffold42', 10773, 'A', 'G')
    assert str(plain) == 'scaffold42:10773:A->G'
    expected_vcf = '\t'.join(
        ['scaffold42', '10774', '.', 'A', 'G', '.', 'PASS', '.']
    )
    assert plain.vcf == expected_vcf
    assert plain.cigar is None

    # A CIGAR annotation is exposed via `.cigar`; `.window` stays unset.
    annotated = Variant('chr5', 500, 'T', 'G', CIGAR='10D200M10D')
    assert annotated.cigar == '10D200M10D'
    assert annotated.window is None
Beispiel #5
0
def test_writer_bad_fmt(yrb_writer):
    """Writing a variant whose samples carry different FORMAT keys fails."""
    variant = Variant('1', 12345, 'G', 'C')
    variant.annotate('PART', '42')
    variant.annotate('CONTIG', 'A' * 100)

    # NA19238 only has GT, NA19239 only has ALTABUND, NA19240 has both.
    sample_fields = [
        ('NA19238', 'GT', '0/0'),
        ('NA19240', 'GT', '0/1'),
        ('NA19239', 'ALTABUND', '0,0,0'),
        ('NA19240', 'ALTABUND', '0,0,0'),
    ]
    for sample, key, value in sample_fields:
        variant.format(sample, key, value)

    pattern = r'samples not annotated with the same FORMAT fields'
    with pytest.raises(kevlar.vcf.VariantAnnotationError, match=pattern):
        yrb_writer.write(variant)
Beispiel #6
0
 def call_indel(self, ksize):
     """Yield the single indel call described by this alignment.

     The reference and alternate windows are built from the last
     `ksize - 1` characters of the left flank and the first `ksize - 1`
     characters of the right flank, with the indel sequence placed between
     them on whichever side carries it (target for a deletion, query
     otherwise). Both alleles begin with the final character of the left
     flank.
     """
     flanksize = ksize - 1
     left, right = self.leftflank, self.rightflank
     refrleft, refrright = left.target[-flanksize:], right.target[:flanksize]
     altleft, altright = left.query[-flanksize:], right.query[:flanksize]

     if self.indeltype == 'D':
         # Deletion: the extra sequence exists only in the target.
         deleted = self.indel.target
         refrwindow = refrleft + deleted + refrright
         refrallele = left.target[-1] + deleted
         altwindow = altleft + altright
         altallele = left.query[-1]
     else:
         # Otherwise the extra sequence exists only in the query.
         inserted = self.indel.query
         refrwindow = refrleft + refrright
         refrallele = left.target[-1]
         altwindow = altleft + inserted + altright
         altallele = left.query[-1] + inserted

     nikmers = n_ikmers_present(self.contig, altwindow)

     localcoord = left.length
     if not self.targetshort:
         localcoord += self.offset
     globalcoord = self.cutout.local_to_global(localcoord)

     # The call is anchored one position before the translated coordinate.
     yield Variant(
         self.seqid,
         globalcoord - 1,
         refrallele,
         altallele,
         CONTIG=self.varseq,
         CIGAR=self.cigar,
         KSW2=str(self.score),
         IKMERS=str(nikmers),
         ALTWINDOW=altwindow,
         REFRWINDOW=refrwindow,
     )
Beispiel #7
0
def test_indel_obj():
    """Check string and VCF rendering of deletion and insertion objects.

    In both cases below, the coordinate shown by str() and the VCF record is
    one greater than the coordinate passed to the constructor.
    """
    deletion = Variant('chr3', 8998622, 'GATTACA', 'G')
    assert str(deletion) == 'chr3:8998623:6D'
    deletion_vcf = '\t'.join(
        ['chr3', '8998623', '.', 'GATTACA', 'G', '.', 'PASS', '.']
    )
    assert deletion.vcf == deletion_vcf

    insertion = Variant('chr6', 75522411, 'G', 'GATTACA')
    assert str(insertion) == 'chr6:75522412:I->ATTACA'
    insertion_vcf = '\t'.join(
        ['chr6', '75522412', '.', 'G', 'GATTACA', '.', 'PASS', '.']
    )
    assert insertion.vcf == insertion_vcf
Beispiel #8
0
def generate_mutations(sequences, n=10, ksize=31, weights=DWEIGHTS, rng=None):
    """Yield `n` randomly generated Variant objects.

    `sequences` maps sequence IDs to sequences. For each mutation a sequence
    ID, a position within that sequence, and a mutation type ('snv', 'ins',
    or 'del', drawn according to `weights`) are selected at random, and the
    matching `mutate_*` helper builds the alleles and their windows using
    `ksize`.

    `rng` may be a `random.Random` instance, an integer seed, or `None`. With
    `None`, a seed is drawn and reported on stderr so that the run can be
    reproduced.

    Raises ValueError if `weights` yields a mutation type other than the
    three listed above.
    """
    if rng is None:
        seed = random.randrange(sys.maxsize)
        print('[kevlar::gentrio] using random seed', seed, file=sys.stderr)
        rng = random.Random(seed)
    if isinstance(rng, int):
        rng = random.Random(rng)

    # Sort keys so that a given seed produces the same draws regardless of
    # the mappings' iteration order. Both lists are loop invariants, so they
    # are built once rather than once per mutation.
    weightkeys = sorted(weights.keys())
    weightvalues = [weights[k] for k in weightkeys]
    seqids = sorted(sequences.keys())
    for _ in range(n):
        seqid = rng.choice(seqids)
        seq = sequences[seqid]
        seqlength = len(seq)
        position = rng.randint(0, seqlength - 1)
        muttype = weighted_choice(weightkeys, weightvalues, rng)

        if muttype == 'snv':
            offset = rng.randint(1, 3)
            refrseq, altseq, refrwindow, altwindow = mutate_snv(
                seq, position, offset, ksize)
        elif muttype == 'ins':
            length = rng.randint(5, 350)
            duplpos = rng.randint(0, seqlength)
            refrseq, altseq, refrwindow, altwindow = mutate_insertion(
                seq, position, length, duplpos, rng, ksize)
        elif muttype == 'del':
            length = rng.randint(5, 350)
            refrseq, altseq, refrwindow, altwindow = mutate_deletion(
                seq, position, length, ksize)
        else:
            raise ValueError('unknown mutation type {}'.format(muttype))
        yield Variant(seqid,
                      position,
                      refrseq,
                      altseq,
                      ALTWINDOW=altwindow,
                      REFRWINDOW=refrwindow)
Beispiel #9
0
def test_info():
    """Exercise INFO attribute handling on variants.

    Covers the FormattedList helper directly, then the .annotate() and
    .attribute() API for single values, appended values, and the string and
    key=value renderings.
    """
    # FormattedList: empty, homogeneous, then mixed data types.
    flist = FormattedList()
    assert str(flist) == '.'
    flist.append(42)
    assert str(flist) == '42'
    flist.append(1776)
    assert str(flist) == '42,1776'
    flist.append('B0gU$')
    with pytest.raises(kevlar.vcf.KevlarMixedDataTypeError):
        str(flist)

    variant = Variant('1', 12345, 'G', 'C')
    assert variant.attribute('VW') is None

    # By default a new annotation replaces the previous value.
    variant.annotate('VW', 'AGTNNNNNNNNNNNNNNNNNNNNNTGA')
    assert variant.attribute('VW') == 'AGTNNNNNNNNNNNNNNNNNNNNNTGA'

    variant.annotate('VW', 'GATTACA')
    assert variant.attribute('VW') == 'GATTACA'
    assert variant.attribute('VW', pair=True) == 'VW=GATTACA'

    # With replace=False the values accumulate in a list.
    variant.annotate('VW', 'ATGCCCTAG', replace=False)
    two_windows = ['GATTACA', 'ATGCCCTAG']
    assert variant.info['VW'] == two_windows
    assert variant.attribute('VW') == two_windows
    assert variant.attribute('VW', string=True) == 'GATTACA,ATGCCCTAG'
    assert variant.attribute('VW', pair=True) == 'VW=GATTACA,ATGCCCTAG'

    variant.annotate('VW', 'AAAAAAAAA', replace=False)
    three_windows = ['GATTACA', 'ATGCCCTAG', 'AAAAAAAAA']
    assert variant.attribute('VW') == three_windows
    assert (
        variant.attribute('VW', pair=True) == 'VW=GATTACA,ATGCCCTAG,AAAAAAAAA'
    )

    # Integer values.
    variant.annotate('DROPPED', 3)
    assert variant.attribute('DROPPED') == 3
    assert variant.attribute('DROPPED', string=True) == '3'

    variant.annotate('DROPPED', 31, replace=False)
    assert variant.attribute('DROPPED') == [3, 31]
    assert variant.attribute('DROPPED', string=True) == '3,31'
    assert variant.attribute('DROPPED', pair=True) == 'DROPPED=3,31'

    # Float values are rendered with three decimal places.
    variant.annotate('MATEDIST', 432.1234, replace=False)
    variant.annotate('MATEDIST', 8765.4321, replace=False)
    assert variant.attribute('MATEDIST', string=True) == '432.123,8765.432'

    variant.annotate('LLIH', -436.0111857750478)
    assert variant.attribute('LLIH', pair=True) == 'LLIH=-436.011'
Beispiel #10
0
def test_filter_field():
    """Check how applied filters are rendered in `Variant.filterstr`.

    Covers "no call" variants (with and without a filter), a passing variant
    that accumulates filters, and objects passed to .filter() that are not
    recognized filters.
    """
    v = Variant('scaffold1', 12345, '.', '.')
    assert v.filterstr == '.'
    v.filter(vf.InscrutableCigar)
    assert v.filterstr == 'InscrutableCigar'

    v = Variant('chr1', 55555, '.', '.')
    v.filter(vf.PerfectMatch)
    assert v.filterstr == 'PerfectMatch'

    v = Variant('1', 809768, 'C', 'CAT')
    assert v.filterstr == 'PASS'
    v.filter(vf.PassengerVariant)
    assert v.filterstr == 'PassengerVariant'
    v.filter(vf.Homopolymer)
    assert v.filterstr == 'Homopolymer;PassengerVariant'

    v = Variant('one', 112358, 'T', 'A')
    v.filter('SNPyMcSNPface')
    v.filter(6.022e23)
    v.filter(dict(chicken='waffles', biscuits='gravy'))
    # These "filters" shouldn't actually do anything. Without the `assert`
    # this line was a bare comparison whose result was thrown away.
    assert v.filterstr == 'PASS'
Beispiel #11
0
def test_writer(yrb_writer, capsys):
    """Write one fully formatted variant and inspect the VCF output."""
    # A fresh writer is bound to sys.stdout here, in place of the fixture's.
    writer = kevlar.vcf.VCFWriter(sys.stdout, source='py.test')
    for sample in ('NA19238', 'NA19239', 'NA19240'):
        writer.register_sample(sample)
    writer.describe_format('GT', 'String', '1', 'Genotype')
    writer.write_header()

    variant = Variant('1', 12345, 'G', 'C')
    variant.annotate('PART', '42')
    variant.annotate('CONTIG', 'A' * 100)
    sample_fields = [
        ('NA19238', 'GT', '0/0'),
        ('NA19239', 'GT', '0/0'),
        ('NA19240', 'GT', '0/1'),
        ('NA19238', 'ALTABUND', '12,9,8'),
        ('NA19239', 'ALTABUND', '0,0,0'),
        ('NA19240', 'ALTABUND', '0,0,0'),
    ]
    for sample, key, value in sample_fields:
        variant.format(sample, key, value)
    writer.write(variant)

    out, err = capsys.readouterr()
    print(out)

    lines = out.strip().split('\n')
    format_headers = [line for line in lines if line.startswith('##FORMAT')]
    assert len(format_headers) == 2
    expected_gt = (
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">'
    )
    assert expected_gt in format_headers

    records = [line for line in lines if not line.startswith('#')]
    assert len(records) == 1
    columns = records[0].split('\t')
    assert len(columns) == 12
    # FORMAT column followed by one column per registered sample.
    assert columns[8:12] == [
        'ALTABUND:GT', '12,9,8:0/0', '0,0,0:0/0', '0,0,0:0/1'
    ]
Beispiel #12
0
def test_format():
    """Check per-sample FORMAT lookups, including missing keys and samples."""
    variant = Variant('1', 12345, 'G', 'C')
    variant.format('NA19238', 'GT', '0/0')

    stored = variant.format('NA19238', 'GT')
    assert stored == '0/0'

    # An unknown key or an unknown sample both come back as None.
    unknown_key = variant.format('NA19238', 'XYZ')
    assert unknown_key is None
    unknown_sample = variant.format('NA19239', 'GT')
    assert unknown_sample is None