def call_variants(self, ksize, mindist=6, logstream=sys.stderr):
    """Attempt to call variants from this contig alignment.

    The caller is chosen from `self.vartype`:

    - 'snv': the full match block is scanned for SNVs.
    - 'indel': the indel is called first, followed by any SNVs found in the
      left and right flanking match blocks.
    - anything else: a single "no call" flagged with the `InscrutableCigar`
      filter is reported.

    If an SNV call is within `mindist` base pairs of the end of the
    alignment it is ignored. Set to `None` to disable this behavior.

    Variant calls for which `self.is_passenger()` is true are flagged with
    the `PassengerVariant` filter; they are still yielded to the caller.

    :param ksize: k-mer size, forwarded to the SNV and indel callers
    :param mindist: minimum distance of an SNV from the alignment ends
    :param logstream: stream forwarded to every `call_snv` invocation
    :yields: variant call objects, one at a time
    """
    offset = 0 if self.targetshort else self.offset

    if self.vartype == 'snv':
        caller = self.call_snv(
            self.match.query, self.match.target, offset, ksize, mindist,
            logstream=logstream
        )
        for call in caller:
            if self.is_passenger(call):
                call.filter(vf.PassengerVariant)
            yield call

    elif self.vartype == 'indel':
        indel = next(self.call_indel(ksize))
        if self.is_passenger(indel):
            indel.filter(vf.PassengerVariant)
        yield indel

        # Flanking SNVs never produce a "perfect match" no-call of their
        # own (donocall=False). The log stream is forwarded here too so that
        # diagnostics go where the caller asked, exactly as in the SNV branch.
        leftflankcaller = self.call_snv(
            self.leftflank.query, self.leftflank.target, offset, ksize,
            mindist, donocall=False, logstream=logstream
        )
        # The right flank begins after the left flank and, for a deletion,
        # after the deleted reference bases as well.
        offset += self.leftflank.length
        if self.indeltype == 'D':
            offset += self.indel.length
        rightflankcaller = self.call_snv(
            self.rightflank.query, self.rightflank.target, offset, ksize,
            mindist, donocall=False, logstream=logstream
        )
        for call in chain(leftflankcaller, rightflankcaller):
            if self.is_passenger(call):
                call.filter(vf.PassengerVariant)
            yield call

    else:
        nocall = Variant(
            self.seqid, self.pos, '.', '.', CONTIG=self.varseq,
            CIGAR=self.cigar, KSW2=str(self.score)
        )
        nocall.filter(vf.InscrutableCigar)
        yield nocall
def call_snv(self, qseq, tseq, offset, ksize, mindist=6, donocall=True,
             logstream=sys.stderr):
    """Call SNVs from a pair of aligned, equal-length sequences.

    Every position at which `qseq` (query) and `tseq` (target) disagree is
    a candidate SNV.

    :param qseq: aligned query sequence
    :param tseq: aligned target sequence; must be as long as `qseq`
    :param offset: number of 5' nucleotides in the target not aligned to the
                   query; added to each mismatch position before it is
                   converted to a global coordinate
    :param ksize: k-mer size; each call is annotated with the reference and
                  alternate windows reaching up to `ksize - 1` positions to
                  either side of the mismatch
    :param mindist: if truthy, candidates are passed through
                    `trim_terminal_snvs` together with this value
    :param donocall: when there are no candidates, report a single no-call
                     flagged `PerfectMatch` instead of yielding nothing
    :param logstream: stream handed to `trim_terminal_snvs`
    :yields: one variant per retained mismatch, or at most one no-call
    """
    length = len(qseq)
    assert len(tseq) == length

    diffs = [
        i for i, (tnucl, qnucl) in enumerate(zip(tseq, qseq))
        if tnucl != qnucl
    ]
    if mindist:
        diffs = trim_terminal_snvs(diffs, length, mindist, logstream)

    if not diffs:
        # Nothing to call: optionally report the perfect match and stop.
        if donocall:
            nocall = Variant(
                self.seqid, self.cutout.local_to_global(offset), '.', '.',
                CONTIG=qseq, CIGAR=self.cigar, KSW2=str(self.score),
                IKMERS=str(len(self.contig.annotations))
            )
            nocall.filter(vf.PerfectMatch)
            yield nocall
        return

    for pos in diffs:
        # Window clipped to the bounds of the alignment.
        window = slice(max(pos - ksize + 1, 0), min(pos + ksize, length))
        altwindow = qseq[window]
        refrwindow = tseq[window]
        refr = tseq[pos].upper()
        alt = qseq[pos].upper()
        globalcoord = self.cutout.local_to_global(pos + offset)
        nikmers = n_ikmers_present(self.contig, altwindow)
        yield Variant(
            self.seqid, globalcoord, refr, alt, CONTIG=qseq,
            CIGAR=self.cigar, KSW2=str(self.score), IKMERS=str(nikmers),
            ALTWINDOW=altwindow, REFRWINDOW=refrwindow
        )
def test_region():
    """Check the interval reported by `Variant.region`.

    Each case is (reference allele, alternate allele, expected end
    coordinate) for a variant constructed at chr12:1033773.
    """
    cases = [
        ('A', 'G', 1033774),
        ('A', 'AGTG', 1033774),
        ('AT', 'TG', 1033775),
        ('ATACCG', 'A', 1033779),
    ]
    for refr, alt, end in cases:
        variant = Variant('chr12', 1033773, refr, alt)
        assert variant.region == ('chr12', 1033773, end)
def test_snv_obj():
    """Check string, VCF, and optional-attribute output of an SNV."""
    snv = Variant('scaffold42', 10773, 'A', 'G')
    assert str(snv) == 'scaffold42:10773:A->G'

    # The VCF record reports the position as 10774 for a variant built at
    # 10773.
    vcffields = ('scaffold42', '10774', '.', 'A', 'G', '.', 'PASS', '.')
    assert snv.vcf == '\t'.join(vcffields)
    assert snv.cigar is None

    annotated = Variant('chr5', 500, 'T', 'G', CIGAR='10D200M10D')
    assert annotated.cigar == '10D200M10D'
    assert annotated.window is None
def test_writer_bad_fmt(yrb_writer):
    """Writing a variant whose samples carry different FORMAT fields fails.

    Sample NA19238 only gets GT, NA19239 only gets ALTABUND, and NA19240
    gets both, so the writer is expected to raise.
    """
    variant = Variant('1', 12345, 'G', 'C')
    variant.annotate('PART', '42')
    variant.annotate('CONTIG', 'A' * 100)

    sample_formats = [
        ('NA19238', 'GT', '0/0'),
        ('NA19240', 'GT', '0/1'),
        ('NA19239', 'ALTABUND', '0,0,0'),
        ('NA19240', 'ALTABUND', '0,0,0'),
    ]
    for sample, key, value in sample_formats:
        variant.format(sample, key, value)

    errormsg = r'samples not annotated with the same FORMAT fields'
    with pytest.raises(kevlar.vcf.VariantAnnotationError, match=errormsg):
        yrb_writer.write(variant)
def call_indel(self, ksize):
    """Call the indel described by this alignment.

    The reference and alternate windows are built from the last `ksize - 1`
    nucleotides of the left flank and the first `ksize - 1` nucleotides of
    the right flank. For a deletion (`self.indeltype == 'D'`) the deleted
    target sequence is placed between the flanks of the reference window;
    otherwise the inserted query sequence is placed between the flanks of
    the alternate window.

    Both alleles begin with the final nucleotide of the left flank, and the
    reported coordinate is moved back by one accordingly.

    :param ksize: k-mer size used to size the flanking windows
    :yields: exactly one variant object
    """
    flank = ksize - 1
    left, right = self.leftflank, self.rightflank

    refr_left, refr_right = left.target[-flank:], right.target[:flank]
    alt_left, alt_right = left.query[-flank:], right.query[:flank]

    if self.indeltype == 'D':
        deleted = self.indel.target
        refrwindow = refr_left + deleted + refr_right
        refrallele = left.target[-1] + deleted
        altwindow = alt_left + alt_right
        altallele = left.query[-1]
    else:
        inserted = self.indel.query
        refrwindow = refr_left + refr_right
        refrallele = left.target[-1]
        altwindow = alt_left + inserted + alt_right
        altallele = left.query[-1] + inserted

    nikmers = n_ikmers_present(self.contig, altwindow)

    # The indel sits immediately after the left flank.
    base = 0 if self.targetshort else self.offset
    globalcoord = self.cutout.local_to_global(base + self.leftflank.length)

    yield Variant(
        self.seqid, globalcoord - 1, refrallele, altallele,
        CONTIG=self.varseq, CIGAR=self.cigar, KSW2=str(self.score),
        IKMERS=str(nikmers), ALTWINDOW=altwindow, REFRWINDOW=refrwindow
    )
def test_indel_obj():
    """Test string and VCF output of indel objects.

    The coordinate passed to the constructor is 0-based and points at the
    nucleotide shared by the reference and alternate alleles. The str()
    output adds 1 to step past that shared nucleotide; the VCF output adds 1
    to convert to a 1-based coordinate that still refers to the shared
    nucleotide.
    """
    deletion = Variant('chr3', 8998622, 'GATTACA', 'G')
    assert str(deletion) == 'chr3:8998623:6D'
    deletion_vcf = ('chr3', '8998623', '.', 'GATTACA', 'G', '.', 'PASS', '.')
    assert deletion.vcf == '\t'.join(deletion_vcf)

    insertion = Variant('chr6', 75522411, 'G', 'GATTACA')
    assert str(insertion) == 'chr6:75522412:I->ATTACA'
    insertion_vcf = ('chr6', '75522412', '.', 'G', 'GATTACA', '.', 'PASS', '.')
    assert insertion.vcf == '\t'.join(insertion_vcf)
def generate_mutations(sequences, n=10, ksize=31, weights=DWEIGHTS, rng=None):
    """Generate `n` random mutations against the given sequences.

    :param sequences: mapping of sequence ID to sequence
    :param n: number of mutations to generate
    :param ksize: k-mer size, forwarded to the `mutate_*` helpers
    :param weights: mapping of mutation type ('snv', 'ins', 'del') to its
                    relative weight, used by `weighted_choice`
    :param rng: source of randomness; `None` draws a fresh seed and reports
                it on stderr, an `int` is used as the seed, and any other
                value is used directly (presumably a `random.Random`)
    :yields: one `Variant` per mutation, annotated with ALTWINDOW and
             REFRWINDOW
    :raises ValueError: if a mutation type other than 'snv', 'ins', or 'del'
                        is selected
    """
    if rng is None:
        seed = random.randrange(sys.maxsize)
        print('[kevlar::gentrio] using random seed', seed, file=sys.stderr)
        rng = random.Random(seed)
    if isinstance(rng, int):
        rng = random.Random(rng)

    # Sorted so that a given seed always produces the same mutations,
    # regardless of mapping order.
    weightkeys = sorted(weights.keys())
    weightvalues = [weights[k] for k in weightkeys]
    # Loop-invariant: sort the sequence IDs once rather than per mutation.
    seqids = sorted(sequences.keys())

    for _ in range(n):
        seqid = rng.choice(seqids)
        seq = sequences[seqid]
        seqlength = len(seq)
        position = rng.randint(0, seqlength - 1)
        muttype = weighted_choice(weightkeys, weightvalues, rng)
        if muttype == 'snv':
            offset = rng.randint(1, 3)
            refrseq, altseq, refrwindow, altwindow = mutate_snv(
                seq, position, offset, ksize)
        elif muttype == 'ins':
            length = rng.randint(5, 350)
            duplpos = rng.randint(0, seqlength)
            refrseq, altseq, refrwindow, altwindow = mutate_insertion(
                seq, position, length, duplpos, rng, ksize)
        elif muttype == 'del':
            length = rng.randint(5, 350)
            refrseq, altseq, refrwindow, altwindow = mutate_deletion(
                seq, position, length, ksize)
        else:
            raise ValueError('unknown mutation type {}'.format(muttype))
        yield Variant(seqid, position, refrseq, altseq, ALTWINDOW=altwindow,
                      REFRWINDOW=refrwindow)
def test_info():
    """Test handling of "info" field attributes.

    Exercises the .annotate() and .attribute() API of Variant, along with
    the FormattedList class that backs it.
    """
    # FormattedList: empty, homogeneous, then mixed data types.
    flist = FormattedList()
    assert str(flist) == '.'
    flist.append(42)
    assert str(flist) == '42'
    flist.append(1776)
    assert str(flist) == '42,1776'
    flist.append('B0gU$')
    with pytest.raises(kevlar.vcf.KevlarMixedDataTypeError):
        str(flist)

    # String-valued attribute: unset, set, replaced.
    variant = Variant('1', 12345, 'G', 'C')
    assert variant.attribute('VW') is None
    variant.annotate('VW', 'AGTNNNNNNNNNNNNNNNNNNNNNTGA')
    assert variant.attribute('VW') == 'AGTNNNNNNNNNNNNNNNNNNNNNTGA'
    variant.annotate('VW', 'GATTACA')
    assert variant.attribute('VW') == 'GATTACA'
    assert variant.attribute('VW', pair=True) == 'VW=GATTACA'

    # replace=False appends instead of overwriting.
    variant.annotate('VW', 'ATGCCCTAG', replace=False)
    assert variant.info['VW'] == ['GATTACA', 'ATGCCCTAG']
    assert variant.attribute('VW') == ['GATTACA', 'ATGCCCTAG']
    assert variant.attribute('VW', string=True) == 'GATTACA,ATGCCCTAG'
    assert variant.attribute('VW', pair=True) == 'VW=GATTACA,ATGCCCTAG'
    variant.annotate('VW', 'AAAAAAAAA', replace=False)
    assert variant.attribute('VW') == ['GATTACA', 'ATGCCCTAG', 'AAAAAAAAA']
    assert variant.attribute('VW', pair=True) == \
        'VW=GATTACA,ATGCCCTAG,AAAAAAAAA'

    # Integer-valued attribute.
    variant.annotate('DROPPED', 3)
    assert variant.attribute('DROPPED') == 3
    assert variant.attribute('DROPPED', string=True) == '3'
    variant.annotate('DROPPED', 31, replace=False)
    assert variant.attribute('DROPPED') == [3, 31]
    assert variant.attribute('DROPPED', string=True) == '3,31'
    assert variant.attribute('DROPPED', pair=True) == 'DROPPED=3,31'

    # Float-valued attributes are rendered with three decimal places.
    variant.annotate('MATEDIST', 432.1234, replace=False)
    variant.annotate('MATEDIST', 8765.4321, replace=False)
    assert variant.attribute('MATEDIST', string=True) == '432.123,8765.432'
    variant.annotate('LLIH', -436.0111857750478)
    assert variant.attribute('LLIH', pair=True) == 'LLIH=-436.011'
def test_filter_field():
    """Check how the FILTER field is rendered by `Variant.filterstr`.

    Covers no-calls (before and after filtering), a passing call, a call
    with one and then two filters, and arbitrary objects passed to
    `.filter()`.
    """
    v = Variant('scaffold1', 12345, '.', '.')
    assert v.filterstr == '.'
    v.filter(vf.InscrutableCigar)
    assert v.filterstr == 'InscrutableCigar'

    v = Variant('chr1', 55555, '.', '.')
    v.filter(vf.PerfectMatch)
    assert v.filterstr == 'PerfectMatch'

    v = Variant('1', 809768, 'C', 'CAT')
    assert v.filterstr == 'PASS'
    v.filter(vf.PassengerVariant)
    assert v.filterstr == 'PassengerVariant'
    v.filter(vf.Homopolymer)
    assert v.filterstr == 'Homopolymer;PassengerVariant'

    v = Variant('one', 112358, 'T', 'A')
    v.filter('SNPyMcSNPface')
    v.filter(6.022e23)
    v.filter(dict(chicken='waffles', biscuits='gravy'))
    # These "filters" shouldn't actually do anything. The comparison was
    # previously a bare expression, so it was never actually checked.
    assert v.filterstr == 'PASS'
def test_writer(yrb_writer, capsys):
    """Write a header and one variant record, then inspect the output.

    Expects two ##FORMAT header lines (one of them the GT declaration) and a
    single 12-column record whose FORMAT and sample columns list ALTABUND
    before GT.
    """
    # The `yrb_writer` fixture is requested but not used: a fresh writer is
    # built here (presumably so that it binds to the stdout captured by
    # `capsys` -- confirm).
    writer = kevlar.vcf.VCFWriter(sys.stdout, source='py.test')
    for sample in ('NA19238', 'NA19239', 'NA19240'):
        writer.register_sample(sample)
    writer.describe_format('GT', 'String', '1', 'Genotype')
    writer.write_header()

    variant = Variant('1', 12345, 'G', 'C')
    variant.annotate('PART', '42')
    variant.annotate('CONTIG', 'A' * 100)
    sample_formats = [
        ('NA19238', 'GT', '0/0'),
        ('NA19239', 'GT', '0/0'),
        ('NA19240', 'GT', '0/1'),
        ('NA19238', 'ALTABUND', '12,9,8'),
        ('NA19239', 'ALTABUND', '0,0,0'),
        ('NA19240', 'ALTABUND', '0,0,0'),
    ]
    for sample, key, value in sample_formats:
        variant.format(sample, key, value)
    writer.write(variant)

    out, err = capsys.readouterr()
    print(out)
    outlines = out.strip().split('\n')

    fmtlines = [line for line in outlines if line.startswith('##FORMAT')]
    assert len(fmtlines) == 2
    gtfmt = '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">'
    assert gtfmt in fmtlines

    varlines = [line for line in outlines if not line.startswith('#')]
    assert len(varlines) == 1
    values = varlines[0].split('\t')
    assert len(values) == 12
    assert values[8:12] == [
        'ALTABUND:GT', '12,9,8:0/0', '0,0,0:0/0', '0,0,0:0/1'
    ]
def test_format():
    """Check retrieval of per-sample FORMAT values via `Variant.format`.

    A stored value is returned as-is; an unknown key or an unknown sample
    yields None.
    """
    variant = Variant('1', 12345, 'G', 'C')
    variant.format('NA19238', 'GT', '0/0')

    lookups = [
        ('NA19238', 'GT', '0/0'),
        ('NA19238', 'XYZ', None),
        ('NA19239', 'GT', None),
    ]
    for sample, key, expected in lookups:
        observed = variant.format(sample, key)
        if expected is None:
            assert observed is None
        else:
            assert observed == expected