def test_002_check_split(self):
    # Split the merged VCF and check that every variant of each original
    # haploid VCF is found again, unchanged, in the matching split output.
    self.split_fps.extend(split_variants(self.vcf_merged))

    # 1-based positions to leave out of the comparison, one set per
    # haplotype. These are sites where two variants have been merged and
    # cannot easily be separated without alignment, or where an indel has
    # been rolled forwards by the merge-then-split round trip.
    skipped_hap1 = {675, 677, 1582, 1734, 1775}
    skipped_hap2 = {370, 1194}
    compared_fields = ('chrom', 'pos', 'ref', 'alt')

    per_haplotype = zip(
        [self.vcf1, self.vcf2],
        self.split_fps,
        [skipped_hap1, skipped_hap2],
    )
    for truth_vcf, split_vcf, skipped in per_haplotype:
        truth_reader = VCFReader(truth_vcf)
        split_reader = VCFReader(split_vcf)

        for truth in truth_reader.fetch():
            # The exclusion sets are 1-based, hence the + 1 on the
            # position reported by the reader.
            if truth.pos + 1 in skipped:
                continue

            # Query the split VCF over the window covered by the expected
            # reference allele (plus one position).
            window_end = truth.pos + len(truth.ref) + 1
            hits = list(
                split_reader.fetch(truth.chrom, truth.pos, window_end))
            self.assertEqual(
                len(hits), 1,
                'Could not find split variant for {}:{}.'.format(
                    truth.chrom, truth.pos + 1))

            hit = hits[0]
            for field in compared_fields:
                self.assertEqual(
                    getattr(truth, field),
                    getattr(hit, field),
                    'Splitting failed for {}:{} {}.'.format(
                        truth.chrom, truth.pos + 1, field))
def merge_haploid_vcfs(vcf1, vcf2, vcf_out):
    """Merge SNPs from two haploid VCFs into an unphased diploid vcf.

    Every position reported by either input is revisited, chromosome by
    chromosome and in ascending position order, and a single diploid
    record is written for it.

    :param vcf1: first haploid VCF, handed to `VCFReader`.
    :param vcf2: second haploid VCF, handed to `VCFReader`.
    :param vcf_out: output handed to `VCFWriter` (mode 'w', version '4.1').
    """

    def is_ref_or_absent(records):
        # A haplotype contributes no alternative allele at a locus when it
        # has no record there, or when its record carries a '.' ALT.
        return len(records) == 0 or records[0].alt == ['.']

    def combined_gq(records1, records2):
        # The QC is -10*log10(1-p(label)) where p(label) is the medaka
        # consensus probability. To combine two haplotypes we probably
        # want to multiply the (1-p(label)) values, i.e. add the QC
        # scores. However, for a heterozygous SNP where one haplotype is
        # the reference, there is no QC value for the reference haplotype
        # (no variant was called). To stay on a common scale the missing
        # score is approximated as equal to the score of the non-reference
        # haplotype, so the overall score is double the latter.
        first_is_single = len(records1) == 1
        if first_is_single and len(records2) == 1:
            return (float(records1[0].sample_dict['GQ'])
                    + float(records2[0].sample_dict['GQ']))
        present = records1[0] if first_is_single else records2[0]
        return 2 * float(present.sample_dict['GQ'])

    positions_by_chrom = defaultdict(set)
    reader1 = VCFReader(vcf1)
    reader2 = VCFReader(vcf2)
    for record in chain(reader1.fetch(), reader2.fetch()):
        positions_by_chrom[record.chrom].add(record.pos)

    with VCFWriter(vcf_out, 'w', version='4.1') as vcf_writer:
        for chrom, positions in positions_by_chrom.items():
            for pos in sorted(positions):
                hap1 = list(
                    reader1.fetch(ref_name=chrom, start=pos, end=pos+1))
                hap2 = list(
                    reader2.fetch(ref_name=chrom, start=pos, end=pos+1))

                # Note we output unphased GTs as we might have multiple
                # phased regions and the phase can switch between regions.
                if len(hap1) == 1 and is_ref_or_absent(hap2):
                    # heterozygous, alternative allele on the first input;
                    # written 0/1 (not 1/0) by convention since unphased
                    alt, gt = hap1[0].alt, '0/1'
                elif is_ref_or_absent(hap1) and len(hap2) == 1:
                    # heterozygous, alternative allele on the second input
                    alt, gt = hap2[0].alt, '0/1'
                else:
                    assert len(hap1) == 1 and len(hap2) == 1
                    alt1, alt2 = hap1[0].alt, hap2[0].alt
                    if alt1 == alt2:
                        # homozygous snp
                        alt, gt = alt1, '1/1'
                    else:
                        # heterozygous snp with two different alt alleles
                        alt, gt = alt1 + alt2, '1/2'

                gq = combined_gq(hap1, hap2)
                # Take the reference allele from the first input when it
                # has exactly one record here, otherwise from the second.
                ref = hap1[0].ref if len(hap1) == 1 else hap2[0].ref
                merged = Variant(
                    chrom, pos, ref, alt=alt, qual=gq,
                    sample_dict={'GT': gt, 'GQ': gq})
                vcf_writer.write_variant(merged)
def test_vcf_annotate(self):
    # Run `annotate_vcf_n_reads` on the fixture inputs and check that the
    # VCF it writes contains exactly the expected annotated variants, in
    # order.
    primary = [
        Variant(
            'MN908947.3', 29748, 'ACGATCGAGTG', alt=['A'], ident='.',
            qual=243.965, filt='PASS',
            info='AR=0,0;DP=200;DPS=100,100;DPSP=199;SC=19484,20327,22036,23215;SR=1,2,98,98',
            genotype_data=OrderedDict([('GT', '1'), ('GQ', '244')])),
        Variant(
            'MN908947.3', 29764, 'TGAACAATGCT', alt=['A'], ident='.',
            qual=243.965, filt='PASS',
            info='AR=0,0;DP=200;DPS=100,100;DPSP=199;SC=19970,21140,15773,16751;SR=99,100,0,0',
            genotype_data=OrderedDict([('GT', '1'), ('GQ', '244')])),
        Variant(
            'MN908947.3', 29788, 'TATATGGAAGA', alt=['A'], ident='.',
            qual=243.965, filt='PASS',
            info='AR=0,0;DP=199;DPS=99,100;DPSP=197;SC=26174,28129,19085,20315;SR=96,100,1,0',
            genotype_data=OrderedDict([('GT', '1'), ('GQ', '244')])),
    ]

    # The same three records are expected a second time under the
    # chromosome name "Duplicate".
    duplicates = deepcopy(primary)
    for variant in duplicates:
        variant.chrom = "Duplicate"
    expected = primary + duplicates

    with tempfile.NamedTemporaryFile() as vcfout:
        # Annotate vcf
        args = Namespace(
            RG=self.rg, vcf=self.vcf, ref_fasta=self.ref_fasta,
            bam=self.bam, vcfout=vcfout.name, chunk_size=100000, pad=25,
            dpsp=True)
        annotate_vcf_n_reads(args)

        # Read the output back while the temporary file still exists.
        observed = list(VCFReader(vcfout.name).fetch())

        # Comparing only the records that happen to be present would let
        # an empty or truncated output pass, so pin the count first.
        self.assertEqual(
            len(observed), len(expected),
            'Expected {} annotated variants but found {}.'.format(
                len(expected), len(observed)))

        for i, (got, expt) in enumerate(zip(observed, expected)):
            self.assertEqual(
                got, expt,
                'Annotation failed for variant {}: {} {}.'.format(
                    i, got.chrom, got.pos))