def test_phase_ped_sample(tmpdir, sample_set):
    """Phasing with ``ped`` plus ``samples`` must match ``samples`` alone.

    Both runs are restricted to ``sample_set`` (a subset of the trio). The
    stated intent of this test is that the trio information is then ignored,
    so the phasing of every selected sample has to agree between the runs.
    """
    with_ped_vcf = str(tmpdir.join("output1.vcf"))
    samples_only_vcf = str(tmpdir.join("output2.vcf"))

    def phase(output, **extra):
        # Only the output path and the optional extras differ between runs.
        run_whatshap(
            phase_input_files=[ped_samples_bamfile],
            variant_file="tests/data/ped_samples.vcf",
            output=output,
            samples=sample_set,
            **extra,
        )

    phase(with_ped_vcf, ped="tests/data/trio.ped")
    phase(samples_only_vcf)

    for produced in (with_ped_vcf, samples_only_vcf):
        assert os.path.isfile(produced)

    tables_with_ped = list(VcfReader(with_ped_vcf, phases=True))
    tables_samples_only = list(VcfReader(samples_only_vcf, phases=True))
    assert len(tables_with_ped) == 1
    assert len(tables_samples_only) == 1

    table_with_ped = tables_with_ped[0]
    table_samples_only = tables_samples_only[0]
    for individual in sample_set:
        assert_phasing(
            table_with_ped.phases_of(individual),
            table_samples_only.phases_of(individual),
        )
def test_ped_sample(sample_set, tmp_path):
    """Genotyping with ``ped`` plus ``samples`` must match ``samples`` alone.

    Both runs are restricted to ``sample_set`` (a subset of the trio). The
    stated intent of this test is that the trio information is then ignored,
    so the genotype likelihoods of every selected sample must be identical.
    """
    with_ped_vcf = tmp_path / "output1.vcf"
    samples_only_vcf = tmp_path / "output2.vcf"

    def genotype(output, **extra):
        # Only the output path and the optional extras differ between runs.
        run_genotype(
            phase_input_files=[ped_samples_bamfile],
            variant_file="tests/data/ped_samples.vcf",
            output=output,
            samples=sample_set,
            **extra,
        )

    def read_tables(path):
        return list(VcfReader(path, phases=True, genotype_likelihoods=True))

    genotype(with_ped_vcf, ped="tests/data/trio.ped")
    genotype(samples_only_vcf)

    for produced in (with_ped_vcf, samples_only_vcf):
        assert os.path.isfile(produced)

    tables_with_ped = read_tables(with_ped_vcf)
    tables_samples_only = read_tables(samples_only_vcf)
    assert len(tables_with_ped) == 1
    assert len(tables_samples_only) == 1

    table_with_ped = tables_with_ped[0]
    table_samples_only = tables_samples_only[0]
    for individual in sample_set:
        likelihood_pairs = zip(
            table_with_ped.genotype_likelihoods_of(individual),
            table_samples_only.genotype_likelihoods_of(individual),
        )
        for gl_with_ped, gl_samples_only in likelihood_pairs:
            print(gl_with_ped, gl_samples_only)
            assert gl_with_ped.log10_probs() == gl_samples_only.log10_probs()
def test_read_region_subsets():
    """Fetch two regions of chr1 and check the resulting variant table."""
    reader = VcfReader("tests/data/haplotag_1.vcf.gz", indels=True)
    table = reader.fetch_regions(
        "chr1",
        [(1069570, 1070690), (1074910, 1076152)],
    )

    assert table.chromosome == "chr1"
    assert len(table.variants) == 8

    # The sixth variant is expected to have alleles CG -> C.
    sixth = table.variants[5]
    assert sixth.reference_allele == "CG"
    assert sixth.alternative_allele == "C"
def test_read_genotype_likelihoods():
    """Read a VCF with genotype likelihoods and compare them per sample."""
    tables = list(VcfReader("tests/data/genotype-likelihoods.vcf", genotype_likelihoods=True))
    assert len(tables) == 1
    table = tables[0]

    assert table.chromosome == "chrA"
    assert table.samples == ["sample1", "sample2"]
    assert len(table.variants) == 4
    assert len(table.genotypes) == 2

    expected_indices = ([2, 1, 1, 1], [1, 0, 0, 1])
    for row, indices in enumerate(expected_indices):
        assert list(table.genotypes[row]) == canonic_index_list_to_biallelic_gt_list(indices)

    gl0 = GenotypeLikelihoods([-2.1206, -0.8195, -0.07525])
    gl1 = GenotypeLikelihoods([-10.3849, 0, -5.99143])
    gl2 = GenotypeLikelihoods([-2.1, -0.8, -0.8])
    gl3 = GenotypeLikelihoods([0, -10.0, -0.6])
    expected_likelihoods = {
        "sample1": [gl0, gl2, None, gl0],
        "sample2": [gl1, gl3, None, gl1],
    }

    # Check lengths for all samples first, so zip() below cannot hide a
    # missing or extra entry.
    for sample in expected_likelihoods:
        assert len(table.genotype_likelihoods_of(sample)) == 4

    for sample, expected in expected_likelihoods.items():
        for actual_gl, expected_gl in zip(table.genotype_likelihoods_of(sample), expected):
            assert_genotype_likelihoods(actual_gl, expected_gl)
def test_read_multisample_vcf():
    """Read a two-sample, two-chromosome VCF and check the chrA table in detail."""
    tables = list(VcfReader("tests/data/multisample.vcf"))
    assert len(tables) == 2
    chr_a, chr_b = tables

    assert chr_b.chromosome == "chrB"
    assert chr_b.samples == ["sample1", "sample2"]

    assert chr_a.chromosome == "chrA"
    assert len(chr_a.variants) == 3
    assert chr_a.samples == ["sample1", "sample2"]

    expected_alleles = [("A", "T"), ("C", "G"), ("G", "T")]
    for variant, (ref, alt) in zip(chr_a.variants, expected_alleles):
        assert variant.reference_allele == ref
        assert variant.alternative_allele == alt

    assert len(chr_a.genotypes) == 2
    expected_genotypes = {"sample1": [1, 1, 1], "sample2": [1, 1, 0]}
    # Access by row index first, then by sample name.
    for row, indices in enumerate(expected_genotypes.values()):
        assert list(chr_a.genotypes[row]) == canonic_index_list_to_biallelic_gt_list(indices)
    for sample, indices in expected_genotypes.items():
        assert list(chr_a.genotypes_of(sample)) == canonic_index_list_to_biallelic_gt_list(indices)
def test_read_tetraploid_unphased():
    """Read an unphased tetraploid VCF and check alleles and genotypes.

    The observed and expected genotypes are printed before the final
    assertion so that a failure shows both sequences in the captured output.
    """
    tables = list(VcfReader("tests/data/polyploid.chr22.unphased.vcf", phases=False))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "chr22"
    assert table.samples == ["HG00514_NA19240"]
    assert len(table.variants) == 8

    # Only the first four of the eight variants have their alleles checked.
    expected_alleles = [("A", "C"), ("G", "A"), ("G", "T"), ("G", "C")]
    for variant, (ref, alt) in zip(table.variants, expected_alleles):
        assert variant.reference_allele == ref
        assert variant.alternative_allele == alt

    # Build the expectation once, with the same ploidy argument (4) that the
    # assertion uses, so the printed "Exp:" values are what is compared.
    expected_genotypes = canonic_index_list_to_biallelic_gt_list([3, 2, 0, 3, 3, 1, 1, 1], 4)

    print("Got:")
    for genotype in table.genotypes[0]:
        print(genotype)
    print("Exp:")
    # Previously this loop bound a misspelled name and re-printed the last
    # observed genotype instead of the expected ones.
    for genotype in expected_genotypes:
        print(genotype)

    assert table.genotypes[0] == expected_genotypes
def test_read_tetraploid_phased():
    """Read a phased tetraploid VCF and compare the phase of every variant."""
    tables = list(VcfReader("tests/data/polyploid.chr22.phased.vcf", phases=True))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "chr22"
    assert table.samples == ["HG00514_NA19240"]
    assert len(table.variants) == 8

    def phased(block_id, haplotypes):
        # All expected calls carry no quality value.
        return VariantCallPhase(block_id=block_id, phase=haplotypes, quality=None)

    expected_phase = [
        phased(20000000, (1, 0, 1, 1)),
        phased(20000000, (1, 0, 1, 0)),
        None,
        phased(20000000, (1, 0, 1, 1)),
        phased(20001000, (1, 0, 1, 1)),
        phased(20001000, (0, 0, 0, 1)),
        phased(20001000, (0, 0, 0, 1)),
        phased(20001000, (0, 0, 0, 1)),
    ]

    # Dump both sequences so a failure is easy to diagnose from the output.
    print("Got:")
    for observed in table.phases[0]:
        print(observed)
    print("Exp:")
    for wanted in expected_phase:
        print(wanted)

    assert list(table.phases[0]) == expected_phase
def test_phase_three_individuals(algorithm):
    """Phase the trio VCF without a pedigree and check each sample's phasing.

    ``algorithm`` is forwarded unchanged to ``run_whatshap``.
    """
    with TemporaryDirectory() as workdir:
        phased_vcf = workdir + '/output.vcf'
        read_list = workdir + '/readlist.tsv'
        run_whatshap(
            phase_input_files=[trio_bamfile],
            variant_file='tests/data/trio.vcf',
            read_list_filename=read_list,
            output=phased_vcf,
            algorithm=algorithm,
        )
        for produced in (phased_vcf, read_list):
            assert os.path.isfile(produced)

        tables = list(VcfReader(phased_vcf, phases=True))
        assert len(tables) == 1
        table = tables[0]
        assert table.chromosome == '1'
        assert len(table.variants) == 5
        assert table.samples == ['HG004', 'HG003', 'HG002']

        phase1 = VariantCallPhase(60906167, 0, None)
        phase3 = VariantCallPhase(60907394, 0, None)
        expected_phasing = (
            ('HG004', [None, phase3, phase3, phase3, None]),
            ('HG003', [phase1, None, phase1, None, None]),
            ('HG002', [None, None, None, None, None]),
        )
        for sample, phases in expected_phasing:
            assert_phasing(table.phases_of(sample), phases)
def test_genotype_log_likelihoods_given(tmp_path):
    """Genotype a trio whose input VCF already carries log-likelihoods.

    For both the genotyped VCF and the priors VCF the table layout is checked,
    and then every sample call is inspected to make sure the GL/GQ values
    were replaced (GL is not ``-1,-1,-1`` and GQ is not ``100``).
    """
    outvcf = tmp_path / "output_gl_log.vcf"
    outpriors = tmp_path / "priors.vcf"
    run_genotype(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio_genotype_log_likelihoods.vcf",
        output=outvcf,
        ped="tests/data/trio.ped",
        genmap="tests/data/trio.map",
        gt_qual_threshold=0,
        prioroutput=outpriors,
    )
    for outfile in [outvcf, outpriors]:
        assert os.path.isfile(outfile)
        tables = list(VcfReader(outfile, phases=True, genotype_likelihoods=True))
        assert len(tables) == 1
        table = tables[0]
        assert table.chromosome == "1"
        assert len(table.variants) == 5
        assert table.samples == ["HG004", "HG003", "HG002"]

        # check if GL likelihoods were replaced
        # The context manager closes the file handle even if an assertion fails.
        with VariantFile(outfile) as vcf_reader:
            print(list(vcf_reader.header.samples), outfile)
            for record in vcf_reader:
                for call in record.samples.values():
                    GL = call.get("GL", None)
                    GQ = call.get("GQ", None)
                    print("GL:", GL, "GQ", GQ)
                    # Per-sample values may come back as tuples; a tuple never
                    # equals a list, so normalise before comparing, otherwise
                    # this check can never fail.
                    gl_values = list(GL) if isinstance(GL, (list, tuple)) else GL
                    assert gl_values != [-1, -1, -1]
                    assert GQ != 100
def test_blockcut_sensitivities(tmp_path):
    """Check that block starts grow monotonically with the cut sensitivity.

    ``run_polyphase`` is executed for sensitivities 0 through 5; the set of
    block ids found at each level must be contained in the set found at the
    next higher level.
    """
    starts_per_sensitivity = []
    for sensitivity in range(6):
        outvcf = tmp_path / f"output{sensitivity}.vcf"
        run_polyphase(
            phase_input_files=["tests/data/polyploid.chr22.42M.12k.bam"],
            variant_file="tests/data/polyploid.chr22.42M.12k.vcf",
            ploidy=4,
            ignore_read_groups=True,
            block_cut_sensitivity=sensitivity,
            output=outvcf,
        )
        assert os.path.isfile(outvcf)

        tables = list(VcfReader(outvcf, phases=True))
        assert len(tables) == 1

        # Unphased variants (None) do not contribute a block id.
        block_starts = {
            call.block_id
            for call in tables[0].phases_of("HG00514_NA19240")
            if call is not None
        }
        starts_per_sensitivity.append(block_starts)
        print(block_starts)

    # Compare each level with its successor.
    for lower, higher in zip(starts_per_sensitivity, starts_per_sensitivity[1:]):
        assert lower.issubset(higher)
def test_phase_three_individuals(algorithm, tmpdir):
    """Phase the trio VCF without a pedigree and check each sample's phasing.

    ``algorithm`` is forwarded unchanged to ``run_whatshap``.
    """
    phased_vcf = str(tmpdir.join("output.vcf"))
    read_list = str(tmpdir.join("readlist.tsv"))
    run_whatshap(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio.vcf",
        read_list_filename=read_list,
        output=phased_vcf,
        algorithm=algorithm,
    )
    for produced in (phased_vcf, read_list):
        assert os.path.isfile(produced)

    tables = list(VcfReader(phased_vcf, phases=True))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "1"
    assert len(table.variants) == 5
    assert table.samples == ["HG004", "HG003", "HG002"]

    phase1 = VariantCallPhase(60906167, (0, 1), None)
    phase3 = VariantCallPhase(60907394, (0, 1), None)
    expected_phasing = (
        ("HG004", [None, phase3, phase3, phase3, None]),
        ("HG003", [phase1, None, phase1, None, None]),
        ("HG002", [None, None, None, None, None]),
    )
    for sample, phases in expected_phasing:
        assert_phasing(table.phases_of(sample), phases)
def test_genotyping_specific_chromosome():
    """Genotype only one of two chromosomes and check the other is skipped.

    For each requested chromosome, both the genotyped VCF and the priors VCF
    must contain two tables. The table of the chromosome that was not
    requested must have no genotype likelihoods, while the requested one
    must have some.
    """
    for requested_chromosome in ['1', '2']:
        with TemporaryDirectory() as tempdir:
            outvcf = tempdir + '/output.vcf'
            outpriors = tempdir + '/priors.vcf'
            run_genotype(
                phase_input_files=[trio_bamfile],
                variant_file='tests/data/trio-two-chromosomes.vcf',
                output=outvcf,
                ped='tests/data/trio.ped',
                genmap='tests/data/trio.map',
                chromosomes=[requested_chromosome],
                prioroutput=outpriors,
            )
            for outfile in [outvcf, outpriors]:
                assert os.path.isfile(outfile)
                tables = list(VcfReader(outfile, genotype_likelihoods=True))
                assert len(tables) == 2
                for table in tables:
                    assert len(table.variants) == 5
                    assert table.samples == ['HG004', 'HG003', 'HG002']

                # Table index of the chromosome that was NOT requested; the
                # other index belongs to the genotyped chromosome.
                skipped = 1 if requested_chromosome == '1' else 0
                genotyped = 1 - skipped

                # should be no genotype likelihoods for skipped chromosomes
                # These comparisons previously lacked ``assert`` and therefore
                # verified nothing.
                for s in tables[skipped].samples:
                    assert tables[skipped].genotype_likelihoods_of(s) == [None] * 5
                    assert tables[genotyped].genotype_likelihoods_of(s) != [None] * 5
def test_genotyping_one_of_three_individuals(tmp_path):
    """Genotype only HG003 and check the other two samples stay at the default.

    For both output files, every genotype likelihood of HG002 and HG004 must
    be approximately log10(1/3).
    """
    outvcf = tmp_path / "output.vcf"
    outpriors = tmp_path / "priors.vcf"
    run_genotype(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio.vcf",
        output=outvcf,
        samples=["HG003"],
        prioroutput=outpriors,
    )

    uniform_log10 = math.log10(1 / 3.0)
    for outfile in (outvcf, outpriors):
        assert os.path.isfile(outfile)
        tables = list(VcfReader(outfile, phases=True, genotype_likelihoods=True))
        assert len(tables) == 1
        table = tables[0]
        assert table.chromosome == "1"
        assert len(table.variants) == 5
        assert table.samples == ["HG004", "HG003", "HG002"]

        # there should be no genotype predictions for HG004/HG002
        for sample in ("HG002", "HG004"):
            for likelihoods in table.genotype_likelihoods_of(sample):
                for log10_prob in likelihoods.log10_probs():
                    assert log10_prob == pytest.approx(uniform_log10)
def test_phase_trio_paired_end_reads(tmp_path):
    """Phase a trio from paired-end reads and check block counts and phasing."""
    phased_vcf = tmp_path / "output-paired_end.vcf"
    run_whatshap(
        phase_input_files=[trio_paired_end_bamfile],
        variant_file="tests/data/paired_end.sorted.vcf",
        output=phased_vcf,
        ped="tests/data/trio_paired_end.ped",
        genmap="tests/data/trio.map",
    )
    assert os.path.isfile(phased_vcf)

    tables = list(VcfReader(phased_vcf, phases=True))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "1"
    assert len(table.variants) == 3
    assert table.samples == ["mother", "father", "child"]

    expected_block_counts = (("mother", 1), ("father", 0), ("child", 1))
    for sample, count in expected_block_counts:
        assert table.num_of_blocks_of(sample) == count

    phase0 = VariantCallPhase(80050, (0, 1), None)
    phase1 = VariantCallPhase(80050, (1, 0), None)
    expected_phasing = (
        ("mother", [phase1, phase1, phase0]),
        ("father", [None, None, None]),
        ("child", [None, None, phase1]),
    )
    for sample, phases in expected_phasing:
        assert_phasing(table.phases_of(sample), phases)
def test_duplicate_read(algorithm, expected_block, tmp_path):
    """Phase the short-genome VCF from the duplicate-read BAM and compare block ids.

    Note carried over from the original author: this closely mirrors the
    test_phased_blocks test, except that there is only a single read here,
    covering a homozygous site. Because the run is in full-genotyping mode,
    hapchat is said to phase that homozygous site anyway, regardless of any
    genotype likelihood.
    """
    phased_vcf = tmp_path / "output.vcf"
    run_whatshap(
        phase_input_files=[short_duplicate_bamfile],
        variant_file="tests/data/short-genome/short.vcf",
        ignore_read_groups=True,
        distrust_genotypes=True,
        include_homozygous=True,
        output=phased_vcf,
        algorithm=algorithm,
    )
    assert os.path.isfile(phased_vcf)

    tables = list(VcfReader(phased_vcf, phases=True))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "chr1"
    assert len(table.variants) == 5
    assert table.samples == ["sample"]

    # Reduce each call to its block id, keeping None for unphased variants.
    observed_blocks = [
        None if call is None else call.block_id
        for call in table.phases_of("sample")
    ]
    assert observed_blocks == expected_block
def test_read_genotype_likelihoods():
    """Read a VCF with genotype likelihoods and compare them per sample."""
    tables = list(VcfReader('tests/data/genotype-likelihoods.vcf', genotype_likelihoods=True))
    assert len(tables) == 1
    table = tables[0]

    assert table.chromosome == 'chrA'
    assert table.samples == ['sample1', 'sample2']
    assert len(table.variants) == 4
    assert len(table.genotypes) == 2

    expected_genotypes = ([2, 1, 1, 1], [1, 0, 0, 1])
    for row, genotypes in enumerate(expected_genotypes):
        assert list(table.genotypes[row]) == genotypes

    gl0 = GenotypeLikelihoods(-2.1206, -0.8195, -0.07525)
    gl1 = GenotypeLikelihoods(-10.3849, 0, -5.99143)
    gl2 = GenotypeLikelihoods(-2.1, -0.8, -0.8)
    gl3 = GenotypeLikelihoods(0, -10.0, -0.6)
    expected_likelihoods = {
        'sample1': [gl0, gl2, None, gl0],
        'sample2': [gl1, gl3, None, gl1],
    }

    # Check lengths for all samples first, so zip() below cannot hide a
    # missing or extra entry.
    for sample in expected_likelihoods:
        assert len(table.genotype_likelihoods_of(sample)) == 4

    for sample, expected in expected_likelihoods.items():
        for actual_gl, expected_gl in zip(table.genotype_likelihoods_of(sample), expected):
            assert_genotype_likelihoods(actual_gl, expected_gl)
def test_phase_trio_dont_merge_blocks(tmpdir):
    """Phase a trio with ``genetic_haplotyping=False`` and check the blocks."""
    phased_vcf = str(tmpdir.join("output-merged-blocks.vcf"))
    run_whatshap(
        phase_input_files=[trio_merged_bamfile],
        variant_file="tests/data/trio-merged-blocks.vcf",
        output=phased_vcf,
        ped="tests/data/trio.ped",
        genmap="tests/data/trio.map",
        genetic_haplotyping=False,
    )
    assert os.path.isfile(phased_vcf)

    tables = list(VcfReader(phased_vcf, phases=True))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "1"
    assert len(table.variants) == 8
    assert table.samples == ["HG002", "HG003", "HG004"]

    expected_block_counts = (("HG004", 2), ("HG003", 1), ("HG002", 1))
    for sample, count in expected_block_counts:
        assert table.num_of_blocks_of(sample) == count

    phase1 = VariantCallPhase(752566, (1, 0), None)
    phase2_0 = VariantCallPhase(853954, (0, 1), None)
    phase2_1 = VariantCallPhase(853954, (1, 0), None)
    expected_phasing = (
        ("HG004", [phase1, phase1, phase1, None, phase2_1, phase2_1, phase2_1, phase2_1]),
        ("HG003", [None, None, None, None, phase2_0, phase2_0, phase2_0, phase2_1]),
        ("HG002", [None, None, None, None, None, None, None, phase2_1]),
    )
    for sample, phases in expected_phasing:
        assert_phasing(table.phases_of(sample), phases)
def test_phase_specific_chromosome():
    """Phase only one of two chromosomes and check the other stays unphased.

    For each requested chromosome the output must contain two tables; the
    table of the requested chromosome is compared against its expected
    phasing, the other table must be entirely unphased.
    """
    samples = ['HG004', 'HG003', 'HG002']
    phase0 = VariantCallPhase(60906167, 0, None)
    phase1 = VariantCallPhase(60906167, 1, None)
    # Expected phasing per requested chromosome, listed in ``samples`` order.
    expected_when_requested = {
        '1': [
            [phase0, phase0, phase0, phase0, phase0],
            [phase0, None, phase0, phase0, phase0],
            [None, phase0, None, None, None],
        ],
        '2': [
            [phase0, None, None, None, phase1],
            [phase0, None, None, None, None],
            [None, None, None, None, phase0],
        ],
    }

    for requested_chromosome in ['1', '2']:
        with TemporaryDirectory() as workdir:
            phased_vcf = workdir + '/output.vcf'
            run_whatshap(
                phase_input_files=[trio_bamfile],
                variant_file='tests/data/trio-two-chromosomes.vcf',
                output=phased_vcf,
                ped='tests/data/trio.ped',
                genmap='tests/data/trio.map',
                chromosomes=[requested_chromosome],
            )
            assert os.path.isfile(phased_vcf)

            tables = list(VcfReader(phased_vcf, phases=True))
            assert len(tables) == 2
            for table in tables:
                assert len(table.variants) == 5
                assert table.samples == ['HG004', 'HG003', 'HG002']

                if table.chromosome == requested_chromosome:
                    expected = expected_when_requested[requested_chromosome]
                else:
                    # Chromosome was not requested: nothing may be phased.
                    expected = [[None, None, None, None, None] for _ in samples]

                for sample, phases in zip(samples, expected):
                    assert_phasing(table.phases_of(sample), phases)
def test_genotyping_specific_chromosome(chromosome, tmp_path):
    """Genotype only ``chromosome`` and check the other chromosome is skipped.

    Both the genotyped VCF and the priors VCF must contain two tables. The
    table of the chromosome that was not requested must have no genotype
    likelihoods, while the requested one must have some.
    """
    genotyped_vcf = tmp_path / "output.vcf"
    priors_vcf = tmp_path / "priors.vcf"
    run_genotype(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio-two-chromosomes.vcf",
        output=genotyped_vcf,
        ped="tests/data/trio.ped",
        genmap="tests/data/trio.map",
        chromosomes=[chromosome],
        prioroutput=priors_vcf,
    )

    # Table index of the chromosome that was NOT requested, and of the one
    # that was.
    skipped = 1 if chromosome == "1" else 0
    genotyped = 1 - skipped

    for outfile in (genotyped_vcf, priors_vcf):
        assert os.path.isfile(outfile)
        tables = list(VcfReader(outfile, genotype_likelihoods=True))
        assert len(tables) == 2
        for table in tables:
            assert len(table.variants) == 5
            assert table.samples == ["HG004", "HG003", "HG002"]

        # should be no genotype likelihoods for skipped chromosomes
        no_likelihoods = [None] * 5
        for sample in tables[skipped].samples:
            assert tables[skipped].genotype_likelihoods_of(sample) == no_likelihoods
            assert tables[genotyped].genotype_likelihoods_of(sample) != no_likelihoods
def __init__(
    self,
    bam_or_vcf_paths,
    reference,
    numeric_sample_ids,
    ignore_read_groups,
    indels,
    **kwargs  # forwarded to open_readset_reader
):
    """Set up the readers for a mixed list of input files.

    ``bam_or_vcf_paths`` is split by ``_split_input_file_list`` into BAM and
    VCF paths. One ``VcfReader`` (with ``phases=True``) is created per VCF
    path, and ``open_readset_reader`` is called for the BAM paths. When
    ``reference`` is truthy it is opened through ``_open_reference``;
    otherwise ``self._fasta`` is ``None``.
    """
    bam_paths, vcf_paths = self._split_input_file_list(bam_or_vcf_paths)
    self._bam_paths = bam_paths
    self._vcf_paths = vcf_paths

    # TODO exit stack!
    self._numeric_sample_ids = numeric_sample_ids
    if reference:
        self._fasta = self._open_reference(reference)
    else:
        self._fasta = None

    self._vcf_readers = [
        VcfReader(path, indels=indels, phases=True) for path in self._vcf_paths
    ]
    self._ignore_read_groups = ignore_read_groups
    self._readset_reader = open_readset_reader(
        self._bam_paths,
        reference,
        numeric_sample_ids,
        **kwargs,
    )

    # With no VCF inputs there is nothing to load, so start with an empty
    # list. Otherwise None means uninitialized, call .read_vcf() first.
    self._vcfs = None if self._vcf_readers else []
def test_phase_trio_use_ped_samples():
    """Phase with and without ``use_ped_samples`` and compare the 'orphan' sample.

    The three trio samples are expected to be phased identically in both
    runs. The 'orphan' sample must stay unphased when ``use_ped_samples`` is
    true and be phased otherwise.
    """
    with TemporaryDirectory() as workdir:
        for use_ped_samples in (True, False):
            phased_vcf = workdir + '/output_ped_samples.vcf'
            read_list = workdir + '/readlist.tsv'
            run_whatshap(
                phase_input_files=[ped_samples_bamfile],
                variant_file='tests/data/ped_samples.vcf',
                read_list_filename=read_list,
                output=phased_vcf,
                ped='tests/data/trio.ped',
                genmap='tests/data/trio.map',
                use_ped_samples=use_ped_samples,
            )
            for produced in (phased_vcf, read_list):
                assert os.path.isfile(produced)

            tables = list(VcfReader(phased_vcf, phases=True))
            assert len(tables) == 1
            table = tables[0]
            assert table.chromosome == '1'
            assert len(table.variants) == 5
            assert table.samples == ['HG004', 'HG003', 'HG002', 'orphan']

            phase0 = VariantCallPhase(60906167, 0, None)
            phase1 = VariantCallPhase(60907394, 0, None)
            if use_ped_samples:
                orphan_phases = [None, None, None, None, None]
            else:
                orphan_phases = [None, phase1, phase1, phase1, None]

            expected_phasing = (
                ('HG004', [phase0, phase0, phase0, phase0, phase0]),
                ('HG003', [phase0, None, phase0, phase0, phase0]),
                ('HG002', [None, phase0, None, None, None]),
                ('orphan', orphan_phases),
            )
            for sample, phases in expected_phasing:
                assert_phasing(table.phases_of(sample), phases)
def test_genotype_log_likelihoods_given():
    """Genotype a trio whose input VCF already carries log-likelihoods.

    For both the genotyped VCF and the priors VCF the table layout is checked,
    and then every sample call is inspected to make sure the GL/GQ values
    were replaced (GL is not ``[-1, -1, -1]`` and GQ is not ``100``).
    """
    with TemporaryDirectory() as workdir:
        genotyped_vcf = workdir + '/output_gl_log.vcf'
        priors_vcf = workdir + '/priors.vcf'
        run_genotype(
            phase_input_files=[trio_bamfile],
            variant_file='tests/data/trio_genotype_log_likelihoods.vcf',
            output=genotyped_vcf,
            ped='tests/data/trio.ped',
            genmap='tests/data/trio.map',
            gt_qual_threshold=0,
            prioroutput=priors_vcf,
        )
        for outfile in (genotyped_vcf, priors_vcf):
            assert os.path.isfile(outfile)
            tables = list(VcfReader(outfile, phases=True, genotype_likelihoods=True))
            assert len(tables) == 1
            table = tables[0]
            assert table.chromosome == '1'
            assert len(table.variants) == 5
            assert table.samples == ['HG004', 'HG003', 'HG002']

            # check if GL likelihoods were replaced
            vcf_reader = vcf.Reader(filename=outfile)
            print(vcf_reader.samples, outfile)
            for record in vcf_reader:
                for call in record.samples:
                    call_data = call.data
                    GL = getattr(call_data, 'GL', None)
                    GQ = getattr(call_data, 'GQ', None)
                    print('GL:', GL, 'GQ', GQ)
                    assert GL != [-1, -1, -1]
                    assert GQ != 100
def test_genotype_likelihoods_given(tmp_path):
    """Genotype a trio whose input VCF already carries genotype likelihoods.

    After checking the table layout, every sample call of the output is
    inspected: the PL values that were present in the input must be gone
    (all ``None``) and a GL value must be present.
    """
    outvcf = tmp_path / "output_gl.vcf"
    run_genotype(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio_genotype_likelihoods.vcf",
        output=outvcf,
        ped="tests/data/trio.ped",
        genmap="tests/data/trio.map",
    )
    assert os.path.isfile(outvcf)

    tables = list(VcfReader(outvcf, phases=True, genotype_likelihoods=True))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "1"
    assert len(table.variants) == 5
    assert table.samples == ["HG004", "HG003", "HG002"]

    # check if PL likelihoods (that were present before) are deleted
    # The context manager closes the file handle even if an assertion fails.
    with VariantFile(outvcf) as vcf_reader:
        for record in vcf_reader:
            for call in record.samples.values():
                PL = call.get("PL", None)
                GL = call.get("GL", None)
                print("GL:", GL, "PL:", PL)
                assert PL == (None, None, None)
                assert GL is not None
def test_phase_trio_distrust_genotypes(tmpdir):
    """Phase a trio with ``distrust_genotypes=True`` and check the phasing."""
    phased_vcf = str(tmpdir.join("output_gl.vcf"))
    read_list = str(tmpdir.join("readlist.tsv"))
    run_whatshap(
        phase_input_files=[trio_bamfile],
        variant_file="tests/data/trio_genotype_likelihoods.vcf",
        read_list_filename=read_list,
        output=phased_vcf,
        ped="tests/data/trio.ped",
        genmap="tests/data/trio.map",
        distrust_genotypes=True,
    )
    for produced in (phased_vcf, read_list):
        assert os.path.isfile(produced)

    tables = list(VcfReader(phased_vcf, phases=True))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "1"
    assert len(table.variants) == 5
    assert table.samples == ["HG004", "HG003", "HG002"]

    phase0 = VariantCallPhase(60906167, (0, 1), None)
    expected_phasing = (
        ("HG004", [None, phase0, phase0, phase0, None]),
        ("HG003", [phase0, None, phase0, phase0, phase0]),
        ("HG002", [phase0, None, phase0, phase0, phase0]),
    )
    for sample, phases in expected_phasing:
        assert_phasing(table.phases_of(sample), phases)
def test_read_phased_vcf():
    """Read the HP- and PS-phased VCFs and check both yield the same tables.

    The same genotype and phase expectations are applied to each of the two
    input files.
    """
    to_gt = canonic_index_list_to_biallelic_gt_list

    def check_genotypes(table, indices_by_sample):
        # Access by row index first, then by sample name.
        assert len(table.genotypes) == 2
        for row, indices in enumerate(indices_by_sample.values()):
            assert list(table.genotypes[row]) == to_gt(indices)
        for sample, indices in indices_by_sample.items():
            assert list(table.genotypes_of(sample)) == to_gt(indices)

    for filename in ("tests/data/phased-via-HP.vcf", "tests/data/phased-via-PS.vcf"):
        print("Testing", filename)
        tables = list(VcfReader(filename, phases=True))
        assert len(tables) == 2
        chr_a, chr_b = tables

        assert chr_a.chromosome == "chrA"
        assert len(chr_a.variants) == 4
        assert chr_a.samples == ["sample1", "sample2"]

        assert chr_b.chromosome == "chrB"
        assert len(chr_b.variants) == 2
        assert chr_b.samples == ["sample1", "sample2"]

        check_genotypes(chr_a, {"sample1": [1, 2, 1, 1], "sample2": [1, 1, 1, 1]})
        check_genotypes(chr_b, {"sample1": [0, 1], "sample2": [1, 2]})

        print(chr_a.phases)
        assert len(chr_a.phases) == 2
        expected_phases_a = {
            "sample1": [
                None,
                None,
                VariantCallPhase(block_id=300, phase=(1, 0), quality=23),
                VariantCallPhase(block_id=300, phase=(0, 1), quality=42),
            ],
            "sample2": [
                VariantCallPhase(block_id=100, phase=(0, 1), quality=10),
                VariantCallPhase(block_id=100, phase=(1, 0), quality=20),
                VariantCallPhase(block_id=300, phase=(0, 1), quality=30),
                VariantCallPhase(block_id=300, phase=(0, 1), quality=None),
            ],
        }
        for row, expected in enumerate(expected_phases_a.values()):
            assert list(chr_a.phases[row]) == expected
        for sample, expected in expected_phases_a.items():
            assert list(chr_a.phases_of(sample)) == expected

        # Nothing is phased on chrB.
        assert len(chr_b.phases) == 2
        for row in (0, 1):
            assert list(chr_b.phases[row]) == [None, None]
        for sample in ("sample1", "sample2"):
            assert list(chr_b.phases_of(sample)) == [None, None]
def test_inconsistent_ploidy_phased():
    """Reading the inconsistent-ploidy VCF with phases must raise PloidyError.

    The failure is signalled with an explicit ``AssertionError`` instead of
    ``assert False``, because assert statements are removed when Python runs
    with ``-O`` and the test would then pass silently.
    """
    try:
        list(VcfReader("tests/data/polyploid.chr22.inconsistent.vcf", phases=True))
    except PloidyError:
        return
    raise AssertionError(
        "expected PloidyError when reading tests/data/polyploid.chr22.inconsistent.vcf"
    )
def test_vcf_without_index(tmp_path):
    """Fetching from a compressed VCF copied without an index must fail.

    Only the ``.vcf.gz`` file itself is copied into ``tmp_path``; fetching a
    chromosome from it is expected to raise ``VcfIndexMissing``.
    """
    import shutil

    unindexed_copy = tmp_path / "file.vcf.gz"
    shutil.copy("tests/data/haplotag_1.vcf.gz", unindexed_copy)

    with raises(VcfIndexMissing), VcfReader(unindexed_copy) as reader:
        list(reader.fetch("chr1"))
def test_read_phased():
    """Read a phased single-sample VCF and check alleles and genotypes."""
    tables = list(VcfReader('tests/data/phasedinput.vcf', phases=True))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == 'ref'
    assert table.samples == ['sample']
    assert len(table.variants) == 2

    expected_alleles = [('A', 'C'), ('G', 'T')]
    for variant, (ref, alt) in zip(table.variants, expected_alleles):
        assert variant.reference_allele == ref
        assert variant.alternative_allele == alt

    # Both variants of the only sample must have genotype 1.
    first_gt = table.genotypes[0][0]
    second_gt = table.genotypes[0][1]
    assert first_gt == second_gt == 1
def test_read_phased():
    """Read a phased single-sample VCF and check alleles and genotypes."""
    tables = list(VcfReader("tests/data/phasedinput.vcf", phases=True))
    assert len(tables) == 1
    table = tables[0]
    assert table.chromosome == "ref"
    assert table.samples == ["sample"]
    assert len(table.variants) == 2

    expected_alleles = [("A", "C"), ("G", "T")]
    for variant, (ref, alt) in zip(table.variants, expected_alleles):
        assert variant.reference_allele == ref
        assert variant.alternative_allele == alt

    # Both variants of the only sample must equal the genotype for index 1.
    first_gt = table.genotypes[0][0]
    second_gt = table.genotypes[0][1]
    assert first_gt == second_gt == canonic_index_to_biallelic_gt(1)
def test_read_phased_vcf():
    """Read the HP- and PS-phased VCFs and check both yield the same tables.

    The same genotype and phase expectations are applied to each of the two
    input files.
    """

    def check_genotypes(table, genotypes_by_sample):
        # Access by row index first, then by sample name.
        assert len(table.genotypes) == 2
        for row, genotypes in enumerate(genotypes_by_sample.values()):
            assert list(table.genotypes[row]) == genotypes
        for sample, genotypes in genotypes_by_sample.items():
            assert list(table.genotypes_of(sample)) == genotypes

    for filename in ('tests/data/phased-via-HP.vcf', 'tests/data/phased-via-PS.vcf'):
        print('Testing', filename)
        tables = list(VcfReader(filename, phases=True))
        assert len(tables) == 2
        chr_a, chr_b = tables

        assert chr_a.chromosome == 'chrA'
        assert len(chr_a.variants) == 4
        assert chr_a.samples == ['sample1', 'sample2']

        assert chr_b.chromosome == 'chrB'
        assert len(chr_b.variants) == 2
        assert chr_b.samples == ['sample1', 'sample2']

        check_genotypes(chr_a, {'sample1': [1, 2, 1, 1], 'sample2': [1, 1, 1, 1]})
        check_genotypes(chr_b, {'sample1': [0, 1], 'sample2': [1, 2]})

        print(chr_a.phases)
        assert len(chr_a.phases) == 2
        expected_phases_a = {
            'sample1': [
                None,
                None,
                VariantCallPhase(block_id=300, phase=1, quality=23),
                VariantCallPhase(block_id=300, phase=0, quality=42),
            ],
            'sample2': [
                VariantCallPhase(block_id=100, phase=0, quality=10),
                VariantCallPhase(block_id=100, phase=1, quality=20),
                VariantCallPhase(block_id=300, phase=0, quality=30),
                VariantCallPhase(block_id=300, phase=0, quality=None),
            ],
        }
        for row, expected in enumerate(expected_phases_a.values()):
            assert list(chr_a.phases[row]) == expected
        for sample, expected in expected_phases_a.items():
            assert list(chr_a.phases_of(sample)) == expected

        # Nothing is phased on chrB.
        assert len(chr_b.phases) == 2
        for row in (0, 1):
            assert list(chr_b.phases[row]) == [None, None]
        for sample in ('sample1', 'sample2'):
            assert list(chr_b.phases_of(sample)) == [None, None]