def test_partition_into_regions__missing_index(shared_datadir, is_path):
    """Check the errors raised by ``partition_into_regions`` for index problems.

    The first call passes only the VCF path and expects the "Cannot find"
    error; the second passes an explicit ``index_path`` ending in ``.index``
    and expects the "Only .tbi or .csi indexes are supported" error.
    """
    noindex_vcf = path_for_test(
        shared_datadir, "CEUTrio.20.21.gatk3.4.noindex.g.vcf.bgz", is_path
    )

    # Case 1: no index_path argument supplied.
    with pytest.raises(ValueError, match=r"Cannot find .tbi or .csi file."):
        partition_into_regions(noindex_vcf, num_parts=2)

    # Case 2: an index_path whose name is neither *.tbi nor *.csi.
    unsupported_index = path_for_test(
        shared_datadir, "CEUTrio.20.21.gatk3.4.noindex.g.vcf.bgz.index", is_path
    )
    with pytest.raises(
        ValueError, match=r"Only .tbi or .csi indexes are supported."
    ):
        partition_into_regions(
            noindex_vcf, index_path=unsupported_index, num_parts=2
        )
def test_record_counts_csi(shared_datadir, vcf_file, is_path):
    """Compare per-contig record counts from the CSI index with the VCF itself.

    ``vcf_file`` is supplied from outside this function (not visible here).
    """
    vcf_path = path_for_test(shared_datadir, vcf_file, is_path)
    csi_index = read_csi(get_csi_path(vcf_path))

    # Contig order is taken from the VCF's seqnames; the index counts are
    # looked up by the same position.
    contig_names = VCF(vcf_path).seqnames
    for position, contig_name in enumerate(contig_names):
        indexed_count = csi_index.record_counts[position]
        actual_count = count_variants(vcf_path, contig_name)
        assert indexed_count == actual_count
def test_vcf_to_zarr__mutiple_partitioned_invalid_regions(
    shared_datadir, is_path, tmp_path
):
    """Expect ``ValueError`` when several inputs get regions for only one file.

    Regions are computed for the first input only and passed as-is together
    with two input paths; ``vcf_to_zarr`` must reject them.
    """
    input_names = ("CEUTrio.20.gatk3.4.g.vcf.bgz", "CEUTrio.21.gatk3.4.g.vcf.bgz")
    paths = [path_for_test(shared_datadir, name, is_path) for name in input_names]
    store_path = tmp_path.joinpath("vcf_concat.zarr")
    output = store_path.as_posix()

    # Invalid on purpose: multiple inputs need a sequence of sequences,
    # one inner sequence per input.
    single_file_regions = partition_into_regions(paths[0], num_parts=2)

    expected_error = (
        r"multiple input regions must be a sequence of sequence of strings"
    )
    with pytest.raises(ValueError, match=expected_error):
        vcf_to_zarr(paths, output, regions=single_file_regions, chunk_length=5_000)
def test_record_counts_tbi(shared_datadir, vcf_file, is_path):
    """Compare per-contig record counts from the tabix index with the VCF itself.

    ``vcf_file`` is supplied from outside this function (not visible here).
    """
    vcf_path = path_for_test(shared_datadir, vcf_file, is_path)
    tabix_index = read_tabix(get_tabix_path(vcf_path))

    # Contig names come from the index; counts are looked up positionally.
    for position, contig_name in enumerate(tabix_index.sequence_names):
        indexed_count = tabix_index.record_counts[position]
        actual_count = count_variants(vcf_path, contig_name)
        assert indexed_count == actual_count
def test_vcf_to_zarr__multiple(shared_datadir, is_path, tmp_path):
    """Convert two VCF inputs into one Zarr store and check shapes and chunks."""
    input_names = ("CEUTrio.20.gatk3.4.g.vcf.bgz", "CEUTrio.21.gatk3.4.g.vcf.bgz")
    paths = [path_for_test(shared_datadir, name, is_path) for name in input_names]
    store_path = tmp_path.joinpath("vcf_concat.zarr")
    output = store_path.as_posix()

    vcf_to_zarr(paths, output, chunk_length=5_000)
    ds = xr.open_zarr(output)  # type: ignore[no-untyped-call]

    # Expected shape of each stored variable, checked in this order.
    expected_shapes = {
        "sample_id": (1,),
        "call_genotype": (19910, 1, 2),
        "call_genotype_mask": (19910, 1, 2),
        "call_genotype_phased": (19910, 1),
        "variant_allele": (19910, 4),
        "variant_contig": (19910,),
        "variant_id": (19910,),
        "variant_id_mask": (19910,),
        "variant_position": (19910,),
    }
    for variable, shape in expected_shapes.items():
        assert ds[variable].shape == shape, variable

    # chunk_length=5_000 over 19910 variants leaves a short final chunk.
    assert ds.chunks["variants"] == (5000, 5000, 5000, 4910)
def test_partition_into_regions__num_parts(shared_datadir, vcf_file, is_path):
    """Regions from ``num_parts=4`` must account for every variant in the VCF.

    ``vcf_file`` is supplied from outside this function (not visible here).
    """
    vcf_path = path_for_test(shared_datadir, vcf_file, is_path)

    regions = partition_into_regions(vcf_path, num_parts=4)
    assert regions is not None

    # Count variants per region first, then the whole-file total.
    per_region = [count_variants(vcf_path, one_region) for one_region in regions]
    whole_file = count_variants(vcf_path)
    assert sum(per_region) == whole_file
def test_partition_into_regions__target_part_size(shared_datadir, is_path):
    """``target_part_size=100_000`` yields 5 regions covering every variant."""
    vcf_path = path_for_test(
        shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path
    )

    regions = partition_into_regions(vcf_path, target_part_size=100_000)
    assert regions is not None
    assert len(regions) == 5

    # The per-region counts must add up to the whole-file count.
    per_region = [count_variants(vcf_path, one_region) for one_region in regions]
    whole_file = count_variants(vcf_path)
    assert sum(per_region) == whole_file
def test_vcf_to_zarr__parallel_partitioned(shared_datadir, is_path, tmp_path):
    """Convert a VCF using four computed regions and check the resulting sizes."""
    path = path_for_test(
        shared_datadir,
        "1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz",
        is_path,
    )
    store_path = tmp_path.joinpath("vcf_concat.zarr")
    output = store_path.as_posix()

    regions = partition_into_regions(path, num_parts=4)
    vcf_to_zarr(
        path,
        output,
        regions=regions,
        chunk_length=1_000,
        chunk_width=1_000,
    )

    ds = xr.open_zarr(output)  # type: ignore[no-untyped-call]
    sample_shape = ds["sample_id"].shape
    assert sample_shape == (2535,)
    variant_shape = ds["variant_id"].shape
    assert variant_shape == (1406,)
def test_vcf_to_zarr__parallel_temp_chunk_length_not_divisible(
    shared_datadir, tmp_path
):
    """Expect ``ValueError`` when 4000 is the temp chunk length for a 5000 target."""
    path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", False)
    store_path = tmp_path.joinpath("vcf_concat.zarr")
    output = store_path.as_posix()
    regions = ["20", "21"]

    expected_error = (
        r"Temporary chunk length in variant dimension \(4000\) "
        r"must evenly divide target chunk length 5000"
    )
    with pytest.raises(ValueError, match=expected_error):
        # 4_000 does not divide 5_000, which is what triggers the error.
        vcf_to_zarr(
            path,
            output,
            regions=regions,
            chunk_length=5_000,
            temp_chunk_length=4_000,
        )
def test_vcf_to_zarr__mutable_mapping(shared_datadir, is_path):
    """Write the Zarr output into a plain dict and check shapes and dtypes."""
    path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path)

    # The store is an in-memory mapping rather than a filesystem path.
    output: MutableMapping[str, bytes] = {}
    vcf_to_zarr(path, output, chunk_length=5_000)
    ds = xr.open_zarr(output)  # type: ignore[no-untyped-call]

    # Expected shape of each stored variable, checked in this order.
    expected_shapes = {
        "sample_id": (1,),
        "call_genotype": (19910, 1, 2),
        "call_genotype_mask": (19910, 1, 2),
        "call_genotype_phased": (19910, 1),
        "variant_allele": (19910, 4),
        "variant_contig": (19910,),
        "variant_id": (19910,),
        "variant_id_mask": (19910,),
        "variant_position": (19910,),
    }
    for variable, shape in expected_shapes.items():
        assert ds[variable].shape == shape, variable

    # Both string variables are expected to have object dtype here.
    for variable in ("variant_allele", "variant_id"):
        assert ds[variable].dtype == "O", variable
def test_vcf_to_zarr__parallel(shared_datadir, is_path, tmp_path):
    """Convert one VCF with regions "20" and "21" and check shapes and dtypes."""
    path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path)
    store_path = tmp_path.joinpath("vcf_concat.zarr")
    output = store_path.as_posix()
    regions = ["20", "21"]

    vcf_to_zarr(path, output, regions=regions, chunk_length=5_000)
    ds = xr.open_zarr(output)  # type: ignore[no-untyped-call]

    # Expected shape of each stored variable, checked in this order.
    expected_shapes = {
        "sample_id": (1,),
        "call_genotype": (19910, 1, 2),
        "call_genotype_mask": (19910, 1, 2),
        "call_genotype_phased": (19910, 1),
        "variant_allele": (19910, 4),
        "variant_contig": (19910,),
        "variant_id": (19910,),
        "variant_id_mask": (19910,),
        "variant_position": (19910,),
    }
    for variable, shape in expected_shapes.items():
        assert ds[variable].shape == shape, variable

    # With regions supplied, the string variables are expected as fixed-width
    # byte strings of these widths.
    expected_dtypes = {"variant_allele": "S48", "variant_id": "S1"}
    for variable, dtype in expected_dtypes.items():
        assert ds[variable].dtype == dtype, variable
def test_partition_into_regions__invalid_arguments(shared_datadir, is_path):
    """Each invalid ``num_parts``/``target_part_size`` combination raises."""
    vcf_path = path_for_test(
        shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path
    )

    # (keyword arguments, expected ValueError message), checked in this order.
    invalid_calls = [
        ({}, r"One of num_parts or target_part_size must be specified"),
        (
            {"num_parts": 4, "target_part_size": 100_000},
            r"Only one of num_parts or target_part_size may be specified",
        ),
        ({"num_parts": 0}, r"num_parts must be positive"),
        ({"target_part_size": 0}, r"target_part_size must be positive"),
    ]
    for bad_kwargs, expected_error in invalid_calls:
        with pytest.raises(ValueError, match=expected_error):
            partition_into_regions(vcf_path, **bad_kwargs)
def test_read_csi__invalid_csi(shared_datadir, file, is_path):
    """``read_csi`` must raise ``ValueError`` for the given non-CSI ``file``.

    ``file`` is supplied from outside this function (not visible here).
    """
    expected_error = r"File not in CSI format."
    with pytest.raises(ValueError, match=expected_error):
        # Path resolution stays inside the context, as in the original test.
        candidate = path_for_test(shared_datadir, file, is_path)
        read_csi(candidate)
def test_vcf_to_zarr__small_vcf(shared_datadir, is_path, tmp_path):
    """Convert ``sample.vcf.gz`` and compare every stored variable to fixed values.

    The conversion uses ``chunk_length=5`` and ``chunk_width=2``; the resulting
    dataset is checked against hard-coded expected values.
    """
    path = path_for_test(shared_datadir, "sample.vcf.gz", is_path)
    store_path = tmp_path.joinpath("vcf.zarr")
    output = store_path.as_posix()

    vcf_to_zarr(path, output, chunk_length=5, chunk_width=2)
    ds = xr.open_zarr(output)  # type: ignore[no-untyped-call]

    # --- expected per-variant values -------------------------------------
    expected_contig = [0, 0, 1, 1, 1, 1, 1, 1, 2]
    expected_position = [
        111,
        112,
        14370,
        17330,
        1110696,
        1230237,
        1234567,
        1235237,
        10,
    ]
    expected_allele = [
        ["A", "C", "", ""],
        ["A", "G", "", ""],
        ["G", "A", "", ""],
        ["T", "A", "", ""],
        ["A", "G", "T", ""],
        ["T", "", "", ""],
        ["G", "GA", "GAC", ""],
        ["T", "", "", ""],
        ["AC", "A", "ATG", "C"],
    ]
    expected_id = [
        ".",
        ".",
        "rs6054257",
        ".",
        "rs6040355",
        ".",
        "microsat1",
        ".",
        "rsTest",
    ]
    expected_id_mask = [True, True, False, True, False, True, False, True, False]

    # --- expected per-call values (variants x samples [x ploidy]) --------
    expected_genotype = np.array(
        [
            [[0, 0], [0, 0], [0, 1]],
            [[0, 0], [0, 0], [0, 1]],
            [[0, 0], [1, 0], [1, 1]],
            [[0, 0], [0, 1], [0, 0]],
            [[1, 2], [2, 1], [2, 2]],
            [[0, 0], [0, 0], [0, 0]],
            [[0, 1], [0, 2], [-1, -1]],
            [[0, 0], [0, 0], [-1, -1]],
            [[0, -1], [0, 1], [0, 2]],
        ],
        dtype="i1",
    )
    expected_phased = np.array(
        [
            [True, True, False],
            [True, True, False],
            [True, True, False],
            [True, True, False],
            [True, True, False],
            [True, True, False],
            [False, False, False],
            [False, True, False],
            [True, False, True],
        ],
        dtype=bool,
    )

    # --- assertions, in the same order as the variables above -------------
    assert ds.attrs["contigs"] == ["19", "20", "X"]
    assert_array_equal(ds["variant_contig"], expected_contig)
    assert_array_equal(ds["variant_position"], expected_position)

    assert_array_equal(ds["variant_allele"], expected_allele)
    assert ds["variant_allele"].dtype == "O"

    assert_array_equal(ds["variant_id"], expected_id)
    assert ds["variant_id"].dtype == "O"
    assert_array_equal(ds["variant_id_mask"], expected_id_mask)

    assert_array_equal(ds["sample_id"], ["NA00001", "NA00002", "NA00003"])

    assert_array_equal(ds["call_genotype"], expected_genotype)
    # The mask is compared against the negative entries of the expected calls.
    assert_array_equal(ds["call_genotype_mask"], expected_genotype < 0)
    assert_array_equal(ds["call_genotype_phased"], expected_phased)
def test_partition_into_regions__one_part(shared_datadir, is_path):
    """Requesting a single part must return ``None`` rather than a region list."""
    vcf_path = path_for_test(
        shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path
    )
    regions = partition_into_regions(vcf_path, num_parts=1)
    assert regions is None
def test_read_tabix__invalid_tbi(shared_datadir, file, is_path):
    """``read_tabix`` must raise ``ValueError`` for the given non-tabix ``file``.

    ``file`` is supplied from outside this function (not visible here).
    """
    expected_error = r"File not in Tabix format."
    with pytest.raises(ValueError, match=expected_error):
        # Path resolution stays inside the context, as in the original test.
        candidate = path_for_test(shared_datadir, file, is_path)
        read_tabix(candidate)