def test_save_and_load_dataset__mutable_mapping():
    """Round-trip a simulated dataset through in-memory mapping stores."""
    original = simulate_genotype_call_dataset(n_variant=10, n_sample=10)

    first_store: MutableMapping[str, bytes] = {}
    save_dataset(original, first_store)
    reloaded = load_dataset(first_store)
    assert_identical(original, reloaded)

    # A second save/load cycle on the reloaded dataset is a regression check
    # for https://github.com/pydata/xarray/issues/4386
    second_store: MutableMapping[str, bytes] = {}
    save_dataset(reloaded, second_store)
    assert_identical(original, load_dataset(second_store))
def test_save_and_load_dataset(tmp_path, is_path):
    """Round-trip a simulated dataset through stores located under ``tmp_path``.

    When ``is_path`` is falsy, the store location is handed over as a ``str``
    instead of the path object produced by ``tmp_path / name``.
    """

    def _location(name):
        # Pick the representation (path object vs. string) under test.
        target = tmp_path / name
        return target if is_path else str(target)

    original = simulate_genotype_call_dataset(n_variant=10, n_sample=10)

    first_location = _location("ds.zarr")
    save_dataset(original, first_location)
    reloaded = load_dataset(first_location)
    assert_identical(original, reloaded)

    # A second save/load cycle on the reloaded dataset is a regression check
    # for https://github.com/pydata/xarray/issues/4386
    second_location = _location("ds2.zarr")
    save_dataset(reloaded, second_location)
    assert_identical(original, load_dataset(second_location))
def test_DP_field(shared_datadir, tmpdir):
    """Compare ``sg.read_vcfzarr`` output with the sgkit-converted dataset for DP fields."""
    core_fields = [
        "variants/CHROM",
        "variants/POS",
        "variants/ID",
        "variants/REF",
        "variants/ALT",
        "calldata/GT",
        "samples",
    ]
    # extra
    dp_fields = ["calldata/DP", "variants/DP"]
    # override default of i2
    type_overrides = {"calldata/DP": "i4"}

    allel_store = create_allel_vcfzarr(
        shared_datadir,
        tmpdir,
        fields=core_fields + dp_fields,
        types=type_overrides,
    )
    allel_ds = sg.read_vcfzarr(allel_store)

    sg_store = create_sg_vcfzarr(
        shared_datadir,
        tmpdir,
        fields=["INFO/DP", "FORMAT/DP"],
    )
    loaded = sg.load_dataset(str(sg_store))
    # not included in scikit-allel
    sg_ds = loaded.drop_vars("call_genotype_phased")

    assert_identical(allel_ds, sg_ds)
def test_DP_field(shared_datadir, tmpdir):
    """Compare ``sg.read_scikit_allel_vcfzarr`` output with the sgkit-converted dataset for DP fields."""
    core_fields = [
        "variants/CHROM",
        "variants/POS",
        "variants/ID",
        "variants/REF",
        "variants/ALT",
        "variants/QUAL",
        "calldata/GT",
        "samples",
    ]
    # extra
    dp_fields = ["calldata/DP", "variants/DP"]
    # override default of i2
    type_overrides = {"calldata/DP": "i4"}

    allel_store = create_allel_vcfzarr(
        shared_datadir,
        tmpdir,
        fields=core_fields + dp_fields,
        types=type_overrides,
    )
    allel_ds = sg.read_scikit_allel_vcfzarr(allel_store)

    sg_store = create_sg_vcfzarr(
        shared_datadir,
        tmpdir,
        fields=["INFO/DP", "FORMAT/DP", "FORMAT/GT"],
    )
    sg_ds = fix_missing_fields(sg.load_dataset(str(sg_store)))

    assert_identical(allel_ds, sg_ds)
def test_vcf_to_zarr__mixed_ploidy_vcf(
    shared_datadir, tmp_path, ploidy, mixed_ploidy, truncate_calls, regions
):
    """Convert ``mixed.vcf.gz`` with the given ploidy options and check the loaded dataset."""
    vcf_path = path_for_test(shared_datadir, "mixed.vcf.gz")
    zarr_path = tmp_path.joinpath("vcf.zarr").as_posix()

    conversion_options = dict(
        regions=regions,
        chunk_length=5,
        chunk_width=2,
        ploidy=ploidy,
        mixed_ploidy=mixed_ploidy,
        truncate_calls=truncate_calls,
    )
    vcf_to_zarr(vcf_path, zarr_path, **conversion_options)
    ds = load_dataset(zarr_path)

    # The expected string dtype depends on whether regions were supplied.
    if regions:
        variant_dtype = "|S1"
    else:
        variant_dtype = "O"

    assert ds.attrs["contigs"] == ["CHR1", "CHR2", "CHR3"]
    assert_array_equal(ds["variant_contig"], [0, 0])
    assert_array_equal(ds["variant_position"], [2, 7])

    expected_alleles = np.array(
        [["A", "T", "", ""], ["A", "C", "", ""]],
        dtype=variant_dtype,
    )
    assert_array_equal(ds["variant_allele"], expected_alleles)
    assert ds["variant_allele"].dtype == variant_dtype

    expected_ids = np.array([".", "."], dtype=variant_dtype)
    assert_array_equal(ds["variant_id"], expected_ids)
    assert ds["variant_id"].dtype == variant_dtype
    assert_array_equal(ds["variant_id_mask"], [True, True])

    assert_array_equal(ds["sample_id"], ["SAMPLE1", "SAMPLE2", "SAMPLE3"])
    assert ds["call_genotype"].attrs["mixed_ploidy"] == mixed_ploidy

    # -2 indicates a non-allele
    if mixed_ploidy:
        pad = -2
    else:
        pad = -1
    first_variant = [
        [0, 0, 1, 1, pad],
        [0, 0, pad, pad, pad],
        [0, 0, 0, 1, pad],
    ]
    second_variant = [
        [0, 0, 1, 1, pad],
        [0, 1, pad, pad, pad],
        [0, 1, -1, -1, pad],
    ]
    full_genotypes = np.array([first_variant, second_variant], dtype="i1")
    # truncate row vectors if lower ploidy
    expected_genotypes = full_genotypes[:, :, :ploidy]

    assert_array_equal(ds["call_genotype"], expected_genotypes)
    assert_array_equal(ds["call_genotype_mask"], expected_genotypes < 0)
    if mixed_ploidy:
        assert_array_equal(ds["call_genotype_non_allele"], expected_genotypes < -1)
def test_default_fields(shared_datadir, tmpdir):
    """Compare ``sg.read_scikit_allel_vcfzarr`` output with the sgkit-converted dataset, no field overrides."""
    allel_store = create_allel_vcfzarr(shared_datadir, tmpdir)
    expected = sg.read_scikit_allel_vcfzarr(allel_store)

    sg_store = create_sg_vcfzarr(shared_datadir, tmpdir)
    actual = fix_missing_fields(sg.load_dataset(str(sg_store)))

    assert_identical(expected, actual)
def test_default_fields(shared_datadir, tmpdir):
    """Compare ``sg.read_vcfzarr`` output with the sgkit-converted dataset, no field overrides."""
    allel_store = create_allel_vcfzarr(shared_datadir, tmpdir)
    expected = sg.read_vcfzarr(allel_store)

    sg_store = create_sg_vcfzarr(shared_datadir, tmpdir)
    loaded = sg.load_dataset(str(sg_store))
    # not included in scikit-allel
    actual = loaded.drop_vars("call_genotype_phased")

    assert_identical(expected, actual)
def test_vcf_to_zarr__call_genotype_dtype(shared_datadir, tmp_path, max_alt_alleles, dtype, warning):
    """Check the ``call_genotype`` dtype and value range for a given ``max_alt_alleles``.

    When ``warning`` is truthy the conversion must emit
    ``MaxAltAllelesExceededWarning``.
    """
    vcf_path = path_for_test(shared_datadir, "allele_overflow.vcf.gz")
    zarr_path = tmp_path.joinpath("vcf.zarr").as_posix()

    def _convert():
        vcf_to_zarr(vcf_path, zarr_path, max_alt_alleles=max_alt_alleles)

    if not warning:
        _convert()
    else:
        with pytest.warns(MaxAltAllelesExceededWarning):
            _convert()

    genotypes = load_dataset(zarr_path).call_genotype
    assert genotypes.dtype == dtype
    assert genotypes.values.max() <= max_alt_alleles
def test_all_fields(shared_datadir, tmpdir, vcf_file, allel_exclude_fields, sgkit_exclude_fields):
    """Compare ``sg.read_vcfzarr`` output with the sgkit-converted dataset using wildcard field selection."""
    # change scikit-allel type defaults back to the VCF default
    allel_types = {f"calldata/{name}": "i4" for name in ("DP", "GQ", "HQ", "AD")}

    field_defs = {
        "INFO/AF": {"Number": "A"},
        "INFO/AC": {"Number": "A"},
        "FORMAT/AD": {"Number": "R"},
        "FORMAT/HQ": {"dimension": "haplotypes"},
        "FORMAT/SB": {"dimension": "strand_biases"},
    }

    allel_store = create_allel_vcfzarr(
        shared_datadir,
        tmpdir,
        vcf_file=vcf_file,
        fields=["*"],
        exclude_fields=allel_exclude_fields,
        types=allel_types,
    )
    allel_ds = sg.read_vcfzarr(allel_store, field_defs=field_defs)

    sg_store = create_sg_vcfzarr(
        shared_datadir,
        tmpdir,
        vcf_file=vcf_file,
        fields=["INFO/*", "FORMAT/*"],
        exclude_fields=sgkit_exclude_fields,
        field_defs=field_defs,
        truncate_calls=True,
    )
    loaded = sg.load_dataset(str(sg_store))
    # not included in scikit-allel
    sg_ds = loaded.drop_vars("call_genotype_phased")

    # scikit-allel only records contigs for which there are actual variants,
    # whereas sgkit records contigs from the header
    allel_contigs = set(allel_ds.attrs["contigs"])
    sg_contigs = set(sg_ds.attrs["contigs"])
    assert allel_contigs <= sg_contigs

    for dataset in (allel_ds, sg_ds):
        del dataset.attrs["contigs"]

    if allel_contigs < sg_contigs:
        # variant_contig variables are not comparable, so remove them before comparison
        for dataset in (allel_ds, sg_ds):
            del dataset["variant_contig"]

    assert_identical(allel_ds, sg_ds)
def zarr_to_vcf(
    input: Union[PathType, MutableMapping[str, bytes]],
    output: PathType,
) -> None:
    """Convert a Zarr file to VCF. For test purposes only.

    Parameters
    ----------
    input
        Zarr store location (path or mutable mapping) handed to
        ``load_dataset``. The dataset must carry the ``vcf_header``,
        ``contigs`` and ``filters`` attributes.
    output
        Location of the VCF file to write; opened in text write mode.
    """
    ds = load_dataset(input)
    ds = ds.load()

    header_str = ds.attrs["vcf_header"]
    contigs = ds.attrs["contigs"]
    filters = ds.attrs["filters"]

    # ``Dataset.sizes`` is the dimension-name -> length mapping; using
    # ``Dataset.dims`` as a mapping is deprecated in recent xarray releases.
    n_samples = ds.sizes["samples"]

    with open(output, mode="w") as out:
        vcf_writer = VcfWriter(out, header_str)

        info_fields = _info_fields(header_str)
        format_fields = _format_fields(header_str)

        for i in range(ds.sizes["variants"]):
            chrom = ds.variant_contig[i].values.item()
            pos = ds.variant_position[i].values.item()
            variant_id = ds.variant_id[i].values.item()
            _, ref_alt = array_to_values(ds.variant_allele[i].values)
            ref = ref_alt[0]
            alt = ref_alt[1:]
            _, qual = array_to_values(ds.variant_quality[i].values)
            _, filter_ = array_to_values(ds.variant_filter[i].values)
            if isinstance(filter_, bool):
                # Wrap a lone flag so the element-wise handling below applies.
                filter_ = np.array([filter_])
            if np.all(~filter_):
                filter_ = None
            else:
                # Translate the set flags into their filter names.
                filter_ = [
                    filters[filter_index]
                    for filter_index, is_set in enumerate(filter_)
                    if is_set
                ]

            info = {}
            samples = [{} for _ in range(n_samples)]  # type: ignore

            for key in info_fields:
                variable_name = f"variant_{key}"
                if variable_name not in ds:
                    continue
                arr = ds[variable_name][i].values
                present, val = array_to_values(arr, variable_name)
                if present:
                    info[key] = val

            for key in format_fields:
                is_genotype = key == "GT"
                variable_name = "call_genotype" if is_genotype else f"call_{key}"
                if variable_name not in ds:
                    continue
                arr = ds[variable_name][i].values
                assert len(arr) == n_samples
                if is_genotype:
                    phased = ds["call_genotype_phased"][i].values
                for j, sample_values in enumerate(arr):
                    present, val = array_to_values(sample_values, variable_name)
                    if not present:
                        break  # samples should all be present or none are
                    if is_genotype:
                        # Missing alleles are written as "."; the separator
                        # is "|" for phased calls and "/" otherwise.
                        lst = [(str(v) if v is not None else ".") for v in val]
                        val = ("|" if phased[j] else "/").join(lst)
                    samples[j][key] = val

            variant = VcfVariant(
                contigs[chrom],
                pos,
                variant_id,
                ref,
                alt,
                qual,
                filter_,
                info,
                samples,
            )

            vcf_writer.write(variant)