def test_DP_field(shared_datadir, tmpdir):
    """Compare the scikit-allel and sgkit vcfzarr datasets when DP is requested.

    The scikit-allel store is created with the core variant/call fields plus
    the two DP fields, the sgkit store with ``INFO/DP`` and ``FORMAT/DP``.
    The datasets loaded from both stores must be identical once
    ``call_genotype_phased`` has been dropped from the sgkit one.
    """
    core_fields = [
        "variants/CHROM",
        "variants/POS",
        "variants/ID",
        "variants/REF",
        "variants/ALT",
        "calldata/GT",
        "samples",
    ]
    # extra fields on top of the core set
    dp_fields = ["calldata/DP", "variants/DP"]

    allel_store = create_allel_vcfzarr(
        shared_datadir,
        tmpdir,
        fields=core_fields + dp_fields,
        types={"calldata/DP": "i4"},  # override default of i2
    )
    from_allel = sg.read_vcfzarr(allel_store)

    sg_store = create_sg_vcfzarr(
        shared_datadir, tmpdir, fields=["INFO/DP", "FORMAT/DP"]
    )
    # call_genotype_phased is not included in scikit-allel, so remove it
    # before comparing.
    from_sgkit = sg.load_dataset(str(sg_store)).drop_vars("call_genotype_phased")

    assert_identical(from_allel, from_sgkit)
def test_read_vcfzarr(shared_datadir, tmpdir):
    """Check the dataset returned by ``read_vcfzarr`` against literal values.

    Asserts the contig attribute, contig indexes, positions, alleles, variant
    IDs and their mask, sample IDs, genotype calls and their mask, and that
    no ``call_genotype_phased`` variable is present.
    """
    expected_contigs = ["19", "20", "X"]
    expected_contig_index = [0, 0, 1, 1, 1, 1, 1, 1, 2]
    expected_positions = [
        111,
        112,
        14370,
        17330,
        1110696,
        1230237,
        1234567,
        1235237,
        10,
    ]
    # Rows are padded with empty strings up to four alleles.
    expected_alleles = [
        ["A", "C", "", ""],
        ["A", "G", "", ""],
        ["G", "A", "", ""],
        ["T", "A", "", ""],
        ["A", "G", "T", ""],
        ["T", "", "", ""],
        ["G", "GA", "GAC", ""],
        ["T", "", "", ""],
        ["AC", "A", "ATG", "C"],
    ]
    expected_ids = [
        ".",
        ".",
        "rs6054257",
        ".",
        "rs6040355",
        ".",
        "microsat1",
        ".",
        "rsTest",
    ]
    # True exactly where the expected ID above is ".".
    expected_id_mask = [True, True, False, True, False, True, False, True, False]
    expected_samples = ["NA00001", "NA00002", "NA00003"]
    expected_calls = np.array(
        [
            [[0, 0], [0, 0], [0, 1]],
            [[0, 0], [0, 0], [0, 1]],
            [[0, 0], [1, 0], [1, 1]],
            [[0, 0], [0, 1], [0, 0]],
            [[1, 2], [2, 1], [2, 2]],
            [[0, 0], [0, 0], [0, 0]],
            [[0, 1], [0, 2], [-1, -1]],
            [[0, 0], [0, 0], [-1, -1]],
            [[0, -1], [0, 1], [0, 2]],
        ],
        dtype="i1",
    )

    store_path = create_vcfzarr(shared_datadir, tmpdir)  # type: ignore[no-untyped-call]
    dataset = read_vcfzarr(store_path)

    assert dataset.attrs["contigs"] == expected_contigs
    assert_array_equal(dataset["variant_contig"], expected_contig_index)
    assert_array_equal(dataset["variant_position"], expected_positions)
    assert_array_equal(dataset["variant_allele"], expected_alleles)
    assert_array_equal(dataset["variant_id"], expected_ids)
    assert_array_equal(dataset["variant_id_mask"], expected_id_mask)
    assert_array_equal(dataset["sample_id"], expected_samples)
    assert_array_equal(dataset["call_genotype"], expected_calls)
    # The mask is expected to flag every negative genotype entry.
    assert_array_equal(dataset["call_genotype_mask"], expected_calls < 0)
    assert "call_genotype_phased" not in dataset
def test_default_fields(shared_datadir, tmpdir):
    """Compare the scikit-allel and sgkit vcfzarr datasets with no field options.

    Both stores are created without any ``fields`` argument; the datasets
    loaded from them must be identical once ``call_genotype_phased`` has been
    dropped from the sgkit one.
    """
    allel_store = create_allel_vcfzarr(shared_datadir, tmpdir)
    from_allel = sg.read_vcfzarr(allel_store)

    sg_store = create_sg_vcfzarr(shared_datadir, tmpdir)
    # call_genotype_phased is not included in scikit-allel, so remove it
    # before comparing.
    from_sgkit = sg.load_dataset(str(sg_store)).drop_vars("call_genotype_phased")

    assert_identical(from_allel, from_sgkit)
def test_all_fields(shared_datadir, tmpdir, vcf_file, allel_exclude_fields,
                    sgkit_exclude_fields):
    """Compare the scikit-allel and sgkit vcfzarr datasets with all fields.

    ``vcf_file`` and the two ``*_exclude_fields`` arguments are forwarded to
    the respective store-building helpers. The same ``field_defs`` are used on
    both sides. Contig metadata is checked separately and removed before the
    final identity comparison.
    """
    # change scikit-allel type defaults back to the VCF default
    vcf_default_types = dict.fromkeys(
        ("calldata/DP", "calldata/GQ", "calldata/HQ", "calldata/AD"), "i4"
    )
    field_defs = {
        "INFO/AF": {"Number": "A"},
        "INFO/AC": {"Number": "A"},
        "FORMAT/AD": {"Number": "R"},
        "FORMAT/HQ": {"dimension": "haplotypes"},
        "FORMAT/SB": {"dimension": "strand_biases"},
    }

    allel_store = create_allel_vcfzarr(
        shared_datadir,
        tmpdir,
        vcf_file=vcf_file,
        fields=["*"],
        exclude_fields=allel_exclude_fields,
        types=vcf_default_types,
    )
    from_allel = sg.read_vcfzarr(allel_store, field_defs=field_defs)

    sg_store = create_sg_vcfzarr(
        shared_datadir,
        tmpdir,
        vcf_file=vcf_file,
        fields=["INFO/*", "FORMAT/*"],
        exclude_fields=sgkit_exclude_fields,
        field_defs=field_defs,
        truncate_calls=True,
    )
    # call_genotype_phased is not included in scikit-allel, so remove it
    # before comparing.
    from_sgkit = sg.load_dataset(str(sg_store)).drop_vars("call_genotype_phased")

    # scikit-allel only records contigs for which there are actual variants,
    # whereas sgkit records contigs from the header
    allel_contigs = set(from_allel.attrs["contigs"])
    sgkit_contigs = set(from_sgkit.attrs["contigs"])
    assert allel_contigs.issubset(sgkit_contigs)

    for dataset in (from_allel, from_sgkit):
        del dataset.attrs["contigs"]

    # Strict subset: sgkit has contigs that scikit-allel does not.
    if allel_contigs < sgkit_contigs:
        # variant_contig variables are not comparable, so remove them before
        # comparison
        for dataset in (from_allel, from_sgkit):
            del dataset["variant_contig"]

    assert_identical(from_allel, from_sgkit)