Example #1
0
def test_DP_field(shared_datadir, tmpdir):
    """Compare the scikit-allel and sgkit conversions when DP fields are requested.

    The scikit-allel conversion is asked for its core fields plus
    ``calldata/DP`` and ``variants/DP``; the sgkit conversion is asked for
    ``INFO/DP`` and ``FORMAT/DP``. The two resulting datasets must be
    identical once ``call_genotype_phased`` is dropped from the sgkit one.
    """
    core_fields = [
        "variants/CHROM",
        "variants/POS",
        "variants/ID",
        "variants/REF",
        "variants/ALT",
        "calldata/GT",
        "samples",
    ]
    dp_fields = ["calldata/DP", "variants/DP"]
    # Override scikit-allel's default of i2 for the per-call depth.
    dtype_overrides = {"calldata/DP": "i4"}

    allel_path = create_allel_vcfzarr(
        shared_datadir,
        tmpdir,
        fields=core_fields + dp_fields,
        types=dtype_overrides,
    )
    expected = sg.read_vcfzarr(allel_path)

    sg_path = create_sg_vcfzarr(
        shared_datadir,
        tmpdir,
        fields=["INFO/DP", "FORMAT/DP"],
    )
    # Phasing information is not included in scikit-allel's output, so it
    # has to be removed before the datasets can be compared.
    actual = sg.load_dataset(str(sg_path)).drop_vars("call_genotype_phased")

    assert_identical(expected, actual)
Example #2
0
def test_read_vcfzarr(shared_datadir, tmpdir):
    """Read a freshly created VCF Zarr store and verify every expected variable.

    Checks the contig list, variant contig/position/allele/id arrays, the
    id mask, sample ids, the genotype calls with their mask, and that no
    ``call_genotype_phased`` variable is present.
    """
    vcfzarr_path = create_vcfzarr(shared_datadir, tmpdir)  # type: ignore[no-untyped-call]
    ds = read_vcfzarr(vcfzarr_path)

    # --- variant-level variables ---
    assert ds.attrs["contigs"] == ["19", "20", "X"]

    expected_contig = [0, 0, 1, 1, 1, 1, 1, 1, 2]
    assert_array_equal(ds["variant_contig"], expected_contig)

    expected_position = [
        111,
        112,
        14370,
        17330,
        1110696,
        1230237,
        1234567,
        1235237,
        10,
    ]
    assert_array_equal(ds["variant_position"], expected_position)

    # Rows are padded with empty strings up to four alleles.
    expected_allele = [
        ["A", "C", "", ""],
        ["A", "G", "", ""],
        ["G", "A", "", ""],
        ["T", "A", "", ""],
        ["A", "G", "T", ""],
        ["T", "", "", ""],
        ["G", "GA", "GAC", ""],
        ["T", "", "", ""],
        ["AC", "A", "ATG", "C"],
    ]
    assert_array_equal(ds["variant_allele"], expected_allele)

    expected_id = [
        ".",
        ".",
        "rs6054257",
        ".",
        "rs6040355",
        ".",
        "microsat1",
        ".",
        "rsTest",
    ]
    assert_array_equal(ds["variant_id"], expected_id)

    # The mask is True exactly where the expected id is ".".
    expected_id_mask = [True, True, False, True, False, True, False, True, False]
    assert_array_equal(ds["variant_id_mask"], expected_id_mask)

    # --- sample-level variables ---
    assert_array_equal(ds["sample_id"], ["NA00001", "NA00002", "NA00003"])

    # --- call-level variables ---
    expected_genotype = np.array(
        [
            [[0, 0], [0, 0], [0, 1]],
            [[0, 0], [0, 0], [0, 1]],
            [[0, 0], [1, 0], [1, 1]],
            [[0, 0], [0, 1], [0, 0]],
            [[1, 2], [2, 1], [2, 2]],
            [[0, 0], [0, 0], [0, 0]],
            [[0, 1], [0, 2], [-1, -1]],
            [[0, 0], [0, 0], [-1, -1]],
            [[0, -1], [0, 1], [0, 2]],
        ],
        dtype="i1",
    )
    assert_array_equal(ds["call_genotype"], expected_genotype)
    # Negative entries in the expected genotypes are the masked calls.
    assert_array_equal(ds["call_genotype_mask"], expected_genotype < 0)
    assert "call_genotype_phased" not in ds
Example #3
0
def test_default_fields(shared_datadir, tmpdir):
    """Compare the scikit-allel and sgkit conversions using default fields.

    Neither conversion is given a field list. The datasets must be
    identical once ``call_genotype_phased`` is dropped from the sgkit one.
    """
    allel_path = create_allel_vcfzarr(shared_datadir, tmpdir)
    expected = sg.read_vcfzarr(allel_path)

    sg_path = create_sg_vcfzarr(shared_datadir, tmpdir)
    # Phasing information is not included in scikit-allel's output.
    actual = sg.load_dataset(str(sg_path)).drop_vars("call_genotype_phased")

    assert_identical(expected, actual)
Example #4
0
def test_all_fields(shared_datadir, tmpdir, vcf_file, allel_exclude_fields,
                    sgkit_exclude_fields):
    """Compare the scikit-allel and sgkit conversions with all fields requested.

    ``vcf_file`` and the two exclude lists are forwarded unchanged to the
    respective conversion helpers (presumably supplied by test
    parametrization -- confirm against the decorator, which is not visible
    here). Contig metadata is reconciled before the final comparison
    because the two tools record contigs differently.
    """
    # Change scikit-allel type defaults back to the VCF default.
    allel_types = {
        "calldata/DP": "i4",
        "calldata/GQ": "i4",
        "calldata/HQ": "i4",
        "calldata/AD": "i4",
    }
    allel_path = create_allel_vcfzarr(
        shared_datadir,
        tmpdir,
        vcf_file=vcf_file,
        fields=["*"],
        exclude_fields=allel_exclude_fields,
        types=allel_types,
    )

    # The same field definitions are handed to both readers.
    field_defs = {
        "INFO/AF": {"Number": "A"},
        "INFO/AC": {"Number": "A"},
        "FORMAT/AD": {"Number": "R"},
        "FORMAT/HQ": {"dimension": "haplotypes"},
        "FORMAT/SB": {"dimension": "strand_biases"},
    }
    allel_ds = sg.read_vcfzarr(allel_path, field_defs=field_defs)

    sg_path = create_sg_vcfzarr(
        shared_datadir,
        tmpdir,
        vcf_file=vcf_file,
        fields=["INFO/*", "FORMAT/*"],
        exclude_fields=sgkit_exclude_fields,
        field_defs=field_defs,
        truncate_calls=True,
    )
    # Phasing information is not included in scikit-allel's output.
    sg_ds = sg.load_dataset(str(sg_path)).drop_vars("call_genotype_phased")

    # scikit-allel only records contigs for which there are actual variants,
    # whereas sgkit records contigs from the header, so the scikit-allel
    # contigs only need to be a subset of the sgkit ones.
    allel_contigs = set(allel_ds.attrs["contigs"])
    sg_contigs = set(sg_ds.attrs["contigs"])
    assert allel_contigs.issubset(sg_contigs)

    # The contig lists themselves may legitimately differ; remove them so
    # they do not affect the identity check.
    for dataset in (allel_ds, sg_ds):
        del dataset.attrs["contigs"]

    if allel_contigs < sg_contigs:
        # With differing contig lists the variant_contig variables are not
        # comparable, so remove them before comparison.
        for dataset in (allel_ds, sg_ds):
            del dataset["variant_contig"]

    assert_identical(allel_ds, sg_ds)