Beispiel #1
0
def test_partition_into_regions__missing_index(shared_datadir, is_path):
    """Check the errors raised when the VCF has no usable index.

    Two cases are exercised against the same VCF path:

    * no ``index_path`` is passed, and a ``ValueError`` about a missing
      ``.tbi``/``.csi`` file is expected;
    * an ``index_path`` ending in ``.index`` is passed, and a ``ValueError``
      about unsupported index types is expected.
    """
    vcf_name = "CEUTrio.20.21.gatk3.4.noindex.g.vcf.bgz"
    unindexed_vcf = path_for_test(shared_datadir, vcf_name, is_path)

    # Case 1: rely on index discovery, which is expected to fail.
    with pytest.raises(ValueError, match=r"Cannot find .tbi or .csi file."):
        partition_into_regions(unindexed_vcf, num_parts=2)

    # Case 2: hand over an explicit index path with an unrecognised suffix.
    unsupported_index = path_for_test(
        shared_datadir, vcf_name + ".index", is_path
    )
    with pytest.raises(
        ValueError, match=r"Only .tbi or .csi indexes are supported."
    ):
        partition_into_regions(
            unindexed_vcf, index_path=unsupported_index, num_parts=2
        )
Beispiel #2
0
def test_record_counts_csi(shared_datadir, vcf_file, is_path):
    """Compare per-contig record counts stored in the CSI index with the VCF.

    For every sequence name reported by the VCF, the count held by the index
    at the same position must equal the number of variants counted directly
    from the VCF for that contig.
    """
    vcf_path = path_for_test(shared_datadir, vcf_file, is_path)
    index = read_csi(get_csi_path(vcf_path))

    contig_names = VCF(vcf_path).seqnames
    for position, name in enumerate(contig_names):
        indexed_count = index.record_counts[position]
        assert indexed_count == count_variants(vcf_path, name)
Beispiel #3
0
def test_vcf_to_zarr__mutiple_partitioned_invalid_regions(
    shared_datadir, is_path, tmp_path
):
    """Reject a flat region list when converting several VCF inputs.

    The regions computed for the first input alone are passed for a call that
    has two inputs; ``vcf_to_zarr`` is expected to raise ``ValueError`` asking
    for a sequence of sequences of strings.
    """
    vcf_names = (
        "CEUTrio.20.gatk3.4.g.vcf.bgz",
        "CEUTrio.21.gatk3.4.g.vcf.bgz",
    )
    paths = [path_for_test(shared_datadir, name, is_path) for name in vcf_names]
    store = (tmp_path / "vcf_concat.zarr").as_posix()

    # Deliberately the wrong shape: one flat result, not one entry per input.
    flat_regions = partition_into_regions(paths[0], num_parts=2)

    expected_message = (
        r"multiple input regions must be a sequence of sequence of strings"
    )
    with pytest.raises(ValueError, match=expected_message):
        vcf_to_zarr(paths, store, regions=flat_regions, chunk_length=5_000)
Beispiel #4
0
def test_record_counts_tbi(shared_datadir, vcf_file, is_path):
    """Compare per-contig record counts stored in the tabix index with the VCF.

    For every sequence name listed in the index, the count held by the index
    at the same position must equal the number of variants counted directly
    from the VCF for that contig.
    """
    vcf_path = path_for_test(shared_datadir, vcf_file, is_path)
    index = read_tabix(get_tabix_path(vcf_path))

    for position, name in enumerate(index.sequence_names):
        indexed_count = index.record_counts[position]
        assert indexed_count == count_variants(vcf_path, name)
Beispiel #5
0
def test_vcf_to_zarr__multiple(shared_datadir, is_path, tmp_path):
    """Convert two VCF files into one Zarr store and check its layout.

    The resulting dataset is expected to hold 19910 variants for a single
    sample, chunked along the ``variants`` dimension in blocks of 5000 with a
    final partial block.
    """
    vcf_names = (
        "CEUTrio.20.gatk3.4.g.vcf.bgz",
        "CEUTrio.21.gatk3.4.g.vcf.bgz",
    )
    inputs = [path_for_test(shared_datadir, name, is_path) for name in vcf_names]
    store = (tmp_path / "vcf_concat.zarr").as_posix()

    vcf_to_zarr(inputs, store, chunk_length=5_000)
    ds = xr.open_zarr(store)  # type: ignore[no-untyped-call]

    n_variants = 19910
    expected_shapes = {
        "sample_id": (1,),
        "call_genotype": (n_variants, 1, 2),
        "call_genotype_mask": (n_variants, 1, 2),
        "call_genotype_phased": (n_variants, 1),
        "variant_allele": (n_variants, 4),
        "variant_contig": (n_variants,),
        "variant_id": (n_variants,),
        "variant_id_mask": (n_variants,),
        "variant_position": (n_variants,),
    }
    for variable, shape in expected_shapes.items():
        assert ds[variable].shape == shape

    assert ds.chunks["variants"] == (5000, 5000, 5000, 4910)
Beispiel #6
0
def test_partition_into_regions__num_parts(shared_datadir, vcf_file, is_path):
    """Partitioning by ``num_parts`` must not lose or duplicate variants.

    The variant counts of the returned regions, summed, must equal the
    variant count of the whole file.
    """
    vcf_path = path_for_test(shared_datadir, vcf_file, is_path)
    regions = partition_into_regions(vcf_path, num_parts=4)
    assert regions is not None

    variants_across_regions = sum(
        count_variants(vcf_path, region) for region in regions
    )
    assert variants_across_regions == count_variants(vcf_path)
Beispiel #7
0
def test_partition_into_regions__target_part_size(shared_datadir, is_path):
    """Partitioning by ``target_part_size`` yields the expected region count.

    With a target size of 100_000 the fixture file is expected to be split
    into five regions whose variant counts add up to the file's total.
    """
    vcf_path = path_for_test(
        shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path
    )

    regions = partition_into_regions(vcf_path, target_part_size=100_000)
    assert regions is not None
    assert len(regions) == 5

    variants_across_regions = sum(
        count_variants(vcf_path, region) for region in regions
    )
    assert variants_across_regions == count_variants(vcf_path)
Beispiel #8
0
def test_vcf_to_zarr__parallel_partitioned(shared_datadir, is_path, tmp_path):
    """Convert a VCF using regions produced by ``partition_into_regions``.

    The file is split into four parts, converted with 1000 x 1000 chunks, and
    the resulting dataset is checked for the expected number of samples and
    variants.
    """
    vcf_path = path_for_test(
        shared_datadir,
        "1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz",
        is_path,
    )
    store = (tmp_path / "vcf_concat.zarr").as_posix()
    four_parts = partition_into_regions(vcf_path, num_parts=4)

    vcf_to_zarr(
        vcf_path,
        store,
        regions=four_parts,
        chunk_length=1_000,
        chunk_width=1_000,
    )
    ds = xr.open_zarr(store)  # type: ignore[no-untyped-call]

    assert ds["sample_id"].shape == (2535,)
    assert ds["variant_id"].shape == (1406,)
Beispiel #9
0
def test_vcf_to_zarr__parallel_temp_chunk_length_not_divisible(
    shared_datadir, tmp_path
):
    """Reject a ``temp_chunk_length`` that does not divide ``chunk_length``.

    A temporary chunk length of 4000 is combined with a target chunk length
    of 5000; ``vcf_to_zarr`` is expected to raise ``ValueError``.
    """
    vcf_path = path_for_test(
        shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", False
    )
    store = (tmp_path / "vcf_concat.zarr").as_posix()
    contigs = ["20", "21"]

    expected_message = r"Temporary chunk length in variant dimension \(4000\) must evenly divide target chunk length 5000"
    with pytest.raises(ValueError, match=expected_message):
        # 4000 is not a divisor of 5000, so the call should be refused.
        vcf_to_zarr(
            vcf_path,
            store,
            regions=contigs,
            chunk_length=5_000,
            temp_chunk_length=4_000,
        )
Beispiel #10
0
def test_vcf_to_zarr__mutable_mapping(shared_datadir, is_path):
    """Convert a VCF into an in-memory mapping instead of a filesystem path.

    A plain ``dict`` is passed as the output store. The dataset read back
    from it is checked for the expected variable shapes and for object dtype
    on the string-valued variables.
    """
    vcf_path = path_for_test(
        shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path
    )
    store: MutableMapping[str, bytes] = {}

    vcf_to_zarr(vcf_path, store, chunk_length=5_000)
    ds = xr.open_zarr(store)  # type: ignore[no-untyped-call]

    n_variants = 19910
    expected_shapes = {
        "sample_id": (1,),
        "call_genotype": (n_variants, 1, 2),
        "call_genotype_mask": (n_variants, 1, 2),
        "call_genotype_phased": (n_variants, 1),
        "variant_allele": (n_variants, 4),
        "variant_contig": (n_variants,),
        "variant_id": (n_variants,),
        "variant_id_mask": (n_variants,),
        "variant_position": (n_variants,),
    }
    for variable, shape in expected_shapes.items():
        assert ds[variable].shape == shape

    for variable in ("variant_allele", "variant_id"):
        assert ds[variable].dtype == "O"
Beispiel #11
0
def test_vcf_to_zarr__parallel(shared_datadir, is_path, tmp_path):
    """Convert a VCF region-by-region and check the combined dataset.

    Contigs "20" and "21" are passed as separate regions. The dataset read
    back is checked for the expected variable shapes and for the fixed-width
    byte-string dtypes of the string-valued variables.
    """
    vcf_path = path_for_test(
        shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path
    )
    store = (tmp_path / "vcf_concat.zarr").as_posix()
    contigs = ["20", "21"]

    vcf_to_zarr(vcf_path, store, regions=contigs, chunk_length=5_000)
    ds = xr.open_zarr(store)  # type: ignore[no-untyped-call]

    n_variants = 19910
    expected_shapes = {
        "sample_id": (1,),
        "call_genotype": (n_variants, 1, 2),
        "call_genotype_mask": (n_variants, 1, 2),
        "call_genotype_phased": (n_variants, 1),
        "variant_allele": (n_variants, 4),
        "variant_contig": (n_variants,),
        "variant_id": (n_variants,),
        "variant_id_mask": (n_variants,),
        "variant_position": (n_variants,),
    }
    for variable, shape in expected_shapes.items():
        assert ds[variable].shape == shape

    expected_dtypes = {"variant_allele": "S48", "variant_id": "S1"}
    for variable, dtype in expected_dtypes.items():
        assert ds[variable].dtype == dtype
Beispiel #12
0
def test_partition_into_regions__invalid_arguments(shared_datadir, is_path):
    """Check argument validation of ``partition_into_regions``.

    Each invalid keyword combination below is expected to raise a
    ``ValueError`` whose message matches the paired pattern.
    """
    vcf_path = path_for_test(
        shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path
    )

    # (keyword arguments, expected error pattern), checked in this order.
    invalid_calls = [
        ({}, r"One of num_parts or target_part_size must be specified"),
        (
            {"num_parts": 4, "target_part_size": 100_000},
            r"Only one of num_parts or target_part_size may be specified",
        ),
        ({"num_parts": 0}, r"num_parts must be positive"),
        ({"target_part_size": 0}, r"target_part_size must be positive"),
    ]
    for kwargs, message in invalid_calls:
        with pytest.raises(ValueError, match=message):
            partition_into_regions(vcf_path, **kwargs)
Beispiel #13
0
def test_read_csi__invalid_csi(shared_datadir, file, is_path):
    """``read_csi`` is expected to raise ``ValueError`` for a non-CSI file."""
    expected_message = r"File not in CSI format."
    with pytest.raises(ValueError, match=expected_message):
        candidate = path_for_test(shared_datadir, file, is_path)
        read_csi(candidate)
Beispiel #14
0
def test_vcf_to_zarr__small_vcf(shared_datadir, is_path, tmp_path):
    """Convert a small VCF and compare every stored variable value-by-value.

    The fixture is expected to contain nine variants across contigs
    "19", "20" and "X" for three samples. Contigs, positions, alleles, IDs,
    sample names, genotypes and phasing are all compared against literal
    expected values.
    """
    vcf_path = path_for_test(shared_datadir, "sample.vcf.gz", is_path)
    store = (tmp_path / "vcf.zarr").as_posix()

    vcf_to_zarr(vcf_path, store, chunk_length=5, chunk_width=2)
    ds = xr.open_zarr(store)  # type: ignore[no-untyped-call]

    # --- variant-level variables -------------------------------------------
    assert ds.attrs["contigs"] == ["19", "20", "X"]

    expected_contig = [0, 0, 1, 1, 1, 1, 1, 1, 2]
    assert_array_equal(ds["variant_contig"], expected_contig)

    expected_position = [
        111,
        112,
        14370,
        17330,
        1110696,
        1230237,
        1234567,
        1235237,
        10,
    ]
    assert_array_equal(ds["variant_position"], expected_position)

    # Each row has four allele slots; unused slots are empty strings.
    expected_allele = [
        ["A", "C", "", ""],
        ["A", "G", "", ""],
        ["G", "A", "", ""],
        ["T", "A", "", ""],
        ["A", "G", "T", ""],
        ["T", "", "", ""],
        ["G", "GA", "GAC", ""],
        ["T", "", "", ""],
        ["AC", "A", "ATG", "C"],
    ]
    assert_array_equal(ds["variant_allele"], expected_allele)
    assert ds["variant_allele"].dtype == "O"

    expected_id = [
        ".",
        ".",
        "rs6054257",
        ".",
        "rs6040355",
        ".",
        "microsat1",
        ".",
        "rsTest",
    ]
    assert_array_equal(ds["variant_id"], expected_id)
    assert ds["variant_id"].dtype == "O"

    expected_id_mask = [True, True, False, True, False, True, False, True, False]
    assert_array_equal(ds["variant_id_mask"], expected_id_mask)

    # --- sample-level variables --------------------------------------------
    assert_array_equal(ds["sample_id"], ["NA00001", "NA00002", "NA00003"])

    # --- call-level variables ----------------------------------------------
    expected_genotype = np.array(
        [
            [[0, 0], [0, 0], [0, 1]],
            [[0, 0], [0, 0], [0, 1]],
            [[0, 0], [1, 0], [1, 1]],
            [[0, 0], [0, 1], [0, 0]],
            [[1, 2], [2, 1], [2, 2]],
            [[0, 0], [0, 0], [0, 0]],
            [[0, 1], [0, 2], [-1, -1]],
            [[0, 0], [0, 0], [-1, -1]],
            [[0, -1], [0, 1], [0, 2]],
        ],
        dtype="i1",
    )
    expected_phased = np.array(
        [
            [True, True, False],
            [True, True, False],
            [True, True, False],
            [True, True, False],
            [True, True, False],
            [True, True, False],
            [False, False, False],
            [False, True, False],
            [True, False, True],
        ],
        dtype=bool,
    )
    assert_array_equal(ds["call_genotype"], expected_genotype)
    # The stored mask is compared against the negative entries of the
    # expected genotype array.
    assert_array_equal(ds["call_genotype_mask"], expected_genotype < 0)
    assert_array_equal(ds["call_genotype_phased"], expected_phased)
Beispiel #15
0
def test_partition_into_regions__one_part(shared_datadir, is_path):
    """Requesting a single part is expected to return ``None``."""
    vcf_path = path_for_test(
        shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path
    )
    regions = partition_into_regions(vcf_path, num_parts=1)
    assert regions is None
Beispiel #16
0
def test_read_tabix__invalid_tbi(shared_datadir, file, is_path):
    """``read_tabix`` is expected to raise ``ValueError`` for a non-tabix file."""
    expected_message = r"File not in Tabix format."
    with pytest.raises(ValueError, match=expected_message):
        candidate = path_for_test(shared_datadir, file, is_path)
        read_tabix(candidate)