def test_partition_into_regions__missing_index(shared_datadir, is_path):
    """Check the index-related ``ValueError`` cases of ``partition_into_regions``.

    Two calls are made against the same ``*.noindex.g.vcf.bgz`` test file:

    1. Without ``index_path``: expects a "Cannot find .tbi or .csi file."
       error.
    2. With an ``index_path`` ending in ``.index``: expects an
       "Only .tbi or .csi indexes are supported." error.
    """
    unindexed_vcf = path_for_test(
        shared_datadir, "CEUTrio.20.21.gatk3.4.noindex.g.vcf.bgz", is_path
    )

    # Case 1: no index path is supplied.
    with pytest.raises(ValueError, match=r"Cannot find .tbi or .csi file."):
        partition_into_regions(unindexed_vcf, num_parts=2)

    # Case 2: an explicit index path with an unsupported suffix.
    unsupported_index = path_for_test(
        shared_datadir,
        "CEUTrio.20.21.gatk3.4.noindex.g.vcf.bgz.index",
        is_path,
    )
    with pytest.raises(
        ValueError, match=r"Only .tbi or .csi indexes are supported."
    ):
        partition_into_regions(
            unindexed_vcf, num_parts=2, index_path=unsupported_index
        )
def test_partition_into_regions__num_parts(shared_datadir, vcf_file, is_path):
    """Check that a four-way partition accounts for every variant.

    The per-region variant counts must add up to the variant count of the
    whole file.
    """
    source = path_for_test(shared_datadir, vcf_file, is_path)

    parts = partition_into_regions(source, num_parts=4)
    assert parts is not None

    # Count region by region first, then compare with the unrestricted count.
    counted_in_parts = 0
    for part in parts:
        counted_in_parts += count_variants(source, part)

    assert counted_in_parts == count_variants(source)
def test_partition_into_regions__num_parts_large(shared_datadir, is_path):
    """Check partitioning when far more parts are requested than are produced.

    Requesting 100 parts of this test file is expected to give 18 regions,
    and the per-region variant counts must still add up to the whole-file
    count.
    """
    source = path_for_test(
        shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path
    )

    parts = partition_into_regions(source, num_parts=100)
    assert parts is not None
    assert len(parts) == 18

    counted_in_parts = sum(count_variants(source, part) for part in parts)
    whole_file_count = count_variants(source)

    assert counted_in_parts == whole_file_count
Beispiel #4
0
def test_vcf_to_zarr__parallel_partitioned(shared_datadir, is_path, tmp_path):
    """Convert a single VCF split into four regions and check output shapes."""
    source = path_for_test(
        shared_datadir,
        "1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz",
        is_path,
    )
    store = (tmp_path / "vcf_concat.zarr").as_posix()

    parts = partition_into_regions(source, num_parts=4)
    vcf_to_zarr(
        source,
        store,
        regions=parts,
        chunk_length=1_000,
        chunk_width=1_000,
    )
    dataset = xr.open_zarr(store)  # type: ignore[no-untyped-call]

    # Expected array shapes, checked in this order.
    expected_shapes = (
        ("sample_id", (2535,)),
        ("variant_id", (1406,)),
    )
    for array_name, shape in expected_shapes:
        assert dataset[array_name].shape == shape
Beispiel #5
0
def test_vcf_to_zarr__mutiple_partitioned_invalid_regions(
    shared_datadir, is_path, tmp_path
):
    """Check that multiple inputs with a flat ``regions`` value are rejected.

    Two input paths are given, but ``regions`` is the partition of the first
    path only rather than one sequence of regions per input, so
    ``vcf_to_zarr`` is expected to raise ``ValueError``.
    """
    sources = [
        path_for_test(shared_datadir, file_name, is_path)
        for file_name in (
            "CEUTrio.20.gatk3.4.g.vcf.bgz",
            "CEUTrio.21.gatk3.4.g.vcf.bgz",
        )
    ]
    store = (tmp_path / "vcf_concat.zarr").as_posix()

    # Deliberately wrong shape: regions for one input, not one list per input.
    flat_regions = partition_into_regions(sources[0], num_parts=2)

    expected_message = (
        r"multiple input regions must be a sequence of sequence of strings"
    )
    with pytest.raises(ValueError, match=expected_message):
        vcf_to_zarr(sources, store, regions=flat_regions, chunk_length=5_000)
def test_partition_into_regions__invalid_arguments(shared_datadir, is_path):
    """Check that bad ``num_parts``/``target_part_size`` combinations raise.

    Each case calls ``partition_into_regions`` with the listed keyword
    arguments and expects a ``ValueError`` whose message matches the paired
    pattern.
    """
    source = path_for_test(
        shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path
    )

    # (keyword arguments for the call, pattern the error message must match)
    invalid_cases = [
        ({}, r"One of num_parts or target_part_size must be specified"),
        (
            {"num_parts": 4, "target_part_size": 100_000},
            r"Only one of num_parts or target_part_size may be specified",
        ),
        ({"num_parts": 0}, r"num_parts must be positive"),
        ({"target_part_size": 0}, r"target_part_size must be positive"),
    ]

    for call_kwargs, expected_message in invalid_cases:
        with pytest.raises(ValueError, match=expected_message):
            partition_into_regions(source, **call_kwargs)
Beispiel #7
0
def test_vcf_to_zarr__multiple_partitioned(shared_datadir, is_path, tmp_path):
    """Convert two VCFs, each split into two regions, into one Zarr store.

    Checks the shapes of the resulting arrays and the chunking along the
    ``variants`` dimension.
    """
    sources = [
        path_for_test(shared_datadir, file_name, is_path)
        for file_name in (
            "CEUTrio.20.gatk3.4.g.vcf.bgz",
            "CEUTrio.21.gatk3.4.g.vcf.bgz",
        )
    ]
    store = (tmp_path / "vcf_concat.zarr").as_posix()

    # One list of regions per input, in the same order as the inputs.
    regions_per_source = []
    for source in sources:
        regions_per_source.append(partition_into_regions(source, num_parts=2))

    vcf_to_zarr(sources, store, regions=regions_per_source, chunk_length=5_000)
    dataset = xr.open_zarr(store)  # type: ignore[no-untyped-call]

    # Expected array shapes, checked in this order.
    expected_shapes = (
        ("sample_id", (1,)),
        ("call_genotype", (19910, 1, 2)),
        ("call_genotype_mask", (19910, 1, 2)),
        ("call_genotype_phased", (19910, 1)),
        ("variant_allele", (19910, 4)),
        ("variant_contig", (19910,)),
        ("variant_id", (19910,)),
        ("variant_id_mask", (19910,)),
        ("variant_position", (19910,)),
    )
    for array_name, shape in expected_shapes:
        assert dataset[array_name].shape == shape

    assert dataset.chunks["variants"] == (5000, 5000, 5000, 4910)
Beispiel #8
0
        having variable length with ``.``), and names the final dimension of the ``HQ`` array
        (which is defined as Number 2 in the VCF header) as ``haplotypes``.
        (Note that Number ``A`` is the number of alternate alleles, see section 1.4.2 of the
        VCF spec https://samtools.github.io/hts-specs/VCFv4.3.pdf.)
    """

    if temp_chunk_length is not None:
        if chunk_length % temp_chunk_length != 0:
            raise ValueError(
                f"Temporary chunk length in variant dimension ({temp_chunk_length}) "
                f"must evenly divide target chunk length {chunk_length}")
    if regions is None and target_part_size is not None:
        if target_part_size == "auto":
            target_part_size = "100MB"
        if isinstance(input, str) or isinstance(input, Path):
            regions = partition_into_regions(input,
                                             target_part_size=target_part_size)
        else:
            # Multiple inputs
            inputs = input
            regions = [
                partition_into_regions(input,
                                       target_part_size=target_part_size)
                for input in inputs
            ]

    if (isinstance(input, str) or isinstance(
            input, Path)) and (regions is None or isinstance(regions, str)):
        convert_func = vcf_to_zarr_sequential
    else:
        convert_func = functools.partial(
            vcf_to_zarr_parallel,
def test_partition_into_regions__one_part(shared_datadir, is_path):
    """Check that requesting a single part yields ``None`` instead of regions."""
    source = path_for_test(
        shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path
    )
    single_part = partition_into_regions(source, num_parts=1)
    assert single_part is None