Example #1
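# Excerpt from a test module: assumes numpy as np, numpy.testing.assert_array_equal,
# and sgkit's create_genotype_dosage_dataset, DIM_VARIANT and DIM_SAMPLE are in scope
# (the exact import paths depend on the sgkit version this example was taken from).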
def test_create_genotype_dosage_dataset():
    variant_contig_names = ["chr1"]
    variant_contig = np.array([0, 0], dtype="i1")
    variant_position = np.array([1000, 2000], dtype="i4")
    variant_alleles = np.array([["A", "C"], ["G", "A"]], dtype="S1")
    variant_id = np.array(["rs1", "rs2"], dtype=str)
    sample_id = np.array(["sample_1", "sample_2", "sample_3"], dtype=str)
    call_dosage = np.array([[0.8, 0.9, 1.0], [1.0, 1.1, 1.2]], dtype="f4")
    ds = create_genotype_dosage_dataset(
        variant_contig_names=variant_contig_names,
        variant_contig=variant_contig,
        variant_position=variant_position,
        variant_alleles=variant_alleles,
        sample_id=sample_id,
        call_dosage=call_dosage,
        variant_id=variant_id,
    )

    assert DIM_VARIANT in ds.dims
    assert DIM_SAMPLE in ds.dims

    assert ds.attrs["contigs"] == variant_contig_names
    assert_array_equal(ds["variant_contig"], variant_contig)
    assert_array_equal(ds["variant_position"], variant_position)
    assert_array_equal(ds["variant_allele"], variant_alleles)
    assert_array_equal(ds["variant_id"], variant_id)
    assert_array_equal(ds["sample_id"], sample_id)
    assert_array_equal(ds["call_dosage"], call_dosage)
    assert_array_equal(ds["call_dosage_mask"], np.isnan(call_dosage))
Example #2
def read_bgen(
    path: PathType,
    metafile_path: Optional[PathType] = None,
    sample_path: Optional[PathType] = None,
    chunks: Union[str, int, Tuple[int, int, int]] = "auto",
    lock: bool = False,
    persist: bool = True,
    contig_dtype: DType = "str",
    gp_dtype: DType = "float32",
) -> Dataset:
    """Read BGEN dataset.

    Loads a single BGEN dataset as dask arrays within a Dataset
    from a ``.bgen`` file.

    Parameters
    ----------
    path
        Path to BGEN file.
    metafile_path
        Path to companion index file used to determine BGEN byte offsets.
        Defaults to ``path`` + ".metafile" if not provided.
        This file is necessary for reading BGEN genotype probabilities, and it will be
        generated the first time the file is read if it does not already exist.
        Creating it can make the first call to this function much slower than
        subsequent calls.
    sample_path
        Path to ``.sample`` file, by default None. This is used to fetch sample identifiers,
        and when provided it is preferred over the sample identifiers embedded in the ``.bgen`` file.
    chunks
        Chunk size for genotype probability data (3 dimensions),
        by default "auto".
    lock
        Whether or not to synchronize concurrent reads of
        file blocks, by default False. This is passed through to
        [dask.array.from_array](https://docs.dask.org/en/latest/array-api.html#dask.array.from_array).
    persist
        Whether or not to persist variant information in memory, by default True.
        This is an important performance consideration: when False, the metadata
        file will be read multiple times.
    contig_dtype
        Data type for contig names, by default "str".
        This may also be an integer type (e.g. "int"), but reading will fail if any of
        the contig names cannot be converted to integers.
    gp_dtype
        Data type for genotype probabilities, by default "float32".

    Warnings
    --------
    Only bi-allelic, diploid BGEN files are currently supported.

    Returns
    -------
    A dataset containing the following variables:

    - :data:`sgkit.variables.variant_id_spec` (variants)
    - :data:`sgkit.variables.variant_contig_spec` (variants)
    - :data:`sgkit.variables.variant_position_spec` (variants)
    - :data:`sgkit.variables.variant_allele_spec` (variants)
    - :data:`sgkit.variables.sample_id_spec` (samples)
    - :data:`sgkit.variables.call_dosage_spec` (variants, samples)
    - :data:`sgkit.variables.call_dosage_mask_spec` (variants, samples)
    - :data:`sgkit.variables.call_genotype_probability_spec` (variants, samples, genotypes)
    - :data:`sgkit.variables.call_genotype_probability_mask_spec` (variants, samples, genotypes)

    """
    if isinstance(chunks, tuple) and len(chunks) != 3:
        raise ValueError(f"`chunks` must be tuple with 3 items, not {chunks}")
    if not np.issubdtype(gp_dtype, np.floating):
        raise ValueError(
            f"`gp_dtype` must be a floating point data type, not {gp_dtype}"
        )
    if not np.issubdtype(contig_dtype, np.integer) and np.dtype(
        contig_dtype
    ).kind not in {"U", "S"}:
        raise ValueError(
            f"`contig_dtype` must be of string or int type, not {contig_dtype}"
        )

    path = Path(path)
    sample_path = Path(sample_path) if sample_path else path.with_suffix(".sample")

    if sample_path.exists():
        sample_id = read_samples(sample_path).sample_id.values.astype("U")
    else:
        sample_id = _default_sample_ids(path)

    bgen_reader = BgenReader(path, metafile_path=metafile_path, dtype=gp_dtype)

    df = read_metafile(bgen_reader.metafile_path)
    if persist:
        df = df.persist()
    arrs = dataframe_to_dict(df, METAFILE_DTYPE)

    variant_id = arrs["id"]
    variant_contig = arrs["chrom"].astype(contig_dtype)
    variant_contig, variant_contig_names = encode_contigs(variant_contig)
    variant_contig_names = list(variant_contig_names)
    variant_position = arrs["pos"]
    variant_allele = da.hstack((arrs["a1"][:, np.newaxis], arrs["a2"][:, np.newaxis]))

    call_genotype_probability = da.from_array(
        bgen_reader,
        chunks=chunks,
        lock=lock,
        fancy=False,
        asarray=False,
        name=f"{bgen_reader.name}:read_bgen:{path}",
    )
    call_dosage = _to_dosage(call_genotype_probability)

    ds: Dataset = create_genotype_dosage_dataset(
        variant_contig_names=variant_contig_names,
        variant_contig=variant_contig,
        variant_position=variant_position,
        variant_allele=variant_allele,
        sample_id=sample_id,
        call_dosage=call_dosage,
        call_genotype_probability=call_genotype_probability,
        variant_id=variant_id,
    )

    return ds
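
A usage sketch for the reader above. The file name is hypothetical and the import path is an assumption (older releases shipped read_bgen in a separate sgkit-bgen package), so adjust both to the installed version.

# Hedged usage sketch; file name, import path, and chunk sizes are illustrative.
from sgkit.io.bgen import read_bgen

ds = read_bgen(
    "example.bgen",        # hypothetical BGEN file
    chunks=(100, 500, 3),  # (variants, samples, genotypes) chunk sizes
)
print(ds["call_dosage"].dims)                # ("variants", "samples")
print(ds["call_genotype_probability"].dims)  # ("variants", "samples", "genotypes")
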
Example #3
def read_bgen(
    path: PathType,
    chunks: Union[str, int, Tuple[int, ...]] = "auto",
    lock: bool = False,
    persist: bool = True,
    dtype: Any = "float32",
) -> Dataset:
    """Read BGEN dataset.

    Loads a single BGEN dataset as dask arrays within a Dataset
    from a bgen file.

    Parameters
    ----------
    path : PathType
        Path to BGEN file.
    chunks : Union[str, int, tuple], optional
        Chunk size for genotype probability data (3 dimensions),
        by default "auto".
    lock : bool, optional
        Whether or not to synchronize concurrent reads of
        file blocks, by default False. This is passed through to
        [dask.array.from_array](https://docs.dask.org/en/latest/array-api.html#dask.array.from_array).
    persist : bool, optional
        Whether or not to persist variant information in
        memory, by default True. This is an important performance
        consideration: when False, the metadata file will be read
        multiple times.
    dtype : Any
        Genotype probability array data type, by default float32.

    Warnings
    --------
    Only bi-allelic, diploid BGEN files are currently supported.
    """

    bgen_reader = BgenReader(path, persist, dtype=dtype)

    variant_contig, variant_contig_names = encode_array(bgen_reader.contig.compute())
    variant_contig_names = list(variant_contig_names)
    variant_contig = variant_contig.astype("int16")
    variant_position = np.asarray(bgen_reader.pos, dtype=int)
    variant_alleles = np.asarray(bgen_reader.variant_alleles, dtype="S")
    variant_id = np.asarray(bgen_reader.variant_id, dtype=str)
    sample_id = np.asarray(bgen_reader.sample_id, dtype=str)

    call_genotype_probability = da.from_array(
        bgen_reader,
        chunks=chunks,
        lock=lock,
        fancy=False,
        asarray=False,
        name=f"{bgen_reader.name}:read_bgen:{path}",
    )
    call_dosage = _to_dosage(call_genotype_probability)

    ds: Dataset = create_genotype_dosage_dataset(
        variant_contig_names=variant_contig_names,
        variant_contig=variant_contig,
        variant_position=variant_position,
        variant_alleles=variant_alleles,
        sample_id=sample_id,
        call_dosage=call_dosage,
        call_genotype_probability=call_genotype_probability,
        variant_id=variant_id,
    )

    return ds
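
Both examples convert the genotype probabilities to dosages with a _to_dosage helper whose body is not shown in these excerpts. For bi-allelic, diploid data the dosage is the expected alternate-allele count; a minimal sketch of that definition (not necessarily the library's exact implementation) is below.

import dask.array as da

def to_dosage_sketch(gp: da.Array) -> da.Array:
    # gp has shape (variants, samples, 3) holding P(hom-ref), P(het), P(hom-alt).
    # Expected alternate-allele count: 0*P(hom-ref) + 1*P(het) + 2*P(hom-alt).
    return gp[..., 1] + 2 * gp[..., 2]
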