Esempio n. 1
0
def read_plink(
    *,
    path: Optional[PathType] = None,
    bed_path: Optional[PathType] = None,
    bim_path: Optional[PathType] = None,
    fam_path: Optional[PathType] = None,
    chunks: Union[str, int, tuple] = "auto",  # type: ignore[type-arg]
    fam_sep: str = " ",
    bim_sep: str = "\t",
    bim_int_contig: bool = False,
    count_a1: bool = True,
    lock: bool = False,
    persist: bool = True,
) -> Dataset:
    """Read PLINK dataset.

    Loads a single PLINK dataset as dask arrays within a Dataset
    from bed, bim, and fam files.

    Parameters
    ----------
    path
        Path to PLINK file set.
        This should not include a suffix, i.e. if the files are
        at `data.{bed,fam,bim}` then only 'data' should be
        provided (suffixes are added internally).
        Either this path must be provided or all 3 of
        `bed_path`, `bim_path` and `fam_path`.
    bed_path
        Path to PLINK bed file.
        This should be a full path including the `.bed` extension
        and cannot be specified in conjunction with `path`.
    bim_path
        Path to PLINK bim file.
        This should be a full path including the `.bim` extension
        and cannot be specified in conjunction with `path`.
    fam_path
        Path to PLINK fam file.
        This should be a full path including the `.fam` extension
        and cannot be specified in conjunction with `path`.
    chunks
        Chunk size for genotype (i.e. `.bed`) data, by default "auto"
    fam_sep
        Delimiter for `.fam` file, by default " "
    bim_sep
        Delimiter for `.bim` file, by default "\\t"
    bim_int_contig
        Whether or not the contig/chromosome name in the `.bim`
        file should be interpreted as an integer, by default False.
        If False, then the `variant/contig` field in the resulting
        dataset will contain the indexes of corresponding strings
        encountered in the first `.bim` field.
    count_a1
        Whether or not allele counts should be for A1 or A2,
        by default True. Typically A1 is the minor allele
        and should be counted instead of A2. This is not enforced
        by PLINK though and it is up to the data generating process
        to ensure that A1 is in fact an alternate/minor/effect
        allele. See https://www.cog-genomics.org/plink/1.9/formats
        for more details.
    lock
        Whether or not to synchronize concurrent reads of `.bed`
        file blocks, by default False. This is passed through to
        [dask.array.from_array](https://docs.dask.org/en/latest/array-api.html#dask.array.from_array).
    persist
        Whether or not to persist `.fam` and `.bim` information in
        memory, by default True. This is an important performance
        consideration as the plain text files for this data will
        be read multiple times when False. This can lead to load
        times that are upwards of 10x slower.

    Returns
    -------
    A dataset containing genotypes as 3 dimensional calls along with
    all accompanying pedigree and variant information. The content
    of this dataset includes:

    - :data:`sgkit.variables.variant_id_spec` (variants)
    - :data:`sgkit.variables.variant_contig_spec` (variants)
    - :data:`sgkit.variables.variant_position_spec` (variants)
    - :data:`sgkit.variables.variant_allele_spec` (variants)
    - :data:`sgkit.variables.sample_id_spec` (samples)
    - :data:`sgkit.variables.call_genotype_spec` (variants, samples, ploidy)
    - :data:`sgkit.variables.call_genotype_mask_spec` (variants, samples, ploidy)

    The following pedigree-specific fields are also included:

    - ``sample_family_id``: Family identifier commonly referred to as FID
    - ``sample_id``: Within-family identifier for sample
    - ``sample_paternal_id``: Within-family identifier for father of sample
    - ``sample_maternal_id``: Within-family identifier for mother of sample
    - ``sample_sex``: Sex code equal to 1 for male, 2 for female, and -1
        for missing
    - ``sample_phenotype``: Phenotype code equal to 1 for control, 2 for case,
        and -1 for missing


    See https://www.cog-genomics.org/plink/1.9/formats#fam for more details.

    Raises
    ------
    ValueError
        If `path` and one of `bed_path`, `bim_path` or `fam_path` are provided,
        or if `path` is not provided and any of `bed_path`, `bim_path` or
        `fam_path` is missing.
    """
    path_error = (
        "Either `path` or all 3 of `{bed,bim,fam}_path` must be specified but not both"
    )
    individual_paths = (bed_path, bim_path, fam_path)
    if path:
        if any(individual_paths):
            raise ValueError(path_error)
        bed_path, bim_path, fam_path = (
            f"{path}.{ext}" for ext in ("bed", "bim", "fam")
        )
    elif not all(individual_paths):
        # Fail early with a clear message instead of handing `None`
        # to the fam/bim/bed readers below
        raise ValueError(path_error)

    # Load axis data first to determine dimension sizes
    df_fam = read_fam(fam_path, sep=fam_sep)  # type: ignore[arg-type]
    df_bim = read_bim(bim_path, sep=bim_sep)  # type: ignore[arg-type]

    if persist:
        df_fam = df_fam.persist()
        df_bim = df_bim.persist()

    arr_fam = dataframe_to_dict(df_fam, dtype=FAM_ARRAY_DTYPE)
    arr_bim = dataframe_to_dict(df_bim, dtype=BIM_ARRAY_DTYPE)

    # Load genotyping data
    call_genotype = da.from_array(
        BedReader(bed_path, (len(df_bim), len(df_fam)), count_A1=count_a1),  # type: ignore[arg-type]
        chunks=chunks,
        # Lock must be true with multiprocessing dask scheduler
        # to not get bed-reader errors (it works w/ threading backend though)
        lock=lock,
        # Make sure to use asarray=False in order for masked arrays to propagate
        asarray=False,
        name=f"bed_reader:read_plink:{bed_path}",
    )

    variant_contig: ArrayLike
    # If contigs are already integers, use them as-is
    if bim_int_contig:
        variant_contig = arr_bim["contig"].astype("int16")
        variant_contig_names = da.unique(variant_contig).astype(str)
        variant_contig_names = list(variant_contig_names.compute())
    # Otherwise create index for contig names based
    # on order of appearance in underlying .bim file
    else:
        variant_contig, variant_contig_names = encode_array(arr_bim["contig"].compute())  # type: ignore
        variant_contig = variant_contig.astype("int16")
        variant_contig_names = list(variant_contig_names)

    variant_position = arr_bim["pos"]
    a1: ArrayLike = arr_bim["a1"].astype("str")
    a2: ArrayLike = arr_bim["a2"].astype("str")

    # Note: column_stack not implemented in Dask, must use [v|h]stack
    variant_allele = da.hstack((a1[:, np.newaxis], a2[:, np.newaxis]))
    variant_allele = variant_allele.astype("S")
    variant_id = arr_bim["variant_id"]

    sample_id = arr_fam["member_id"]

    ds = create_genotype_call_dataset(
        variant_contig_names=variant_contig_names,
        variant_contig=variant_contig,
        variant_position=variant_position,
        variant_allele=variant_allele,
        sample_id=sample_id,
        call_genotype=call_genotype,
        variant_id=variant_id,
    )

    # Assign PLINK-specific pedigree fields; `member_id` is skipped
    # because it has already been passed in as `sample_id` above
    return ds.assign(
        {f"sample_{f}": (DIM_SAMPLE, arr_fam[f]) for f in arr_fam if f != "member_id"}
    )
Esempio n. 2
0
def read_bgen(
    path: PathType,
    metafile_path: Optional[PathType] = None,
    sample_path: Optional[PathType] = None,
    chunks: Union[str, int, Tuple[int, int, int]] = "auto",
    lock: bool = False,
    persist: bool = True,
    contig_dtype: DType = "str",
    gp_dtype: DType = "float32",
) -> Dataset:
    """Read BGEN dataset.

    Builds a Dataset of dask arrays from a single ``.bgen`` file.

    Parameters
    ----------
    path
        Location of the BGEN file.
    metafile_path
        Location of the companion index file used to determine BGEN byte
        offsets. When omitted, ``path`` + ".metafile" is used.
        Reading genotype probabilities requires this file; if it does not
        exist yet it is generated on the first read, which can make that
        first call much slower than later ones.
    sample_path
        Location of the ``.sample`` file, by default None. When omitted,
        ``path`` with its suffix replaced by ``.sample`` is tried. If the
        resulting file exists, sample identifiers are taken from it in
        preference to those embedded in the ``.bgen`` file.
    chunks
        Chunk size for genotype probability data (3 dimensions),
        by default "auto". A tuple must contain exactly 3 items.
    lock
        Whether or not concurrent reads of file blocks are synchronized,
        by default False. Forwarded to
        [dask.array.from_array](https://docs.dask.org/en/latest/array-api.html#dask.array.from_array).
    persist
        Whether or not variant information is persisted in memory,
        by default True. This matters for performance because the metadata
        file is otherwise read multiple times.
    contig_dtype
        Data type for contig names, by default "str".
        An integer type (e.g. "int") is also accepted, but conversion fails
        if any contig name cannot be converted to an integer.
    gp_dtype
        Data type for genotype probabilities, by default "float32".

    Warnings
    --------
    Only bi-allelic, diploid BGEN files are currently supported.

    Returns
    -------
    A dataset containing the following variables:

    - :data:`sgkit.variables.variant_id_spec` (variants)
    - :data:`sgkit.variables.variant_contig_spec` (variants)
    - :data:`sgkit.variables.variant_position_spec` (variants)
    - :data:`sgkit.variables.variant_allele_spec` (variants)
    - :data:`sgkit.variables.sample_id_spec` (samples)
    - :data:`sgkit.variables.call_dosage_spec` (variants, samples)
    - :data:`sgkit.variables.call_dosage_mask_spec` (variants, samples)
    - :data:`sgkit.variables.call_genotype_probability_spec` (variants, samples, genotypes)
    - :data:`sgkit.variables.call_genotype_probability_mask_spec` (variants, samples, genotypes)

    Raises
    ------
    ValueError
        If `chunks` is a tuple without exactly 3 items, if `gp_dtype` is not
        a floating point type, or if `contig_dtype` is neither a string nor
        an integer type.
    """
    # --- Argument validation -------------------------------------------
    chunks_is_bad_tuple = isinstance(chunks, tuple) and len(chunks) != 3
    if chunks_is_bad_tuple:
        raise ValueError(f"`chunks` must be tuple with 3 items, not {chunks}")

    gp_is_float = np.issubdtype(gp_dtype, np.floating)
    if not gp_is_float:
        raise ValueError(
            f"`gp_dtype` must be a floating point data type, not {gp_dtype}"
        )

    # `or` short-circuits, so the dtype kind is only inspected for
    # non-integer types
    if not (
        np.issubdtype(contig_dtype, np.integer)
        or np.dtype(contig_dtype).kind in {"U", "S"}
    ):
        raise ValueError(
            f"`contig_dtype` must be of string or int type, not {contig_dtype}"
        )

    # --- Sample identifiers --------------------------------------------
    path = Path(path)
    if sample_path:
        sample_path = Path(sample_path)
    else:
        sample_path = path.with_suffix(".sample")

    if not sample_path.exists():
        sample_id = _default_sample_ids(path)
    else:
        samples = read_samples(sample_path)
        sample_id = samples.sample_id.values.astype("U")

    # --- Variant metadata ----------------------------------------------
    reader = BgenReader(path, metafile_path=metafile_path, dtype=gp_dtype)

    metadata = read_metafile(reader.metafile_path)
    if persist:
        metadata = metadata.persist()
    columns = dataframe_to_dict(metadata, METAFILE_DTYPE)

    variant_id = columns["id"]
    contig_values = columns["chrom"].astype(contig_dtype)
    variant_contig, contig_names = encode_contigs(contig_values)
    variant_contig_names = list(contig_names)
    variant_position = columns["pos"]
    allele_1 = columns["a1"][:, np.newaxis]
    allele_2 = columns["a2"][:, np.newaxis]
    variant_allele = da.hstack((allele_1, allele_2))

    # --- Genotype probabilities and dosage -----------------------------
    array_name = f"{reader.name}:read_bgen:{path}"
    call_genotype_probability = da.from_array(
        reader,
        chunks=chunks,
        lock=lock,
        fancy=False,
        asarray=False,
        name=array_name,
    )
    call_dosage = _to_dosage(call_genotype_probability)

    return create_genotype_dosage_dataset(
        variant_contig_names=variant_contig_names,
        variant_contig=variant_contig,
        variant_position=variant_position,
        variant_allele=variant_allele,
        sample_id=sample_id,
        call_dosage=call_dosage,
        call_genotype_probability=call_genotype_probability,
        variant_id=variant_id,
    )