def read_plink(
    *,
    path: Optional[PathType] = None,
    bed_path: Optional[PathType] = None,
    bim_path: Optional[PathType] = None,
    fam_path: Optional[PathType] = None,
    chunks: Union[str, int, tuple] = "auto",  # type: ignore[type-arg]
    fam_sep: str = " ",
    bim_sep: str = "\t",
    bim_int_contig: bool = False,
    count_a1: bool = True,
    lock: bool = False,
    persist: bool = True,
) -> Dataset:
    """Read PLINK dataset.

    Loads a single PLINK dataset as dask arrays within a Dataset
    from bed, bim, and fam files.

    Parameters
    ----------
    path : Optional[PathType]
        Path to PLINK file set.
        This should not include a suffix, i.e. if the files are
        at `data.{bed,fam,bim}` then only 'data' should be
        provided (suffixes are added internally).
        Either this path must be provided or all 3 of
        `bed_path`, `bim_path` and `fam_path`.
    bed_path: Optional[PathType]
        Path to PLINK bed file.
        This should be a full path including the `.bed` extension
        and cannot be specified in conjunction with `path`.
    bim_path: Optional[PathType]
        Path to PLINK bim file.
        This should be a full path including the `.bim` extension
        and cannot be specified in conjunction with `path`.
    fam_path: Optional[PathType]
        Path to PLINK fam file.
        This should be a full path including the `.fam` extension
        and cannot be specified in conjunction with `path`.
    chunks : Union[str, int, tuple], optional
        Chunk size for genotype (i.e. `.bed`) data, by default "auto"
    fam_sep : str, optional
        Delimiter for `.fam` file, by default " "
    bim_sep : str, optional
        Delimiter for `.bim` file, by default "\\t"
    bim_int_contig : bool, optional
        Whether or not the contig/chromosome name in the `.bim`
        file should be interpreted as an integer, by default False.
        If False, then the `variant/contig` field in the resulting
        dataset will contain the indexes of corresponding strings
        encountered in the first `.bim` field.
    count_a1 : bool, optional
        Whether or not allele counts should be for A1 or A2,
        by default True. Typically A1 is the minor allele
        and should be counted instead of A2. This is not enforced
        by PLINK though and it is up to the data generating process
        to ensure that A1 is in fact an alternate/minor/effect
        allele. See https://www.cog-genomics.org/plink/1.9/formats
        for more details.
    lock : bool, optional
        Whether or not to synchronize concurrent reads of `.bed`
        file blocks, by default False. This is passed through to
        [dask.array.from_array](https://docs.dask.org/en/latest/array-api.html#dask.array.from_array).
    persist : bool, optional
        Whether or not to persist `.fam` and `.bim` information in
        memory, by default True. This is an important performance
        consideration as the plain text files for this data will
        be read multiple times when False. This can lead to load
        times that are upwards of 10x slower.

    Returns
    -------
    Dataset
        A dataset containing genotypes as 3 dimensional calls along
        with all accompanying pedigree and variant information. The
        content of this dataset matches that of
        `sgkit.create_genotype_call_dataset` with all pedigree-specific
        fields defined as:

        - sample_family_id: Family identifier commonly referred to as FID
        - sample_id: Within-family identifier for sample
        - sample_paternal_id: Within-family identifier for father of sample
        - sample_maternal_id: Within-family identifier for mother of sample
        - sample_sex: Sex code equal to 1 for male, 2 for female, and -1
          for missing
        - sample_phenotype: Phenotype code equal to 1 for control, 2 for
          case, and -1 for missing

        See https://www.cog-genomics.org/plink/1.9/formats#fam for more
        details.

    Raises
    ------
    ValueError
        If `path` and one of `bed_path`, `bim_path` or `fam_path`
        are provided.
    ValueError
        If `path` is not provided and any of `bed_path`, `bim_path`
        or `fam_path` is missing.
    """
    if path and (bed_path or bim_path or fam_path):
        raise ValueError(
            "Either `path` or all 3 of `{bed,bim,fam}_path` must be specified but not both"
        )
    if path:
        bed_path, bim_path, fam_path = [
            f"{path}.{ext}" for ext in ["bed", "bim", "fam"]
        ]
    # Fail early with a clear message rather than handing None to the
    # file readers below, which would surface as an unrelated error.
    if not (bed_path and bim_path and fam_path):
        raise ValueError(
            "Either `path` or all 3 of `{bed,bim,fam}_path` must be specified"
        )

    # Load axis data first to determine dimension sizes
    df_fam = read_fam(fam_path, sep=fam_sep)  # type: ignore[arg-type]
    df_bim = read_bim(bim_path, sep=bim_sep)  # type: ignore[arg-type]

    if persist:
        df_fam = df_fam.persist()
        df_bim = df_bim.persist()

    arr_fam = _to_dict(df_fam, dtype=FAM_ARRAY_DTYPE)
    arr_bim = _to_dict(df_bim, dtype=BIM_ARRAY_DTYPE)

    # Load genotyping data
    call_genotype = da.from_array(
        # Make sure to use asarray=False in order for masked arrays to propagate
        BedReader(bed_path, (len(df_bim), len(df_fam)), count_A1=count_a1),  # type: ignore[arg-type]
        chunks=chunks,
        # Lock must be true with multiprocessing dask scheduler
        # to not get bed-reader errors (it works w/ threading backend though)
        lock=lock,
        asarray=False,
        name=f"bed_reader:read_plink:{bed_path}",
    )

    # If contigs are already integers, use them as-is
    # NOTE(review): in this branch `variant_contig` holds the raw integer
    # values while `variant_contig_names` holds only their unique values --
    # confirm consumers do not treat `variant_contig` as an index into names.
    if bim_int_contig:
        variant_contig = arr_bim["contig"].astype("int16")
        variant_contig_names = da.unique(variant_contig).astype(str)
        variant_contig_names = list(variant_contig_names.compute())
    # Otherwise create index for contig names based
    # on order of appearance in underlying .bim file
    else:
        variant_contig, variant_contig_names = encode_array(
            arr_bim["contig"].compute()
        )
        variant_contig = variant_contig.astype("int16")
        variant_contig_names = list(variant_contig_names)

    variant_position = arr_bim["pos"]
    a1 = arr_bim["a1"].astype("str")
    a2 = arr_bim["a2"].astype("str")

    # Note: column_stack not implemented in Dask, must use [v|h]stack
    variant_alleles = da.hstack((a1[:, np.newaxis], a2[:, np.newaxis]))
    variant_alleles = variant_alleles.astype("S")
    variant_id = arr_bim["variant_id"]

    sample_id = arr_fam["member_id"]

    ds = create_genotype_call_dataset(
        variant_contig_names=variant_contig_names,
        variant_contig=variant_contig,
        variant_position=variant_position,
        variant_alleles=variant_alleles,
        sample_id=sample_id,
        call_genotype=call_genotype,
        variant_id=variant_id,
    )

    # Assign PLINK-specific pedigree fields
    ds = ds.assign(
        **{
            f"sample_{f}": (DIM_SAMPLE, arr_fam[f])
            for f in arr_fam
            if f != "member_id"
        }
    )
    return ds  # type: ignore[no-any-return]
def check(x: ArrayLike, values: ArrayLike, names: List[Any]) -> None:
    """Assert that `encode_array(x)` yields the given values and names.

    Parameters
    ----------
    x : ArrayLike
        Input passed directly to `encode_array`.
    values : ArrayLike
        Expected first element of the `encode_array` result.
    names : List[Any]
        Expected second element of the `encode_array` result.

    Raises
    ------
    AssertionError
        If either element of the result differs from its expectation.
    """
    encoded, labels = encode_array(x)
    # Compare the encoded values first, then the names, as separate checks
    for actual, expected in ((encoded, values), (labels, names)):
        np.testing.assert_equal(actual, expected)
def test_encode_array(
    x: List[Any], expected_values: List[Any], expected_names: List[Any]
) -> None:
    """Check `encode_array` output for one input/expectation triple.

    The arguments are presumably supplied by a test parametrization that
    is not visible in this part of the file.

    Parameters
    ----------
    x : List[Any]
        Raw input, converted with `np.array` before encoding.
    expected_values : List[Any]
        Expected first element of the `encode_array` result.
    expected_names : List[Any]
        Expected second element of the `encode_array` result.
    """
    array_input = np.array(x)
    actual_values, actual_names = encode_array(array_input)

    np.testing.assert_equal(actual_values, expected_values)
    np.testing.assert_equal(actual_names, expected_names)
def read_bgen(
    path: PathType,
    chunks: Union[str, int, Tuple[int, ...]] = "auto",
    lock: bool = False,
    persist: bool = True,
    dtype: Any = "float32",
) -> Dataset:
    """Read BGEN dataset.

    Loads a single BGEN dataset as dask arrays within a Dataset
    from a bgen file.

    Parameters
    ----------
    path : PathType
        Path to BGEN file.
    chunks : Union[str, int, tuple], optional
        Chunk size for genotype probability data (3 dimensions),
        by default "auto".
    lock : bool, optional
        Whether or not to synchronize concurrent reads of
        file blocks, by default False. This is passed through to
        [dask.array.from_array](https://docs.dask.org/en/latest/array-api.html#dask.array.from_array).
    persist : bool, optional
        Whether or not to persist variant information in
        memory, by default True. This is an important performance
        consideration as the metadata file for this data will
        be read multiple times when False.
    dtype : Any
        Genotype probability array data type, by default float32.

    Returns
    -------
    Dataset
        The dataset produced by `create_genotype_dosage_dataset` from the
        variant/sample metadata, the genotype probabilities and the
        dosage derived from them.

    Warnings
    --------
    Only bi-allelic, diploid BGEN files are currently supported.
    """
    reader = BgenReader(path, persist, dtype=dtype)

    # Contig names are encoded as integer codes plus the list of names
    contig_codes, contig_names = encode_array(reader.contig.compute())

    # Variant and sample metadata handed to the dataset constructor
    axis_fields = dict(
        variant_contig_names=list(contig_names),
        variant_contig=contig_codes.astype("int16"),
        variant_position=np.asarray(reader.pos, dtype=int),
        variant_alleles=np.asarray(reader.variant_alleles, dtype="S"),
        variant_id=np.asarray(reader.variant_id, dtype=str),
        sample_id=np.asarray(reader.sample_id, dtype=str),
    )

    # Wrap the reader itself as a dask array of genotype probabilities
    probabilities = da.from_array(
        reader,
        chunks=chunks,
        lock=lock,
        fancy=False,
        asarray=False,
        name=f"{reader.name}:read_bgen:{path}",
    )

    dataset: Dataset = create_genotype_dosage_dataset(
        call_dosage=_to_dosage(probabilities),
        call_genotype_probability=probabilities,
        **axis_fields,
    )
    return dataset