def __getitem__(self, idx):
    if not isinstance(idx, tuple):
        raise IndexError(f"Indexer must be tuple (received {type(idx)})")
    if len(idx) != self.ndim:
        raise IndexError(
            f"Indexer must be {self.ndim}-item tuple (received {len(idx)} slices)"
        )
    if idx[0].start == idx[0].stop:
        return np.empty((0,) * self.ndim, dtype=self.dtype)
    # Map the variant-dimension slice onto metafile partitions
    start_partition = idx[0].start // self.partition_size
    start_partition_offset = idx[0].start % self.partition_size
    end_partition = (idx[0].stop - 1) // self.partition_size
    end_partition_offset = (idx[0].stop - 1) % self.partition_size
    # Collect the file offset (vaddr) of every requested variant
    all_vaddr = []
    with bgen_metafile(self.metafile_filepath) as mf:
        for i in range(start_partition, end_partition + 1):
            partition = mf.read_partition(i)
            start_offset = start_partition_offset if i == start_partition else 0
            end_offset = (
                end_partition_offset + 1
                if i == end_partition
                else self.partition_size
            )
            vaddr = partition["vaddr"].tolist()
            all_vaddr.extend(vaddr[start_offset:end_offset])
    # Read the probabilities for each variant, then apply the
    # sample- and genotype-dimension indexers
    with bgen_file(self.path) as bgen:
        genotypes = [bgen.read_genotype(vaddr) for vaddr in all_vaddr]
        probs = [genotype["probs"] for genotype in genotypes]
        return np.stack(probs)[:, idx[1], idx[2]]
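# --- Hedged sketch (not from the source): the slice-to-partition arithmetic used by
# __getitem__ above, isolated with plain integers so it can be checked directly.
# The partition size and slice bounds below are made-up example values.
def partition_bounds(start, stop, partition_size):
    """Return (first partition, offset into it, last partition, offset into it)."""
    start_partition = start // partition_size
    start_partition_offset = start % partition_size
    end_partition = (stop - 1) // partition_size
    end_partition_offset = (stop - 1) % partition_size
    return start_partition, start_partition_offset, end_partition, end_partition_offset

# Variants 250..999 with 100-variant partitions span partitions 2 through 9.
assert partition_bounds(250, 1000, 100) == (2, 50, 9, 99)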
def __init__(self, path, dtype=np.float32):
    self.path = Path(path)
    self.metafile_filepath = _infer_metafile_filepath(Path(self.path))
    if not self.metafile_filepath.exists():
        create_metafile(path, self.metafile_filepath, verbose=False)
    with bgen_metafile(self.metafile_filepath) as mf:
        self.n_variants = mf.nvariants
        self.npartitions = mf.npartitions
        self.partition_size = mf.partition_size
        # This may need chunking for large numbers of variants
        variants_df = mf.create_variants().compute()
        self.variant_id = variants_df["id"].tolist()
        self.contig = variants_df["chrom"].tolist()
        self.pos = variants_df["pos"].tolist()
        allele_ids = variants_df["allele_ids"].tolist()
        self.a1, self.a2 = tuple(zip(*[id.split(",") for id in allele_ids]))
    with bgen_file(self.path) as bgen:
        sample_path = self.path.with_suffix(".sample")
        if sample_path.exists():
            self.samples = read_samples_file(sample_path, verbose=False)
        else:
            if bgen.contain_samples:
                self.samples = bgen.read_samples()
            else:
                self.samples = generate_samples(bgen.nsamples)
    self.shape = (self.n_variants, len(self.samples), 3)
    self.dtype = dtype
    self.ndim = 3
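# --- Hedged usage sketch (not from the source): a reader like the one above exposes
# .shape, .dtype, .ndim, and __getitem__, which is the protocol dask.array.from_array
# expects, so it can back a lazy dask array. "BgenReader" is the assumed class name
# and "example.bgen" is a hypothetical path; running this requires a real bgen file.
import dask.array as da

reader = BgenReader("example.bgen")
# One chunk per metafile partition keeps each __getitem__ call within one partition.
probs = da.from_array(reader, chunks=(reader.partition_size, -1, -1))
head = probs[:10].compute()  # reads only the partitions covering variants 0..9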
def __init__(
    self, path: PathType, persist: bool = True, dtype: Any = np.float32
) -> None:
    self.path = Path(path)
    self.metafile_filepath = infer_metafile_filepath(Path(self.path))
    if not self.metafile_filepath.exists():
        create_metafile(path, self.metafile_filepath, verbose=False)
    with bgen_metafile(self.metafile_filepath) as mf:
        self.n_variants = mf.nvariants
        self.npartitions = mf.npartitions
        self.partition_size = mf.partition_size

        df = mf.create_variants()
        if persist:
            df = df.persist()
        variant_arrs = _to_dict(df, dtype=VARIANT_ARRAY_DTYPE)

        self.variant_id = variant_arrs["id"]
        self.contig = variant_arrs["chrom"]
        self.pos = variant_arrs["pos"]

        def split_alleles(alleles: np.ndarray, block_info: Any = None) -> np.ndarray:
            # map_blocks probes with empty meta; pass that straight through
            if block_info is None or len(block_info) == 0:
                return alleles

            def split(allele_row: np.ndarray) -> np.ndarray:
                alleles_list = allele_row[0].split(",")
                assert len(alleles_list) == 2  # bi-allelic
                return np.array(alleles_list)

            return np.apply_along_axis(split, 1, alleles[:, np.newaxis])

        variant_alleles = variant_arrs["allele_ids"].map_blocks(split_alleles)

        def max_str_len(arr: ArrayLike) -> Any:
            return arr.map_blocks(
                lambda s: np.char.str_len(s.astype(str)), dtype=np.int8
            ).max()

        # .max() already reduces to a scalar, so only a cast is needed here
        max_allele_length = int(max_str_len(variant_alleles).compute())
        self.variant_alleles = variant_alleles.astype(f"S{max_allele_length}")

    with bgen_file(self.path) as bgen:
        sample_path = self.path.with_suffix(".sample")
        if sample_path.exists():
            self.sample_id = read_samples_file(sample_path, verbose=False)
        else:
            if bgen.contain_samples:
                self.sample_id = bgen.read_samples()
            else:
                self.sample_id = generate_samples(bgen.nsamples)

    self.shape = (self.n_variants, len(self.sample_id), 3)
    self.dtype = dtype
    self.ndim = 3
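# --- Hedged sketch (not from the source): a pure-NumPy rendering of the
# split_alleles / max_str_len logic above. Comma-joined allele ids become a
# two-column array, and the fixed-width "S{n}" dtype is sized from the longest allele.
import numpy as np

allele_ids = ["A,G", "C,T", "AT,A"]                      # synthetic bi-allelic ids
alleles = np.array([s.split(",") for s in allele_ids])   # shape (3, 2)
max_allele_length = int(np.char.str_len(alleles).max())  # -> 2
print(alleles.astype(f"S{max_allele_length}"))           # two-column byte strings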
def __init__(
    self,
    filepath: Union[str, Path],
    samples_filepath: Optional[Union[str, Path]] = None,
    verbose: bool = True,
):
    filepath = Path(filepath)
    assert_file_exist(filepath)
    assert_file_readable(filepath)

    self._verbose = verbose
    self._filepath = filepath

    self._bgen_context_manager = bgen_file(filepath)
    self._bgen = self._bgen_context_manager.__enter__()

    self._samples = np.array(
        _get_samples(self._bgen, samples_filepath, self._verbose), dtype="str"
    )
    self._sample_range = np.arange(len(self._samples), dtype=np.int64)

    # LATER could make a version of this method public
    # resolve() is needed because of the tmp_cwd below
    metadata2 = self._metadatapath_from_filename(filepath).resolve()
    if metadata2.exists() and getmtime(metadata2) < getmtime(filepath):
        metadata2.unlink()  # stale: the bgen file changed after the cache was written
    if metadata2.exists():
        d = np.load(str(metadata2))
        self._ids = d["ids"]
        self._rsids = d["rsids"]
        self._vaddr = d["vaddr"]
        self._chromosomes = d["chromosomes"]
        self._positions = d["positions"]
        self._nalleles = d["nalleles"]
        self._allele_ids = d["allele_ids"]
        self._ncombinations = d["ncombinations"]
        self._phased = d["phased"]
    else:
        with tmp_cwd():
            metafile_filepath = Path("bgen.metadata")
            self._bgen.create_metafile(metafile_filepath, verbose=self._verbose)
            self._map_metadata(metafile_filepath)
            np.savez(
                metadata2,
                ids=self._ids,
                rsids=self._rsids,
                vaddr=self._vaddr,
                chromosomes=self._chromosomes,
                positions=self._positions,
                nalleles=self._nalleles,
                allele_ids=self._allele_ids,
                ncombinations=self._ncombinations,
                phased=self._phased,
            )

    self._max_combinations = max(self._ncombinations)
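# --- Hedged sketch (not from the source): the mtime-based cache invalidation used
# above, reduced to a self-contained helper. The cached .npz is discarded whenever
# the source file is newer than the cache; all names here are illustrative.
from os.path import getmtime
from pathlib import Path
import numpy as np

def load_or_rebuild(source: Path, cache: Path, build):
    if cache.exists() and getmtime(cache) < getmtime(source):
        cache.unlink()  # source changed after the cache was written: rebuild
    if cache.exists():
        return dict(np.load(str(cache)))
    arrays = build()  # expected to return a dict of name -> ndarray
    np.savez(cache, **arrays)
    return arrays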
def __getitem__(self, idx: Any) -> np.ndarray:
    if not isinstance(idx, tuple):
        raise IndexError(f"Indexer must be tuple (received {type(idx)})")
    if len(idx) != self.ndim:
        raise IndexError(
            f"Indexer must have {self.ndim} items (received {len(idx)} slices)"
        )
    if not all(isinstance(i, (slice, int)) for i in idx):
        raise IndexError(
            f"Indexer must contain only slices or ints (received types {[type(i) for i in idx]})"
        )
    # Determine which dims should have unit size in result
    squeeze_dims = tuple(i for i in range(len(idx)) if isinstance(idx[i], int))
    # Convert all indexers to slices
    idx = tuple(slice(i, i + 1) if isinstance(i, int) else i for i in idx)

    if idx[0].start == idx[0].stop:
        return np.empty((0,) * self.ndim, dtype=self.dtype)

    # Determine start and end partitions that correspond to the
    # given variant dimension indexer
    start_partition = idx[0].start // self.partition_size
    start_partition_offset = idx[0].start % self.partition_size
    end_partition = (idx[0].stop - 1) // self.partition_size
    end_partition_offset = (idx[0].stop - 1) % self.partition_size

    # Create a list of all offsets into the underlying file at which
    # data for each variant begins
    all_vaddr = []
    with bgen_metafile(self.metafile_filepath) as mf:
        for i in range(start_partition, end_partition + 1):
            partition = mf.read_partition(i)
            start_offset = start_partition_offset if i == start_partition else 0
            end_offset = (
                end_partition_offset + 1
                if i == end_partition
                else self.partition_size
            )
            vaddr = partition["vaddr"].tolist()
            all_vaddr.extend(vaddr[start_offset:end_offset])

    # Read the probabilities for each variant, apply indexer for
    # samples dimension to give probabilities for all genotypes,
    # and then apply final genotype dimension indexer
    with bgen_file(self.path) as bgen:
        res = None
        for i, vaddr in enumerate(all_vaddr):
            probs = bgen.read_genotype(vaddr)["probs"][idx[1]]
            assert len(probs.shape) == 2 and probs.shape[1] == 3
            if res is None:
                res = np.zeros((len(all_vaddr), len(probs), 3), dtype=self.dtype)
            res[i] = probs
        res = res[..., idx[2]]  # type: ignore[index]
        return np.squeeze(res, axis=squeeze_dims)
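# --- Hedged sketch (not from the source): the int-to-slice normalization and final
# squeeze performed by __getitem__ above, verified against plain NumPy indexing.
import numpy as np

idx = (0, slice(None), slice(0, 2))  # equivalent of reader[0, :, 0:2]
squeeze_dims = tuple(d for d, i in enumerate(idx) if isinstance(i, int))
as_slices = tuple(slice(i, i + 1) if isinstance(i, int) else i for i in idx)

arr = np.arange(2 * 3 * 3).reshape(2, 3, 3)
out = np.squeeze(arr[as_slices], axis=squeeze_dims)
assert out.shape == (3, 2)            # variant dimension squeezed away
assert np.array_equal(out, arr[idx])  # matches direct integer indexing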