def _default_sample_ids(path: PathType) -> ArrayLike:
    """Return the sample ids stored in a BGEN file, or synthesize them.

    When the file reports that it contains no sample ids, ids are built by
    prefixing ``b"sample_"`` to the zero-based running number of each
    sample (as byte strings).
    """
    with bgen_file(path) as handle:
        if not handle.contain_samples:
            # One byte-string counter per sample reported by the file.
            numbers = np.arange(handle.nsamples).astype("S")
            return np.char.add(b"sample_", numbers)  # type: ignore[no-untyped-call]
        return handle.read_samples()
def __init__(
    self,
    path: PathType,
    metafile_path: Optional[PathType] = None,
    dtype: DType = "float32",
) -> None:
    """Open a BGEN file, make sure its metafile exists and record its layout.

    Parameters
    ----------
    path
        Location of the BGEN file.
    metafile_path
        Location of the metafile. When falsy, the BGEN path with its suffix
        replaced by ``.metafile`` is used. The metafile is generated if it
        does not exist yet.
    dtype
        Dtype of the arrays this reader produces. Dtypes of 8 bytes or more
        per item select 64-bit precision, anything smaller 32-bit.
    """
    self.path = Path(path)
    if metafile_path:
        self.metafile_path = Path(metafile_path)
    else:
        self.metafile_path = self.path.with_suffix(".metafile")

    with bgen_file(self.path) as handle:
        self.n_variants = handle.nvariants
        self.n_samples = handle.nsamples

        if not self.metafile_path.exists():
            began = time.time()
            logger.info(
                f"Generating BGEN metafile for '{self.path}' (this may take a while)"
            )
            handle.create_metafile(self.metafile_path, verbose=False)
            elapsed = time.time() - began
            logger.info(
                f"BGEN metafile generation complete ({elapsed:.0f} seconds)"
            )

        with bgen_metafile(self.metafile_path) as meta:
            # The metafile must describe the same number of variants as the
            # BGEN file it is paired with.
            assert self.n_variants == meta.nvariants
            self.npartitions = meta.npartitions
            self.partition_size = meta.partition_size

    # Array-like attributes: (variants, samples, 3 values per sample).
    self.shape = (self.n_variants, self.n_samples, 3)
    self.dtype = np.dtype(dtype)
    if self.dtype.itemsize >= 8:
        self.precision = 64
    else:
        self.precision = 32
    self.ndim = 3
def test_cbgen_error_create_metafile():
    """Check that ``create_metafile`` raises ``RuntimeError`` for a bogus target.

    The target path is a made-up location (presumably not writable or not
    existing on the test machine).
    """
    filepath = example.get("haplotypes.bgen")
    mfilepath = "/DmEkq/WkhDu/bla.metafile"
    # Open through the context manager so the BGEN handle is released even
    # though the test never reaches an explicit close().
    with bgen_file(filepath) as bgen:
        with pytest.raises(RuntimeError):
            bgen.create_metafile(mfilepath, verbose=False)
def __getitem__(self, idx: Any) -> np.ndarray:
    """Read a block of probabilities selected by a 3-item indexer.

    Parameters
    ----------
    idx
        Tuple with exactly ``self.ndim`` items, one per dimension
        (variants, samples, genotypes). Each item must be a ``slice`` or an
        ``int``. Slice bounds may be ``None`` or negative and the variant
        slice may carry a step; they are resolved against ``self.shape``
        the way sequence slicing does.

    Returns
    -------
    np.ndarray
        Array of dtype ``self.dtype``. Dimensions indexed with an ``int``
        are squeezed out. An empty variant selection returns an empty array
        of shape ``(0,) * self.ndim``.

    Raises
    ------
    IndexError
        If ``idx`` is not a tuple, has the wrong number of items, contains
        something other than slices/ints, or an ``int`` is out of bounds.
    """
    if not isinstance(idx, tuple):
        raise IndexError(f"Indexer must be tuple (received {type(idx)})")
    if len(idx) != self.ndim:
        raise IndexError(
            f"Indexer must have {self.ndim} items (received {len(idx)} slices)"
        )
    if not all(isinstance(i, (slice, int)) for i in idx):
        raise IndexError(
            f"Indexer must contain only slices or ints (received types {[type(i) for i in idx]})"
        )
    for axis, item in enumerate(idx):
        if isinstance(item, int) and not -self.shape[axis] <= item < self.shape[axis]:
            raise IndexError(
                f"Index {item} is out of bounds for axis {axis} with size {self.shape[axis]}"
            )

    # Determine which dims should have unit size in result
    squeeze_dims = tuple(axis for axis, item in enumerate(idx) if isinstance(item, int))

    # Convert all indexers to slices. `i + 1 or None` keeps -1 meaning
    # "the last element" instead of the empty slice(-1, 0).
    idx = tuple(slice(i, i + 1 or None) if isinstance(i, int) else i for i in idx)

    # Resolve None/negative bounds and the step of the variant indexer
    # against the number of variants.
    rows = range(*idx[0].indices(self.shape[0]))
    if len(rows) == 0:
        return np.empty((0,) * self.ndim, dtype=self.dtype)
    first = min(rows[0], rows[-1])
    last = max(rows[0], rows[-1])

    # Determine start and end partitions that correspond to the
    # given variant dimension indexer
    start_partition, start_partition_offset = divmod(first, self.partition_size)
    end_partition, end_partition_offset = divmod(last, self.partition_size)

    # Create a list of all offsets into the underlying file at which
    # data for each variant in [first, last] begins
    all_vaddr = []
    with bgen_metafile(self.metafile_path) as mf:
        for i in range(start_partition, end_partition + 1):
            partition = mf.read_partition(i)
            start_offset = start_partition_offset if i == start_partition else 0
            end_offset = (
                end_partition_offset + 1
                if i == end_partition
                else self.partition_size
            )
            vaddr = partition.variants.offset
            all_vaddr.extend(vaddr[start_offset:end_offset].tolist())

    # Keep only the requested variants, in the requested order, so that a
    # stepped or reversed slice does not read data it will discard.
    all_vaddr = all_vaddr[rows[0] - first :: rows.step]

    # Read the probabilities for each variant, apply indexer for
    # samples dimension to give probabilities for all genotypes,
    # and then apply final genotype dimension indexer
    with bgen_file(self.path) as bgen:
        res = None
        for i, vaddr in enumerate(all_vaddr):
            probs = bgen.read_probability(vaddr, precision=self.precision)[idx[1]]
            assert len(probs.shape) == 2 and probs.shape[1] == 3
            if res is None:
                # Allocated lazily: the sample count is only known once the
                # first variant has been read and sliced.
                res = np.zeros((len(all_vaddr), len(probs), 3), dtype=self.dtype)
            res[i] = probs
        res = res[..., idx[2]]  # type: ignore[index]
        return np.squeeze(res, axis=squeeze_dims)  # type: ignore[no-any-return]
def get_samples(bgen_filepath, verbose: bool) -> Series:
    """Return the sample ids of a bgen file, generating them when absent.

    Parameters
    ----------
    bgen_filepath
        Path of the bgen file to open.
    verbose
        When ``True`` and the file contains no sample ids, print a notice
        that ids are being generated.

    Returns
    -------
    The ids read from the file when it contains them, otherwise the result
    of ``generate_samples`` for the number of samples in the file.
    """
    with bgen_file(bgen_filepath) as bgen:
        if bgen.contain_samples:
            return bgen.read_samples()

        if verbose:
            # The literals are concatenated: each piece needs its own
            # separating space (the first one used to be missing).
            print(
                "Sample IDs are not present in this file. "
                "I will generate them on my own:"
                " sample_1, sample_2, and so on."
            )
        return generate_samples(bgen.nsamples)
def create_metafile(
    bgen_filepath: Union[str, Path],
    metafile_filepath: Union[str, Path],
    verbose: bool = True,
):
    """
    Build the variants metadata file ("metafile") of a bgen file.

    The metafile helps speed up subsequent reads of the associated bgen
    file.

    Parameters
    ----------
    bgen_filepath : str
        Bgen file path. It is checked with ``assert_file_exist`` and
        ``assert_file_readable`` before anything is written.
    metafile_filepath : str
        Path at which the metafile is created.
    verbose : bool
        ``True`` to show progress; ``False`` otherwise.

    Raises
    ------
    ValueError
        If ``metafile_filepath`` already exists.

    Examples
    --------
    .. doctest::

        >>> import os
        >>> from bgen_reader import create_metafile, example_filepath
        >>>
        >>> filepath = example_filepath("example.32bits.bgen")
        >>> metafile_filepath = filepath.with_suffix(".metafile")
        >>>
        >>> try:
        ...     create_metafile(filepath, metafile_filepath, verbose=False)
        ... finally:
        ...     if metafile_filepath.exists():
        ...         os.remove(metafile_filepath)
    """
    source = Path(bgen_filepath)
    target = Path(metafile_filepath)

    assert_file_exist(source)
    assert_file_readable(source)

    # Never overwrite an existing metafile.
    if target.exists():
        raise ValueError(f"File {target} already exists.")

    with bgen_file(source) as handle:
        handle.create_metafile(target, verbose)
def test_cbgen_large1(tmp_path):
    """Check header, metafile layout and one genotype of the large example file.

    Both handles are opened through context managers so they are closed even
    when an assertion fails part-way through.
    """
    filepath = example.get("merged_487400x220000.bgen")
    mfilepath = tmp_path / f"{filepath.name}.metafile"

    with bgen_file(filepath) as bgen:
        assert bgen.nvariants == 220000
        assert bgen.nsamples == 487400
        assert not bgen.contain_samples
        # Reading samples from a file without sample ids must fail.
        with pytest.raises(RuntimeError):
            bgen.read_samples()

        bgen.create_metafile(mfilepath, verbose=True)

        with bgen_metafile(mfilepath) as mf:
            assert mf.filepath.name == mfilepath.name
            assert mf.npartitions == 469
            assert mf.nvariants == 220000
            assert mf.partition_size == 470

            part = mf.read_partition(5)
            assert len(part.variants) == 470
            assert part.variants.id[0] == b"sid_1_2350"
            assert part.variants.rsid[0] == b"sid_1_2350"
            assert part.variants.chrom[0] == b"1"
            assert part.variants.position[0] == 2351
            assert part.variants.nalleles[0] == 2
            assert part.variants.allele_ids[0] == b"A,C"

            voff = part.variants.offset[0]
            gt = bgen.read_genotype(voff)
            assert_allclose(gt.probs.shape, (487400, 3))
            assert_allclose(nansum(gt.probs, 0), [475743.0, 0.0, 0.0])
            assert_allclose(isnan(gt.probs).sum(0), [11657, 11657, 11657])
            assert not gt.phased
            assert_allclose(gt.ploidy.sum(), 974800)
            assert_allclose(gt.missing.sum(), 11657)
def read_bgen(
    filepath: Union[str, Path],
    metafile_filepath: Optional[Union[str, Path]] = None,
    samples_filepath: Optional[Union[str, Path]] = None,
    verbose: bool = True,
):
    """
    Read a given BGEN file.

    Parameters
    ----------
    filepath
        Bgen file path.
    metafile_filepath
        File path to the corresponding metafile. A metafile can be created by calling
        :func:`bgen_reader.create_metafile`. If ``None``, the metafile location is
        inferred from ``filepath`` and the metafile is created automatically when it
        does not exist. A metafile older than the bgen file is recreated.
        Defaults to ``None``.
    samples_filepath
        Path to a `sample format`_ file or ``None`` to read samples from the bgen file
        itself. Defaults to ``None``.
    verbose
        ``True`` to show progress; ``False`` otherwise. Defaults to ``True``.

    Returns
    -------
    dict
        Dictionary with the following keys:

        variants : :class:`dask.dataframe.DataFrame`
            Variant position, chromosomes, rsids, etc.
        samples : :class:`pandas.Series`
            Sample identifications.
        genotype : list
            List of genotypes.

    Examples
    --------
    .. doctest::

        >>> from bgen_reader import example_filepath, read_bgen
        >>>
        >>> bgen = read_bgen(example_filepath("haplotypes.bgen"), verbose=False)
        >>> variants = bgen["variants"]
        >>> samples = bgen["samples"]
        >>>
        >>> v = variants.loc[0].compute()
        >>> g = bgen["genotype"][0].compute()
        >>> print(v)
             id rsid chrom  pos  nalleles allele_ids  vaddr
        0  SNP1  RS1     1    1         2        A,G    102
        >>> print(samples)
        0    sample_0
        1    sample_1
        2    sample_2
        3    sample_3
        Name: id, dtype: object
        >>> print(g["probs"][0])
        [1. 0. 1. 0.]

    .. _sample format: https://www.well.ox.ac.uk/~gav/qctool/documentation/sample_file_formats.html
    """
    filepath = Path(filepath)
    assert_file_exist(filepath)
    assert_file_readable(filepath)

    if metafile_filepath is None:
        metafile_filepath = infer_metafile_filepath(filepath)
    else:
        metafile_filepath = Path(metafile_filepath)
        assert_file_exist(metafile_filepath)
        # Check the metafile itself; the bgen file was already checked above.
        assert_file_readable(metafile_filepath)

    if not metafile_filepath.exists():
        if verbose:
            print(
                f"We will create the metafile `{metafile_filepath}`. This file will "
                "speed up further\nreads and only need to be created once. So, please, "
                "bear with me."
            )
        create_metafile(filepath, metafile_filepath, verbose)
    elif os.path.getmtime(metafile_filepath) < os.path.getmtime(filepath):
        # The bgen file is newer than its metafile: cached data derived from
        # the old pair is dropped and the metafile is rebuilt from scratch.
        from ._genotype import cache as bgencache
        from ._metafile import cache as metacache

        metacache.clear()
        bgencache.clear()

        if verbose:
            print(
                f"File `{filepath}` has been modified after the creation of `{metafile_filepath}`."
                "\nWe will therefore recreate the metadata file. So, please, bear with me."
            )
        os.unlink(metafile_filepath)
        create_metafile(filepath, metafile_filepath, verbose)

    with bgen_file(filepath) as bgen:
        samples = _get_samples(bgen, samples_filepath, verbose)

        with bgen_metafile(metafile_filepath) as mf:
            nvariants = mf.nvariants
            npartitions = mf.npartitions
            part_size = mf.partition_size

        variants = create_variants(metafile_filepath, nvariants, npartitions, part_size)
        # Built while the bgen handle is still open.
        genotype = create_genotypes(bgen, metafile_filepath, verbose)

    return dict(variants=variants, samples=samples, genotype=genotype)
def test_cbgen_phased_genotype(tmp_path):
    """Check header, samples, metafile and two phased genotypes of ``haplotypes.bgen``.

    Both handles are opened through context managers so they are closed even
    when an assertion fails part-way through.
    """
    filepath = example.get("haplotypes.bgen")
    mfilepath = tmp_path / f"{filepath.name}.metafile"

    with bgen_file(filepath) as bgen:
        assert bgen.filepath.name == "haplotypes.bgen"
        assert bgen.nvariants == 4
        assert bgen.nsamples == 4
        assert bgen.contain_samples
        samples = bgen.read_samples()
        assert_array_equal(
            samples, [b"sample_0", b"sample_1", b"sample_2", b"sample_3"]
        )

        bgen.create_metafile(mfilepath, verbose=False)

        with bgen_metafile(mfilepath) as mf:
            assert mf.filepath.name == mfilepath.name
            assert mf.npartitions == 1
            assert mf.nvariants == 4
            assert mf.partition_size == 4

            part = mf.read_partition(0)

            # First variant of the partition.
            assert part.variants.id[0] == b"SNP1"
            assert part.variants.rsid[0] == b"RS1"
            assert part.variants.chrom[0] == b"1"
            assert part.variants.position[0] == 1
            assert part.variants.nalleles[0] == 2
            assert part.variants.allele_ids[0] == b"A,G"

            voff = part.variants.offset[0]
            gt = bgen.read_genotype(voff)
            assert_allclose(
                gt.probs,
                [
                    [1.0, 0.0, 1.0, 0.0],
                    [0.0, 1.0, 1.0, 0.0],
                    [1.0, 0.0, 0.0, 1.0],
                    [0.0, 1.0, 0.0, 1.0],
                ],
            )
            assert gt.phased
            assert_allclose(gt.ploidy, [2, 2, 2, 2])
            assert_allclose(gt.missing, [False, False, False, False])

            # Last variant of the partition.
            assert part.variants.id[3] == b"SNP4"
            assert part.variants.rsid[3] == b"RS4"
            assert part.variants.chrom[3] == b"1"
            assert part.variants.position[3] == 4
            assert part.variants.nalleles[3] == 2
            assert part.variants.allele_ids[3] == b"A,G"

            voff = part.variants.offset[3]
            gt = bgen.read_genotype(voff)
            assert_allclose(
                gt.probs,
                [
                    [0.0, 1.0, 0.0, 1.0],
                    [1.0, 0.0, 1.0, 0.0],
                    [0.0, 1.0, 1.0, 0.0],
                    [1.0, 0.0, 0.0, 1.0],
                ],
            )
            assert gt.phased
            assert_allclose(gt.ploidy, [2, 2, 2, 2])
            assert_allclose(gt.missing, [False, False, False, False])
def test_cbgen_complex_unphased(tmp_path: Path):
    """Check the sample-less complex example: header, metafile and unphased reads.

    Also verifies that reading a genotype at any offset that is not the start
    of a variant, and reading a partition past the last one, raise
    ``RuntimeError``.
    """
    bgen_path = example.get("complex.23bits.no.samples.bgen")
    meta_path = tmp_path / f"{bgen_path.name}.metafile"

    with bgen_file(bgen_path) as reader:
        assert reader.filepath.name == "complex.23bits.no.samples.bgen"
        assert reader.nvariants == 10
        assert reader.nsamples == 4
        assert not reader.contain_samples
        with pytest.raises(RuntimeError):
            reader.read_samples()
        reader.create_metafile(meta_path, verbose=False)

    with bgen_metafile(meta_path) as meta:
        assert meta.filepath.name == meta_path.name
        assert meta.npartitions == 1
        assert meta.nvariants == 10
        assert meta.partition_size == 10

        block = meta.read_partition(0)
        assert block.variants.id[0] == b""
        assert block.variants.rsid[0] == b"V1"
        assert block.variants.chrom[0] == b"01"
        assert block.variants.position[0] == 1
        assert block.variants.nalleles[0] == 2
        assert block.variants.allele_ids[0] == b"A,G"

        with bgen_file(bgen_path) as reader:
            # First variant: mixed ploidy, hence the nan padding.
            first = reader.read_genotype(block.variants.offset[0])
            assert_allclose(
                first.probs,
                [[1.0, 0.0, nan], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
            )
            assert not first.phased
            assert_allclose(first.ploidy, [1, 2, 2, 2])
            assert_allclose(first.missing, [False, False, False, False])

            # Last variant of the partition.
            final = reader.read_genotype(block.variants.offset[-1])
            assert_allclose(
                final.probs,
                [
                    [1.0, 0.0, 0.0, 0.0, 0.0],
                    [0.0, 1.0, 0.0, 0.0, 0.0],
                    [0.0, 0.0, 1.0, 0.0, 0.0],
                    [0.0, 0.0, 0.0, 1.0, 0.0],
                ],
            )
            assert not final.phased
            assert_allclose(final.ploidy, [4, 4, 4, 4])
            assert_allclose(final.missing, [False, False, False, False])

            # Every offset up to the largest valid one that is not itself a
            # variant offset must be rejected.
            known = set(block.variants.offset)
            candidates = set(range(int(max(known)) + 1))
            for bogus in candidates - known:
                with pytest.raises(RuntimeError):
                    reader.read_genotype(bogus)

        # Only partition 0 exists.
        with pytest.raises(RuntimeError):
            meta.read_partition(1)
def read_genotype_partition(bgen_filepath: Path, offsets):
    """Read the genotype stored at each of the given offsets of a bgen file.

    The file is opened once for the whole batch. The result is a list with
    one ``read_genotype`` result per offset, in the order given.
    """
    genotypes = []
    with bgen_file(bgen_filepath) as handle:
        for position in offsets:
            genotypes.append(handle.read_genotype(position))
    return genotypes
def test_cbgen_nonexistent_bgen_file():
    """Opening the made-up path ``/Fmw/DiKel`` is expected to raise ``RuntimeError``."""
    bogus_path = "/Fmw/DiKel"
    with pytest.raises(RuntimeError):
        bgen_file(bogus_path)
def __init__(self):
    """Resolve the large example BGEN file and create its metafile.

    The metafile is written to the relative path ``metafile``.
    """
    source = cbgen.example.get("merged_487400x220000.bgen")
    self._filepath = source
    self._mfilepath = Path("metafile")
    with cbgen.bgen_file(source) as handle:
        handle.create_metafile(self._mfilepath, verbose=False)
def time_create_metafile(self):
    """Benchmark metafile creation, writing into a throwaway temporary directory."""
    with tempfile.TemporaryDirectory() as scratch, cbgen.bgen_file(
        self._filepath
    ) as handle:
        target = Path(scratch) / "metafile"
        handle.create_metafile(target, verbose=False)
def time_bgen_file(self):
    """Benchmark opening the BGEN file and closing it again right away."""
    source = self._filepath
    with cbgen.bgen_file(source):
        # Nothing to do: only the open/close round trip is being measured.
        pass