예제 #1
0
def _default_sample_ids(path: PathType) -> ArrayLike:
    """Return the sample ids for the BGEN file at ``path``.

    Ids stored in the file are returned as read. When the file reports
    that it contains no sample ids, ids of the form ``b"sample_<i>"`` are
    generated for ``i`` in ``range(nsamples)``.
    """
    with bgen_file(path) as bgen:
        if not bgen.contain_samples:
            # Build byte-string ordinals and prefix each with b"sample_".
            ordinals = np.arange(bgen.nsamples).astype("S")
            return np.char.add(b"sample_", ordinals)  # type: ignore[no-untyped-call]
        return bgen.read_samples()
예제 #2
0
    def __init__(
        self,
        path: PathType,
        metafile_path: Optional[PathType] = None,
        dtype: DType = "float32",
    ) -> None:
        """Record the dimensions of a BGEN file and ensure a metafile exists.

        Parameters
        ----------
        path
            Location of the BGEN file.
        metafile_path
            Location of the metafile. When falsy, the BGEN path with its
            suffix replaced by ``.metafile`` is used.
        dtype
            Data type of the arrays produced by this reader.
        """
        self.path = Path(path)
        if metafile_path:
            self.metafile_path = Path(metafile_path)
        else:
            self.metafile_path = self.path.with_suffix(".metafile")

        with bgen_file(self.path) as bgen:
            self.n_variants = bgen.nvariants
            self.n_samples = bgen.nsamples

            metafile_missing = not self.metafile_path.exists()
            if metafile_missing:
                # Time the metafile generation so its cost shows up in the log.
                start = time.time()
                logger.info(
                    f"Generating BGEN metafile for '{self.path}' (this may take a while)"
                )
                bgen.create_metafile(self.metafile_path, verbose=False)
                stop = time.time()
                logger.info(
                    f"BGEN metafile generation complete ({stop - start:.0f} seconds)"
                )

            with bgen_metafile(self.metafile_path) as mf:
                # The metafile must describe the same number of variants
                # as the BGEN file it belongs to.
                assert self.n_variants == mf.nvariants
                self.npartitions = mf.npartitions
                self.partition_size = mf.partition_size

        self.shape = (self.n_variants, self.n_samples, 3)
        self.dtype = np.dtype(dtype)
        # Read with 64-bit precision only when the dtype is 8+ bytes wide.
        self.precision = 32 if self.dtype.itemsize < 8 else 64
        self.ndim = 3
예제 #3
0
파일: test_bgen.py 프로젝트: limix/cbgen
def test_cbgen_error_create_metafile():
    """Creating a metafile at an unwritable location raises RuntimeError."""
    filepath = example.get("haplotypes.bgen")
    mfilepath = "/DmEkq/WkhDu/bla.metafile"

    # Use the context manager so the BGEN handle is closed when the test
    # finishes, instead of being left open.
    with bgen_file(filepath) as bgen:
        with pytest.raises(RuntimeError):
            bgen.create_metafile(mfilepath, verbose=False)
예제 #4
0
    def __getitem__(self, idx: Any) -> np.ndarray:
        """Read the block of probabilities selected by ``idx``.

        ``idx`` must be a tuple with ``self.ndim`` entries, each a slice or
        an int, indexing the variant, sample and genotype dimensions in
        that order. Dimensions indexed with an int are squeezed out of the
        result.

        Raises
        ------
        IndexError
            If ``idx`` is not a tuple, has the wrong number of entries, or
            contains anything other than slices and ints.
        """
        if not isinstance(idx, tuple):
            raise IndexError(f"Indexer must be tuple (received {type(idx)})")
        if len(idx) != self.ndim:
            raise IndexError(
                f"Indexer must have {self.ndim} items (received {len(idx)} slices)"
            )
        if any(not isinstance(item, (slice, int)) for item in idx):
            raise IndexError(
                f"Indexer must contain only slices or ints (received types {[type(i) for i in idx]})"
            )

        # Dimensions addressed by an int are dropped from the result.
        squeeze_dims = tuple(
            axis for axis, item in enumerate(idx) if isinstance(item, int)
        )
        # From here on every indexer is a slice.
        idx = tuple(
            slice(item, item + 1) if isinstance(item, int) else item
            for item in idx
        )

        variant_slice = idx[0]
        if variant_slice.start == variant_slice.stop:
            return np.empty((0,) * self.ndim, dtype=self.dtype)

        # Locate the first and last metafile partitions touched by the
        # variant slice, plus the position inside each of them.
        start_partition, start_partition_offset = divmod(
            variant_slice.start, self.partition_size
        )
        end_partition, end_partition_offset = divmod(
            variant_slice.stop - 1, self.partition_size
        )

        # Gather the file offset at which each selected variant begins.
        all_vaddr = []
        with bgen_metafile(self.metafile_path) as mf:
            for part_no in range(start_partition, end_partition + 1):
                offsets = mf.read_partition(part_no).variants.offset
                lo = start_partition_offset if part_no == start_partition else 0
                if part_no == end_partition:
                    hi = end_partition_offset + 1
                else:
                    hi = self.partition_size
                all_vaddr.extend(offsets[lo:hi].tolist())

        # Read each variant's probabilities, keep the requested samples,
        # and finally apply the genotype indexer to the stacked result.
        with bgen_file(self.path) as bgen:
            res = None
            for row, vaddr in enumerate(all_vaddr):
                variant_probs = bgen.read_probability(
                    vaddr, precision=self.precision
                )
                probs = variant_probs[idx[1]]
                assert len(probs.shape) == 2 and probs.shape[1] == 3
                if res is None:
                    # Allocate lazily: the sample count is only known
                    # after the first read.
                    res = np.zeros(
                        (len(all_vaddr), len(probs), 3), dtype=self.dtype
                    )
                res[row] = probs
            res = res[..., idx[2]]  # type: ignore[index]
            return np.squeeze(  # type: ignore[no-any-return]
                res, axis=squeeze_dims
            )
예제 #5
0
def get_samples(bgen_filepath, verbose: bool) -> Series:
    """Return the sample ids of a BGEN file, generating them if absent.

    Parameters
    ----------
    bgen_filepath
        Path to the BGEN file.
    verbose
        When ``True``, print a notice if the ids have to be generated.

    Returns
    -------
    The ids read from the file when it contains them; otherwise the result
    of ``generate_samples`` for the file's sample count.
    """
    with bgen_file(bgen_filepath) as bgen:

        if bgen.contain_samples:
            samples = bgen.read_samples()

        else:
            if verbose:
                # Adjacent literals are concatenated verbatim, so each
                # fragment carries its own separating space.
                print("Sample IDs are not present in this file. "
                      "I will generate them on my own:"
                      " sample_1, sample_2, and so on.")
            samples = generate_samples(bgen.nsamples)

    return samples
예제 #6
0
def create_metafile(
    bgen_filepath: Union[str, Path],
    metafile_filepath: Union[str, Path],
    verbose: bool = True,
):
    """
    Create variants metadata file.

    A variants metadata file speeds up subsequent reads of the bgen file it
    was created from.

    Parameters
    ----------
    bgen_filepath : str
        Bgen file path.
    metafile_filepath : str
        Path at which the metafile is written. It must not exist yet.
    verbose : bool
        ``True`` to show progress; ``False`` otherwise.

    Raises
    ------
    ValueError
        If ``metafile_filepath`` already exists.

    Examples
    --------
    .. doctest::

        >>> import os
        >>> from bgen_reader import create_metafile, example_filepath
        >>>
        >>> filepath = example_filepath("example.32bits.bgen")
        >>> metafile_filepath = filepath.with_suffix(".metafile")
        >>>
        >>> try:
        ...     create_metafile(filepath, metafile_filepath, verbose=False)
        ... finally:
        ...     if metafile_filepath.exists():
        ...         os.remove(metafile_filepath)
    """
    source = Path(bgen_filepath)
    metafile_filepath = Path(metafile_filepath)

    # Validate the input file before touching the output location.
    for check in (assert_file_exist, assert_file_readable):
        check(source)

    # Refuse to overwrite an existing metafile.
    if metafile_filepath.exists():
        raise ValueError(f"File {metafile_filepath} already exists.")

    with bgen_file(source) as bgen:
        bgen.create_metafile(metafile_filepath, verbose)
예제 #7
0
def test_cbgen_large1(tmp_path):
    """Check header, metafile and one genotype of the large example file."""
    filepath = example.get("merged_487400x220000.bgen")
    mfilepath = tmp_path / f"{filepath.name}.metafile"

    # Context managers close both handles even when an assertion fails
    # part-way through the test.
    with bgen_file(filepath) as bgen:
        assert bgen.nvariants == 220000
        assert bgen.nsamples == 487400
        assert not bgen.contain_samples

        # The file stores no sample ids, so reading them must fail.
        with pytest.raises(RuntimeError):
            bgen.read_samples()

        bgen.create_metafile(mfilepath, verbose=True)

        with bgen_metafile(mfilepath) as mf:
            assert mf.filepath.name == mfilepath.name
            assert mf.npartitions == 469
            assert mf.nvariants == 220000
            assert mf.partition_size == 470

            part = mf.read_partition(5)

            assert len(part.variants) == 470
            assert part.variants.id[0] == b"sid_1_2350"
            assert part.variants.rsid[0] == b"sid_1_2350"
            assert part.variants.chrom[0] == b"1"
            assert part.variants.position[0] == 2351
            assert part.variants.nalleles[0] == 2
            assert part.variants.allele_ids[0] == b"A,C"

            # Read the genotype stored at the first variant's file offset.
            voff = part.variants.offset[0]
            gt = bgen.read_genotype(voff)
            assert_allclose(gt.probs.shape, (487400, 3))
            assert_allclose(nansum(gt.probs, 0), [475743.0, 0.0, 0.0])
            assert_allclose(isnan(gt.probs).sum(0), [11657, 11657, 11657])
            assert not gt.phased
            assert_allclose(gt.ploidy.sum(), 974800)
            assert_allclose(gt.missing.sum(), 11657)
예제 #8
0
def read_bgen(
    filepath: Union[str, Path],
    metafile_filepath: Optional[Union[str, Path]] = None,
    samples_filepath: Optional[Union[str, Path]] = None,
    verbose: bool = True,
):
    """
    Read a given BGEN file.

    Parameters
    ----------
    filepath
        Bgen file path.
    metafile_filepath
        File path to the corresponding metafile. A metafile can be created by calling
        :func:`bgen_reader.create_metafile`. If ``None``, the metafile path is inferred
        from ``filepath`` and the metafile is created when it does not exist yet.
        An explicitly given path is checked for existence and readability.
        Defaults to ``None``.
    samples_filepath
        Path to a `sample format`_ file or ``None`` to read samples from the bgen file itself.
        Defaults to ``None``.
    verbose
        ``True`` to show progress; ``False`` otherwise. Defaults to ``True``.

    Returns
    -------
    dict
        A dictionary with the following keys:

        ``"variants"`` : :class:`dask.dataframe.DataFrame`
            Variant position, chromosomes, rsids, etc.
        ``"samples"`` : :class:`pandas.Series`
            Sample identifications.
        ``"genotype"`` : list
            List of genotypes.

    Examples
    --------
    .. doctest::

        >>> from bgen_reader import example_filepath, read_bgen
        >>>
        >>> bgen = read_bgen(example_filepath("haplotypes.bgen"), verbose=False)
        >>> variants = bgen["variants"]
        >>> samples = bgen["samples"]
        >>>
        >>> v = variants.loc[0].compute()
        >>> g = bgen["genotype"][0].compute()
        >>> print(v)
             id rsid chrom  pos  nalleles allele_ids  vaddr
        0  SNP1  RS1     1    1         2        A,G    102
        >>> print(samples)
        0    sample_0
        1    sample_1
        2    sample_2
        3    sample_3
        Name: id, dtype: object
        >>> print(g["probs"][0])
        [1. 0. 1. 0.]

    .. _sample format: https://www.well.ox.ac.uk/~gav/qctool/documentation/sample_file_formats.html
    """

    filepath = Path(filepath)
    assert_file_exist(filepath)
    assert_file_readable(filepath)

    if metafile_filepath is None:
        metafile_filepath = infer_metafile_filepath(filepath)
    else:
        metafile_filepath = Path(metafile_filepath)
        assert_file_exist(metafile_filepath)
        # Check the metafile itself for readability, not the BGEN file.
        assert_file_readable(metafile_filepath)

    if not metafile_filepath.exists():
        if verbose:
            print(
                f"We will create the metafile `{metafile_filepath}`. This file will "
                "speed up further\nreads and only need to be created once. So, please, "
                "bear with me.")
        create_metafile(filepath, metafile_filepath, verbose)
    elif os.path.getmtime(metafile_filepath) < os.path.getmtime(filepath):
        # The BGEN file is newer than its metafile: clear both caches and
        # rebuild the metafile from scratch.
        from ._genotype import cache as bgencache
        from ._metafile import cache as metacache

        metacache.clear()
        bgencache.clear()

        if verbose:
            print(
                f"File `{filepath}` has been modified after the creation of `{metafile_filepath}`."
                "\nWe will therefore recreate the metadata file. So, please, bear with me."
            )
        os.unlink(metafile_filepath)
        create_metafile(filepath, metafile_filepath, verbose)

    with bgen_file(filepath) as bgen:
        samples = _get_samples(bgen, samples_filepath, verbose)

        with bgen_metafile(metafile_filepath) as mf:
            nvariants = mf.nvariants
            npartitions = mf.npartitions
            part_size = mf.partition_size
            variants = create_variants(metafile_filepath, nvariants,
                                       npartitions, part_size)

        genotype = create_genotypes(bgen, metafile_filepath, verbose)

    return dict(variants=variants, samples=samples, genotype=genotype)
예제 #9
0
def test_cbgen_phased_genotype(tmp_path):
    """Check header, metafile and phased genotypes of ``haplotypes.bgen``."""
    filepath = example.get("haplotypes.bgen")
    mfilepath = tmp_path / f"{filepath.name}.metafile"

    # Context managers close both handles even when an assertion fails
    # part-way through the test.
    with bgen_file(filepath) as bgen:
        assert bgen.filepath.name == "haplotypes.bgen"
        assert bgen.nvariants == 4
        assert bgen.nsamples == 4
        assert bgen.contain_samples
        samples = bgen.read_samples()
        assert_array_equal(
            samples, [b"sample_0", b"sample_1", b"sample_2", b"sample_3"]
        )

        bgen.create_metafile(mfilepath, verbose=False)

        with bgen_metafile(mfilepath) as mf:
            assert mf.filepath.name == mfilepath.name
            assert mf.npartitions == 1
            assert mf.nvariants == 4
            assert mf.partition_size == 4

            part = mf.read_partition(0)

            # First variant of the partition.
            assert part.variants.id[0] == b"SNP1"
            assert part.variants.rsid[0] == b"RS1"
            assert part.variants.chrom[0] == b"1"
            assert part.variants.position[0] == 1
            assert part.variants.nalleles[0] == 2
            assert part.variants.allele_ids[0] == b"A,G"
            voff = part.variants.offset[0]
            gt = bgen.read_genotype(voff)
            assert_allclose(
                gt.probs,
                [
                    [1.0, 0.0, 1.0, 0.0],
                    [0.0, 1.0, 1.0, 0.0],
                    [1.0, 0.0, 0.0, 1.0],
                    [0.0, 1.0, 0.0, 1.0],
                ],
            )
            assert gt.phased
            assert_allclose(gt.ploidy, [2, 2, 2, 2])
            assert_allclose(gt.missing, [False, False, False, False])

            # Last variant of the partition.
            assert part.variants.id[3] == b"SNP4"
            assert part.variants.rsid[3] == b"RS4"
            assert part.variants.chrom[3] == b"1"
            assert part.variants.position[3] == 4
            assert part.variants.nalleles[3] == 2
            assert part.variants.allele_ids[3] == b"A,G"
            voff = part.variants.offset[3]
            gt = bgen.read_genotype(voff)
            assert_allclose(
                gt.probs,
                [
                    [0.0, 1.0, 0.0, 1.0],
                    [1.0, 0.0, 1.0, 0.0],
                    [0.0, 1.0, 1.0, 0.0],
                    [1.0, 0.0, 0.0, 1.0],
                ],
            )
            assert gt.phased
            assert_allclose(gt.ploidy, [2, 2, 2, 2])
            assert_allclose(gt.missing, [False, False, False, False])
예제 #10
0
def test_cbgen_complex_unphased(tmp_path: Path):
    """Check header, metafile, genotypes and invalid offsets of the
    ``complex.23bits.no.samples.bgen`` example file."""
    filepath = example.get("complex.23bits.no.samples.bgen")
    mfilepath = tmp_path / f"{filepath.name}.metafile"

    with bgen_file(filepath) as bgen:
        assert bgen.filepath.name == "complex.23bits.no.samples.bgen"
        assert bgen.nvariants == 10
        assert bgen.nsamples == 4
        assert not bgen.contain_samples
        # No sample ids are stored, so reading them must fail.
        with pytest.raises(RuntimeError):
            bgen.read_samples()
        bgen.create_metafile(mfilepath, verbose=False)

    with bgen_metafile(mfilepath) as mf:
        assert mf.filepath.name == mfilepath.name
        assert mf.npartitions == 1
        assert mf.nvariants == 10
        assert mf.partition_size == 10

        partition = mf.read_partition(0)
        variants = partition.variants

        assert variants.id[0] == b""
        assert variants.rsid[0] == b"V1"
        assert variants.chrom[0] == b"01"
        assert variants.position[0] == 1
        assert variants.nalleles[0] == 2
        assert variants.allele_ids[0] == b"A,G"

        with bgen_file(filepath) as bgen:
            # Genotype of the first variant.
            first = bgen.read_genotype(variants.offset[0])
            assert_allclose(
                first.probs,
                [[1.0, 0.0, nan], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
            )
            assert not first.phased
            assert_allclose(first.ploidy, [1, 2, 2, 2])
            assert_allclose(first.missing, [False, False, False, False])

            # Genotype of the last variant.
            last = bgen.read_genotype(variants.offset[-1])
            assert_allclose(
                last.probs,
                [
                    [1.0, 0.0, 0.0, 0.0, 0.0],
                    [0.0, 1.0, 0.0, 0.0, 0.0],
                    [0.0, 0.0, 1.0, 0.0, 0.0],
                    [0.0, 0.0, 0.0, 1.0, 0.0],
                ],
            )
            assert not last.phased
            assert_allclose(last.ploidy, [4, 4, 4, 4])
            assert_allclose(last.missing, [False, False, False, False])

            # Every offset up to the largest valid one that is not itself a
            # variant offset must be rejected.
            valid_offsets = set(variants.offset)
            candidates = set(range(int(max(valid_offsets)) + 1))
            for bad_offset in candidates - valid_offsets:
                with pytest.raises(RuntimeError):
                    bgen.read_genotype(bad_offset)

        # The metafile has a single partition, so index 1 is out of range.
        with pytest.raises(RuntimeError):
            mf.read_partition(1)
예제 #11
0
def read_genotype_partition(bgen_filepath: Path, offsets):
    """Read the genotype at each of ``offsets`` from one BGEN file.

    The file is opened once and the genotypes are returned as a list in
    the order of ``offsets``.
    """
    genotypes = []
    with bgen_file(bgen_filepath) as bgen:
        for offset in offsets:
            genotypes.append(bgen.read_genotype(offset))
    return genotypes
예제 #12
0
파일: test_bgen.py 프로젝트: limix/cbgen
def test_cbgen_nonexistent_bgen_file():
    """Opening a path that does not exist raises RuntimeError."""
    missing_path = "/Fmw/DiKel"
    with pytest.raises(RuntimeError):
        bgen_file(missing_path)
예제 #13
0
파일: bench.py 프로젝트: limix/cbgen
 def __init__(self):
     """Fetch the example BGEN file and create its metafile for the benchmarks."""
     bgen_path = cbgen.example.get("merged_487400x220000.bgen")
     metafile_path = Path("metafile")
     self._filepath = bgen_path
     self._mfilepath = metafile_path
     with cbgen.bgen_file(bgen_path) as bgen:
         bgen.create_metafile(metafile_path, verbose=False)
예제 #14
0
파일: bench.py 프로젝트: limix/cbgen
 def time_create_metafile(self):
     """Benchmark: create a metafile inside a throwaway temporary directory."""
     with tempfile.TemporaryDirectory() as tmpdir, cbgen.bgen_file(
         self._filepath
     ) as bgen:
         target = Path(tmpdir) / "metafile"
         bgen.create_metafile(target, verbose=False)
예제 #15
0
파일: bench.py 프로젝트: limix/cbgen
 def time_bgen_file(self):
     """Benchmark: open the BGEN file and close it again without reading."""
     handle = cbgen.bgen_file(self._filepath)
     with handle:
         pass