Example #1
    def __getitem__(self, idx):
        if not isinstance(idx, tuple):
            raise IndexError(f'Indexer must be a tuple (received {type(idx)})')
        if len(idx) != self.ndim:
            raise IndexError(
                f'Indexer must be {self.ndim}-item tuple (received {len(idx)} slices)')

        if idx[0].start == idx[0].stop:
            return np.empty((0, 0), dtype=self.dtype)

        # Determine start and end partitions covered by the variant indexer
        start_partition = idx[0].start // self.partition_size
        start_partition_offset = idx[0].start % self.partition_size
        end_partition = (idx[0].stop - 1) // self.partition_size
        end_partition_offset = (idx[0].stop - 1) % self.partition_size

        # Collect the file offsets (vaddr) at which each requested variant begins
        all_vaddr = []
        with bgen_metafile(self.metafile_filepath) as mf:
            for i in range(start_partition, end_partition + 1):
                partition = mf.read_partition(i)
                start_offset = start_partition_offset if i == start_partition else 0
                end_offset = end_partition_offset + 1 if i == end_partition else self.partition_size
                vaddr = partition["vaddr"].tolist()
                all_vaddr.extend(vaddr[start_offset:end_offset])

        # Read the probabilities for each variant and apply the samples indexer
        with bgen_file(self.path) as bgen:
            genotypes = [bgen.read_genotype(vaddr) for vaddr in all_vaddr]
            probs = [genotype["probs"] for genotype in genotypes]
            return np.stack(probs)[:, idx[1]]
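
The start/end partition arithmetic above maps a half-open variant slice onto metafile partitions. A standalone sketch of the same computation with concrete numbers (the partition size of 100 is an illustrative assumption, not taken from these examples):

    partition_size = 100
    start, stop = 250, 420  # variant-dimension slice [250, 420)

    start_partition = start // partition_size            # 2
    start_partition_offset = start % partition_size      # 50
    end_partition = (stop - 1) // partition_size         # 4
    end_partition_offset = (stop - 1) % partition_size   # 19

    # Partitions 2..4 are read: the first from offset 50, the last up to
    # offset 19 + 1, and any partition in between in full.
    assert (start_partition, start_partition_offset) == (2, 50)
    assert (end_partition, end_partition_offset) == (4, 19)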
Example #2
    def __init__(self, path, dtype=np.float32):
        self.path = Path(path)

        self.metafile_filepath = _infer_metafile_filepath(Path(self.path))
        if not self.metafile_filepath.exists():
            create_metafile(path, self.metafile_filepath, verbose=False)

        with bgen_metafile(self.metafile_filepath) as mf:
            self.n_variants = mf.nvariants
            self.npartitions = mf.npartitions
            self.partition_size = mf.partition_size

            # This may need chunking for large numbers of variants
            variants_df = mf.create_variants().compute()
            self.variant_id = variants_df["id"].tolist()
            self.contig = variants_df["chrom"].tolist()
            self.pos = variants_df["pos"].tolist()
            allele_ids = variants_df["allele_ids"].tolist()
            self.a1, self.a2 = tuple(zip(*[id.split(",")
                                           for id in allele_ids]))

        with bgen_file(self.path) as bgen:
            sample_path = self.path.with_suffix('.sample')
            if sample_path.exists():
                self.samples = read_samples_file(sample_path, verbose=False)
            else:
                if bgen.contain_samples:
                    self.samples = bgen.read_samples()
                else:
                    self.samples = generate_samples(bgen.nsamples)

        self.shape = (self.n_variants, len(self.samples), 3)
        self.dtype = dtype
        self.ndim = 3
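
A minimal usage sketch for this constructor, assuming it belongs to a reader class named BgenReader (the class name and the example path are hypothetical; only attributes set above are used):

    import numpy as np

    # BgenReader is a stand-in name for the class this __init__ belongs to
    reader = BgenReader("example.bgen", dtype=np.float64)
    print(reader.shape)           # (n_variants, n_samples, 3)
    print(reader.variant_id[:5])  # first five variant IDs
    # allele pairs were split from the comma-separated allele_ids column
    print(reader.a1[0], reader.a2[0])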
Example #3
    def __init__(
        self, path: PathType, persist: bool = True, dtype: Any = np.float32
    ) -> None:
        self.path = Path(path)

        self.metafile_filepath = infer_metafile_filepath(Path(self.path))
        if not self.metafile_filepath.exists():
            create_metafile(path, self.metafile_filepath, verbose=False)

        with bgen_metafile(self.metafile_filepath) as mf:
            self.n_variants = mf.nvariants
            self.npartitions = mf.npartitions
            self.partition_size = mf.partition_size

            df = mf.create_variants()
            if persist:
                df = df.persist()
            variant_arrs = _to_dict(df, dtype=VARIANT_ARRAY_DTYPE)

            self.variant_id = variant_arrs["id"]
            self.contig = variant_arrs["chrom"]
            self.pos = variant_arrs["pos"]

            def split_alleles(
                alleles: np.ndarray, block_info: Any = None
            ) -> np.ndarray:
                if block_info is None or len(block_info) == 0:
                    return alleles

                def split(allele_row: np.ndarray) -> np.ndarray:
                    alleles_list = allele_row[0].split(",")
                    assert len(alleles_list) == 2  # bi-allelic
                    return np.array(alleles_list)

                return np.apply_along_axis(split, 1, alleles[:, np.newaxis])

            variant_alleles = variant_arrs["allele_ids"].map_blocks(split_alleles)

            def max_str_len(arr: ArrayLike) -> Any:
                return arr.map_blocks(
                    lambda s: np.char.str_len(s.astype(str)), dtype=np.int8
                ).max()

            max_allele_length = max(max_str_len(variant_alleles).compute())
            self.variant_alleles = variant_alleles.astype(f"S{max_allele_length}")

        with bgen_file(self.path) as bgen:
            sample_path = self.path.with_suffix(".sample")
            if sample_path.exists():
                self.sample_id = read_samples_file(sample_path, verbose=False)
            else:
                if bgen.contain_samples:
                    self.sample_id = bgen.read_samples()
                else:
                    self.sample_id = generate_samples(bgen.nsamples)

        self.shape = (self.n_variants, len(self.sample_id), 3)
        self.dtype = dtype
        self.ndim = 3
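
The split_alleles helper reshapes a column of comma-separated allele strings into an (n, 2) array. The same transform can be checked with plain numpy outside of dask (so the block_info argument does not come into play):

    import numpy as np

    def split(allele_row: np.ndarray) -> np.ndarray:
        alleles_list = allele_row[0].split(",")
        assert len(alleles_list) == 2  # bi-allelic
        return np.array(alleles_list)

    alleles = np.array(["A,C", "G,T"])
    pairs = np.apply_along_axis(split, 1, alleles[:, np.newaxis])
    print(pairs)        # [['A' 'C']
                        #  ['G' 'T']]
    print(pairs.shape)  # (2, 2)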
Example #4
    def __init__(
        self,
        filepath: Union[str, Path],
        samples_filepath: Optional[Union[str, Path]] = None,
        verbose: bool = True,
    ):
        filepath = Path(filepath)
        assert_file_exist(filepath)
        assert_file_readable(filepath)

        self._verbose = verbose
        self._filepath = filepath

        self._bgen_context_manager = bgen_file(filepath)
        self._bgen = self._bgen_context_manager.__enter__()

        self._samples = np.array(_get_samples(self._bgen, samples_filepath,
                                              self._verbose),
                                 dtype="str")
        self._sample_range = np.arange(len(self._samples), dtype=np.int64)

        # LATER could make a version of this method public
        # .resolve() is needed because of tmp_cwd below
        metadata2 = self._metadatapath_from_filename(filepath).resolve()
        # Discard cached metadata that is older than the .bgen file itself
        if metadata2.exists() and getmtime(metadata2) < getmtime(filepath):
            metadata2.unlink()

        # Load cached metadata if present; otherwise build and cache it
        if metadata2.exists():
            d = np.load(str(metadata2))
            self._ids = d["ids"]
            self._rsids = d["rsids"]
            self._vaddr = d["vaddr"]
            self._chromosomes = d["chromosomes"]
            self._positions = d["positions"]
            self._nalleles = d["nalleles"]
            self._allele_ids = d["allele_ids"]
            self._ncombinations = d["ncombinations"]
            self._phased = d["phased"]
        else:
            with tmp_cwd():
                metafile_filepath = Path("bgen.metadata")
                self._bgen.create_metafile(metafile_filepath,
                                           verbose=self._verbose)
                self._map_metadata(metafile_filepath)
                np.savez(
                    metadata2,
                    ids=self._ids,
                    rsids=self._rsids,
                    vaddr=self._vaddr,
                    chromosomes=self._chromosomes,
                    positions=self._positions,
                    nalleles=self._nalleles,
                    allele_ids=self._allele_ids,
                    ncombinations=self._ncombinations,
                    phased=self._phased,
                )

        self._max_combinations = max(self._ncombinations)
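
The getmtime comparison above implements a simple staleness rule: cached metadata written before the last modification of the .bgen file is deleted and rebuilt. A minimal standalone sketch of that rule (the function name is illustrative):

    from os.path import getmtime
    from pathlib import Path

    def metadata_is_stale(bgen_path: Path, metadata_path: Path) -> bool:
        """Return True when the cached metadata must be regenerated."""
        if not metadata_path.exists():
            return True
        # Metadata older than the .bgen file no longer describes it
        return getmtime(metadata_path) < getmtime(bgen_path)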
Example #5
    def __getitem__(self, idx: Any) -> np.ndarray:
        if not isinstance(idx, tuple):
            raise IndexError(f"Indexer must be tuple (received {type(idx)})")
        if len(idx) != self.ndim:
            raise IndexError(
                f"Indexer must have {self.ndim} items (received {len(idx)} slices)"
            )
        if not all(isinstance(i, (slice, int)) for i in idx):
            raise IndexError(
                f"Indexer must contain only slices or ints (received types {[type(i) for i in idx]})"
            )
        # Determine which dims should have unit size in result
        squeeze_dims = tuple(i for i in range(len(idx)) if isinstance(idx[i], int))
        # Convert all indexers to slices
        idx = tuple(slice(i, i + 1) if isinstance(i, int) else i for i in idx)

        if idx[0].start == idx[0].stop:
            return np.empty((0,) * self.ndim, dtype=self.dtype)

        # Determine start and end partitions that correspond to the
        # given variant dimension indexer
        start_partition = idx[0].start // self.partition_size
        start_partition_offset = idx[0].start % self.partition_size
        end_partition = (idx[0].stop - 1) // self.partition_size
        end_partition_offset = (idx[0].stop - 1) % self.partition_size

        # Create a list of all offsets into the underlying file at which
        # data for each variant begins
        all_vaddr = []
        with bgen_metafile(self.metafile_filepath) as mf:
            for i in range(start_partition, end_partition + 1):
                partition = mf.read_partition(i)
                start_offset = start_partition_offset if i == start_partition else 0
                end_offset = (
                    end_partition_offset + 1
                    if i == end_partition
                    else self.partition_size
                )
                vaddr = partition["vaddr"].tolist()
                all_vaddr.extend(vaddr[start_offset:end_offset])

        # Read the probabilities for each variant, apply indexer for
        # samples dimension to give probabilities for all genotypes,
        # and then apply final genotype dimension indexer
        with bgen_file(self.path) as bgen:
            res = None
            for i, vaddr in enumerate(all_vaddr):
                probs = bgen.read_genotype(vaddr)["probs"][idx[1]]
                assert len(probs.shape) == 2 and probs.shape[1] == 3
                if res is None:
                    res = np.zeros((len(all_vaddr), len(probs), 3), dtype=self.dtype)
                res[i] = probs
            res = res[..., idx[2]]  # type: ignore[index]
            return np.squeeze(res, axis=squeeze_dims)
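
The squeeze bookkeeping can be verified without any bgen file: integer indexers collapse their dimension while slices keep it. A numpy-only sketch of the same convention (the zeros array stands in for the probabilities):

    import numpy as np

    idx = (0, slice(None), slice(None))  # equivalent to reader[0, :, :]
    squeeze_dims = tuple(i for i in range(len(idx)) if isinstance(idx[i], int))
    idx = tuple(slice(i, i + 1) if isinstance(i, int) else i for i in idx)

    probs = np.zeros((10, 4, 3))  # (variants, samples, genotypes)
    res = probs[idx]              # shape (1, 4, 3); the int kept unit size
    print(np.squeeze(res, axis=squeeze_dims).shape)  # (4, 3)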