Example #1
    def __getitem__(self, idx):
        if not isinstance(idx, tuple):
            raise IndexError(f'Indexer must be tuple (received {type(idx)})')
        if len(idx) != self.ndim:
            raise IndexError(
                f'Indexer must have {self.ndim} items (received {len(idx)} slices)')

        if idx[0].start == idx[0].stop:
            return np.empty((0,) * self.ndim, dtype=self.dtype)

        # Determine which metafile partitions the variant slice spans,
        # and the offsets into the first and last of those partitions
        start_partition = idx[0].start // self.partition_size
        start_partition_offset = idx[0].start % self.partition_size
        end_partition = (idx[0].stop - 1) // self.partition_size
        end_partition_offset = (idx[0].stop - 1) % self.partition_size

        # Collect the file offset (vaddr) at which each requested variant begins
        all_vaddr = []
        with bgen_metafile(self.metafile_filepath) as mf:
            for i in range(start_partition, end_partition + 1):
                partition = mf.read_partition(i)
                start_offset = start_partition_offset if i == start_partition else 0
                end_offset = end_partition_offset + 1 if i == end_partition else self.partition_size
                vaddr = partition["vaddr"].tolist()
                all_vaddr.extend(vaddr[start_offset:end_offset])

        # Read the probability array for each variant and apply the sample
        # indexer. Note that the genotype-dimension indexer idx[2] is not
        # applied in this version; see Example #4 for full three-axis handling.
        with bgen_file(self.path) as bgen:
            genotypes = [bgen.read_genotype(vaddr) for vaddr in all_vaddr]
            probs = [genotype["probs"] for genotype in genotypes]
            return np.stack(probs)[:, idx[1]]
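
A minimal usage sketch for this indexer (hypothetical: BgenReader is an assumed wrapper class pairing this __getitem__ with the __init__ from Example #2, and "example.bgen" is an illustrative path). The slices must carry concrete start/stop values, because the method does arithmetic on them:

    # Hypothetical usage; BgenReader and example.bgen are assumptions.
    reader = BgenReader("example.bgen")
    n_samples = len(reader.samples)
    # Explicit bounds are required: __getitem__ computes on .start/.stop,
    # so a bare ":" (slice(None)) would fail on the variant dimension.
    probs = reader[slice(0, 10), slice(0, n_samples), slice(0, 3)]
    print(probs.shape)  # (10, n_samples, 3)
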
Example #2
    def __init__(self, path, dtype=np.float32):
        self.path = Path(path)

        self.metafile_filepath = _infer_metafile_filepath(self.path)
        if not self.metafile_filepath.exists():
            create_metafile(path, self.metafile_filepath, verbose=False)

        with bgen_metafile(self.metafile_filepath) as mf:
            self.n_variants = mf.nvariants
            self.npartitions = mf.npartitions
            self.partition_size = mf.partition_size

            # This may need chunking for large numbers of variants
            variants_df = mf.create_variants().compute()
            self.variant_id = variants_df["id"].tolist()
            self.contig = variants_df["chrom"].tolist()
            self.pos = variants_df["pos"].tolist()
            allele_ids = variants_df["allele_ids"].tolist()
            self.a1, self.a2 = tuple(
                zip(*[aid.split(",") for aid in allele_ids]))

        with bgen_file(self.path) as bgen:
            sample_path = self.path.with_suffix('.sample')
            if sample_path.exists():
                self.samples = read_samples_file(sample_path, verbose=False)
            else:
                if bgen.contain_samples:
                    self.samples = bgen.read_samples()
                else:
                    self.samples = generate_samples(bgen.nsamples)

        self.shape = (self.n_variants, len(self.samples), 3)
        self.dtype = dtype
        self.ndim = 3
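
A hedged sketch of inspecting the metadata this constructor builds (BgenReader and the path are assumptions for illustration; the attribute names match the assignments above):

    reader = BgenReader("example.bgen")  # hypothetical wrapper class
    print(reader.shape)  # (n_variants, n_samples, 3)
    # Per-variant metadata is stored as parallel lists
    for vid, chrom, pos, a1, a2 in zip(reader.variant_id[:3], reader.contig[:3],
                                       reader.pos[:3], reader.a1[:3], reader.a2[:3]):
        print(vid, chrom, pos, a1, a2)
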
Example #3
    def __init__(
        self, path: PathType, persist: bool = True, dtype: Any = np.float32
    ) -> None:
        self.path = Path(path)

        self.metafile_filepath = infer_metafile_filepath(self.path)
        if not self.metafile_filepath.exists():
            create_metafile(path, self.metafile_filepath, verbose=False)

        with bgen_metafile(self.metafile_filepath) as mf:
            self.n_variants = mf.nvariants
            self.npartitions = mf.npartitions
            self.partition_size = mf.partition_size

            df = mf.create_variants()
            if persist:
                df = df.persist()
            variant_arrs = _to_dict(df, dtype=VARIANT_ARRAY_DTYPE)

            self.variant_id = variant_arrs["id"]
            self.contig = variant_arrs["chrom"]
            self.pos = variant_arrs["pos"]

            def split_alleles(
                alleles: np.ndarray, block_info: Any = None
            ) -> np.ndarray:
                if block_info is None or len(block_info) == 0:
                    return alleles

                def split(allele_row: np.ndarray) -> np.ndarray:
                    alleles_list = allele_row[0].split(",")
                    assert len(alleles_list) == 2  # bi-allelic
                    return np.array(alleles_list)

                return np.apply_along_axis(split, 1, alleles[:, np.newaxis])

            variant_alleles = variant_arrs["allele_ids"].map_blocks(split_alleles)

            def max_str_len(arr: ArrayLike) -> Any:
                return arr.map_blocks(
                    lambda s: np.char.str_len(s.astype(str)), dtype=np.int8
                ).max()

            max_allele_length = max(max_str_len(variant_alleles).compute())
            self.variant_alleles = variant_alleles.astype(f"S{max_allele_length}")

        with bgen_file(self.path) as bgen:
            sample_path = self.path.with_suffix(".sample")
            if sample_path.exists():
                self.sample_id = read_samples_file(sample_path, verbose=False)
            else:
                if bgen.contain_samples:
                    self.sample_id = bgen.read_samples()
                else:
                    self.sample_id = generate_samples(bgen.nsamples)

        self.shape = (self.n_variants, len(self.sample_id), 3)
        self.dtype = dtype
        self.ndim = 3
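
To make the allele handling concrete, here is a standalone NumPy-only sketch of the same transformation (comma-joined allele strings to a fixed-width byte array), mirroring split_alleles and max_str_len without the dask machinery; the input values are illustrative:

    import numpy as np

    allele_ids = ["A,G", "C,T", "AT,A"]
    # Split each "a1,a2" string into two columns (bi-allelic assumption,
    # matching the assert in split_alleles above)
    alleles = np.array([s.split(",") for s in allele_ids])
    max_len = np.char.str_len(alleles.astype(str)).max()  # longest allele, here 2
    fixed = alleles.astype(f"S{max_len}")
    print(fixed)  # [[b'A' b'G'] [b'C' b'T'] [b'AT' b'A']]
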
Example #4
    def __getitem__(self, idx: Any) -> np.ndarray:
        if not isinstance(idx, tuple):
            raise IndexError(f"Indexer must be tuple (received {type(idx)})")
        if len(idx) != self.ndim:
            raise IndexError(
                f"Indexer must have {self.ndim} items (received {len(idx)} slices)"
            )
        if not all(isinstance(i, (slice, int)) for i in idx):
            raise IndexError(
                f"Indexer must contain only slices or ints (received types {[type(i) for i in idx]})"
            )
        # Determine which dims should have unit size in result
        squeeze_dims = tuple(i for i in range(len(idx)) if isinstance(idx[i], int))
        # Convert all indexers to slices
        idx = tuple(slice(i, i + 1) if isinstance(i, int) else i for i in idx)

        if idx[0].start == idx[0].stop:
            return np.empty((0,) * self.ndim, dtype=self.dtype)

        # Determine start and end partitions that correspond to the
        # given variant dimension indexer
        start_partition = idx[0].start // self.partition_size
        start_partition_offset = idx[0].start % self.partition_size
        end_partition = (idx[0].stop - 1) // self.partition_size
        end_partition_offset = (idx[0].stop - 1) % self.partition_size

        # Create a list of all offsets into the underlying file at which
        # data for each variant begins
        all_vaddr = []
        with bgen_metafile(self.metafile_filepath) as mf:
            for i in range(start_partition, end_partition + 1):
                partition = mf.read_partition(i)
                start_offset = start_partition_offset if i == start_partition else 0
                end_offset = (
                    end_partition_offset + 1
                    if i == end_partition
                    else self.partition_size
                )
                vaddr = partition["vaddr"].tolist()
                all_vaddr.extend(vaddr[start_offset:end_offset])

        # Read the probabilities for each variant, apply indexer for
        # samples dimension to give probabilities for all genotypes,
        # and then apply final genotype dimension indexer
        with bgen_file(self.path) as bgen:
            res = None
            for i, vaddr in enumerate(all_vaddr):
                probs = bgen.read_genotype(vaddr)["probs"][idx[1]]
                assert len(probs.shape) == 2 and probs.shape[1] == 3
                if res is None:
                    res = np.zeros((len(all_vaddr), len(probs), 3), dtype=self.dtype)
                res[i] = probs
            res = res[..., idx[2]]  # type: ignore[index]
            return np.squeeze(res, axis=squeeze_dims)
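
For intuition, here is the partition arithmetic from above worked through on illustrative values: with partition_size = 100 and a variant slice of 250:430, the reader touches partitions 2 through 4:

    partition_size = 100
    start, stop = 250, 430
    start_partition = start // partition_size           # 2
    start_partition_offset = start % partition_size     # 50
    end_partition = (stop - 1) // partition_size        # 4
    end_partition_offset = (stop - 1) % partition_size  # 29
    # Partition 2 contributes rows 50:100, partition 3 rows 0:100, and
    # partition 4 rows 0:30 (end offset + 1), i.e. 180 variants in total,
    # matching stop - start.
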
Example #5
    def _map_metadata(self, metafile_filepath):
        with log_in_place("metadata", logging.INFO) as updater:
            with bgen_metafile(Path(metafile_filepath)) as mf:
                nparts = mf.npartitions
                (
                    id_list,
                    rsid_list,
                    chrom_list,
                    position_list,
                    vaddr_list,
                    nalleles_list,
                    allele_ids_list,
                    ncombinations_list,
                    phased_list,
                ) = ([], [], [], [], [], [], [], [], [])

                for ipart2 in range(nparts):  # LATER multithread?
                    # LATER in notebook this message doesn't appear on one line
                    updater("step 2: part {0:,} of {1:,}".format(
                        ipart2, nparts))

                    (
                        nvariants,
                        vid,
                        rsid,
                        chrom,
                        position,
                        nalleles,
                        allele_ids,
                        offset,
                    ) = _inner_read_partition(mf, ipart2)

                    id_list.append(vid)
                    rsid_list.append(rsid)
                    chrom_list.append(chrom)
                    position_list.append(position)
                    nalleles_list.append(nalleles)
                    allele_ids_list.append(allele_ids)
                    vaddr_list.append(offset)

            # LATER use concatenate(...out=) instead
            self._ids = np.array(np.concatenate(id_list),
                                 dtype="str")  # dtype needed to make unicode
            self._rsids = np.array(np.concatenate(rsid_list), dtype="str")
            self._vaddr = np.concatenate(vaddr_list)
            self._chromosomes = np.array(np.concatenate(chrom_list),
                                         dtype="str")
            self._positions = np.concatenate(position_list)
            self._nalleles = np.concatenate(nalleles_list)
            self._allele_ids = np.array(np.concatenate(allele_ids_list),
                                        dtype="str")

            # Step 3: open each genotype once to record its number of
            # probability columns (ncombs) and whether it is phased
            for i, vaddr0 in enumerate(self._vaddr):
                if i % 1000 == 0:
                    updater("step 3: part {0:,} of {1:,}".format(
                        i, self.nvariants))
                genotype = lib.bgen_file_open_genotype(self._bgen._bgen_file,
                                                       vaddr0)
                ncombinations_list.append(lib.bgen_genotype_ncombs(genotype))
                phased_list.append(lib.bgen_genotype_phased(genotype))
                lib.bgen_genotype_close(genotype)

            self._ncombinations = np.array(ncombinations_list, dtype="int")
            self._phased = np.array(phased_list, dtype="bool")
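
For intuition, ncombinations above is the number of probability columns per variant: for unphased data it is the multiset coefficient C(nalleles + ploidy - 1, ploidy), e.g. 3 for a bi-allelic diploid variant, while for phased data it is ploidy * nalleles. A quick sanity check of the unphased case (a sketch, not part of the original code):

    from math import comb

    def unphased_ncombinations(nalleles: int, ploidy: int = 2) -> int:
        # Number of unordered genotypes: multiset coefficient C(n+k-1, k)
        return comb(nalleles + ploidy - 1, ploidy)

    assert unphased_ncombinations(2) == 3  # AA, AB, BB
    assert unphased_ncombinations(3) == 6
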