Example #1
    @classmethod
    def retrieve(
        cls,
        batch: Batch,
        columns: Union[List[str], List[List[str]]],
        proc_fns: Union[str, Callable, List[Union[str, Callable]]] = None,
        identifier: Union[str, Identifier] = None,
        reapply: bool = False,
        **kwargs,
    ) -> Optional[Union[Batch, List[Batch]]]:
        if not reapply:
            if "cache" not in batch:
                return None

            # Infer the most relevant key to retrieve if an identifier is not specified
            if not identifier:
                for ident_key in batch["cache"][0].keys():
                    # Pick the first key that matches the cls name
                    if ident_key.startswith(cls.__name__):
                        identifier = ident_key
                        break

            try:
                if isinstance(columns[0], str):
                    retrieval = {
                        strings_as_json(columns): [
                            cls.decode(cache[str(identifier)][strings_as_json(
                                columns)]) for cache in batch["cache"]
                        ]
                    }
                else:
                    retrieval = {
                        strings_as_json(cols_): [
                            cls.decode(
                                cache[str(identifier)][strings_as_json(cols_)])
                            for cache in batch["cache"]
                        ]
                        for cols_ in columns
                    }
            except KeyError:
                raise ValueError(
                    "Could not retrieve information for all keys.")

            # Check if the retrieved information needs to be processed
            if not proc_fns:
                return retrieval

            # Apply the processing function(s) to the retrieved values.
            # (Completion sketch: string names are assumed to resolve to
            # methods on `cls`; a single callable yields one Batch, a list
            # of callables yields a list of Batches.)
            if isinstance(proc_fns, str):
                proc_fns = getattr(cls, proc_fns)
            if callable(proc_fns):
                return {k: proc_fns(v) for k, v in retrieval.items()}
            proc_fns = [
                getattr(cls, fn) if isinstance(fn, str) else fn
                for fn in proc_fns
            ]
            return [
                {k: fn(v) for k, v in retrieval.items()} for fn in proc_fns
            ]
        else:
            # reapply=True would recompute the operation on the batch on the
            # fly; that branch is not implemented in this example.
            return None
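For reference, a sketch of the per-example cache layout that `retrieve` reads and of the value it returns. The identifier, column names, and payloads below are illustrative assumptions, not values taken from the codebase.

# Each element of batch["cache"] maps str(identifier) to a dict keyed by
# strings_as_json(columns), holding the encoded cached value for that example.
batch = {
    "cache": [
        {"MyCachedOp": {'["col_a", "col_b"]': "<encoded value 0>"}},
        {"MyCachedOp": {'["col_a", "col_b"]': "<encoded value 1>"}},
    ],
}
# MyCachedOp.retrieve(batch, columns=["col_a", "col_b"]) would then return
# {'["col_a", "col_b"]': [<decoded value 0>, <decoded value 1>]},
# with each entry decoded by cls.decode.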
Example #2
    def _construct_updates(self, encoded_outputs: List[str],
                           columns: List[str]):
        return [{
            str(self.identifier): {
                strings_as_json(columns): val
            }
        } for val in encoded_outputs]
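A self-contained illustration of the update structure built above, with `strings_as_json` approximated by `json.dumps` for the multi-column case and a made-up identifier string standing in for `str(self.identifier)`.

import json

identifier = "MyOp"  # hypothetical stand-in for str(self.identifier)
columns = ["premise", "hypothesis"]
encoded_outputs = ["enc0", "enc1"]

updates = [{identifier: {json.dumps(columns): val}} for val in encoded_outputs]
# [{'MyOp': {'["premise", "hypothesis"]': 'enc0'}},
#  {'MyOp': {'["premise", "hypothesis"]': 'enc1'}}]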
Example #3
    def check(self, identifier: Union[str, Identifier],
              columns: List[str]) -> bool:
        """Check whether an interaction is already recorded in the history.

        Args:
            identifier: Identifier for the interaction.
            columns: list of columns on which the interaction was applied.

        Returns: True if the (identifier, columns) pair is already in the
        history, False otherwise.
        """
        if not isinstance(identifier, (str, Identifier)):
            raise ValueError(
                f"Parameter `identifier` should be an instance of class Identifier "
                f"or str, not {type(identifier)}.")

        # Dump the column names to JSON
        json_columns = strings_as_json(strings=columns)

        # Check if the entry is already in the history
        return (identifier, json_columns) in self.history
Example #4
    def update(self, identifier: Union[str, Identifier],
               columns: List[str]) -> None:
        """Update the interaction tape with information about an interaction.

        Args:
            identifier: Identifier for the interaction.
            columns: list of columns on which the interaction was applied.

        Returns: None. The interaction is recorded only if it is not already
        in the history.
        """
        if isinstance(identifier, str):
            identifier = Identifier(_name=identifier)
        elif not isinstance(identifier, Identifier):
            raise ValueError(
                f"Parameter `identifier` should be an instance of class Identifier "
                f"or str, not {type(identifier)}.")

        # Dump the column names to JSON
        json_columns = strings_as_json(strings=columns)

        # Check if the entry is not in the history
        if (identifier, json_columns) not in self.history:
            # Give it the next index
            self.history[(identifier, json_columns)] = len(self.history)
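A minimal usage sketch of `update` and `check`, assuming a tape object exposing the two methods above and the `Identifier` class from the surrounding codebase (neither is redefined here).

ident = Identifier(_name="MyCachedOp")

# Record the interaction, then query the history
tape.update(identifier=ident, columns=["text"])
assert tape.check(identifier=ident, columns=["text"])
assert not tape.check(identifier=ident, columns=["label"])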
Example #5
    def process_dataset(
        self,
        dp: DataPanel,
        columns: List[str],
        batch_size: int = 32,
        num_proc: int = None,
        *args,
        **kwargs,
    ) -> Tuple[List[DataPanel], np.ndarray]:

        # Create slices using the dataset
        all_slice_memberships = []

        # Batch the dataset, and process each batch
        for batch in dp.batch(batch_size):
            # Process the batch
            _, slice_memberships = self.process_batch(
                dp=batch,
                columns=columns,
                *args,
                **kwargs,
            )

            # Keep track of the slice memberships
            all_slice_memberships.append(slice_memberships)

        # Create a single slice label matrix
        slice_membership = np.concatenate(all_slice_memberships, axis=0)

        slices = []
        for i in range(len(self.identifiers)):
            # Create a view of the original DataPanel
            sl = dp.view()

            # Only keep the filtered rows visible
            for column in sl._data.values():
                column.visible_rows = np.where(slice_membership[:, i])[0]

            # Set the Slice category using the SliceBuilder's category
            sl.category = self.category

            # Append to the lineage
            sl.add_to_lineage(
                category=str(self.category.capitalize()),
                identifier=self.identifiers[i],
                columns=strings_as_json(columns),
            )

            # TODO: set the Slice identifier (sl.identifier = ...)

            slices.append(sl)

        # for i, sl in enumerate(slices):
        #     # Set the visible rows for each slice
        #     sl.set_visible_rows(np.where(slice_membership[:, i])[0])

        return slices, slice_membership
Example #6
    def score(self, batch: Dict[str, List], columns: List[str], *args,
              **kwargs) -> np.ndarray:
        assert len(columns) == 2, "Must have exactly 2 columns."

        # Retrieve Rouge scores
        scores = RougeScore.retrieve(
            batch=batch,
            columns=columns,
            proc_fns=partial(RougeScore.select, metric=self.metric),
        )[strings_as_json(columns)]

        return np.array(scores)
Example #7
    def score(self, batch: Dict[str, List], columns: List[str], *args,
              **kwargs) -> np.ndarray:
        assert len(columns) == 2, "Must have exactly 2 columns."

        # Retrieve the relevant Rouge matrices
        matrices = RougeMatrix.retrieve(
            batch=batch,
            columns=columns,
            proc_fns=partial(RougeMatrix.select, metric=self.metric),
        )[strings_as_json(columns)]

        return self.reduce(matrices)
Example #8
    def test_multicolumn(self):
        # Apply the multi-column cached operation
        dataset = self.multicol_cachedop(self.testbed.dataset,
                                         columns=["label", "z"])

        # Check that caching happens and that the cached values are correct
        self.assertTrue(
            strings_as_json(["label", "z"]) in dataset.features["cache"][str(
                self.multicol_cachedop.identifier)])
        self.assertEqual(
            self.multicol_cachedop.retrieve(dataset[:], columns=["label",
                                                                 "z"]),
            {'["label", "z"]': [0.3, 0.0, 0.4, 0.1, 0.3, 0.0]},
        )

        # Apply the single-column cached operation
        dataset = self.cachedop(dataset, columns=["label"])
        dataset = self.cachedop(dataset, columns=["z"])

        # Now recheck that everything can be retrieved correctly
        self.assertTrue(
            strings_as_json(["label", "z"]) in dataset.features["cache"][str(
                self.multicol_cachedop.identifier)])
        self.assertEqual(
            self.multicol_cachedop.retrieve(dataset[:], columns=["label",
                                                                 "z"]),
            {'["label", "z"]': [0.3, 0.0, 0.4, 0.1, 0.3, 0.0]},
        )
        self.assertEqual(
            self.cachedop.retrieve(dataset[:], columns=["label"]),
            {"label": [3.14, 3.14, 10.14, 10.14, 3.14, 3.14]},
        )
        self.assertEqual(
            self.cachedop.retrieve(dataset[:], columns=["z"]),
            {"z": [10.14, 3.14, 10.14, 3.14, 10.14, 3.14]},
        )
Example #9
    def apply(
        self,
        batch: DataPanel,
        columns: List[str],
        skeleton_batches: List[DataPanel],
        slice_membership: np.ndarray,
        *args,
        **kwargs,
    ) -> Tuple[List[DataPanel], np.ndarray]:
        assert len(columns) == 2

        # Retrieve the relevant Rouge matrices
        matrices = RougeMatrix.retrieve(
            batch=batch,
            columns=columns,
            proc_fns=partial(RougeMatrix.select, metric=self.metric),
        )[strings_as_json(columns)]

        # Find max value along each row, remove rows that have max value below a
        # threshold
        rows_to_keep = [
            (m / (m.sum(axis=0) + 1e-5)).max(axis=1) >= self.threshold for m in matrices
        ]

        # Fetch sentences for the first column
        sentences = SpacyOp.retrieve(
            batch=batch,
            columns=[columns[0]],
            proc_fns=SpacyOp.sentences,
        )[columns[0]]

        # Delete sentences
        new_sentences = [
            " ".join(np.array(sent)[rows_to_keep[i]])
            for i, sent in enumerate(sentences)
        ]

        # Store the augmented text in the skeleton batches
        for i, augmented in enumerate(new_sentences):
            skeleton_batches[0][columns[0]][i] = augmented

        return skeleton_batches, slice_membership
Example #10
def _run_aligners(
    dataset: Dataset,
    aligners: List[CachedOperation],
    doc_column: str,
    reference_column: str,
    summary_columns: List[str] = None,
):
    if not summary_columns:
        summary_columns = []

    to_columns = []
    if reference_column is not None:
        to_columns.append(reference_column)
    to_columns.extend(summary_columns)

    for aligner in aligners:

        # Run the aligner on (document, summary) pairs

        dataset = aligner(
            dataset,
            [doc_column] + to_columns,
            # Must use `batch_size = 1`
            batch_size=1,
        )

        if reference_column is not None and len(summary_columns):
            # Run the aligner on (reference, summary) pairs
            dataset = aligner(
                dataset,
                [reference_column] + summary_columns,
                # Must use `batch_size = 1`
                batch_size=1,
            )

        if len(to_columns) > 1:
            # Instead of having one column for (document, summary) comparisons, split
            # off into (1 + |summary_columns|) total columns, one for each comparison

            # Retrieve the (document, summary) column
            doc_summary_column = aligner.retrieve(
                dataset[:],
                [doc_column] + to_columns,
            )[tuple([doc_column] + to_columns)]

            for i, col in enumerate(to_columns):
                # Add as a new column after encoding with the aligner's `encode` method
                dataset.add_column(
                    column=str(aligner.identifier(columns=[doc_column, col])),
                    values=[
                        aligner.encode([row[i]]) for row in doc_summary_column
                    ],
                )

            # Remove the (document, summary) column
            dataset.remove_column(
                str(aligner.identifier(columns=[doc_column] + to_columns)))
            del dataset.interactions[CACHEDOPS].history[(
                aligner.identifier,
                strings_as_json(strings=[doc_column] + to_columns))]

        if reference_column is not None and len(summary_columns) > 1:
            # Instead of having one column for (reference, summary) comparisons, split
            # off into (|summary_columns|) total columns, one for each comparison

            # Retrieve the (reference, summary) column
            reference_summary_column = aligner.retrieve(
                dataset[:],
                [reference_column] + summary_columns,
            )[tuple([reference_column] + summary_columns)]

            for i, col in enumerate(summary_columns):
                # Add as a new column
                dataset.add_column(column=str(
                    aligner.identifier(columns=[reference_column, col])),
                                   values=[
                                       aligner.encode([row[i]])
                                       for row in reference_summary_column
                                   ])

            # Remove the (reference, summary) column
            dataset.remove_column(
                str(
                    aligner.identifier(columns=[reference_column] +
                                       summary_columns)))
            del dataset.interactions[CACHEDOPS].history[(
                aligner.identifier,
                strings_as_json(strings=[reference_column] + summary_columns))]

    return dataset
Example #11
    def process_dataset(
        self,
        dataset: Dataset,
        columns: List[str],
        batch_size: int = 32,
        mask: List[int] = None,
        store_compressed: bool = True,
        store: bool = True,
        num_proc: int = None,
        *args,
        **kwargs,
    ) -> Tuple[Dataset, List[Slice], np.ndarray]:
        """Apply a SliceBuilder to a dataset.

        Args:
            dataset: Dataset
            columns: list of columns
            batch_size: integer batch size
            mask: boolean or integer mask array, mask[i] = True means that the ith
            slice will be masked out
            store_compressed: whether to store in a compressed format
            store: whether to store the results along with the example in Dataset
            num_proc: num processes for multiprocessing
            *args: optional additional arguments
            **kwargs: optional additional keyword arguments

        Returns: tuple of (Dataset, list of Slices, matrix of (example,
        slice) membership)
        """
        # Default to an all-zeros mask (no slices masked out) if none is given
        if mask is None:
            mask = [0] * len(self.identifiers)

        # Prepare the dataset
        dataset = self.prepare_dataset(
            dataset=dataset,
            columns=columns,
            batch_size=batch_size,
            mask=mask,
            store_compressed=store_compressed,
            store=store,
            *args,
            **kwargs,
        )

        # Compute a hash
        val = persistent_hash(str(
            dataset.identifier)) ^ dataset.hash_interactions()
        for i, identifier in enumerate(self.identifiers):
            if not mask[i]:
                val ^= persistent_hash(
                    str(identifier) + str(strings_as_json(columns)))

        try:
            # Map the SliceBuilder over the dataset
            all_sliced_batches = []
            all_slice_memberships = []

            def _map_fn(batch):
                """Map function for processing batches.

                Note that using this map_fn in a stateful way is
                dangerous, since every invocation of this function
                appends to the all_slice_batches list. The .map()
                function will invoke this once for testing before
                performing the map, so we discard the first entry
                inserted into all_sliced_batches.
                """
                batch, sliced_batches, slice_membership = self.process_batch(
                    batch=batch,
                    columns=columns,
                    mask=mask,
                    store_compressed=store_compressed,
                    store=store,
                    *args,
                    **kwargs,
                )
                all_sliced_batches.append(sliced_batches)
                all_slice_memberships.append(slice_membership)
                return batch

            dataset = dataset.map(
                _map_fn,
                batched=True,
                batch_size=batch_size,
                # FIXME(karan): enable this by adding logic for generating
                #  all_sliced_batches and all_slice_memberships
                #  when loading from cache file
                load_from_cache_file=False,
                # The cache file name is a XOR of the interaction history and the
                # current operation
                cache_file_name=str(dataset.logdir /
                                    ("cache-" + str(abs(val)) + ".arrow")),
            )

            # Remove the first entry (see _map_fn)
            all_sliced_batches = all_sliced_batches[1:]
            all_slice_memberships = all_slice_memberships[1:]

        except:  # noqa
            # Batch the dataset, and process each batch
            all_batches, all_sliced_batches, all_slice_memberships = zip(*[
                self.process_batch(
                    batch=batch,
                    columns=columns,
                    mask=mask,
                    store_compressed=store_compressed,
                    store=store,
                    *args,
                    **kwargs,
                ) for batch in dataset.batch(batch_size)
            ])

            # Update the dataset efficiently by reusing all_batches
            dataset = dataset.map(
                lambda examples, indices: all_batches[indices[0] // batch_size
                                                      ],
                batched=True,
                batch_size=batch_size,
                with_indices=True,
                load_from_cache_file=False,
                # The cache file name is a XOR of the interaction history and the
                # current operation
                cache_file_name=str(dataset.logdir /
                                    ("cache-" + str(abs(val)) + ".arrow")),
            )

        # Create a single slice label matrix
        slice_membership = np.concatenate(all_slice_memberships, axis=0)

        slice_cache_hashes = []
        for identifier in self.identifiers:
            slice_cache_hashes.append(val ^ persistent_hash(str(identifier)))

        if not num_proc or num_proc == 1:
            # Construct slices
            slices = []
            for i, slice_batches in enumerate(zip(*all_sliced_batches)):
                slices.append(
                    create_slice((
                        dataset,
                        slice_membership,
                        slice_batches,
                        i,
                        batch_size,
                        slice_cache_hashes[i],
                    )))
        else:
            # Parallelized slice construction
            with Pool(num_proc) as pool:
                slices = pool.map(
                    create_slice,
                    [(
                        dataset,
                        slice_membership,
                        slice_batches,
                        i,
                        batch_size,
                        slice_cache_hashes[i],
                    )
                     for i, slice_batches in enumerate(zip(
                         *all_sliced_batches))],
                )

        # TODO(karan): make this more systematic
        # TODO(karan): fix bug when slicing a Slice
        for i, sl in enumerate(slices):
            # # Set the Slice features
            # sl.info.features = dataset.features

            # Set the Slice category using the SliceBuilder's category
            sl.category = self.category

            # Create the lineage
            sl.lineage = [
                (str(Dataset.__name__), dataset.identifier),
                (
                    str(self.category.capitalize()),
                    self.identifiers[i],
                    strings_as_json(columns),
                ),
            ]
            if isinstance(dataset, Slice):
                # Prepend the Slice's lineage instead, if the dataset was a slice
                sl.lineage = dataset.lineage + [(
                    str(self.category.capitalize()),
                    self.identifiers[i],
                    strings_as_json(columns),
                )]

        return dataset, slices, slice_membership
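A standalone sketch of how the cache-file hash above is composed, using a stand-in `persistent_hash_sketch` based on md5; the real `persistent_hash` utility and `hash_interactions()` are assumed, not reproduced here.

import hashlib

def persistent_hash_sketch(s: str) -> int:
    # Stable across interpreter runs, unlike the built-in hash()
    return int(hashlib.md5(s.encode("utf-8")).hexdigest(), 16)

# XOR together the dataset state and every unmasked (identifier, columns) pair
val = persistent_hash_sketch("dataset-identifier")  # stands in for dataset state
for identifier, masked in [("SliceBuilderA", 0), ("SliceBuilderB", 1)]:
    if not masked:
        val ^= persistent_hash_sketch(identifier + '["text"]')

cache_file_name = "cache-" + str(abs(val)) + ".arrow"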
Example #12
    def prepare_dataset(
        self,
        dataset: Dataset,
        columns: List[str],
        batch_size: int = 32,
        mask: List[int] = None,
        store_compressed: bool = True,
        store: bool = True,
        *args,
        **kwargs,
    ) -> Dataset:

        # Default to an all-zeros mask (no slices masked out) if none is given
        if mask is None:
            mask = [0] * len(self.identifiers)

        # Compute the hash for this operation
        # FIXME(karan): this is repeated inside process_dataset
        val = persistent_hash(str(
            dataset.identifier)) ^ dataset.hash_interactions()
        for i, identifier in enumerate(self.identifiers):
            if not mask[i]:
                val ^= persistent_hash(
                    str(identifier) + str(strings_as_json(columns)))

        try:
            return dataset.map(
                partial(
                    self.prepare_batch,
                    columns=columns,
                    mask=mask,
                    store_compressed=store_compressed,
                    store=store,
                    *args,
                    **kwargs,
                ),
                batched=True,
                batch_size=batch_size,
                load_from_cache_file=False,
                cache_file_name=str(
                    dataset.logdir /
                    ("cache-" + str(abs(val)) + "-prep.arrow")),
            )
        except:  # TypeError or PicklingError or AttributeError: # noqa
            # Batch the dataset, and process each batch
            all_batches = [
                self.prepare_batch(
                    batch=batch,
                    columns=columns,
                    mask=mask,
                    store_compressed=store_compressed,
                    store=store,
                    *args,
                    **kwargs,
                ) for batch in dataset.batch(batch_size)
            ]

            # Update the dataset efficiently by reusing all_batches
            return dataset.map(
                lambda examples, indices: all_batches[indices[0] // batch_size
                                                      ],
                batched=True,
                batch_size=batch_size,
                with_indices=True,
                load_from_cache_file=False,
                cache_file_name=str(
                    dataset.logdir /
                    ("cache-" + str(abs(val)) + "-prep.arrow")),
            )
Example #13
    def process_dataset(
        self,
        dataset: Dataset,
        columns: List[str],
        batch_size: int = 32,
        # mask: List[int] = None,
        num_proc: int = None,
        *args,
        **kwargs,
    ) -> Tuple[List[Slice], np.ndarray]:

        # Create slices using the dataset
        slices = [Slice(dataset) for _ in range(len(self.identifiers))]
        all_slice_memberships = []
        # Batch the dataset, and process each batch
        for batch in dataset.batch(batch_size):
            # Process the batch
            _, slice_memberships = self.process_batch(
                batch=batch,
                columns=columns,
                *args,
                **kwargs,
            )

            # Keep track of the slice memberships
            all_slice_memberships.append(slice_memberships)

        # Create a single slice label matrix
        slice_membership = np.concatenate(all_slice_memberships, axis=0)

        for i, sl in enumerate(slices):
            # Set the visible rows for each slice
            sl.set_visible_rows(np.where(slice_membership[:, i])[0])

            # Set the Slice category using the SliceBuilder's category
            sl.category = self.category

            # Append to the lineage
            sl.add_to_lineage(
                category=str(self.category.capitalize()),
                identifier=self.identifiers[i],
                columns=strings_as_json(columns),
            )

            # # Create the lineage
            # sl.lineage = [
            #     (str(Dataset.__name__), dataset.identifier),
            #     (
            #         str(self.category.capitalize()),
            #         self.identifiers[i],
            #         strings_as_json(columns),
            #     ),
            # ]
            # if isinstance(dataset, Slice):
            #     # Prepend the Slice's lineage instead, if the dataset was a slice
            #     sl.lineage = dataset.lineage + [
            #         (
            #             str(self.category.capitalize()),
            #             self.identifiers[i],
            #             strings_as_json(columns),
            #         )
            #     ]

        return slices, slice_membership
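A small numpy illustration of the slice-membership matrix used above: rows are examples, columns are slices, and the visible rows of slice i are the indices where column i is nonzero. The values are made up.

import numpy as np

slice_membership = np.array([
    [1, 0],
    [0, 1],
    [1, 1],
])

for i in range(slice_membership.shape[1]):
    visible_rows = np.where(slice_membership[:, i])[0]
    print(f"slice {i}: rows {visible_rows.tolist()}")
# slice 0: rows [0, 2]
# slice 1: rows [1, 2]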
Example #14
    def process_dataset(
        self,
        dp: DataPanel,
        columns: List[str],
        batch_size: int = 32,
        num_proc: int = None,
        *args,
        **kwargs,
    ) -> Tuple[List[DataPanel], np.ndarray]:
        """Apply a SliceBuilder to a dataset.

        Args:
            dp: DataPanel
            columns: list of columns
            batch_size: integer batch size
            num_proc: num processes for multiprocessing
            *args: optional additional arguments
            **kwargs: optional additional keyword arguments

        Returns: tuple of (list of Slices, matrix of (example, slice)
        membership)
        """
        # Create slices
        slices = [[DataPanel()] for _ in range(len(self.identifiers))]
        all_slice_memberships = []

        # Batch the dataset, and process each batch
        for batch in dp.batch(batch_size):
            # Process the batch
            sliced_batches, slice_memberships = self.process_batch(
                dp=batch,
                columns=columns,
                *args,
                **kwargs,
            )

            # Incrementally build the slices
            for sl, sl_batch in zip(slices, sliced_batches):
                sl.append(DataPanel(sl_batch))

            # Keep track of the slice memberships
            all_slice_memberships.append(slice_memberships)

        # Create a single slice label matrix
        slice_membership = np.concatenate(all_slice_memberships, axis=0)

        # Create a single DataPanel for each slice
        slices = [
            meerkat.concat(e[1:], axis=0) if len(e) > 1 else e[0]
            for e in slices
        ]

        # TODO(karan): DataPanel doesn't support this
        for i, sl in enumerate(slices):
            # Set the Slice category using the SliceBuilder's category
            sl.category = self.category

            # Append to the lineage
            sl.add_to_lineage(
                category=str(self.category.capitalize()),
                identifier=self.identifiers[i],
                columns=strings_as_json(columns),
            )

        return slices, slice_membership
Example #15
    def process_dataset(
        self,
        dataset: Dataset,
        columns: List[str],
        batch_size: int = 32,
        # mask: List[int] = None,
        num_proc: int = None,
        *args,
        **kwargs,
    ) -> Tuple[List[Slice], np.ndarray]:
        """Apply a SliceBuilder to a dataset.

        Args:
            dataset: Dataset
            columns: list of columns
            batch_size: integer batch size
            # mask: boolean or integer mask array, mask[i] = True means that the ith
            # slice will be masked out
            num_proc: num processes for multiprocessing
            *args: optional additional arguments
            **kwargs: optional additional keyword arguments

        Returns: tuple of (list of Slices, matrix of (example, slice)
        membership)
        """

        # # Compute a hash
        # val = persistent_hash(str(dataset.identifier)) ^ dataset.hash_interactions()
        # for i, identifier in enumerate(self.identifiers):
        #     if not mask[i]:
        #         val ^= persistent_hash(str(identifier)
        #         + str(strings_as_json(columns)))

        # try:
        #     # Map the SliceBuilder over the dataset
        #     all_sliced_batches = []
        #     all_slice_memberships = []
        #
        #     def _map_fn(batch):
        #         """Map function for processing batches.
        #
        #         Note that using this map_fn in a stateful way is
        #         dangerous, since every invocation of this function
        #         appends to the all_slice_batches list. The .map()
        #         function will invoke this once for testing before
        #         performing the map, so we discard the first entry
        #         inserted into all_sliced_batches.
        #         """
        #         batch, sliced_batches, slice_membership = self.process_batch(
        #             batch=batch,
        #             columns=columns,
        #             mask=mask,
        #             store_compressed=store_compressed,
        #             store=store,
        #             *args,
        #             **kwargs,
        #         )
        #         all_sliced_batches.append(sliced_batches)
        #         all_slice_memberships.append(slice_membership)
        #         return batch
        #
        #     dataset = dataset.map(
        #         _map_fn,
        #         batched=True,
        #         batch_size=batch_size,
        #         # FIXME(karan): enable this by adding logic for generating
        #         #  all_sliced_batches and all_slice_memberships
        #         #  when loading from cache file
        #         load_from_cache_file=False,
        #         # The cache file name is a XOR of the interaction history and the
        #         # current operation
        #         cache_file_name=str(
        #             dataset.logdir / ("cache-" + str(abs(val)) + ".arrow")
        #         ),
        #     )
        #
        #     # Remove the first entry (see _map_fn)
        #     all_sliced_batches = all_sliced_batches[1:]
        #     all_slice_memberships = all_slice_memberships[1:]
        #
        # except:  # noqa
        # all_batches, all_sliced_batches, all_slice_memberships = zip(
        #     *[
        #         self.process_batch(
        #             batch=batch,
        #             columns=columns,
        #             mask=mask,
        #             store_compressed=store_compressed,
        #             store=store,
        #             *args,
        #             **kwargs,
        #         )
        #         for batch in dataset.batch(batch_size)
        #     ]
        # )
        # # Update the dataset efficiently by reusing all_batches
        # dataset = dataset.map(
        #     lambda examples, indices: all_batches[indices[0] // batch_size],
        #     batched=True,
        #     batch_size=batch_size,
        #     with_indices=True,
        #     load_from_cache_file=False,
        #     # The cache file name is a XOR of the interaction history and the
        #     # current operation
        #     cache_file_name=str(
        #         dataset.logdir / ("cache-" + str(abs(val)) + ".arrow")
        #     ),
        # )

        # Create slices
        slices = [Slice() for _ in range(len(self.identifiers))]
        all_slice_memberships = []
        # Batch the dataset, and process each batch
        for batch in dataset.batch(batch_size):
            # Process the batch
            sliced_batches, slice_memberships = self.process_batch(
                batch=batch,
                columns=columns,
                *args,
                **kwargs,
            )

            # Incrementally build the slices
            for sl, sl_batch in zip(slices, sliced_batches):
                sl._dataset.append(sl_batch)

            # Keep track of the slice memberships
            all_slice_memberships.append(slice_memberships)

        # Create a single slice label matrix
        slice_membership = np.concatenate(all_slice_memberships, axis=0)

        # slice_cache_hashes = []
        # for identifier in self.identifiers:
        #     slice_cache_hashes.append(val ^ persistent_hash(str(identifier)))

        # if not num_proc or num_proc == 1:
        #     # Construct slices
        #     slices = []
        #     for i, slice_batches in enumerate(zip(*all_sliced_batches)):
        #         slices.append(
        #             create_slice(
        #                 (
        #                     dataset,
        #                     slice_membership,
        #                     slice_batches,
        #                     i,
        #                     batch_size,
        #                     slice_cache_hashes[i],
        #                 )
        #             )
        #         )
        # else:
        #     # Parallelized slice construction
        #     with Pool(num_proc) as pool:
        #         slices = pool.map(
        #             create_slice,
        #             [
        #                 (
        #                     dataset,
        #                     slice_membership,
        #                     slice_batches,
        #                     i,
        #                     batch_size,
        #                     slice_cache_hashes[i],
        #                 )
        #                 for i, slice_batches in enumerate(zip(*all_sliced_batches))
        #             ],
        #         )

        for i, sl in enumerate(slices):
            # Set the Slice category using the SliceBuilder's category
            sl.category = self.category

            # Append to the lineage
            sl.add_to_lineage(
                category=str(self.category.capitalize()),
                identifier=self.identifiers[i],
                columns=strings_as_json(columns),
            )

            # # Create the lineage
            # sl.lineage = [
            #     (str(Dataset.__name__), dataset.identifier),
            #     (
            #         str(self.category.capitalize()),
            #         self.identifiers[i],
            #         strings_as_json(columns),
            #     ),
            # ]
            # if isinstance(dataset, Slice):
            #     # Prepend the Slice's lineage instead, if the dataset was a slice
            #     sl.lineage = dataset.lineage + [
            #         (
            #             str(self.category.capitalize()),
            #             self.identifiers[i],
            #             strings_as_json(columns),
            #         )
            #     ]

        return slices, slice_membership