def retrieve(
    cls,
    batch: Batch,
    columns: Union[List[str], List[List[str]]],
    proc_fns: Union[str, Callable, List[Union[str, Callable]]] = None,
    identifier: Union[str, Identifier] = None,
    reapply: bool = False,
    **kwargs,
) -> Optional[Union[Batch, List[Batch]]]:
    if not reapply:
        if "cache" not in batch:
            return None

        # Infer the most relevant key to retrieve if an identifier is not specified
        if not identifier:
            for ident_key in batch["cache"][0].keys():
                # Pick the first key that matches the cls name
                if ident_key.startswith(cls.__name__):
                    identifier = ident_key
                    break

        try:
            if isinstance(columns[0], str):
                retrieval = {
                    strings_as_json(columns): [
                        cls.decode(cache[str(identifier)][strings_as_json(columns)])
                        for cache in batch["cache"]
                    ]
                }
            else:
                retrieval = {
                    strings_as_json(cols_): [
                        cls.decode(cache[str(identifier)][strings_as_json(cols_)])
                        for cache in batch["cache"]
                    ]
                    for cols_ in columns
                }
        except KeyError:
            raise ValueError("Could not retrieve information for all keys.")

        # Check if the retrieved information needs to be processed
        if not proc_fns:
            return retrieval

        # Application of `proc_fns` to the retrieved values is elided here
        pass
    else:
        # Handling for `reapply=True` is elided here
        pass
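# Illustrative sketch (not part of the original module): the typical way a downstream
# operation reads cached values out of a batch. `MyCachedOperation` and the "text"
# column are hypothetical; the call pattern mirrors the RougeScore.retrieve and
# RougeMatrix.retrieve usages further down in this file, and a single-column result
# is keyed by the column name (see the retrieval dict built above).
def _example_retrieve_usage(batch):
    cached = MyCachedOperation.retrieve(batch=batch, columns=["text"])
    if cached is None:
        # No "cache" entry in the batch: the operation was never applied to it
        return None
    return cached["text"]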
def _construct_updates(self, encoded_outputs: List[str], columns: List[str]):
    return [
        {str(self.identifier): {strings_as_json(columns): val}}
        for val in encoded_outputs
    ]
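# Illustrative note (not part of the original module): for hypothetical
# encoded_outputs=["enc0", "enc1"] and columns=["a", "b"], `_construct_updates`
# above produces one update dict per example, keyed first by the operation's
# identifier and then by the JSON-encoded column list:
#
#     [
#         {"<identifier>": {'["a", "b"]': "enc0"}},
#         {"<identifier>": {'["a", "b"]': "enc1"}},
#     ]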
def check(self, identifier: Union[str, Identifier], columns: List[str]) -> bool:
    """Check whether an interaction is already recorded on the tape.

    Args:
        identifier: Identifier for the interaction used.
        columns: list of columns on which the interaction was applied.

    Returns:
        True if the (identifier, columns) entry is already in the history,
        False otherwise.
    """
    if not (isinstance(identifier, str) or isinstance(identifier, Identifier)):
        raise ValueError(
            f"Parameter `identifier` should be an instance of class Identifier "
            f"or str, not {type(identifier)}."
        )

    # Dump the column names to JSON
    json_columns = strings_as_json(strings=columns)

    # Check if the entry is already in the history
    if (identifier, json_columns) in self.history:
        return True
    return False
def update(self, identifier: Union[str, Identifier], columns: List[str]) -> None:
    """Update the interaction tape with information about an interaction.

    Args:
        identifier: Identifier for the interaction used.
        columns: list of columns on which the interaction was applied.

    Returns:
        None. The (identifier, columns) entry is added to the history with the
        next available index if it was not recorded before.
    """
    if isinstance(identifier, str):
        identifier = Identifier(_name=identifier)
    elif isinstance(identifier, Identifier):
        pass
    else:
        raise ValueError(
            f"Parameter `identifier` should be an instance of class Identifier "
            f"or str, not {type(identifier)}."
        )

    # Dump the column names to JSON
    json_columns = strings_as_json(strings=columns)

    # Check if the entry is not in the history
    if (identifier, json_columns) not in self.history:
        # Give it the next index
        self.history[(identifier, json_columns)] = len(self.history)
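# Illustrative sketch (not part of the original module): how `check` and `update`
# are used together around an operation. The `tape` object, the identifier name
# "MyOp", and the column list are hypothetical.
def _example_tape_usage(tape, columns):
    ident = Identifier(_name="MyOp")
    if not tape.check(identifier=ident, columns=columns):
        # ... apply the operation to `columns` here ...
        # Record on the tape that the operation has now been applied
        tape.update(identifier=ident, columns=columns)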
def process_dataset(
    self,
    dp: DataPanel,
    columns: List[str],
    batch_size: int = 32,
    num_proc: int = None,
    *args,
    **kwargs,
) -> Tuple[List[DataPanel], np.ndarray]:
    # Create slices using the dataset
    all_slice_memberships = []

    # Batch the dataset, and process each batch
    for batch in dp.batch(batch_size):
        # Process the batch
        _, slice_memberships = self.process_batch(
            dp=batch,
            columns=columns,
            *args,
            **kwargs,
        )

        # Keep track of the slice memberships
        all_slice_memberships.append(slice_memberships)

    # Create a single slice label matrix
    slice_membership = np.concatenate(all_slice_memberships, axis=0)

    slices = []
    for i in range(len(self.identifiers)):
        # Create a view of the original DataPanel
        sl = dp.view()

        # Only keep the filtered rows visible
        for column in sl._data.values():
            column.visible_rows = np.where(slice_membership[:, i])[0]

        # Set the Slice category using the SliceBuilder's category
        sl.category = self.category

        # Append to the lineage
        sl.add_to_lineage(
            category=str(self.category.capitalize()),
            identifier=self.identifiers[i],
            columns=strings_as_json(columns),
        )

        slices.append(sl)

    return slices, slice_membership
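# Illustrative sketch (not part of the original module): consuming the output of
# `process_dataset` above. The `builder` instance, the DataPanel `dp`, and the
# "text" column are hypothetical; the return signature (list of slices plus an
# (example, slice) membership matrix) is the one defined above.
def _example_process_dataset_usage(builder, dp):
    slices, membership = builder.process_dataset(dp, columns=["text"])
    # membership has shape (num_examples, num_slices); column i marks the rows of
    # `dp` that belong to slices[i]
    rows_in_first_slice = np.where(membership[:, 0])[0]
    return slices, rows_in_first_slice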
def score(
    self, batch: Dict[str, List], columns: List[str], *args, **kwargs
) -> np.ndarray:
    assert len(columns) == 2, "Must have exactly 2 columns."

    # Retrieve Rouge scores
    scores = RougeScore.retrieve(
        batch=batch,
        columns=columns,
        proc_fns=partial(RougeScore.select, metric=self.metric),
    )[strings_as_json(columns)]

    return np.array(scores)
def score(
    self, batch: Dict[str, List], columns: List[str], *args, **kwargs
) -> np.ndarray:
    assert len(columns) == 2, "Must have exactly 2 columns."

    # Retrieve the relevant Rouge matrices
    matrices = RougeMatrix.retrieve(
        batch=batch,
        columns=columns,
        proc_fns=partial(RougeMatrix.select, metric=self.metric),
    )[strings_as_json(columns)]

    return self.reduce(matrices)
def test_multicolumn(self):
    # Apply the multi-column cached operation
    dataset = self.multicol_cachedop(self.testbed.dataset, columns=["label", "z"])

    # Check that caching happens and that the cached values are correct
    self.assertTrue(
        strings_as_json(["label", "z"])
        in dataset.features["cache"][str(self.multicol_cachedop.identifier)]
    )
    self.assertEqual(
        self.multicol_cachedop.retrieve(dataset[:], columns=["label", "z"]),
        {'["label", "z"]': [0.3, 0.0, 0.4, 0.1, 0.3, 0.0]},
    )

    # Apply the single-column cached operation
    dataset = self.cachedop(dataset, columns=["label"])
    dataset = self.cachedop(dataset, columns=["z"])

    # Now recheck that everything can be retrieved correctly
    self.assertTrue(
        strings_as_json(["label", "z"])
        in dataset.features["cache"][str(self.multicol_cachedop.identifier)]
    )
    self.assertEqual(
        self.multicol_cachedop.retrieve(dataset[:], columns=["label", "z"]),
        {'["label", "z"]': [0.3, 0.0, 0.4, 0.1, 0.3, 0.0]},
    )
    self.assertEqual(
        self.cachedop.retrieve(dataset[:], columns=["label"]),
        {"label": [3.14, 3.14, 10.14, 10.14, 3.14, 3.14]},
    )
    self.assertEqual(
        self.cachedop.retrieve(dataset[:], columns=["z"]),
        {"z": [10.14, 3.14, 10.14, 3.14, 10.14, 3.14]},
    )
def apply(
    self,
    batch: DataPanel,
    columns: List[str],
    skeleton_batches: List[DataPanel],
    slice_membership: np.ndarray,
    *args,
    **kwargs,
) -> Tuple[List[DataPanel], np.ndarray]:
    assert len(columns) == 2

    # Retrieve the relevant Rouge matrices
    matrices = RougeMatrix.retrieve(
        batch=batch,
        columns=columns,
        proc_fns=partial(RougeMatrix.select, metric=self.metric),
    )[strings_as_json(columns)]

    # Find the max value along each row, and remove rows whose max value falls
    # below a threshold
    rows_to_keep = [
        (m / (m.sum(axis=0) + 1e-5)).max(axis=1) >= self.threshold for m in matrices
    ]

    # Fetch sentences for the first column
    sentences = SpacyOp.retrieve(
        batch=batch,
        columns=[columns[0]],
        proc_fns=SpacyOp.sentences,
    )[columns[0]]

    # Delete sentences
    new_sentences = [
        " ".join(np.array(sent)[rows_to_keep[i]]) for i, sent in enumerate(sentences)
    ]

    # Store the augmented text in the skeleton batches
    for i, augmented in enumerate(new_sentences):
        skeleton_batches[0][columns[0]][i] = augmented

    return skeleton_batches, slice_membership
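# Small, self-contained sketch (not part of the original module) of the row-filtering
# rule used in `apply` above: each alignment matrix is column-normalized, and a row
# (sentence) is kept only if its best normalized alignment score reaches the
# threshold. The matrix values and threshold below are made up for illustration.
def _example_rows_to_keep():
    m = np.array(
        [
            [0.90, 0.10],  # sentence 0: strongly aligned with some target sentence
            [0.05, 0.05],  # sentence 1: weakly aligned everywhere
        ]
    )
    threshold = 0.5
    keep = (m / (m.sum(axis=0) + 1e-5)).max(axis=1) >= threshold
    return keep  # array([ True, False]) for this toy matrix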
def _run_aligners(
    dataset: Dataset,
    aligners: List[CachedOperation],
    doc_column: str,
    reference_column: str,
    summary_columns: List[str] = None,
):
    if not summary_columns:
        summary_columns = []

    to_columns = []
    if reference_column is not None:
        to_columns.append(reference_column)
    to_columns.extend(summary_columns)

    for aligner in aligners:
        # Run the aligner on (document, summary) pairs
        dataset = aligner(
            dataset,
            [doc_column] + to_columns,
            # Must use `batch_size = 1`
            batch_size=1,
        )

        if reference_column is not None and len(summary_columns):
            # Run the aligner on (reference, summary) pairs
            dataset = aligner(
                dataset,
                [reference_column] + summary_columns,
                # Must use `batch_size = 1`
                batch_size=1,
            )

        if len(to_columns) > 1:
            # Instead of having one column for (document, summary) comparisons, split
            # off into (1 + |summary_columns|) total columns, one for each comparison

            # Retrieve the (document, summary) column
            doc_summary_column = aligner.retrieve(
                dataset[:],
                [doc_column] + to_columns,
            )[tuple([doc_column] + to_columns)]

            for i, col in enumerate(to_columns):
                # Add as a new column after encoding with the aligner's `encode` method
                dataset.add_column(
                    column=str(aligner.identifier(columns=[doc_column, col])),
                    values=[aligner.encode([row[i]]) for row in doc_summary_column],
                )

            # Remove the (document, summary) column
            dataset.remove_column(
                str(aligner.identifier(columns=[doc_column] + to_columns))
            )
            del dataset.interactions[CACHEDOPS].history[
                (
                    aligner.identifier,
                    strings_as_json(strings=[doc_column] + to_columns),
                )
            ]

        if reference_column is not None and len(summary_columns) > 1:
            # Instead of having one column for (reference, summary) comparisons, split
            # off into (|summary_columns|) total columns, one for each comparison

            # Retrieve the (reference, summary) column
            reference_summary_column = aligner.retrieve(
                dataset[:],
                [reference_column] + summary_columns,
            )[tuple([reference_column] + summary_columns)]

            for i, col in enumerate(summary_columns):
                # Add as a new column
                dataset.add_column(
                    column=str(aligner.identifier(columns=[reference_column, col])),
                    values=[
                        aligner.encode([row[i]]) for row in reference_summary_column
                    ],
                )

            # Remove the (reference, summary) column
            dataset.remove_column(
                str(aligner.identifier(columns=[reference_column] + summary_columns))
            )
            del dataset.interactions[CACHEDOPS].history[
                (
                    aligner.identifier,
                    strings_as_json(strings=[reference_column] + summary_columns),
                )
            ]

    return dataset
def process_dataset(
    self,
    dataset: Dataset,
    columns: List[str],
    batch_size: int = 32,
    mask: List[int] = None,
    store_compressed: bool = True,
    store: bool = True,
    num_proc: int = None,
    *args,
    **kwargs,
) -> Tuple[Dataset, List[Slice], np.ndarray]:
    """Apply a SliceBuilder to a dataset.

    Args:
        dataset: Dataset
        columns: list of columns
        batch_size: integer batch size
        mask: boolean or integer mask array, mask[i] = True means that the ith
            slice will be masked out
        store_compressed: whether to store in a compressed format
        store: whether to store the results along with the example in Dataset
        num_proc: num processes for multiprocessing
        *args: optional additional arguments
        **kwargs: optional additional keyword arguments

    Returns:
        tuple of (Dataset, list of Slices, matrix of (example, slice) membership)
    """
    # Prepare the dataset
    dataset = self.prepare_dataset(
        dataset=dataset,
        columns=columns,
        batch_size=batch_size,
        mask=mask,
        store_compressed=store_compressed,
        store=store,
        *args,
        **kwargs,
    )

    # Compute a hash
    val = persistent_hash(str(dataset.identifier)) ^ dataset.hash_interactions()
    for i, identifier in enumerate(self.identifiers):
        if not mask[i]:
            val ^= persistent_hash(str(identifier) + str(strings_as_json(columns)))

    try:
        # Map the SliceBuilder over the dataset
        all_sliced_batches = []
        all_slice_memberships = []

        def _map_fn(batch):
            """Map function for processing batches.

            Note that using this map_fn in a stateful way is dangerous, since every
            invocation of this function appends to the all_slice_batches list. The
            .map() function will invoke this once for testing before performing the
            map, so we discard the first entry inserted into all_sliced_batches.
            """
            batch, sliced_batches, slice_membership = self.process_batch(
                batch=batch,
                columns=columns,
                mask=mask,
                store_compressed=store_compressed,
                store=store,
                *args,
                **kwargs,
            )
            all_sliced_batches.append(sliced_batches)
            all_slice_memberships.append(slice_membership)
            return batch

        dataset = dataset.map(
            _map_fn,
            batched=True,
            batch_size=batch_size,
            # FIXME(karan): enable this by adding logic for generating
            #  all_sliced_batches and all_slice_memberships
            #  when loading from cache file
            load_from_cache_file=False,
            # The cache file name is a XOR of the interaction history and the
            # current operation
            cache_file_name=str(
                dataset.logdir / ("cache-" + str(abs(val)) + ".arrow")
            ),
        )

        # Remove the first entry (see _map_fn)
        all_sliced_batches = all_sliced_batches[1:]
        all_slice_memberships = all_slice_memberships[1:]

    except:  # noqa
        # Batch the dataset, and process each batch
        all_batches, all_sliced_batches, all_slice_memberships = zip(
            *[
                self.process_batch(
                    batch=batch,
                    columns=columns,
                    mask=mask,
                    store_compressed=store_compressed,
                    store=store,
                    *args,
                    **kwargs,
                )
                for batch in dataset.batch(batch_size)
            ]
        )

        # Update the dataset efficiently by reusing all_batches
        dataset = dataset.map(
            lambda examples, indices: all_batches[indices[0] // batch_size],
            batched=True,
            batch_size=batch_size,
            with_indices=True,
            load_from_cache_file=False,
            # The cache file name is a XOR of the interaction history and the
            # current operation
            cache_file_name=str(
                dataset.logdir / ("cache-" + str(abs(val)) + ".arrow")
            ),
        )

    # Create a single slice label matrix
    slice_membership = np.concatenate(all_slice_memberships, axis=0)

    slice_cache_hashes = []
    for identifier in self.identifiers:
        slice_cache_hashes.append(val ^ persistent_hash(str(identifier)))

    if not num_proc or num_proc == 1:
        # Construct slices
        slices = []
        for i, slice_batches in enumerate(zip(*all_sliced_batches)):
            slices.append(
                create_slice(
                    (
                        dataset,
                        slice_membership,
                        slice_batches,
                        i,
                        batch_size,
                        slice_cache_hashes[i],
                    )
                )
            )
    else:
        # Parallelized slice construction
        with Pool(num_proc) as pool:
            slices = pool.map(
                create_slice,
                [
                    (
                        dataset,
                        slice_membership,
                        slice_batches,
                        i,
                        batch_size,
                        slice_cache_hashes[i],
                    )
                    for i, slice_batches in enumerate(zip(*all_sliced_batches))
                ],
            )

    # TODO(karan): make this more systematic
    # TODO(karan): fix bug when slicing a Slice
    for i, sl in enumerate(slices):
        # # Set the Slice features
        # sl.info.features = dataset.features

        # Set the Slice category using the SliceBuilder's category
        sl.category = self.category

        # Create the lineage
        sl.lineage = [
            (str(Dataset.__name__), dataset.identifier),
            (
                str(self.category.capitalize()),
                self.identifiers[i],
                strings_as_json(columns),
            ),
        ]
        if isinstance(dataset, Slice):
            # Prepend the Slice's lineage instead, if the dataset was a slice
            sl.lineage = dataset.lineage + [
                (
                    str(self.category.capitalize()),
                    self.identifiers[i],
                    strings_as_json(columns),
                )
            ]

    return dataset, slices, slice_membership
def prepare_dataset(
    self,
    dataset: Dataset,
    columns: List[str],
    batch_size: int = 32,
    mask: List[int] = None,
    store_compressed: bool = True,
    store: bool = True,
    *args,
    **kwargs,
) -> Dataset:
    # Compute the hash for this operation
    # FIXME(karan): this is repeated inside process_dataset
    val = persistent_hash(str(dataset.identifier)) ^ dataset.hash_interactions()
    for i, identifier in enumerate(self.identifiers):
        if not mask[i]:
            val ^= persistent_hash(str(identifier) + str(strings_as_json(columns)))

    try:
        return dataset.map(
            partial(
                self.prepare_batch,
                columns=columns,
                mask=mask,
                store_compressed=store_compressed,
                store=store,
                *args,
                **kwargs,
            ),
            batched=True,
            batch_size=batch_size,
            load_from_cache_file=False,
            cache_file_name=str(
                dataset.logdir / ("cache-" + str(abs(val)) + "-prep.arrow")
            ),
        )
    except:  # TypeError or PicklingError or AttributeError  # noqa
        # Batch the dataset, and process each batch
        all_batches = [
            self.prepare_batch(
                batch=batch,
                columns=columns,
                mask=mask,
                store_compressed=store_compressed,
                store=store,
                *args,
                **kwargs,
            )
            for batch in dataset.batch(batch_size)
        ]

        # Update the dataset efficiently by reusing all_batches
        return dataset.map(
            lambda examples, indices: all_batches[indices[0] // batch_size],
            batched=True,
            batch_size=batch_size,
            with_indices=True,
            load_from_cache_file=False,
            cache_file_name=str(
                dataset.logdir / ("cache-" + str(abs(val)) + "-prep.arrow")
            ),
        )
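# Small, self-contained sketch (not part of the original module) of the cache-key
# scheme used in `prepare_dataset` and `process_dataset` above: a dataset-level hash
# is XORed with a hash of each unmasked (identifier, columns) pair, and the result
# names the Arrow cache file. `_toy_persistent_hash` is a stand-in for the library's
# `persistent_hash`, used here only to make the sketch runnable on its own.
import hashlib


def _toy_persistent_hash(s: str) -> int:
    return int(hashlib.md5(s.encode("utf-8")).hexdigest(), 16)


def _example_cache_file_name(dataset_key: str, identifiers, columns_json: str) -> str:
    val = _toy_persistent_hash(dataset_key)
    for identifier in identifiers:
        # XOR-combining makes the key independent of the order of the identifiers
        val ^= _toy_persistent_hash(str(identifier) + columns_json)
    return "cache-" + str(abs(val)) + "-prep.arrow"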
def process_dataset(
    self,
    dataset: Dataset,
    columns: List[str],
    batch_size: int = 32,
    # mask: List[int] = None,
    num_proc: int = None,
    *args,
    **kwargs,
) -> Tuple[List[Slice], np.ndarray]:
    # Create slices using the dataset
    slices = [Slice(dataset) for _ in range(len(self.identifiers))]
    all_slice_memberships = []

    # Batch the dataset, and process each batch
    for batch in dataset.batch(batch_size):
        # Process the batch
        _, slice_memberships = self.process_batch(
            batch=batch,
            columns=columns,
            *args,
            **kwargs,
        )

        # Keep track of the slice memberships
        all_slice_memberships.append(slice_memberships)

    # Create a single slice label matrix
    slice_membership = np.concatenate(all_slice_memberships, axis=0)

    for i, sl in enumerate(slices):
        # Set the visible rows for each slice
        sl.set_visible_rows(np.where(slice_membership[:, i])[0])

        # Set the Slice category using the SliceBuilder's category
        sl.category = self.category

        # Append to the lineage
        sl.add_to_lineage(
            category=str(self.category.capitalize()),
            identifier=self.identifiers[i],
            columns=strings_as_json(columns),
        )

    return slices, slice_membership
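# Small, self-contained sketch (not part of the original module) of how per-batch
# slice memberships are assembled into a single label matrix, as in `process_dataset`
# above. The toy batches below contain made-up memberships for two slices.
def _example_membership_matrix():
    per_batch = [
        np.array([[1, 0], [0, 1]]),  # batch 1: two examples, two slices
        np.array([[1, 1], [0, 0]]),  # batch 2: two more examples
    ]
    slice_membership = np.concatenate(per_batch, axis=0)  # shape (4, 2)
    # Row indices assigned to slice 0
    return np.where(slice_membership[:, 0])[0]  # array([0, 2])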
def process_dataset(
    self,
    dp: DataPanel,
    columns: List[str],
    batch_size: int = 32,
    num_proc: int = None,
    *args,
    **kwargs,
) -> Tuple[List[DataPanel], np.ndarray]:
    """Apply a SliceBuilder to a dataset.

    Args:
        dp: DataPanel
        columns: list of columns
        batch_size: integer batch size
        num_proc: num processes for multiprocessing
        *args: optional additional arguments
        **kwargs: optional additional keyword arguments

    Returns:
        tuple of (list of Slices, matrix of (example, slice) membership)
    """
    # Create slices
    slices = [[DataPanel()] for _ in range(len(self.identifiers))]
    all_slice_memberships = []

    # Batch the dataset, and process each batch
    for batch in dp.batch(batch_size):
        # Process the batch
        sliced_batches, slice_memberships = self.process_batch(
            dp=batch,
            columns=columns,
            *args,
            **kwargs,
        )

        # Incrementally build the slices
        for sl, sl_batch in zip(slices, sliced_batches):
            sl.append(DataPanel(sl_batch))

        # Keep track of the slice memberships
        all_slice_memberships.append(slice_memberships)

    # Create a single slice label matrix
    slice_membership = np.concatenate(all_slice_memberships, axis=0)

    # Create a single DataPanel for each slice
    slices = [
        meerkat.concat(e[1:], axis=0) if len(e) > 1 else e[0] for e in slices
    ]

    # TODO(karan): DataPanel doesn't support this
    for i, sl in enumerate(slices):
        # Set the Slice category using the SliceBuilder's category
        sl.category = self.category

        # Append to the lineage
        sl.add_to_lineage(
            category=str(self.category.capitalize()),
            identifier=self.identifiers[i],
            columns=strings_as_json(columns),
        )

    return slices, slice_membership
def process_dataset(
    self,
    dataset: Dataset,
    columns: List[str],
    batch_size: int = 32,
    # mask: List[int] = None,
    num_proc: int = None,
    *args,
    **kwargs,
) -> Tuple[List[Slice], np.ndarray]:
    """Apply a SliceBuilder to a dataset.

    Args:
        dataset: Dataset
        columns: list of columns
        batch_size: integer batch size
        num_proc: num processes for multiprocessing
        *args: optional additional arguments
        **kwargs: optional additional keyword arguments

    Returns:
        tuple of (list of Slices, matrix of (example, slice) membership)
    """
    # Create slices
    slices = [Slice() for _ in range(len(self.identifiers))]
    all_slice_memberships = []

    # Batch the dataset, and process each batch
    for batch in dataset.batch(batch_size):
        # Process the batch
        sliced_batches, slice_memberships = self.process_batch(
            batch=batch,
            columns=columns,
            *args,
            **kwargs,
        )

        # Incrementally build the slices
        for sl, sl_batch in zip(slices, sliced_batches):
            sl._dataset.append(sl_batch)

        # Keep track of the slice memberships
        all_slice_memberships.append(slice_memberships)

    # Create a single slice label matrix
    slice_membership = np.concatenate(all_slice_memberships, axis=0)

    for i, sl in enumerate(slices):
        # Set the Slice category using the SliceBuilder's category
        sl.category = self.category

        # Append to the lineage
        sl.add_to_lineage(
            category=str(self.category.capitalize()),
            identifier=self.identifiers[i],
            columns=strings_as_json(columns),
        )

    return slices, slice_membership