def filter_extracts(extracts: types.Extracts) -> types.Extracts:  # pylint: disable=invalid-name
    """Returns the entries of `extracts` selected by `include` / `exclude`.

    `include` and `exclude` are not parameters; they are resolved from the
    scope surrounding this function. `include` takes precedence when both
    are set.

    Args:
        extracts: Mapping of extract keys to values.

    Returns:
        A new dict holding only the included keys when `include` is truthy,
        a new dict without the excluded keys when only `exclude` is truthy,
        and the original `extracts` object (not a copy) when neither is set.
    """
    if include:
        return {
            key: value for key, value in extracts.items() if key in include
        }
    if exclude:
        return {
            key: value
            for key, value in extracts.items()
            if key not in exclude
        }
    # Nothing to filter on: hand back the input untouched.
    return extracts
def process(self, element: types.Extracts
            ) -> List[Tuple[SliceKeyType, types.Extracts]]:
    """Pairs each slice key of `element` with the filtered extracts.

    Args:
        element: Extracts containing an iterable of slice keys under
            `constants.SLICE_KEY_TYPES_KEY`.

    Returns:
        One `(slice_key, filtered_extracts)` tuple per slice key. Every tuple
        references the same filtered dict object.
    """
    keep_key = self._key_filter_fn  # Bound once; invoked for every key.
    shared_extracts = {}
    for name, value in element.items():
        if keep_key(name):
            shared_extracts[name] = value

    fanned_out = [
        (slice_key, shared_extracts)
        for slice_key in element.get(constants.SLICE_KEY_TYPES_KEY)
    ]

    # Both metrics are fed the number of (slice_key, extracts) pairs emitted.
    emitted = len(fanned_out)
    self._num_slices_generated_per_instance.update(emitted)
    self._post_slice_num_instances.inc(emitted)
    return fanned_out
def process(self, element: types.Extracts) -> Sequence[types.Extracts]:
    """Processes a batch, retrying row by row if the whole batch fails.

    The full batch is first handed to `self._batch_reducible_process`. If that
    raises `ValueError` or `tf.errors.InvalidArgumentError`, a warning is
    logged and the batch is re-run one row at a time.

    Args:
        element: Batched extracts. Must contain a record batch exposing
            `num_rows` and `slice` under `constants.ARROW_RECORD_BATCH_KEY`;
            every other value is indexed by row position in the fallback.

    Returns:
        The output of `_batch_reducible_process` for the whole batch, or the
        concatenation of the per-row outputs when the fallback path is taken.
    """
    record_batch = element[constants.ARROW_RECORD_BATCH_KEY]
    batch_size = record_batch.num_rows
    try:
        batched_output = self._batch_reducible_process(element)
        # Metrics are only recorded once the batched call has succeeded.
        self._batch_size.update(batch_size)
        self._num_instances.inc(batch_size)
        return batched_output
    except (ValueError, tf.errors.InvalidArgumentError) as e:
        logging.warning(
            'Large batch_size %s failed with error %s. '
            'Attempting to run batch through serially. Note that this will '
            'significantly affect the performance.', batch_size, e)
        self._batch_size_failed.update(batch_size)

        serial_output = []
        for row in range(batch_size):
            self._batch_size.update(1)
            # Build a batch of exactly one row: slice the record batch and
            # wrap the row's entry of every other value in a list.
            single_row = {
                key: (record_batch.slice(row, 1)
                      if key == constants.ARROW_RECORD_BATCH_KEY
                      else [value[row]])
                for key, value in element.items()
            }
            serial_output.extend(self._batch_reducible_process(single_row))
        self._num_instances.inc(len(serial_output))
        return serial_output
def process(self, element: types.Extracts) -> List[types.Extracts]:
    """Attaches the slice keys computed from the FPL features to a copy.

    Args:
        element: Extracts holding a `types.FeaturesPredictionsLabels` under
            `constants.FEATURES_PREDICTIONS_LABELS_KEY`.

    Returns:
        A single-element list with a shallow copy of `element` that also has
        the slice keys under `constants.SLICE_KEY_TYPES_KEY` and, when
        `self._materialize` is set, their UTF-8 encoded string forms under
        `constants.SLICE_KEYS_KEY`.

    Raises:
        RuntimeError: If the FPL entry is missing or falsy.
        TypeError: If the FPL entry is not a `FeaturesPredictionsLabels`.
    """
    fpl = element.get(constants.FEATURES_PREDICTIONS_LABELS_KEY)
    if not fpl:
        raise RuntimeError(
            'FPL missing, Please ensure Predict() was called.')
    if not isinstance(fpl, types.FeaturesPredictionsLabels):
        raise TypeError(
            'Expected FPL to be instance of FeaturesPredictionsLabel. FPL was: '
            '%s of type %s' % (str(fpl), type(fpl)))

    slice_keys = list(
        slicer.get_slices_for_features_dict(fpl.features, self._slice_spec))

    # Work on a shallow copy so the incoming element is left unmodified.
    sliced_element = copy.copy(element)
    sliced_element[constants.SLICE_KEY_TYPES_KEY] = slice_keys

    if self._materialize:
        # Stringified slice keys, to be materialized to the output table.
        encoded_keys = [
            slicer.stringify_slice_key(slice_key).encode('utf-8')
            for slice_key in slice_keys
        ]
        sliced_element[constants.SLICE_KEYS_KEY] = types.MaterializedColumn(
            name=constants.SLICE_KEYS_KEY, value=encoded_keys)
    return [sliced_element]
def _convert_legacy_fpl(
    extracts: types.Extracts,
    example_weight_key: Union[Text, Dict[Text, Text]]) -> types.Extracts:
    """Converts from legacy FPL types to features, labels, predictions.

    Args:
        extracts: Extracts that may contain a legacy FPL entry under
            `constants.FEATURES_PREDICTIONS_LABELS_KEY`.
        example_weight_key: Either a single feature name, or a dict whose
            values are feature names, identifying the example weights. When
            falsy the weights default to `np.array([1.0])`.

    Returns:
        `extracts` itself when it has no FPL entry. Otherwise a shallow copy
        with the FPL entry removed and features, predictions, labels and
        example weights stored under their own keys.
    """
    if constants.FEATURES_PREDICTIONS_LABELS_KEY not in extracts:
        return extracts

    def strip_node(nested):
        # Replace each inner dict by its first value.
        return {
            name: list(inner.values())[0] for name, inner in nested.items()
        }

    def strip_batch(value):
        # Drop a leading dimension of size 1 from values of rank > 1.
        if len(value.shape) > 1 and value.shape[0] == 1:
            return value[0]
        return value

    def strip_batches(mapping):
        return {name: strip_batch(value) for name, value in mapping.items()}

    def strip_default_key(mapping):
        # A single-entry dict collapses to its only value.
        if len(mapping) == 1:
            return list(mapping.values())[0]
        return mapping

    converted = copy.copy(extracts)
    fpl = converted.pop(constants.FEATURES_PREDICTIONS_LABELS_KEY)
    features = strip_node(fpl.features)

    if not example_weight_key:
        example_weights = np.array([1.0])
    elif isinstance(example_weight_key, dict):
        example_weights = {
            output_name: strip_batch(features[feature_name])
            for output_name, feature_name in example_weight_key.items()
        }
    else:
        example_weights = strip_batch(features[example_weight_key])

    labels = strip_default_key(strip_batches(strip_node(fpl.labels)))
    predictions = strip_default_key(
        strip_batches(strip_node(fpl.predictions)))

    converted[constants.FEATURES_KEY] = features
    converted[constants.PREDICTIONS_KEY] = predictions
    converted[constants.LABELS_KEY] = labels
    converted[constants.EXAMPLE_WEIGHTS_KEY] = example_weights
    return converted
def _ExtractUnbatchedInputs(
    batched_extract: types.Extracts) -> Sequence[types.Extracts]:
    """Extract features, predictions, labels and weights from batched extract.

    Every key except `constants.ARROW_RECORD_BATCH_KEY` becomes a column of a
    pandas DataFrame, which is then split into one dict per row.

    Args:
        batched_extract: Extracts whose values are batched column-wise. The
            record batch entry is optional.

    Returns:
        A list with one dict per row, keyed by the retained extract keys.
    """
    keys_to_retain = set(batched_extract.keys())
    # `discard` (unlike `remove`) tolerates extracts that carry no record
    # batch, so such inputs no longer fail with a KeyError.
    keys_to_retain.discard(constants.ARROW_RECORD_BATCH_KEY)
    dataframe = pd.DataFrame()
    for key in keys_to_retain:
        dataframe[key] = batched_extract[key]
    return dataframe.to_dict(orient='records')
def process(
    self, element: types.Extracts
) -> List[Tuple[SliceKeyType, types.Extracts]]:
    """Pairs each distinct slice key of `element` with the filtered extracts.

    Args:
        element: Extracts holding the slice keys under
            `constants.SLICE_KEY_TYPES_KEY`.

    Returns:
        One `(slice_key, filtered_extracts)` tuple per distinct slice key.
        Every tuple references the same filtered dict object.
    """
    keep_key = self._key_filter_fn  # Bound once; invoked for every key.
    shared_extracts = {
        name: value for name, value in element.items() if keep_key(name)
    }

    raw_keys = element.get(constants.SLICE_KEY_TYPES_KEY)
    # The query based evaluator groups slices from multiple examples. Rows of
    # variable length arrive as a VarLenTensorValue and rows of fixed length
    # as a 2D np.ndarray; both are reduced to a flat sequence first.
    if isinstance(raw_keys, types.VarLenTensorValue):
        raw_keys = raw_keys.values
    elif isinstance(raw_keys, np.ndarray) and raw_keys.ndim == 2:
        raw_keys = raw_keys.flatten()

    # Deduplicate so a slice shared by several examples is counted once.
    distinct_keys = set(raw_keys)
    fanned_out = [(slice_key, shared_extracts) for slice_key in distinct_keys]

    emitted = len(fanned_out)
    self._num_slices_generated_per_instance.update(emitted)
    self._post_slice_num_instances.inc(emitted)
    return fanned_out
def process(self, extracts: types.Extracts) -> Iterable[Any]:
    """Runs every computation's preprocessor and yields the combiner inputs.

    For each entry of `self._computations`:
      * no preprocessor: a `None` placeholder is recorded and the default
        combiner input is requested;
      * a `metric_types.FeaturePreprocessor`: everything it produces is
        merged into a shared features dict, a `None` placeholder is recorded
        and the default combiner input is requested;
      * any other preprocessor: the first item it produces is recorded.

    Args:
        extracts: Extracts for one instance. Must contain
            `constants.SLICE_KEY_TYPES_KEY`.

    Yields:
        A single dict with the slice key types, the list of per-computation
        combiner inputs, and (when requested) the default combiner input
        built by `metric_util.to_standard_metric_inputs`.

    Raises:
        RuntimeError: If a custom preprocessor produces no output.
    """
    start_time = datetime.datetime.now()
    self._evaluate_num_instances.inc(1)

    use_default_combiner_input = False
    features = None
    combiner_inputs = []
    no_output = object()  # Sentinel: distinguishes "nothing" from None.
    for computation in self._computations:
        preprocessor = computation.preprocessor
        if preprocessor is None:
            combiner_inputs.append(None)
            use_default_combiner_input = True
        elif isinstance(preprocessor, metric_types.FeaturePreprocessor):
            if features is None:
                features = {}
            for produced in preprocessor.process(extracts):
                features.update(produced)
            combiner_inputs.append(None)
            use_default_combiner_input = True
        else:
            # A bare next() would raise StopIteration here, which Python
            # turns into an opaque "generator raised StopIteration"
            # RuntimeError because this method is itself a generator.
            first_output = next(preprocessor.process(extracts), no_output)
            if first_output is no_output:
                raise RuntimeError(
                    'Preprocessor %r produced no output for the given '
                    'extracts.' % (preprocessor,))
            combiner_inputs.append(first_output)

    output = {
        constants.SLICE_KEY_TYPES_KEY: extracts[constants.SLICE_KEY_TYPES_KEY],
        _COMBINER_INPUTS_KEY: combiner_inputs
    }
    if use_default_combiner_input:
        if features is not None:
            # Copy before adding features so the caller's extracts stay
            # untouched.
            extracts = copy.copy(extracts)
            extracts[constants.FEATURES_KEY] = features
        output[_DEFAULT_COMBINER_INPUT_KEY] = (
            metric_util.to_standard_metric_inputs(
                extracts, include_features=features is not None))
    yield output

    # Runs only once the consumer resumes the generator after the yield; the
    # elapsed time is truncated to whole seconds.
    self._timer.update(
        int((datetime.datetime.now() - start_time).total_seconds()))
def process(
    self, element: types.Extracts
) -> List[Tuple[SliceKeyType, types.Extracts]]:
    """Pairs each slice key of `element` with the filtered extracts.

    Args:
        element: Extracts holding the slice keys under
            `constants.SLICE_KEY_TYPES_KEY`.

    Returns:
        One `(slice_key, filtered_extracts)` tuple per slice key; the keys
        are deduplicated first when `_is_multi_dim_keys` reports them as
        multi-dimensional. Every tuple references the same filtered dict.
    """
    keep_key = self._key_filter_fn  # Bound once; invoked for every key.
    shared_extracts = {
        name: value for name, value in element.items() if keep_key(name)
    }

    slice_keys = element.get(constants.SLICE_KEY_TYPES_KEY)
    # The query based evaluator groups slices into a multi-dimensional array
    # with an extra dimension for the examples matching the query key, so the
    # keys have to be flattened and deduplicated.
    if _is_multi_dim_keys(slice_keys):
        keys_array = np.array(slice_keys)
        deduped = set(keys_array.flatten())
        if not deduped and keys_array.shape:
            # An array holding only the empty overall slice flattens to
            # nothing, so the overall slice is restored explicitly.
            deduped.add(())
        slice_keys = deduped

    fanned_out = [(slice_key, shared_extracts) for slice_key in slice_keys]
    emitted = len(fanned_out)
    self._num_slices_generated_per_instance.update(emitted)
    self._post_slice_num_instances.inc(emitted)
    return fanned_out
def merge_lists(target: types.Extracts) -> types.Extracts:
    """Converts target's leaves which are lists to batched np.array's, etc.

    Mappings are traversed recursively, exactly once per node. Leaves are
    converted according to the type of their first element:
      * sparse tensor values are expanded by a leading dimension and
        concatenated with `tf.sparse.concat`;
      * ragged tensor values are expanded by a leading dimension and
        concatenated with `tf.concat`;
      * anything else (including an empty leaf) goes through `np.array`.

    Args:
        target: A mapping to traverse, or a leaf sequence to convert.

    Returns:
        A new dict mirroring the structure of `target` with converted leaves,
        or the converted value when `target` is itself a leaf.

    Raises:
        RuntimeError: If converting the value of a mapping key fails; the
            original exception is chained as the cause.
    """
    if isinstance(target, Mapping):
        result = {}
        for key, value in target.items():
            try:
                result[key] = merge_lists(value)
            except Exception as e:
                raise RuntimeError(
                    'Failed to convert value for key "{}"'.format(
                        key)) from e
        # Return the dict built above rather than converting every value a
        # second time, which doubled the work at each nesting level.
        return result
    elif target and isinstance(
            target[0],
            (tf.compat.v1.SparseTensorValue, types.SparseTensorValue)):
        merged = tf.sparse.concat(0, [
            tf.sparse.expand_dims(to_tensorflow_tensor(item), 0)
            for item in target
        ])
        return to_tensor_value(merged)
    elif target and isinstance(target[0], types.RaggedTensorValue):
        merged = tf.concat(
            [tf.expand_dims(to_tensorflow_tensor(item), 0) for item in target],
            0)
        return to_tensor_value(merged)
    else:
        arr = np.array(target)
        # Flatten values that were originally single item lists into a single
        # list, e.g. [[1], [2], [3]] -> [1, 2, 3]
        if len(arr.shape) == 2 and arr.shape[1] == 1:
            return arr.squeeze(axis=1)
        # Special case for empty slice arrays since numpy treats empty tuples
        # as arrays with dimension 0, e.g. [[()], [()], [()]] -> [(), (), ()]
        if len(arr.shape) == 3 and arr.shape[1] == 1 and arr.shape[2] == 0:
            return arr.squeeze(axis=1)
        return arr
def get_fpl_copy(extracts: types.Extracts) -> types.FeaturesPredictionsLabels:
    """Returns a new FPL built from the one stored in `extracts`.

    Only `features` is shallow-copied; `labels`, `predictions` and
    `input_ref` are carried over by reference.

    Args:
        extracts: Extracts holding an FPL under
            `constants.FEATURES_PREDICTIONS_LABELS_KEY`.

    Returns:
        A fresh `types.FeaturesPredictionsLabels` instance.

    Raises:
        RuntimeError: If the FPL entry is missing or falsy.
    """
    source_fpl = extracts.get(constants.FEATURES_PREDICTIONS_LABELS_KEY)
    if not source_fpl:
        raise RuntimeError('FPL missing, Please ensure _Predict() was called.')

    # A new FPL tuple is built (instead of reusing the stored one) so the
    # original is never mutated, which Beam disallows.
    copied_features = copy.copy(source_fpl.features)
    return types.FeaturesPredictionsLabels(
        features=copied_features,
        labels=source_fpl.labels,
        predictions=source_fpl.predictions,
        input_ref=source_fpl.input_ref)
def merge_lists(target: types.Extracts) -> types.Extracts:
    """Converts target's leaves which are lists to batched np.array's, etc.

    Mappings are traversed recursively, exactly once per node. Leaves are
    converted as follows:
      * sparse tensor values are expanded by a leading dimension and
        concatenated with `tf.compat.v1.sparse_concat`;
      * ragged tensor values are expanded by a leading dimension and
        concatenated with `tf.concat`;
      * np.ndarrays of differing shapes become a `types.VarLenTensorValue`;
      * anything else (including an empty leaf) goes through `np.array`.

    Args:
        target: A mapping to traverse, or a leaf sequence to convert.

    Returns:
        A new dict mirroring the structure of `target` with converted leaves,
        or the converted value when `target` is itself a leaf.

    Raises:
        RuntimeError: If converting the value of a mapping key fails; the
            original exception is chained as the cause.
    """
    if isinstance(target, Mapping):
        result = {}
        for key, value in target.items():
            try:
                result[key] = merge_lists(value)
            except Exception as e:
                raise RuntimeError(
                    'Failed to convert value for key "{}"'.format(
                        key)) from e
        # Return the dict built above rather than converting every value a
        # second time, which doubled the work at each nesting level.
        return result
    elif target and isinstance(
            target[0],
            (tf.compat.v1.SparseTensorValue, types.SparseTensorValue)):
        merged = tf.compat.v1.sparse_concat(
            0,
            [
                tf.sparse.expand_dims(to_tensorflow_tensor(item), 0)
                for item in target
            ],
            expand_nonconcat_dim=True)
        return to_tensor_value(merged)
    elif target and isinstance(target[0], types.RaggedTensorValue):
        merged = tf.concat(
            [tf.expand_dims(to_tensorflow_tensor(item), 0) for item in target],
            0)
        return to_tensor_value(merged)
    elif (all(isinstance(item, np.ndarray) for item in target) and
          len({item.shape for item in target}) > 1):
        # Rows of differing shapes cannot be stacked into one dense array.
        return types.VarLenTensorValue.from_dense_rows(target)
    else:
        arr = np.array(target)
        # Flatten values that were originally single item lists into a single
        # list, e.g. [[1], [2], [3]] -> [1, 2, 3]
        if len(arr.shape) == 2 and arr.shape[1] == 1:
            return arr.squeeze(axis=1)
        return arr
def process(self, element: types.Extracts
            ) -> Generator[Tuple[SliceKeyType, types.Extracts], None, None]:
    """Yields `(slice_key, extracts)` for every slice key of `element`.

    The slice key entries themselves are dropped from the emitted extracts
    unless `self._include_slice_keys_in_output` is set.

    Args:
        element: Extracts holding an iterable of slice keys under
            `constants.SLICE_KEY_TYPES_KEY`.

    Yields:
        One tuple per slice key. Every tuple references the same dict.
    """
    slice_key_columns = (constants.SLICE_KEY_TYPES_KEY,
                         constants.SLICE_KEYS_KEY)
    emitted_extracts = {
        key: element[key]
        for key in element
        if self._include_slice_keys_in_output or key not in slice_key_columns
    }

    slice_count = 0
    for slice_count, slice_key in enumerate(
            element.get(constants.SLICE_KEY_TYPES_KEY), start=1):
        yield (slice_key, emitted_extracts)

    # Reached only after the consumer has drained the generator.
    self._num_slices_generated_per_instance.update(slice_count)
    self._post_slice_num_instances.inc(slice_count)
def _extract_unbatched_inputs(  # pylint: disable=invalid-name
    mixed_legacy_batched_extract: types.Extracts) -> Sequence[types.Extracts]:
    """Extract features, predictions, labels and weights from batched extract.

    Values that are Python lists are treated as legacy batches and split per
    row through a pandas DataFrame. All other values are split with
    `util.split_extracts`. When both kinds are present the per-row results
    are merged, with the non-legacy entries winning on key collisions.

    Args:
        mixed_legacy_batched_extract: Batched extracts, possibly mixing both
            layouts. The record batch entry, if any, is ignored.

    Returns:
        A list with one extracts dict per row.

    Raises:
        RuntimeError: If a legacy value cannot be added to the DataFrame.
        ValueError: If the two layouts disagree on the batch size.
    """
    # TODO(mdreves): Remove record batch
    candidate_keys = set(mixed_legacy_batched_extract.keys())
    candidate_keys.discard(constants.ARROW_RECORD_BATCH_KEY)

    current_layout = {}
    legacy_frame = pd.DataFrame()
    for key in candidate_keys:
        value = mixed_legacy_batched_extract[key]
        if not isinstance(value, list):
            current_layout[key] = value
            continue
        # Previously a batch of transformed features was stored as a list of
        # dicts instead of a dict of np.arrays with batch dimensions. Those
        # legacy values are converted via the DataFrame instead.
        try:
            legacy_frame[key] = value
        except Exception as e:
            raise RuntimeError(
                f'Exception encountered while adding key {key} with '
                f'batched length {len(mixed_legacy_batched_extract[key])}'
            ) from e

    unbatched_extracts = util.split_extracts(current_layout)
    legacy_unbatched_extracts = legacy_frame.to_dict(orient='records')

    # With only one layout present there is nothing to merge.
    if not legacy_unbatched_extracts:
        return unbatched_extracts
    if not unbatched_extracts:
        return legacy_unbatched_extracts

    if len(unbatched_extracts) != len(legacy_unbatched_extracts):
        raise ValueError(
            f'Batch sizes have differing values: {len(unbatched_extracts)} != '
            f'{len(legacy_unbatched_extracts)}, '
            f'unbatched_extracts={unbatched_extracts}, '
            f'legacy_unbatched_extracts={legacy_unbatched_extracts}')

    merged_rows = []
    for current_row, legacy_row in zip(unbatched_extracts,
                                       legacy_unbatched_extracts):
        legacy_row.update(current_row)
        merged_rows.append(legacy_row)
    return merged_rows
def visit(subtree: types.Extracts, keys: List[str]):
    """Walks `subtree` depth-first and reports every non-mapping value.

    `add_to_results` is not a parameter; it is resolved from the scope
    surrounding this function.

    Args:
        subtree: The mapping to traverse.
        keys: The keys leading from the root to `subtree`. A new list is
            built for each child; `keys` itself is not modified.
    """
    for name, child in subtree.items():
        path = keys + [name]
        if isinstance(child, Mapping):
            visit(child, path)
        else:
            add_to_results(path, child)