def test_list_lengths(self, list_type_factory):
  """Checks ListLengthsFromListArray on empty, dense and null-containing input.

  Args:
    list_type_factory: Callable that maps a value type to the list type used
      to build the input arrays.
  """

  def check_lengths(list_array, expected_lengths):
    # The result is compared against an int64 array of the expected lengths.
    actual = array_util.ListLengthsFromListArray(list_array)
    self.assertTrue(
        actual.equals(pa.array(expected_lengths, type=pa.int64())))

  # Empty input yields an empty result.
  check_lengths(pa.array([], type=list_type_factory(pa.int64())), [])
  # An empty list has length 0.
  check_lengths(
      pa.array([[1., 2.], [], [3.]], type=list_type_factory(pa.float32())),
      [2, 0, 1])
  # A null list is also expected to report length 0.
  check_lengths(
      pa.array([[1., 2.], None, [3.]], type=list_type_factory(pa.float64())),
      [2, 0, 1])
def test_list_lengths(self):
  """Checks ListLengthsFromListArray on empty, dense and null-containing input."""

  def check_lengths(list_array, expected_lengths):
    # The result is compared against an int32 array of the expected lengths.
    actual = array_util.ListLengthsFromListArray(list_array)
    self.assertTrue(
        actual.equals(pa.array(expected_lengths, type=pa.int32())))

  # Empty input yields an empty result.
  check_lengths(pa.array([], type=pa.list_(pa.int64())), [])
  # An empty list has length 0.
  check_lengths(pa.array([[1., 2.], [], [3.]]), [2, 0, 1])
  # A null list is also expected to report length 0.
  check_lengths(pa.array([[1., 2.], None, [3.]]), [2, 0, 1])
def get_broadcastable_column(input_table: pa.Table,
                             column_name: Text) -> pa.Array:
  """Fetches a column and verifies it holds exactly one value per example.

  Only the first chunk of the column is read.

  Args:
    input_table: Table to read the column from.
    column_name: Name of the column to fetch. Every list in the column must
      have length 1.

  Returns:
    The flattened first chunk of the requested column.

  Raises:
    ValueError: If looking up the column raises KeyError, or if any list in
      the column does not have exactly one value. Note that no check of the
      value type is performed here.
  """
  try:
    first_chunk = input_table.column(column_name).data.chunk(0)
  except KeyError:
    raise ValueError(
        'Column "{}" not present in the input table.'.format(column_name))

  # The single-value check has to happen on the list array, before flattening.
  lengths = array_util.ListLengthsFromListArray(first_chunk).to_numpy()
  if np.all(lengths == 1):
    return first_chunk.flatten()
  raise ValueError(
      'Column "{}" must have exactly one value in each example.'.format(
          column_name))
def list_lengths(self, path: types.FeaturePath) -> np.ndarray:
  """Returns the per-row list lengths of the array at `path`, with caching.

  When the path cannot be found (the lookup raises KeyError) or the array is
  of null type, an array of zeros with one entry per table row is returned.

  Args:
    path: The feature path whose list lengths are requested.

  Returns:
    An ndarray of list lengths. For a missing path or a null-typed array it
    has `self._table.num_rows` zero entries.

  Raises:
    ValueError: If the referenced array is neither a list array nor null.
  """
  cache_key = ('list_lengths({})', path)
  if cache_key in self._cache:
    return self._cache[cache_key]

  try:
    column, _ = arrow_util.get_array(
        self._table, path, broadcast_column_name=None)
    if pa.types.is_null(column.type):
      # A null-typed column is treated like a missing one.
      result = np.full(self._table.num_rows, 0)
    elif pa.types.is_list(column.type):
      result = np.asarray(array_util.ListLengthsFromListArray(column))
    else:
      raise ValueError('Can only compute list lengths on list arrays, found '
                       '{}'.format(column.type))
  except KeyError:
    # Path not present in the table.
    result = np.full(self._table.num_rows, 0)

  self._cache[cache_key] = result
  return result
def _flatten_and_impute(
    examples_table: pa.Table, categorical_features: Set[types.FeaturePath]
) -> Dict[types.FeaturePath, np.ndarray]:
  """Flattens and imputes the values in the input Arrow table.

  Replaces missing values with CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE for
  categorical features and 10*max(feature_values) for numeric features. We
  impute missing values with an extreme value that is far from observed values
  so it does not incorrectly impact KNN results. 10*max(feature_values) is used
  instead of sys.max_float because max_float is large enough to cause
  unexpected float arithmetic errors. If a numeric feature has no non-NaN
  value at all, sys.maxsize is used as the fill value.

  Args:
    examples_table: Arrow table containing a batch of examples where all
      features are univalent.
    categorical_features: Set of categorical feature names.

  Returns:
    A Dict[FeaturePath, np.ndarray] where the key is the feature path and the
    value is a 1D numpy array (one entry per row) of the feature values.
  """
  num_rows = examples_table.num_rows
  result = {}
  for feature_column in examples_table.itercolumns():
    feature_path = types.FeaturePath([feature_column.name])
    # Assume we have only a single chunk.
    feature_array = feature_column.data.chunk(0)
    # The converted array may be read-only. Create a copy as we will be
    # imputing the NaN values.
    non_missing_values = np.copy(
        arrow_util.primitive_array_to_numpy(feature_array.flatten()))
    non_missing_parent_indices = arrow_util.primitive_array_to_numpy(
        array_util.GetFlattenedArrayParentIndices(feature_array))
    is_categorical_feature = feature_path in categorical_features
    result_dtype = non_missing_values.dtype
    if non_missing_parent_indices.size < num_rows and is_categorical_feature:
      # Use the builtin `object`: the `np.object` alias was removed in
      # NumPy 1.24.
      result_dtype = object
    flattened_array = np.ndarray(shape=num_rows, dtype=result_dtype)
    num_values = arrow_util.primitive_array_to_numpy(
        array_util.ListLengthsFromListArray(feature_array))
    missing_parent_indices = np.where(num_values == 0)[0]
    if is_categorical_feature:
      imputation_fill_value = CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
    else:
      # Also impute any NaN values.
      nan_mask = np.isnan(non_missing_values)
      imputation_fill_value = sys.maxsize
      if not np.all(nan_mask):
        imputation_fill_value = non_missing_values[~nan_mask].max() * 10
      non_missing_values[nan_mask.nonzero()[0]] = imputation_fill_value
    flattened_array[non_missing_parent_indices] = non_missing_values
    # Test the number of missing rows, not the truthiness of the indices:
    # `.any()` is False when row 0 is the only missing row, which would leave
    # that slot of the uninitialized output array unfilled.
    if missing_parent_indices.size > 0:
      flattened_array[missing_parent_indices] = imputation_fill_value
    result[feature_path] = flattened_array
  return result
def update(self,
           feature_path: types.FeaturePath,
           feature_array: pa.Array,
           feature_type: types.FeatureNameStatisticsType,
           num_values_quantiles_combiner: Any,
           weights: Optional[np.ndarray] = None) -> None:
  """Update the partial common statistics using the input value.

  Args:
    feature_path: Path of the feature; used only in error messages.
    feature_array: List array holding the feature values of one batch.
    feature_type: Type of the feature in this batch, or None if it cannot be
      deduced.
    num_values_quantiles_combiner: Object whose `add_input` folds the
      per-example value counts into `self.num_values_summary`.
    weights: Optional per-example weights; must have one entry per example.

  Raises:
    TypeError: If `feature_type` conflicts with the type seen previously.
    ValueError: If `weights` does not have one entry per example.
  """
  # All the values in this column is null and we cannot deduce the type of
  # the feature. This is not an error as this feature might have some values
  # in other batches.
  if feature_type is None:
    return

  if self.type is None:
    self.type = feature_type
  elif self.type != feature_type:
    raise TypeError('Cannot determine the type of feature %s. '
                    'Found values of types %s and %s.' %
                    (feature_path, self.type, feature_type))

  # np.max / np.min below cannot handle empty arrays. And there's nothing
  # we can collect in this case.
  if not feature_array:
    return

  num_values = arrow_util.primitive_array_to_numpy(
      array_util.ListLengthsFromListArray(feature_array))
  # `np.bool_` rather than `np.bool`: the latter alias was removed in
  # NumPy 1.24 (and only reintroduced in 2.0).
  none_mask = arrow_util.primitive_array_to_numpy(
      array_util.GetArrayNullBitmapAsByteArray(feature_array)).view(np.bool_)

  self.num_non_missing += len(feature_array) - feature_array.null_count
  num_values_not_none = num_values[~none_mask]
  # We do this check to avoid failing in np.min/max with empty array.
  if num_values_not_none.size == 0:
    return
  # Use np.maximum.reduce(num_values_not_none, initial=self.max_num_values)
  # once we upgrade to numpy 1.16
  self.max_num_values = max(np.max(num_values_not_none), self.max_num_values)
  self.min_num_values = min(np.min(num_values_not_none), self.min_num_values)
  self.total_num_values += np.sum(num_values_not_none)
  self.num_values_summary = num_values_quantiles_combiner.add_input(
      self.num_values_summary, [num_values_not_none])

  if weights is not None:
    if weights.size != num_values.size:
      raise ValueError('Weight feature must not be missing.')
    self.weighted_total_num_values += np.sum(num_values * weights)
    self.weighted_num_non_missing += np.sum(weights[~none_mask])
def update(self,
           feature_path: types.FeaturePath,
           feature_array: pa.Array,
           feature_type: types.FeatureNameStatisticsType,
           make_quantiles_sketch_fn: Callable[[], sketches.QuantilesSketch],
           weights: Optional[np.ndarray] = None) -> None:
  """Update the partial common statistics using the input value.

  Presence and valency statistics are tracked separately for every nest
  level of `feature_array`; the array (and the weights, if given) is
  flattened one level at a time.

  Args:
    feature_path: Path of the feature; used only in error messages.
    feature_array: Possibly nested list array holding one batch of values.
    feature_type: Type of the feature in this batch, or None if unknown.
    make_quantiles_sketch_fn: Factory passed to each newly created
      `_PresenceAndValencyStats`.
    weights: Optional weights, one per element of `feature_array`.

  Raises:
    TypeError: If `feature_type` conflicts with the type seen previously.
    ValueError: If the nest level differs from the one seen previously.
  """
  if self.type is None:
    self.type = feature_type  # pytype: disable=annotation-type-mismatch
  elif feature_type is not None and self.type != feature_type:
    raise TypeError('Cannot determine the type of feature %s. '
                    'Found values of types %s and %s.' %
                    (feature_path, self.type, feature_type))

  nest_level = arrow_util.get_nest_level(feature_array.type)
  if self.presence_and_valency_stats is None:
    self.presence_and_valency_stats = [
        _PresenceAndValencyStats(make_quantiles_sketch_fn)
        for _ in range(nest_level)
    ]
  elif nest_level != len(self.presence_and_valency_stats):
    raise ValueError('Inconsistent nestedness in feature {}: {} vs {}'.format(
        feature_path, nest_level, len(self.presence_and_valency_stats)))

  # And there's nothing we can collect in this case.
  if not feature_array:
    return

  level = 0
  while arrow_util.is_list_like(feature_array.type):
    # `np.bool_` rather than `np.bool`: the latter alias was removed in
    # NumPy 1.24 (and only reintroduced in 2.0).
    presence_mask = ~np.asarray(
        array_util.GetArrayNullBitmapAsByteArray(feature_array)).view(
            np.bool_)
    num_values = np.asarray(
        array_util.ListLengthsFromListArray(feature_array))
    num_values_not_none = num_values[presence_mask]
    self.presence_and_valency_stats[level].update(feature_array,
                                                  presence_mask, num_values,
                                                  num_values_not_none,
                                                  weights)
    flattened = feature_array.flatten()
    if weights is not None:
      # Each flattened value inherits the weight of its parent list.
      parent_indices = array_util.GetFlattenedArrayParentIndices(
          feature_array).to_numpy()
      weights = weights[parent_indices]
    feature_array = flattened
    level += 1
def _get_univalent_values_with_parent_indices(
    self, examples_table: pa.Table) -> Dict[types.FeatureName, pd.DataFrame]:
  """Collects, per numeric feature, the values of rows holding a single value.

  Args:
    examples_table: Arrow table whose columns are features; each column is
      asserted to consist of exactly one chunk.

  Returns:
    A dict from feature name to a DataFrame with two columns: the feature
    values (named after the feature) and 'parent_index', the row each value
    came from. Only rows with exactly one value are kept, and NaN values of
    FLOAT features are dropped. Features without any univalent row are
    omitted.
  """
  skipped_types = (statistics_pb2.FeatureNameStatistics.STRING,
                   statistics_pb2.FeatureNameStatistics.STRUCT)
  frames = {}
  for column in examples_table.itercolumns():
    name = column.name
    wanted = self._features_needed
    if wanted is not None and name not in wanted:
      continue
    kind = stats_util.get_feature_type_from_arrow_type(name, column.type)
    # Only consider crosses of numeric features.
    # TODO(zhuo): Support numeric features nested under structs.
    if kind in skipped_types:
      continue
    # Assume we have only a single chunk.
    assert column.data.num_chunks == 1
    chunk = column.data.chunk(0)
    lengths = arrow_util.primitive_array_to_numpy(
        array_util.ListLengthsFromListArray(chunk))
    univalent_rows = set((lengths == 1).nonzero()[0])
    # Nothing to report for a feature that has no univalent rows.
    if not univalent_rows:
      continue
    values = arrow_util.primitive_array_to_numpy(chunk.flatten())
    parents = arrow_util.primitive_array_to_numpy(
        array_util.GetFlattenedArrayParentIndices(chunk))
    if kind == statistics_pb2.FeatureNameStatistics.FLOAT:
      # Remove any NaN values if present.
      keep = ~np.isnan(values)
      values = values[keep]
      parents = parents[keep]
    frame = pd.DataFrame({name: values, 'parent_index': parents})
    # Only keep the univalent feature values.
    frames[name] = frame[frame['parent_index'].isin(univalent_rows)]
  return frames
def get_weight_feature(input_record_batch: pa.RecordBatch,
                       weight_column: Text) -> np.ndarray:
  """Extracts the per-example weights from a record batch.

  Args:
    input_record_batch: Record batch to read the weights from.
    weight_column: Name of the column holding the weights.

  Returns:
    A numpy array with one weight per example of the record batch.

  Raises:
    ValueError: If the weight column is absent from the schema, is of null
      type, does not hold exactly one value per example, or its values are
      neither floating point nor integer.
  """
  column_index = input_record_batch.schema.get_field_index(weight_column)
  if column_index < 0:
    raise ValueError('Weight column "{}" not present in the input '
                     'record batch.'.format(weight_column))

  weight_lists = input_record_batch.column(column_index)
  if pa.types.is_null(weight_lists.type):
    raise ValueError(
        'Weight column "{}" cannot be null.'.format(weight_column))

  # The one-value-per-example check must run on the list array, i.e. before
  # flattening.
  values_per_example = array_util.ListLengthsFromListArray(
      weight_lists).to_numpy()
  if not np.all(values_per_example == 1):
    raise ValueError(
        'Weight column "{}" must have exactly one value in each example.'.
        format(weight_column))

  flat = weight_lists.flatten()
  # String and binary arrays cannot be converted to a numpy view, so the type
  # is validated first.
  value_type = flat.type
  is_numeric = (pa.types.is_floating(value_type) or
                pa.types.is_integer(value_type))
  if not is_numeric:
    raise ValueError(
        'Weight column "{}" must be of numeric type. Found {}.'.format(
            weight_column, value_type))
  return np.asarray(flat)
def _get_univalent_values_with_parent_indices(
    self, examples: pa.RecordBatch) -> Dict[types.FeatureName, DataFrame]:
  """Collects, per numeric feature, the values of rows holding a single value.

  Args:
    examples: Arrow record batch whose columns are features.

  Returns:
    A dict from feature name to a DataFrame with two columns: the feature
    values (named after the feature) and 'parent_index', the row each value
    came from. Only rows with exactly one value are kept, and NaN values of
    FLOAT features are dropped. Features of unknown, STRING or STRUCT type
    and features without any univalent row are omitted.
  """
  skipped_types = (None, statistics_pb2.FeatureNameStatistics.STRING,
                   statistics_pb2.FeatureNameStatistics.STRUCT)
  frames = {}
  for name, column in zip(examples.schema.names, examples.columns):
    wanted = self._features_needed
    if wanted is not None and name not in wanted:
      continue
    kind = stats_util.get_feature_type_from_arrow_type(name, column.type)
    # Only consider crosses of numeric features.
    # TODO(zhuo): Support numeric features nested under structs.
    if kind in skipped_types:
      continue
    lengths = np.asarray(array_util.ListLengthsFromListArray(column))
    univalent_rows = set((lengths == 1).nonzero()[0])
    # Nothing to report for a feature that has no univalent rows.
    if not univalent_rows:
      continue
    flat_values, parents = arrow_util.flatten_nested(column, True)
    values = np.asarray(flat_values)
    if kind == statistics_pb2.FeatureNameStatistics.FLOAT:
      # Remove any NaN values if present.
      keep = ~np.isnan(values)
      values = values[keep]
      parents = parents[keep]
    frame = pd.DataFrame({name: values, 'parent_index': parents})
    # Only keep the univalent feature values.
    frames[name] = frame[frame['parent_index'].isin(univalent_rows)]
  return frames
def _get_weight_feature(input_table: pa.Table,
                        weight_feature: Text) -> np.ndarray:
  """Extracts the per-example weights from a table.

  Only the first chunk of the weight column is read.

  Args:
    input_table: Table to read the weights from.
    weight_feature: Name of the weight feature.

  Returns:
    A numpy array with one weight per example.

  Raises:
    ValueError: If looking up the weight feature raises KeyError, if it does
      not hold exactly one value per example, or if its values are of string
      or binary type.
  """
  try:
    weight_lists = input_table.column(weight_feature).data.chunk(0)
  except KeyError:
    raise ValueError('Weight feature "{}" not present in the input '
                     'table.'.format(weight_feature))

  # The one-value-per-example check must run on the list array, i.e. before
  # flattening.
  values_per_example = array_util.ListLengthsFromListArray(
      weight_lists).to_numpy()
  if not np.all(values_per_example == 1):
    raise ValueError(
        'Weight feature "{}" must have exactly one value in each example.'.
        format(weight_feature))

  flat = weight_lists.flatten()
  # String and binary arrays cannot be converted to a numpy view, so reject
  # them up front.
  value_type = flat.type
  non_numeric = (pa.types.is_string(value_type) or
                 pa.types.is_binary(value_type))
  if non_numeric:
    raise ValueError(
        'Weight feature "{}" must be of numeric type. Found {}.'.format(
            weight_feature, value_type))
  return flat.to_numpy()
def add_input(
    self, accumulator: Dict[types.FeaturePath, _PartialSparseFeatureStats],
    input_table: pa.Table
) -> Dict[types.FeaturePath, _PartialSparseFeatureStats]:
  """Returns result of folding a batch of inputs into the current accumulator.

  For every sparse feature in `self._sparse_feature_component_paths`, this
  computes for the batch: the number of examples missing the value feature,
  the number of examples missing each index feature, and the min / max
  difference between each index feature's list length and the value feature's
  list length. Examples in which every component of the sparse feature is
  missing are excluded from all of these. The per-batch result is stored in
  (or added with `+=` to) the accumulator entry of the sparse feature.

  Args:
    accumulator: The current accumulator. It is updated in place.
    input_table: An Arrow Table whose columns are features and rows are
      examples.

  Returns:
    The accumulator after updating the statistics for the batch of inputs
    (the same dict object that was passed in).
  """
  # Both dicts are keyed by feature path and hold one entry per example.
  feature_value_list_lengths = dict()
  feature_is_missing = dict()
  batch_example_count = input_table.num_rows
  # Do a single pass through the input table to determine the value list
  # lengths and whether the feature is missing for every feature
  # that is an index or value feature in any sparse feature in the schema.
  for feature_path, leaf_array, _ in arrow_util.enumerate_arrays(
      input_table, weight_column=None, enumerate_leaves_only=True):
    if (feature_path in self._all_index_feature_paths or
        feature_path in self._all_value_feature_paths):
      if pa.types.is_null(leaf_array.type):
        # If the column is a NullArray, it is missing from the entire batch
        # (missing features have value list lengths of 0).
        feature_value_list_lengths[feature_path] = np.full(
            batch_example_count, 0)
        feature_is_missing[feature_path] = np.full(
            batch_example_count, True)
      else:
        feature_value_list_lengths[feature_path] = np.asarray(
            array_util.ListLengthsFromListArray(leaf_array))
        # NOTE(review): presumably nonzero where the list is null -- confirm
        # against GetArrayNullBitmapAsByteArray.
        feature_is_missing[feature_path] = np.asarray(
            array_util.GetArrayNullBitmapAsByteArray(leaf_array))

  # Now create a partial sparse feature stats object for each sparse feature
  # using the value list lengths and feature missing information collected
  # above.
  for feature_path in self._sparse_feature_component_paths:
    value_feature_path = self._sparse_feature_component_paths[
        feature_path].value_feature
    index_feature_paths = self._sparse_feature_component_paths[
        feature_path].index_features

    # Create a filter identifying examples in which the entire sparse feature
    # is missing since those examples should not be included in counting
    # missing counts or length differences.
    # Components that were not seen in the batch at all count as missing in
    # every example (the `.get` default).
    component_features_missing = np.array([
        feature_is_missing.get(path, np.full(batch_example_count, True))
        for path in itertools.chain([value_feature_path], index_feature_paths)
    ])
    entire_sparse_feature_missing = np.all(component_features_missing, axis=0)
    num_examples_missing_sparse_feature = np.sum(
        entire_sparse_feature_missing)

    # If all examples in the batch are missing the sparse feature, do not
    # update the accumulator with the partial stats for that sparse feature.
    if num_examples_missing_sparse_feature == batch_example_count:
      continue

    is_missing_value_feature = feature_is_missing.get(
        value_feature_path)
    # If this batch does not have the value feature at all,
    # missing_value_count is the number of examples in the batch.
    # Also populate the value list lengths for the value feature with all 0s
    # since a missing feature is considered to have a value list length of 0.
    if is_missing_value_feature is None:
      missing_value_count = batch_example_count
      feature_value_list_lengths[value_feature_path] = np.full(
          batch_example_count, 0)
    else:
      missing_value_count = np.sum(is_missing_value_feature)
    # Do not include examples that are entirely missing the sparse feature in
    # the missing value count.
    missing_value_count -= num_examples_missing_sparse_feature

    # Per-index-feature results for this sparse feature, keyed by the index
    # feature path.
    missing_index_counts = collections.Counter()
    min_length_diff = dict()
    max_length_diff = dict()
    for index_feature_path in index_feature_paths:
      is_missing_index_feature = feature_is_missing.get(
          index_feature_path)
      if is_missing_index_feature is None:
        # If this batch does not have this index feature at all,
        # missing_index_count for that index feature is the number of
        # examples in the batch.
        missing_index_count = batch_example_count
        # Populate the value list lengths for the index feature with all 0s
        # since a missing feature is considered to have a value list length of
        # 0.
        feature_value_list_lengths[index_feature_path] = np.full(
            batch_example_count, 0)
      else:
        missing_index_count = np.sum(is_missing_index_feature)
      # Do not include examples that are entirely missing the sparse feature
      # in the missing index count.
      missing_index_counts[index_feature_path] = (
          missing_index_count - num_examples_missing_sparse_feature)

      # Difference is index length minus value length, per example.
      length_differences = np.subtract(
          feature_value_list_lengths[index_feature_path],
          feature_value_list_lengths[value_feature_path])
      # Do not include examples that are entirely missing the sparse feature
      # in determining the min and max length differences.
      filtered_length_differences = length_differences[
          ~entire_sparse_feature_missing]
      # This generator should not get to this point if the current sparse
      # feature is missing from all examples in the batch (which would cause
      # filtered_length_differences to be empty).
      assert filtered_length_differences.size != 0
      min_length_diff[index_feature_path] = np.min(
          filtered_length_differences)
      max_length_diff[index_feature_path] = np.max(
          filtered_length_differences)

    stats_for_feature = _PartialSparseFeatureStats(
        missing_value_count, missing_index_counts, min_length_diff,
        max_length_diff)
    # Merge with what earlier batches contributed for this sparse feature.
    existing_stats_for_feature = accumulator.get(feature_path)
    if existing_stats_for_feature is None:
      accumulator[feature_path] = stats_for_feature
    else:
      accumulator[feature_path] += stats_for_feature
  return accumulator
def _flatten_and_impute(
    examples: pa.RecordBatch, categorical_features: Set[types.FeaturePath]
) -> Dict[types.FeaturePath, np.ndarray]:
  """Flattens and imputes the values in the input Arrow RecordBatch.

  Replaces missing values with CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE for
  categorical features and 10*max(feature_values) for numeric features. We
  impute missing values with an extreme value that is far from observed values
  so it does not incorrectly impact KNN results. 10*max(feature_values) is used
  instead of sys.max_float because max_float is large enough to cause
  unexpected float arithmetic errors. If a numeric feature has no non-NaN
  value at all (including a null-typed column), sys.maxsize is used as the
  fill value.

  Args:
    examples: Arrow RecordBatch containing a batch of examples where all
      features are univalent.
    categorical_features: Set of categorical feature names.

  Returns:
    A Dict[FeaturePath, np.ndarray] where the key is the feature path and the
    value is a 1D numpy array (one entry per row) of the feature values.
  """
  num_rows = examples.num_rows
  result = {}
  for column_name, feature_array in zip(examples.schema.names,
                                        examples.columns):
    feature_path = types.FeaturePath([column_name])
    is_categorical_feature = feature_path in categorical_features
    imputation_fill_value = (
        CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
        if is_categorical_feature else sys.maxsize)
    if pa.types.is_null(feature_array.type):
      # If null array, impute all values.
      result[feature_path] = np.full(
          shape=num_rows, fill_value=imputation_fill_value)
      continue

    flattened_values, non_missing_parent_indices = arrow_util.flatten_nested(
        feature_array, return_parent_indices=True)
    assert non_missing_parent_indices is not None
    # The converted array may be read-only. Create a copy as we will be
    # imputing the NaN values.
    non_missing_values = np.copy(np.asarray(flattened_values))
    result_dtype = non_missing_values.dtype
    if non_missing_parent_indices.size < num_rows and is_categorical_feature:
      # Use the builtin `object`: the `np.object` alias was removed in
      # NumPy 1.24.
      result_dtype = object
    flattened_array = np.ndarray(shape=num_rows, dtype=result_dtype)
    num_values = np.asarray(
        array_util.ListLengthsFromListArray(feature_array))
    missing_parent_indices = np.where(num_values == 0)[0]
    if not is_categorical_feature:
      # Also impute any NaN values.
      nan_mask = np.isnan(non_missing_values)
      if not np.all(nan_mask):
        imputation_fill_value = non_missing_values[~nan_mask].max() * 10
      non_missing_values[nan_mask.nonzero()[0]] = imputation_fill_value
    flattened_array[non_missing_parent_indices] = non_missing_values
    # Test the number of missing rows, not the truthiness of the indices:
    # `.any()` is False when row 0 is the only missing row, which would leave
    # that slot of the uninitialized output array unfilled.
    if missing_parent_indices.size > 0:
      flattened_array[missing_parent_indices] = imputation_fill_value
    result[feature_path] = flattened_array
  return result