def _remove_unsupported_feature_columns(
    examples_table: pa.Table, schema: schema_pb2.Schema) -> pa.Table:
  """Removes feature columns that contain unsupported values.

  All feature columns that are multivalent are dropped since they are
  not supported by sk-learn.

  All columns of STRUCT type are also dropped.

  Args:
    examples_table: Arrow table containing a batch of examples.
    schema: The schema for the data.

  Returns:
    Arrow table.
  """
  multivalent_features = schema_util.get_multivalent_features(schema)
  unsupported_columns = set()
  for f in multivalent_features:
    unsupported_columns.add(f.steps()[0])
  for column_name, column in zip(examples_table.schema.names,
                                 examples_table.itercolumns()):
    if (stats_util.get_feature_type_from_arrow_type(
        types.FeaturePath([column_name]),
        column.type) == statistics_pb2.FeatureNameStatistics.STRUCT):
      unsupported_columns.add(column_name)
  return examples_table.drop(unsupported_columns)
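# Illustrative sketch (an assumption, not part of the module above): the kinds
# of columns the helper drops, shown with plain pyarrow. The names and values
# are made up; pa.Table.drop takes an iterable of column names and returns a
# new table.
import pyarrow as pa

_example = pa.table({
    "univalent": pa.array([[1.0], [2.0], [3.0]]),         # kept: one value per row
    "multivalent": pa.array([[1, 2], [3], []]),           # dropped: multivalent feature
    "nested": pa.array([{"a": 1}, {"a": 2}, {"a": 3}]),   # dropped: STRUCT type
})
assert _example.drop(["multivalent", "nested"]).column_names == ["univalent"]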
def find_nonnull_table_mask(table: pa.Table) -> pa.Array:
    """Return a boolean mask that is True for rows with no null in any column.

    Assumes every column of `table` holds exactly one chunk.
    """
    mask = pa.array(np.ones(table.num_rows), pa.bool_())
    for column in table.itercolumns():
        mask = pa.compute.and_(mask, column.chunks[0].is_valid())
    return mask
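# Usage sketch (an assumption, not taken from the original module): combining
# the mask with pa.Table.filter (available in recent pyarrow) to keep only the
# rows that are non-null in every column.
import pyarrow as pa

_t = pa.table({"a": [1, None, 3], "b": ["x", "y", None]})
_mask = find_nonnull_table_mask(_t)   # [True, False, False]
_nonnull_rows = _t.filter(_mask)      # keeps only row 0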
def _filter_table(arrow_table: pa.Table, params: Dict[str, Any]) -> pa.Table:
    if not params["condition"]:
        return arrow_table

    if params["keep"]:
        condition = params["condition"]
    else:
        condition = {"operation": "not", "condition": params["condition"]}

    mask = condition_to_mask(arrow_table, condition)  # or raise ConditionError
    return pa.table(
        {
            name: _filter_column(column, mask)
            for name, column in zip(
                arrow_table.column_names, arrow_table.itercolumns()
            )
        }
    )
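# Hypothetical params sketch: the inner condition shape is whatever
# condition_to_mask accepts, and "text_contains" is an invented operation name.
# With keep=False, the condition is wrapped in a "not" node, so rows matching
# the original condition are dropped instead of kept.
_params = {
    "keep": False,
    "condition": {"operation": "text_contains", "column": "name", "value": "x"},
}
_effective_condition = {"operation": "not", "condition": _params["condition"]}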
def _get_univalent_values_with_parent_indices(
    self, examples_table: pa.Table) -> Dict[types.FeatureName, pd.DataFrame]:
  """Extracts univalent values for each feature along with parent indices."""
  result = {}
  for feature_column in examples_table.itercolumns():
    feature_name = feature_column.name
    if (self._features_needed is not None and
        feature_name not in self._features_needed):
      continue
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_name, feature_column.type)
    # Only consider crosses of numeric features.
    # TODO(zhuo): Support numeric features nested under structs.
    if feature_type in (statistics_pb2.FeatureNameStatistics.STRING,
                        statistics_pb2.FeatureNameStatistics.STRUCT):
      continue
    # Assume we have only a single chunk.
    assert feature_column.data.num_chunks == 1
    feat_arr = feature_column.data.chunk(0)
    value_lengths = arrow_util.primitive_array_to_numpy(
        arrow_util.ListLengthsFromListArray(feat_arr))
    univalent_parent_indices = set((value_lengths == 1).nonzero()[0])
    # If there are no univalent values, continue to the next feature.
    if not univalent_parent_indices:
      continue
    non_missing_values = arrow_util.primitive_array_to_numpy(
        feat_arr.flatten())
    value_parent_indices = arrow_util.primitive_array_to_numpy(
        arrow_util.GetFlattenedArrayParentIndices(feat_arr))
    if feature_type == statistics_pb2.FeatureNameStatistics.FLOAT:
      # Remove any NaN values if present.
      non_nan_mask = ~np.isnan(non_missing_values)
      non_missing_values = non_missing_values[non_nan_mask]
      value_parent_indices = value_parent_indices[non_nan_mask]
    df = pd.DataFrame({
        feature_name: non_missing_values,
        'parent_index': value_parent_indices
    })
    # Only keep the univalent feature values.
    df = df[df['parent_index'].isin(univalent_parent_indices)]
    result[feature_name] = df
  return result
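# Standalone sketch of the univalence bookkeeping above, in plain pyarrow/numpy
# (pc.list_value_length and pc.list_parent_indices are assumed stand-ins for
# the TFDV-internal arrow_util helpers; they require a recent pyarrow).
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

_feat = pa.array([[1.0], [2.0, 3.0], [], [4.0]])      # list<double>
_lengths = np.asarray(pc.list_value_length(_feat))    # [1, 2, 0, 1]
_univalent_rows = set((_lengths == 1).nonzero()[0])   # {0, 3}
_values = np.asarray(_feat.flatten())                 # [1.0, 2.0, 3.0, 4.0]
_parents = np.asarray(pc.list_parent_indices(_feat))  # [0, 1, 1, 3]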
def _arrow_table_to_json_records(
    table: pyarrow.Table, begin: int, end: int
) -> List[Dict[str, Any]]:
    """Convert `table` to JSON records.

    Slice from `begin` (inclusive, first is 0) to `end` (exclusive).

    String values become Strings; Number values become int/float; Datetime
    values become ISO8601-encoded Strings.
    """
    # Select the values we want -- columnar, so memory accesses are contiguous
    values = {
        column.name: _arrow_array_to_json_list(column[begin:end])
        for column in table.itercolumns()
    }
    # Transpose into JSON records
    return [{k: v[i] for k, v in values.items()} for i in range(end - begin)]
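# Rough equivalence sketch (an assumption that ignores the datetime-to-ISO8601
# handling done by _arrow_array_to_json_list): slicing plus to_pylist, available
# in recent pyarrow, produces the same record-per-row shape.
import pyarrow

_t = pyarrow.table({"name": ["Alice", "Bob", "Carol"], "age": [30, 25, 41]})
_records = _t.slice(1, 2).to_pylist()   # rows for begin=1, end=3
# -> [{"name": "Bob", "age": 25}, {"name": "Carol", "age": 41}]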
def read_columns(table: pa.Table, full: bool = True) -> List[Column]:
    """Read Column definitions and validate Workbench assumptions.

    Raise ValidateError if:

    * the table has metadata
    * the table has more than one record batch
    * columns have invalid metadata (e.g., a "format" on a "text" column, or a
      timestamp with unit != ns or a timezone)
    * column values disagree with metadata (e.g., date32 "2021-04-12" with
      `ColumnType.Date("month")`)

    Be sure the Arrow file backing the table was validated with
    `validate_arrow_file()` first. Otherwise, you'll get undefined behavior.

    If `full=False`, skip costly checks. Only pass `full=False` when you can
    guarantee the data was generated by a source you trust. (In particular,
    module output is not trusted, so it must use the default `full=True`.)
    """
    if table.schema.metadata is not None:
        raise TableSchemaHasMetadata()
    seen_column_names: Dict[str, int] = {}
    ret = []
    for position, column in enumerate(table.itercolumns()):
        field = table.field(position)
        if column.num_chunks > 1:
            raise TableHasTooManyRecordBatches(column.num_chunks)
        if field.name in seen_column_names:
            raise DuplicateColumnName(
                field.name, seen_column_names[field.name], position
            )
        else:
            seen_column_names[field.name] = position
        ret.append(Column(field.name, _read_column_type(column, field, full=full)))
    return ret
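# Sketch of the first check above (an illustration, not from the original
# module): a table created by pa.table has no schema metadata, and
# replace_schema_metadata returns a copy that does, which read_columns rejects.
import pyarrow as pa

_clean = pa.table({"A": ["x"]})
assert _clean.schema.metadata is None
_tainted = _clean.replace_schema_metadata({b"producer": b"example"})
assert _tainted.schema.metadata is not None   # read_columns would raise TableSchemaHasMetadata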
def enumerate_arrays(
    table: pa.Table,
    weight_column: Optional[Text],
    enumerate_leaves_only: bool
) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
  """Enumerates arrays in a Table.

  It assumes all the columns in `table` have only one chunk.
  It assumes `table` contains only arrays of the following supported types:
    - list<primitive>
    - list<struct<[Ts]>> where Ts are the types of the fields in the struct
      type, and they can only be one of the supported types (recursion
      intended).

  It enumerates each column (i.e. array, because there is only one chunk) in
  the table (also see `enumerate_leaves_only`). If an array is of type
  list<struct<[Ts]>>, then it flattens the outermost list, then enumerates the
  array of each field in the resulting struct<[Ts]> array, and continues
  recursively. The weights get "aligned" automatically in this process,
  therefore the third term in the returned tuple, weights, always has
  array[i]'s weight being weights[i].

  Args:
    table: The Table whose arrays are to be visited. It is assumed that the
      table contains only one chunk.
    weight_column: The name of the weight column, or None. The elements of the
      weight column should be lists of numerics, and each list should contain
      only one value.
    enumerate_leaves_only: If True, only enumerate "leaf" arrays. Otherwise,
      also enumerate the struct arrays where the leaf arrays are contained.

  Yields:
    A tuple. The first term is the path of the feature; the second term is the
    feature array and the third term is the weight array for the feature array
    (i.e. weights[i] is the weight for array[i]).

  Raises:
    ValueError: When the weight column is not a list array whose elements are
      1-element lists.
  """

  def _recursion_helper(
      feature_path: types.FeaturePath, array: pa.Array,
      weights: Optional[np.ndarray]
  ) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
    """Recursion helper."""
    array_type = array.type
    if is_list_like(array_type) and pa.types.is_struct(array_type.value_type):
      if not enumerate_leaves_only:
        yield (feature_path, array, weights)
      flat_struct_array = array.flatten()
      flat_weights = None
      if weights is not None:
        flat_weights = weights[
            array_util.GetFlattenedArrayParentIndices(array).to_numpy()]
      for field in flat_struct_array.type:
        field_name = field.name
        # use "yield from" after PY 3.3.
        for e in _recursion_helper(
            feature_path.child(field_name),
            flat_struct_array.field(field_name), flat_weights):
          yield e
    else:
      yield (feature_path, array, weights)

  weights = None
  if weight_column is not None:
    weights = get_broadcastable_column(table, weight_column)
    weight_type = weights.type
    if (not pa.types.is_floating(weight_type) and
        not pa.types.is_integer(weight_type)):
      raise ValueError(
          'Weight column "{}" must be of numeric type. Found {}.'.format(
              weight_column, weight_type))
    weights = np.asarray(weights)
  for column_name, column in zip(table.schema.names, table.itercolumns()):
    # use "yield from" after PY 3.3.
    for e in _recursion_helper(
        types.FeaturePath([column_name]), column.data.chunk(0), weights):
      yield e
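# Plain-pyarrow sketch of the list<struct> flattening step above
# (pc.list_parent_indices is an assumed stand-in for
# array_util.GetFlattenedArrayParentIndices; it needs a recent pyarrow).
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

_arr = pa.array([[{"a": 1, "b": 2.0}],
                 [{"a": 3, "b": 4.0}, {"a": 5, "b": 6.0}]])  # list<struct<a, b>>
_flat = _arr.flatten()                               # struct array of length 3
_parents = np.asarray(pc.list_parent_indices(_arr))  # [0, 1, 1] -> aligns weights
_a_values = _flat.field("a")                         # child array for field "a"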
def _flatten_and_impute(examples_table: pa.Table,
                        categorical_features: Set[types.FeaturePath]
                       ) -> Dict[types.FeaturePath, np.ndarray]:
  """Flattens and imputes the values in the input Arrow table.

  Replaces missing values with CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
  for categorical features and 10*max(feature_values) for numeric features.
  We impute missing values with an extreme value that is far from observed
  values so it does not incorrectly impact KNN results. 10*max(feature_values)
  is used instead of sys.max_float because max_float is large enough to cause
  unexpected float arithmetic errors.

  Args:
    examples_table: Arrow table containing a batch of examples where all
      features are univalent.
    categorical_features: Set of categorical feature names.

  Returns:
    A Dict[FeaturePath, np.ndarray] where the key is the feature path and the
    value is a 1D numpy array corresponding to the feature values.
  """
  num_rows = examples_table.num_rows
  result = {}
  for column_name, feature_column in zip(examples_table.schema.names,
                                         examples_table.itercolumns()):
    feature_path = types.FeaturePath([column_name])
    # Assume we have only a single chunk.
    feature_array = feature_column.data.chunk(0)
    imputation_fill_value = (
        CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
        if feature_path in categorical_features else sys.maxsize)
    if pa.types.is_null(feature_array.type):
      # If null array, impute all values.
      imputed_values_array = np.full(
          shape=num_rows, fill_value=imputation_fill_value)
      result[feature_path] = imputed_values_array
    else:
      # np.asarray may return a read-only view of the Arrow buffer. Create a
      # copy as we will be imputing the NaN values.
      non_missing_values = np.copy(np.asarray(feature_array.flatten()))
      non_missing_parent_indices = np.asarray(
          array_util.GetFlattenedArrayParentIndices(feature_array))
      is_categorical_feature = feature_path in categorical_features
      result_dtype = non_missing_values.dtype
      if non_missing_parent_indices.size < num_rows and is_categorical_feature:
        result_dtype = np.object
      flattened_array = np.ndarray(shape=num_rows, dtype=result_dtype)
      num_values = np.asarray(
          array_util.ListLengthsFromListArray(feature_array))
      missing_parent_indices = np.where(num_values == 0)[0]
      if feature_path not in categorical_features:
        # Also impute any NaN values.
        nan_mask = np.isnan(non_missing_values)
        if not np.all(nan_mask):
          imputation_fill_value = non_missing_values[~nan_mask].max() * 10
        non_missing_values[nan_mask.nonzero()[0]] = imputation_fill_value
      flattened_array[non_missing_parent_indices] = non_missing_values
      if missing_parent_indices.any():
        flattened_array[missing_parent_indices] = imputation_fill_value
      result[feature_path] = flattened_array
  return result
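# Numeric-imputation sketch mirroring the non-categorical branch above: the
# fill value is 10 * max of the observed (non-NaN) values, applied to a
# writable copy of the values.
import numpy as np

_observed = np.array([1.5, np.nan, 7.0])
_nan_mask = np.isnan(_observed)
_fill = _observed[~_nan_mask].max() * 10   # 70.0
_observed[_nan_mask] = _fill               # [1.5, 70.0, 7.0]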