def _remove_unsupported_feature_columns(examples_table: pa.Table,
                                        schema: schema_pb2.Schema) -> pa.Table:
  """Removes feature columns that contain unsupported values.

  All feature columns that are multivalent are dropped since they are
  not supported by scikit-learn.

  All columns of STRUCT type are also dropped.

  Args:
    examples_table: Arrow table containing a batch of examples.
    schema: The schema for the data.

  Returns:
    Arrow table.
  """
  multivalent_features = schema_util.get_multivalent_features(schema)
  unsupported_columns = set()
  for f in multivalent_features:
    unsupported_columns.add(f.steps()[0])
  for column_name, column in zip(examples_table.schema.names,
                                 examples_table.itercolumns()):
    if (stats_util.get_feature_type_from_arrow_type(
        types.FeaturePath([column_name]),
        column.type) == statistics_pb2.FeatureNameStatistics.STRUCT):
      unsupported_columns.add(column_name)
  return examples_table.drop(unsupported_columns)
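
A minimal usage sketch of the final column-dropping step, using only the public pyarrow API (the column names here are illustrative, not from the original):

import pyarrow as pa

table = pa.table({"univalent": [[1], [2]], "multivalent": [[1, 2], [3]]})
# If "multivalent" were flagged as unsupported, the function reduces to:
print(table.drop(["multivalent"]).column_names)  # ['univalent']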
Example 2
def find_nonnull_table_mask(table: pa.Table) -> pa.Array:
    # Start with an all-true boolean mask, one slot per row.
    mask = pa.array(np.ones(table.num_rows, dtype=bool), pa.bool_())

    # Assumes single-chunk columns: AND each column's validity bitmap into
    # the running mask.
    for column in table.itercolumns():
        mask = pa.compute.and_(mask, column.chunks[0].is_valid())

    return mask
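
A usage sketch for the helper above (assumes single-chunk columns, which pa.table produces for small inputs like this):

import pyarrow as pa

table = pa.table({"a": [1, None, 3], "b": ["x", "y", None]})
mask = find_nonnull_table_mask(table)  # [true, false, false]
print(table.filter(mask).num_rows)     # 1 -- only fully non-null rows remain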
Example 3
def _filter_table(arrow_table: pa.Table, params: Dict[str, Any]) -> pa.Table:
    if not params["condition"]:
        return arrow_table

    if params["keep"]:
        condition = params["condition"]
    else:
        condition = {"operation": "not", "condition": params["condition"]}

    mask = condition_to_mask(arrow_table, condition)  # or raise ConditionError

    return pa.table({
        name: _filter_column(column, mask)
        for name, column in zip(arrow_table.column_names,
                                arrow_table.itercolumns())
    })
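
The helpers condition_to_mask and _filter_column are not shown; the per-column step presumably reduces to pyarrow's boolean-mask filter kernel, sketched here:

import pyarrow as pa

column = pa.chunked_array([[1, 2, 3, 4]])
mask = pa.array([True, False, True, False])
print(column.filter(mask))  # [[1, 3]]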
Example 4

    def _get_univalent_values_with_parent_indices(
            self,
            examples_table: pa.Table) -> Dict[types.FeatureName, pd.DataFrame]:
        """Extracts univalent values for each feature along with parent indices."""
        result = {}
        for feature_column in examples_table.itercolumns():
            feature_name = feature_column.name
            if (self._features_needed is not None
                    and feature_name not in self._features_needed):
                continue
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_name, feature_column.type)
            # Only consider crosses of numeric features.
            # TODO(zhuo): Support numeric features nested under structs.
            if feature_type in (statistics_pb2.FeatureNameStatistics.STRING,
                                statistics_pb2.FeatureNameStatistics.STRUCT):
                continue
            # Assume we have only a single chunk.
            assert feature_column.data.num_chunks == 1
            feat_arr = feature_column.data.chunk(0)
            value_lengths = arrow_util.primitive_array_to_numpy(
                arrow_util.ListLengthsFromListArray(feat_arr))
            univalent_parent_indices = set((value_lengths == 1).nonzero()[0])
            # If there are no univalent values, continue to the next feature.
            if not univalent_parent_indices:
                continue
            non_missing_values = arrow_util.primitive_array_to_numpy(
                feat_arr.flatten())
            value_parent_indices = arrow_util.primitive_array_to_numpy(
                arrow_util.GetFlattenedArrayParentIndices(feat_arr))
            if feature_type == statistics_pb2.FeatureNameStatistics.FLOAT:
                # Remove any NaN values if present.
                non_nan_mask = ~np.isnan(non_missing_values)
                non_missing_values = non_missing_values[non_nan_mask]
                value_parent_indices = value_parent_indices[non_nan_mask]
            df = pd.DataFrame({
                feature_name: non_missing_values,
                'parent_index': value_parent_indices
            })
            # Only keep the univalent feature values.
            df = df[df['parent_index'].isin(univalent_parent_indices)]

            result[feature_name] = df

        return result
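
The internal helpers used above (ListLengthsFromListArray, GetFlattenedArrayParentIndices, primitive_array_to_numpy) can be approximated with public pyarrow APIs; a sketch of the same univalent-row extraction under that assumption:

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([[1.0], [2.0, 3.0], None, [4.0]])          # list<double> feature
lengths = pc.fill_null(arr.value_lengths(), 0).to_numpy()  # [1, 2, 0, 1]
univalent_rows = set((lengths == 1).nonzero()[0])          # {0, 3}
values = arr.flatten().to_numpy()                          # [1., 2., 3., 4.]
parents = np.repeat(np.arange(len(lengths)), lengths)      # [0, 1, 1, 3]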
Example 5
def _arrow_table_to_json_records(
    table: pyarrow.Table, begin: int, end: int
) -> List[Dict[str, Any]]:
    """
    Convert `table` to JSON records.

    Slice from `begin` (inclusive, first is 0) to `end` (exclusive).

    String values become strings; number values become int/float; datetime
    values become ISO 8601-encoded strings.
    """
    # Select the values we want -- columnar, so memory accesses are contiguous
    values = {
        column.name: _arrow_array_to_json_list(column[begin:end])
        for column in table.itercolumns()
    }
    # Transpose into JSON records
    return [{k: v[i] for k, v in values.items()} for i in range(end - begin)]
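
A sketch of the slice-then-transpose pattern, assuming _arrow_array_to_json_list amounts to converting the slice to Python values (e.g. via to_pylist()):

import pyarrow

table = pyarrow.table({"x": [1, 2, 3], "y": ["a", "b", "c"]})
values = {name: table.column(name)[0:2].to_pylist() for name in table.column_names}
records = [{k: v[i] for k, v in values.items()} for i in range(2)]
# [{'x': 1, 'y': 'a'}, {'x': 2, 'y': 'b'}]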
Example 6
def read_columns(table: pa.Table, full: bool = True) -> List[Column]:
    """Read Column definitions and validate Workbench assumptions.

    Raise ValidateError if:

    * table has metadata
    * table has more than one record batch
    * columns have invalid metadata (e.g., a "format" on a "text" column, or
      a timestamp with unit!=ns or a timezone)
    * column values disagree with metadata (e.g., date32 "2021-04-12" with
      `ColumnType.Date("month")`)

    Be sure the Arrow file backing the table was validated with
    `validate_arrow_file()` first. Otherwise, you'll get undefined behavior.

    If `full=False`, skip costly checks. Only pass `full=False` when you can
    guarantee the data has been generated by a source you trust. (In particular,
    module output is not trusted and it must use the default `full=True`.)
    """
    if table.schema.metadata is not None:
        raise TableSchemaHasMetadata()

    seen_column_names: Dict[str, int] = {}
    ret = []

    for position, column in enumerate(table.itercolumns()):
        field = table.field(position)
        if column.num_chunks > 1:
            raise TableHasTooManyRecordBatches(column.num_chunks)

        if field.name in seen_column_names:
            raise DuplicateColumnName(
                field.name, seen_column_names[field.name], position
            )
        else:
            seen_column_names[field.name] = position

        ret.append(Column(field.name, _read_column_type(column, field, full=full)))

    return ret
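
Two of the cheap checks above, probed directly (a sketch; the error classes come from the surrounding module and are not shown):

import pyarrow as pa

table = pa.table({"a": [1, 2]})
assert table.schema.metadata is None    # would otherwise raise TableSchemaHasMetadata
assert table.column(0).num_chunks == 1  # one chunk == one record batch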
Example 7
def enumerate_arrays(
    table: pa.Table, weight_column: Optional[Text], enumerate_leaves_only: bool
) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
    """Enumerates arrays in a Table.

  It assumes all the columns in `table` have only one chunk.
  It assumes `table` contains only arrays of the following supported types:
    - list<primitive>
    - list<struct<[Ts]>> where Ts are the types of the fields in the struct
      type, and they can only be one of the supported types
      (recursion intended).

  It enumerates each column (i.e. array, because there is only one chunk) in
  the table (also see `enumerate_leaves_only`) If an array is of type
  list<struct<[Ts]>>, then it flattens the outermost list, then enumerates the
  array of each field in the result struct<[Ts]> array, and continues
  recursively. The weights get "aligned" automatically in this process,
  therefore weights, the third term in the returned tuple always has array[i]'s
  weight being weights[i].

  Args:
    table: The Table whose arrays to be visited. It is assumed that the table
      contains only one chunk.
    weight_column: The name of the weight column, or None. The elements of
      the weight column should be lists of numerics, and each list should
      contain only one value.
    enumerate_leaves_only: If True, only enumerate "leaf" arrays.
      Otherwise, also enumerate the struct arrays where the leaf arrays are
      contained.

  Yields:
    A tuple. The first term is the path of the feature; the second term is
    the feature array and the third term is the weight array for the feature
    array (i.e. weights[i] is the weight for array[i]).

  Raises:
    ValueError: When the weight column is not a list array whose elements are
      1-element lists.
  """
    def _recursion_helper(
        feature_path: types.FeaturePath, array: pa.Array,
        weights: Optional[np.ndarray]
    ) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
        """Recursion helper."""
        array_type = array.type
        if is_list_like(array_type) and pa.types.is_struct(
                array_type.value_type):
            if not enumerate_leaves_only:
                yield (feature_path, array, weights)
            flat_struct_array = array.flatten()
            flat_weights = None
            if weights is not None:
                flat_weights = weights[
                    array_util.GetFlattenedArrayParentIndices(
                        array).to_numpy()]
            for field in flat_struct_array.type:
                field_name = field.name
                # use "yield from" after PY 3.3.
                for e in _recursion_helper(feature_path.child(field_name),
                                           flat_struct_array.field(field_name),
                                           flat_weights):
                    yield e
        else:
            yield (feature_path, array, weights)

    weights = None
    if weight_column is not None:
        weights = get_broadcastable_column(table, weight_column)
        weight_type = weights.type
        if (not pa.types.is_floating(weight_type)
                and not pa.types.is_integer(weight_type)):
            raise ValueError(
                'Weight column "{}" must be of numeric type. Found {}.'.format(
                    weight_column, weight_type))
        weights = np.asarray(weights)
    for column_name, column in zip(table.schema.names, table.itercolumns()):
        # use "yield from" after PY 3.3.
        for e in _recursion_helper(types.FeaturePath([column_name]),
                                   column.data.chunk(0), weights):
            yield e
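
A small illustration of the recursive flattening the docstring describes, using only public pyarrow APIs (the FeaturePath and weight bookkeeping are TFDV-specific and omitted):

import pyarrow as pa

arr = pa.array([[{"f": [1]}, {"f": [2, 3]}], [{"f": [4]}]])  # list<struct<f: list<int64>>>
flat = arr.flatten()    # struct array of length 3 (outermost list removed)
leaf = flat.field("f")  # leaf list<int64> array: [[1], [2, 3], [4]]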
Example 8

def _flatten_and_impute(examples_table: pa.Table,
                        categorical_features: Set[types.FeaturePath]
                       ) -> Dict[types.FeaturePath, np.ndarray]:
  """Flattens and imputes the values in the input Arrow table.

  Replaces missing values with CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
  for categorical features and 10*max(feature_values) for numeric features.
  We impute missing values with an extreme value that is far from observed
  values so it does not incorrectly impact KNN results. 10*max(feature_values)
  is used instead of sys.float_info.max because the latter is large enough to
  cause unexpected float arithmetic errors.

  Args:
    examples_table: Arrow table containing a batch of examples where all
      features are univalent.
    categorical_features: Set of categorical feature names.

  Returns:
    A Dict[FeaturePath, np.ndarray] where the key is the feature path and the
    value is a 1D numpy array corresponding to the feature values.
  """
  num_rows = examples_table.num_rows
  result = {}
  for column_name, feature_column in zip(examples_table.schema.names,
                                         examples_table.itercolumns()):
    feature_path = types.FeaturePath([column_name])
    # Assume we have only a single chunk.
    feature_array = feature_column.data.chunk(0)
    imputation_fill_value = (
        CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
        if feature_path in categorical_features else sys.maxsize)
    if pa.types.is_null(feature_array.type):
      # If null array, impute all values.
      imputed_values_array = np.full(
          shape=num_rows,
          fill_value=imputation_fill_value)
      result[feature_path] = imputed_values_array
    else:
      # The NumPy view of the Arrow data may be read-only. Create a copy as we
      # will be imputing the NaN values.
      non_missing_values = np.copy(
          np.asarray(feature_array.flatten()))
      non_missing_parent_indices = np.asarray(
          array_util.GetFlattenedArrayParentIndices(feature_array))
      is_categorical_feature = feature_path in categorical_features
      result_dtype = non_missing_values.dtype
      if non_missing_parent_indices.size < num_rows and is_categorical_feature:
        result_dtype = object  # np.object was removed in NumPy 1.24
      flattened_array = np.ndarray(shape=num_rows, dtype=result_dtype)
      num_values = np.asarray(
          array_util.ListLengthsFromListArray(feature_array))
      missing_parent_indices = np.where(num_values == 0)[0]
      if feature_path not in categorical_features:
        # Also impute any NaN values.
        nan_mask = np.isnan(non_missing_values)
        if not np.all(nan_mask):
          imputation_fill_value = non_missing_values[~nan_mask].max() * 10
        non_missing_values[nan_mask.nonzero()[0]] = imputation_fill_value
      flattened_array[non_missing_parent_indices] = non_missing_values
      if missing_parent_indices.any():
        flattened_array[missing_parent_indices] = imputation_fill_value
      result[feature_path] = flattened_array
  return result
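
The numeric imputation rule from the docstring, in isolation (a sketch):

import numpy as np

values = np.array([1.0, np.nan, 3.0])
fill = values[~np.isnan(values)].max() * 10  # 30.0: far from all observed values
values[np.isnan(values)] = fill              # [1., 30., 3.]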