Esempio n. 1
0
def _remove_unsupported_feature_columns(examples_table: pa.Table,
                                        schema: schema_pb2.Schema) -> pa.Table:
  """Removes feature columns that contain unsupported values.

  All feature columns that are multivalent are dropped since they are
  not supported by sk-learn.

  All columns of STRUCT type are also dropped.

  Multivalent features that the schema declares but that are absent from this
  batch are ignored, because asking the table to drop a column it does not
  contain raises a KeyError.

  Args:
    examples_table: Arrow table containing a batch of examples.
    schema: The schema for the data.

  Returns:
    Arrow table without the unsupported columns.
  """
  table_column_names = set(examples_table.schema.names)
  unsupported_columns = set()
  for feature_path in schema_util.get_multivalent_features(schema):
    # Only the first step of the path names a top-level column of the table.
    top_level_name = feature_path.steps()[0]
    if top_level_name in table_column_names:
      unsupported_columns.add(top_level_name)
  for column_name, column in zip(examples_table.schema.names,
                                 examples_table.itercolumns()):
    if (stats_util.get_feature_type_from_arrow_type(
        types.FeaturePath([column_name]),
        column.type) == statistics_pb2.FeatureNameStatistics.STRUCT):
      unsupported_columns.add(column_name)
  # `drop` is documented to take a list of column names.
  return examples_table.drop(list(unsupported_columns))
Esempio n. 2
0
def generate_partial_statistics_in_memory(
    table: pa.Table, options: stats_options.StatsOptions,
    stats_generators: List[stats_generator.CombinerStatsGenerator]
) -> List[Any]:
    """Generates statistics for an in-memory list of examples.

    When `options.feature_whitelist` is set, the table is first narrowed down
    to the whitelisted columns. Whitelisted features that are not present in
    this particular table are skipped instead of raising a KeyError, since a
    batch does not necessarily contain every feature.

    Args:
        table: Arrow table.
        options: Options for generating data statistics.
        stats_generators: A list of combiner statistics generators.

    Returns:
        A list of accumulators containing partial statistics, one per
        generator and in the same order as `stats_generators`.
    """
    if options.feature_whitelist:
        present_columns = set(table.schema.names)
        whitelisted_columns = [
            table.column(feature_name)
            for feature_name in options.feature_whitelist
            if feature_name in present_columns
        ]
        table = pa.Table.from_arrays(whitelisted_columns)

    # Each generator starts from a fresh accumulator and sees the whole table.
    return [
        generator.add_input(generator.create_accumulator(), table)
        for generator in stats_generators
    ]
Esempio n. 3
0
def get_broadcastable_column(input_table: pa.Table,
                             column_name: Text) -> pa.Array:
    """Fetches a column and verifies that every example holds one value.

    Args:
        input_table: Table to read the column from. Only the first chunk of
            the column is examined.
        column_name: Name of the column to fetch. Each list in the column
            must have length 1.

    Returns:
        An arrow array holding the flattened values of the column.

    Raises:
        ValueError: If the column is not present in the input table, or if
            any example does not contain exactly one value.
    """
    try:
        list_array = input_table.column(column_name).data.chunk(0)
    except KeyError:
        raise ValueError(
            'Column "{}" not present in the input table.'.format(column_name))

    # The per-example lengths are only available on the list array, so they
    # have to be inspected before it is flattened.
    values_per_example = array_util.ListLengthsFromListArray(
        list_array).to_numpy()
    if np.all(values_per_example == 1):
        return list_array.flatten()
    raise ValueError(
        'Column "{}" must have exactly one value in each example.'.format(
            column_name))
Esempio n. 4
0
 def add_input(self, accumulator: List[float],
               examples_table: pa.Table) -> List[float]:
     """Folds one batch of examples into the accumulator.

     Args:
         accumulator: Two-slot list updated in place. Slot 0 is increased by
             the number of rows of the batch; slot 1 is increased by the sum
             of the batch's weights when a weight feature is configured.
         examples_table: Arrow table holding the batch of examples.

     Returns:
         The same accumulator object that was passed in.
     """
     accumulator[0] += examples_table.num_rows
     if not self._weight_feature:
         return accumulator

     weight_chunks = examples_table.column(
         self._weight_feature).data.iterchunks()
     for chunk in weight_chunks:
         flat_weights = np.asarray(chunk.flatten())
         accumulator[1] += np.sum(flat_weights)
     return accumulator
def _flatten_and_impute(
    examples_table: pa.Table, categorical_features: Set[types.FeaturePath]
) -> Dict[types.FeaturePath, np.ndarray]:
    """Flattens and imputes the values in the input Arrow table.

    Replaces missing values with CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
    for categorical features and 10*max(feature_values) for numeric features.
    We impute missing values with an extreme value that is far from observed
    values so it does not incorrectly impact KNN results. 10*max(feature_values)
    is used instead of sys.max_float because max_float is large enough to cause
    unexpected float arithmetic errors. When a numeric feature has no non-NaN
    value at all, sys.maxsize is used as the fill value.

    Args:
        examples_table: Arrow table containing a batch of examples where all
            features are univalent.
        categorical_features: Set of categorical feature names.

    Returns:
        A Dict[FeaturePath, np.ndarray] where the key is the feature path and
        the value is a 1D numpy array, with one entry per row of the table,
        corresponding to the feature values.
    """
    num_rows = examples_table.num_rows
    result = {}
    for feature_column in examples_table.itercolumns():
        feature_path = types.FeaturePath([feature_column.name])
        # Assume we have only a single chunk.
        feature_array = feature_column.data.chunk(0)
        # The converted array may be read-only. Create a copy as we will be
        # imputing the NaN values in place.
        non_missing_values = np.copy(
            arrow_util.primitive_array_to_numpy(feature_array.flatten()))
        non_missing_parent_indices = arrow_util.primitive_array_to_numpy(
            arrow_util.GetFlattenedArrayParentIndices(feature_array))
        is_categorical_feature = feature_path in categorical_features

        result_dtype = non_missing_values.dtype
        if non_missing_parent_indices.size < num_rows and is_categorical_feature:
            # Some rows will receive the categorical fill value, which need not
            # fit the dtype of the observed values. `object` is the builtin the
            # removed `np.object` alias used to point to.
            result_dtype = object
        flattened_array = np.ndarray(shape=num_rows, dtype=result_dtype)

        num_values = arrow_util.primitive_array_to_numpy(
            arrow_util.ListLengthsFromListArray(feature_array))
        missing_parent_indices = np.where(num_values == 0)[0]

        if is_categorical_feature:
            imputation_fill_value = CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
        else:
            # Also impute any NaN values.
            nan_mask = np.isnan(non_missing_values)
            imputation_fill_value = sys.maxsize
            if not np.all(nan_mask):
                imputation_fill_value = non_missing_values[~nan_mask].max() * 10
            non_missing_values[nan_mask] = imputation_fill_value

        flattened_array[non_missing_parent_indices] = non_missing_values
        # Test the number of missing rows, not the truthiness of the indices:
        # `.any()` is False when row 0 is the only missing row, which would
        # leave that row uninitialized. The guard is kept because assigning a
        # fill value that does not fit the dtype fails even for an empty index.
        if missing_parent_indices.size:
            flattened_array[missing_parent_indices] = imputation_fill_value
        result[feature_path] = flattened_array
    return result
Esempio n. 6
0
def _filter_features(table: pa.Table,
                     feature_whitelist: List[types.FeatureName]) -> pa.Table:
    """Keeps only the whitelisted features of a table.

    Whitelisted names that are not columns of `table` are ignored. The
    selected columns follow the order of `feature_whitelist`.

    Args:
        table: Input Arrow table.
        feature_whitelist: Names of the features to keep.

    Returns:
        An Arrow table built from the whitelisted columns found in the input
        table.
    """
    available_names = set(table.schema.names)
    selected_columns = [
        table.column(name) for name in feature_whitelist
        if name in available_names
    ]
    return pa.Table.from_arrays(selected_columns)
Esempio n. 7
0
def _get_weight_feature(input_table: pa.Table,
                        weight_feature: Text) -> np.ndarray:
    """Extracts the per-example weights from the input table.

    Only the first chunk of the weight column is read.

    Args:
        input_table: Input table.
        weight_feature: Name of the weight feature.

    Returns:
        A numpy array containing the weights of the examples in the input
        table.

    Raises:
        ValueError: If the weight feature is not present in the input table,
            does not have exactly one value in each example, or holds string
            or binary values.
    """
    try:
        weight_lists = input_table.column(weight_feature).data.chunk(0)
    except KeyError:
        raise ValueError('Weight feature "{}" not present in the input '
                         'table.'.format(weight_feature))

    # The single-value check needs the list structure, so it runs before the
    # array is flattened.
    values_per_example = ListLengthsFromListArray(weight_lists).to_numpy()
    if not np.all(values_per_example == 1):
        raise ValueError(
            'Weight feature "{}" must have exactly one value in each example.'.
            format(weight_feature))

    flat_weights = weight_lists.flatten()
    value_type = flat_weights.type
    # String and binary arrays cannot be converted to a numpy view, so they
    # are rejected up front.
    is_textual = any(
        check(value_type)
        for check in (pa.types.is_string, pa.types.is_binary))
    if is_textual:
        raise ValueError(
            'Weight feature "{}" must be of numeric type. Found {}.'.format(
                weight_feature, value_type))
    return flat_weights.to_numpy()
Esempio n. 8
0
def enumerate_arrays(
    table: pa.Table, weight_column: Optional[Text], enumerate_leaves_only: bool
) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
    """Enumerates arrays in a Table.

    It assumes all the columns in `table` have only one chunk.
    It assumes `table` contains only arrays of the following supported types:
      - list<primitive>
      - list<struct<[Ts]>> where Ts are the types of the fields in the struct
        type, and they can only be one of the supported types
        (recursion intended).

    It enumerates each column (i.e. array, because there is only one chunk) in
    the table (also see `enumerate_leaves_only`). If an array is of type
    list<struct<[Ts]>>, then it flattens the outermost list, then enumerates
    the array of each field in the result struct<[Ts]> array, and continues
    recursively. The weights get "aligned" automatically in this process,
    therefore weights, the third term in the returned tuple, always has
    array[i]'s weight being weights[i].

    Args:
        table: The Table whose arrays to be visited. It is assumed that the
            table contains only one chunk.
        weight_column: The name of the weight column, or None. The elements of
            the weight column should be lists of numerics, and each list
            should contain only one value.
        enumerate_leaves_only: If True, only enumerate "leaf" arrays.
            Otherwise, also enumerate the struct arrays where the leaf arrays
            are contained.

    Yields:
        A tuple. The first term is the path of the feature; the second term is
        the feature array and the third term is the weight array for the
        feature array (i.e. weights[i] is the weight for array[i]). The third
        term is None when `weight_column` is None.

    Raises:
        ValueError: When the weight column does not contain exactly one value
            in each example.
    """
    def _recursion_helper(
        feature_path: types.FeaturePath, array: pa.Array,
        weights: Optional[np.ndarray]
    ) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
        """Yields `array` and/or its nested field arrays with aligned weights."""
        array_type = array.type
        if (pa.types.is_list(array_type)
                and pa.types.is_struct(array_type.value_type)):
            if not enumerate_leaves_only:
                yield (feature_path, array, weights)
            flat_struct_array = array.flatten()
            flat_weights = None
            if weights is not None:
                # Every flattened struct inherits the weight of the list
                # (i.e. parent row) it came from.
                flat_weights = weights[GetFlattenedArrayParentIndices(
                    array).to_numpy()]
            for field in flat_struct_array.type:
                field_name = field.name
                # use "yield from" after PY 3.3.
                for e in _recursion_helper(feature_path.child(field_name),
                                           flat_struct_array.field(field_name),
                                           flat_weights):
                    yield e
        else:
            yield (feature_path, array, weights)

    weights = None
    if weight_column is not None:
        weight_lists = table.column(weight_column).data.chunk(0)
        weights = weight_lists.flatten().to_numpy()
        # Comparing only the total number of values with the number of rows
        # would accept e.g. lists of lengths [2, 0]. Each row owns exactly one
        # value only if the parent indices of the flattened values are
        # 0, 1, ..., num_rows - 1.
        weight_parent_indices = GetFlattenedArrayParentIndices(
            weight_lists).to_numpy()
        if (weights.size != table.num_rows or not np.array_equal(
                weight_parent_indices, np.arange(table.num_rows))):
            raise ValueError(
                'The weight feature must have exactly one value in each example'
            )
    for column in table.columns:
        column_name = column.name
        # use "yield from" after PY 3.3.
        for e in _recursion_helper(types.FeaturePath([column_name]),
                                   column.data.chunk(0), weights):
            yield e
Esempio n. 9
0
def get_array(
        table: pa.Table, query_path: types.FeaturePath,
        return_example_indices: bool) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Looks up a (possibly nested) array in a table by its feature path.

    It assumes all the columns in `table` have only one chunk.
    It assumes `table` contains only arrays of the following supported types:
      - list<primitive>
      - list<struct<[Ts]>> where Ts are the types of the fields in the struct
        type, and they can only be one of the supported types
        (recursion intended).

    The first step of the path selects a column; every further step flattens
    the current list<struct<...>> array and selects one field of the struct.
    If the path refers to a leaf, a ListArray with a primitive element type is
    returned, otherwise a ListArray with a StructArray element type.

    Args:
        table: The Table to look the path up in. It is assumed that the table
            contains only one chunk.
        query_path: The FeaturePath to look up in the table.
        return_example_indices: Whether to also return an array containing the
            example indices of the elements of the returned array.

    Returns:
        A tuple. The first term is the feature array and the second term is
        the example_indices array for the feature array (i.e. array[i] came
        from the example at row example_indices[i] in the table), or None if
        `return_example_indices` is False.

    Raises:
        KeyError: When the query_path is empty, or cannot be found in the table
            and its nested struct arrays.
    """
    if not query_path:
        raise KeyError('query_path must be non-empty.')

    top_level_name = query_path.steps()[0]
    try:
        current_array = table.column(top_level_name).data.chunk(0)
    except KeyError:
        raise KeyError(
            'query_path step 0 "{}" not in table.'.format(top_level_name))
    remaining_path = types.FeaturePath(query_path.steps()[1:])

    current_indices = None
    if return_example_indices:
        current_indices = np.arange(table.num_rows)

    # Descend one struct level per remaining step of the path.
    while remaining_path:
        current_type = current_array.type
        if not (pa.types.is_list(current_type)
                and pa.types.is_struct(current_type.value_type)):
            raise KeyError(
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a list<struct<...>>.'.format(
                    remaining_path, current_type))
        struct_values = current_array.flatten()
        if current_indices is not None:
            # Carry the originating example index over to every flattened
            # struct.
            parent_indices = array_util.GetFlattenedArrayParentIndices(
                current_array).to_numpy()
            current_indices = current_indices[parent_indices]

        step = remaining_path.steps()[0]
        try:
            current_array = struct_values.field(step)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(step))
        remaining_path = types.FeaturePath(remaining_path.steps()[1:])

    return current_array, current_indices
Esempio n. 10
0
def get_array(
    table: pa.Table,
    query_path: types.FeaturePath,
    broadcast_column_name: Optional[Text] = None
) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Looks up a nested array (and optionally a broadcast column) in a table.

    It assumes all the columns in `table` have only one chunk.
    It assumes `table` contains only arrays of the following supported types:
      - list<primitive>
      - list<struct<[Ts]>> where Ts are the types of the fields in the struct
        type, and they can only be one of the supported types
        (recursion intended).

    If the provided path refers to a leaf in the table, then a ListArray with
    a primitive element type will be returned. If the provided path does not
    refer to a leaf, a ListArray with a StructArray element type will be
    returned.

    Args:
        table: The Table to look the path up in. It is assumed that the table
            contains only one chunk.
        query_path: The FeaturePath to look up in the table.
        broadcast_column_name: The name of a column to broadcast, or None.
            Each list should contain exactly one value.

    Returns:
        A tuple. The first term is the feature array and the second term is
        the broadcast column array for the feature array (i.e.
        broadcast_column[i] is the corresponding value for array[i]), or None
        if `broadcast_column_name` is None.

    Raises:
        ValueError: Presumably propagated from `get_broadcastable_column` when
            the broadcast column is missing or invalid (that helper is not
            part of this block).
        KeyError: When the query_path is empty, or cannot be found in the table
            and its nested struct arrays.
    """
    def _descend(
        remaining_path: types.FeaturePath, current_array: pa.Array,
        aligned_values: Optional[np.ndarray]
    ) -> Tuple[pa.Array, Optional[np.ndarray]]:
        """Follows `remaining_path` through nested list<struct<...>> arrays."""
        if not remaining_path:
            return current_array, aligned_values

        current_type = current_array.type
        if not (pa.types.is_list(current_type)
                and pa.types.is_struct(current_type.value_type)):
            raise KeyError(
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a list<struct<...>>.'.format(
                    remaining_path, current_type))

        struct_values = current_array.flatten()
        child_values = None
        if aligned_values is not None:
            # Repeat each broadcast value once per flattened struct of its
            # parent row.
            parent_indices = array_util.GetFlattenedArrayParentIndices(
                current_array).to_numpy()
            child_values = aligned_values[parent_indices]

        step = remaining_path.steps()[0]
        try:
            child_array = struct_values.field(step)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(step))
        return _descend(types.FeaturePath(remaining_path.steps()[1:]),
                        child_array, child_values)

    if not query_path:
        raise KeyError('query_path must be non-empty.')

    top_level_name = query_path.steps()[0]
    try:
        top_level_array = table.column(top_level_name).data.chunk(0)
    except KeyError:
        raise KeyError(
            'query_path step 0 "{}" not in table.'.format(top_level_name))
    nested_path = types.FeaturePath(query_path.steps()[1:])

    if broadcast_column_name is None:
        broadcast_values = None
    else:
        broadcast_values = np.asarray(
            get_broadcastable_column(table, broadcast_column_name))
    return _descend(nested_path, top_level_array, broadcast_values)