def testIsBinaryLike(self):
    for t in (pa.binary(), pa.large_binary(), pa.string(),
              pa.large_string()):
        self.assertTrue(arrow_util.is_binary_like(t))

    for t in (pa.list_(pa.binary()), pa.large_list(pa.string())):
        self.assertFalse(arrow_util.is_binary_like(t))
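
# The predicate exercised above returns True exactly for the four flat
# binary-like Arrow types. Below is a minimal sketch of such a check; the name
# is_binary_like_sketch is illustrative, not the actual arrow_util source.
import pyarrow as pa


def is_binary_like_sketch(data_type: pa.DataType) -> bool:
    """Returns True for (large_)binary and (large_)string types."""
    return (pa.types.is_binary(data_type) or
            pa.types.is_large_binary(data_type) or
            pa.types.is_string(data_type) or
            pa.types.is_large_string(data_type))
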
def DecodedExamplesToRecordBatch(
        decoded_examples: List[types.Example]) -> pa.RecordBatch:
    """Converts a list of types.Example to an Arrow RecordBatch.

  where types.Example is Dict[Union[bytes, unicode], Union[None, np.ndarray]]
  The result record batch has M rows and N columns where M is the number of
  examples in the list and N is the number of unique features in the examples.
  Each column is either a ListArray<primitive|string|binary> or a NullArray.
  None and missing feature handling:
    - if a feature's value is None in an example, then its corresponding column
      in the result batch will have a null at the corresponding position.
    - if a feature's value is always None across all the examples in the input
      list, then its corresponding column in the result batch will be a
      NullArray.
    - if an example does not contain a feature (in the universe of features),
      then the column of that feature will have a null at the corresponding
      position.

  Args:
    decoded_examples: a Dict[Union[bytes, unicode], Union[None, np.ndarray]]

  Returns:
    a pa.RecordBatch.

  Raises:
    ValueError: when the conversion fails.
    TypeError: when some of the output columns are not of supported types.
  """
    if not decoded_examples:
        return pa.RecordBatch.from_arrays([], [])

    struct_array = pa.array(decoded_examples)
    if not pa.types.is_struct(struct_array.type):
        raise ValueError("Unexpected Arrow type created from input")
    field_names = [f.name for f in list(struct_array.type)]
    if not field_names:
        return _GetEmptyRecordBatch(len(decoded_examples))
    value_arrays = struct_array.flatten()
    for name, array in six.moves.zip(field_names, value_arrays):
        if pa.types.is_null(array.type):
            continue
        if not arrow_util.is_list_like(array.type):
            raise TypeError(
                "Expected list arrays for field {} but got {}".format(
                    name, array.type))
        value_type = array.type.value_type
        if (not pa.types.is_integer(value_type)
                and not pa.types.is_floating(value_type)
                and not arrow_util.is_binary_like(value_type)
                and not pa.types.is_null(value_type)):
            raise TypeError("Type not supported: {} {}".format(
                name, array.type))

    return pa.RecordBatch.from_arrays(value_arrays, field_names)
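
# Hedged usage sketch for DecodedExamplesToRecordBatch: feature 'y' is missing
# from the first example and 'z' is None everywhere, so 'y' gets a null slot
# and 'z' becomes a NullArray. Feature names and values are made up, and the
# column order depends on Arrow's struct inference.
import numpy as np

_examples = [
    {'x': np.array([1, 2], dtype=np.int64), 'z': None},
    {'x': np.array([3], dtype=np.int64),
     'y': np.array([b'a'], dtype=object), 'z': None},
]
# DecodedExamplesToRecordBatch(_examples) should yield a 2-row RecordBatch:
#   'x': ListArray<int64>  [[1, 2], [3]]
#   'y': ListArray<binary> [null, [b'a']]
#   'z': NullArray         [null, null]
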
def update(self, feature_array: pa.Array) -> None:
    """Update the partial string statistics using the input value."""
    if pa.types.is_null(feature_array.type):
        return
    # Iterate through the value array and update the partial stats.
    flattened_values_array, _ = arrow_util.flatten_nested(feature_array)
    if arrow_util.is_binary_like(flattened_values_array.type):
        # GetBinaryArrayTotalByteSize returns a Python long (to be compatible
        # with Python 3). To make sure we do cheaper integer arithmetic in
        # Python 2, we first convert it to int.
        self.total_bytes_length += int(array_util.GetBinaryArrayTotalByteSize(
            flattened_values_array))
    elif flattened_values_array:
        # We can only convert flattened_values_array to a numpy array when it
        # is not empty. This could be computed faster by taking log10 of the
        # integer.
        def _len_after_conv(s):
            return len(str(s))

        self.total_bytes_length += np.sum(
            np.vectorize(_len_after_conv,
                         otypes=[np.int32])(np.asarray(flattened_values_array)))
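
# Illustrative check of the two branches in update() above: for binary-like
# arrays the byte count is the sum of the value lengths (which is what a
# helper named GetBinaryArrayTotalByteSize is expected to compute), and for
# numeric arrays it is the length of each value's decimal string. The values
# below are made up.
import numpy as np
import pyarrow as pa

_binary = pa.array([b'ab', b'cde'])  # binary-like branch: 2 + 3 = 5 bytes
assert sum(len(v.as_py()) for v in _binary) == 5
_ints = pa.array([7, 1234])  # numeric branch: len('7') + len('1234') = 5
assert np.sum(np.vectorize(lambda s: len(str(s)),
                           otypes=[np.int32])(np.asarray(_ints))) == 5
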
def get_feature_type_from_arrow_type(
        feature_path: types.FeaturePath,
        arrow_type: pa.DataType) -> Optional[types.FeatureNameStatisticsType]:
    """Get feature type from Arrow type.

  Args:
    feature_path: path of the feature.
    arrow_type: Arrow DataType.

  Returns:
    A statistics_pb2.FeatureNameStatistics.Type value or None if arrow_type
    is null (which means it cannot be determined for now).

  Raises:
    TypeError: if the type is not supported.
  """
    if pa.types.is_null(arrow_type):
        return None
    if not arrow_util.is_list_like(arrow_type):
        raise TypeError(
            'Expected feature column to be a '
            '(Large)List<primitive|struct> or null, but feature {} '
            'was {}.'.format(feature_path, arrow_type))

    value_type = arrow_util.get_innermost_nested_type(arrow_type)
    if pa.types.is_integer(value_type):
        return statistics_pb2.FeatureNameStatistics.INT
    elif pa.types.is_floating(value_type):
        return statistics_pb2.FeatureNameStatistics.FLOAT
    elif arrow_util.is_binary_like(value_type):
        return statistics_pb2.FeatureNameStatistics.STRING
    elif pa.types.is_struct(value_type):
        return statistics_pb2.FeatureNameStatistics.STRUCT
    elif pa.types.is_null(value_type):
        return None

    raise TypeError('Feature {} has unsupported arrow type: {}'.format(
        feature_path, arrow_type))
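
# Hedged examples of the mapping implemented above; types.FeaturePath(['f'])
# is assumed to be a valid constructor call, following tfdv conventions.
_path = types.FeaturePath(['f'])
assert (get_feature_type_from_arrow_type(_path, pa.list_(pa.int64()))
        == statistics_pb2.FeatureNameStatistics.INT)
assert (get_feature_type_from_arrow_type(_path, pa.large_list(pa.large_string()))
        == statistics_pb2.FeatureNameStatistics.STRING)
assert get_feature_type_from_arrow_type(_path, pa.null()) is None
# A non-list-like type such as pa.int64() raises TypeError.
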
def _get_example_value_presence(
        record_batch: pa.RecordBatch, path: types.FeaturePath,
        boundaries: Optional[Sequence[float]],
        weight_column_name: Optional[Text]) -> Optional[pd.DataFrame]:
    """Returns information about which examples contained which values.

  This function treats all values for a given path within a single example
  as a set and and returns a mapping between each example index and the distinct
  values which are present in that example.

  The result of calling this function for path 'p' on an arrow record batch with
  the two records [{'p': ['a', 'a', 'b']}, {'p': [a]}] will be
  pd.Series(['a', 'b', 'a'], index=[0, 0, 1]).

  If the array retrieved from get_array is null, this function returns None.

  Args:
    record_batch: The RecordBatch in which to look up the path.
    path: The FeaturePath for which to fetch values.
    boundaries: Optionally, a set of bin boundaries to use for binning the array
      values.
    weight_column_name: Optionally, a weight column to return in addition to the
      value and example index.

  Returns:
    A Pandas DataFrame containing distinct pairs of array values and example
    indices, along with the corresponding flattened example weights. The index
    will be the example indices and the values will be stored in a column named
    'values'. If weight_column_name is provided, a second column will be
    returned containing the array values, and 'weights' containing the weights
    for the example from which each value came.
  """
    arr, example_indices = arrow_util.get_array(record_batch,
                                                path,
                                                return_example_indices=True)
    if stats_util.get_feature_type_from_arrow_type(path, arr.type) is None:
        return None

    arr_flat, parent_indices = arrow_util.flatten_nested(
        arr, return_parent_indices=True)
    is_binary_like = arrow_util.is_binary_like(arr_flat.type)
    assert boundaries is None or not is_binary_like, (
        'Boundaries can only be applied to numeric columns')
    if is_binary_like:
        # Use dictionary_encode so we can use np.unique on object arrays.
        dict_array = arr_flat.dictionary_encode()
        arr_flat = dict_array.indices
        arr_flat_dict = np.asarray(dict_array.dictionary)
    example_indices_flat = example_indices[parent_indices]
    if boundaries is not None:
        element_indices, bins = bin_util.bin_array(arr_flat, boundaries)
        rows = np.vstack([example_indices_flat[element_indices], bins])
    else:
        rows = np.vstack([example_indices_flat, np.asarray(arr_flat)])
    if not rows.size:
        return None
    # Deduplicate values which show up more than once in the same example. This
    # makes P(X=x|Y=y) in the standard lift definition behave as
    # P(x \in Xs | y \in Ys) if examples contain more than one value of X and Y.
    unique_rows = np.unique(rows, axis=1)
    example_indices = unique_rows[0, :]
    values = unique_rows[1, :]
    if is_binary_like:
        # Return binary-like values as a pd.Categorical wrapped in a Series.
        # This makes subsequent operations like pd.merge cheaper.
        values = pd.Categorical.from_codes(values, categories=arr_flat_dict)
    columns = {'example_indices': example_indices, 'values': values}
    if weight_column_name:
        weights = arrow_util.get_weight_feature(record_batch,
                                                weight_column_name)
        columns['weights'] = np.asarray(weights)[example_indices]
    df = pd.DataFrame(columns)
    return df.set_index('example_indices')
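
# Self-contained illustration of the dedup step in _get_example_value_presence:
# values repeated within one example collapse to a single (example, value)
# pair, which is what makes the lift computation set-based. The inputs below
# mirror the docstring example [{'p': ['a', 'a', 'b']}, {'p': ['a']}] after
# dictionary encoding (a=0, b=1).
import numpy as np

_example_indices = np.array([0, 0, 0, 1])
_codes = np.array([0, 0, 1, 0])
_unique = np.unique(np.vstack([_example_indices, _codes]), axis=1)
# _unique is [[0, 0, 1], [0, 1, 0]]: the pairs (0, 'a'), (0, 'b'), (1, 'a').
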