Code example #1
 def _recursion_helper(
     feature_path: types.FeaturePath, array: pa.Array,
     all_weights: Dict[types.FeatureName, np.ndarray],
 ) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
   """Recursion helper."""
   array_type = array.type
   innermost_nested_type = get_innermost_nested_type(array_type)
   if pa.types.is_struct(innermost_nested_type):
     if not enumerate_leaves_only:
       weights = all_weights.get(example_weight_map.get(feature_path))
     # special handling for a flat struct array -- wrap it in a ListArray
       # whose elements are singleton lists. This way downstream can keep
       # assuming the enumerated arrays are list<*>.
       to_yield = array
       if pa.types.is_struct(array_type) and wrap_flat_struct_in_list:
         to_yield = array_util.ToSingletonListArray(array)
       yield (feature_path, to_yield, weights)
     flat_struct_array, parent_indices = flatten_nested(
         array, bool(all_weights))
     # Potential optimization:
     # Only flatten weights that we know will be used in the recursion.
     flat_all_weights = {
         weight_feature_name: w[parent_indices]
         for weight_feature_name, w in all_weights.items()
     }
     for field in flat_struct_array.type:
       field_name = field.name
       yield from _recursion_helper(
           feature_path.child(field_name), flat_struct_array.field(field_name),
           flat_all_weights)
   else:
     weights = all_weights.get(example_weight_map.get(feature_path))
     yield (feature_path, array, weights)
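
This helper is nested inside enumerate_arrays, so enumerate_leaves_only, example_weight_map and wrap_flat_struct_in_list come from the enclosing scope. The wrapping step it depends on can be approximated in plain pyarrow; the sketch below is a minimal stand-in for array_util.ToSingletonListArray that ignores nulls (the to_singleton_list name is hypothetical, not the tfx-bsl API):

import pyarrow as pa

def to_singleton_list(arr: pa.Array) -> pa.Array:
    # Wrap each element in a one-element list: [a, b] -> [[a], [b]].
    # Unlike the real ToSingletonListArray, this sketch does not
    # reproduce tfx-bsl's null handling.
    offsets = pa.array(range(len(arr) + 1), type=pa.int32())
    return pa.ListArray.from_arrays(offsets, arr)

struct_arr = pa.array([{"sf1": 1}, {"sf1": 2}])  # flat struct array
wrapped = to_singleton_list(struct_arr)          # list<struct<sf1: int64>>
assert pa.types.is_list(wrapped.type)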
Code example #2
  def _recursion_helper(
      query_path: types.FeaturePath, array: pa.Array,
      example_indices: Optional[np.ndarray]
  ) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Recursion helper."""
    array_type = array.type
    if not query_path:
      if pa.types.is_struct(array_type) and wrap_flat_struct_in_list:
        array = array_util.ToSingletonListArray(array)
      return array, example_indices
    if not pa.types.is_struct(get_innermost_nested_type(array_type)):
      raise KeyError('Cannot process query_path "{}" inside an array of type '
                     '{}. Expecting a struct<...> or '
                     '(large_)list...<struct<...>>.'.format(
                         query_path, array_type))
    flat_struct_array, parent_indices = flatten_nested(
        array, example_indices is not None)
    flat_indices = None
    if example_indices is not None:
      flat_indices = example_indices[parent_indices]

    step = query_path.steps()[0]
    try:
      child_array = flat_struct_array.field(step)
    except KeyError:
      raise KeyError('query_path step "{}" not in struct.'.format(step))
    relative_path = types.FeaturePath(query_path.steps()[1:])
    return _recursion_helper(relative_path, child_array, flat_indices)
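
Each recursion step flattens one level of list nesting and then descends into a single struct field. The same navigation can be shown directly with pyarrow (the batch below is made up for illustration):

import pyarrow as pa

batch = pa.RecordBatch.from_arrays(
    [pa.array([[{"sf1": [1, 2]}], [{"sf1": [3]}]])], ["f2"])
col = batch.column(0)      # list<struct<sf1: list<int64>>>
flat = col.flatten()       # struct<sf1: list<int64>>
child = flat.field("sf1")  # list<int64>: the array for query path f2.sf1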
Code example #3
 def _recursion_helper(
     feature_path: types.FeaturePath, array: pa.Array,
     weights: Optional[np.ndarray]
 ) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
     """Recursion helper."""
     array_type = array.type
     innermost_nested_type = get_innermost_nested_type(array_type)
     if pa.types.is_struct(innermost_nested_type):
         if not enumerate_leaves_only:
             # special handling for a flat struct array -- wrap it in a ListArray
             # whose elements are singleton lists. This way downstream can keep
             # assuming the enumerated arrays are list<*>.
             to_yield = array
             if pa.types.is_struct(array_type) and wrap_flat_struct_in_list:
                 to_yield = array_util.ToSingletonListArray(array)
             yield (feature_path, to_yield, weights)
         flat_struct_array, parent_indices = flatten_nested(
             array, weights is not None)
         flat_weights = None if weights is None else weights[parent_indices]
         for field in flat_struct_array.type:
             field_name = field.name
             # use "yield from" after PY 3.3.
             for e in _recursion_helper(feature_path.child(field_name),
                                        flat_struct_array.field(field_name),
                                        flat_weights):
                 yield e
     else:
         yield (feature_path, array, weights)
Code example #4
 def testGetArrayWrapFlatStructArray(self, feature, expected):
   actual_arr, actual_indices = arrow_util.get_array(
       _INPUT_RECORD_BATCH, feature, return_example_indices=True,
       wrap_flat_struct_in_list=True)
   expected_arr, expected_indices = expected
   if pa.types.is_struct(expected_arr.type):
     expected_arr = array_util.ToSingletonListArray(expected_arr)
   self.assertTrue(
       actual_arr.equals(expected_arr),
       "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
           feature, expected_arr, actual_arr))
   np.testing.assert_array_equal(expected_indices, actual_indices)
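
The test relies on module-level fixtures that are not shown here. A much-reduced _INPUT_RECORD_BATCH of the shape the test expects might look like the following (hypothetical values; the real fixture in arrow_util_test.py is considerably larger):

import pyarrow as pa

_INPUT_RECORD_BATCH = pa.RecordBatch.from_arrays(
    [pa.array([[1.0], [2.0]]),                 # f1: list<double>
     pa.array([{"sf1": [1]}, {"sf1": [2]}])],  # f2: struct<sf1: list<int64>>
    ["f1", "f2"])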
Code example #5
    def testEnumerateArrays(self):
        for leaves_only, has_weights, wrap_flat_struct_in_list in (
                itertools.product([True, False], [True, False],
                                  [True, False])):
            actual_results = {}
            for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
                    _INPUT_RECORD_BATCH,
                    _EXAMPLE_WEIGHT_MAP if has_weights else None, leaves_only,
                    wrap_flat_struct_in_list):
                actual_results[feature_path] = (feature_array, weights)

            expected_results = {}
            # leaf fields
            for p in [["f1"], ["w"], ["w_override1"], ["w_override2"],
                      ["f2", "sf1"], ["f2", "sf2", "ssf1"], ["f3", "sf1"],
                      ["f3", "sf2"]]:
                feature_path = types.FeaturePath(p)
                expected_results[feature_path] = (
                    _FEATURES_TO_ARRAYS[feature_path].array,
                    _FEATURES_TO_ARRAYS[feature_path].weights
                    if has_weights else None)
            if not leaves_only:
                for p in [["f2"], ["f2", "sf2"], ["f3"]]:
                    feature_path = types.FeaturePath(p)
                    expected_array = _FEATURES_TO_ARRAYS[feature_path][0]
                    if wrap_flat_struct_in_list and pa.types.is_struct(
                            expected_array.type):
                        expected_array = array_util.ToSingletonListArray(
                            expected_array)
                    expected_results[feature_path] = (
                        expected_array,
                        _FEATURES_TO_ARRAYS[feature_path].weights
                        if has_weights else None)

            self.assertLen(actual_results, len(expected_results))
            for k, v in six.iteritems(expected_results):
                self.assertIn(k, actual_results)
                actual = actual_results[k]
                self.assertTrue(
                    actual[0].equals(v[0]), "leaves_only={}; has_weights={}; "
                    "wrap_flat_struct_in_list={} feature={}; expected: {}; actual: {}"
                    .format(leaves_only, has_weights, wrap_flat_struct_in_list,
                            k, v, actual))
                np.testing.assert_array_equal(actual[1], v[1])
Code example #6
def _generate_partial_statistics_from_df(
    dataframe: pd.DataFrame, stats_options: options.StatsOptions,
    stats_generators: List[stats_generator.CombinerStatsGenerator]
) -> List[Any]:
    """Generate accumulators containing partial stats."""
    feature_whitelist = set()
    if stats_options.feature_whitelist:
        feature_whitelist.update(stats_options.feature_whitelist)
    # Create a copy of the stats options so that we don't modify the input object.
    stats_options_modified = copy.copy(stats_options)
    # Remove feature_whitelist option as it is no longer needed.
    stats_options_modified.feature_whitelist = None
    schema = schema_pb2.Schema()

    arrow_fields = []
    for col_name, col_type in zip(dataframe.columns, dataframe.dtypes):
        kind = col_type.kind
        if (kind not in _NUMPY_KIND_TO_ARROW_TYPE
                or (feature_whitelist and col_name not in feature_whitelist)):
            logging.warning('Ignoring feature %s of type %s', col_name,
                            col_type)
            continue
        if kind == 'b':
            # Track bool type feature as categorical.
            schema.feature.add(name=col_name,
                               type=schema_pb2.INT,
                               bool_domain=schema_pb2.BoolDomain())
        arrow_fields.append(pa.field(col_name,
                                     _NUMPY_KIND_TO_ARROW_TYPE[kind]))
    if schema.feature:
        stats_options_modified.schema = schema
    record_batch_with_primitive_arrays = pa.RecordBatch.from_pandas(
        dataframe, schema=pa.schema(arrow_fields))
    arrays = []
    for column_array in record_batch_with_primitive_arrays.columns:
        arrays.append(array_util.ToSingletonListArray(column_array))
    # TODO(pachristopher): Consider using a list of record batches instead of a
    # single record batch to avoid having list arrays larger than 2^31 elements.
    record_batch_with_list_arrays = pa.RecordBatch.from_arrays(
        arrays, record_batch_with_primitive_arrays.schema.names)
    return stats_impl.generate_partial_statistics_in_memory(
        record_batch_with_list_arrays, stats_options_modified,
        stats_generators)
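
The core transformation here, turning each primitive DataFrame column into an Arrow array of singleton lists, can be sketched with plain pandas and pyarrow (this sketch skips the real code's dtype filtering, schema handling and null handling):

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"x": [1, 2, 3], "y": [0.5, 1.5, 2.5]})
batch = pa.RecordBatch.from_pandas(df, preserve_index=False)
wrapped = pa.RecordBatch.from_arrays(
    [pa.ListArray.from_arrays(
        pa.array(range(len(col) + 1), type=pa.int32()), col)
     for col in batch.columns],
    batch.schema.names)
# wrapped.column(0) now has type list<int64>: [[1], [2], [3]]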
Code example #7
def CanonicalizeRecordBatch(
    record_batch_with_primitive_arrays: pa.RecordBatch,) -> pa.RecordBatch:
  """Converts primitive arrays in a pyarrow.RecordBatch to SingletonListArrays.

  Args:
    record_batch_with_primitive_arrays: A pyarrow.RecordBatch where values are
      stored in primitive arrays or singleton list arrays.

  Returns:
    A pyarrow.RecordBatch in which every primitive column is wrapped in a
    SingletonListArray.
  """
  arrays = []
  for column_array in record_batch_with_primitive_arrays.columns:
    arr_type = column_array.type
    if not (pa.types.is_list(arr_type) or pa.types.is_large_list(arr_type)):
      arrays.append(array_util.ToSingletonListArray(column_array))
    else:
      arrays.append(column_array)
  # TODO(pachristopher): Consider using a list of record batches instead of a
  # single record batch to avoid having list arrays larger than 2^31 elements.
  return pa.RecordBatch.from_arrays(
      arrays, record_batch_with_primitive_arrays.schema.names)
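
A hypothetical call, assuming array_util comes from tfx-bsl (from tfx_bsl.arrow import array_util); only the primitive column gets wrapped:

import pyarrow as pa

batch = pa.RecordBatch.from_arrays(
    [pa.array([1, 2, 3]),               # int64 -- will be wrapped
     pa.array([[1.0], [2.0], [3.0]])],  # list<double> -- left as-is
    ["a", "b"])
canonical = CanonicalizeRecordBatch(batch)
# canonical.column(0) has type list<int64>; column(1) is unchanged.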
Code example #8
File: array_util_test.py  Project: tensorflow/tfx-bsl
 def testToSingletonListArray(self, array, expected_result):
     result = array_util.ToSingletonListArray(array)
     result.validate()
     self.assertTrue(
         result.equals(expected_result),
         "expected: {}; got: {}".format(expected_result, result))