Ejemplo n.º 1
0
def generate_partial_statistics_in_memory(examples, options, stats_generators):
    """Computes partial statistics over a list of examples held in memory.

    Args:
      examples: A list of input examples.
      options: Options for generating data statistics. Only its
        `feature_whitelist` attribute is read here.
      stats_generators: A list of combiner statistics generators.

    Returns:
      A list with one accumulator per generator, in the same order as
      `stats_generators`.
    """
    # DecodedExamplesToTable cannot handle empty input, so return freshly
    # created accumulators without building a table.
    if not examples:
        return [gen.create_accumulator() for gen in stats_generators]

    table = decoded_examples_to_arrow.DecodedExamplesToTable(examples)
    whitelist = options.feature_whitelist
    if whitelist:
        # Rebuild the table from the whitelisted columns only, in whitelist
        # order.
        table = pa.Table.from_arrays(
            [table.column(name) for name in whitelist])

    return [
        gen.add_input(gen.create_accumulator(), table)
        for gen in stats_generators
    ]
Ejemplo n.º 2
0
def BatchExamplesToArrowTables(
    examples: beam.pvalue.PCollection,
    desired_batch_size: Optional[int] = (
        constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE)
) -> beam.pvalue.PCollection:
    """Groups example dicts into batches and decodes each batch to Arrow.

    Args:
      examples: A PCollection of example dicts.
      desired_batch_size: Requested batch size; it is forwarded to
        `GetBeamBatchKwargs` to configure `beam.BatchElements`.

    Returns:
      A PCollection of Arrow tables, one per batch.
    """
    batch_kwargs = GetBeamBatchKwargs(desired_batch_size)
    batched_examples = (
        examples
        | "BatchBeamExamples" >> beam.BatchElements(**batch_kwargs))

    # DecodedExamplesToTable must be wrapped in a lambda rather than passed to
    # beam.Map by name, for the reasons discussed in b/143648957.
    # TODO(b/131315065): Remove the comment above when the CSV decoder no
    # longer uses BatchExamplesToArrowTables.
    return (
        batched_examples
        | "DecodeExamplesToTable" >>
        # pylint: disable=unnecessary-lambda
        beam.Map(
            lambda batch: decoded_examples_to_arrow.DecodedExamplesToTable(
                batch)))
Ejemplo n.º 3
0
def _generate_partial_statistics_from_df(
    dataframe: pd.DataFrame, stats_options: options.StatsOptions,
    stats_generators: List[stats_generator.CombinerStatsGenerator]
) -> List[Any]:
    """Generate accumulators containing partial stats for a DataFrame.

    Every row of `dataframe` is turned into a dict mapping column name to a
    one-element numpy array (or None for a null float/string value). Columns
    whose dtype kind is unsupported, or that are excluded by a non-empty
    feature whitelist, are skipped with a warning. Bool columns are decoded as
    integers and recorded in a schema with a BoolDomain.

    Args:
      dataframe: The input pandas DataFrame.
      stats_options: Options for generating statistics. The object is not
        modified; a shallow copy with `feature_whitelist` cleared (and
        `schema` set when bool columns are present) is passed downstream.
      stats_generators: A list of combiner statistics generators.

    Returns:
      The result of `stats_impl.generate_partial_statistics_in_memory` for the
      decoded rows.
    """
    inmemory_dicts = [{} for _ in range(len(dataframe))]
    isnull = pd.isnull
    # Initialize decoding fn based on column type.
    # Concrete dtypes are used on purpose: `np.object` was removed in
    # NumPy 1.24, and the abstract `np.integer` / `np.floating` are deprecated
    # as dtype arguments (they resolved to `np.int_` / `np.float64`).
    int_fn = lambda x: np.array([x], dtype=np.int_)
    float_fn = lambda x: None if isnull(x) else np.array([x],
                                                         dtype=np.float64)
    str_fn = lambda x: None if isnull(x) else np.array([x], dtype=object)
    decode_fn = {
        # int type.
        'i': int_fn,
        'u': int_fn,
        # float type.
        'f': float_fn,
        # bool type.
        'b': int_fn,
        # string type.
        'S': str_fn,
        'O': str_fn,
        'U': str_fn,
    }

    feature_whitelist = set()
    if stats_options.feature_whitelist:
        feature_whitelist.update(stats_options.feature_whitelist)
    # Create a copy of the stats options so that we don't modify the input
    # object.
    stats_options_modified = copy.copy(stats_options)
    # Remove feature_whitelist option as it is no longer needed: the
    # whitelist is applied below while decoding the columns.
    stats_options_modified.feature_whitelist = None
    schema = schema_pb2.Schema()
    for col_name, col_type in zip(dataframe.columns, dataframe.dtypes):
        kind = col_type.kind
        if (kind not in decode_fn
                or (feature_whitelist and col_name not in feature_whitelist)):
            logging.warning('Ignoring feature %s of type %s', col_name,
                            col_type)
            continue
        if kind == 'b':
            # Track bool type feature as categorical.
            schema.feature.add(name=col_name,
                               type=schema_pb2.INT,
                               bool_domain=schema_pb2.BoolDomain())

        # Get decoding fn based on column type.
        fn = decode_fn[kind]
        # Iterate over the column and store the decoded value in the dict of
        # the corresponding row.
        for row_index, val in enumerate(dataframe[col_name]):
            inmemory_dicts[row_index][col_name] = fn(val)
    if schema.feature:
        stats_options_modified.schema = schema
    return stats_impl.generate_partial_statistics_in_memory(
        decoded_examples_to_arrow.DecodedExamplesToTable(inmemory_dicts),
        stats_options_modified, stats_generators)
 def test_conversion(self, input_examples, expected_output):
      """Checks each expected feature decodes to one matching Arrow chunk."""
      table = decoded_examples_to_arrow.DecodedExamplesToTable(
          input_examples)
      # The table must contain exactly as many columns as expected features.
      self.assertLen(expected_output, table.num_columns)
      for feature_name, expected_arrow_array in expected_output.items():
          column_data = table.column(feature_name).data
          self.assertLen(column_data.chunks, 1)
          actual_arrow_array = column_data.chunk(0)
          self.assertTrue(expected_arrow_array.equals(actual_arrow_array))
Ejemplo n.º 5
0
def _generate_partial_statistics_from_df(
    dataframe,
    stats_options,
    stats_generators
):
  """Generate accumulators containing partial stats for a DataFrame.

  Every row of `dataframe` is turned into a dict mapping column name to a
  one-element numpy array (or None for a null float/string value). Columns
  whose dtype kind is unsupported are skipped with a warning. Bool columns are
  decoded as integers and recorded in a schema with a BoolDomain.

  Args:
    dataframe: The input pandas DataFrame.
    stats_options: Options for generating statistics. The object is not
      modified; when bool columns are present a shallow copy carrying the
      generated schema is passed downstream instead.
    stats_generators: A list of combiner statistics generators.

  Returns:
    The result of `stats_impl.generate_partial_statistics_in_memory` for the
    decoded rows.
  """
  # Imported locally so this function does not depend on a module-level
  # import being present.
  import copy

  inmemory_dicts = [{} for _ in range(len(dataframe))]
  isnull = pd.isnull
  # Initialize decoding fn based on column type.
  # Concrete dtypes are used on purpose: `np.object` was removed in
  # NumPy 1.24, and the abstract `np.integer` / `np.floating` are deprecated
  # as dtype arguments (they resolved to `np.int_` / `np.float64`).
  int_fn = lambda x: np.array([x], dtype=np.int_)
  float_fn = lambda x: None if isnull(x) else np.array([x], dtype=np.float64)
  str_fn = lambda x: None if isnull(x) else np.array([x], dtype=object)
  decode_fn = {
      # int type.
      'i': int_fn,
      'u': int_fn,
      # float type.
      'f': float_fn,
      # bool type.
      'b': int_fn,
      # string type.
      'S': str_fn,
      'O': str_fn,
      'U': str_fn,
  }

  schema = schema_pb2.Schema()
  for col_name, col_type in zip(dataframe.columns, dataframe.dtypes):
    kind = col_type.kind
    if kind not in decode_fn:
      logging.warning('Ignoring feature %s of type %s', col_name, col_type)
      continue
    if kind == 'b':
      # Track bool type feature as categorical.
      schema.feature.add(
          name=col_name, type=schema_pb2.INT,
          bool_domain=schema_pb2.BoolDomain())

    # Get decoding fn based on column type.
    fn = decode_fn[kind]
    # Iterate over the column and store the decoded value in the dict of the
    # corresponding row.
    for row_index, val in enumerate(dataframe[col_name]):
      inmemory_dicts[row_index][col_name] = fn(val)
  if schema.feature:
    # Attach the schema to a copy so the caller's options object (which may
    # be shared across calls) is left untouched.
    stats_options = copy.copy(stats_options)
    stats_options.schema = schema
  return stats_impl.generate_partial_statistics_in_memory(
      decoded_examples_to_arrow.DecodedExamplesToTable(inmemory_dicts),
      stats_options, stats_generators)
Ejemplo n.º 6
0
    def _maybe_do_batch(self, accumulator, force=False):
        """Processes the buffered examples once a batch is ready.

        A batch is processed when at least `self._desired_batch_size` examples
        are buffered, or when `force` is set and the buffer is non-empty.
        Processing records the batch size, converts the buffered examples to an
        Arrow table, updates the partial accumulators through
        `self._for_each_generator`, and clears the buffer in place.

        Args:
          accumulator: Accumulator. Will be updated in place.
          force: If true, process any buffered examples even when there are
            fewer of them than the desired batch size.
        """
        num_buffered = len(accumulator.input_examples)
        flush_forced = force and num_buffered > 0
        if not flush_forced and num_buffered < self._desired_batch_size:
            # Not enough examples yet and nothing forces a flush.
            return

        self._combine_add_input_batch_size.update(num_buffered)
        arrow_table = decoded_examples_to_arrow.DecodedExamplesToTable(
            accumulator.input_examples)

        def add_batch(gen, gen_acc):
            return gen.add_input(gen_acc, arrow_table)

        accumulator.partial_accumulators = self._for_each_generator(
            add_batch, accumulator.partial_accumulators)
        # Empty the buffer in place so the accumulator keeps the same list.
        del accumulator.input_examples[:]
 def test_conversion_empty_examples(self):
      """Ten feature-less examples give a 10-row table with no columns."""
      num_examples = 10
      # The list holds the same empty dict repeated `num_examples` times.
      table = decoded_examples_to_arrow.DecodedExamplesToTable(
          [{}] * num_examples)
      self.assertEqual(table.num_rows, num_examples)
      self.assertEqual(table.num_columns, 0)
 def test_invalid_input(self, test_input, expected_error,
                        expected_error_regexp):
      """Checks that invalid input raises the expected, matching error.

      Args:
        test_input: Input passed to DecodedExamplesToTable.
        expected_error: Exception type that must be raised.
        expected_error_regexp: Regex the exception message must match.
      """
      # `assertRaisesRegexp` is a deprecated alias that was removed from
      # unittest.TestCase in Python 3.12; `assertRaisesRegex` is the
      # supported spelling.
      with self.assertRaisesRegex(expected_error, expected_error_regexp):
          decoded_examples_to_arrow.DecodedExamplesToTable(test_input)
Ejemplo n.º 9
0
 def test_conversion_empty_input(self):
      """An empty example list gives a table with no columns and no rows."""
      no_examples = []
      table = decoded_examples_to_arrow.DecodedExamplesToTable(no_examples)
      self.assertEqual(table.num_columns, 0)
      self.assertEqual(table.num_rows, 0)
Ejemplo n.º 10
0
 def _flush_buffer(self):
   """Converts the buffered examples to an Arrow table and empties the buffer.

   Returns:
     The value returned by DecodedExamplesToTable for the buffered examples.
   """
   buffered_examples = self._buffer
   table = decoded_examples_to_arrow.DecodedExamplesToTable(
       buffered_examples)
   # Clear in place so the same list object keeps serving as the buffer.
   del buffered_examples[:]
   return table