def generate_partial_statistics_in_memory(examples, options, stats_generators):
  """Generates statistics for an in-memory list of examples.

  Args:
    examples: A list of input examples.
    options: Options for generating data statistics.
    stats_generators: A list of combiner statistics generators.

  Returns:
    A list of accumulators containing partial statistics.
  """
  result = []
  # DecodedExamplesToTable cannot handle empty input.
  if not examples:
    return [gen.create_accumulator() for gen in stats_generators]
  table = decoded_examples_to_arrow.DecodedExamplesToTable(examples)
  if options.feature_whitelist:
    whitelisted_columns = [
        table.column(f) for f in options.feature_whitelist
    ]
    table = pa.Table.from_arrays(whitelisted_columns)
  for generator in stats_generators:
    result.append(
        generator.add_input(generator.create_accumulator(), table))
  return result
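# A minimal sketch (not part of the original module) of the decoded-example
# format that generate_partial_statistics_in_memory above consumes: a list of
# dicts mapping feature names to 1-D numpy arrays, with None for a missing
# value, matching how the DataFrame conversion below builds its inputs. The
# feature names and values here are hypothetical.
import numpy as np

examples = [
    {'age': np.array([39], dtype=np.int64),
     'name': np.array([b'alice'], dtype=np.object_)},
    {'age': np.array([42], dtype=np.int64),
     'name': None},  # Feature known but missing in this example.
]
# partial_stats = generate_partial_statistics_in_memory(
#     examples, stats_options, stats_generators)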
def BatchExamplesToArrowTables(
    examples: beam.pvalue.PCollection,
    desired_batch_size: Optional[int] = constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE
) -> beam.pvalue.PCollection:
  """Batches example dicts into Arrow tables.

  Args:
    examples: A PCollection of example dicts.
    desired_batch_size: Batch size. The output Arrow tables will have as many
      rows as the `desired_batch_size`.

  Returns:
    A PCollection of Arrow tables.
  """
  # DecodedExamplesToTable should be called within a lambda function instead of
  # specifying the function name in beam.Map for the reasons discussed in
  # b/143648957.
  # TODO(b/131315065): Remove the comment above when the CSV decoder no longer
  # uses BatchExamplesToArrowTables.
  return (
      examples
      | "BatchBeamExamples" >>
      beam.BatchElements(**GetBeamBatchKwargs(desired_batch_size))
      | "DecodeExamplesToTable" >>
      # pylint: disable=unnecessary-lambda
      beam.Map(lambda x: decoded_examples_to_arrow.DecodedExamplesToTable(x)))
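# A minimal usage sketch for BatchExamplesToArrowTables above, assuming
# GetBeamBatchKwargs, constants, and decoded_examples_to_arrow are importable
# as in the surrounding module. The example dicts are hypothetical; the point
# is only to show batching decoded examples into Arrow tables in a local
# Beam pipeline.
import apache_beam as beam
import numpy as np

with beam.Pipeline() as p:
  examples = p | "CreateExamples" >> beam.Create([
      {'age': np.array([39], dtype=np.int64)},
      {'age': np.array([42], dtype=np.int64)},
  ])
  # With desired_batch_size=2, both examples land in a single Arrow table.
  tables = BatchExamplesToArrowTables(examples, desired_batch_size=2)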
def _generate_partial_statistics_from_df(
    dataframe: pd.DataFrame,
    stats_options: options.StatsOptions,
    stats_generators: List[stats_generator.CombinerStatsGenerator]
) -> List[Any]:
  """Generate accumulators containing partial stats."""
  inmemory_dicts = [{} for _ in range(len(dataframe))]
  isnull = pd.isnull
  # Initialize decoding fn based on column type.
  int_fn = lambda x: np.array([x], dtype=np.integer)
  float_fn = lambda x: None if isnull(x) else np.array([x], dtype=np.floating)
  str_fn = lambda x: None if isnull(x) else np.array([x], dtype=np.object)
  decode_fn = {
      # int type.
      'i': int_fn,
      'u': int_fn,
      # float type.
      'f': float_fn,
      # bool type.
      'b': int_fn,
      # string type.
      'S': str_fn,
      'O': str_fn,
      'U': str_fn,
  }
  feature_whitelist = set()
  if stats_options.feature_whitelist:
    feature_whitelist.update(stats_options.feature_whitelist)
  # Create a copy of the stats options so that we don't modify the input
  # object.
  stats_options_modified = copy.copy(stats_options)
  # Remove feature_whitelist option as it is no longer needed.
  stats_options_modified.feature_whitelist = None
  schema = schema_pb2.Schema()
  for col_name, col_type in zip(dataframe.columns, dataframe.dtypes):
    kind = col_type.kind
    if (kind not in decode_fn or
        (feature_whitelist and col_name not in feature_whitelist)):
      logging.warning('Ignoring feature %s of type %s', col_name, col_type)
      continue
    if kind == 'b':
      # Track bool type feature as categorical.
      schema.feature.add(
          name=col_name,
          type=schema_pb2.INT,
          bool_domain=schema_pb2.BoolDomain())
    # Get decoding fn based on column type.
    fn = decode_fn[kind]
    # Iterate over the column and apply the decoding fn.
    j = 0
    for val in dataframe[col_name]:
      inmemory_dicts[j][col_name] = fn(val)
      j += 1
  if schema.feature:
    stats_options_modified.schema = schema
  return stats_impl.generate_partial_statistics_in_memory(
      decoded_examples_to_arrow.DecodedExamplesToTable(inmemory_dicts),
      stats_options_modified, stats_generators)
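# A small illustration (not from the original module; column names and values
# are hypothetical) of the numpy dtype.kind values that drive the decode_fn
# mapping in _generate_partial_statistics_from_df above.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'age': [39, 42],             # dtype.kind == 'i' -> int_fn
    'weight': [72.5, np.nan],    # dtype.kind == 'f' -> float_fn (NaN -> None)
    'is_member': [True, False],  # dtype.kind == 'b' -> int_fn, tracked as INT
    'name': ['alice', None],     # dtype.kind == 'O' -> str_fn (None -> None)
})
for col_name, col_type in zip(df.columns, df.dtypes):
  print(col_name, col_type.kind)
# age i / weight f / is_member b / name O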
def test_conversion(self, input_examples, expected_output):
  table = decoded_examples_to_arrow.DecodedExamplesToTable(input_examples)
  self.assertLen(expected_output, table.num_columns)
  for feature_name, expected_arrow_array in six.iteritems(expected_output):
    self.assertLen(table.column(feature_name).data.chunks, 1)
    self.assertTrue(
        expected_arrow_array.equals(
            table.column(feature_name).data.chunk(0)))
def _generate_partial_statistics_from_df(
    dataframe, stats_options, stats_generators):
  """Generate accumulators containing partial stats."""
  inmemory_dicts = [{} for _ in range(len(dataframe))]
  isnull = pd.isnull
  # Initialize decoding fn based on column type.
  int_fn = lambda x: np.array([x], dtype=np.integer)
  float_fn = lambda x: None if isnull(x) else np.array([x], dtype=np.floating)
  str_fn = lambda x: None if isnull(x) else np.array([x], dtype=np.object)
  decode_fn = {
      # int type.
      'i': int_fn,
      'u': int_fn,
      # float type.
      'f': float_fn,
      # bool type.
      'b': int_fn,
      # string type.
      'S': str_fn,
      'O': str_fn,
      'U': str_fn,
  }
  schema = schema_pb2.Schema()
  for col_name, col_type in zip(dataframe.columns, dataframe.dtypes):
    kind = col_type.kind
    if kind not in decode_fn:
      logging.warning('Ignoring feature %s of type %s', col_name, col_type)
      continue
    if kind == 'b':
      # Track bool type feature as categorical.
      schema.feature.add(
          name=col_name,
          type=schema_pb2.INT,
          bool_domain=schema_pb2.BoolDomain())
    # Get decoding fn based on column type.
    fn = decode_fn[kind]
    # Iterate over the column and apply the decoding fn.
    j = 0
    for val in dataframe[col_name]:
      inmemory_dicts[j][col_name] = fn(val)
      j += 1
  if schema.feature:
    stats_options.schema = schema
  return stats_impl.generate_partial_statistics_in_memory(
      decoded_examples_to_arrow.DecodedExamplesToTable(inmemory_dicts),
      stats_options, stats_generators)
def _maybe_do_batch(self, accumulator, force=False):
  """Maybe updates accumulator in place.

  Checks if accumulator has enough examples for a batch, and if so, does the
  stats computation for the batch and updates accumulator in place.

  Args:
    accumulator: Accumulator. Will be updated in place.
    force: Force computation of stats even if accumulator has fewer examples
      than the batch size.
  """
  batch_size = len(accumulator.input_examples)
  if (force and batch_size > 0) or batch_size >= self._desired_batch_size:
    self._combine_add_input_batch_size.update(batch_size)
    arrow_table = decoded_examples_to_arrow.DecodedExamplesToTable(
        accumulator.input_examples)
    accumulator.partial_accumulators = self._for_each_generator(
        lambda gen, gen_acc: gen.add_input(gen_acc, arrow_table),
        accumulator.partial_accumulators)
    del accumulator.input_examples[:]
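# A hypothetical sketch (class and field names assumed, not taken from the
# original module) of the accumulator shape _maybe_do_batch above operates on:
# a buffer of decoded example dicts plus one partial accumulator per stats
# generator, which is what each generator's add_input consumes and produces.
class _SampleAccumulator(object):

  def __init__(self, partial_accumulators):
    # Decoded example dicts buffered until a full batch has arrived.
    self.input_examples = []
    # One partial accumulator per combiner stats generator.
    self.partial_accumulators = partial_accumulators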
def test_conversion_empty_examples(self):
  input_examples = [{}] * 10
  table = decoded_examples_to_arrow.DecodedExamplesToTable(input_examples)
  self.assertEqual(table.num_rows, 10)
  self.assertEqual(table.num_columns, 0)
def test_invalid_input(self, test_input, expected_error,
                       expected_error_regexp):
  with self.assertRaisesRegexp(expected_error, expected_error_regexp):
    decoded_examples_to_arrow.DecodedExamplesToTable(test_input)
def test_conversion_empty_input(self):
  table = decoded_examples_to_arrow.DecodedExamplesToTable([])
  self.assertEqual(table.num_columns, 0)
  self.assertEqual(table.num_rows, 0)
def _flush_buffer(self):
  arrow_table = decoded_examples_to_arrow.DecodedExamplesToTable(self._buffer)
  del self._buffer[:]
  return arrow_table