Example no. 1
0
 def add_input(
     self, accumulator: _CombinerStatsGeneratorsCombineFnAcc,
     input_record_batch: pa.RecordBatch
 ) -> _CombinerStatsGeneratorsCombineFnAcc:
   """Folds one input record batch into the combining accumulator.

   The batch is buffered on the accumulator, running size/byte counters are
   updated, and the buffered batches are (possibly) flushed into the stats
   generators when internal thresholds are reached.

   Args:
     accumulator: The accumulator to update in place.
     input_record_batch: A new Arrow record batch of input examples.

   Returns:
     The same accumulator, updated with the new batch.
   """
   row_count = input_record_batch.num_rows
   accumulator.input_record_batches.append(input_record_batch)
   accumulator.curr_batch_size += row_count
   accumulator.curr_byte_size += table_util.TotalByteSize(input_record_batch)
   # May flush the buffered batches into the generators if the accumulated
   # batch/byte thresholds have been crossed.
   self._maybe_do_batch(accumulator)
   self._num_instances.inc(row_count)
   return accumulator
Example no. 2
0
 def process(self, record_batch: pa.RecordBatch) -> Iterable[pa.RecordBatch]:
   """Records telemetry for a record batch and passes it through unchanged.

   Args:
     record_batch: An Arrow record batch flowing through the pipeline.

   Yields:
     The same record batch, untouched.
   """
   row_count = record_batch.num_rows
   self._num_rows.inc(row_count)
   self._UpdateNumCellsCounters(record_batch)
   self._byte_size_dist.update(
       table_util.TotalByteSize(record_batch, ignore_unsupported=True))
   # The per-row distributions require num_rows * k dist.update() calls per
   # batch, which is expensive, so they are only updated on a sampled
   # fraction of batches.
   if np.random.rand() < self._dist_update_prob:
     self._UpdateNumColumnsDist(record_batch)
     self._UpdateNumValuesDist(record_batch)
   yield record_batch
Example no. 3
0
  def test_simple(self, factory):
    """Checks TotalByteSize against a hand-computed Arrow buffer budget."""
    # ListArray<int64> budget: 3 int64 values (24 bytes) + 4 int32 offsets
    # (16 bytes) + 1 null bitmap byte for the outer ListArray + 1 null
    # bitmap byte for the inner Int64Array = 42 bytes.
    nested_list = pa.array([[1, 2], [3], None], type=pa.list_(pa.int64()))

    # StructArray budget: 3 int64 values (24 bytes) + 1 null bitmap byte for
    # the outer StructArray + 1 null bitmap byte for the inner Int64Array
    # = 26 bytes.
    nested_struct = pa.array([{"a": 1}, {"a": 2}, {"a": 3}],
                             type=pa.struct([pa.field("a", pa.int64())]))

    entity = factory([nested_list, nested_struct], ["a1", "a2"])
    self.assertEqual(42 + 26, table_util.TotalByteSize(entity))