def testGetBatchElementsKwargs(self):
  kwargs = record_based_tfxio.GetBatchElementsKwargs(batch_size=None)
  self.assertDictEqual(kwargs, {"max_batch_size": 1000})
  kwargs = record_based_tfxio.GetBatchElementsKwargs(batch_size=5000)
  self.assertDictEqual(kwargs, {
      "max_batch_size": 5000,
      "min_batch_size": 5000
  })
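# A minimal sketch of the behavior the test above asserts, assuming only what
# the assertions show; the real record_based_tfxio implementation may differ
# in detail. With batch_size=None, Beam's BatchElements is left to auto-tune
# the batch size up to a cap of 1000 rows; with an explicit batch_size, both
# bounds are pinned so every (non-final) batch has exactly that many rows.
def _get_batch_elements_kwargs_sketch(batch_size):
  """Illustrative stand-in for record_based_tfxio.GetBatchElementsKwargs."""
  if batch_size is None:
    return {"max_batch_size": 1000}
  return {"max_batch_size": batch_size, "min_batch_size": batch_size}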
def expand(self, lines: beam.pvalue.PCollection):
  """Decodes the input CSV records into an in-memory Arrow representation.

  Args:
    lines: A PCollection of strings representing the lines in the CSV file.

  Returns:
    A PCollection of Arrow tables representing the decoded CSV records.
  """
  csv_lines = (
      lines | 'ParseCSVLines' >> beam.ParDo(
          csv_decoder.ParseCSVLine(self._delimiter)))

  if self._infer_type_from_schema:
    column_infos = _get_feature_types_from_schema(self._schema,
                                                  self._column_names)
  else:
    # TODO(b/72746442): Consider using a DeepCopy optimization similar to TFT.
    # Do a first pass to infer the feature types.
    column_infos = beam.pvalue.AsSingleton(
        csv_lines | 'InferColumnTypes' >> beam.CombineGlobally(
            csv_decoder.ColumnTypeInferrer(
                column_names=self._column_names,
                skip_blank_lines=self._skip_blank_lines)))

  # Do a second pass to decode the batched lines into Arrow format.
  return (
      csv_lines
      | 'BatchCSVLines' >> beam.BatchElements(
          **record_based_tfxio.GetBatchElementsKwargs(
              self._desired_batch_size))
      | 'BatchedCSVRowsToArrow' >> beam.ParDo(
          _BatchedCSVRowsToArrow(skip_blank_lines=self._skip_blank_lines),
          column_infos))
def BatchExamplesToArrowTables(
    examples: beam.pvalue.PCollection,
    desired_batch_size: Optional[int] = (
        constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE)
) -> beam.pvalue.PCollection:
  """Batches example dicts into Arrow tables.

  Args:
    examples: A PCollection of example dicts.
    desired_batch_size: Batch size. The output Arrow tables will have as many
      rows as `desired_batch_size`.

  Returns:
    A PCollection of Arrow tables.
  """
  # DecodedExamplesToTable should be called within a lambda function instead
  # of specifying the function name in beam.Map for the reasons discussed in
  # b/143648957.
  # TODO(b/131315065): Remove the comment above when the CSV decoder no longer
  # uses BatchExamplesToArrowTables.
  return (
      examples
      | "BatchBeamExamples" >> beam.BatchElements(
          **record_based_tfxio.GetBatchElementsKwargs(desired_batch_size))
      | "DecodeExamplesToTable" >>
      # pylint: disable=unnecessary-lambda
      beam.Map(lambda x: decoded_examples_to_arrow.DecodedExamplesToTable(x)))
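# Hedged usage sketch, not part of the original source. It assumes
# BatchExamplesToArrowTables is wrapped with @beam.ptransform_fn (its
# PCollection-first signature suggests this) so it can be applied with `|`,
# and that example dicts map feature names to value lists; adjust to the
# actual decoder contract.
def _example_usage_batch_examples_to_arrow_tables():
  import apache_beam as beam  # Assumed import; matches the `beam` alias above.
  with beam.Pipeline() as p:
    _ = (
        p
        | "CreateExamples" >> beam.Create([
            {"f1": [1], "f2": [b"a"]},
            {"f1": [2], "f2": [b"b"]},
        ])
        # Each emitted Arrow table holds up to 1000 rows here.
        | "ToArrowTables" >> BatchExamplesToArrowTables(
            desired_batch_size=1000))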
def _PTransformFn(raw_record_pcoll: beam.pvalue.PCollection):
  return (
      raw_record_pcoll
      | 'Batch' >> beam.BatchElements(
          **record_based_tfxio.GetBatchElementsKwargs(batch_size))
      | 'ToRecordBatch' >> beam.Map(
          _BatchedRecordsToArrow, self.raw_record_column_name))
def _ptransform_fn(raw_records_pcoll: beam.pvalue.PCollection):
  return (
      raw_records_pcoll
      | "Batch" >> beam.BatchElements(
          **record_based_tfxio.GetBatchElementsKwargs(batch_size))
      | "Decode" >> beam.ParDo(
          _DecodeBatchExamplesDoFn(self._schema,
                                   self.raw_record_column_name)))
def BatchSerializedExamplesToArrowTables(
    examples: beam.pvalue.PCollection,
    desired_batch_size: Optional[int] = (
        constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE)
) -> beam.pvalue.PCollection:
  """Batches serialized examples into Arrow tables.

  Args:
    examples: A PCollection of serialized tf.Examples.
    desired_batch_size: Batch size. The output Arrow tables will have as many
      rows as `desired_batch_size`.

  Returns:
    A PCollection of Arrow tables.
  """
  return (
      examples
      | "BatchSerializedExamples" >> beam.BatchElements(
          **record_based_tfxio.GetBatchElementsKwargs(desired_batch_size))
      | "BatchDecodeExamples" >> beam.ParDo(_BatchDecodeExamplesDoFn()))
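# Hedged usage sketch, not part of the original source. The tf.train.Example
# construction and the @beam.ptransform_fn-style application are assumptions;
# only BatchSerializedExamplesToArrowTables itself comes from the code above.
def _example_usage_batch_serialized_examples():
  import apache_beam as beam
  import tensorflow as tf
  # Build one serialized tf.Example with a single int64 feature.
  serialized = tf.train.Example(
      features=tf.train.Features(
          feature={
              "f1": tf.train.Feature(
                  int64_list=tf.train.Int64List(value=[1])),
          })).SerializeToString()
  with beam.Pipeline() as p:
    _ = (
        p
        | "CreateSerialized" >> beam.Create([serialized])
        | "ToArrowTables" >> BatchSerializedExamplesToArrowTables())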
def BatchExamplesToArrowRecordBatches(
    examples: beam.pvalue.PCollection,
    desired_batch_size: Optional[int] = (
        constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE)
) -> beam.pvalue.PCollection:
  """Batches example dicts into Arrow record batches.

  Args:
    examples: A PCollection of example dicts.
    desired_batch_size: Batch size. The output Arrow record batches will have
      as many rows as `desired_batch_size`.

  Returns:
    A PCollection of Arrow record batches.
  """
  return (
      examples
      | "BatchBeamExamples" >> beam.BatchElements(
          **record_based_tfxio.GetBatchElementsKwargs(desired_batch_size))
      | "DecodeExamplesToRecordBatch" >>
      # pylint: disable=unnecessary-lambda
      beam.Map(lambda x: decoded_examples_to_arrow
               .DecodedExamplesToRecordBatch(x)))