Example #1
def testGetBatchElementsKwargs(self):
    kwargs = record_based_tfxio.GetBatchElementsKwargs(batch_size=None)
    self.assertDictEqual(kwargs, {"max_batch_size": 1000})
    kwargs = record_based_tfxio.GetBatchElementsKwargs(batch_size=5000)
    self.assertDictEqual(kwargs, {
        "max_batch_size": 5000,
        "min_batch_size": 5000,
    })
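
The test pins down the contract of GetBatchElementsKwargs: with batch_size=None it only caps the batch (max_batch_size), letting Beam tune the size dynamically, while an explicit batch_size pins both bounds so every batch has exactly that many elements. A minimal sketch of a function with this contract (ours, not the tfx_bsl implementation) might look like:

from typing import Any, Dict, Optional

def get_batch_elements_kwargs(batch_size: Optional[int]) -> Dict[str, Any]:
    """Hypothetical sketch of the contract exercised by the test above."""
    if batch_size is not None:
        # Pinning min == max makes beam.BatchElements emit fixed-size
        # batches (a trailing batch may still be smaller).
        return {"max_batch_size": batch_size, "min_batch_size": batch_size}
    # Otherwise only cap the batch size and let BatchElements tune it.
    return {"max_batch_size": 1000}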
Example #2
def expand(self, lines: beam.pvalue.PCollection):
    """Decodes the input CSV records into an in-memory dict representation.

    Args:
      lines: A PCollection of strings representing the lines in the CSV file.

    Returns:
      A PCollection of dicts representing the CSV records.
    """
    csv_lines = (lines | 'ParseCSVLines' >> beam.ParDo(
        csv_decoder.ParseCSVLine(self._delimiter)))

    if self._infer_type_from_schema:
        column_infos = _get_feature_types_from_schema(
            self._schema, self._column_names)
    else:
        # TODO(b/72746442): Consider using a DeepCopy optimization similar to TFT.
        # Do first pass to infer the feature types.
        column_infos = beam.pvalue.AsSingleton(
            csv_lines | 'InferColumnTypes' >> beam.CombineGlobally(
                csv_decoder.ColumnTypeInferrer(
                    column_names=self._column_names,
                    skip_blank_lines=self._skip_blank_lines)))

    # Do second pass to generate the in-memory dict representation.
    return (
        csv_lines
        | 'BatchCSVLines' >> beam.BatchElements(
            **record_based_tfxio.GetBatchElementsKwargs(
                self._desired_batch_size))
        | 'BatchedCSVRowsToArrow' >> beam.ParDo(
            _BatchedCSVRowsToArrow(skip_blank_lines=self._skip_blank_lines),
            column_infos))
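
The type-inference branch above uses Beam's singleton side-input pattern: beam.pvalue.AsSingleton wraps the single result of CombineGlobally so it can be passed as an extra argument to a downstream ParDo. A self-contained illustration of that pattern, with names of our own choosing:

import apache_beam as beam

class _TagWithMax(beam.DoFn):
    """Pairs each element with a globally computed maximum."""

    def process(self, element, max_value):
        # `max_value` arrives as a singleton side input.
        yield (element, max_value)

with beam.Pipeline() as p:
    nums = p | beam.Create([1, 5, 3])
    # CombineGlobally yields a one-element PCollection; AsSingleton turns
    # it into a side input, mirroring how `column_infos` is fed into
    # _BatchedCSVRowsToArrow above.
    max_side = beam.pvalue.AsSingleton(nums | beam.CombineGlobally(max))
    tagged = nums | beam.ParDo(_TagWithMax(), max_side)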
Example #3
def BatchExamplesToArrowTables(
    examples: beam.pvalue.PCollection,
    desired_batch_size: Optional[int] = constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE
) -> beam.pvalue.PCollection:
    """Batches example dicts into Arrow tables.

    Args:
      examples: A PCollection of example dicts.
      desired_batch_size: Batch size. The output Arrow tables will have as many
        rows as the `desired_batch_size`.

    Returns:
      A PCollection of Arrow tables.
    """
    # DecodedExamplesToTable should be called within a lambda function instead
    # of specifying the function name in beam.Map, for the reasons discussed in
    # b/143648957.
    # TODO(b/131315065): Remove the comment above when the CSV decoder no
    # longer uses BatchExamplesToArrowTables.
    return (
        examples
        | "BatchBeamExamples" >> beam.BatchElements(
            **record_based_tfxio.GetBatchElementsKwargs(desired_batch_size))
        | "DecodeExamplesToTable" >>
        # pylint: disable=unnecessary-lambda
        beam.Map(lambda x: decoded_examples_to_arrow.DecodedExamplesToTable(x)))
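
A hedged usage sketch of the transform above: the example-dict format (feature name to numpy array) follows the convention of the decoders in this codebase, and the modules imported by BatchExamplesToArrowTables are assumed to be in scope.

import apache_beam as beam
import numpy as np

examples = [
    {'age': np.array([29], dtype=np.int64)},
    {'age': np.array([42], dtype=np.int64)},
]

with beam.Pipeline() as p:
    tables = BatchExamplesToArrowTables(
        p | 'CreateExamples' >> beam.Create(examples),
        desired_batch_size=2)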
Example #4
def _PTransformFn(raw_record_pcoll: beam.pvalue.PCollection):
    return (
        raw_record_pcoll
        | 'Batch' >> beam.BatchElements(
            **record_based_tfxio.GetBatchElementsKwargs(batch_size))
        | 'ToRecordBatch' >> beam.Map(
            _BatchedRecordsToArrow, self.raw_record_column_name))
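
Nested helpers like _PTransformFn are typically promoted to composite transforms with Beam's ptransform_fn decorator, so the enclosing method can return a ready-to-apply PTransform. A self-contained illustration of that mechanism (the function body is ours):

import apache_beam as beam

@beam.ptransform_fn
def _DoubleEach(pcoll: beam.pvalue.PCollection):
    """A plain function promoted to a composite PTransform."""
    return pcoll | 'Double' >> beam.Map(lambda x: x * 2)

with beam.Pipeline() as p:
    doubled = p | beam.Create([1, 2, 3]) | 'ApplyDouble' >> _DoubleEach()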
Example #5
def _ptransform_fn(raw_records_pcoll: beam.pvalue.PCollection):
    return (
        raw_records_pcoll
        | "Batch" >> beam.BatchElements(
            **record_based_tfxio.GetBatchElementsKwargs(batch_size))
        | "Decode" >> beam.ParDo(
            _DecodeBatchExamplesDoFn(self._schema,
                                     self.raw_record_column_name)))
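
Every example here forwards the kwargs into beam.BatchElements, which groups individual elements into Python lists. A self-contained illustration of what the two bounds control (the batch size of 4 is ours):

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create(list(range(10)))
        # min == max mirrors the fixed-size case of GetBatchElementsKwargs:
        # downstream steps receive lists of 4 elements (a trailing batch
        # may be smaller).
        | beam.BatchElements(min_batch_size=4, max_batch_size=4)
        | beam.Map(print))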
Example #6
def BatchSerializedExamplesToArrowTables(
    examples: beam.pvalue.PCollection,
    desired_batch_size: Optional[int] = constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE
) -> beam.pvalue.PCollection:
    """Batches serialized examples into Arrow tables.

    Args:
      examples: A PCollection of serialized tf.Examples.
      desired_batch_size: Batch size. The output Arrow tables will have as many
        rows as the `desired_batch_size`.

    Returns:
      A PCollection of Arrow tables.
    """
    return (
        examples
        | "BatchSerializedExamples" >> beam.BatchElements(
            **record_based_tfxio.GetBatchElementsKwargs(desired_batch_size))
        | "BatchDecodeExamples" >> beam.ParDo(_BatchDecodeExamplesDoFn()))
Example #7
def BatchExamplesToArrowRecordBatches(
    examples: beam.pvalue.PCollection,
    desired_batch_size: Optional[int] = constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE
) -> beam.pvalue.PCollection:
    """Batches example dicts into Arrow record batches.

    Args:
      examples: A PCollection of example dicts.
      desired_batch_size: Batch size. The output Arrow record batches will have
        as many rows as the `desired_batch_size`.

    Returns:
      A PCollection of Arrow record batches.
    """
    return (
        examples
        | "BatchBeamExamples" >> beam.BatchElements(
            **record_based_tfxio.GetBatchElementsKwargs(desired_batch_size))
        | "DecodeExamplesToRecordBatch" >>
        # pylint: disable=unnecessary-lambda
        beam.Map(
            lambda x: decoded_examples_to_arrow.DecodedExamplesToRecordBatch(x)))
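
Unlike Example #3, this variant emits Arrow RecordBatches rather than tables. A minimal, pipeline-independent illustration of inspecting such a record batch with pyarrow:

import pyarrow as pa

# The shape mimics decoded examples: one list-valued column per feature.
batch = pa.RecordBatch.from_arrays(
    [pa.array([[1], [2, 3]], type=pa.list_(pa.int64()))], ['feature'])
print(batch.num_rows)     # 2
print(batch.to_pydict())  # {'feature': [[1], [2, 3]]}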