Example #1
def _create_dataset(
    file_pattern: List[Text],
    data_accessor: DataAccessor,
    tf_transform_output: tft.TFTransformOutput,
    is_train: bool = False,
    batch_size: int = 200
) -> tf.data.Dataset:
    """create dataset

    Args:
        file_pattern (List[Text]): List of paths or patterns of input tfrecord files.
        data_accessor (DataAccessor): DataAccessor for converting input to RecordBatch.
        tf_transform_output (tft.TFTransformOutput): A TFTransformOutput.
        is_train (bool, optional): Whether the input dataset is train split or not. Defaults to False.
        batch_size (int, optional): representing the number of consecutive elements of returned dataset to combine in a single batch. Defaults to 200.

    Returns:
        tf.data.Dataset: A dataset that contains (features, indices) tuple where features is a dictionary of Tensors, and indices is a single Tensor of label indices.
    """

    dataset = data_accessor.tf_dataset_factory(
        file_pattern,
        dataset_options.TensorFlowDatasetOptions(
            batch_size=batch_size, label_key=_transform_key_name(LABEL_KEY)
        ),
        tf_transform_output.transformed_metadata.schema
    )

    if is_train:
        dataset = dataset.map(lambda x, y: (_data_augment(x), y))
    return dataset
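For context, a minimal sketch of how a helper like this is typically wired into a TFX Trainer module; the run_fn body, the _build_keras_model helper, and the use of fn_args.transform_output are illustrative assumptions, not part of the example above:

import tensorflow_transform as tft
from tfx.components.trainer.fn_args_utils import FnArgs


def run_fn(fn_args: FnArgs):
    # fn_args is supplied by the Trainer component.
    tf_transform_output = tft.TFTransformOutput(fn_args.transform_output)
    train_dataset = _create_dataset(
        fn_args.train_files, fn_args.data_accessor,
        tf_transform_output, is_train=True)
    eval_dataset = _create_dataset(
        fn_args.eval_files, fn_args.data_accessor,
        tf_transform_output, is_train=False)
    model = _build_keras_model()  # hypothetical model builder
    model.fit(
        train_dataset,
        steps_per_epoch=fn_args.train_steps,
        validation_data=eval_dataset,
        validation_steps=fn_args.eval_steps)
    model.save(fn_args.serving_model_dir, save_format='tf')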
Example #2
def _input_fn(file_pattern: List[str],
              data_accessor: DataAccessor,
              tf_transform_output: tft.TFTransformOutput,
              is_train: bool = False,
              batch_size: int = 200) -> tf.data.Dataset:
    """Generates features and label for tuning/training.

  Args:
    file_pattern: List of paths or patterns of input tfrecord files.
    data_accessor: DataAccessor for converting input to RecordBatch.
    tf_transform_output: A TFTransformOutput.
    is_train: Whether the input dataset is train split or not.
    batch_size: representing the number of consecutive elements of returned
      dataset to combine in a single batch

  Returns:
    A dataset that contains (features, indices) tuple where features is a
      dictionary of Tensors, and indices is a single Tensor of label indices.
  """
    dataset = data_accessor.tf_dataset_factory(
        file_pattern,
        dataset_options.TensorFlowDatasetOptions(
            batch_size=batch_size, label_key=_transformed_name(_LABEL_KEY)),
        tf_transform_output.transformed_metadata.schema)
    # Apply data augmentation. We have to do data augmentation here because
    # we need to apply data augmentation on-the-fly during training. If we put
    # it in Transform, it will only be applied once on the whole dataset, which
    # will lose the point of data augmentation.
    if is_train:
        dataset = dataset.map(lambda x, y: (_data_augmentation(x), y))

    return dataset
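The _data_augmentation helper referenced above is defined elsewhere in the original module. A minimal sketch of what such a function could look like, assuming an image feature stored under 'image_xf' (the feature name is an assumption, not from the example):

import tensorflow as tf


def _data_augmentation(feature_dict):
    # Hypothetical: assumes the transformed features include a dense image
    # batch under 'image_xf'; all other features pass through unchanged.
    image = feature_dict['image_xf']
    feature_dict['image_xf'] = tf.image.random_flip_left_right(image)
    return feature_dict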
Example #3
 def testTensorFlowDatasetGraphMode(self):
     column_name = "raw_record"
     tfxio = raw_tf_record.RawTfRecordTFXIO(
         self._raw_record_file,
         column_name,
         telemetry_descriptors=["some", "component"])
     actual_records = []
     with tf.compat.v1.Graph().as_default():
         ds = tfxio.TensorFlowDataset(
             dataset_options.TensorFlowDatasetOptions(
                 batch_size=1,
                 shuffle=False,
                 num_epochs=1,
                 reader_num_threads=1,
                 sloppy_ordering=False))
         iterator = tf.compat.v1.data.make_one_shot_iterator(ds)
         next_elem = iterator.get_next()
         with tf.compat.v1.Session() as sess:
             while True:
                 try:
                     actual_records.append(
                         sess.run(next_elem)[column_name][0])
                 except tf.errors.OutOfRangeError:
                     break
     self.assertEqual(actual_records, _RAW_RECORDS)
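Example #16 below reads the same raw records in eager mode, where the dataset can be iterated directly, without a one-shot iterator or a Session.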
Example #4
def _input_fn(file_pattern: List[Text],
              data_accessor: DataAccessor,
              tf_transform_output: tft.TFTransformOutput,
              batch_size: int = 200) -> tf.data.Dataset:
    """Generates features and label for tuning/training.

  Args:
    file_pattern: List of paths or patterns of input tfrecord files.
    data_accessor: DataAccessor for converting input to RecordBatch.
    tf_transform_output: A TFTransformOutput.
    batch_size: representing the number of consecutive elements of returned
      dataset to combine in a single batch

  Returns:
    A dataset that contains (features, indices) tuple where features is a
      dictionary of Tensors, and indices is a single Tensor of label indices.
  """
    dataset = data_accessor.tf_dataset_factory(
        file_pattern,
        dataset_options.TensorFlowDatasetOptions(
            batch_size=batch_size,
            label_key=features.transformed_name(features.LABEL_KEY)),
        tf_transform_output.transformed_metadata.schema)

    return dataset
Example #5
    def ReadExamplesArtifact(self,
                             examples: types.Artifact,
                             num_examples: int,
                             split_name: Optional[Text] = None):
        """Read records from Examples artifact.

    Currently it assumes Examples artifact contains serialized tf.Example in
    gzipped TFRecord files.

    Args:
      examples: `Examples` artifact.
      num_examples: Number of examples to read. If the specified value is larger
          than the actual number of examples, all examples would be read.
      split_name: Name of the split to read from the Examples artifact.

    Raises:
      RuntimeError: If read twice.
    """
        if self._records:
            raise RuntimeError('Cannot read records twice.')

        if num_examples < 1:
            raise ValueError('num_examples < 1 (got {})'.format(num_examples))

        available_splits = artifact_utils.decode_split_names(
            examples.split_names)
        if not available_splits:
            raise ValueError(
                'No split_name is available in given Examples artifact.')
        if split_name is None:
            split_name = available_splits[0]
        if split_name not in available_splits:
            raise ValueError(
                'No split_name {}; available split names: {}'.format(
                    split_name, ', '.join(available_splits)))

        # ExampleGen generates artifacts under each split_name directory.
        glob_pattern = os.path.join(examples.uri, split_name, '*')
        tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
            examples=[examples],
            telemetry_descriptors=_TELEMETRY_DESCRIPTORS,
            schema=None,
            read_as_raw_records=True,
            raw_record_column_name=_RAW_RECORDS_COLUMN)
        try:
            filenames = fileio.glob(glob_pattern)
        except tf.errors.NotFoundError:
            filenames = []
        if not filenames:
            raise ValueError(
                'Unable to find examples matching {}.'.format(glob_pattern))

        self._payload_format = examples_utils.get_payload_format(examples)
        tfxio = tfxio_factory(filenames)

        self._ReadFromDataset(
            tfxio.TensorFlowDataset(
                dataset_options.TensorFlowDatasetOptions(
                    batch_size=num_examples)))
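Note that batch_size=num_examples makes the first batch of the dataset contain up to num_examples records, so _ReadFromDataset can gather everything it needs from a single batch.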
Example #6
 def test_tensorflow_dataset_with_invalid_label_key(self):
   tfxio = record_to_tensor_tfxio.TFRecordToTensorTFXIO(
       self._input_path, self._decoder_path, ["some", "component"])
   label_key = "invalid"
   options = dataset_options.TensorFlowDatasetOptions(
       batch_size=1, shuffle=False, num_epochs=1, label_key=label_key)
   with self.assertRaisesRegex(ValueError, "The `label_key` provided.*"):
     tfxio.TensorFlowDataset(options=options)
Example #7
 def test_tensorflow_dataset(self):
   tfxio = record_to_tensor_tfxio.TFRecordToTensorTFXIO(
       self._input_path, self._decoder_path, ["some", "component"])
   options = dataset_options.TensorFlowDatasetOptions(
       batch_size=1, shuffle=False, num_epochs=1)
   for i, decoded_tensors_dict in enumerate(
       tfxio.TensorFlowDataset(options=options)):
     for key, tensor in decoded_tensors_dict.items():
       self._AssertSparseTensorEqual(tensor, _RECORDS_AS_TENSORS[i][key])
Example #8
def _input_fn(file_pattern,
              data_accessor,
              tf_transform_output,
              batch_size=200):
    return data_accessor.tf_dataset_factory(
        file_pattern,
        dataset_options.TensorFlowDatasetOptions(
            batch_size=batch_size, label_key=transformed_name(LABEL_KEY)),
        tf_transform_output.transformed_metadata.schema)
Example #9
 def testTensorFlowDataset(self):
   tfxio = self._MakeTFXIO(_SCHEMA)
   options = dataset_options.TensorFlowDatasetOptions(
       batch_size=1, shuffle=False, num_epochs=1)
   for i, parsed_examples_dict in enumerate(
       tfxio.TensorFlowDataset(options=options)):
     self.assertLen(parsed_examples_dict, 3)
     for feature_name, tensor in parsed_examples_dict.items():
       self._AssertSparseTensorEqual(
           tensor, _EXAMPLES_AS_TENSORS[i][feature_name])
Example #10
 def testTensorFlowDatasetWithLabelKey(self):
   tfxio = self._MakeTFXIO(_SCHEMA)
   options = dataset_options.TensorFlowDatasetOptions(
       batch_size=1, shuffle=False, num_epochs=1, label_key="string_feature")
   for i, (parsed_examples_dict, label_feature) in enumerate(
       tfxio.TensorFlowDataset(options=options)):
     self._AssertSparseTensorEqual(
         label_feature, _EXAMPLES_AS_TENSORS[i]["string_feature"])
     self.assertLen(parsed_examples_dict, 2)
     for feature_name, tensor in parsed_examples_dict.items():
       self._AssertSparseTensorEqual(
           tensor, _EXAMPLES_AS_TENSORS[i][feature_name])
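To make the (features, label) contract concrete, here is a self-contained sketch (assuming tensorflow and tfx_bsl are installed; the file path, feature names, and values are invented for illustration):

import os
import tempfile

import tensorflow as tf
from google.protobuf import text_format
from tensorflow_metadata.proto.v0 import schema_pb2
from tfx_bsl.tfxio import dataset_options, tf_example_record

schema = text_format.Parse(
    """
    feature { name: "x" type: FLOAT }
    feature { name: "label" type: INT }
    """, schema_pb2.Schema())

# Write two tf.Examples to a TFRecord file.
path = os.path.join(tempfile.mkdtemp(), "data.tfrecord")
with tf.io.TFRecordWriter(path) as writer:
    for x, label in [(1.0, 0), (2.0, 1)]:
        example = tf.train.Example(features=tf.train.Features(feature={
            "x": tf.train.Feature(float_list=tf.train.FloatList(value=[x])),
            "label": tf.train.Feature(
                int64_list=tf.train.Int64List(value=[label])),
        }))
        writer.write(example.SerializeToString())

# Read them back; with label_key set, the dataset yields (features, label).
source = tf_example_record.TFExampleRecord(
    path, schema=schema, telemetry_descriptors=["demo"])
ds = source.TensorFlowDataset(
    dataset_options.TensorFlowDatasetOptions(
        batch_size=1, shuffle=False, num_epochs=1, label_key="label"))
for features, label in ds:
    # Features without a fixed value_count are parsed as SparseTensors.
    print({k: v.values.numpy() for k, v in features.items()},
          label.values.numpy())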
Example #11
 def testProjectedTensorFlowDataset(self):
   tfxio = self._MakeTFXIO(_SCHEMA)
   feature_name = "string_feature"
   projected_tfxio = tfxio.Project([feature_name])
   options = dataset_options.TensorFlowDatasetOptions(
       batch_size=1, shuffle=False, num_epochs=1)
   for i, parsed_examples_dict in enumerate(
       projected_tfxio.TensorFlowDataset(options=options)):
     self.assertIn(feature_name, parsed_examples_dict)
     self.assertLen(parsed_examples_dict, 1)
     self._AssertSparseTensorEqual(parsed_examples_dict[feature_name],
                                   _EXAMPLES_AS_TENSORS[i][feature_name])
Example #12
   def testTensorFlowDatasetWithTensorRepresentation(self):
       schema = text_format.Parse(
           """
     feature {
       name: "int_feature"
       type: INT
       value_count {
         min: 1
         max: 1
       }
     }
     feature {
       name: "float_feature"
       type: FLOAT
       value_count {
         min: 4
         max: 4
       }
     }
     feature {
       name: "string_feature"
       type: BYTES
       value_count {
         min: 0
         max: 2
       }
     }
      tensor_representation_group {
        key: ""
        value {
          tensor_representation {
            key: "var_len_feature"
            value {
              varlen_sparse_tensor {
                column_name: "string_feature"
              }
            }
          }
        }
      }
    """, schema_pb2.Schema())
       tfxio = self._MakeTFXIO(schema)
       options = dataset_options.TensorFlowDatasetOptions(batch_size=1,
                                                          shuffle=False,
                                                          num_epochs=1)
       for i, parsed_examples_dict in enumerate(
               tfxio.TensorFlowDataset(options=options)):
           self.assertLen(parsed_examples_dict, 1)
           for tensor_name, tensor in parsed_examples_dict.items():
               self.assertEqual(tensor_name, "var_len_feature")
               self._AssertSparseTensorEqual(
                   tensor, _EXAMPLES_AS_TENSORS[i]["string_feature"])
Example #13
def input_fn(
    file_pattern: List[Text],
    data_accessor: DataAccessor,
    tf_transform_output: tft.TFTransformOutput,
    batch_size: int = 200,
) -> tf.data.Dataset:
    return data_accessor.tf_dataset_factory(
        file_pattern,
        dataset_options.TensorFlowDatasetOptions(
            batch_size=batch_size,
            label_key=module.transformed_name(module.LABEL_KEY)),
        tf_transform_output.transformed_metadata.schema,
    )
Example #14
 def test_tensorflow_dataset_with_label_key(self):
   decoder_path = _write_decoder()
   tfxio = record_to_tensor_tfxio.TFRecordToTensorTFXIO(
       self._input_path, decoder_path, ["some", "component"])
   label_key = "st1"
   options = dataset_options.TensorFlowDatasetOptions(
       batch_size=1, shuffle=False, num_epochs=1, label_key=label_key)
   for i, (decoded_tensors_dict, label_feature) in enumerate(
       tfxio.TensorFlowDataset(options=options)):
     self._assert_sparse_tensor_equal(
         label_feature, _RECORDS_AS_TENSORS[i][label_key])
     for key, tensor in decoded_tensors_dict.items():
       self._assert_sparse_tensor_equal(tensor, _RECORDS_AS_TENSORS[i][key])
Example #15
 def test_projected_tensorflow_dataset(self):
   tfxio = record_to_tensor_tfxio.TFRecordToTensorTFXIO(
       self._input_path, self._decoder_path, ["some", "component"])
   feature_name = "st1"
   projected_tfxio = tfxio.Project([feature_name])
   options = dataset_options.TensorFlowDatasetOptions(
       batch_size=1, shuffle=False, num_epochs=1)
   for i, decoded_tensors_dict in enumerate(
       projected_tfxio.TensorFlowDataset(options=options)):
     self.assertIn(feature_name, decoded_tensors_dict)
     self.assertLen(decoded_tensors_dict, 1)
     tensor = decoded_tensors_dict[feature_name]
     self._AssertSparseTensorEqual(tensor,
                                   _RECORDS_AS_TENSORS[i][feature_name])
Example #16
 def testTensorFlowDataset(self):
     column_name = "raw_record"
     tfxio = raw_tf_record.RawTfRecordTFXIO(
         self._raw_record_file,
         column_name,
         telemetry_descriptors=["some", "component"])
     ds = tfxio.TensorFlowDataset(
         dataset_options.TensorFlowDatasetOptions(batch_size=1,
                                                  shuffle=False,
                                                  num_epochs=1,
                                                  reader_num_threads=1,
                                                  sloppy_ordering=False))
     actual_records = [d[column_name].numpy()[0] for d in ds]
     self.assertEqual(actual_records, _RAW_RECORDS)
Example #17
 def build_dataset(files):
     # Note: batch_size, schema, Features, inputs, outputs and fn_args are
     # captured from the enclosing scope of the original module.
     return (
         fn_args.data_accessor.tf_dataset_factory(
             files,
             dataset_options.TensorFlowDatasetOptions(batch_size),
             schema,
         )
         .map(
             lambda batch: (
                 Features(inputs).map(lambda name, _: batch[name]),
                 Features(outputs).map(lambda name, _: batch[name]),
             )
         )
         .repeat()
     )
Example #18
def _input_fn(
    file_pattern: List[str],
    data_accessor: DataAccessor,
    tf_transform_output: tft.TFTransformOutput,
    batch_size: int = 200,
) -> tf.data.Dataset:
    dataset = data_accessor.tf_dataset_factory(
        file_pattern,
        dataset_options.TensorFlowDatasetOptions(
            batch_size=batch_size,
            label_key="class_xf",
        ),
        tf_transform_output.transformed_metadata.schema,
    )

    return dataset.repeat()
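Because of the trailing repeat(), the returned dataset is infinite; the caller is expected to bound iteration explicitly, for example via steps_per_epoch when passing it to Keras fit().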
Example #19
 def testTensorFlowDatasetGraphMode(self):
     tfxio = self._MakeTFXIO(_SCHEMA)
     options = dataset_options.TensorFlowDatasetOptions(batch_size=1,
                                                        shuffle=False,
                                                        num_epochs=1)
     with tf.compat.v1.Graph().as_default():
         ds = tfxio.TensorFlowDataset(options=options)
         iterator = tf.compat.v1.data.make_one_shot_iterator(ds)
         next_elem = iterator.get_next()
         records = []
         with tf.compat.v1.Session() as sess:
             while True:
                 try:
                     records.append(sess.run(next_elem))
                 except tf.errors.OutOfRangeError:
                     break
     for i, parsed_examples_dict in enumerate(records):
         self.assertLen(parsed_examples_dict, 3)
         for tensor_name, tensor in parsed_examples_dict.items():
             self._AssertSparseTensorEqual(
                 tensor, _EXAMPLES_AS_TENSORS[i][tensor_name])
Example #20
def _input_fn(file_pattern: List[Text],
              data_accessor: DataAccessor,
              schema: schema_pb2.Schema,
              batch_size: int = 200) -> tf.data.Dataset:
    """Generates features and label for tuning/training.

  Args:
    file_pattern: List of paths or patterns of input tfrecord files.
    data_accessor: DataAccessor for converting input to RecordBatch.
    schema: schema of the input data.
    batch_size: representing the number of consecutive elements of returned
      dataset to combine in a single batch

  Returns:
    A dataset that contains (features, indices) tuple where features is a
      dictionary of Tensors, and indices is a single Tensor of label indices.
  """
    return data_accessor.tf_dataset_factory(
        file_pattern,
        dataset_options.TensorFlowDatasetOptions(batch_size=batch_size,
                                                 label_key=_LABEL_KEY), schema)
Example #21
def _read_transformed_dataset(
    file_pattern,
    data_accessor,
    tf_transform_output,
    num_epochs=1,
    shuffle=False,
    sloppy_ordering=True,
    batch_size=100000,
):
    """
    Read data coming out of Transformation component.

    Parameters
    ----------
    file_pattern : list(str)
        List of paths or patterns of input tfrecord files.
    data_accessor : tfx.components.trainer.fn_args_utils.DataAccessor
        DataAccessor for converting input to RecordBatch.
    tf_transform_output : tft.TFTransformOutput
        A TFTransformOutput.
    num_epochs : int, optional
        Number of times to iterate over the dataset. Defaults to 1.
    shuffle : bool, optional
        Whether to shuffle the records. Defaults to False.
    sloppy_ordering : bool, optional
        Whether reads may be reordered for speed at the cost of determinism.
        Defaults to True.
    batch_size : int, optional
        Number of consecutive elements to combine in a single batch.
        Defaults to 100000.

    Returns
    -------
    tf.data.Dataset (iterable)
        An iterable dataset where each iteration returns a data batch
        as a dictionary {"field1": array[...], "field2": array[...], ...}
    """
    return data_accessor.tf_dataset_factory(
        file_pattern,
        dataset_options.TensorFlowDatasetOptions(
            num_epochs=num_epochs,
            shuffle=shuffle,
            sloppy_ordering=sloppy_ordering,
            batch_size=int(batch_size),
        ),
        tf_transform_output.transformed_metadata.schema,
    )
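With the defaults above (num_epochs=1 and a very large batch_size), the first element of the returned dataset typically covers the entire transformed split, so next(iter(dataset)) yields a single dictionary of arrays for all rows, assuming the split fits in one batch.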
Example #22
    def testTensorFlowDatasetWithRaggedTensorRepresentation(self):
        schema = text_format.Parse(
            """
      feature {
        name: "varlen_feature"
        type: INT
      }
      feature {
        name: "row_lengths"
        type: INT
      }
      tensor_representation_group {
        key: ""
        value {
          tensor_representation {
            key: "ragged"
            value {
              ragged_tensor {
                feature_path { step: "varlen_feature" }
                partition { row_length: "row_lengths" }
              }
            }
          }
        }
      }
    """, schema_pb2.Schema())
        tfxio = self._MakeTFXIO(schema)
        projected_tfxio = tfxio.Project(["ragged"])

        expected_column_values = {
            "varlen_feature":
            pa.array([[1, 2, 3], [4], [5, 6]], type=pa.large_list(pa.int64())),
            "row_lengths":
            pa.array([[2, 1], [1], [1, 1]], type=pa.large_list(pa.int64())),
        }

        def _AssertFn(record_batch_list):
            self.assertLen(record_batch_list, 1)
            record_batch = record_batch_list[0]

            self.assertIsInstance(record_batch, pa.RecordBatch)
            self.assertEqual(record_batch.num_rows, 3)
            print(record_batch.schema)
            for i, field in enumerate(record_batch.schema):
                self.assertTrue(
                    record_batch.column(i).equals(
                        expected_column_values[field.name]),
                    "Column {} did not match ({} vs {}).".format(
                        field.name, record_batch.column(i),
                        expected_column_values[field.name]))

            # self._ValidateRecordBatch(tfxio, record_batch)
            expected_schema = projected_tfxio.ArrowSchema()
            self.assertTrue(
                record_batch.schema.equals(expected_schema),
                "actual: {}; expected: {}".format(record_batch.schema,
                                                  expected_schema))
            tensor_adapter = projected_tfxio.TensorAdapter()
            dict_of_tensors = tensor_adapter.ToBatchTensors(record_batch)
            self.assertLen(dict_of_tensors, 1)
            self.assertIn("ragged", dict_of_tensors)

            if tf.executing_eagerly():
                ragged_factory = tf.RaggedTensor.from_row_splits
            else:
                ragged_factory = tf.compat.v1.ragged.RaggedTensorValue
            expected_tensor = ragged_factory(values=ragged_factory(
                values=[1, 2, 3, 4, 5, 6], row_splits=[0, 2, 3, 4, 5, 6]),
                                             row_splits=[0, 2, 3, 5])
            self.assertAllEqual(dict_of_tensors["ragged"], expected_tensor)

        with beam.Pipeline() as p:
            # Setting the batch_size to make sure only one batch is generated.
            record_batch_pcoll = p | projected_tfxio.BeamSource(
                batch_size=len(_EXAMPLES))
            beam_testing_util.assert_that(record_batch_pcoll, _AssertFn)

        if tf.executing_eagerly():
            ragged_factory = tf.RaggedTensor.from_row_splits
        else:
            ragged_factory = tf.compat.v1.ragged.RaggedTensorValue

        expected_tensors = [
            ragged_factory(values=ragged_factory(values=[1, 2, 3],
                                                 row_splits=[0, 2, 3]),
                           row_splits=[0, 2]),
            ragged_factory(values=ragged_factory(values=[4],
                                                 row_splits=[0, 1]),
                           row_splits=[0, 1]),
            ragged_factory(values=ragged_factory(values=[5, 6],
                                                 row_splits=[0, 1, 2]),
                           row_splits=[0, 2]),
        ]

        options = dataset_options.TensorFlowDatasetOptions(batch_size=1,
                                                           shuffle=False,
                                                           num_epochs=1)
        for i, parsed_examples_dict in enumerate(
                projected_tfxio.TensorFlowDataset(options)):
            self.assertLen(parsed_examples_dict, 1)
            self.assertIn("ragged", parsed_examples_dict)
            self.assertAllEqual(parsed_examples_dict["ragged"],
                                expected_tensors[i])