Exemple #1
0
def _input_fn(file_pattern: List[Text],
              data_accessor: tfx.components.DataAccessor,
              tf_transform_output: tft.TFTransformOutput,
              batch_size: int = 200) -> tf.data.Dataset:
    """Generates features and label for tuning/training.

  Args:
    file_pattern: List of paths or patterns of input tfrecord files.
    data_accessor: DataAccessor for converting input to RecordBatch.
    tf_transform_output: A TFTransformOutput.
    batch_size: representing the number of consecutive elements of returned
      dataset to combine in a single batch

  Returns:
    A dataset that contains (features, indices) tuple where features is a
      dictionary of Tensors, and indices is a single Tensor of label indices.
  """
    dataset = data_accessor.tf_dataset_factory(
        file_pattern,
        tfxio.TensorFlowDatasetOptions(batch_size=batch_size,
                                       label_key=_LABEL_KEY),
        tf_transform_output.transformed_metadata.schema)
    dataset = dataset.repeat()

    return dataset.prefetch(tf.data.AUTOTUNE)
Exemple #2
0
def get_dataset(
    file_pattern: List[str],
    data_accessor: tfx.components.DataAccessor,
    schema: schema_pb2.Schema,
    batch_size: int = 200,
) -> tf.data.Dataset:
    """Generates features and label for training.
    Args:
      file_pattern: List of paths or patterns of input tfrecord files.
      data_accessor: DataAccessor for converting input to RecordBatch.
      schema: schema of the input data.
      batch_size: representing the number of consecutive elements of returned
        dataset to combine in a single batch.
    Returns:
      A dataset that contains (features, indices) tuple where features is a
        dictionary of Tensors, and indices is a single Tensor of label indices.
    """
    dataset = data_accessor.tf_dataset_factory(
        file_pattern,
        tfxio.TensorFlowDatasetOptions(batch_size=batch_size,
                                       label_key=features.TARGET_FEATURE_NAME),
        schema=schema,
    ).repeat()

    return dataset
Exemple #3
0
def _input_fn(file_patterns, data_accessor, batch_size) -> tf.data.Dataset:
    """Returns a dataset of decoded tensors."""
    def prepare_label(parsed_ragged_tensors):
        label = parsed_ragged_tensors.pop(features.LABEL)
        # Convert labels to a dense tensor.
        label = label.to_tensor(default_value=features.LABEL_PADDING_VALUE)
        return parsed_ragged_tensors, label

    # NOTE: this dataset already contains RaggedTensors from the Decoder.
    dataset = data_accessor.tf_dataset_factory(
        file_patterns,
        tfxio.TensorFlowDatasetOptions(batch_size=batch_size),
        schema=None)
    return dataset.map(prepare_label).repeat()
Exemple #4
0
def _input_fn(file_pattern: List[str],
              data_accessor: tfx.components.DataAccessor,
              schema: schema_pb2.Schema,
              label: str,
              batch_size: int = 200) -> tf.data.Dataset:
    """Generates features and label for tuning/training.

  Args:
    file_pattern: List of paths or patterns of input tfrecord files.
    data_accessor: DataAccessor for converting input to RecordBatch.
    schema: A schema proto of input data.
    label: Name of the label.
    batch_size: representing the number of consecutive elements of returned
      dataset to combine in a single batch

  Returns:
    A dataset that contains (features, indices) tuple where features is a
      dictionary of Tensors, and indices is a single Tensor of label indices.
  """
    return data_accessor.tf_dataset_factory(
        file_pattern,
        tfxio.TensorFlowDatasetOptions(batch_size=batch_size, label_key=label),
        schema).repeat()
Exemple #5
0
def input_fn(file_pattern: List[str],
             data_accessor: tfx.components.DataAccessor,
             tf_transform_output: tft.TFTransformOutput,
             batch_size: int) -> tf.data.Dataset:
  """Generates features and label for tuning/training for a single epoch.

  Args:
    file_pattern: List of paths or patterns of input tfrecord files.
    data_accessor: DataAccessor for converting input to RecordBatch.
    tf_transform_output: A TFTransformOutput.
    batch_size: representing the number of consecutive elements of returned
      dataset to combine in a single batch

  Returns:
    A dataset that contains (features, indices) tuple where features is a
      dictionary of Tensors, and indices is a single Tensor of label indices.
  """
  return data_accessor.tf_dataset_factory(
      file_pattern,
      tfxio.TensorFlowDatasetOptions(
          batch_size=batch_size, num_epochs=1,
          label_key=base.transformed_name(base._LABEL_KEY)),  # pylint: disable=protected-access
      tf_transform_output.transformed_metadata.schema)