Example #1
def write_metadata(futures_dict, non_deferred_metadata, destination):
    unresolved_futures = non_deferred_metadata.substitute_futures(
        futures_dict)
    if unresolved_futures:
        raise ValueError('Some futures were unresolved: %r' %
                         unresolved_futures)
    metadata_io.write_metadata(non_deferred_metadata, destination)
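A minimal usage sketch for the helper above; the names below are illustrative (metadata_with_futures stands for a DatasetMetadata whose schema still contains futures produced by analyzers):

# Illustrative only: futures_dict maps each future's name to its resolved value.
futures_dict = {'vocab_size_feature_a': 1000}
write_metadata(futures_dict, metadata_with_futures, '/tmp/transformed_metadata')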
Example #2
def analyze_in_place(preprocessing_fn, force_tf_compat_v1, feature_specs,
                     type_specs, transform_output_path):
    """Analyzes the `preprocessing_fn` in-place without looking at the data.

    This should only be used if the `preprocessing_fn` contains no TFT
    analyzers or TFT mappers that use analyzers.

    Writes out a transform function and transformed metadata to subdirs under
    `transform_output_path`.

    Args:
      preprocessing_fn: The tf.Transform preprocessing_fn.
      force_tf_compat_v1: If True, call Transform's API to use TensorFlow in
        tf.compat.v1 mode.
      feature_specs: A dict from input feature key to its feature spec.
      type_specs: A dict from input feature key to its type spec.
      transform_output_path: An absolute path to write the output to.

    Raises:
      RuntimeError: If `preprocessing_fn` contains TFT analyzers.
    """
    use_tf_compat_v1 = tf2_utils.use_tf_compat_v1(force_tf_compat_v1)
    transform_fn_path = os.path.join(transform_output_path,
                                     TFTransformOutput.TRANSFORM_FN_DIR)
    if use_tf_compat_v1:
        graph, structured_inputs, structured_outputs = (
            trace_preprocessing_function(preprocessing_fn,
                                         feature_specs,
                                         use_tf_compat_v1=use_tf_compat_v1))
        _assert_no_analyzers_in_graph(graph)
        with tf.compat.v1.Session(graph=graph) as sess:
            sess.run(tf.compat.v1.global_variables_initializer())
            sess.run(tf.compat.v1.tables_initializer())
            saved_transform_io.write_saved_transform_from_session(
                sess, structured_inputs, structured_outputs, transform_fn_path)

            transformed_metadata = dataset_metadata.DatasetMetadata(
                schema=schema_inference.infer_feature_schema(
                    structured_outputs, graph, sess))
    else:
        concrete_transform_fn = _trace_and_write_transform_fn(
            saved_model_dir=transform_fn_path,
            preprocessing_fn=preprocessing_fn,
            input_signature=type_specs,
            base_temp_dir=None,
            tensor_replacement_map=None,
            output_keys_to_name_map=None)
        _assert_no_analyzers_in_graph(concrete_transform_fn.graph)
        structured_inputs = tf2_utils.get_structured_inputs_from_func_graph(
            concrete_transform_fn.graph)
        transformed_metadata = _trace_and_get_metadata(
            concrete_transform_fn=concrete_transform_fn,
            structured_inputs=structured_inputs,
            preprocessing_fn=preprocessing_fn,
            base_temp_dir=None,
            tensor_replacement_map=None)
    transformed_metadata_dir = os.path.join(
        transform_output_path, TFTransformOutput.TRANSFORMED_METADATA_DIR)
    metadata_io.write_metadata(transformed_metadata, transformed_metadata_dir)
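A hedged usage sketch of analyze_in_place, assuming a trivial preprocessing_fn with no TFT analyzers (feature names and paths are placeholders):

def preprocessing_fn(inputs):
    # Pure TF ops only; no TFT analyzers, as analyze_in_place requires.
    return {'x_plus_one': inputs['x'] + 1}

feature_specs = {'x': tf.io.FixedLenFeature([], tf.float32)}
type_specs = {'x': tf.TensorSpec([None], tf.float32)}
analyze_in_place(preprocessing_fn,
                 force_tf_compat_v1=False,
                 feature_specs=feature_specs,
                 type_specs=type_specs,
                 transform_output_path='/tmp/transform_output')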
Example #3
def trace_and_write_v2_saved_model(saved_model_dir, preprocessing_fn,
                                   input_signature, base_temp_dir,
                                   tensor_replacement_map,
                                   output_keys_to_name_map):
    """Writes out a SavedModelV2 with preprocessing_fn traced using tf.function.

    The SavedModel written contains a method called `transform_fn` that
    represents the traced `preprocessing_fn`. Additionally, if this is the final
    SavedModel being written out, it will contain a method called `metadata_fn`
    that provides deferred schema annotations.

    Args:
      saved_model_dir: Path to write the SavedModel to.
      preprocessing_fn: A user-defined Python function to be traced.
      input_signature: TypeSpecs describing the inputs to the `preprocessing_fn`.
      base_temp_dir: Base path to write temporary artifacts to.
      tensor_replacement_map: A map from placeholder tensor names to their
        evaluated replacement tensors.
      output_keys_to_name_map: A map from output dictionary keys to the names of
        the tensors that they represent.

    Returns:
      A tuple containing a pair of `tf.ConcreteFunction`s:
        1. The traced preprocessing_fn.
        2. A metadata_fn that returns a dictionary containing the deferred
           annotations added to the graph when invoked with any valid input.
    """

    transform_fn = get_traced_transform_fn(
        preprocessing_fn,
        input_signature,
        base_temp_dir,
        tensor_replacement_map=tensor_replacement_map,
        output_keys_to_name_map=output_keys_to_name_map)

    concrete_transform_fn = _write_v2_saved_model(transform_fn, 'transform_fn',
                                                  saved_model_dir)

    concrete_metadata_fn = None
    # If the `TENSOR_REPLACEMENTS` graph collection is empty, all TFT analyzers
    # in the `preprocessing_fn` have already been evaluated.
    if not concrete_transform_fn.graph.get_collection(
            analyzer_nodes.TENSOR_REPLACEMENTS):
        metadata_fn = schema_inference.get_traced_metadata_fn(
            tensor_replacement_map,
            preprocessing_fn,
            input_signature,
            base_temp_dir,
            evaluate_schema_overrides=True)
        concrete_metadata_fn = metadata_fn.get_concrete_function()
        metadata = dataset_metadata.DatasetMetadata(
            schema=schema_inference.infer_feature_schema_v2(
                concrete_transform_fn.structured_outputs,
                concrete_metadata_fn,
                evaluate_schema_overrides=True))
        metadata_io.write_metadata(
            metadata, os.path.join(saved_model_dir, METADATA_DIR_NAME))

    return concrete_transform_fn, concrete_metadata_fn
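A hedged sketch of calling the function above for an analyzer-free preprocessing_fn (all names and paths below are illustrative):

input_signature = {'s': tf.TensorSpec([None], tf.string)}

def preprocessing_fn(inputs):
    return {'s_upper': tf.strings.upper(inputs['s'])}

concrete_transform_fn, concrete_metadata_fn = trace_and_write_v2_saved_model(
    saved_model_dir='/tmp/transform_fn',
    preprocessing_fn=preprocessing_fn,
    input_signature=input_signature,
    base_temp_dir='/tmp/tft_tmp',
    tensor_replacement_map=None,
    output_keys_to_name_map=None)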
Example #4
def make_spec(output_dir, batch_size=None):
    fixed_shape = [batch_size, 1] if batch_size is not None else []
    spec = {}
    spec[LABEL_COLUMN] = tf.FixedLenFeature(shape=fixed_shape,
                                            dtype=tf.int64,
                                            default_value=None)
    spec[DISPLAY_ID_COLUMN] = tf.FixedLenFeature(shape=fixed_shape,
                                                 dtype=tf.int64,
                                                 default_value=None)
    spec[IS_LEAK_COLUMN] = tf.FixedLenFeature(shape=fixed_shape,
                                              dtype=tf.int64,
                                              default_value=None)
    spec[DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN] = tf.FixedLenFeature(
        shape=fixed_shape, dtype=tf.int64, default_value=None)

    for name in BOOL_COLUMNS:
        spec[name] = tf.FixedLenFeature(shape=fixed_shape,
                                        dtype=tf.int64,
                                        default_value=None)
    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM + FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
        spec[name] = tf.FixedLenFeature(shape=fixed_shape,
                                        dtype=tf.float32,
                                        default_value=None)
    for name in FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
        spec[name + '_binned'] = tf.FixedLenFeature(shape=fixed_shape,
                                                    dtype=tf.int64,
                                                    default_value=None)
    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
        spec[name + '_binned'] = tf.FixedLenFeature(shape=fixed_shape,
                                                    dtype=tf.int64,
                                                    default_value=None)
        spec[name + '_log_01scaled'] = tf.FixedLenFeature(shape=fixed_shape,
                                                          dtype=tf.float32,
                                                          default_value=None)
    for name in INT_COLUMNS:
        spec[name + '_log_int'] = tf.FixedLenFeature(shape=fixed_shape,
                                                     dtype=tf.int64,
                                                     default_value=None)
        spec[name + '_log_01scaled'] = tf.FixedLenFeature(shape=fixed_shape,
                                                          dtype=tf.float32,
                                                          default_value=None)
    for name in BOOL_COLUMNS + CATEGORICAL_COLUMNS:
        spec[name] = tf.FixedLenFeature(shape=fixed_shape,
                                        dtype=tf.int64,
                                        default_value=None)

    for multi_category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS:
        #spec[multi_category] = tf.VarLenFeature(dtype=tf.int64)
        shape = fixed_shape[:-1] + [
            len(DOC_CATEGORICAL_MULTIVALUED_COLUMNS[multi_category])
        ]
        spec[multi_category] = tf.FixedLenFeature(shape=shape, dtype=tf.int64)

    metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(spec))

    metadata_io.write_metadata(metadata, output_dir)
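The metadata written by make_spec can be read back and converted into a feature spec again; a minimal round-trip sketch using the same older DatasetMetadata/Schema API shown in these examples:

reloaded = metadata_io.read_metadata(output_dir)
feature_spec = reloaded.schema.as_feature_spec()
assert LABEL_COLUMN in feature_spec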
Example #5
def trace_and_write_v2_saved_model(
        saved_model_dir: str,
        preprocessing_fn: Callable[[Mapping[str, common_types.TensorType]],
                                   Mapping[str, common_types.TensorType]],
        input_signature: Mapping[str, tf.TypeSpec],
        base_temp_dir: Optional[str], baseline_analyzers_fingerprint: Mapping[
            str, graph_tools.AnalyzersFingerprint],
        tensor_replacement_map: Optional[Dict[str, tf.Tensor]],
        output_keys_to_name_map: Optional[Dict[str, str]]):
    """Writes out a SavedModelV2 with preprocessing_fn traced using tf.function.

    The SavedModel written contains a method called `transform_fn` that
    represents the traced `preprocessing_fn`. Additionally, if this is the final
    SavedModel being written out, it will contain a method called `metadata_fn`
    that provides deferred schema annotations.

    Args:
      saved_model_dir: Path to write the SavedModel to.
      preprocessing_fn: A user-defined Python function to be traced.
      input_signature: TypeSpecs describing the inputs to the `preprocessing_fn`.
      base_temp_dir: Base path to write temporary artifacts to.
      baseline_analyzers_fingerprint: A mapping from analyzer name to a set of
        paths that define its fingerprint.
      tensor_replacement_map: A map from placeholder tensor names to their
        evaluated replacement tensors.
      output_keys_to_name_map: A map from output dictionary keys to the names of
        the tensors that they represent.

    Returns:
      A tuple containing a pair of `tf.ConcreteFunction`s:
        1. The traced preprocessing_fn.
        2. A metadata_fn that returns a dictionary containing the deferred
           annotations added to the graph when invoked with any valid input.

    Raises:
      RuntimeError: If analyzers in `preprocessing_fn` are encountered in a
        non-deterministic order.
    """
    concrete_transform_fn = _trace_and_write_transform_fn(
        saved_model_dir, preprocessing_fn, input_signature, base_temp_dir,
        tensor_replacement_map, output_keys_to_name_map)
    structured_inputs = tf2_utils.get_structured_inputs_from_func_graph(
        concrete_transform_fn.graph)
    _validate_analyzers_fingerprint(baseline_analyzers_fingerprint,
                                    concrete_transform_fn.graph,
                                    structured_inputs)

    # If the `TENSOR_REPLACEMENTS` graph collection is empty, all TFT analyzers
    # in the `preprocessing_fn` have already been evaluated.
    if not concrete_transform_fn.graph.get_collection(
            analyzer_nodes.TENSOR_REPLACEMENTS):
        metadata = _trace_and_get_metadata(concrete_transform_fn,
                                           structured_inputs, preprocessing_fn,
                                           base_temp_dir,
                                           tensor_replacement_map)
        metadata_io.write_metadata(
            metadata, os.path.join(saved_model_dir, METADATA_DIR_NAME))
Example #6
    def _RunInPlaceImpl(self, preprocessing_fn: Any,
                        metadata: dataset_metadata.DatasetMetadata,
                        transform_output_path: Text) -> _Status:
        """Runs a transformation iteration in-place without looking at the data.

        Args:
          preprocessing_fn: The tf.Transform preprocessing_fn.
          metadata: A DatasetMetadata object for the input data.
          transform_output_path: An absolute path to write the output to.

        Returns:
          Status of the execution.
        """

        tf.logging.info('Processing an in-place transform')

        raw_metadata_dir = os.path.join(transform_output_path,
                                        tft.TFTransformOutput.RAW_METADATA_DIR)
        metadata_io.write_metadata(metadata, raw_metadata_dir)

        with tf.Graph().as_default() as graph:
            with tf.Session(graph=graph) as sess:

                input_signature = impl_helper.feature_spec_as_batched_placeholders(
                    schema_utils.schema_as_feature_spec(
                        _GetSchemaProto(metadata)).feature_spec)

                # In order to avoid a bug where import_graph_def fails when the
                # input_map and return_elements of an imported graph are the same
                # (b/34288791), we avoid using the placeholder of an input column as an
                # output of a graph. We do this by applying tf.identity to all inputs of
                # the preprocessing_fn.  Note this applies at the level of raw tensors.
                # TODO(b/34288791): Remove this workaround and use a shallow copy of
                # inputs instead.  A shallow copy is needed in case
                # self._preprocessing_fn mutates its input.
                copied_inputs = impl_helper.copy_tensors(input_signature)

                output_signature = preprocessing_fn(copied_inputs)
                sess.run(tf.global_variables_initializer())
                sess.run(tf.tables_initializer())
                transform_fn_path = os.path.join(
                    transform_output_path,
                    tft.TFTransformOutput.TRANSFORM_FN_DIR)
                saved_transform_io.write_saved_transform_from_session(
                    sess, input_signature, output_signature, transform_fn_path)

                transformed_metadata = dataset_metadata.DatasetMetadata(
                    schema=tft.schema_inference.infer_feature_schema(
                        output_signature, graph, sess))

        transformed_metadata_dir = os.path.join(
            transform_output_path,
            tft.TFTransformOutput.TRANSFORMED_METADATA_DIR)
        metadata_io.write_metadata(transformed_metadata,
                                   transformed_metadata_dir)

        return _Status.OK()
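Once _RunInPlaceImpl has written its outputs, they can be consumed through tft.TFTransformOutput; a minimal sketch:

tft_output = tft.TFTransformOutput(transform_output_path)
transformed_schema = tft_output.transformed_metadata.schema
saved_model_dir = tft_output.transform_savedmodel_dir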
Example #7
    def test_write_and_read(self):
        basedir = tempfile.mkdtemp()
        original = dataset_metadata.DatasetMetadata(
            schema=test_common.get_test_schema())

        metadata_io.write_metadata(original, basedir, versions=_test_versions)
        reloaded = metadata_io.read_metadata(basedir, versions=_test_versions)

        generated_feature_spec = reloaded.schema.as_feature_spec()
        self.assertEqual(test_common.test_feature_spec, generated_feature_spec)
Example #8
    def test_write_and_read(self):
        # TODO(b/123241798): use TEST_TMPDIR
        basedir = tempfile.mkdtemp()
        original = dataset_metadata.DatasetMetadata(
            schema=test_common.get_test_schema())

        metadata_io.write_metadata(original, basedir)
        reloaded = metadata_io.read_metadata(basedir)

        self.assertEqual(original, reloaded)
Example #9
    def test_write_and_read(self):
        # TODO(b/123241798): use TEST_TMPDIR
        basedir = tempfile.mkdtemp()
        original = dataset_metadata.DatasetMetadata(
            schema=test_common.get_test_schema())

        metadata_io.write_metadata(original, basedir)
        reloaded = metadata_io.read_metadata(basedir)

        generated_feature_spec = reloaded.schema.as_feature_spec()
        self.assertEqual(test_common.test_feature_spec, generated_feature_spec)
Example #10
def create_metadata(df, prebatch_size, output_path):
    fixed_shape = [prebatch_size, 1]
    spec = {}
    for column in df:
        if column in CATEGORICAL_COLUMNS + [DISPLAY_ID_COLUMN]:
            spec[transform_nvt_to_spark(column)] = tf.io.FixedLenFeature(shape=fixed_shape, dtype=tf.int64,
                                                                         default_value=None)
        else:
            spec[transform_nvt_to_spark(column)] = tf.io.FixedLenFeature(shape=fixed_shape, dtype=tf.float32,
                                                                         default_value=None)
    metadata = dataset_metadata.DatasetMetadata(dataset_schema.from_feature_spec(spec))
    metadata_io.write_metadata(metadata, output_path)
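A hedged invocation sketch for create_metadata; the DataFrame column below is a placeholder (real inputs carry the CATEGORICAL_COLUMNS, DISPLAY_ID_COLUMN and float columns referenced above):

import pandas as pd

# Placeholder column only: anything not in CATEGORICAL_COLUMNS or
# DISPLAY_ID_COLUMN falls through to the float32 branch above.
df = pd.DataFrame({'some_float_feature': [0.1, 0.7]})
create_metadata(df, prebatch_size=4096, output_path='/tmp/tfrecord_metadata')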
Example #11
  def _RunInPlaceImpl(self, preprocessing_fn,
                      metadata,
                      transform_output_path):
    """Runs a transformation iteration in-place without looking at the data.

    Args:
      preprocessing_fn: The tf.Transform preprocessing_fn.
      metadata: A DatasetMetadata object for the input data.
      transform_output_path: An absolute path to write the output to.

    Returns:
      Status of the execution.
    """

    tf.logging.info('Processing an in-place transform')

    raw_metadata_dir = os.path.join(transform_output_path,
                                    tft.TFTransformOutput.RAW_METADATA_DIR)
    metadata_io.write_metadata(metadata, raw_metadata_dir)

    with tf.Graph().as_default() as graph:
      with tf.Session(graph=graph) as sess:

        input_signature = impl_helper.feature_spec_as_batched_placeholders(
            metadata.schema.as_feature_spec())

        # In order to avoid a bug where import_graph_def fails when the
        # input_map and return_elements of an imported graph are the same
        # (b/34288791), we avoid using the placeholder of an input column as an
        # output of a graph. We do this by applying tf.identity to all inputs of
        # the preprocessing_fn.  Note this applies at the level of raw tensors.
        # TODO(b/34288791): Remove this workaround and use a shallow copy of
        # inputs instead.  A shallow copy is needed in case
        # self._preprocessing_fn mutates its input.
        copied_inputs = impl_helper.copy_tensors(input_signature)

        output_signature = preprocessing_fn(copied_inputs)
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        transform_fn_path = os.path.join(transform_output_path,
                                         tft.TFTransformOutput.TRANSFORM_FN_DIR)
        saved_transform_io.write_saved_transform_from_session(
            sess, input_signature, output_signature, transform_fn_path)

        transformed_metadata = dataset_metadata.DatasetMetadata(
            schema=tft.schema_inference.infer_feature_schema(
                output_signature, graph, sess))

    transformed_metadata_dir = os.path.join(
        transform_output_path, tft.TFTransformOutput.TRANSFORMED_METADATA_DIR)
    metadata_io.write_metadata(transformed_metadata, transformed_metadata_dir)

    return _Status.OK()
Example #12
  def test_write_and_read(self):
    basedir = tempfile.mkdtemp()
    original_schema = schema_io_vtest.TestSchema(
        {'test_feature_1': 'bogus 1', 'test_feature_2': 'bogus 2'})
    original = dataset_metadata.DatasetMetadata(schema=original_schema)

    metadata_io.write_metadata(original, basedir, versions=_test_versions)
    reloaded = metadata_io.read_metadata(basedir, versions=_test_versions)

    self.assertTrue('test_feature_1' in reloaded.schema.column_schemas)
    self.assertTrue('test_feature_2' in reloaded.schema.column_schemas)
    self.assertEqual(2, len(reloaded.schema.column_schemas))
Example #13
def write_metadata_output(metadata):
    output_path = self._path
    if self._write_to_unique_subdirectory:
        output_path = common.get_unique_temp_path(self._path)
    metadata_io.write_metadata(metadata, output_path)
    if asset_map:
        with tf.io.gfile.GFile(
                os.path.join(
                    output_path,
                    output_wrapper.TFTransformOutput.ASSET_MAP),
                'w') as f:
            f.write(json.dumps(asset_map))
    return output_path
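For symmetry, a sketch of reading the asset map back from the location used above (same constants, default read mode):

with tf.io.gfile.GFile(
        os.path.join(output_path,
                     output_wrapper.TFTransformOutput.ASSET_MAP)) as f:
    asset_map = json.loads(f.read())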
Example #14
    def testReadTransformFn(self):
        path = self.get_temp_dir()
        # NOTE: we don't need to create or write to the transform_fn directory since
        # ReadTransformFn never inspects this directory.
        transform_fn_dir = os.path.join(path, 'transform_fn')
        transformed_metadata_dir = os.path.join(path, 'transformed_metadata')
        metadata_io.write_metadata(_TEST_METADATA, transformed_metadata_dir)

        with beam.Pipeline() as pipeline:
            saved_model_dir_pcoll, metadata = (
                pipeline | transform_fn_io.ReadTransformFn(path))
            beam_test_util.assert_that(saved_model_dir_pcoll,
                                       beam_test_util.equal_to(
                                           [transform_fn_dir]),
                                       label='AssertSavedModelDir')
            # NOTE: metadata is currently read in a non-deferred manner.
            self.assertEqual(metadata, _TEST_METADATA)
Example #15
def make_transform_graph(output_dir, schema, features):
    """Writes a tft transform fn, and metadata files.

  Args:
    output_dir: output folder
    schema: schema list
    features: features dict
  """

    tft_input_schema = make_tft_input_schema(
        schema, os.path.join(output_dir, STATS_FILE))
    tft_input_metadata = dataset_metadata.DatasetMetadata(
        schema=tft_input_schema)
    preprocessing_fn = make_preprocessing_fn(output_dir, features)

    # preprocessing_fn does not use any analyzer, so we can run a local beam job
    # to properly make and write the transform function.
    temp_dir = os.path.join(output_dir, 'tmp')
    with beam.Pipeline('DirectRunner', options=None) as p:
        with tft_impl.Context(temp_dir=temp_dir):

            # Not going to transform, so no data is needed.
            train_data = p | beam.Create([])

            transform_fn = (
                (train_data, tft_input_metadata)
                | 'BuildTransformFn'  # noqa
                >> tft_impl.AnalyzeDataset(preprocessing_fn))  # noqa

            # Writes transformed_metadata and transform_fn folders
            _ = (transform_fn |
                 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir)
                 )  # noqa

            # Write the raw_metadata
            metadata_io.write_metadata(metadata=tft_input_metadata,
                                       path=os.path.join(
                                           output_dir, RAW_METADATA_DIR))
Example #16
def make_transform_graph(output_dir, schema, features):
  """Writes a tft transform fn, and metadata files.

  Args:
    output_dir: output folder
    schema: schema list
    features: features dict
  """

  tft_input_schema = make_tft_input_schema(schema, os.path.join(output_dir,
                                                                STATS_FILE))
  tft_input_metadata = dataset_metadata.DatasetMetadata(schema=tft_input_schema)
  preprocessing_fn = make_preprocessing_fn(output_dir, features)

  # copy from /tft/beam/impl
  inputs, outputs = impl_helper.run_preprocessing_fn(
      preprocessing_fn=preprocessing_fn,
      schema=tft_input_schema)
  output_metadata = dataset_metadata.DatasetMetadata(
      schema=impl_helper.infer_feature_schema(outputs))

  transform_fn_dir = os.path.join(output_dir, TRANSFORM_FN_DIR)

  # This writes the SavedModel
  impl_helper.make_transform_fn_def(
      schema=tft_input_schema,
      inputs=inputs,
      outputs=outputs,
      saved_model_dir=transform_fn_dir)

  metadata_io.write_metadata(
      metadata=output_metadata,
      path=os.path.join(output_dir, TRANSFORMED_METADATA_DIR))
  metadata_io.write_metadata(
      metadata=tft_input_metadata,
      path=os.path.join(output_dir, RAW_METADATA_DIR))
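A hypothetical call to the helper above; the schema list mirrors the schema.json layout used in these tests, and the transform names are assumptions about what make_preprocessing_fn accepts:

make_transform_graph(
    output_dir='/tmp/tft_graph',
    # Layout as in schema.json elsewhere in these examples.
    schema=[{'name': 'num_col', 'type': 'FLOAT'},
            {'name': 'str_col', 'type': 'STRING'}],
    # Transform names here are assumptions, not a documented set.
    features={'num_col': {'transform': 'identity'},
              'str_col': {'transform': 'one_hot'}})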
Example #17
  def _create_test_data(self):
    """Makes local test data.

    The following files and folders will be created in self.output_folder:

    self.output_folder/
        features.json
        img.png
        input.csv
        schema.json
        raw_metadata/
            (tft metadata files)
        transformed_metadata/
            (tft metadata files)
        transform_fn/
            (tft saved model file)
    """
    self.output_folder = tempfile.mkdtemp()

    # Make image file
    self.img_filepath = os.path.join(self.output_folder, 'img.png')
    image = Image.new('RGBA', size=(50, 50), color=(155, 0, 0))
    image.save(self.img_filepath, 'png')

    # Make csv input file
    self.csv_input_filepath = os.path.join(self.output_folder, 'input.csv')
    file_io.write_string_to_file(
        self.csv_input_filepath,
        '23.0,%s' % self.img_filepath)

    # Make schema file
    self.schema_filepath = os.path.join(self.output_folder, 'schema.json')
    file_io.write_string_to_file(
        self.schema_filepath,
        json.dumps([{'name': 'num_col', 'type': 'FLOAT'},
                    {'name': 'img_col', 'type': 'STRING'}]))

    # Make features file
    self.features_filepath = os.path.join(self.output_folder, 'features.json')
    file_io.write_string_to_file(
        self.features_filepath,
        json.dumps({'num_col': {'transform': 'target'},
                    'img_col': {'transform': 'img_url_to_vec'}}))

    # Run a local beam job to make the transform_fn
    with beam.Pipeline('DirectRunner'):
      with tft_impl.Context(temp_dir=os.path.join(self.output_folder, 'tmp')):
        def preprocessing_fn(inputs):
          return {'img_col': tft.map(tf.decode_base64, inputs['img_col']),
                  'num_col': tft.map(lambda x: tf.add(x, 1), inputs['num_col'])}

        input_data = [{'img_col': base64.urlsafe_b64encode('abcd'), 'num_col': 3}]

        input_metadata = dataset_metadata.DatasetMetadata(
            schema=dataset_schema.from_feature_spec(
                {'img_col': tf.FixedLenFeature(shape=[], dtype=tf.string),
                 'num_col': tf.FixedLenFeature(shape=[], dtype=tf.float32)}))

        (dataset, train_metadata), transform_fn = (
            (input_data, input_metadata)
            | 'AnalyzeAndTransform'  # noqa: W503
            >> tft_impl.AnalyzeAndTransformDataset(preprocessing_fn))  # noqa: W503

        # WriteTransformFn writes transform_fn and metadata
        _ = (transform_fn  # noqa: F841
             | 'WriteTransformFn'  # noqa: W503
             >> tft_beam_io.WriteTransformFn(self.output_folder))  # noqa: W503

        metadata_io.write_metadata(
            metadata=input_metadata,
            path=os.path.join(self.output_folder, 'raw_metadata'))
Example #18
def data_transformation(EPOCHS: int, STEPS: int, BATCH_SIZE: int, HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0]
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "column_names" not in _kale_directory_file_names:
        raise ValueError("column_names" + " does not exists in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "column_names"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "column_names" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    column_names = _kale_resource_load(os.path.join(
        _kale_data_directory, _kale_load_file_name))

    if "schema" not in _kale_directory_file_names:
        raise ValueError("schema" + " does not exists in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "schema"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "schema" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    schema = _kale_resource_load(os.path.join(
        _kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import os
    import shutil
    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io
    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(
        DATA_DIR, 'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = ['trip_start_hour',
                                'trip_start_day', 'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = [
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract', 'payment_type', 'company',
                          'pickup_community_area', 'dropoff_community_area']

    # allow nan values in these features.
    OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude', 'pickup_census_tract', 'dropoff_census_tract',
                         'company', 'trip_seconds', 'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    def to_dense(tensor):
        """Takes as input a SparseTensor and return a Tensor with correct default value
        Args:
          tensor: tf.SparseTensor
        Returns:
          tf.Tensor with default value
        """
        if not isinstance(tensor, tf.sparse.SparseTensor):
            return tensor
        if tensor.dtype == tf.string:
            default_value = ''
        elif tensor.dtype == tf.float32:
            default_value = 0.0
        elif tensor.dtype == tf.int32:
            default_value = 0
        else:
            raise ValueError(f"Tensor type not recognized: {tensor.dtype}")

        return tf.squeeze(tf.sparse_to_dense(tensor.indices,
                                             [tensor.dense_shape[0], 1],
                                             tensor.values, default_value=default_value), axis=1)
        # TODO: Update to below version
        # return tf.squeeze(tf.sparse.to_dense(tensor, default_value=default_value), axis=1)

    def preprocess_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.
        Args:
          inputs: map from feature keys to raw not-yet-transformed features.
        Returns:
          Map from string feature key to transformed feature operations.
        """
        outputs = {}
        for key in DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[key] = tft.scale_to_z_score(to_dense(inputs[key]))

        for key in VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            if inputs[key].dtype == tf.string:
                vocab_tensor = to_dense(inputs[key])
            else:
                vocab_tensor = tf.as_string(to_dense(inputs[key]))
            outputs[key] = tft.compute_and_apply_vocabulary(
                vocab_tensor, vocab_filename='vocab_' + key,
                top_k=VOCAB_SIZE, num_oov_buckets=OOV_SIZE)

        for key in BUCKET_FEATURE_KEYS:
            outputs[key] = tft.bucketize(
                to_dense(inputs[key]), FEATURE_BUCKET_COUNT)

        for key in CATEGORICAL_FEATURE_KEYS:
            outputs[key] = tf.cast(to_dense(inputs[key]), tf.int64)

        taxi_fare = to_dense(inputs[FARE_KEY])
        taxi_tip = to_dense(inputs[LABEL_KEY])
        # Test if the tip was > 20% of the fare.
        tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
        outputs[LABEL_KEY] = tf.logical_and(
            tf.logical_not(tf.math.is_nan(taxi_fare)),
            tf.greater(taxi_tip, tip_threshold))

        for key in outputs:
            if outputs[key].dtype == tf.bool:
                outputs[key] = tft.compute_and_apply_vocabulary(tf.as_string(outputs[key]),
                                                                vocab_filename='vocab_' + key)

        return outputs
    trns_output = os.path.join(DATA_DIR, "transformed")
    if os.path.exists(trns_output):
        shutil.rmtree(trns_output)

    tft_input_metadata = dataset_metadata.DatasetMetadata(schema)

    runner = 'DirectRunner'
    with beam.Pipeline(runner, options=None) as p:
        with beam_impl.Context(temp_dir=os.path.join(trns_output, 'tmp')):
            converter = CsvCoder(column_names, tft_input_metadata.schema)

            # READ TRAIN DATA
            train_data = (
                p
                | 'ReadTrainData' >> textio.ReadFromText(TRAIN_DATA, skip_header_lines=1)
                | 'DecodeTrainData' >> beam.Map(converter.decode))

            # TRANSFORM TRAIN DATA (and get transform_fn function)
            transformed_dataset, transform_fn = (
                (train_data, tft_input_metadata) | beam_impl.AnalyzeAndTransformDataset(preprocess_fn))
            transformed_data, transformed_metadata = transformed_dataset

            # SAVE TRANSFORMED TRAIN DATA
            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                os.path.join(trns_output, 'train'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            # READ EVAL DATA
            eval_data = (
                p
                | 'ReadEvalData' >> textio.ReadFromText(EVALUATION_DATA, skip_header_lines=1)
                | 'DecodeEvalData' >> beam.Map(converter.decode))

            # TRANSFORM EVAL DATA (using previously created transform_fn function)
            eval_dataset = (eval_data, tft_input_metadata)
            transformed_eval_data, transformed_metadata = (
                (eval_dataset, transform_fn) | beam_impl.TransformDataset())

            # SAVE EVAL DATA
            _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
                os.path.join(trns_output, 'eval'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            # SAVE transform_fn FUNCTION FOR LATER USE
            # TODO: check what the transform function (transform_fn) from the previous step contains
            _ = (transform_fn | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(trns_output))

            # SAVE THE RAW INPUT METADATA (the transformed metadata is written
            # by WriteTransformFn above)
            metadata_io.write_metadata(
                metadata=tft_input_metadata,
                path=os.path.join(trns_output, 'metadata'))

    # -----------------------DATA SAVING START---------------------------------
    if "trns_output" in locals():
        _kale_resource_save(trns_output, os.path.join(
            _kale_data_directory, "trns_output"))
    else:
        print("_kale_resource_save: `trns_output` not found.")
Example #19
def run_transform(output_dir,
                  schema,
                  train_data_file,
                  eval_data_file,
                  project,
                  mode,
                  preprocessing_fn=None):
    """Writes a tft transform fn, and metadata files.
  Args:
    output_dir: output folder
    schema: schema list.
    train_data_file: training data file pattern.
    eval_data_file: eval data file pattern.
    project: the project to run dataflow in.
    local: whether the job should be local or cloud.
    preprocessing_fn: a function used to preprocess the raw data. If not
                      specified, a function will be automatically inferred
                      from the schema.
  """

    tft_input_metadata = make_tft_input_metadata(schema)
    temp_dir = os.path.join(output_dir, 'tmp')
    preprocessing_fn = preprocessing_fn or make_preprocessing_fn(schema)

    if mode == 'local':
        pipeline_options = None
        runner = 'DirectRunner'
    elif mode == 'cloud':
        options = {
            'job_name':
            'pipeline-tft-' +
            datetime.datetime.now().strftime('%y%m%d-%H%M%S'),
            'temp_location':
            temp_dir,
            'project':
            project,
            'extra_packages': [
                'gs://ml-pipeline-playground/tensorflow-transform-0.6.0.dev0.tar.gz'
            ]
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        runner = 'DataFlowRunner'
    else:
        raise ValueError("Invalid mode %s." % mode)

    with beam.Pipeline(runner, options=pipeline_options) as p:
        with beam_impl.Context(temp_dir=temp_dir):
            names = [x['name'] for x in schema]
            converter = CsvCoder(names, tft_input_metadata.schema)
            train_data = (
                p
                | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
                | 'DecodeTrainData' >> beam.Map(converter.decode))

            train_dataset = (train_data, tft_input_metadata)
            transformed_dataset, transform_fn = (
                train_dataset
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset

            # Writes transformed_metadata and transform_fn folders
            _ = (transform_fn | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(output_dir))

            # Write the raw_metadata
            metadata_io.write_metadata(metadata=tft_input_metadata,
                                       path=os.path.join(
                                           output_dir, 'metadata'))

            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                os.path.join(output_dir, 'train'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            eval_data = (
                p
                | 'ReadEvalData' >> textio.ReadFromText(eval_data_file)
                | 'DecodeEvalData' >> beam.Map(converter.decode))

            eval_dataset = (eval_data, tft_input_metadata)

            transformed_eval_dataset = ((eval_dataset, transform_fn)
                                        | beam_impl.TransformDataset())
            transformed_eval_data, transformed_metadata = transformed_eval_dataset

            _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
                os.path.join(output_dir, 'eval'),
                coder=ExampleProtoCoder(transformed_metadata.schema))
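A hypothetical local invocation of run_transform; paths are placeholders and the schema list follows the [{'name': ..., 'type': ...}] layout used elsewhere in these examples:

run_transform(output_dir='/tmp/tft_out',
              schema=[{'name': 'num_col', 'type': 'FLOAT'}],
              train_data_file='/tmp/data/train-*.csv',
              eval_data_file='/tmp/data/eval-*.csv',
              project=None,  # only used when mode == 'cloud'
              mode='local')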
Example #20
def expand(self, metadata):
    """A PTransform to write Metadata to disk."""
    metadata_io.write_metadata(metadata, self._path)
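A minimal sketch of applying a metadata-writing PTransform like the one above in a Beam pipeline; the class name and constructor (taking only a destination path) are assumptions, since the snippet only shows expand:

# WriteMetadataTransform and raw_metadata are illustrative names; Beam hands
# raw_metadata straight to expand() above.
with beam.Pipeline() as p:
    _ = (raw_metadata
         | 'WriteMetadata' >> WriteMetadataTransform('/tmp/transformed_metadata'))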
Example #21
def write_metadata_output(metadata):
    output_path = self._path
    if self._write_to_unique_subdirectory:
        output_path = common.get_unique_temp_path(self._path)
    metadata_io.write_metadata(metadata, output_path)
    return output_path