def preprocess(in_test_mode):
  import datetime
  import os
  import os.path
  import tempfile
  import apache_beam as beam
  import tensorflow as tf
  from apache_beam.io import tfrecordio
  from tensorflow_transform.beam import impl as beam_impl
  from tensorflow_transform.beam import tft_beam_io
  from tensorflow_transform.beam.tft_beam_io import transform_fn_io
  from tensorflow_transform.coders import example_proto_coder
  from tensorflow_transform.tf_metadata import dataset_metadata
  from tensorflow_transform.tf_metadata import dataset_schema
  # PROJECT, BUCKET, create_query, is_valid and preprocess_tft are expected to
  # be defined at module / notebook level.

  job_name = 'preprocess-taxi-features' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')    
  if in_test_mode:
    import shutil
    print('Launching local job ... hang on')
    OUTPUT_DIR = './preproc_tft'
    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
    EVERY_N = 100000
  else:
    print('Launching Dataflow job {} ... hang on'.format(job_name))
    OUTPUT_DIR = 'gs://{0}/taxifare/preproc_tft/'.format(BUCKET)
    import subprocess
    subprocess.call('gsutil rm -r {}'.format(OUTPUT_DIR).split())
    EVERY_N = 10000
    
  options = {
    'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
    'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
    'job_name': job_name,
    'project': PROJECT,
    'max_num_workers': 24,
    'teardown_policy': 'TEARDOWN_ALWAYS',
    'no_save_main_session': True,
    'requirements_file': 'requirements.txt'
  }
  opts = beam.pipeline.PipelineOptions(flags=[], **options)
  if in_test_mode:
    RUNNER = 'DirectRunner'
  else:
    RUNNER = 'DataflowRunner'

  # set up metadata
  raw_data_schema = {
    colname : dataset_schema.ColumnSchema(tf.string, [], dataset_schema.FixedColumnRepresentation())
                   for colname in 'dayofweek,key'.split(',')
  }
  raw_data_schema.update({
      colname : dataset_schema.ColumnSchema(tf.float32, [], dataset_schema.FixedColumnRepresentation())
                   for colname in 'fare_amount,pickuplon,pickuplat,dropofflon,dropofflat'.split(',')
    })
  raw_data_schema.update({
      colname : dataset_schema.ColumnSchema(tf.int64, [], dataset_schema.FixedColumnRepresentation())
                   for colname in 'hourofday,passengers'.split(',')
    })
  raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema(raw_data_schema))

  # run Beam  
  with beam.Pipeline(RUNNER, options=opts) as p:
    with beam_impl.Context(temp_dir=os.path.join(OUTPUT_DIR, 'tmp')):
      # save the raw data metadata
      _ = (raw_data_metadata
        | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
            os.path.join(OUTPUT_DIR, 'metadata/rawdata_metadata'),
            pipeline=p))
      
      # analyze and transform training       
      raw_data = (p 
        | 'train_read' >> beam.io.Read(beam.io.BigQuerySource(query=create_query(1, EVERY_N), use_standard_sql=True))
        | 'train_filter' >> beam.Filter(is_valid))

      raw_dataset = (raw_data, raw_data_metadata)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocess_tft))
      transformed_data, transformed_metadata = transformed_dataset
      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'train'),
          file_name_suffix='.gz',
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))
      
      # transform eval data
      raw_test_data = (p 
        | 'eval_read' >> beam.io.Read(beam.io.BigQuerySource(query=create_query(2, EVERY_N), use_standard_sql=True))
        | 'eval_filter' >> beam.Filter(is_valid))
      
      raw_test_dataset = (raw_test_data, raw_data_metadata)
      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      transformed_test_data, _ = transformed_test_dataset
      _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'eval'),
          file_name_suffix='.gz',
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))
      _ = (transform_fn
           | 'WriteTransformFn' >>
           transform_fn_io.WriteTransformFn(os.path.join(OUTPUT_DIR, 'metadata')))
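A minimal driver sketch, assuming PROJECT, BUCKET, create_query, is_valid and preprocess_tft are already defined at module level as the function requires (they are not shown above):

# Hypothetical driver: run locally on a small sample first, then on Dataflow.
preprocess(in_test_mode=True)     # DirectRunner, writes to ./preproc_tft
# preprocess(in_test_mode=False)  # DataflowRunner, writes under gs://{BUCKET}/taxifare/preproc_tft/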
Example #2
    def Transform(self, inputs, outputs, status_file):
        """Executes on request.

    This is the implementation part of the Transform executor. It is intended
    for using or extending the executor without an artifact dependency.

    Args:
      inputs: A dictionary of labelled input values, including:
        - labels.COMPUTE_STATISTICS_LABEL: Whether to compute statistics.
        - labels.SCHEMA_PATH_LABEL: Path to schema file.
        - labels.EXAMPLES_FILE_FORMAT_LABEL: Example file format, optional.
        - labels.EXAMPLES_DATA_FORMAT_LABEL: Example data format.
        - labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL: Paths or path patterns
          to analyze and transform data.
        - labels.TRANSFORM_DATA_PATHS_LABEL: Paths or path patterns to transform
          only data.
        - labels.TFT_STATISTICS_USE_TFDV_LABEL: Whether to use TFDV to compute
          statistics.
        - labels.PREPROCESSING_FN: Path to a Python module that contains the
          preprocessing_fn, optional.
      outputs: A dictionary of labelled output values, including:
        - labels.PER_SET_STATS_OUTPUT_PATHS_LABEL: Paths to statistics output,
          optional.
        - labels.TRANSFORM_METADATA_OUTPUT_PATH_LABEL: A path to
          TFTransformOutput output.
        - labels.TRANSFORM_MATERIALIZE_OUTPUT_PATHS_LABEL: Paths to transform
          materialization.
        - labels.TEMP_OUTPUT_LABEL: A path to temporary directory.
      status_file: Where the status should be written (not yet implemented)
    """

        del status_file  # unused
        compute_statistics = common.GetSoleValue(
            inputs, labels.COMPUTE_STATISTICS_LABEL)
        transform_output_path = common.GetSoleValue(
            outputs, labels.TRANSFORM_METADATA_OUTPUT_PATH_LABEL)
        raw_examples_data_format = common.GetSoleValue(
            inputs, labels.EXAMPLES_DATA_FORMAT_LABEL)
        schema = common.GetSoleValue(inputs, labels.SCHEMA_PATH_LABEL)
        input_dataset_schema = self._ReadSchema(raw_examples_data_format,
                                                schema)
        input_dataset_metadata = dataset_metadata.DatasetMetadata(
            input_dataset_schema)

        tf.logging.info(
            'Inputs to executor.Transform function: {}'.format(inputs))
        tf.logging.info(
            'Outputs to executor.Transform function: {}'.format(outputs))

        # NOTE: We disallow an empty schema, which we detect by testing the
        # number of columns.  While in principle an empty schema is valid, in
        # practice this is a sign of a user error, and this is a convenient
        # place to catch that error.
        if (not input_dataset_metadata.schema.as_feature_spec() and
                not self._ShouldDecodeAsRawExample(raw_examples_data_format)):
            raise ValueError(messages.SCHEMA_EMPTY)

        preprocessing_fn = self._GetPreprocessingFn(inputs, outputs)

        materialize_output_paths = common.GetValues(
            outputs, labels.TRANSFORM_MATERIALIZE_OUTPUT_PATHS_LABEL)

        feature_spec = input_dataset_metadata.schema.as_feature_spec()

        # Inspect the preprocessing_fn even when we already know we need a full
        # pass, so that a broken preprocessing_fn fails as early as possible.
        try:
            analyze_input_columns = tft.get_analyze_input_columns(
                preprocessing_fn, feature_spec)
        except AttributeError:
            # If using TFT 1.12, fall back to assuming all features are used.
            analyze_input_columns = feature_spec.keys()

        if not compute_statistics and not materialize_output_paths:
            if analyze_input_columns:
                tf.logging.warning(
                    'Not using the in-place Transform because the following features '
                    'require analyzing: {}'.format(
                        tuple(c for c in analyze_input_columns)))
            else:
                tf.logging.warning(
                    'Using the in-place Transform since compute_statistics=False, '
                    'no transformed data is materialized, and the configured '
                    'preprocessing_fn appears to not require analyzing the data.'
                )
                self._RunInPlaceImpl(preprocessing_fn, input_dataset_metadata,
                                     transform_output_path)
                # TODO(b/122478841): Writes status to status file.
                return
        self._RunBeamImpl(inputs, outputs, preprocessing_fn,
                          input_dataset_metadata, raw_examples_data_format,
                          transform_output_path, compute_statistics,
                          materialize_output_paths)
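A hypothetical invocation sketch to illustrate the shape of the inputs/outputs dictionaries documented above; the label constants are the ones listed in the docstring, while the executor instance (transform_executor), the single-element-list wrapping and all paths and values are assumptions:

# Placeholder values; the one-element-list wrapping is an assumption based on
# the common.GetSoleValue / common.GetValues calls in the method body.
example_inputs = {
    labels.COMPUTE_STATISTICS_LABEL: [False],
    labels.SCHEMA_PATH_LABEL: ['/path/to/schema.pbtxt'],
    labels.EXAMPLES_DATA_FORMAT_LABEL: ['tfrecords_gzip'],  # hypothetical format name
    labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL: ['/data/train/*'],
    labels.TRANSFORM_DATA_PATHS_LABEL: ['/data/eval/*'],
    labels.TFT_STATISTICS_USE_TFDV_LABEL: [True],
}
example_outputs = {
    labels.TRANSFORM_METADATA_OUTPUT_PATH_LABEL: ['/output/transform_output'],
    labels.TRANSFORM_MATERIALIZE_OUTPUT_PATHS_LABEL: [],
    labels.TEMP_OUTPUT_LABEL: ['/output/tmp'],
}
# transform_executor stands in for an instance of the surrounding executor class.
transform_executor.Transform(example_inputs, example_outputs, status_file=None)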
Example #3
    def test_single_phase_run_twice(self):

        span_0_key = 'span-0'
        span_1_key = 'span-1'

        def preprocessing_fn(inputs):

            _ = tft.vocabulary(inputs['s'])

            _ = tft.bucketize(inputs['x'], 2, name='bucketize')

            return {
                'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
                'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            }

        input_data = [{'x': 12, 'y': 1, 's': 'b'}, {'x': 10, 'y': 1, 's': 'c'}]
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec({
                'x':
                tf.io.FixedLenFeature([], tf.float32),
                'y':
                tf.io.FixedLenFeature([], tf.float32),
                's':
                tf.io.FixedLenFeature([], tf.string),
            }))
        input_data_dict = {
            span_0_key: [{
                'x': -2,
                'y': 1,
                's': 'a',
            }, {
                'x': 4,
                'y': -4,
                's': 'a',
            }],
            span_1_key:
            input_data,
        }
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            with beam.Pipeline() as p:

                flat_data = p | 'CreateInputData' >> beam.Create(
                    list(itertools.chain(*input_data_dict.values())))

                # This is needed due to b/123895600.
                for a, b in six.iteritems(input_data_dict):
                    input_data_dict[a] = p | a >> beam.Create(b)

                transform_fn, cache_output = (
                    (flat_data, input_data_dict, {}, input_metadata)
                    | 'Analyze' >>
                    (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
                _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
                    self._cache_dir)

                transformed_dataset = (
                    ((input_data_dict[span_1_key], input_metadata),
                     transform_fn)
                    | 'Transform' >> beam_impl.TransformDataset())

                transformed_data, unused_transformed_metadata = transformed_dataset

                expected_transformed_data = [
                    {
                        'x_mean': 6.0,
                        'x_min': -2.0,
                        'y_mean': -0.25,
                        'y_min': -4.0,
                    },
                    {
                        'x_mean': 6.0,
                        'x_min': -2.0,
                        'y_mean': -0.25,
                        'y_min': -4.0,
                    },
                ]
                beam_test_util.assert_that(
                    transformed_data,
                    beam_test_util.equal_to(expected_transformed_data),
                    label='first')

                transform_fn_dir = os.path.join(self.base_test_dir,
                                                'transform_fn')
                _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)

                for key in input_data_dict:
                    self.assertIn(key, cache_output)
                    self.assertEqual(6, len(cache_output[key]))

                transform_fn, second_output_cache = (
                    (flat_data, input_data_dict, cache_output, input_metadata)
                    | 'AnalyzeAgain' >>
                    (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

                dot_string = nodes.get_dot_graph(
                    [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
                self.WriteRenderedDotFile(dot_string)

                transformed_dataset = (
                    ((input_data_dict[span_1_key], input_metadata),
                     transform_fn)
                    | 'TransformAgain' >> beam_impl.TransformDataset())
        transformed_data, unused_transformed_metadata = transformed_dataset
        beam_test_util.assert_that(
            transformed_data,
            beam_test_util.equal_to(expected_transformed_data),
            label='second')

        self.assertFalse(second_output_cache)
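A follow-on sketch (outside the test) of how the cache written by WriteAnalysisCacheToFS could seed a later analysis through analyzer_cache.ReadAnalysisCacheFromFS. The cache directory, span keys and records are placeholders, and the test's module-level imports (beam, beam_impl, analyzer_cache, tempfile) plus the preprocessing_fn and input_metadata defined above are assumed:

cache_dir = '/tmp/tft_cache'   # directory previously given to WriteAnalysisCacheToFS
span_keys = ['span-0', 'span-1']
new_spans = {key: [{'x': 1.0, 'y': 2.0, 's': 'a'}] for key in span_keys}
with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
    with beam.Pipeline() as p:
        # Read back the analysis cache keyed by span.
        cache_pcolls = p | 'ReadCache' >> analyzer_cache.ReadAnalysisCacheFromFS(
            cache_dir, span_keys)
        span_pcolls = {key: p | 'Create_' + key >> beam.Create(records)
                       for key, records in new_spans.items()}
        flat_pcoll = tuple(span_pcolls.values()) | 'FlattenSpans' >> beam.Flatten()
        transform_fn, _ = (
            (flat_pcoll, span_pcolls, cache_pcolls, input_metadata)
            | 'AnalyzeWithCache' >>
            beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))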
Example #4
def run(input_feature_spec,
        labels,
        feature_extraction,
        feature_scaling=None,
        eval_percent=20.0,
        beam_options=None,
        work_dir=None):
    """Runs the whole preprocessing step.
    This runs the feature extraction PTransform, validates that the data conforms
    to the schema provided, normalizes the features, and splits the dataset into
    a training and evaluation dataset.
    """

    # Populate optional arguments
    if not feature_scaling:
        feature_scaling = lambda inputs: inputs

    # Type checking
    if not isinstance(labels, list):
        raise ValueError(
            '`labels` must be list(str). '
            'Given: {} {}'.format(labels, type(labels)))

    if not isinstance(feature_extraction, beam.PTransform):
        raise ValueError(
            '`feature_extraction` must be {}. '
            'Given: {} {}'.format(beam.PTransform,
                                  feature_extraction, type(feature_extraction)))

    if not callable(feature_scaling):
        raise ValueError(
            '`feature_scaling` must be callable. '
            'Given: {} {}'.format(feature_scaling,
                                  type(feature_scaling)))

    if beam_options and not isinstance(beam_options, PipelineOptions):
        raise ValueError(
            '`beam_options` must be {}. '
            'Given: {} {}'.format(PipelineOptions,
                                  beam_options, type(beam_options)))

    if not work_dir:
        work_dir = tempfile.mkdtemp(prefix='tensorflow-preprocessing')

    tft_temp_dir = os.path.join(work_dir, 'tft-temp')
    train_dataset_dir = os.path.join(work_dir, 'train-dataset')
    eval_dataset_dir = os.path.join(work_dir, 'eval-dataset')

    transform_fn_dir = os.path.join(work_dir, transform_fn_io.TRANSFORM_FN_DIR)
    if tf.gfile.Exists(transform_fn_dir):
        tf.gfile.DeleteRecursively(transform_fn_dir)

    with beam.Pipeline(options=beam_options) as p, \
            beam_impl.Context(temp_dir=tft_temp_dir):

        # [START feature_extraction]
        dataset = (
                p
                | 'Feature extraction' >> feature_extraction
                | 'Validate inputs' >> beam.ParDo(ValidateInputData(input_feature_spec)))
        # [END feature_extraction]

        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec(input_feature_spec))

        dataset_and_metadata, transform_fn = (
                (dataset, input_metadata)
                | 'Feature scaling' >> beam_impl.AnalyzeAndTransformDataset(feature_scaling))
        dataset, metadata = dataset_and_metadata
        # [END analyze_and_transform_dataset]

        # [START split_to_train_and_eval_datasets]
        # Split the dataset into a training set and an evaluation set
        assert 0 < eval_percent < 100, 'eval_percent must be in the range (0, 100)'
        train_dataset, eval_dataset = (
                dataset
                | 'Split dataset' >> beam.Partition(
            lambda elem, _: int(random.uniform(0, 100) < eval_percent), 2)
        )
        # [END split_to_train_and_eval_datasets]

        # [START write_tfrecords]
        # Write the datasets as TFRecords
        coder = example_proto_coder.ExampleProtoCoder(metadata.schema)

        train_dataset_prefix = os.path.join(train_dataset_dir, 'part')
        _ = (
                train_dataset
                | 'Write train dataset' >> tfrecordio.WriteToTFRecord(train_dataset_prefix, coder))

        eval_dataset_prefix = os.path.join(eval_dataset_dir, 'part')
        _ = (
                eval_dataset
                | 'Write eval dataset' >> tfrecordio.WriteToTFRecord(eval_dataset_prefix, coder))

        # Write the transform_fn
        _ = (
                transform_fn
                | 'Write transformFn' >> transform_fn_io.WriteTransformFn(work_dir))
        # [END write_tfrecords]

        return PreprocessData(
            input_feature_spec,
            labels,
            train_dataset_prefix + '*',
            eval_dataset_prefix + '*')
    def __init__(self, fn):
        self._fn = fn

    def expand(self, pcoll):
        return pcoll | beam.ParDo(self._MapAndFilterErrorsDoFn(self._fn))


RAW_DATA_FEATURE_SPEC = dict([(name, tf.io.FixedLenFeature([], tf.string))
                              for name in CATEGORICAL_FEATURE_KEYS] +
                             [(name, tf.io.FixedLenFeature([], tf.float32))
                              for name in NUMERIC_FEATURE_KEYS] +
                             [(name, tf.io.VarLenFeature(tf.float32))
                              for name in OPTIONAL_NUMERIC_FEATURE_KEYS] +
                             [(LABEL_KEY,
                               tf.io.FixedLenFeature([], tf.string))])

RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(RAW_DATA_FEATURE_SPEC))

# Constants used for training.  Note that the number of instances will be
# computed by tf.Transform in future versions, in which case it can be read from
# the metadata.  Similarly BUCKET_SIZES will not be needed as this information
# will be stored in the metadata for each of the columns.  The bucket size
# includes all listed categories in the dataset description as well as one extra
# for "?" which represents unknown.
TRAIN_BATCH_SIZE = 128
TRAIN_NUM_EPOCHS = 200
NUM_TRAIN_INSTANCES = 32561
NUM_TEST_INSTANCES = 16281

# Names of temp files
TRANSFORMED_TRAIN_DATA_FILEBASE = 'train_transformed'
TRANSFORMED_TEST_DATA_FILEBASE = 'test_transformed'
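A brief sketch of using RAW_DATA_FEATURE_SPEC outside the Beam pipeline to parse serialized tf.train.Example records; the TFRecord path is a placeholder:

# Hypothetical eager parsing with the raw feature spec defined above.
raw_dataset = tf.data.TFRecordDataset(['/tmp/census/raw_examples.tfrecord'])
parsed_dataset = raw_dataset.map(
    lambda serialized: tf.io.parse_single_example(serialized, RAW_DATA_FEATURE_SPEC))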
Example #6
def analyze_in_place(preprocessing_fn, force_tf_compat_v1, feature_specs,
                     type_specs, transform_output_path):
    """Analyzes the `preprocessing_fn` in-place without looking at the data.

  This should only be used if the `preprocessing_fn` contains no TFT
  analyzers or TFT mappers that use analyzers.

  Writes out a transform function and transformed metadata to subdirs under
  `transform_output_path`.

  Args:
    preprocessing_fn: The tf.Transform preprocessing_fn.
    force_tf_compat_v1: If True, call Transform's API to use Tensorflow in
      tf.compat.v1 mode.
    feature_specs: a Dict from input feature key to its feature spec.
    type_specs: a Dict from input feature key to its type spec.
    transform_output_path: An absolute path to write the output to.

  Raises:
    RuntimeError: If `preprocessing_fn` contains TFT analyzers.
  """
    use_tf_compat_v1 = tf2_utils.use_tf_compat_v1(force_tf_compat_v1)
    transform_fn_path = os.path.join(transform_output_path,
                                     TFTransformOutput.TRANSFORM_FN_DIR)
    if use_tf_compat_v1:
        graph, structured_inputs, structured_outputs = (
            trace_preprocessing_function(preprocessing_fn,
                                         feature_specs,
                                         use_tf_compat_v1=use_tf_compat_v1))
        _assert_no_analyzers_in_graph(graph)
        with tf.compat.v1.Session(graph=graph) as sess:
            sess.run(tf.compat.v1.global_variables_initializer())
            sess.run(tf.compat.v1.tables_initializer())
            saved_transform_io.write_saved_transform_from_session(
                sess, structured_inputs, structured_outputs, transform_fn_path)

            transformed_metadata = dataset_metadata.DatasetMetadata(
                schema=schema_inference.infer_feature_schema(
                    structured_outputs, graph, sess))
    else:
        concrete_transform_fn, concrete_metadata_fn = (
            trace_and_write_v2_saved_model(saved_model_dir=transform_fn_path,
                                           preprocessing_fn=preprocessing_fn,
                                           input_signature=type_specs,
                                           base_temp_dir=None,
                                           tensor_replacement_map=None,
                                           output_keys_to_name_map=None))
        _assert_no_analyzers_in_graph(concrete_transform_fn.graph)
        # This assert should be a no-op: if concrete_metadata_fn were None,
        # `_assert_no_analyzers_in_graph` above would already have raised.
        assert concrete_metadata_fn
        structured_outputs = tf.nest.pack_sequence_as(
            structure=concrete_transform_fn.structured_outputs,
            flat_sequence=concrete_transform_fn.outputs,
            expand_composites=True)
        transformed_metadata = dataset_metadata.DatasetMetadata(
            schema=schema_inference.infer_feature_schema_v2(
                structured_outputs,
                concrete_metadata_fn,
                evaluate_schema_overrides=True))
    transformed_metadata_dir = os.path.join(
        transform_output_path, TFTransformOutput.TRANSFORMED_METADATA_DIR)
    metadata_io.write_metadata(transformed_metadata, transformed_metadata_dir)
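A minimal caller sketch for analyze_in_place, assuming the TF 2 code path (force_tf_compat_v1=False); the analyzer-free preprocessing_fn, the single feature and the output path are placeholders:

def _scale_only_fn(inputs):
    # Pure tensor-to-tensor ops; no TFT analyzers, as analyze_in_place requires.
    return {'x_times_two': inputs['x'] * 2.0}

example_feature_specs = {'x': tf.io.FixedLenFeature([], tf.float32)}
example_type_specs = {'x': tf.TensorSpec([None], tf.float32)}
analyze_in_place(_scale_only_fn,
                 force_tf_compat_v1=False,
                 feature_specs=example_feature_specs,
                 type_specs=example_type_specs,
                 transform_output_path='/tmp/in_place_transform_output')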
from tensorflow_transform.beam.tft_beam_io import transform_fn_io
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.tf_metadata import futures
from tensorflow_transform.tf_metadata import metadata_io

import unittest
from tensorflow.python.framework import test_util
from tensorflow.python.lib.io import file_io

_TEST_METADATA = dataset_metadata.DatasetMetadata({
    'fixed_column':
    dataset_schema.ColumnSchema(tf.string, (1, 3, 2),
                                dataset_schema.FixedColumnRepresentation()),
    'fixed_column_with_default':
    dataset_schema.ColumnSchema(
        tf.float32, (1, 3, 2),
        dataset_schema.FixedColumnRepresentation(123.4)),
    'list_columm':
    dataset_schema.ColumnSchema(tf.float32, (None, ),
                                dataset_schema.ListColumnRepresentation())
})

_TEST_METADATA_WITH_FUTURES = dataset_metadata.DatasetMetadata({
    'fixed_column':
    dataset_schema.ColumnSchema(tf.string, (1, 3, 2),
                                dataset_schema.FixedColumnRepresentation()),
    'fixed_column_with_default':
    dataset_schema.ColumnSchema(
        tf.float32, (1, futures.Future('a'), 2),
        dataset_schema.FixedColumnRepresentation(123.4)),
    'list_columm':
    dataset_schema.ColumnSchema(tf.float32, (None, ),
                                dataset_schema.ListColumnRepresentation())
})
Example #8
  def expand(self, dataset_and_transform_fn):
    """Transforms the dataset using the transform_fn.

    Args:
      dataset_and_transform_fn: A tuple of dataset and preprocessing
      function.

    Returns:
      A dataset transformed according to the transform_fn.
    """
    (input_values, input_metadata), (transform_fn, output_metadata) = (
        dataset_and_transform_fn)

    # If exclude_outputs is set, update the output metadata.
    if self._exclude_outputs is not None:
      if isinstance(output_metadata, beam_metadata_io.BeamDatasetMetadata):
        # Unwrap BeamDatasetMetadata into DatasetMetadata and pcollections dict.
        output_metadata, pcollections = output_metadata
        schema = output_metadata.schema
        # Update DatasetMetadata to remove excluded outputs
        output_metadata = dataset_metadata.DatasetMetadata(
            schema=dataset_schema.Schema({
                key: column_schema
                for key, column_schema in six.iteritems(schema.column_schemas)
                if key not in self._exclude_outputs
            }))
        # Update pcollections to keep only pcollections that resolve futures in
        # the updated metadata.
        unresolved_future_names = set(
            future.name for future in output_metadata.substitute_futures({}))
        pcollections = {
            name: pcollection
            for name, pcollection in six.iteritems(pcollections)
            if name in unresolved_future_names
        }
        # Wrap DatasetMetadata and pcollections as BeamDatasetMetadata
        output_metadata = beam_metadata_io.BeamDatasetMetadata(
            output_metadata, pcollections)
      else:
        schema = output_metadata.schema
        output_metadata = dataset_metadata.DatasetMetadata(
            schema=dataset_schema.Schema({
                key: column_schema
                for key, column_schema in six.iteritems(schema.column_schemas)
                if key not in self._exclude_outputs
            }))

    def convert_and_unbatch(batch_dict):
      return impl_helper.to_instance_dicts(output_metadata.schema, batch_dict)

    serialized_tf_config = (
        analyzer_impls._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
            self.pipeline.runner))
    output_instances = (
        input_values
        | 'Batch' >> _BatchElements()
        | 'Transform' >> beam.ParDo(
            _RunMetaGraphDoFn(
                input_metadata.schema,
                serialized_tf_config,
                shared_graph_state_handle=shared.Shared(),
                exclude_outputs=self._exclude_outputs),
            saved_model_dir=beam.pvalue.AsSingleton(transform_fn))
        | 'ConvertAndUnbatch' >> beam.FlatMap(convert_and_unbatch))

    _clear_shared_state_after_barrier(self.pipeline, output_instances)

    return (output_instances, output_metadata)
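A short usage sketch for the expand method above, assuming the class is exposed as beam_impl.TransformDataset and that exclude_outputs is its optional constructor argument (as self._exclude_outputs suggests); the dataset, metadata and excluded column name are placeholders:

# Hypothetical call site: drop a debug-only output while transforming.
transformed_data, transformed_metadata = (
    ((raw_data, raw_data_metadata), transform_fn)
    | 'TransformWithoutDebugOutput' >> beam_impl.TransformDataset(
        exclude_outputs=['debug_feature']))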
Example #9
    def testPreprocessingFn(self):
        schema_file = os.path.join(self._testdata_path,
                                   'schema_gen/schema.pbtxt')
        schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
        feature_spec = taxi_utils._get_raw_feature_spec(schema)
        working_dir = self.get_temp_dir()
        transform_graph_path = os.path.join(working_dir, 'transform_graph')
        transformed_examples_path = os.path.join(working_dir,
                                                 'transformed_examples')

        # Run very simplified version of executor logic.
        # TODO(kestert): Replace with tft_unit.assertAnalyzeAndTransformResults.
        # Generate legacy `DatasetMetadata` object.  Future version of Transform
        # will accept the `Schema` proto directly.
        legacy_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec(feature_spec))
        decoder = tft.coders.ExampleProtoCoder(legacy_metadata.schema)
        with beam.Pipeline() as p:
            with tft_beam.Context(temp_dir=os.path.join(working_dir, 'tmp')):
                examples = (
                    p
                    | 'ReadTrainData' >> beam.io.ReadFromTFRecord(
                        os.path.join(self._testdata_path,
                                     'csv_example_gen/train/*'),
                        coder=beam.coders.BytesCoder(),
                        # TODO(b/114938612): Eventually remove this override.
                        validate=False)
                    | 'DecodeTrainData' >> beam.Map(decoder.decode))
                (transformed_examples, transformed_metadata), transform_fn = (
                    (examples, legacy_metadata)
                    | 'AnalyzeAndTransform' >>
                    tft_beam.AnalyzeAndTransformDataset(
                        taxi_utils.preprocessing_fn))

                # WriteTransformFn writes transform_fn and metadata to subdirectories
                # tensorflow_transform.SAVED_MODEL_DIR and
                # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
                # pylint: disable=expression-not-assigned
                (transform_fn
                 | 'WriteTransformFn' >>
                 tft_beam.WriteTransformFn(transform_graph_path))

                encoder = tft.coders.ExampleProtoCoder(
                    transformed_metadata.schema)
                (transformed_examples
                 | 'EncodeTrainData' >> beam.Map(encoder.encode)
                 | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                     os.path.join(transformed_examples_path,
                                  'train/transformed_examples.gz'),
                     coder=beam.coders.BytesCoder()))
                # pylint: enable=expression-not-assigned

        # Verify the output matches golden output.
        # NOTE: we don't verify that transformed examples match golden output.
        expected_transformed_schema = io_utils.parse_pbtxt_file(
            os.path.join(
                self._testdata_path,
                'transform/transform_graph/transformed_metadata/schema.pbtxt'),
            schema_pb2.Schema())
        transformed_schema = io_utils.parse_pbtxt_file(
            os.path.join(transform_graph_path,
                         'transformed_metadata/schema.pbtxt'),
            schema_pb2.Schema())
        # Clear annotations so we only have to test main schema.
        transformed_schema.ClearField('annotation')
        for feature in transformed_schema.feature:
            feature.ClearField('annotation')
        self.assertEqual(transformed_schema, expected_transformed_schema)
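A follow-on sketch (outside the test) of consuming the transform graph written above through tft.TFTransformOutput; only transform_graph_path from the test is reused:

# Hypothetical downstream use of the written transform graph.
tft_output = tft.TFTransformOutput(transform_graph_path)
print(tft_output.transformed_feature_spec())
# In an input or serving function, raw feature tensors could then be transformed
# with tft_output.transform_raw_features(raw_feature_tensors).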
Example #10
train_ques = file.readlines()[1:]

xys = []
for i in range(len(train_ques)):
    train_ques_sp = train_ques[i].strip().replace(" ", "").replace("\n", "").replace("\r", "").upper().split(",")
    train_ques_sp_list = list(train_ques_sp[1])
    len_real = len(train_ques_sp_list)
    train_ques_sp_list_pad = train_ques_sp_list[:LEN_MAX] if len_real>LEN_MAX else train_ques_sp_list + ["*"]*(LEN_MAX-len_real)
    xy_json = {"x":train_ques_sp_list_pad, "y":train_ques_sp[0]}
    xys.append(xy_json)

# Input feature spec for the graph
STRING_FEATURE = {'x': tf.io.FixedLenFeature([LEN_MAX], tf.string),
                  'y': tf.io.FixedLenFeature([], tf.string)}

DATA_STRING_FEATURE_SPEC = dataset_metadata.DatasetMetadata(dataset_schema.from_feature_spec(STRING_FEATURE))


def parser(x, y):
    features = {"x": x, "y":y}
    return features


def train_input_fn(train, batch_size=64):
    x_train, y_train = train
    dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    dataset = dataset.shuffle(buffer_size=len(y_train))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(parser)
    dataset = dataset.repeat()
    iterator = dataset.make_one_shot_iterator()
Example #11
  def expand(self, dataset):
    """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.
    """

    input_values, input_metadata = dataset
    input_schema = input_metadata.schema

    base_temp_dir = Context.create_base_temp_dir()

    graph = tf.Graph()
    with graph.as_default():
      # NOTE: it's important that create_phases is called directly after
      # run_preprocessing_fn, because we later mutate the graph's
      # TABLE_INITIALIZERS collection which would break the logic in
      # create_phases.
      inputs, outputs = impl_helper.run_preprocessing_fn(
          self._preprocessing_fn, input_schema)
      phases = impl_helper.create_phases()

      # Iterate through levels.  tensor_pcoll_mapping is a mapping from tensor
      # names to singleton PCollections containing a _TensorValue.  We compute
      # tensor_pcoll_mapping in phases, where at each phase we compute the
      # analyzers that are ready to run and update tensor_pcoll_mapping.
      tensor_pcoll_mapping = {}
      table_initializers = graph.get_collection_ref(
          tf.GraphKeys.TABLE_INITIALIZERS)
      original_table_initializers = list(table_initializers)
      del table_initializers[:]

      serialized_tf_config = (
          analyzer_impls._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
              input_values.pipeline.runner))
      for level, phase in enumerate(phases):
        # Create a SavedModel that describes the mapping from the input data
        # to the inputs of the analyzers at this level.  The column names of the
        # outputs are the tensor names of the analyzer inputs in the graph.
        # This graph has the analyzer outputs computed so far replaced with
        # constants.
        analyzer_inputs = {}
        for analyzer in phase.analyzers:
          for input_tensor in analyzer.inputs:
            analyzer_inputs[input_tensor.name] = input_tensor
        table_initializers.extend(phase.table_initializers)
        unbound_saved_model_dir = _make_unique_temp_dir(base_temp_dir)
        _write_saved_transform(graph, inputs, analyzer_inputs,
                               unbound_saved_model_dir)
        saved_model_dir = (
            tensor_pcoll_mapping
            | 'CreateSavedModelForAnalyzerInputs[%d]' % level >>
            _ReplaceTensorsWithConstants(unbound_saved_model_dir, base_temp_dir,
                                         input_values.pipeline))

        # Run this saved model on the input dataset to obtain the inputs to the
        # analyzers.
        analyzer_input_values = (
            input_values
            | 'BatchAnalyzerInputs[%d]' % level >> _BatchElements()
            | 'ComputeAnalyzerInputs[%d]' % level >> beam.ParDo(
                _RunMetaGraphDoFn(
                    input_schema,
                    serialized_tf_config,
                    shared_graph_state_handle=shared.Shared()),
                saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir)))

        # Compute the analyzers from their inputs.  `analyzer_outputs_dict` is a
        # map from tensor names to singleton PCollections of `_TensorValue`s.
        analyzer_outputs_dict = (
            analyzer_input_values
            | 'ComputeAnalyzerOutputs[%d]' % level >> _ComputeAnalyzerOutputs(
                phase.analyzers, base_temp_dir))

        # Update the mapping for all analyzers.
        tensor_pcoll_mapping.update(analyzer_outputs_dict)

      del table_initializers[:]
      table_initializers.extend(original_table_initializers)
      saved_model_dir = _make_unique_temp_dir(base_temp_dir)
      _write_saved_transform(graph, inputs, outputs, saved_model_dir)
      transform_fn = (
          tensor_pcoll_mapping
          | 'ReplaceTensorsWithConstants' >> _ReplaceTensorsWithConstants(
              saved_model_dir, base_temp_dir, input_values.pipeline))

      # Infer metadata.  The metadata may contain Futures that refer to the
      # values of tensors in the graph.  In that case, the tensors must be
      # "constant" in that they don't depend on input data.  The tensors can
      # depend on analyzer outputs though.  This allows us to set metadata that
      # depends on analyzer outputs.
      #
      # We first extract the names of the tensors that are referenced by the
      # Futures, and then compute them by calling _ComputeScalarConstants with
      # the tensor-PCollection mapping representing the analyzer outputs.
      metadata = dataset_metadata.DatasetMetadata(
          schema=impl_helper.infer_feature_schema(outputs))

      deferred_metadata_tensor_names = [
          future.name
          for column_schema in tft_api.get_column_schemas().values()
          for future in column_schema.substitute_futures({})
      ]
      name_pcoll_dict = (
          tensor_pcoll_mapping
          | 'ComputeTensorValues' >>
          _ComputeTensorValues(deferred_metadata_tensor_names, saved_model_dir,
                               input_values.pipeline))
      full_metadata = beam_metadata_io.BeamDatasetMetadata(
          metadata, name_pcoll_dict)

      _clear_shared_state_after_barrier(input_values.pipeline, transform_fn)

      return transform_fn, full_metadata
Example #12
  def test_non_frequency_vocabulary_merge(self):
    """This test compares vocabularies produced with and without cache."""

    mi_vocab_name = 'mutual_information_vocab'
    adjusted_mi_vocab_name = 'adjusted_mutual_information_vocab'
    weighted_frequency_vocab_name = 'weighted_frequency_vocab'

    def preprocessing_fn(inputs):
      _ = tft.vocabulary(
          inputs['s'],
          labels=inputs['label'],
          store_frequency=True,
          vocab_filename=mi_vocab_name,
          min_diff_from_avg=0.1,
          use_adjusted_mutual_info=False)

      _ = tft.vocabulary(
          inputs['s'],
          labels=inputs['label'],
          store_frequency=True,
          vocab_filename=adjusted_mi_vocab_name,
          min_diff_from_avg=1.0,
          use_adjusted_mutual_info=True)

      _ = tft.vocabulary(
          inputs['s'],
          weights=inputs['weight'],
          store_frequency=True,
          vocab_filename=weighted_frequency_vocab_name,
          use_adjusted_mutual_info=False)
      return inputs

    span_0_key = 'span-0'
    span_1_key = 'span-1'

    input_data = [
        dict(s='a', weight=1, label=1),
        dict(s='a', weight=0.5, label=1),
        dict(s='b', weight=0.75, label=1),
        dict(s='b', weight=1, label=0),
    ]
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            's': tf.FixedLenFeature([], tf.string),
            'label': tf.FixedLenFeature([], tf.int64),
            'weight': tf.FixedLenFeature([], tf.float32),
        }))
    input_data_dict = {
        span_0_key: input_data,
        span_1_key: input_data,
    }
    with beam_impl.Context(temp_dir=self.get_temp_dir()):

      flat_data = input_data_dict.values() | 'Flatten' >> beam.Flatten()

      transform_fn_with_cache, output_cache = (
          (flat_data, input_data_dict, {}, input_metadata) |
          (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

      expected_accumulators = {
          '__v0__VocabularyAccumulate--vocabulary--': [
              b'["a", [2, 1.0, 1.0]]', b'["b", [2, 0.5, 1.0]]'
          ],
          '__v0__VocabularyAccumulate--vocabulary_1--': [
              b'["a", [2, 1.0, 1.0]]', b'["b", [2, 0.5, 1.0]]'
          ],
          '__v0__VocabularyAccumulate--vocabulary_2--': [
              b'["a", 1.5]', b'["b", 1.75]'
          ],
      }
      spans = [span_0_key, span_1_key]
      self.assertCountEqual(output_cache.keys(), spans)
      for span in spans:
        self.assertCountEqual(output_cache[span].keys(),
                              expected_accumulators.keys())
        for key, value in six.iteritems(expected_accumulators):
          self.assertCountEqual(output_cache[span][key], value)

      transform_fn_no_cache = ((input_data * 2, input_metadata) |
                               (beam_impl.AnalyzeDataset(preprocessing_fn)))

    transform_fn_with_cache_dir = os.path.join(self.base_test_dir,
                                               'transform_fn_with_cache')
    _ = transform_fn_with_cache | tft_beam.WriteTransformFn(
        transform_fn_with_cache_dir)

    transform_fn_no_cache_dir = os.path.join(self.base_test_dir,
                                             'transform_fn_no_cache')
    _ = transform_fn_no_cache | tft_beam.WriteTransformFn(
        transform_fn_no_cache_dir)

    tft_output_cache = tft.TFTransformOutput(transform_fn_with_cache_dir)
    tft_output_no_cache = tft.TFTransformOutput(transform_fn_no_cache_dir)

    for vocab_filename in (mi_vocab_name, adjusted_mi_vocab_name,
                           weighted_frequency_vocab_name):
      cache_path = tft_output_cache.vocabulary_file_by_name(vocab_filename)
      no_cache_path = tft_output_no_cache.vocabulary_file_by_name(
          vocab_filename)
      with tf.gfile.Open(cache_path, 'rb') as f1, tf.gfile.Open(
          no_cache_path, 'rb') as f2:
        self.assertEqual(
            f1.readlines(), f2.readlines(),
            'vocab with cache != vocab without cache for: {}'.format(
                vocab_filename))
                for i in range(28)
                for j in range(28)]
label_field = "label"
def preprocessing_fn(inputs):
    # TODO: pre-process the fields to 0 to 1 range inputs
    result = {label_field: inputs[label_field]}
    for field_name in pixel_fields:
        result[field_name + "_norm"] = tft.scale_to_0_1(inputs[field_name])
    return result

input_data_schema = dataset_schema.from_feature_spec(dict(
    [(name, tf.io.FixedLenFeature([], tf.int64))
     for name in pixel_fields] +
    [(label_field, tf.io.FixedLenFeature([], tf.int64))]))

input_data_metadata = dataset_metadata.DatasetMetadata(input_data_schema)

fetch_data()
train_data_file = mnist_path + "/train.csv"

with beam.Pipeline() as pipeline:
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        columns = [label_field] + pixel_fields
        converter = tft.coders.CsvCoder(columns, input_data_schema)
        input_data = (
            pipeline
            | 'ReadInputData' >> beam.io.ReadFromText(train_data_file)
            | 'CleanInputData' >> beam.Map(converter.decode))
        input_dataset = (input_data, input_data_metadata)
        transformed_dataset, transform_fn = (
            input_dataset | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
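        # Hypothetical continuation, mirroring the TFRecord-writing pattern used
        # in the other examples; the output paths are placeholders.
        transformed_data, transformed_metadata = transformed_dataset
        transformed_coder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)
        _ = (transformed_data
             | 'EncodeTrainData' >> beam.Map(transformed_coder.encode)
             | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                 '/tmp/mnist/train_transformed', file_name_suffix='.gz'))
        _ = (transform_fn
             | 'WriteTransformFn' >> tft_beam.WriteTransformFn(
                 '/tmp/mnist/transform_output'))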
Example #14
    def expand(self, dataset):
        """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.

    Raises:
      ValueError: If preprocessing_fn has no outputs.
    """
        input_values, input_metadata = dataset
        input_schema = input_metadata.schema

        base_temp_dir = Context.create_base_temp_dir()

        graph = tf.Graph()
        with graph.as_default():

            with tf.name_scope('inputs'):
                inputs = input_schema.as_batched_placeholders()
            # In order to avoid a bug where import_graph_def fails when the input_map
            # and return_elements of an imported graph are the same (b/34288791), we
            # avoid using the placeholder of an input column as an output of a graph.
            # We do this by applying tf.identity to all inputs of the
            # preprocessing_fn.  Note this applies at the level of raw tensors.
            outputs = self._preprocessing_fn(impl_helper.copy_tensors(inputs))

            # At this point we check that the preprocessing_fn has at least one
            # output. This is because if we allowed the output of preprocessing_fn to
            # be empty, we wouldn't be able to determine how many instances to
            # "unbatch" the output into.
            if not outputs:
                raise ValueError(
                    'The preprocessing function returned an empty dict')

            if graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
                raise ValueError(
                    'The preprocessing function contained trainable variables '
                    '{}'.format(
                        graph.get_collection_ref(
                            tf.GraphKeys.TRAINABLE_VARIABLES)))

            # NOTE: it's important that create_phases is called directly after
            # preprocessing_fn, because we later mutate the graph's TABLE_INITIALIZERS
            # collection which would break the logic in create_phases.
            phases = impl_helper.create_phases()

            # Iterate through levels.  tensor_pcoll_mapping is a mapping from tensor
            # names to singleton PCollections containing a _TensorValue.  We compute
            # tensor_pcoll_mapping in phases, where at each phase we compute the
            # analyzers that are ready to run and update tensor_pcoll_mapping.
            tensor_pcoll_mapping = {}
            table_initializers = graph.get_collection_ref(
                tf.GraphKeys.TABLE_INITIALIZERS)
            original_table_initializers = list(table_initializers)
            del table_initializers[:]

            serialized_tf_config = (
                analyzer_impls._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
                    input_values.pipeline.runner))
            for level, phase in enumerate(phases):
                # Create a SavedModel that describes the mapping from the input data
                # to the inputs of the analyzers at this level.  The column names of the
                # outputs are the tensor names of the analyzer inputs in the graph.
                # This graph has the analyzer outputs computed so far replaced with
                # constants.
                analyzer_inputs = {}
                for analyzer in phase.analyzers:
                    for input_tensor in analyzer.inputs:
                        analyzer_inputs[input_tensor.name] = input_tensor
                table_initializers.extend(phase.table_initializers)
                unbound_saved_model_dir = _make_unique_temp_dir(base_temp_dir)
                _write_saved_transform(graph, inputs, analyzer_inputs,
                                       unbound_saved_model_dir)
                saved_model_dir = (tensor_pcoll_mapping
                                   | 'CreateSavedModelForAnalyzerInputs[%d]' %
                                   level >> _ReplaceTensorsWithConstants(
                                       unbound_saved_model_dir, base_temp_dir,
                                       input_values.pipeline))

                # Run this saved model on the input dataset to obtain the inputs to the
                # analyzers.
                analyzer_input_values = (
                    input_values
                    | 'BatchAnalyzerInputs[%d]' % level >> _BatchElements()
                    | 'ComputeAnalyzerInputs[%d]' % level >> beam.ParDo(
                        _RunMetaGraphDoFn(
                            input_schema,
                            serialized_tf_config,
                            shared_graph_state_handle=shared.Shared()),
                        saved_model_dir=beam.pvalue.AsSingleton(
                            saved_model_dir)))

                # Compute the analyzers from their inputs.  `analyzer_outputs_dict` is a
                # map from tensor names to singleton PCollections of `_TensorValue`s.
                analyzer_outputs_dict = (
                    analyzer_input_values
                    | 'ComputeAnalyzerOutputs[%d]' % level >>
                    _ComputeAnalyzerOutputs(phase.analyzers, base_temp_dir))

                # Update the mapping for all analyzers.
                tensor_pcoll_mapping.update(analyzer_outputs_dict)

            del table_initializers[:]
            table_initializers.extend(original_table_initializers)
            saved_model_dir = _make_unique_temp_dir(base_temp_dir)
            _write_saved_transform(graph, inputs, outputs, saved_model_dir)
            transform_fn = (
                tensor_pcoll_mapping
                |
                'ReplaceTensorsWithConstants' >> _ReplaceTensorsWithConstants(
                    saved_model_dir, base_temp_dir, input_values.pipeline))

            # Infer metadata.  The metadata may contain Futures that refer to the
            # values of tensors in the graph.  In that case, the tensors must be
            # "constant" in that they don't depend on input data.  The tensors can
            # depend on analyzer outputs though.  This allows us to set metadata that
            # depends on analyzer outputs.
            #
            # We first extract the names of the tensors that are referenced by the
            # Futures, and then compute them by calling _ComputeScalarConstants with
            # the tensor-PCollection mapping representing the analyzer outputs.
            metadata = dataset_metadata.DatasetMetadata(
                schema=impl_helper.infer_feature_schema(outputs))

            deferred_metadata_tensor_names = {
                future.name
                for column_schema in metadata.schema.column_schemas.values()
                for future in column_schema.substitute_futures({})
            }
            name_pcoll_dict = (tensor_pcoll_mapping
                               | 'ComputeTensorValues' >> _ComputeTensorValues(
                                   deferred_metadata_tensor_names,
                                   saved_model_dir, input_values.pipeline))
            full_metadata = beam_metadata_io.BeamDatasetMetadata(
                metadata, name_pcoll_dict)

            _clear_shared_state_after_barrier(input_values.pipeline,
                                              transform_fn)

            return transform_fn, full_metadata
"""Test metadata for tft_beam_io tests."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema

from tensorflow_metadata.proto.v0 import schema_pb2

_FEATURE_SPEC = {
    'fixed_column': tf.io.FixedLenFeature([3], tf.string),
    'list_columm': tf.io.VarLenFeature(tf.int64),
}

COMPLETE_METADATA = dataset_metadata.DatasetMetadata(
    dataset_schema.from_feature_spec(
        _FEATURE_SPEC,
        domains={'list_columm': schema_pb2.IntDomain(min=-1, max=5)}))

INCOMPLETE_METADATA = dataset_metadata.DatasetMetadata(
    dataset_schema.from_feature_spec(
        _FEATURE_SPEC,
        # Values will be overridden by those in COMPLETE_METADATA
        domains={'list_columm': schema_pb2.IntDomain(min=0, max=0)}))
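A small round-trip sketch for the metadata defined above, using metadata_io from tensorflow_transform.tf_metadata; the target directory is a placeholder:

from tensorflow_transform.tf_metadata import metadata_io

# Hypothetical round trip: write COMPLETE_METADATA to disk and read it back.
metadata_io.write_metadata(COMPLETE_METADATA, '/tmp/tft_metadata_demo')
reloaded_metadata = metadata_io.read_metadata('/tmp/tft_metadata_demo')
print(reloaded_metadata.schema)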
Example #16
      num_ranking_candidate_movie_ids=0,
      negative_sample_ratio=args.negative_sample_ratio,
      negative_sample_label=args.negative_sample_label,
      movie_rating_history=args.movie_rating_history)

  movies_sideinput = beam.pvalue.AsDict(movies_data)
  eval_data |= 'BuildEvalFeatures' >> beam.ParDo(
      BuildExampleFn(args.random_seed),
      movies_data=movies_sideinput,
      rating_threshold=args.eval_score_threshold,
      is_ranking_problem=(args.eval_type == RANKING),
      is_train=False,
      num_ranking_candidate_movie_ids=args.num_ranking_candidate_movie_ids)

  # TFTransform based preprocessing.
  raw_metadata = dataset_metadata.DatasetMetadata(
      schema=movielens.make_examples_schema())
  _ = (raw_metadata
       | 'WriteRawMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(args.output_dir, 'raw_metadata'), pipeline))

  preprocessing_fn = movielens.make_preprocessing_fn()
  transform_fn = ((train_data, raw_metadata)
                  | 'Analyze' >> tft.AnalyzeDataset(preprocessing_fn))

  _ = (transform_fn
       | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(args.output_dir))

  @beam.ptransform_fn
  def TransformAndWrite(pcoll, path):  # pylint: disable=invalid-name
    pcoll |= 'Shuffle' >> _Shuffle()  # pylint: disable=no-value-for-parameter
    (dataset, metadata) = (((pcoll, raw_metadata), transform_fn)
Example #17
def transform_data(train_neg_filepattern, train_pos_filepattern,
                   test_neg_filepattern, test_pos_filepattern,
                   transformed_train_filebase, transformed_test_filebase,
                   transform_fn_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes, and maps tokens to int64 indices.

  Args:
    train_neg_filepattern: Filepattern for training data negative examples
    train_pos_filepattern: Filepattern for training data positive examples
    test_neg_filepattern: Filepattern for test data negative examples
    test_pos_filepattern: Filepattern for test data positive examples
    transformed_train_filebase: Base filename for transformed training data
        shards
    transformed_test_filebase: Base filename for transformed test data shards
    transform_fn_dir: Directory where metadata for transform function should be
        written
  """

  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # pylint: disable=no-value-for-parameter
      train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData(
          (train_neg_filepattern, train_pos_filepattern))
      # pylint: disable=no-value-for-parameter
      test_data = pipeline | 'ReadTest' >> ReadAndShuffleData(
          (test_neg_filepattern, test_pos_filepattern))

      metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
          REVIEW_COLUMN: dataset_schema.ColumnSchema(
              tf.string, [], dataset_schema.FixedColumnRepresentation()),
          LABEL_COLUMN: dataset_schema.ColumnSchema(
              tf.int64, [], dataset_schema.FixedColumnRepresentation()),
      }))

      def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        review = inputs[REVIEW_COLUMN]

        review_tokens = tf.string_split(review, DELIMITERS)
        review_indices = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE)
        # Add one for the oov bucket created by string_to_int.
        review_bow_indices, review_weight = tft.tfidf(review_indices,
                                                      VOCAB_SIZE + 1)
        return {
            REVIEW_COLUMN: review_bow_indices,
            REVIEW_WEIGHT: review_weight,
            LABEL_COLUMN: inputs[LABEL_COLUMN]
        }

      (transformed_train_data, transformed_metadata), transform_fn = (
          (train_data, metadata)
          | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(
              preprocessing_fn))

      transformed_test_data, _ = (
          ((test_data, metadata), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      _ = (
          transformed_train_data
          | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
              transformed_train_filebase,
              coder=example_proto_coder.ExampleProtoCoder(
                  transformed_metadata.schema)))

      _ = (
          transformed_test_data
          | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
              transformed_test_filebase,
              coder=example_proto_coder.ExampleProtoCoder(
                  transformed_metadata.schema)))

      _ = (transform_fn
           | 'WriteTransformFn' >>
           transform_fn_io.WriteTransformFn(transform_fn_dir))
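A hedged invocation example for transform_data; the IMDB-style file patterns and output locations are placeholders:

# Hypothetical call; adjust the patterns to the actual dataset layout.
transform_data(
    train_neg_filepattern='/data/aclImdb/train/neg/*.txt',
    train_pos_filepattern='/data/aclImdb/train/pos/*.txt',
    test_neg_filepattern='/data/aclImdb/test/neg/*.txt',
    test_pos_filepattern='/data/aclImdb/test/pos/*.txt',
    transformed_train_filebase='/tmp/sentiment/train_transformed',
    transformed_test_filebase='/tmp/sentiment/test_transformed',
    transform_fn_dir='/tmp/sentiment/transform_fn')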
Example #18
  def _RunBeamImpl(self, inputs: Mapping[Text, Any],
                   outputs: Mapping[Text, Any], preprocessing_fn: Any,
                   input_dataset_metadata: dataset_metadata.DatasetMetadata,
                   raw_examples_data_format: Text, transform_output_path: Text,
                   compute_statistics: bool,
                   materialize_output_paths: Sequence[Text]) -> _Status:
    """Perform data preprocessing with FlumeC++ runner.

    Args:
      inputs: A dictionary of labelled input values.
      outputs: A dictionary of labelled output values.
      preprocessing_fn: The tf.Transform preprocessing_fn.
      input_dataset_metadata: A DatasetMetadata object for the input data.
      raw_examples_data_format: A string describing the raw data format.
      transform_output_path: An absolute path to write the output to.
      compute_statistics: A bool indicating whether or not to compute statistics.
      materialize_output_paths: Paths to materialized outputs.

    Raises:
      RuntimeError: If reset() is not invoked between two run() calls.
      ValueError: If the schema is empty.

    Returns:
      Status of the execution.
    """
    raw_examples_file_format = common.GetSoleValue(
        inputs, labels.EXAMPLES_FILE_FORMAT_LABEL, strict=False)
    analyze_and_transform_data_paths = common.GetValues(
        inputs, labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL)
    transform_only_data_paths = common.GetValues(
        inputs, labels.TRANSFORM_ONLY_DATA_PATHS_LABEL)
    stats_use_tfdv = common.GetSoleValue(inputs,
                                         labels.TFT_STATISTICS_USE_TFDV_LABEL)
    per_set_stats_output_paths = common.GetValues(
        outputs, labels.PER_SET_STATS_OUTPUT_PATHS_LABEL)
    temp_path = common.GetSoleValue(outputs, labels.TEMP_OUTPUT_LABEL)

    input_cache_dir = common.GetSoleValue(
        inputs, labels.CACHE_INPUT_PATH_LABEL, strict=False)
    output_cache_dir = common.GetSoleValue(
        outputs, labels.CACHE_OUTPUT_PATH_LABEL, strict=False)

    tf.logging.info('Analyze and transform data patterns: %s',
                    list(enumerate(analyze_and_transform_data_paths)))
    tf.logging.info('Transform data patterns: %s',
                    list(enumerate(transform_only_data_paths)))
    tf.logging.info('Transform materialization output paths: %s',
                    list(enumerate(materialize_output_paths)))
    tf.logging.info('Transform output path: %s', transform_output_path)

    feature_spec = schema_utils.schema_as_feature_spec(
        _GetSchemaProto(input_dataset_metadata)).feature_spec
    try:
      analyze_input_columns = tft.get_analyze_input_columns(
          preprocessing_fn, feature_spec)
      transform_input_columns = (
          tft.get_transform_input_columns(preprocessing_fn, feature_spec))
    except AttributeError:
      # If using TFT 1.12, fall back to assuming all features are used.
      analyze_input_columns = feature_spec.keys()
      transform_input_columns = feature_spec.keys()
    # Use the same dataset (same columns) for AnalyzeDataset and computing
    # pre-transform stats so that the data will only be read once for these
    # two operations.
    if compute_statistics:
      analyze_input_columns = list(
          set(list(analyze_input_columns) + list(transform_input_columns)))
    if input_dataset_metadata.schema is _RAW_EXAMPLE_SCHEMA:
      analyze_input_dataset_metadata = input_dataset_metadata
      transform_input_dataset_metadata = input_dataset_metadata
    else:
      analyze_input_dataset_metadata = dataset_metadata.DatasetMetadata(
          dataset_schema.from_feature_spec(
              {feature: feature_spec[feature]
               for feature in analyze_input_columns}))
      transform_input_dataset_metadata = dataset_metadata.DatasetMetadata(
          dataset_schema.from_feature_spec(
              {feature: feature_spec[feature]
               for feature in transform_input_columns}))

    can_process_jointly = not bool(per_set_stats_output_paths or
                                   materialize_output_paths or output_cache_dir)
    analyze_data_list = self._MakeDatasetList(
        analyze_and_transform_data_paths, raw_examples_file_format,
        raw_examples_data_format, analyze_input_dataset_metadata,
        can_process_jointly)
    transform_data_list = self._MakeDatasetList(
        list(analyze_and_transform_data_paths) +
        list(transform_only_data_paths), raw_examples_file_format,
        raw_examples_data_format, transform_input_dataset_metadata,
        can_process_jointly)

    desired_batch_size = self._GetDesiredBatchSize(raw_examples_data_format)

    with self._CreatePipeline(outputs) as p:
      with tft_beam.Context(
          temp_dir=temp_path,
          desired_batch_size=desired_batch_size,
          passthrough_keys={_TRANSFORM_INTERNAL_FEATURE_FOR_KEY},
          use_deep_copy_optimization=True):
        # pylint: disable=expression-not-assigned
        # pylint: disable=no-value-for-parameter

        _ = (
            p | self._IncrementColumnUsageCounter(
                len(feature_spec.keys()), len(analyze_input_columns),
                len(transform_input_columns)))

        (new_analyze_data_dict, input_cache, flat_data_required) = (
            p | self._OptimizeRun(input_cache_dir, output_cache_dir,
                                  analyze_data_list, feature_spec,
                                  preprocessing_fn, self._GetCacheSource()))
        # Remove datasets that are fully covered by the cache when they are not
        # needed for statistics or materialization.
        if not materialize_output_paths and not compute_statistics:
          analyze_data_list = [
              d for d in new_analyze_data_dict.values() if d is not None
          ]
          if len(analyze_data_list) < len(new_analyze_data_dict):
            tf.logging.info(
                'Not reading the following datasets due to cache: %s', [
                    dataset.file_pattern_suffix
                    for dataset in analyze_data_list
                    if dataset not in new_analyze_data_dict.values()
                ])

        analyze_decode_fn = (
            self._GetDecodeFunction(raw_examples_data_format,
                                    analyze_input_dataset_metadata.schema))

        for (idx, dataset) in enumerate(analyze_data_list):
          dataset.encoded = (
              p | 'ReadAnalysisDataset[{}]'.format(idx) >>
              self._ReadExamples(dataset))
          dataset.decoded = (
              dataset.encoded
              | 'DecodeAnalysisDataset[{}]'.format(idx) >>
              self._DecodeInputs(analyze_decode_fn))

        input_analysis_data = {}
        for key, dataset in six.iteritems(new_analyze_data_dict):
          if dataset is None:
            input_analysis_data[key] = None
          else:
            input_analysis_data[key] = dataset.decoded

        if flat_data_required:
          flat_input_analysis_data = (
              [dataset.decoded for dataset in analyze_data_list]
              | 'FlattenAnalysisDatasets' >> beam.Flatten(pipeline=p))
        else:
          flat_input_analysis_data = None
        if input_cache:
          tf.logging.info('Analyzing data with cache.')
        transform_fn, cache_output = (
            (flat_input_analysis_data, input_analysis_data, input_cache,
             input_dataset_metadata)
            | 'AnalyzeDataset' >> tft_beam.AnalyzeDatasetWithCache(
                preprocessing_fn, pipeline=p))

        # Write the raw/input metadata.
        (input_dataset_metadata
         | 'WriteMetadata' >> tft_beam.WriteMetadata(
             os.path.join(transform_output_path,
                          tft.TFTransformOutput.RAW_METADATA_DIR), p))

        # WriteTransformFn writes transform_fn and metadata to subdirectories
        # tensorflow_transform.SAVED_MODEL_DIR and
        # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
        (transform_fn |
         'WriteTransformFn' >> tft_beam.WriteTransformFn(transform_output_path))

        if output_cache_dir is not None and cache_output is not None:
          # TODO(b/37788560): Possibly make this part of the beam graph.
          tf.io.gfile.makedirs(output_cache_dir)
          tf.logging.info('Using existing cache in: %s', input_cache_dir)
          if input_cache_dir is not None:
            # Only copy cache that is relevant to this iteration. This assumes
            # that this pipeline operates on rolling ranges, so those cache
            # entries may also be relevant for future iterations.
            for span_cache_dir in input_analysis_data:
              full_span_cache_dir = os.path.join(input_cache_dir,
                                                 span_cache_dir)
              if tf.io.gfile.isdir(full_span_cache_dir):
                self._CopyCache(full_span_cache_dir,
                                os.path.join(output_cache_dir, span_cache_dir))

          (cache_output
           | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
               p, output_cache_dir, sink=self._GetCacheSink()))

        if compute_statistics or materialize_output_paths:
          # Do not compute pre-transform stats if the input format is raw proto,
          # as StatsGen would treat any input as tf.Example.
          if (compute_statistics and
              not self._IsDataFormatProto(raw_examples_data_format)):
            # Aggregated feature stats before transformation.
            pre_transform_feature_stats_path = os.path.join(
                transform_output_path,
                tft.TFTransformOutput.PRE_TRANSFORM_FEATURE_STATS_PATH)

            schema_proto = _GetSchemaProto(analyze_input_dataset_metadata)
            ([
                dataset.decoded if stats_use_tfdv else dataset.encoded
                for dataset in analyze_data_list
            ]
             | 'FlattenPreTransformAnalysisDatasets' >> beam.Flatten(pipeline=p)
             | 'GenerateAggregatePreTransformAnalysisStats' >>
             self._GenerateStats(
                 pre_transform_feature_stats_path,
                 schema_proto,
                 use_deep_copy_optimization=True,
                 use_tfdv=stats_use_tfdv))

          transform_decode_fn = (
              self._GetDecodeFunction(raw_examples_data_format,
                                      transform_input_dataset_metadata.schema))
          # transform_data_list is a superset of analyze_data_list; we pay the
          # cost of reading the same datasets (analyze_data_list) again here to
          # prevent certain Beam runners from doing a large temp materialization.
          for (idx, dataset) in enumerate(transform_data_list):
            dataset.encoded = (
                p
                | 'ReadTransformDataset[{}]'.format(idx) >>
                self._ReadExamples(dataset))
            dataset.decoded = (
                dataset.encoded
                | 'DecodeTransformDataset[{}]'.format(idx) >>
                self._DecodeInputs(transform_decode_fn))
            (dataset.transformed,
             metadata) = (((dataset.decoded, transform_input_dataset_metadata),
                           transform_fn)
                          | 'TransformDataset[{}]'.format(idx) >>
                          tft_beam.TransformDataset())

            if materialize_output_paths or not stats_use_tfdv:
              dataset.transformed_and_encoded = (
                  dataset.transformed
                  | 'EncodeTransformedDataset[{}]'.format(idx) >> beam.ParDo(
                      self._EncodeAsExamples(), metadata))

          if compute_statistics:
            # Aggregated feature stats after transformation.
            _, metadata = transform_fn
            post_transform_feature_stats_path = os.path.join(
                transform_output_path,
                tft.TFTransformOutput.POST_TRANSFORM_FEATURE_STATS_PATH)

            # TODO(b/70392441): Retain tf.Metadata (e.g., IntDomain) in
            # schema. Currently input dataset schema only contains dtypes,
            # and other metadata is dropped due to roundtrip to tensors.
            transformed_schema_proto = _GetSchemaProto(metadata)

            ([(dataset.transformed
               if stats_use_tfdv else dataset.transformed_and_encoded)
              for dataset in transform_data_list]
             | 'FlattenPostTransformAnalysisDatasets' >> beam.Flatten()
             | 'GenerateAggregatePostTransformAnalysisStats' >>
             self._GenerateStats(
                 post_transform_feature_stats_path,
                 transformed_schema_proto,
                 use_tfdv=stats_use_tfdv))

            if per_set_stats_output_paths:
              assert len(transform_data_list) == len(per_set_stats_output_paths)
              # TODO(b/67632871): Remove duplicate stats gen compute that is
              # done both on a flattened view of the data, and on each span
              # below.
              bundles = zip(transform_data_list, per_set_stats_output_paths)
              for (idx, (dataset, output_path)) in enumerate(bundles):
                if stats_use_tfdv:
                  data = dataset.transformed
                else:
                  data = dataset.transformed_and_encoded
                (data
                 | 'GeneratePostTransformStats[{}]'.format(idx) >>
                 self._GenerateStats(
                     output_path,
                     transformed_schema_proto,
                     use_tfdv=stats_use_tfdv))

          if materialize_output_paths:
            assert len(transform_data_list) == len(materialize_output_paths)
            bundles = zip(transform_data_list, materialize_output_paths)
            for (idx, (dataset, output_path)) in enumerate(bundles):
              (dataset.transformed_and_encoded
               | 'Materialize[{}]'.format(idx) >> self._WriteExamples(
                   raw_examples_file_format, output_path))

    return _Status.OK()
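The executor above asks tf.Transform which input columns are needed for analysis versus transformation (tft.get_analyze_input_columns / tft.get_transform_input_columns) so it can prune unused columns. A small standalone sketch of that distinction, using a toy preprocessing_fn that is purely illustrative:

import tensorflow as tf
import tensorflow_transform as tft

def toy_preprocessing_fn(inputs):
  # 'x' needs a full analysis pass (mean/variance for the z-score);
  # 'y' is only transformed instance by instance.
  return {
      'x_scaled': tft.scale_to_z_score(inputs['x']),
      'y_copy': tf.identity(inputs['y']),
  }

feature_spec = {
    'x': tf.io.FixedLenFeature([], tf.float32),
    'y': tf.io.FixedLenFeature([], tf.float32),
}

# Expected (order not guaranteed): analysis needs only 'x', while the
# transform graph reads both 'x' and 'y'.
print(tft.get_analyze_input_columns(toy_preprocessing_fn, feature_spec))
print(tft.get_transform_input_columns(toy_preprocessing_fn, feature_spec))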
Example #19
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None):
  """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """

  def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
      # Preserve this feature as a dense float, setting nan's to the mean.
      outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
          _fill_in_missing(inputs[key]))

    for key in taxi.VOCAB_FEATURE_KEYS:
      # Build a vocabulary for this feature.
      outputs[
          taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
              _fill_in_missing(inputs[key]),
              top_k=taxi.VOCAB_SIZE,
              num_oov_buckets=taxi.OOV_SIZE)

    for key in taxi.BUCKET_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = transform.bucketize(
          _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

    for key in taxi.CATEGORICAL_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

    # Was this passenger a big tipper?
    taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
    tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
    outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
        tf.is_nan(taxi_fare),
        tf.cast(tf.zeros_like(taxi_fare), tf.int64),
        # Test if the tip was > 20% of the fare.
        tf.cast(
            tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
            tf.int64))

    return outputs

  schema = taxi.read_schema(schema_file)
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    with tft_beam.Context(temp_dir=working_dir):
      if input_handle.lower().endswith('csv'):
        csv_coder = taxi.make_csv_coder(schema)
        raw_data = (
            pipeline
            | 'ReadFromText' >> beam.io.ReadFromText(
                input_handle, skip_header_lines=1))
        decode_transform = beam.Map(csv_coder.decode)
      else:
        query = taxi.make_sql(input_handle, max_rows, for_eval=False)
        raw_data = (
            pipeline
            | 'ReadBigQuery' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))
        decode_transform = beam.Map(
            taxi.clean_raw_data_dict, raw_feature_spec=raw_feature_spec)

      if transform_dir is None:
        decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
        transform_fn = (
            (decoded_data, raw_data_metadata) |
            ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

        _ = (
            transform_fn
            | ('WriteTransformFn' >>
               tft_beam.WriteTransformFn(working_dir)))
      else:
        transform_fn = pipeline | tft_beam.ReadTransformFn(transform_dir)

      # Shuffling the data before materialization will improve Training
      # effectiveness downstream. Here we shuffle the raw_data (as opposed to
      # decoded data) since it has a compact representation.
      shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle()

      decoded_data = shuffled_data | 'DecodeForTransform' >> decode_transform
      (transformed_data, transformed_metadata) = (
          ((decoded_data, raw_data_metadata), transform_fn)
          | 'Transform' >> tft_beam.TransformDataset())

      coder = example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)
      _ = (
          transformed_data
          | 'SerializeExamples' >> beam.Map(coder.encode)
          | 'WriteExamples' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, outfile_prefix), file_name_suffix='.gz')
      )
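The taxi preprocessing_fn above (and its variants below) calls a _fill_in_missing helper that is not included in the snippet. In the TFX taxi utilities it is usually defined along the following lines; treat this as a sketch consistent with how it is used here (densifying a [batch_size, 1] SparseTensor), not as the exact original source:

import tensorflow as tf

def _fill_in_missing(x):
  """Replaces missing values in a SparseTensor and converts it to a dense tensor.

  Assumes x is a SparseTensor of dense shape [batch_size, 1] (at most one value
  per example) and returns a dense rank-1 tensor of shape [batch_size], filling
  missing entries with '' for strings and 0 otherwise.
  """
  default_value = '' if x.dtype == tf.string else 0
  return tf.squeeze(
      tf.sparse.to_dense(
          tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]),
          default_value),
      axis=1)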
Example #20
    def expand(self, dataset):
        """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.
    """

        input_values, input_metadata = dataset
        input_schema = input_metadata.schema
        input_batches = input_values | 'BatchInstances' >> beam.ParDo(
            _BatchDoFn())

        class _CreateTransformFn(beam.PTransform):
            """Create a TransformFnDef, binding statistics in a deferred manner.

      This function constructs a tensorflow graph eagerly and then (in a
      deferred manner) fills in analyzer outputs with their actual computed
      values. We construct the tensorflow graph up front because that implies
      serializing MetaGraphDef protos rather than pickling the user-defined TITO
      functions. The graph contains placeholders for `_AnalyzerOutput`s which
      are then replaced with their actual values (as constant tensors) in a
      deferred manner.

      Args:
        input_columns: A map from column names to `Column`s.
        output_columns: A map from column names to `Column`s.
        temp_dir: Temp dir to store `SavedModel`s.
      """
            def __init__(self, input_columns, output_columns, temp_dir):
                # Generally the pipeline is inferred from its inputs; however,
                # we need to know the pipeline for beam.Create.
                self.pipeline = input_values.pipeline
                self._input_columns = input_columns
                self._output_columns = output_columns
                self._temp_dir = temp_dir

            def expand(self, analyzer_outputs_to_pcoll):
                """Converts a dict of statistics to a transform function.

        Args:
          analyzer_outputs_to_pcoll: A dictionary mapping `_AnalyzerOutput`s
              to the values of these statistics as a PCollection.

        Returns:
          A single-element PCollection containing the directory name with the
              SavedModel.
        """
                # Create a transform_fn with unbound values.

                unbound_transform_fn_dir = os.path.join(
                    self._temp_dir, 'unbound_transform_fn')
                input_columns_to_statistics = impl_helper.make_transform_fn_def(
                    input_schema, self._input_columns, self._output_columns,
                    unbound_transform_fn_dir)

                transform_fn = (self.pipeline | 'CreateTransformFn' >>
                                beam.Create([unbound_transform_fn_dir]))

                if not analyzer_outputs_to_pcoll:
                    return transform_fn

                # Convert the statistics dict into a DictPCollectionView so it can be
                # passed as a side input to the beam Map below.
                tagged_statistics = []
                for tag, statistic in input_columns_to_statistics.items():
                    pcoll = analyzer_outputs_to_pcoll[statistic]
                    tagged_statistics.append(
                        pcoll
                        | 'AddTag[%s]' % tag >> beam.Map(lambda x, tag=tag:
                                                         (tag, x)))

                statistics_side_input = beam.pvalue.AsDict(
                    tagged_statistics | 'MergeStatistics' >> beam.Flatten())

                # Run a mapper that inserts statistic values into the graph.
                return (transform_fn
                        | 'ReplaceTensorsWithConstantValues' >> beam.Map(
                            impl_helper.replace_tensors_with_constant_values,
                            bound_saved_model_dir=os.path.join(
                                self._temp_dir, 'transform_fn'),
                            input_value_mapping=statistics_side_input))

        inputs, outputs = impl_helper.run_preprocessing_fn(
            self._preprocessing_fn, input_schema)

        # Get a list of lists, containing analyzers (i.e. _AnalyzerOutput objects)
        # by level in the DAG of Columns/Statistics. Analyzers at level n are ready
        # to run once all analyzers at level n - 1 are complete.
        analyzers_by_level = self._analyzers_by_level(outputs)

        # Iterate through levels, keeping track of analyzer outputs (i.e.
        # statistics) via a mapping of `_AnalyzerOutput` -> single element
        # PCollection.
        analyzer_outputs_to_pcoll = {}
        for level, analyzer_outputs in enumerate(analyzers_by_level):
            # Create a TransformFnDef representing the graph needed to generate
            # all the inputs required by the analyzer_outputs at this level.  We
            # assign arbitrary names to the outputs of this TransformFnDef.
            analyzer_input_columns = {}
            for idx, analyzer_output in enumerate(analyzer_outputs):
                if len(analyzer_output.inputs) != 1:
                    raise NotImplementedError(
                        'Analyzers must have exactly one input')
                analyzer_input_key = 'analyzer_%d_input' % idx
                analyzer_input_columns[
                    analyzer_input_key] = analyzer_output.inputs[0]

            transform_fn = (
                analyzer_outputs_to_pcoll
                | 'CreateTransformFn_%d' % level >> _CreateTransformFn(
                    inputs, analyzer_input_columns,
                    os.path.join(self._output_dir, 'tmp', 'level_%s' % level)))
            analyzer_input_schema = impl_helper.infer_feature_schema(
                analyzer_input_columns)

            # Run the TransformFnDef in a mapper.
            analysis_inputs = (
                input_batches
                | 'ComputeAnalyzerInputs_%d' % level >> beam.ParDo(
                    _RunMetaGraphDoFn(input_schema, analyzer_input_schema),
                    saved_model_dir=beam.pvalue.AsSingleton(transform_fn)))

            # For each analyzer output, look up its input values (by tensor name)
            # and run the analyzer on these values.
            for idx, analyzer_output in enumerate(analyzer_outputs):
                analyzer_input_key = 'analyzer_%d_input' % idx
                analyzer_outputs_to_pcoll[analyzer_output] = (
                    analysis_inputs
                    | 'Extract_%d_%d' % (level, idx) >> beam.Map(
                        # pylint: disable=cell-var-from-loop
                        # This lint warning is prone to false positives, and it's not
                        # clear why the warning is required here.
                        lambda x, key=analyzer_input_key:
                        [inst[key] for inst in x])
                    | 'Analyze_%d_%d' %
                    (level, idx) >> self._Analyze(analyzer_output))

        output_metadata = dataset_metadata.DatasetMetadata(
            schema=impl_helper.infer_feature_schema(outputs))
        transform_fn = (analyzer_outputs_to_pcoll
                        | 'CreateTransformFn' >> _CreateTransformFn(
                            inputs, outputs, self._output_dir))

        return transform_fn, output_metadata
import apache_beam as beam

import tensorflow as tf
from tensorflow_transform.beam.tft_beam_io import beam_metadata_io
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.tf_metadata import metadata_io

import unittest
from tensorflow.python.framework import test_util

_TEST_METADATA_COMPLETE = dataset_metadata.DatasetMetadata({
    'fixed_column':
    dataset_schema.ColumnSchema(tf.string, (3, ),
                                dataset_schema.FixedColumnRepresentation()),
    'list_columm':
    dataset_schema.ColumnSchema(
        dataset_schema.IntDomain(tf.int64, min_value=-1, max_value=5),
        (None, ), dataset_schema.ListColumnRepresentation())
})

_TEST_METADATA = dataset_metadata.DatasetMetadata({
    'fixed_column':
    dataset_schema.ColumnSchema(tf.string, (3, ),
                                dataset_schema.FixedColumnRepresentation()),
    # zeros will be overridden
    'list_columm':
    dataset_schema.ColumnSchema(
        dataset_schema.IntDomain(tf.int64, min_value=0, max_value=0), (None, ),
        dataset_schema.ListColumnRepresentation())
})
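These two schemas come from a test of metadata writing: _TEST_METADATA holds placeholder domain values (the zeros) that deferred, computed metadata later overrides to yield _TEST_METADATA_COMPLETE. As a brief usage sketch, the complete metadata can be round-tripped through the metadata_io module imported above (the path is illustrative):

import os
import tempfile

# Write the complete metadata to disk and read it back.
metadata_path = os.path.join(tempfile.mkdtemp(), 'metadata')
metadata_io.write_metadata(_TEST_METADATA_COMPLETE, metadata_path)
reloaded_metadata = metadata_io.read_metadata(metadata_path)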
Example #22
 def toMetadata(self, feature_spec):
     return dataset_metadata.DatasetMetadata(
         schema=sch.from_feature_spec(feature_spec))
Example #23
    def expand(self, dataset):
        """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.

    Raises:
      ValueError: If preprocessing_fn has no outputs.
    """
        (flattened_pcoll, input_values_pcoll_dict, dataset_cache_dict,
         input_metadata) = dataset
        input_schema = input_metadata.schema

        input_values_pcoll_dict = input_values_pcoll_dict or dict()

        with tf.compat.v1.Graph().as_default() as graph:

            with tf.compat.v1.name_scope('inputs'):
                feature_spec = schema_utils.schema_as_feature_spec(
                    input_schema).feature_spec
                input_signature = impl_helper.feature_spec_as_batched_placeholders(
                    feature_spec)
                # In order to avoid a bug where import_graph_def fails when the
                # input_map and return_elements of an imported graph are the same
                # (b/34288791), we avoid using the placeholder of an input column as an
                # output of a graph. We do this by applying tf.identity to all inputs of
                # the preprocessing_fn.  Note this applies at the level of raw tensors.
                # TODO(b/34288791): Remove this workaround and use a shallow copy of
                # inputs instead.  A shallow copy is needed in case
                # self._preprocessing_fn mutates its input.
                copied_inputs = impl_helper.copy_tensors(input_signature)

            output_signature = self._preprocessing_fn(copied_inputs)

        # At this point we check that the preprocessing_fn has at least one
        # output. This is because if we allowed the output of preprocessing_fn to
        # be empty, we wouldn't be able to determine how many instances to
        # "unbatch" the output into.
        if not output_signature:
            raise ValueError(
                'The preprocessing function returned an empty dict')

        if graph.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES):
            raise ValueError(
                'The preprocessing function contained trainable variables '
                '{}'.format(
                    graph.get_collection_ref(
                        tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES)))

        pipeline = self.pipeline or (flattened_pcoll or next(
            v for v in input_values_pcoll_dict.values()
            if v is not None)).pipeline

        # Add a stage that inspects graph collections for API use counts and logs
        # them as a beam metric.
        _ = (pipeline | 'InstrumentAPI' >> _InstrumentAPI(graph))

        tf_config = _DEFAULT_TENSORFLOW_CONFIG_BY_BEAM_RUNNER_TYPE.get(
            type(pipeline.runner))
        extra_args = beam_common.ConstructBeamPipelineVisitor.ExtraArgs(
            base_temp_dir=Context.create_base_temp_dir(),
            tf_config=tf_config,
            pipeline=pipeline,
            flat_pcollection=flattened_pcoll,
            pcollection_dict=input_values_pcoll_dict,
            graph=graph,
            input_signature=input_signature,
            input_schema=input_schema,
            cache_pcoll_dict=dataset_cache_dict)

        transform_fn_future, cache_value_nodes = analysis_graph_builder.build(
            graph,
            input_signature,
            output_signature,
            input_values_pcoll_dict.keys(),
            cache_dict=dataset_cache_dict)

        traverser = nodes.Traverser(
            beam_common.ConstructBeamPipelineVisitor(extra_args))
        transform_fn_pcoll = traverser.visit_value_node(transform_fn_future)

        if cache_value_nodes is not None:
            output_cache_pcoll_dict = {}
            for (dataset_key,
                 cache_key), value_node in six.iteritems(cache_value_nodes):
                if dataset_key not in output_cache_pcoll_dict:
                    output_cache_pcoll_dict[dataset_key] = {}
                output_cache_pcoll_dict[dataset_key][cache_key] = (
                    traverser.visit_value_node(value_node))
        else:
            output_cache_pcoll_dict = None

        # Infer metadata.  We take the inferred metadata and apply overrides that
        # refer to values of tensors in the graph.  The override tensors must
        # be "constant" in that they don't depend on input data.  The tensors can
        # depend on analyzer outputs though.  This allows us to set metadata that
        # depends on analyzer outputs. _infer_metadata_from_saved_model will use the
        # analyzer outputs stored in `transform_fn` to compute the metadata in a
        # deferred manner, once the analyzer outputs are known.
        metadata = dataset_metadata.DatasetMetadata(
            schema=schema_inference.infer_feature_schema(
                output_signature, graph))

        deferred_metadata = (transform_fn_pcoll
                             | 'ComputeDeferredMetadata' >>
                             beam.Map(_infer_metadata_from_saved_model))

        full_metadata = beam_metadata_io.BeamDatasetMetadata(
            metadata, deferred_metadata)

        _clear_shared_state_after_barrier(pipeline, transform_fn_pcoll)

        return (transform_fn_pcoll, full_metadata), output_cache_pcoll_dict
Example #24
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None):
  """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """

  def transform_ngrams(input, ngram_range):
    """ helper function to transform ngrams and print output. """
    # this print statement causes output to concat itself!
    # input = tf.Print(input, [input], "raw input:", first_n=-1, summarize=100)

    transformed = transform.ngrams(
      tf.string_split(input, delimiter=" "),
      ngram_range=ngram_range,
      separator=' ')

    # SparseTensor basically cannot be printed because it's made up of 3
    # tensors. We can use this trick to print the values column, but without the index
    # it's not too meaningful.
    #
    # values = tf.Print(transformed.values, [transformed.values], "ngram output:")
    # transformed = tf.SparseTensor(
    #       indices=transformed.indices,
    #       values=values,
    #       dense_shape=transformed.dense_shape)
    return transformed

  def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.
    https://cloud.google.com/solutions/machine-learning/data-preprocessing-for-ml-with-tf-transform-pt2

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
      print('processing key', key)
      print('input:', inputs[key])
      # Preserve this feature as a dense float, setting nan's to the mean.
      outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
          _fill_in_missing(inputs[key]))

    for key in taxi.VOCAB_FEATURE_KEYS:
      # Build a vocabulary for this feature.
      outputs[
          taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
              _fill_in_missing(inputs[key]),
              top_k=taxi.VOCAB_SIZE,
              num_oov_buckets=taxi.OOV_SIZE)

    # for key in taxi.FEATURE_NGRAM:
    #   # Extract ngrams and build a vocab.
    #   outputs[
    #       taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
    #           transform.ngrams(
    #             tf.string_split(_fill_in_missing(inputs[key])),
    #             ngram_range=taxi.NGRAM_RANGE,
    #             separator=' '),
    #           top_k=512,
    #           num_oov_buckets=taxi.OOV_SIZE)

    for key in taxi.FEATURE_NGRAM:
      # Extract ngrams and build a vocab.
      outputs[
          taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
            transform_ngrams(_fill_in_missing(inputs[key]), taxi.NGRAM_RANGE),
            top_k=taxi.VOCAB_SIZE,
            num_oov_buckets=taxi.OOV_SIZE)

    for key in taxi.BUCKET_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = transform.bucketize(
          _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

    for key in taxi.CATEGORICAL_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

    # Was this passenger a big tipper?
    taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
    tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
    outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
        tf.is_nan(taxi_fare),
        tf.cast(tf.zeros_like(taxi_fare), tf.int64),
        # Test if the tip was > 20% of the fare.
        tf.cast(
            tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
            tf.int64))

    return outputs

  schema = taxi.read_schema(schema_file)
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    with tft_beam.Context(temp_dir=working_dir):
      if input_handle.lower().endswith('csv'):
        csv_coder = taxi.make_csv_coder(schema, input_handle.lower())
        raw_data = (
            pipeline
            | 'ReadFromText' >> beam.io.ReadFromText(
                input_handle, skip_header_lines=1))
        decode_transform = beam.Map(csv_coder.decode)
      else:
        query = taxi.make_sql(input_handle, max_rows, for_eval=False)
        raw_data = (
            pipeline
            | 'ReadBigQuery' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))
        decode_transform = beam.Map(
            taxi.clean_raw_data_dict, raw_feature_spec=raw_feature_spec)

      if transform_dir is None:
        decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
        transform_fn = (
            (decoded_data, raw_data_metadata) |
            ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

        _ = (
            transform_fn
            | ('WriteTransformFn' >>
               tft_beam.WriteTransformFn(working_dir)))
      else:
        transform_fn = pipeline | tft_beam.ReadTransformFn(transform_dir)

      # Shuffling the data before materialization will improve Training
      # effectiveness downstream. Here we shuffle the raw_data (as opposed to
      # decoded data) since it has a compact representation.
      shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle()

      decoded_data = shuffled_data | 'DecodeForTransform' >> decode_transform
      (transformed_data, transformed_metadata) = (
          ((decoded_data, raw_data_metadata), transform_fn)
          | 'Transform' >> tft_beam.TransformDataset())

      coder = example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)
      _ = (
          transformed_data
          | 'SerializeExamples' >> beam.Map(coder.encode)
          | 'WriteExamples' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, outfile_prefix), file_name_suffix='.gz')
      )
Example #25
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
    """Run pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline
    training_data: the name of the table to train on.
    eval_data: the name of the table to evaluate on.
    predict_data: the name of the table to predict on.
    output_dir: file path to where to write all the output files.
    frequency_threshold: frequency threshold to use for categorical values.
  """

    # 1) The schema can be either defined in-memory or read from a configuration
    #    file, in this case we are creating the schema in-memory.
    input_schema = reddit.make_input_schema()

    # 2) Read from BigQuery or from CSV.
    train_data = pipeline | 'ReadTrainingData' >> _ReadData(training_data)
    evaluate_data = pipeline | 'ReadEvalData' >> _ReadData(eval_data)

    input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)

    _ = (input_metadata
         | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
             os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
             pipeline=pipeline))

    preprocessing_fn = reddit.make_preprocessing_fn(frequency_threshold)
    transform_fn = ((train_data, input_metadata)
                    | 'Analyze' >> tft.AnalyzeDataset(preprocessing_fn))

    # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
    # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
    # path_constants.TRANSFORMED_METADATA_DIR.
    _ = (transform_fn
         | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

    @beam.ptransform_fn
    def TransformAndWrite(pcoll, path):  # pylint: disable=invalid-name
        pcoll |= 'Shuffle' >> _Shuffle()  # pylint: disable=no-value-for-parameter
        (dataset, metadata) = (((pcoll, input_metadata), transform_fn)
                               | 'Transform' >> tft.TransformDataset())
        coder = coders.ExampleProtoCoder(metadata.schema)
        _ = (dataset
             | 'SerializeExamples' >> beam.Map(coder.encode)
             | 'WriteExamples' >> beam.io.WriteToTFRecord(
                 os.path.join(output_dir, path),
                 file_name_suffix='.tfrecord.gz'))

    _ = train_data | 'TransformAndWriteTraining' >> TransformAndWrite(  # pylint: disable=no-value-for-parameter
        path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX)

    _ = evaluate_data | 'TransformAndWriteEval' >> TransformAndWrite(  # pylint: disable=no-value-for-parameter
        path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX)

    # TODO(b/35300113) Remember to eventually also save the statistics.

    if predict_data:
        predict_mode = tf.contrib.learn.ModeKeys.INFER
        predict_schema = reddit.make_input_schema(mode=predict_mode)
        predict_coder = coders.ExampleProtoCoder(predict_schema)

        serialized_examples = (
            pipeline
            | 'ReadPredictData' >> _ReadData(predict_data, mode=predict_mode)
            # TODO(b/35194257) Obviate the need for this explicit
            # serialization.
            | 'EncodePredictData' >> beam.Map(predict_coder.encode))
        _ = (serialized_examples
             | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.tfrecord.gz'))
        _ = (serialized_examples
             | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
             | 'WritePredictDataAsText' >> beam.io.WriteToText(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.txt'))
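The predict-data branch above (and the Criteo example below) serializes examples with an _encode_as_b64_json helper that is not shown. A plausible definition, consistent with the JSON-with-base64 payload format used for online prediction requests (an assumption, not the original helper's source):

import base64
import json

def _encode_as_b64_json(serialized_example):
  # Wrap a serialized tf.Example in the {"b64": ...} form used for
  # base64-encoded binary inputs in JSON prediction requests.
  return json.dumps({'b64': base64.b64encode(serialized_example).decode('utf-8')})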
Example #26
        self._fn = fn

    def expand(self, pcoll):
        return pcoll | beam.ParDo(self._MapAndFilterErrorsDoFn(self._fn))


RAW_DATA_FEATURE_SPEC = dict([(name, tf.io.FixedLenFeature([], tf.string))
                              for name in CATEGORICAL_FEATURE_KEYS] +
                             [(name, tf.io.FixedLenFeature([], tf.float32))
                              for name in NUMERIC_FEATURE_KEYS] +
                             [(name, tf.io.VarLenFeature(tf.float32))
                              for name in OPTIONAL_NUMERIC_FEATURE_KEYS] +
                             [(LABEL_KEY,
                               tf.io.FixedLenFeature([], tf.string))])

RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(
    dataset_schema.from_feature_spec(RAW_DATA_FEATURE_SPEC))

# Constants used for training.  Note that the number of instances will be
# computed by tf.Transform in future versions, in which case it can be read from
# the metadata.  Similarly BUCKET_SIZES will not be needed as this information
# will be stored in the metadata for each of the columns.  The bucket size
# includes all listed categories in the dataset description as well as one extra
# for "?" which represents unknown.
TRAIN_BATCH_SIZE = 128
TRAIN_NUM_EPOCHS = 200
NUM_TRAIN_INSTANCES = 32561
NUM_TEST_INSTANCES = 16281

# Names of temp files
TRANSFORMED_TRAIN_DATA_FILEBASE = 'train_transformed'
TRANSFORMED_TEST_DATA_FILEBASE = 'test_transformed'
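RAW_DATA_FEATURE_SPEC above does double duty: it defines RAW_DATA_METADATA for tf.Transform, and the same spec can parse serialized tf.Example records, for instance when building a serving input receiver. A brief sketch of that second use (the placeholder name is illustrative):

import tensorflow as tf

# RAW_DATA_FEATURE_SPEC is the dict defined above.
serialized_examples = tf.compat.v1.placeholder(
    tf.string, shape=[None], name='input_example_tensor')
parsed_features = tf.io.parse_example(serialized_examples, RAW_DATA_FEATURE_SPEC)
# parsed_features maps each feature name to a dense or sparse tensor, matching
# the FixedLenFeature / VarLenFeature entries in the spec.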
Example #27
    def test_single_phase_mixed_analyzer_run_once(self):
        span_0_key = 'span-0'
        span_1_key = 'span-1'

        def preprocessing_fn(inputs):

            integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])

            _ = tft.bucketize(inputs['x'], 2, name='bucketize')

            return {
                'integerized_s':
                integerized_s,
                'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
                'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            }

        # Run AnalyzeAndTransform on some input data and compare with expected
        # output.
        input_data = [{'x': 12, 'y': 1, 's': 'c'}, {'x': 10, 'y': 1, 's': 'c'}]
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec({
                'x':
                tf.io.FixedLenFeature([], tf.float32),
                'y':
                tf.io.FixedLenFeature([], tf.float32),
                's':
                tf.io.FixedLenFeature([], tf.string),
            }))
        input_data_dict = {
            span_0_key: [{
                'x': -2,
                'y': 1,
                's': 'b',
            }, {
                'x': 4,
                'y': -4,
                's': 'b',
            }],
            span_1_key:
            input_data,
        }

        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            with beam.Pipeline() as p:

                flat_data = p | 'CreateInputData' >> beam.Create(
                    list(itertools.chain(*input_data_dict.values())))

                # TODO(b/37788560): Get these names programmatically.
                cache_dict = {
                    span_0_key: {
                        '__v0__CacheableCombineAccumulate--x_1-mean_and_var--':
                        p | 'CreateA' >> beam.Create([b'[2.0, 1.0, 9.0]']),
                        '__v0__CacheableCombineAccumulate--x-x--':
                        p | 'CreateB' >> beam.Create([b'[2.0, 4.0]']),
                        '__v0__CacheableCombineAccumulate--y_1-mean_and_var--':
                        p | 'CreateC' >> beam.Create([b'[2.0, -1.5, 6.25]']),
                        '__v0__CacheableCombineAccumulate--y-y--':
                        p | 'CreateD' >> beam.Create([b'[4.0, 1.0]']),
                    },
                    span_1_key: {},
                }

                transform_fn, cache_output = (
                    (flat_data, input_data_dict, cache_dict, input_metadata)
                    | 'Analyze' >>
                    (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
                _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
                    self._cache_dir)

                transformed_dataset = (
                    ((input_data_dict[span_1_key], input_metadata),
                     transform_fn)
                    | 'Transform' >> beam_impl.TransformDataset())

                dot_string = nodes.get_dot_graph(
                    [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
                self.WriteRenderedDotFile(dot_string)

                transformed_data, unused_transformed_metadata = transformed_dataset

                expected_transformed = [
                    {
                        'x_mean': 6.0,
                        'x_min': -2.0,
                        'y_mean': -0.25,
                        'y_min': -4.0,
                        'integerized_s': 0,
                    },
                    {
                        'x_mean': 6.0,
                        'x_min': -2.0,
                        'y_mean': -0.25,
                        'y_min': -4.0,
                        'integerized_s': 0,
                    },
                ]
                beam_test_util.assert_that(
                    transformed_data,
                    beam_test_util.equal_to(expected_transformed))

                transform_fn_dir = os.path.join(self.base_test_dir,
                                                'transform_fn')
                _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)
Example #28
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
    """Run pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline
    training_data: file paths to input csv files.
    eval_data: file paths to input csv files.
    predict_data: file paths to input csv files.
    output_dir: file path to where to write all the output files.
    frequency_threshold: frequency threshold to use for categorical values.
  """
    # 1) The schema can be either defined in-memory or read from a configuration
    #    file, in this case we are creating the schema in-memory.
    input_schema = criteo.make_input_schema()

    # 2) Configure the coder to map the source file column names to a dictionary
    #    of key -> tensor_proto with the appropriate type derived from the
    #    input_schema.
    coder = criteo.make_tsv_coder(input_schema)

    # 3) Read from text using the coder.
    train_data = (pipeline
                  | 'ReadTrainingData' >> beam.io.ReadFromText(training_data)
                  | 'ParseTrainingCsv' >> beam.Map(coder.decode))

    evaluate_data = (pipeline
                     | 'ReadEvalData' >> beam.io.ReadFromText(eval_data)
                     | 'ParseEvalCsv' >> beam.Map(coder.decode))

    input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)
    _ = (input_metadata
         | 'WriteInputMetadata' >> io.WriteMetadata(os.path.join(
             output_dir, path_constants.RAW_METADATA_DIR),
                                                    pipeline=pipeline))

    # TODO(b/33688220) should the transform functions take shuffle as an optional
    # argument?
    # TODO(b/33688275) Should the transform functions have more user friendly
    # names?
    work_dir = os.path.join(output_dir, path_constants.TEMP_DIR)
    preprocessing_fn = criteo.make_preprocessing_fn(frequency_threshold)
    (train_dataset, train_metadata), transform_fn = (
        (train_data, input_metadata)
        | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
            preprocessing_fn, work_dir))

    # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
    # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
    # path_constants.TRANSFORMED_METADATA_DIR.
    _ = (transform_fn | 'WriteTransformFn' >> io.WriteTransformFn(output_dir))

    # TODO(b/34231369) Remember to eventually also save the statistics.

    (evaluate_dataset,
     evaluate_metadata) = (((evaluate_data, input_metadata), transform_fn)
                           | 'TransformEval' >> tft.TransformDataset())

    train_coder = coders.ExampleProtoCoder(train_metadata.schema)
    _ = (train_dataset
         | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
         | 'WriteTraining' >>
         beam.io.WriteToTFRecord(os.path.join(
             output_dir, path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
                                 file_name_suffix='.tfrecord.gz'))

    evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
    _ = (evaluate_dataset
         | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
         | 'WriteEval' >>
         beam.io.WriteToTFRecord(os.path.join(
             output_dir, path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
                                 file_name_suffix='.tfrecord.gz'))

    if predict_data:
        predict_mode = tf.contrib.learn.ModeKeys.INFER
        predict_schema = criteo.make_input_schema(mode=predict_mode)
        tsv_coder = criteo.make_tsv_coder(predict_schema, mode=predict_mode)
        predict_coder = coders.ExampleProtoCoder(predict_schema)
        serialized_examples = (
            pipeline
            | 'ReadPredictData' >> beam.io.ReadFromText(predict_data)
            | 'ParsePredictCsv' >> beam.Map(tsv_coder.decode)
            # TODO(b/35194257) Obviate the need for this explicit serialization.
            | 'EncodePredictData' >> beam.Map(predict_coder.encode))
        _ = (serialized_examples
             | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.tfrecord.gz'))
        _ = (serialized_examples
             | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
             | 'WritePredictDataAsText' >> beam.io.WriteToText(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.txt'))
Example #29
    def test_caching_vocab_for_integer_categorical(self):

        span_0_key = 'span-0'
        span_1_key = 'span-1'

        def preprocessing_fn(inputs):
            return {
                'x_vocab':
                tft.compute_and_apply_vocabulary(inputs['x'],
                                                 frequency_threshold=2)
            }

        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec({
                'x':
                tf.FixedLenFeature([], tf.int64),
            }))
        input_data_dict = {
            span_0_key: [{
                'x': -2,
            }, {
                'x': -4,
            }, {
                'x': -1,
            }, {
                'x': 4,
            }],
            span_1_key: [{
                'x': -2,
            }, {
                'x': -1,
            }, {
                'x': 6,
            }, {
                'x': 7,
            }],
        }
        expected_transformed_data = [{
            'x_vocab': 0,
        }, {
            'x_vocab': 1,
        }, {
            'x_vocab': -1,
        }, {
            'x_vocab': -1,
        }]
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            with beam.Pipeline() as p:

                flat_data = p | 'CreateInputData' >> beam.Create(
                    list(itertools.chain(*input_data_dict.values())))

                # TODO(b/37788560): Get these names programmatically.
                cache_dict = {
                    span_0_key: {
                        '__v0__VocabularyAccumulate--compute_and_apply_vocabulary-vocabulary--':
                        p | 'CreateB' >> beam.Create(
                            [b'[-2, 2]', b'[-4, 1]', b'[-1, 1]', b'[4, 1]']),
                    },
                    span_1_key: {},
                }

                transform_fn, cache_output = (
                    (flat_data, input_data_dict, cache_dict, input_metadata)
                    | 'Analyze' >>
                    (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
                _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
                    self._cache_dir)

                transformed_dataset = (
                    ((input_data_dict[span_1_key], input_metadata),
                     transform_fn)
                    | 'Transform' >> beam_impl.TransformDataset())

                transformed_data, _ = transformed_dataset

                beam_test_util.assert_that(
                    transformed_data,
                    beam_test_util.equal_to(expected_transformed_data),
                    label='first')
Example #30
def preprocess(p, output_dir, check_path, data_size, bq_table, split_data_path,
               project_id):
    """Main processing pipeline reading, processing and storing processed data.

  Performs the following operations:
    - reads data from BigQuery
    - adds hash key value to each row
    - scales data
    - shuffles and splits data in train / validation / test sets
    - oversamples train data
    - stores data as TFRecord
    - splits and stores test data into labels and features files

  Args:
    p: PCollection, initial pipeline.
    output_dir: string, path to directory to store output.
    check_path: string, path to directory to store data checks.
    data_size: tuple of float, ratio of data going respectively to train,
      validation and test sets.
    bq_table: string, name of table to read data from.
    split_data_path: string, path to directory to store train, validation and
      test raw datasets.
    project_id: string, GCP project id.

  Raises:
    ValueError: No test dataset found in pipeline output.
  """

    train_size, validation_size, test_size = data_size

    data = (p |
            'ReadData' >> read_data(bq_table=bq_table, project_id=project_id))

    _ = data | 'StoreData' >> beam.io.WriteToText(
        posixpath.join(output_dir, check_path, 'processed_data.txt'))

    split_data = (
        data |
        'RandomlySplitData' >> randomly_split(train_size=train_size,
                                              validation_size=validation_size,
                                              test_size=test_size))

    for k in split_data:
        split_data[k] |= 'AddHash_{}'.format(k.name) >> beam.ParDo(
            AddHash(),
            label_column=constants.LABEL_COLUMN,
            key_column=constants.KEY_COLUMN,
            dtype=k)

    # Splits test data into features pipeline and labels pipeline.
    if DatasetType.TEST not in split_data:
        raise ValueError('No test dataset found in pipeline output.')
    test_data = (split_data.pop(DatasetType.TEST)
                 | 'SplitFeaturesLabels' >> split_features_labels(
                     constants.LABEL_COLUMN, constants.KEY_COLUMN))

    # Stores test data features and labels pipeline separately.
    for k in test_data:
        _ = (test_data[k]
             | 'ParseJsonToString_{}'.format(k) >> beam.Map(json.dumps)
             | 'StoreSplitData_{}'.format(k) >> beam.io.WriteToText(
                 posixpath.join(
                     output_dir, split_data_path,
                     'split_data_{}_{}.txt'.format(DatasetType.TEST.name, k))))

    meta_data = dataset_metadata.DatasetMetadata(make_input_schema())

    transform_fn = (
        (split_data[DatasetType.TRAIN], meta_data)
        | 'AnalyzeTrainDataset' >> beam_impl.AnalyzeDataset(preprocessing_fn))

    _ = (transform_fn
         | 'WriteTransformFn' >> tft.beam.tft_beam_io.WriteTransformFn(
             posixpath.join(output_dir, constants.PATH_INPUT_TRANSFORMATION)))
    _ = (meta_data
         | 'WriteInputMetadata' >> tft.beam.tft_beam_io.WriteMetadata(
             posixpath.join(output_dir, constants.PATH_INPUT_SCHEMA),
             pipeline=p))

    transformed_metadata, transformed_data = {}, {}
    for k in [DatasetType.TRAIN, DatasetType.VAL]:
        transformed_data[k], transformed_metadata[k] = (
            ((split_data[k], meta_data), transform_fn)
            | 'Transform{}'.format(k) >> beam_impl.TransformDataset())

    transformed_data[DatasetType.TRAIN] = (
        transformed_data[DatasetType.TRAIN]
        | 'OverSampleTraining' >> oversampling())

    for k in transformed_data:
        _ = (transformed_data[k]
             | 'ShuffleData{}'.format(k) >> shuffle_data()
             | 'StoreData{}'.format(k) >> store_transformed_data(
                 schema=transformed_metadata[k],
                 path=posixpath.join(output_dir,
                                     constants.PATH_TRANSFORMED_DATA_SPLIT[k]),
                 name=DatasetType(k).name))

    for k in transformed_data:
        _ = (transformed_data[k] | 'CheckSize{}'.format(k.name) >> check_size(
            name=DatasetType(k).name,
            path=posixpath.join(output_dir, check_path, k.name)))