Example #1
  def testCreatePhasesWithUnwrappedLoop(self):
    # Test a preprocessing function with control flow.
    #
    # The loop represents
    #
    # i = 0
    # while i < 10:
    #   i += 1
    #   x -= 1
    #
    # We need to call an analyzer after the loop because create_phases only
    # inspects the transitive parents of analyzers.
    def preprocessing_fn(inputs):
      def _subtract_ten(x):
        i = tf.constant(0)
        c = lambda i, x: tf.less(i, 10)
        b = lambda i, x: (tf.add(i, 1), tf.add(x, -1))
        return tf.while_loop(c, b, [i, x])[1]
      scaled_to_0_1 = mappers.scale_to_0_1(_subtract_ten(inputs['x']))
      return {'x_scaled': scaled_to_0_1}

    input_schema = sch.Schema({
        'x': sch.ColumnSchema(tf.int32, [], sch.FixedColumnRepresentation())
    })
    graph, _, _ = impl_helper.run_preprocessing_fn(
        preprocessing_fn, input_schema)
    with self.assertRaisesRegexp(ValueError, 'Cycle detected'):
      _ = impl_helper.create_phases(graph)
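
For reference, here is a minimal standalone sketch (assuming TensorFlow 1.x, as in the examples) confirming that the while_loop above computes x - 10: ten iterations, each subtracting one.

import tensorflow as tf

x = tf.constant(15, tf.int32)
i = tf.constant(0)
c = lambda i, x: tf.less(i, 10)
b = lambda i, x: (tf.add(i, 1), tf.add(x, -1))
_, result = tf.while_loop(c, b, [i, x])

with tf.Session() as sess:
  print(sess.run(result))  # 15 - 10 = 5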
Example #2
  def testRunPreprocessingFn(self):
    schema = self.toSchema({
        'dense_1': tf.FixedLenFeature((), tf.float32),
        'dense_2': tf.FixedLenFeature((1, 2), tf.int64),
        'var_len': tf.VarLenFeature(tf.string),
        'sparse': tf.SparseFeature('ix', 'val', tf.float32, 100)
    })
    def preprocessing_fn(inputs):
      return {
          'dense_out': mappers.scale_to_0_1(inputs['dense_1']),
          'sparse_out': tf.sparse_reshape(inputs['sparse'], (1, 10)),
      }

    _, inputs, outputs = impl_helper.run_preprocessing_fn(
        preprocessing_fn, schema)

    # Verify that the inputs and outputs have the correct dtypes and shapes.
    expected_dtype_and_shape = {
        'dense_1': (tf.float32, tf.TensorShape([None])),
        'dense_2': (tf.int64, tf.TensorShape([None, 1, 2])),
        'var_len': (tf.string, tf.TensorShape([None, None])),
        'sparse': (tf.float32, tf.TensorShape([None, None])),
        'dense_out': (tf.float32, tf.TensorShape([None])),
        'sparse_out': (tf.float32, tf.TensorShape([None, None])),
    }

    for key, tensor in itertools.chain(six.iteritems(inputs),
                                       six.iteritems(outputs)):
      dtype, shape = expected_dtype_and_shape[key]
      self.assertEqual(tensor.dtype, dtype)
      tensor.get_shape().assert_is_compatible_with(shape)
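
As a side note, the shape assertions above rely on TensorFlow's shape-compatibility rules: assert_is_compatible_with passes whenever some concrete shape satisfies both specs. A quick sketch (plain TensorFlow 1.x, no tf.Transform required):

import tensorflow as tf

batch = tf.TensorShape([None])           # unknown batch size
batch.assert_is_compatible_with([5])     # OK: None matches any size
print(batch.is_compatible_with([5, 2]))  # False: ranks differ
tf.TensorShape(None).assert_is_compatible_with([3, 7])  # unknown rank matches anything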
Example #3
  def testRunTransformFn(self):
    schema = self.toSchema({
        'dense_1': tf.FixedLenFeature((), tf.float32),
        'dense_2': tf.FixedLenFeature((1, 2), tf.int64),
        'var_len': tf.VarLenFeature(tf.string),
        'sparse': tf.SparseFeature('ix', 'val', tf.float32, 100)
    })
    def preprocessing_fn(inputs):
      return {
          'dense_out': mappers.scale_to_0_1(inputs['dense_1']),
          'sparse_out': api.map(lambda x: tf.sparse_reshape(x, (1, 10)),
                                inputs['sparse'])
      }

    inputs, outputs = impl_helper.run_preprocessing_fn(
        preprocessing_fn, schema)

    # Verify that the inputs and outputs have the correct dtypes and shapes.
    expected_dtype_and_shape = {
        'dense_1': (tf.float32, tf.TensorShape([None])),
        'dense_2': (tf.int64, tf.TensorShape([None, 1, 2])),
        'var_len': (tf.string, tf.TensorShape(None)),
        'sparse': (tf.float32, tf.TensorShape(None)),
        'dense_out': (tf.float32, tf.TensorShape([None])),
        'sparse_out': (tf.float32, tf.TensorShape([None, None])),
    }

    for key, column in itertools.chain(inputs.items(), outputs.items()):
      dtype, shape = expected_dtype_and_shape[key]
      self.assertEqual(column.tensor.dtype, dtype)
      self.assertShapesEqual(column.tensor.get_shape(), shape)
Example #4
  def testCreatePhasesWithLoop(self):
    # Test a preprocessing function with control flow.
    #
    # The loop represents
    #
    # i = 0
    # while i < 10:
    #   i += 1
    #   x -= 1
    #
    # To get an error in the case where apply_function is not called, an
    # analyzer must be called after the loop (see
    # testCreatePhasesWithUnwrappedLoop), so we do the same here.
    def preprocessing_fn(inputs):
      def _subtract_ten(x):
        i = tf.constant(0)
        c = lambda i, x: tf.less(i, 10)
        b = lambda i, x: (tf.add(i, 1), tf.add(x, -1))
        return tf.while_loop(c, b, [i, x])[1]
      scaled_to_0_1 = mappers.scale_to_0_1(
          api.apply_function(_subtract_ten, inputs['x']))
      return {'x_scaled': scaled_to_0_1}

    input_schema = sch.Schema({
        'x': sch.ColumnSchema(tf.int32, [], sch.FixedColumnRepresentation())
    })
    graph, _, _ = impl_helper.run_preprocessing_fn(
        preprocessing_fn, input_schema)
    phases = impl_helper.create_phases(graph)
    self.assertEqual(len(phases), 1)
    self.assertEqual(len(phases[0].analyzers), 2)
Example #5
  def testRunTransformFnBadTransform(self):
    schema = self.toSchema({
        'x': tf.FixedLenFeature((3,), tf.float32),
    })
    def preprocessing_fn(inputs):
      return {
          'x_sum': api.map(tf.reduce_sum, inputs['x']),
      }

    # Verify that we raise if preprocessing_fn outputs a tensor with rank 0.
    with self.assertRaises(ValueError):
      _ = impl_helper.run_preprocessing_fn(preprocessing_fn, schema)
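
The failure mode is easy to reproduce in plain TensorFlow (1.x assumed): tf.reduce_sum with no axis argument collapses every dimension, including the batch dimension, leaving a rank-0 tensor that cannot be mapped back to per-instance rows.

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 3])
print(tf.reduce_sum(x).get_shape())          # () -- rank 0, batch dimension lost
print(tf.reduce_sum(x, axis=1).get_shape())  # (?,) -- one sum per instance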
Example #6
    def testCreatePhasesWithDegenerateFunctionApplication(self):
        # Tests the case of a function whose inputs and outputs overlap.
        def preprocessing_fn(inputs):
            return {'index': api.apply_function(lambda x: x, inputs['a'])}

        input_schema = sch.Schema({
            'a':
            sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
        })
        _, _ = impl_helper.run_preprocessing_fn(preprocessing_fn, input_schema)
        phases = impl_helper.create_phases()
        self.assertEqual(len(phases), 0)
Example #7
    def testImportAndExportWithTensorValueMapping(self):
        # Export the function "z = x * min(y) + x + min(y)" with min(y) replaced by
        # 6.
        def preprocessing_fn(inputs):
            return {
                'z':
                api.map(lambda x, y: x * y + x + y, inputs['x'],
                        analyzers.min(inputs['y']))
            }

        input_schema = self.toSchema({
            'x': tf.FixedLenFeature((), tf.float32),
            'y': tf.FixedLenFeature((), tf.float32)
        })

        inputs, outputs = impl_helper.run_preprocessing_fn(
            preprocessing_fn, input_schema)
        saved_model_dir = os.path.join(self.get_temp_dir(), 'replace_original')
        input_columns_to_statistics = impl_helper.make_transform_fn_def(
            input_schema, inputs, outputs, saved_model_dir)
        self.assertEqual(len(input_columns_to_statistics), 1)
        y_min_input_name = list(input_columns_to_statistics.keys())[0]

        g = tf.Graph()
        with g.as_default():
            x = tf.placeholder(tf.float32, ())
            y = tf.placeholder(tf.float32, ())
            z = x * y + x + y
        new_saved_model_dir = os.path.join(self.get_temp_dir(), 'replace_new')
        impl_helper.replace_tensors_with_constant_values(
            saved_model_dir, new_saved_model_dir, {
                y_min_input_name:
                impl_helper.ConstantTensorValue(6, tf.float32, ())
            })

        # Import the function, applying it to constants for x and y.
        g = tf.Graph()
        with g.as_default():
            x = tf.constant(5, tf.float32, (1, ))
            y = tf.constant(1000, tf.float32, (1, ))  #  Value is never used.
            outputs = saved_transform_io.apply_saved_transform(
                new_saved_model_dir, {
                    'x': x,
                    'y': y
                })
            z = outputs['z']

            sess = tf.Session()
            with sess.as_default():
                # Check result is 5 * 6 + 5 + 6 = 41.
                self.assertEqual(41, z.eval())
Example #8
    def testImportAndExportDense(self):
        # Export the function "z = x * y + x + y"
        def preprocessing_fn(inputs):
            return {
                'z': api.map(lambda x, y: x * y + x + y, inputs['x'],
                             inputs['y'])
            }

        input_schema = self.toSchema({
            'x': tf.FixedLenFeature((), tf.float32),
            'y': tf.FixedLenFeature((), tf.float32)
        })

        inputs, outputs = impl_helper.run_preprocessing_fn(
            preprocessing_fn, input_schema)
        saved_model_dir = os.path.join(self.get_temp_dir(), 'dense')
        _ = impl_helper.make_transform_fn_def(input_schema, inputs, outputs,
                                              saved_model_dir)

        # Import the function, applying it to constants for x and y.
        g = tf.Graph()
        with g.as_default():
            x = tf.constant(5, tf.float32, (1, ))
            y = tf.constant(6, tf.float32, (1, ))
            outputs = saved_transform_io.apply_saved_transform(
                saved_model_dir, {
                    'x': x,
                    'y': y
                })
            z = outputs['z']

            sess = tf.Session()
            with sess.as_default():
                # Check result is 5 * 6 + 5 + 6 = 41.
                self.assertEqual(41, z.eval())

        # Import the graph, feeding it values for x and y.
        g = tf.Graph()
        with g.as_default():
            inputs, outputs = impl_helper.load_transform_fn_def(
                saved_model_dir)
            x = inputs['x']
            y = inputs['y']
            z = outputs['z']

            sess = tf.Session()
            with sess.as_default():
                # Check result is 5 * 6 + 5 + 6 = 41.
                self.assertEqual(41, sess.run(z, {x: [5], y: [6]}))
Example #9
  def testCreatePhasesWithUnwrappedTable(self):
    # Test a preprocessing function with a table that is not wrapped in
    # `apply_function`.
    def preprocessing_fn(inputs):
      table = lookup.index_table_from_tensor(['a', 'b'])
      integerized = table.lookup(inputs['x'])
      return {'integerized': integerized}

    input_schema = sch.Schema({
        'x': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
    })
    graph, _, _ = impl_helper.run_preprocessing_fn(
        preprocessing_fn, input_schema)
    with self.assertRaisesRegexp(ValueError, 'Found table initializers'):
      _ = impl_helper.create_phases(graph)
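
For comparison, here is what a bare table lookup looks like in TensorFlow 1.x. Creating the table registers an op in the TABLE_INITIALIZERS collection, which is exactly what create_phases detects when the lookup is not wrapped in apply_function.

import tensorflow as tf
from tensorflow.contrib import lookup

table = lookup.index_table_from_tensor(['a', 'b'])
ids = table.lookup(tf.constant(['b', 'a', 'c']))
with tf.Session() as sess:
  sess.run(tf.tables_initializer())
  print(sess.run(ids))  # [ 1  0 -1] -- 'c' is out of vocabulary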
Example #10
  def testCreatePhasesWithMultipleLevelsOfAnalyzers(self):
    # Test a preprocessing function similar to scale_to_0_1 except that it
    # involves multiple interleavings of analyzers and transforms.
    def preprocessing_fn(inputs):
      scaled_to_0 = inputs['x'] - analyzers.min(inputs['x'])
      scaled_to_0_1 = scaled_to_0 / analyzers.max(scaled_to_0)
      return {'x_scaled': scaled_to_0_1}

    input_schema = sch.Schema({
        'x': sch.ColumnSchema(tf.float32, [], sch.FixedColumnRepresentation())
    })
    graph, _, _ = impl_helper.run_preprocessing_fn(
        preprocessing_fn, input_schema)
    phases = impl_helper.create_phases(graph)
    self.assertEqual(len(phases), 2)
    self.assertEqual(len(phases[0].analyzers), 1)
    self.assertEqual(len(phases[1].analyzers), 1)
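
A pure-NumPy sketch of why this requires two phases: the max analyzer's input depends on the min analyzer's output, so each analyzer needs its own full pass over the data.

import numpy as np

x = np.array([1.0, 3.0, 5.0])
x_min = x.min()             # phase 0: analyzer over the raw input
scaled_to_0 = x - x_min
x_max = scaled_to_0.max()   # phase 1: analyzer over a phase-0 result
print(scaled_to_0 / x_max)  # [0.  0.5 1. ]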
Example #11
    def testImportAndExportSparse(self):
        # Export the function "z = x + y"
        def preprocessing_fn(inputs):
            return {'z': api.map(tf.sparse_add, inputs['x'], inputs['y'])}

        input_schema = self.toSchema({
            'x': tf.VarLenFeature(tf.float32),
            'y': tf.VarLenFeature(tf.float32)
        })

        inputs, outputs = impl_helper.run_preprocessing_fn(
            preprocessing_fn, input_schema)
        saved_model_dir = os.path.join(self.get_temp_dir(), 'sparse')
        _ = impl_helper.make_transform_fn_def(input_schema, inputs, outputs,
                                              saved_model_dir)

        # Import the function, applying it to constants for x and y.
        g = tf.Graph()
        with g.as_default():
            x = tf.SparseTensor(indices=[[0]],
                                values=tf.constant(5,
                                                   shape=(1, ),
                                                   dtype=tf.float32),
                                dense_shape=[1])
            y = tf.SparseTensor(indices=[[0]],
                                values=tf.constant(6,
                                                   shape=(1, ),
                                                   dtype=tf.float32),
                                dense_shape=[1])
            outputs = saved_transform_io.apply_saved_transform(
                saved_model_dir, {
                    'x': x,
                    'y': y
                })
            z = outputs['z']

            sess = tf.Session()
            with sess.as_default():
                # Check result is 5 + 6 = 11.
                result = z.eval()
                self.assertEqual(result.indices, [[0]])
                self.assertEqual(result.values, [11])
                self.assertEqual(result.dense_shape, [1])
Example #12
  def testCreatePhasesWithTable(self):
    # Test a preprocessing function with a table that can only be run after
    # the first analyzer has run.  Note that converting an integerized string
    # into a float doesn't make much sense, but it is a legal TensorFlow
    # computation.
    def preprocessing_fn(inputs):
      integerized = mappers.string_to_int(inputs['x'])
      integerized = tf.to_float(integerized)
      scaled_to_0_1 = integerized / analyzers.max(integerized)
      return {'x_scaled': scaled_to_0_1}

    input_schema = sch.Schema({
        'x': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
    })
    graph, _, _ = impl_helper.run_preprocessing_fn(
        preprocessing_fn, input_schema)
    phases = impl_helper.create_phases(graph)
    self.assertEqual(len(phases), 2)
    self.assertEqual(len(phases[0].analyzers), 1)
    self.assertEqual(len(phases[1].analyzers), 1)
    self.assertEqual(len(phases[0].table_initializers), 0)
    self.assertEqual(len(phases[1].table_initializers), 1)
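
A pure-Python sketch of the two passes implied here (the real string_to_int builds its vocabulary by frequency; sorting below is just for determinism): pass one computes the vocabulary, and only then can the lookup table be initialized, which is why the table initializer lands in the second phase.

data = ['b', 'a', 'b', 'c']
vocab = sorted(set(data))                              # phase 0: vocabulary analyzer
table = {term: idx for idx, term in enumerate(vocab)}  # phase 1: table initialization
print([table[t] for t in data])                        # [1, 0, 1, 2]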
Example #13
def make_transform_graph(output_dir, schema, features):
  """Writes a tft transform fn, and metadata files.

  Args:
    output_dir: output folder
    schema: schema list
    features: features dict
  """

  tft_input_schema = make_tft_input_schema(schema, os.path.join(output_dir,
                                                                STATS_FILE))
  tft_input_metadata = dataset_metadata.DatasetMetadata(schema=tft_input_schema)
  preprocessing_fn = make_preprocessing_fn(output_dir, features)

  # Copied from tft/beam/impl.
  inputs, outputs = impl_helper.run_preprocessing_fn(
      preprocessing_fn=preprocessing_fn,
      schema=tft_input_schema)
  output_metadata = dataset_metadata.DatasetMetadata(
      schema=impl_helper.infer_feature_schema(outputs))

  transform_fn_dir = os.path.join(output_dir, TRANSFORM_FN_DIR)

  # This writes the SavedModel
  impl_helper.make_transform_fn_def(
      schema=tft_input_schema,
      inputs=inputs,
      outputs=outputs,
      saved_model_dir=transform_fn_dir)

  metadata_io.write_metadata(
      metadata=output_metadata,
      path=os.path.join(output_dir, TRANSFORMED_METADATA_DIR))
  metadata_io.write_metadata(
      metadata=tft_input_metadata,
      path=os.path.join(output_dir, RAW_METADATA_DIR))
Example #14
  def expand(self, dataset):
    """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.
    """

    input_values, input_metadata = dataset
    input_schema = input_metadata.schema

    base_temp_dir = Context.create_base_temp_dir()

    # NOTE: it's important that create_phases is called directly after
    # run_preprocessing_fn, because we later mutate the graph's
    # TABLE_INITIALIZERS collection which would break the logic in
    # create_phases.
    graph, inputs, outputs = impl_helper.run_preprocessing_fn(
        self._preprocessing_fn, input_schema)
    phases = impl_helper.create_phases(graph)

    # Iterate through levels.  tensor_pcoll_mapping is a mapping from tensor
    # names to singleton PCollections containing a _TensorValue.  We compute
    # tensor_pcoll_mapping in phases, where at each phase we compute the
    # analyzers that are ready to run and update tensor_pcoll_mapping.
    tensor_pcoll_mapping = {}
    table_initializers = graph.get_collection_ref(
        tf.GraphKeys.TABLE_INITIALIZERS)
    original_table_initializers = list(table_initializers)
    del table_initializers[:]

    serialized_tf_config = (
        analyzer_impls._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
            input_values.pipeline.runner))
    for level, phase in enumerate(phases):
      # Create a SavedModel that describes the mapping from the input data
      # to the inputs of the analyzers at this level.  The column names of the
      # outputs are the tensor names of the analyzer inputs in the graph.  This
      # graph has the analyzer outputs computed so far replaced with constants.
      analyzer_inputs = {}
      for analyzer in phase.analyzers:
        for input_tensor in analyzer.inputs:
          analyzer_inputs[input_tensor.name] = input_tensor
      table_initializers.extend(phase.table_initializers)
      unbound_saved_model_dir = _make_unique_temp_dir(base_temp_dir)
      _write_saved_transform(
          graph, inputs, analyzer_inputs, unbound_saved_model_dir)
      saved_model_dir = (
          tensor_pcoll_mapping
          | 'CreateSavedModelForAnalyzerInputs[%d]' % level
          >> _ReplaceTensorsWithConstants(
              unbound_saved_model_dir, base_temp_dir, input_values.pipeline))

      # Run this saved model on the input dataset to obtain the inputs to the
      # analyzers.
      analyzer_input_values = (
          input_values
          | 'ComputeAnalyzerInputs[%d]' % level >> beam.ParDo(
              _RunMetaGraphDoFn(
                  input_schema,
                  serialized_tf_config,
                  shared_graph_state_handle=shared.Shared()),
              saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir)))

      # Compute the analyzers from their inputs.  `analyzer_outputs_dict` is a
      # map from tensor names to singleton PCollections of `_TensorValue`s.
      analyzer_outputs_dict = (
          analyzer_input_values
          | 'ComputeAnalyzerOutputs[%d]' % level
          >> _ComputeAnalyzerOutputs(phase.analyzers, base_temp_dir))

      # Update the mapping for all analyzers.
      tensor_pcoll_mapping.update(analyzer_outputs_dict)

    del table_initializers[:]
    table_initializers.extend(original_table_initializers)
    saved_model_dir = _make_unique_temp_dir(base_temp_dir)
    _write_saved_transform(graph, inputs, outputs, saved_model_dir)
    transform_fn = (
        tensor_pcoll_mapping
        | 'ReplaceTensorsWithConstants'
        >> _ReplaceTensorsWithConstants(
            saved_model_dir, base_temp_dir, input_values.pipeline))

    # Infer metadata.  The metadata may contain Futures that refer to the values
    # of tensors in the graph.  In that case, the tensors must be "constant" in
    # that they don't depend on input data.  The tensors can depend on analyzer
    # outputs though.  This allows us to set metadata that depends on analyzer
    # outputs.
    #
    # We first extract the names of the tensors that are referenced by the
    # Futures, and then compute them by calling _ComputeScalarConstants with the
    # tensor-PCollection mapping representing the analyzer outputs.
    metadata = dataset_metadata.DatasetMetadata(
        schema=impl_helper.infer_feature_schema(graph, outputs))

    deferred_metadata_tensor_names = [
        future.name
        for column_schema in tft_api.get_column_schemas(graph).values()
        for future in column_schema.substitute_futures({})]
    name_pcoll_dict = (
        tensor_pcoll_mapping
        | 'ComputeTensorValues' >>
        _ComputeTensorValues(
            deferred_metadata_tensor_names, saved_model_dir,
            input_values.pipeline))
    full_metadata = beam_metadata_io.BeamDatasetMetadata(
        metadata, name_pcoll_dict)

    _clear_shared_state_after_barrier(input_values.pipeline, transform_fn)

    return transform_fn, full_metadata
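
The save/restore dance on TABLE_INITIALIZERS above works because get_collection_ref returns the live list, so mutating it changes what later exports of the graph see. A minimal sketch (TensorFlow 1.x assumed):

import tensorflow as tf

graph = tf.Graph()
with graph.as_default():
  tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, tf.no_op(name='init_a'))
  initializers = graph.get_collection_ref(tf.GraphKeys.TABLE_INITIALIZERS)
  original = list(initializers)
  del initializers[:]            # exports during the phase loop see no initializers
  print(graph.get_collection(tf.GraphKeys.TABLE_INITIALIZERS))       # []
  initializers.extend(original)  # restore before the final export
  print(len(graph.get_collection(tf.GraphKeys.TABLE_INITIALIZERS)))  # 1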
Example #15
    def expand(self, dataset):
        """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.
    """

        input_values, input_metadata = dataset
        input_schema = input_metadata.schema
        input_batches = input_values | 'BatchInstances' >> beam.ParDo(
            _BatchDoFn())

        class _CreateTransformFn(beam.PTransform):
            """Create a TransformFnDef, binding statistics in a deferred manner.

      This function constructs a tensorflow graph eagerly and then (in a
      deferred manner) fills in analyzer outputs with their actual computed
      values. We construct the tensorflow graph up front because that implies
      serializing MetaGraphDef protos rather than pickling the user-defined TITO
      functions. The graph contains placeholders for `_AnalyzerOutput`s which
      are then replaced with their actual values (as constant tensors) in a
      deferred manner.

      Args:
        input_columns: A map from column names to `Column`s.
        output_columns: A map from column names to `Column`s.
        temp_dir: Temp dir to store `SavedModel`s.
      """
            def __init__(self, input_columns, output_columns, temp_dir):
                # Generally the pipeline is inferred from its inputs; however,
                # we need to know the pipeline for beam.Create.
                self.pipeline = input_values.pipeline
                self._input_columns = input_columns
                self._output_columns = output_columns
                self._temp_dir = temp_dir

            def expand(self, analyzer_outputs_to_pcoll):
                """Converts a dict of statistics to a transform function.

        Args:
          analyzer_outputs_to_pcoll: A dictionary mapping `_AnalyzerOutput`s
              to the values of these statistics as a PCollection.

        Returns:
          A single-element PCollection containing the directory name with the
              SavedModel.
        """
                # Create a transform_fn with unbound values.

                unbound_transform_fn_dir = os.path.join(
                    self._temp_dir, 'unbound_transform_fn')
                input_columns_to_statistics = impl_helper.make_transform_fn_def(
                    input_schema, self._input_columns, self._output_columns,
                    unbound_transform_fn_dir)

                transform_fn = (self.pipeline | 'CreateTransformFn' >>
                                beam.Create([unbound_transform_fn_dir]))

                if not analyzer_outputs_to_pcoll:
                    return transform_fn

                # Convert the statistics dict into a DictPCollectionView so it can be
                # passed as a side input to the beam Map below.
                tagged_statistics = []
                for tag, statistic in input_columns_to_statistics.items():
                    pcoll = analyzer_outputs_to_pcoll[statistic]
                    tagged_statistics.append(
                        pcoll
                        | 'AddTag[%s]' % tag >> beam.Map(lambda x, tag=tag:
                                                         (tag, x)))

                statistics_side_input = beam.pvalue.AsDict(
                    tagged_statistics | 'MergeStatistics' >> beam.Flatten())

                # Run a mapper that inserts statistic values into the graph.
                return (transform_fn
                        | 'ReplaceTensorsWithConstantValues' >> beam.Map(
                            impl_helper.replace_tensors_with_constant_values,
                            bound_saved_model_dir=os.path.join(
                                self._temp_dir, 'transform_fn'),
                            input_value_mapping=statistics_side_input))

        inputs, outputs = impl_helper.run_preprocessing_fn(
            self._preprocessing_fn, input_schema)

        # Get a list of lists, containing analyzers (i.e. _AnalyzerOutput objects)
        # by level in the DAG of Columns/Statistics. Analyzers at level n are ready
        # to run once all analyzers at level n - 1 are complete.
        analyzers_by_level = self._analyzers_by_level(outputs)

        # Iterate through levels, keeping track of analyzer outputs (i.e.
        # statistics) via a mapping of `_AnalyzerOutput` -> single element
        # PCollection.
        analyzer_outputs_to_pcoll = {}
        for level, analyzer_outputs in enumerate(analyzers_by_level):
            # Create a TransformFnDef representing the graph needed to generate
            # all the inputs required by the analyzer_outputs at this level.  We
            # assign arbitrary names to the outputs of this TransformFnDef.
            analyzer_input_columns = {}
            for idx, analyzer_output in enumerate(analyzer_outputs):
                if len(analyzer_output.inputs) != 1:
                    raise NotImplementedError(
                        'Analyzers must have exactly one input')
                analyzer_input_key = 'analyzer_%d_input' % idx
                analyzer_input_columns[
                    analyzer_input_key] = analyzer_output.inputs[0]

            transform_fn = (
                analyzer_outputs_to_pcoll
                | 'CreateTransformFn_%d' % level >> _CreateTransformFn(
                    inputs, analyzer_input_columns,
                    os.path.join(self._output_dir, 'tmp', 'level_%s' % level)))
            analyzer_input_schema = impl_helper.infer_feature_schema(
                analyzer_input_columns)

            # Run the TransformFnDef in a mapper.
            analysis_inputs = (
                input_batches
                | 'ComputeAnalyzerInputs_%d' % level >> beam.ParDo(
                    _RunMetaGraphDoFn(input_schema, analyzer_input_schema),
                    saved_model_dir=beam.pvalue.AsSingleton(transform_fn)))

            # For each analyzer output, look up its input values (by tensor
            # name) and run the analyzer on these values.
            for idx, analyzer_output in enumerate(analyzer_outputs):
                analyzer_input_key = 'analyzer_%d_input' % idx
                analyzer_outputs_to_pcoll[analyzer_output] = (
                    analysis_inputs
                    | 'Extract_%d_%d' % (level, idx) >> beam.Map(
                        # pylint: disable=cell-var-from-loop
                        # This lint warning is prone to false positives, and it's not
                        # clear why the warning is required here.
                        lambda x, key=analyzer_input_key:
                        [inst[key] for inst in x])
                    | 'Analyze_%d_%d' %
                    (level, idx) >> self._Analyze(analyzer_output))

        output_metadata = dataset_metadata.DatasetMetadata(
            schema=impl_helper.infer_feature_schema(outputs))
        transform_fn = (analyzer_outputs_to_pcoll
                        | 'CreateTransformFn' >> _CreateTransformFn(
                            inputs, outputs, self._output_dir))

        return transform_fn, output_metadata
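
The lambda x, tag=tag: (tag, x) pattern in 'AddTag' above is worth a note: default arguments are evaluated at definition time, which pins the loop variable for each iteration, whereas a plain closure would see only the final value. A short demonstration:

fns = [lambda x, tag=tag: (tag, x) for tag in ('a', 'b')]
print([f(1) for f in fns])   # [('a', 1), ('b', 1)]

late = [lambda x: (tag, x) for tag in ('a', 'b')]
print([f(1) for f in late])  # [('b', 1), ('b', 1)] -- late binding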
Example #16
    def expand(self, dataset):
        """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.
    """

        input_values, input_metadata = dataset
        input_schema = input_metadata.schema

        base_temp_dir = Context.create_base_temp_dir()

        class _ReplaceTensorsWithConstants(beam.PTransform):
            """Bind statistics in a deferred manner.

      This transform fills in analyzer outputs with their actual computed
      values.

      Args:
        saved_model_dir: The directory containing the SavedModel.
      """
            def __init__(self, saved_model_dir):
                # Generally the pipeline is inferred from its inputs; however,
                # we need to know the pipeline for beam.Create.
                self.pipeline = input_values.pipeline
                self._saved_model_dir = saved_model_dir

            def expand(self, tensor_pcoll_mapping):
                """Converts a dict of statistics to a transform function.

        Args:
          tensor_pcoll_mapping: A dictionary mapping `Tensor`s to singleton
              `PCollection`s.

        Returns:
          A single-element PCollection containing the directory name with the
              SavedModel.
        """
                transform_fn = (self.pipeline | 'CreateTransformFn' >>
                                beam.Create([self._saved_model_dir]))

                if not tensor_pcoll_mapping:
                    return transform_fn

                # Convert tensor_pcoll_mapping into a DictPCollectionView so it
                # can be passed as a side input to the beam Map below.
                tensor_value_pairs = []
                for name, pcoll in six.iteritems(tensor_pcoll_mapping):
                    tensor_value_pairs.append(
                        pcoll
                        | 'AddName[%s]' % name >> beam.Map(lambda x, name=name:
                                                           (name, x)))
                tensor_value_mapping = beam.pvalue.AsDict(
                    tensor_value_pairs
                    | 'MergeTensorValuePairs' >> beam.Flatten())

                # Run a mapper that inserts statistic values into the graph.  We wrap
                # replace_tensors_with_constant_values in a wrapper that also creates
                # a temp dir.  This makes the wrapper idempotent since any retry will
                # use a different temp dir.
                def replace_tensors_with_constant_values(
                        saved_model_dir, tensor_value_mapping,
                        serialized_tf_config):

                    tf_config = _maybe_deserialize_tf_config(
                        serialized_tf_config)
                    with tf.Session(config=tf_config) as session:
                        temp_dir = _make_unique_temp_dir(base_temp_dir)
                        input_tensors, output_tensors = (
                            saved_transform_io.partially_apply_saved_transform(
                                saved_model_dir, {}, tensor_value_mapping))
                        saved_transform_io.write_saved_transform_from_session(
                            session, input_tensors, output_tensors, temp_dir)
                    return temp_dir

                serialized_tf_config = _DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(
                    self.pipeline.runner)
                return (transform_fn | 'ReplaceTensorsWithConstantValues' >>
                        beam.Map(replace_tensors_with_constant_values,
                                 tensor_value_mapping=tensor_value_mapping,
                                 serialized_tf_config=serialized_tf_config))

        class _ComputeTensorPcollMappingUpdate(beam.PTransform):
            """Create a mapping from `Tensor`s to PCollections.

      Creates a mapping from `Tensor`s to PCollections for the outputs of the
      new analyzers.  An existing mapping will be provided as the argument
      to the expand() method.

      Args:
        saved_model_dir: The directory containing the SavedModel.
        analyzer_inputs_schema: The schema of the analyzers' inputs.
        analyzers: The analyzers to run in this phase.
      """
            def __init__(self, saved_model_dir, analyzer_inputs_schema,
                         analyzers):
                self._saved_model_dir = saved_model_dir
                self._analyzer_inputs_schema = analyzer_inputs_schema
                self._analyzers = analyzers

            def expand(self, input_values_and_tensor_pcoll_mapping):
                input_values, tensor_pcoll_mapping = (
                    input_values_and_tensor_pcoll_mapping)

                # Create a transform_fn to produce inputs to new analyzers.
                transform_fn = (
                    tensor_pcoll_mapping
                    | 'ReplaceTensorsWithConstants' >>
                    _ReplaceTensorsWithConstants(self._saved_model_dir))

                # Run the transform_fn.
                serialized_tf_config = _DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(
                    self.pipeline.runner)
                analyzer_input_values = (
                    input_values | 'ComputeAnalyzerInputs' >> beam.ParDo(
                        _RunMetaGraphDoFn(input_schema,
                                          self._analyzer_inputs_schema,
                                          serialized_tf_config),
                        saved_model_dir=beam.pvalue.AsSingleton(transform_fn)))

                # For each analyzer output, look up its input values (by tensor name)
                # and run the analyzer on these values.
                tensor_pcoll_mapping_update = {}
                for idx, analyzer in enumerate(self._analyzers):
                    # pylint: disable=protected-access
                    analyzer_impl = analyzer_impls._impl_for_analyzer(
                        analyzer.spec)
                    # pylint: enable=protected-access

                    assert len(analyzer.inputs) == 1
                    output_pcolls = (analyzer_input_values
                                     | 'Extract_%d' % idx >> beam.Map(
                                         lambda batch, key: batch[key],
                                         key=analyzer.inputs[0].name)
                                     | 'Analyze_%d' % idx >> analyzer_impl)
                    assert len(analyzer.outputs) == len(output_pcolls)
                    for tensor, pcoll in zip(analyzer.outputs, output_pcolls):
                        tensor_pcoll_mapping_update[tensor.name] = pcoll
                return tensor_pcoll_mapping_update

        # NOTE: it's important that create_phases is called directly after
        # run_preprocessing_fn, because we later mutate the graph's
        # TABLE_INITIALIZERS collection which would break the logic in
        # create_phases.
        graph, inputs, outputs = impl_helper.run_preprocessing_fn(
            self._preprocessing_fn, input_schema)
        phases = impl_helper.create_phases(graph)

        # Iterate through levels, generating PCollections for columns that are the
        # outputs of `Operations` that are not `MapOperation`s.
        tensor_pcoll_mapping = {}
        table_initializers = graph.get_collection_ref(
            tf.GraphKeys.TABLE_INITIALIZERS)
        original_table_initializers = list(table_initializers)
        del table_initializers[:]

        for level, phase in enumerate(phases):
            analyzer_inputs = {}
            for analyzer in phase.analyzers:
                for input_tensor in analyzer.inputs:
                    analyzer_inputs[input_tensor.name] = input_tensor
            analyzer_inputs_schema = impl_helper.infer_feature_schema(
                analyzer_inputs)
            table_initializers.extend(phase.table_initializers)
            saved_model_dir = _make_unique_temp_dir(base_temp_dir)
            _write_saved_transform(graph, inputs, analyzer_inputs,
                                   saved_model_dir)

            tensor_pcoll_mapping_update = (
                (input_values, tensor_pcoll_mapping)
                | 'ComputeTensorPcollMappingUpdate_%d' % level >>
                _ComputeTensorPcollMappingUpdate(
                    saved_model_dir, analyzer_inputs_schema, phase.analyzers))
            tensor_pcoll_mapping.update(tensor_pcoll_mapping_update)

        output_metadata = dataset_metadata.DatasetMetadata(
            schema=impl_helper.infer_feature_schema(outputs))
        del table_initializers[:]
        table_initializers.extend(original_table_initializers)
        saved_model_dir = _make_unique_temp_dir(base_temp_dir)
        _write_saved_transform(graph, inputs, outputs, saved_model_dir)
        transform_fn = (tensor_pcoll_mapping
                        | 'ReplaceTensorsWithConstants' >>
                        _ReplaceTensorsWithConstants(saved_model_dir))

        return transform_fn, output_metadata
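
Finally, a self-contained sketch (assuming Apache Beam is installed; names are illustrative) of the AsDict side-input pattern that _ReplaceTensorsWithConstants uses: singleton (name, value) PCollections are flattened and handed to a Map as a dict-valued side input.

import apache_beam as beam

with beam.Pipeline() as p:
  min_x = p | 'CreateMin' >> beam.Create([('min_x', 1.0)])
  max_x = p | 'CreateMax' >> beam.Create([('max_x', 9.0)])
  stats = beam.pvalue.AsDict(
      (min_x, max_x) | 'MergeStats' >> beam.Flatten())
  _ = (p
       | 'CreateData' >> beam.Create([5.0])
       | 'Scale' >> beam.Map(
           lambda x, s: (x - s['min_x']) / (s['max_x'] - s['min_x']), s=stats)
       | 'Print' >> beam.Map(print))  # prints 0.5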