def get_analyze_input_columns(preprocessing_fn,
                              specs,
                              force_tf_compat_v1=False):
    """Return the input columns that `AnalyzeDataset` needs.

    Traces `preprocessing_fn`, traverses the `future` of every entry in the
    graph's `analyzer_nodes.TENSOR_REPLACEMENTS` collection with a
    `_SourcedTensorsVisitor`, and reports the inputs on which the visitor's
    `sourced_tensors` depend.

    Args:
      preprocessing_fn: A tf.transform preprocessing_fn.
      specs: A dict of feature name to tf.TypeSpecs. If `force_tf_compat_v1`
        is True, this can also be feature specifications.
      force_tf_compat_v1: (Optional) If `True`, use Tensorflow in compat.v1
        mode. Defaults to `False`.

    Returns:
      A list of columns that are required inputs of analyzers.
    """
    compat_v1 = tf2_utils.use_tf_compat_v1(force_tf_compat_v1)
    if not compat_v1:
        # Outside compat.v1 mode every spec has to be a tf.TypeSpec.
        assert all(
            isinstance(spec, tf.TypeSpec) for spec in specs.values()), specs
    graph, structured_inputs, _ = impl_helper.trace_preprocessing_function(
        preprocessing_fn, specs, use_tf_compat_v1=compat_v1)

    # One visitor accumulates results across all sinks; a fresh Traverser is
    # created for each sink.
    source_collector = _SourcedTensorsVisitor()
    for sink in graph.get_collection(analyzer_nodes.TENSOR_REPLACEMENTS):
        nodes.Traverser(source_collector).visit_value_node(sink.future)

    required_inputs = graph_tools.get_dependent_inputs(
        graph, structured_inputs, source_collector.sourced_tensors)
    return list(required_inputs)
def get_transform_input_columns(preprocessing_fn,
                                specs,
                                force_tf_compat_v1=False):
    """Return the input columns that `TransformDataset` needs.

    Traces `preprocessing_fn` and reports the inputs on which the traced
    outputs depend.

    Args:
      preprocessing_fn: A tf.transform preprocessing_fn.
      specs: A dict of feature name to tf.TypeSpecs. If `force_tf_compat_v1`
        is True, this can also be feature specifications.
      force_tf_compat_v1: (Optional) If `True`, use Tensorflow in compat.v1
        mode. Defaults to `False`.

    Returns:
      A list of columns that are required inputs of the transform `tf.Graph`
      defined by `preprocessing_fn`.
    """
    compat_v1 = tf2_utils.use_tf_compat_v1(force_tf_compat_v1)
    if not compat_v1:
        # Outside compat.v1 mode every spec has to be a tf.TypeSpec.
        assert all(
            isinstance(spec, tf.TypeSpec) for spec in specs.values()), specs
    traced = impl_helper.trace_preprocessing_function(
        preprocessing_fn, specs, use_tf_compat_v1=compat_v1)
    graph, structured_inputs, structured_outputs = traced

    required_inputs = graph_tools.get_dependent_inputs(
        graph, structured_inputs, structured_outputs)
    return list(required_inputs)
Exemple #3
0
    def __init__(self, saved_model_dir, input_schema, exclude_outputs,
                 tf_config):
      """Load a saved transform into its own graph and session.

      Builds a session callable that is fed the graph inputs required by the
      non-excluded outputs (in sorted key order) and fetches those outputs
      (also in sorted key order).

      Args:
        saved_model_dir: Directory passed to
          `saved_transform_io.partially_apply_saved_transform_internal`.
        input_schema: Schema converted with
          `schema_utils.schema_as_feature_spec`; every resulting feature key
          must be an input of the loaded graph.
        exclude_outputs: Iterable of output keys to leave out of the fetches;
          every key must be an output of the loaded graph.
        tf_config: Value passed as `config` to `tf.compat.v1.Session`.

      Raises:
        ValueError: If `input_schema` has keys that are not graph inputs, or
          `exclude_outputs` has keys that are not graph outputs.
      """
      self.saved_model_dir = saved_model_dir
      with tf.Graph().as_default() as graph:
        self._session = tf.compat.v1.Session(graph=graph, config=tf_config)
        with self._session.as_default():
          inputs, outputs = (
              saved_transform_io.partially_apply_saved_transform_internal(
                  saved_model_dir, {}))
        self._session.run(tf.compat.v1.global_variables_initializer())
        self._session.run(tf.compat.v1.tables_initializer())
        # Finalize the graph once the initializers have been run.
        graph.finalize()

        input_schema_keys = schema_utils.schema_as_feature_spec(
            input_schema).feature_spec.keys()
        # The `%` operands are wrapped in 1-tuples so that a tuple-valued
        # argument is formatted as a whole instead of being unpacked as the
        # format arguments (which would raise TypeError or print one element).
        if set(input_schema_keys).difference(inputs.keys()):
          raise ValueError('Input schema contained keys not in graph: %s' %
                           (input_schema_keys,))
        if set(exclude_outputs).difference(outputs.keys()):
          raise ValueError('Excluded outputs contained keys not in graph: %s' %
                           (exclude_outputs,))
        non_excluded_output_keys = sorted(
            set(outputs.keys()).difference(exclude_outputs))
        fetches = [outputs[key] for key in non_excluded_output_keys]
        # Only inputs that the fetched outputs depend on need to be fed.
        tensor_inputs = graph_tools.get_dependent_inputs(graph, inputs, fetches)
        self.inputs_tensor_keys = sorted(tensor_inputs.keys())
        self.outputs_tensor_keys = non_excluded_output_keys

        # The feed order of the callable follows `self.inputs_tensor_keys`.
        tensor_inputs_list = [
            tensor_inputs[key] for key in self.inputs_tensor_keys
        ]
        self.callable_get_outputs = self._session.make_callable(
            fetches, feed_list=tensor_inputs_list)
Exemple #4
0
    def benchmarkRunMetagraphDoFnAtTFLevel(self):
        """Time the TF-level feed/fetch work done by RunMetaGraphDoFn.

        Loads the TFT SavedModel into a fresh graph and session, builds a
        session callable over all of its outputs, runs every batch of the
        dataset through that callable and reports the elapsed wall time.

        The setup and the per-batch loop deliberately duplicate code from TFT,
        because the low-level internals being measured are not exposed for
        use in this way.
        """
        shared = _get_common_variables(self._dataset)
        session_config = tft_beam_impl._FIXED_PARALLELISM_TF_CONFIG  # pylint: disable=protected-access
        input_schema = shared.transform_input_dataset_metadata.schema

        # Mirrors _GraphState.__init__.
        with tf.compat.v1.Graph().as_default() as graph:
            sess = tf.compat.v1.Session(graph=graph, config=session_config)
            with sess.as_default():
                # TODO(b/148082271): Revert back to unpacking the result directly once
                # TFX depends on TFT 0.22.
                model_path = self._dataset.tft_saved_model_path()
                applied = (
                    saved_transform_io.
                    partially_apply_saved_transform_internal(model_path, {}))
                inputs, outputs = applied[:2]
                sess.run(tf.compat.v1.global_variables_initializer())
                sess.run(tf.compat.v1.tables_initializer())
                graph.finalize()
            # The schema is ignored here and no outputs are excluded.
            output_keys = sorted(set(outputs))
            fetches = [outputs[name] for name in output_keys]
            needed_inputs = graph_tools.get_dependent_inputs(
                graph, inputs, fetches)
            input_keys = sorted(needed_inputs)
            run_graph = sess.make_callable(
                fetches, feed_list=[inputs[name] for name in input_keys])

        batch_size, batched_records = _get_batched_records(self._dataset)

        # Mirrors _RunMetaGraphDoFn._handle_batch; only this loop is timed.
        start = time.time()
        for batch in batched_records:
            feed_values = impl_helper.make_feed_list(input_keys, input_schema,
                                                     batch)
            fetched = run_graph(*feed_values)
            _ = {key: value for key, value in zip(output_keys, fetched)}
        elapsed = time.time() - start

        extras = {
            "batch_size": batch_size,
            "num_examples": self._dataset.num_examples(),
        }
        self.report_benchmark(iters=1, wall_time=elapsed, extras=extras)
Exemple #5
0
  def benchmarkRunMetagraphDoFnAtTFLevel(self):
    """Time the TF-level feed/fetch work of RunMetaGraphDoFn (TF1 path).

    Loads the TFT SavedModel (requested with `force_tf_compat_v1=True`) into a
    fresh graph and session, builds a session callable over all of its
    outputs, converts each batch with a `TensorAdapter`, runs it through the
    callable and reports the elapsed wall time.

    The setup and the per-batch loop deliberately duplicate code from TFT,
    because the low-level internals being measured are not exposed for use in
    this way.
    """
    shared = _get_common_variables(self._dataset)
    session_config = tft_beam_impl._FIXED_PARALLELISM_TF_CONFIG  # pylint: disable=protected-access

    # Mirrors _GraphStateCompatV1.__init__.
    with tf.compat.v1.Graph().as_default() as graph:
      sess = tf.compat.v1.Session(graph=graph, config=session_config)
      with sess.as_default():
        model_path = self._dataset.tft_saved_model_path(force_tf_compat_v1=True)
        inputs, outputs = (
            saved_transform_io.partially_apply_saved_transform_internal(
                model_path, {}))
        sess.run(tf.compat.v1.global_variables_initializer())
        sess.run(tf.compat.v1.tables_initializer())
        graph.finalize()
      # The schema is ignored here and no outputs are excluded.
      output_keys = sorted(set(outputs))
      fetches = [outputs[name] for name in output_keys]
      needed_inputs = graph_tools.get_dependent_inputs(
          graph, inputs, fetches)
      input_keys = sorted(needed_inputs)
      run_graph = sess.make_callable(
          fetches, feed_list=[inputs[name] for name in input_keys])

    batch_size, batched_records = _get_batched_records(
        self._dataset, self._max_num_examples())

    adapter = tensor_adapter.TensorAdapter(shared.tfxio.TensorAdapterConfig())

    # Mirrors _RunMetaGraphDoFn._handle_batch; only this loop is timed.
    start = time.time()
    for batch in batched_records:
      feed_by_name = adapter.ToBatchTensors(batch, produce_eager_tensors=False)
      feed_values = [feed_by_name[name] for name in input_keys]
      fetched = run_graph(*feed_values)
      _ = {key: value for key, value in zip(output_keys, fetched)}
    elapsed = time.time() - start

    extras = {
        "batch_size": batch_size,
        "num_examples":
            self._dataset.num_examples(limit=self._max_num_examples()),
    }
    self.report_benchmark(iters=1, wall_time=elapsed, extras=extras)
 def testGetDependentInputs(self, create_graph_fn, feeds, fetches,
                            expected_dependent_inputs):
   # Build the graph, then ask which of the named feeds the named fetches
   # depend on.
   graph_tensors = create_graph_fn()
   default_graph = tf.compat.v1.get_default_graph()
   feed_tensors = {name: graph_tensors[name] for name in feeds}
   fetch_tensors = {name: graph_tensors[name] for name in fetches}
   dependent = graph_tools.get_dependent_inputs(default_graph, feed_tensors,
                                                fetch_tensors)
   # The returned keys must match the expectation exactly (order ignored), and
   # each key must map back to the tensor created for it.
   self.assertCountEqual(expected_dependent_inputs, dependent.keys())
   for name in expected_dependent_inputs:
     self.assertEqual(graph_tensors[name], dependent[name])
def get_transform_input_columns(preprocessing_fn, feature_spec):
    """Return columns that are required inputs of `TransformDataset`.

    Builds batched placeholders for `feature_spec` in a fresh graph, applies
    `preprocessing_fn` to a copy of them, and reports the placeholders on
    which the outputs depend.

    Args:
      preprocessing_fn: A tf.transform preprocessing_fn.
      feature_spec: A dict of feature name to feature specification.

    Returns:
      A list of columns that are required inputs of the transform `tf.Graph`
      defined by `preprocessing_fn`.
    """
    with tf.compat.v1.Graph().as_default() as graph:
        input_signature = impl_helper.feature_spec_as_batched_placeholders(
            feature_spec)
        # The preprocessing_fn receives a copy, so `input_signature` itself is
        # what gets passed to the dependency lookup below.
        output_signature = preprocessing_fn(input_signature.copy())
        transform_input_tensors = graph_tools.get_dependent_inputs(
            graph, input_signature, output_signature)
        # Materialize the keys: in Python 3 `.keys()` is a view, not the list
        # that this function documents as its return value.
        return list(transform_input_tensors.keys())
def get_analyze_input_columns(
        preprocessing_fn: Callable[[Mapping[str, common_types.TensorType]],
                                   Mapping[str, common_types.TensorType]],
        specs: Mapping[str, Union[common_types.FeatureSpecType, tf.TypeSpec]],
        force_tf_compat_v1: bool = False) -> List[str]:
    """Return the input columns that `AnalyzeDataset` needs.

    Traces `preprocessing_fn`, traverses the `future` of every entry in the
    graph's `analyzer_nodes.TENSOR_REPLACEMENTS` collection with a
    `graph_tools.SourcedTensorsVisitor`, and reports the inputs on which the
    visitor's `sourced_tensors` depend. Outside compat.v1 mode, the control
    dependencies of the traced outputs are taken into account as well.

    Args:
      preprocessing_fn: A tf.transform preprocessing_fn.
      specs: A dict of feature name to tf.TypeSpecs. If `force_tf_compat_v1`
        is True, this can also be feature specifications.
      force_tf_compat_v1: (Optional) If `True`, use Tensorflow in compat.v1
        mode. Defaults to `False`.

    Returns:
      A list of columns that are required inputs of analyzers.
    """
    compat_v1 = tf2_utils.use_tf_compat_v1(force_tf_compat_v1)
    if not compat_v1:
        # Outside compat.v1 mode every spec has to be a tf.TypeSpec.
        assert all(
            isinstance(spec, tf.TypeSpec) for spec in specs.values()), specs
    traced = impl_helper.trace_preprocessing_function(
        preprocessing_fn, specs, use_tf_compat_v1=compat_v1)
    graph, structured_inputs, structured_outputs = traced

    # One visitor accumulates results across all sinks; a fresh Traverser is
    # created for each sink.
    source_collector = graph_tools.SourcedTensorsVisitor()
    for sink in graph.get_collection(analyzer_nodes.TENSOR_REPLACEMENTS):
        nodes.Traverser(source_collector).visit_value_node(sink.future)

    control_dependency_ops = []
    if not compat_v1:
        # If traced in TF2 as a tf.function, inputs that end up in control
        # dependencies are required for the function to execute. Return such
        # inputs as required inputs of analyzers as well.
        flat_outputs = tf.nest.flatten(
            structured_outputs, expand_composites=True)
        _, control_dependency_ops = (
            tf2_utils.strip_and_get_tensors_and_control_dependencies(
                flat_outputs))

    dependency_roots = [
        *source_collector.sourced_tensors, *control_dependency_ops
    ]
    required_inputs = graph_tools.get_dependent_inputs(
        graph, structured_inputs, dependency_roots)
    return list(required_inputs)
def get_analyze_input_columns(preprocessing_fn, feature_spec):
    """Return columns that are required inputs of `AnalyzeDataset`.

    Builds batched placeholders for `feature_spec` in a fresh graph, applies
    `preprocessing_fn` to a copy of them, traverses the `future` of every
    entry in the graph's `analyzer_nodes.TENSOR_REPLACEMENTS` collection with
    a `_SourcedTensorsVisitor`, and reports the placeholders on which the
    visitor's `sourced_tensors` depend.

    Args:
      preprocessing_fn: A tf.transform preprocessing_fn.
      feature_spec: A dict of feature name to feature specification.

    Returns:
      A list of columns that are required inputs of analyzers.
    """
    with tf.compat.v1.Graph().as_default() as graph:
        input_signature = impl_helper.feature_spec_as_batched_placeholders(
            feature_spec)
        # The return value is discarded; the call is made for what it adds to
        # `graph`, whose collection is read next.
        _ = preprocessing_fn(input_signature.copy())

        tensor_sinks = graph.get_collection(analyzer_nodes.TENSOR_REPLACEMENTS)
        # One visitor accumulates results across all sinks; a fresh Traverser
        # is created for each sink.
        visitor = _SourcedTensorsVisitor()
        for tensor_sink in tensor_sinks:
            nodes.Traverser(visitor).visit_value_node(tensor_sink.future)

        analyze_input_tensors = graph_tools.get_dependent_inputs(
            graph, input_signature, visitor.sourced_tensors)
        # Materialize the keys: in Python 3 `.keys()` is a view, not the list
        # that this function documents as its return value.
        return list(analyze_input_tensors.keys())