Ejemplo n.º 1
0
 def test_batched_placeholders_from_specs_invalid_dtype(self):
     """Checks that `tf.int32` specs are rejected with a ValueError.

     Both a `tf.TensorSpec` and a `tf.io.FixedLenFeature` declared with
     dtype `tf.int32` must raise a `ValueError` whose message matches
     'had invalid dtype'.
     """
     # assertRaisesRegex is the supported spelling; the assertRaisesRegexp
     # alias is deprecated and no longer exists in recent Python versions.
     with self.assertRaisesRegex(ValueError, 'had invalid dtype'):
         impl_helper.batched_placeholders_from_specs(
             {'f': tf.TensorSpec(dtype=tf.int32, shape=[None])})
     with self.assertRaisesRegex(ValueError, 'had invalid dtype'):
         impl_helper.batched_placeholders_from_specs(
             {'f': tf.io.FixedLenFeature(dtype=tf.int32, shape=[None])})
Ejemplo n.º 2
0
 def test_batched_placeholders_from_specs_invalid_mixing(self):
     """Checks that mixing TypeSpecs and feature specs raises a TypeError.

     A dict holding one `tf.TensorSpec` and one `tf.io.FixedLenFeature` must
     raise a `TypeError` whose message matches 'Specs must be all'.
     """
     mixed_specs = {
         'f1': tf.TensorSpec(dtype=tf.int64, shape=[None]),
         'f2': tf.io.FixedLenFeature(dtype=tf.int64, shape=[None]),
     }
     # assertRaisesRegex is the supported spelling; the assertRaisesRegexp
     # alias is deprecated and no longer exists in recent Python versions.
     with self.assertRaisesRegex(TypeError, 'Specs must be all'):
         impl_helper.batched_placeholders_from_specs(mixed_specs)
Ejemplo n.º 3
0
 def test_batched_placeholders_from_feature_spec(self):
     """Checks placeholders built from fixed- and variable-length features.

     Each `tf.io.FixedLenFeature` is expected to yield a `tf.Tensor` and each
     `tf.io.VarLenFeature` a `tf.SparseTensor`; every static shape is
     expected to start with an unknown dimension.
     """
     feature_spec = {
         'fixed_len_float': tf.io.FixedLenFeature([2, 3], tf.float32),
         'fixed_len_string': tf.io.FixedLenFeature([], tf.string),
         '_var_len_underscored': tf.io.VarLenFeature(tf.string),
         'var_len_int': tf.io.VarLenFeature(tf.int64)
     }
     with tf.compat.v1.Graph().as_default():
         features = impl_helper.batched_placeholders_from_specs(
             feature_spec)

     # Feature name -> (expected placeholder class, expected static shape).
     expected = {
         'fixed_len_float': (tf.Tensor, [None, 2, 3]),
         'fixed_len_string': (tf.Tensor, [None]),
         'var_len_int': (tf.SparseTensor, [None, None]),
         '_var_len_underscored': (tf.SparseTensor, [None, None]),
     }
     self.assertCountEqual(features.keys(), list(expected))
     for name, (expected_type, expected_shape) in expected.items():
         # isinstance rather than exact type equality: the concrete class of
         # a graph tensor may be a subclass of tf.Tensor depending on the
         # TensorFlow release.
         self.assertIsInstance(features[name], expected_type, msg=name)
         self.assertEqual(features[name].get_shape().as_list(),
                          expected_shape, msg=name)
Ejemplo n.º 4
0
    def test_batched_placeholders_from_typespecs(self):
        """Checks placeholders built from dense and sparse TypeSpecs.

        For every spec the resulting placeholder must have the expected
        class, static shape and dtype.
        """
        typespecs = {
            'dense_float':
            tf.TensorSpec(dtype=tf.float32, shape=[None, 2, 3]),
            'dense_string':
            tf.TensorSpec(shape=[None], dtype=tf.string),
            '_sparse_underscored':
            tf.SparseTensorSpec(dtype=tf.string, shape=[None, None]),
        }
        with tf.compat.v1.Graph().as_default():
            features = impl_helper.batched_placeholders_from_specs(typespecs)

        # Feature name -> (expected class, expected static shape, dtype).
        expected = {
            'dense_float': (tf.Tensor, [None, 2, 3], tf.float32),
            'dense_string': (tf.Tensor, [None], tf.string),
            '_sparse_underscored': (tf.SparseTensor, [None, None], tf.string),
        }
        self.assertCountEqual(features.keys(), list(expected))
        for name, (expected_type, expected_shape,
                   expected_dtype) in expected.items():
            placeholder = features[name]
            # isinstance rather than exact type equality: the concrete class
            # of a graph tensor may be a subclass of tf.Tensor depending on
            # the TensorFlow release.
            self.assertIsInstance(placeholder, expected_type, msg=name)
            self.assertEqual(placeholder.get_shape().as_list(),
                             expected_shape, msg=name)
            self.assertEqual(placeholder.dtype, expected_dtype, msg=name)
Ejemplo n.º 5
0
    def test_batched_placeholders_from_typespecs(self):
        """Checks placeholders built from dense, sparse and ragged TypeSpecs.

        For every spec the resulting placeholder must have the expected
        class, static shape and dtype; ragged placeholders must also report
        the expected `ragged_rank`.
        """
        typespecs = {
            'dense_float':
            tf.TensorSpec(dtype=tf.float32, shape=[None, 2, 3]),
            'dense_string':
            tf.TensorSpec(shape=[None], dtype=tf.string),
            '_sparse_underscored':
            tf.SparseTensorSpec(dtype=tf.string, shape=[None, None, 17]),
            'ragged_string':
            tf.RaggedTensorSpec(dtype=tf.string,
                                ragged_rank=1,
                                shape=[None, None]),
            'ragged_multi_dimension':
            tf.RaggedTensorSpec(dtype=tf.int64,
                                ragged_rank=3,
                                shape=[None, None, None, None, 5]),
        }
        with tf.compat.v1.Graph().as_default():
            features = impl_helper.batched_placeholders_from_specs(typespecs)
        self.assertCountEqual(features.keys(), [
            'dense_float',
            'dense_string',
            '_sparse_underscored',
            'ragged_string',
            'ragged_multi_dimension',
        ])

        # Type checks use isinstance rather than exact type equality: the
        # concrete class of a graph tensor may be a subclass of tf.Tensor
        # depending on the TensorFlow release.
        dense_float = features['dense_float']
        self.assertIsInstance(dense_float, tf.Tensor)
        self.assertEqual(dense_float.get_shape().as_list(), [None, 2, 3])
        self.assertEqual(dense_float.dtype, tf.float32)

        dense_string = features['dense_string']
        self.assertIsInstance(dense_string, tf.Tensor)
        self.assertEqual(dense_string.get_shape().as_list(), [None])
        self.assertEqual(dense_string.dtype, tf.string)

        sparse = features['_sparse_underscored']
        self.assertIsInstance(sparse, tf.SparseTensor)
        # TODO(zoyahav): Change last dimension size to 17 once SparseTensors
        # propagate static dense_shape from typespec correctly.
        self.assertEqual(sparse.get_shape().as_list(), [None, None, None])
        self.assertEqual(sparse.dtype, tf.string)

        ragged_string = features['ragged_string']
        self.assertIsInstance(ragged_string, tf.RaggedTensor)
        self.assertEqual(ragged_string.shape.as_list(), [None, None])
        self.assertEqual(ragged_string.ragged_rank, 1)
        self.assertEqual(ragged_string.dtype, tf.string)

        ragged_multi = features['ragged_multi_dimension']
        self.assertIsInstance(ragged_multi, tf.RaggedTensor)
        self.assertEqual(ragged_multi.shape.as_list(),
                         [None, None, None, None, 5])
        self.assertEqual(ragged_multi.ragged_rank, 3)
        self.assertEqual(ragged_multi.dtype, tf.int64)
Ejemplo n.º 6
0
 def test_batched_placeholders_from_feature_spec(self):
     """Checks placeholders built from fixed, var-len and sparse features.

     `tf.io.FixedLenFeature` entries are expected to yield `tf.Tensor`
     placeholders; `tf.io.VarLenFeature` and `tf.io.SparseFeature` entries
     are expected to yield `tf.SparseTensor` placeholders. The expected
     static shape of the `SparseFeature` placeholders depends on the
     installed TensorFlow major version.
     """
     feature_spec = {
         'fixed_len_float':
         tf.io.FixedLenFeature([2, 3], tf.float32),
         'fixed_len_string':
         tf.io.FixedLenFeature([], tf.string),
         '_var_len_underscored':
         tf.io.VarLenFeature(tf.string),
         'var_len_int':
         tf.io.VarLenFeature(tf.int64),
         'sparse_1d':
         tf.io.SparseFeature('1d_idx', '1d_value', tf.int64, 7),
         'sparse_2d':
         tf.io.SparseFeature(['2d_idx0', '2d_idx1'], '2d_value', tf.int64,
                             [2, 17]),
     }
     with tf.compat.v1.Graph().as_default():
         features = impl_helper.batched_placeholders_from_specs(
             feature_spec)

     # With TF >= 2 the dense shape declared by the SparseFeature is expected
     # in the static shape; otherwise only the rank is expected to be known.
     if version.parse(tf.__version__) >= version.parse('2'):
         sparse_1d_shape, sparse_2d_shape = [None, 7], [None, 2, 17]
     else:
         sparse_1d_shape, sparse_2d_shape = [None, None], [None, None, None]

     # Feature name -> (expected placeholder class, expected static shape).
     expected = {
         'fixed_len_float': (tf.Tensor, [None, 2, 3]),
         'fixed_len_string': (tf.Tensor, [None]),
         'var_len_int': (tf.SparseTensor, [None, None]),
         '_var_len_underscored': (tf.SparseTensor, [None, None]),
         'sparse_1d': (tf.SparseTensor, sparse_1d_shape),
         'sparse_2d': (tf.SparseTensor, sparse_2d_shape),
     }
     self.assertCountEqual(features.keys(), list(expected))
     for name, (expected_type, expected_shape) in expected.items():
         # isinstance rather than exact type equality: the concrete class of
         # a graph tensor may be a subclass of tf.Tensor depending on the
         # TensorFlow release.
         self.assertIsInstance(features[name], expected_type, msg=name)
         self.assertEqual(features[name].get_shape().as_list(),
                          expected_shape, msg=name)
    def test_build(self, feature_spec, preprocessing_fn,
                   expected_dot_graph_str):
        """Builds the analysis graph and compares its dot rendering.

        Args:
          feature_spec: Specs passed to `batched_placeholders_from_specs`.
          preprocessing_fn: Callable applied to the input placeholders.
          expected_dot_graph_str: Dot string the built graph must render to.
        """
        graph = tf.compat.v1.Graph()
        with graph.as_default():
            # Only the placeholders are created under the 'inputs' scope.
            with tf.compat.v1.name_scope('inputs'):
                placeholders = impl_helper.batched_placeholders_from_specs(
                    feature_spec)
            outputs = preprocessing_fn(placeholders)
            transform_fn_future, _ = analysis_graph_builder.build(
                graph, placeholders, outputs)

        rendered = nodes.get_dot_graph([transform_fn_future]).to_string()
        self.WriteRenderedDotFile(rendered)
        failure_message = 'Result dot graph is:\n{}'.format(rendered)
        self.assertMultiLineEqual(rendered, expected_dot_graph_str,
                                  failure_message)
def _build_analysis_graph_for_inspection(preprocessing_fn, specs, dataset_keys,
                                         input_cache):
    """Constructs the analysis graph so that it can be inspected.

    Placeholders are created from `specs` inside a fresh graph, copied, and
    fed to `preprocessing_fn`; the result is then handed to `build`.

    Returns:
      The `(transform_fn_future, cache_dict)` pair produced by `build`.
    """
    graph = tf.compat.v1.Graph()
    with graph.as_default():
        with tf.compat.v1.name_scope('inputs'):
            placeholders = impl_helper.batched_placeholders_from_specs(specs)
            # TODO(b/34288791): This needs to be exactly the same as in impl.py
            inputs_for_fn = impl_helper.copy_tensors(placeholders)

        outputs = preprocessing_fn(inputs_for_fn)

    future, cache = build(
        graph,
        placeholders,
        outputs,
        dataset_keys=dataset_keys,
        cache_dict=input_cache)
    return future, cache
Ejemplo n.º 9
0
def get_transform_input_columns(preprocessing_fn, specs):
  """Lists the input columns that `TransformDataset` depends on.

  Args:
    preprocessing_fn: A tf.transform preprocessing_fn.
    specs: A dict of feature name to feature specification or tf.TypeSpecs.

  Returns:
    A list with the names of the inputs that the outputs of
    `preprocessing_fn` depend on in the transform `tf.Graph`.
  """
  graph = tf.compat.v1.Graph()
  with graph.as_default():
    placeholders = impl_helper.batched_placeholders_from_specs(specs)
    # The preprocessing_fn gets a copy so the original dict of placeholders
    # is still available for the dependency analysis.
    outputs = preprocessing_fn(placeholders.copy())
    required_inputs = graph_tools.get_dependent_inputs(
        graph, placeholders, outputs)
  return list(required_inputs.keys())
Ejemplo n.º 10
0
def get_analyze_input_columns(preprocessing_fn, specs):
  """Lists the input columns that `AnalyzeDataset` depends on.

  Args:
    preprocessing_fn: A tf.transform preprocessing_fn.
    specs: A dict of feature name to feature specification or tf.TypeSpecs.

  Returns:
    A list with the names of the inputs that analyzers depend on.
  """
  graph = tf.compat.v1.Graph()
  with graph.as_default():
    placeholders = impl_helper.batched_placeholders_from_specs(specs)
    # The return value is not needed here; the tensor replacements that were
    # registered in the graph collection are read below.
    preprocessing_fn(placeholders.copy())

    visitor = _SourcedTensorsVisitor()
    for sink in graph.get_collection(analyzer_nodes.TENSOR_REPLACEMENTS):
      # A new traverser is used for every sink; the visitor is shared.
      nodes.Traverser(visitor).visit_value_node(sink.future)

    required_inputs = graph_tools.get_dependent_inputs(
        graph, placeholders, visitor.sourced_tensors)
  return list(required_inputs.keys())
Ejemplo n.º 11
0
  def expand(self, dataset):
    """Analyze the dataset.

    Builds the TF graph for `self._preprocessing_fn`, translates the resulting
    analysis graph into Beam stages, and assembles the output metadata.

    Args:
      dataset: A 4-tuple `(flattened_pcoll, input_values_pcoll_dict,
        dataset_cache_dict, input_metadata)`. When `self._use_tfxio` is set,
        `input_metadata` is used as the tensor adapter config; otherwise its
        `schema` attribute is used as the input schema.

    Returns:
      A pair `((transform_fn_pcoll, full_metadata), output_cache_pcoll_dict)`.
      `transform_fn_pcoll` is the result of visiting the deferred transform
      function node. `output_cache_pcoll_dict` maps each dataset key to a dict
      of cache key to the visited cache value node, or is None when the graph
      builder returned no cache value nodes.

    Raises:
      ValueError: If preprocessing_fn has no outputs, or if the graph it built
        contains trainable variables.
    """
    (flattened_pcoll, input_values_pcoll_dict, dataset_cache_dict,
     input_metadata) = dataset
    # Exactly one of input_schema / input_tensor_adapter_config is set,
    # depending on whether TFXIO is in use.
    if self._use_tfxio:
      input_schema = None
      input_tensor_adapter_config = input_metadata
    else:
      input_schema = input_metadata.schema
      input_tensor_adapter_config = None

    # Normalize a missing (falsy) dict to an empty one so it can be iterated.
    input_values_pcoll_dict = input_values_pcoll_dict or dict()

    with tf.compat.v1.Graph().as_default() as graph:

      with tf.compat.v1.name_scope('inputs'):
        if self._use_tfxio:
          specs = TensorAdapter(input_tensor_adapter_config).OriginalTypeSpecs()
        else:
          specs = schema_utils.schema_as_feature_spec(input_schema).feature_spec
        input_signature = impl_helper.batched_placeholders_from_specs(specs)
        # In order to avoid a bug where import_graph_def fails when the
        # input_map and return_elements of an imported graph are the same
        # (b/34288791), we avoid using the placeholder of an input column as an
        # output of a graph. We do this by applying tf.identity to all inputs of
        # the preprocessing_fn.  Note this applies at the level of raw tensors.
        # TODO(b/34288791): Remove this workaround and use a shallow copy of
        # inputs instead.  A shallow copy is needed in case
        # self._preprocessing_fn mutates its input.
        copied_inputs = impl_helper.copy_tensors(input_signature)

      output_signature = self._preprocessing_fn(copied_inputs)

    # At this point we check that the preprocessing_fn has at least one
    # output. This is because if we allowed the output of preprocessing_fn to
    # be empty, we wouldn't be able to determine how many instances to
    # "unbatch" the output into.
    if not output_signature:
      raise ValueError('The preprocessing function returned an empty dict')

    # Trainable variables in the preprocessing graph are rejected.
    if graph.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES):
      raise ValueError(
          'The preprocessing function contained trainable variables '
          '{}'.format(
              graph.get_collection_ref(
                  tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES)))

    # Prefer an explicitly set pipeline; otherwise take it from the flattened
    # PCollection or, failing that, from the first non-None input PCollection.
    pipeline = self.pipeline or (flattened_pcoll or next(
        v for v in input_values_pcoll_dict.values() if v is not None)).pipeline

    # Add a stage that inspects graph collections for API use counts and logs
    # them as a beam metric.
    _ = (pipeline | 'InstrumentAPI' >> _InstrumentAPI(graph))

    # The TF config is looked up by runner type; `.get` yields None when the
    # runner type has no entry.
    tf_config = _DEFAULT_TENSORFLOW_CONFIG_BY_BEAM_RUNNER_TYPE.get(
        type(pipeline.runner))
    extra_args = beam_common.ConstructBeamPipelineVisitor.ExtraArgs(
        base_temp_dir=Context.create_base_temp_dir(),
        tf_config=tf_config,
        pipeline=pipeline,
        flat_pcollection=flattened_pcoll,
        pcollection_dict=input_values_pcoll_dict,
        graph=graph,
        input_signature=input_signature,
        input_schema=input_schema,
        input_tensor_adapter_config=input_tensor_adapter_config,
        use_tfxio=self._use_tfxio,
        cache_pcoll_dict=dataset_cache_dict)

    transform_fn_future, cache_value_nodes = analysis_graph_builder.build(
        graph,
        input_signature,
        output_signature,
        input_values_pcoll_dict.keys(),
        cache_dict=dataset_cache_dict)
    # The same traverser is used for the transform function node and for
    # every cache value node below.
    traverser = nodes.Traverser(
        beam_common.ConstructBeamPipelineVisitor(extra_args))
    transform_fn_pcoll = traverser.visit_value_node(transform_fn_future)

    if cache_value_nodes is not None:
      # Regroup the flat {(dataset_key, cache_key): node} mapping into a
      # nested {dataset_key: {cache_key: visited value}} mapping.
      output_cache_pcoll_dict = {}
      for (dataset_key,
           cache_key), value_node in six.iteritems(cache_value_nodes):
        if dataset_key not in output_cache_pcoll_dict:
          output_cache_pcoll_dict[dataset_key] = {}
        output_cache_pcoll_dict[dataset_key][cache_key] = (
            traverser.visit_value_node(value_node))
    else:
      output_cache_pcoll_dict = None

    # Infer metadata.  We take the inferred metadata and apply overrides that
    # refer to values of tensors in the graph.  The override tensors must
    # be "constant" in that they don't depend on input data.  The tensors can
    # depend on analyzer outputs though.  This allows us to set metadata that
    # depends on analyzer outputs. _infer_metadata_from_saved_model will use the
    # analyzer outputs stored in `transform_fn` to compute the metadata in a
    # deferred manner, once the analyzer outputs are known.
    metadata = dataset_metadata.DatasetMetadata(
        schema=schema_inference.infer_feature_schema(output_signature, graph))

    deferred_metadata = (
        transform_fn_pcoll
        |
        'ComputeDeferredMetadata' >> beam.Map(_infer_metadata_from_saved_model))

    # Combine the statically inferred metadata with the deferred metadata.
    full_metadata = beam_metadata_io.BeamDatasetMetadata(
        metadata, deferred_metadata)

    _clear_shared_state_after_barrier(pipeline, transform_fn_pcoll)

    return (transform_fn_pcoll, full_metadata), output_cache_pcoll_dict