Example #1
    def testCrossNamedParameters(self):
        test_cases_1 = [
            {
                'testcase_name': 'a_1_b_1',
                'a': 1,
                'b': 1
            },
            {
                'testcase_name': 'a_3_b_3',
                'a': 3,
                'b': 3
            },
        ]
        test_cases_2 = [
            {
                'testcase_name': 'c_2',
                'c': 2
            },
            {
                'testcase_name': 'c_4',
                'c': 4
            },
        ]
        expected_cross = [
            {
                'testcase_name': 'a_1_b_1_c_2',
                'a': 1,
                'b': 1,
                'c': 2
            },
            {
                'testcase_name': 'a_1_b_1_c_4',
                'a': 1,
                'b': 1,
                'c': 4
            },
            {
                'testcase_name': 'a_3_b_3_c_2',
                'a': 3,
                'b': 3,
                'c': 2
            },
            {
                'testcase_name': 'a_3_b_3_c_4',
                'a': 3,
                'b': 3,
                'c': 4
            },
        ]
        self.assertEqual(
            test_case.cross_named_parameters(test_cases_1, test_cases_2),
            expected_cross)
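
# A minimal sketch of the behavior the test above expects from
# `cross_named_parameters`: take the Cartesian product of the parameter
# lists, merge each combination of dicts, and join their `testcase_name`
# fields with underscores. This is a hypothetical re-implementation written
# from the assertions above, not the library's actual code.
import itertools

def _cross_named_parameters_sketch(*parameter_lists):
    crossed = []
    for dicts in itertools.product(*parameter_lists):
        merged, name_parts = {}, []
        for d in dicts:
            d = dict(d)  # Copy so popping the name does not mutate the input.
            name_parts.append(d.pop('testcase_name'))
            merged.update(d)
        merged['testcase_name'] = '_'.join(name_parts)
        crossed.append(merged)
    return crossed
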
class InspectPreprocessingFnTest(test_case.TransformTestCase):
    @test_case.named_parameters(*test_case.cross_named_parameters([
        dict(testcase_name='identity',
             preprocessing_fn=_identity_preprocessing_fn,
             expected_analyze_input_columns=[],
             expected_transform_input_columns=['x', 'y', 's']),
        dict(testcase_name='side_affect',
             preprocessing_fn=_side_affect_preprocessing_fn,
             expected_analyze_input_columns=['s'],
             expected_transform_input_columns=[]),
        dict(testcase_name='non_identity_ops',
             preprocessing_fn=_non_identity_ops_preprocessing_fn,
             expected_analyze_input_columns=[],
             expected_transform_input_columns=['x', 'y', 's']),
        dict(testcase_name='feature_renaming',
             preprocessing_fn=_renaming_preprocessing_fn,
             expected_analyze_input_columns=[],
             expected_transform_input_columns=['x', 'y', 's']),
        dict(testcase_name='one_phase',
             preprocessing_fn=_one_phase_preprocessing_fn,
             expected_analyze_input_columns=['x', 's'],
             expected_transform_input_columns=['y']),
        dict(testcase_name='two_phases',
             preprocessing_fn=_two_phases_preprocessing_fn,
             expected_analyze_input_columns=['x', 'y', 's'],
             expected_transform_input_columns=['x', 's'])
    ], [
        dict(testcase_name='tf_compat_v1', force_tf_compat_v1=True),
        dict(testcase_name='tf2', force_tf_compat_v1=False)
    ]))
    def test_column_inference(self, preprocessing_fn,
                              expected_analyze_input_columns,
                              expected_transform_input_columns,
                              force_tf_compat_v1):
        if not force_tf_compat_v1:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')
            specs = _TYPE_SPEC
        else:
            specs = _FEATURE_SPEC

        analyze_input_columns = (
            inspect_preprocessing_fn.get_analyze_input_columns(
                preprocessing_fn, specs, force_tf_compat_v1))
        transform_input_columns = (
            inspect_preprocessing_fn.get_transform_input_columns(
                preprocessing_fn, specs, force_tf_compat_v1))
        self.assertCountEqual(analyze_input_columns,
                              expected_analyze_input_columns)
        self.assertCountEqual(transform_input_columns,
                              expected_transform_input_columns)
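
For reference, the two helpers exercised above take a preprocessing function
plus its input specs and return the names of the input columns that stage
actually reads. A minimal usage sketch mirroring the 'identity' case (the
identity function below is illustrative, and the import path is assumed):

import tensorflow as tf
from tensorflow_transform import inspect_preprocessing_fn

_SPECS = {
    'x': tf.io.FixedLenFeature([], tf.float32),
    'y': tf.io.FixedLenFeature([], tf.float32),
}

def _identity_fn(inputs):
    # Pure tensor-to-tensor ops: nothing needs to be read at analysis time.
    return {'x_out': inputs['x'], 'y_out': inputs['y']}

analyze_cols = inspect_preprocessing_fn.get_analyze_input_columns(
    _identity_fn, _SPECS, True)  # Expected: []
transform_cols = inspect_preprocessing_fn.get_transform_input_columns(
    _identity_fn, _SPECS, True)  # Expected: ['x', 'y']
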
Example #3
class ImplHelperTest(test_case.TransformTestCase):
    def test_batched_placeholders_from_feature_spec(self):
        feature_spec = {
            'fixed_len_float': tf.io.FixedLenFeature([2, 3], tf.float32),
            'fixed_len_string': tf.io.FixedLenFeature([], tf.string),
            '_var_len_underscored': tf.io.VarLenFeature(tf.string),
            'var_len_int': tf.io.VarLenFeature(tf.int64)
        }
        with tf.compat.v1.Graph().as_default():
            features = impl_helper.batched_placeholders_from_specs(
                feature_spec)
        self.assertCountEqual(features.keys(), [
            'fixed_len_float', 'fixed_len_string', 'var_len_int',
            '_var_len_underscored'
        ])
        self.assertEqual(type(features['fixed_len_float']), tf.Tensor)
        self.assertEqual(features['fixed_len_float'].get_shape().as_list(),
                         [None, 2, 3])
        self.assertEqual(type(features['fixed_len_string']), tf.Tensor)
        self.assertEqual(features['fixed_len_string'].get_shape().as_list(),
                         [None])
        self.assertEqual(type(features['var_len_int']), tf.SparseTensor)
        self.assertEqual(features['var_len_int'].get_shape().as_list(),
                         [None, None])
        self.assertEqual(type(features['_var_len_underscored']),
                         tf.SparseTensor)
        self.assertEqual(
            features['_var_len_underscored'].get_shape().as_list(),
            [None, None])

    def test_batched_placeholders_from_typespecs(self):
        typespecs = {
            'dense_float':
            tf.TensorSpec(dtype=tf.float32, shape=[None, 2, 3]),
            'dense_string':
            tf.TensorSpec(shape=[None], dtype=tf.string),
            '_sparse_underscored':
            tf.SparseTensorSpec(dtype=tf.string, shape=[None, None]),
            'ragged_string':
            tf.RaggedTensorSpec(dtype=tf.string,
                                ragged_rank=1,
                                shape=[None, None]),
            'ragged_multi_dimension':
            tf.RaggedTensorSpec(dtype=tf.int64,
                                ragged_rank=3,
                                shape=[None, None, None, None, 5]),
        }
        with tf.compat.v1.Graph().as_default():
            features = impl_helper.batched_placeholders_from_specs(typespecs)
        self.assertCountEqual(features.keys(), [
            'dense_float',
            'dense_string',
            '_sparse_underscored',
            'ragged_string',
            'ragged_multi_dimension',
        ])
        self.assertEqual(type(features['dense_float']), tf.Tensor)
        self.assertEqual(features['dense_float'].get_shape().as_list(),
                         [None, 2, 3])
        self.assertEqual(features['dense_float'].dtype, tf.float32)

        self.assertEqual(type(features['dense_string']), tf.Tensor)
        self.assertEqual(features['dense_string'].get_shape().as_list(),
                         [None])
        self.assertEqual(features['dense_string'].dtype, tf.string)

        self.assertEqual(type(features['_sparse_underscored']),
                         tf.SparseTensor)
        self.assertEqual(features['_sparse_underscored'].get_shape().as_list(),
                         [None, None])
        self.assertEqual(features['_sparse_underscored'].dtype, tf.string)

        self.assertEqual(type(features['ragged_string']), tf.RaggedTensor)
        self.assertEqual(features['ragged_string'].shape.as_list(),
                         [None, None])
        self.assertEqual(features['ragged_string'].ragged_rank, 1)
        self.assertEqual(features['ragged_string'].dtype, tf.string)

        self.assertEqual(type(features['ragged_multi_dimension']),
                         tf.RaggedTensor)
        self.assertEqual(features['ragged_multi_dimension'].shape.as_list(),
                         [None, None, None, None, 5])
        self.assertEqual(features['ragged_multi_dimension'].ragged_rank, 3)
        self.assertEqual(features['ragged_multi_dimension'].dtype, tf.int64)

    def test_batched_placeholders_from_specs_invalid_dtype(self):
        with self.assertRaisesRegexp(ValueError, 'had invalid dtype'):
            impl_helper.batched_placeholders_from_specs(
                {'f': tf.TensorSpec(dtype=tf.int32, shape=[None])})
        with self.assertRaisesRegexp(ValueError, 'had invalid dtype'):
            impl_helper.batched_placeholders_from_specs(
                {'f': tf.io.FixedLenFeature(dtype=tf.int32, shape=[None])})

    def test_batched_placeholders_from_specs_invalid_mixing(self):
        with self.assertRaisesRegexp(TypeError, 'Specs must be all'):
            impl_helper.batched_placeholders_from_specs({
                'f1':
                tf.TensorSpec(dtype=tf.int64, shape=[None]),
                'f2':
                tf.io.FixedLenFeature(dtype=tf.int64, shape=[None]),
            })

    @test_case.named_parameters(*test_case.cross_named_parameters(
        _ROUNDTRIP_CASES, [
            dict(testcase_name='eager_tensors', feed_eager_tensors=True),
            dict(testcase_name='session_run_values', feed_eager_tensors=False)
        ]))
    def test_to_instance_dicts(self, feature_spec, instances, feed_dict,
                               feed_eager_tensors):
        if feed_eager_tensors:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')
        schema = schema_utils.schema_from_feature_spec(feature_spec)
        feed_dict_local = copy.copy(feed_dict)
        if feed_eager_tensors:
            for key, value in six.iteritems(feed_dict_local):
                if isinstance(value, tf.compat.v1.SparseTensorValue):
                    feed_dict_local[key] = tf.sparse.SparseTensor.from_value(
                        value)
                else:
                    feed_dict_local[key] = tf.constant(value)
        np.testing.assert_equal(
            instances, impl_helper.to_instance_dicts(schema, feed_dict_local))
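        # (to_instance_dicts is the inverse of batching: it splits a batched
        # feed dict back into a list of per-instance feature dicts, which is
        # why the result is compared against `instances` directly.)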

    @test_case.named_parameters(*_TO_INSTANCE_DICT_ERROR_CASES)
    def test_to_instance_dicts_error(self,
                                     feature_spec,
                                     feed_dict,
                                     error_msg,
                                     error_type=ValueError):
        schema = schema_utils.schema_from_feature_spec(feature_spec)
        with self.assertRaisesRegexp(error_type, error_msg):
            impl_helper.to_instance_dicts(schema, feed_dict)

    @test_case.named_parameters(
        dict(testcase_name='tf_compat_v1', force_tf_compat_v1=True),
        dict(testcase_name='native_tf2', force_tf_compat_v1=False))
    def test_analyze_in_place(self, force_tf_compat_v1):
        if not force_tf_compat_v1:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')

        def preprocessing_fn(inputs):
            return {'x_add_1': inputs['x'] + 1}

        feature_spec = {'x': tf.io.FixedLenFeature([], tf.int64)}
        type_spec = {
            'x': tf.TensorSpec(dtype=tf.int64, shape=[
                None,
            ])
        }
        output_path = os.path.join(self.get_temp_dir(), self._testMethodName)
        impl_helper.analyze_in_place(preprocessing_fn, force_tf_compat_v1,
                                     feature_spec, type_spec, output_path)

        tft_output = TFTransformOutput(output_path)
        expected_value = np.array([2], dtype=np.int64)
        if force_tf_compat_v1:
            with tf.Graph().as_default() as graph:
                with tf.compat.v1.Session(graph=graph).as_default():
                    transformed_features = tft_output.transform_raw_features(
                        {'x': tf.constant([1], dtype=tf.int64)})
                    transformed_value = transformed_features['x_add_1'].eval()
        else:
            transformed_features = tft_output.transform_raw_features(
                {'x': tf.constant([1], dtype=tf.int64)})
            transformed_value = transformed_features['x_add_1'].numpy()
        self.assertEqual(transformed_value, expected_value)

        transformed_feature_spec = tft_output.transformed_feature_spec()
        expected_feature_spec = {
            'x_add_1': tf.io.FixedLenFeature([], tf.int64)
        }
        self.assertEqual(transformed_feature_spec, expected_feature_spec)

    @test_case.named_parameters(
        dict(testcase_name='tf_compat_v1', force_tf_compat_v1=True),
        dict(testcase_name='native_tf2', force_tf_compat_v1=False))
    def test_analyze_in_place_with_analyzers_raises_error(
            self, force_tf_compat_v1):
        if not force_tf_compat_v1:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')

        def preprocessing_fn(inputs):
            return {'x_add_1': analyzers.mean(inputs['x'])}

        feature_spec = {'x': tf.io.FixedLenFeature([], tf.int64)}
        type_spec = {
            'x': tf.TensorSpec(dtype=tf.int64, shape=[
                None,
            ])
        }
        output_path = os.path.join(self.get_temp_dir(), self._testMethodName)
        with self.assertRaisesRegexp(RuntimeError,
                                     'analyzers found when tracing'):
            impl_helper.analyze_in_place(preprocessing_fn, force_tf_compat_v1,
                                         feature_spec, type_spec, output_path)
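
# The second ImplHelperTest variant below calls `_eager_tensor_from_values`,
# a helper not shown on this page. A plausible sketch, assuming it converts
# session-run values (including SparseTensorValue) into eager tensors the
# same way the first variant does inline in its test_to_instance_dicts:
import tensorflow as tf

def _eager_tensor_from_values(feed_dict):
    result = {}
    for key, value in feed_dict.items():
        if isinstance(value, tf.compat.v1.SparseTensorValue):
            result[key] = tf.sparse.SparseTensor.from_value(value)
        else:
            result[key] = tf.constant(value)
    return result
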
class ImplHelperTest(test_case.TransformTestCase):
    def test_batched_placeholders_from_feature_spec(self):
        feature_spec = {
            'fixed_len_float':
            tf.io.FixedLenFeature([2, 3], tf.float32),
            'fixed_len_string':
            tf.io.FixedLenFeature([], tf.string),
            '_var_len_underscored':
            tf.io.VarLenFeature(tf.string),
            'var_len_int':
            tf.io.VarLenFeature(tf.int64),
            'sparse_1d':
            tf.io.SparseFeature('1d_idx', '1d_value', tf.int64, 7),
            'sparse_2d':
            tf.io.SparseFeature(['2d_idx0', '2d_idx1'], '2d_value', tf.int64,
                                [2, 17]),
        }
        with tf.compat.v1.Graph().as_default():
            features = impl_helper.batched_placeholders_from_specs(
                feature_spec)
        self.assertCountEqual(features.keys(), [
            'fixed_len_float',
            'fixed_len_string',
            'var_len_int',
            '_var_len_underscored',
            'sparse_1d',
            'sparse_2d',
        ])
        self.assertEqual(type(features['fixed_len_float']), tf.Tensor)
        self.assertEqual(features['fixed_len_float'].get_shape().as_list(),
                         [None, 2, 3])
        self.assertEqual(type(features['fixed_len_string']), tf.Tensor)
        self.assertEqual(features['fixed_len_string'].get_shape().as_list(),
                         [None])
        self.assertEqual(type(features['var_len_int']), tf.SparseTensor)
        self.assertEqual(features['var_len_int'].get_shape().as_list(),
                         [None, None])
        self.assertEqual(type(features['_var_len_underscored']),
                         tf.SparseTensor)
        self.assertEqual(
            features['_var_len_underscored'].get_shape().as_list(),
            [None, None])
        self.assertEqual(type(features['sparse_1d']), tf.SparseTensor)
        self.assertEqual(type(features['sparse_2d']), tf.SparseTensor)
        if version.parse(tf.__version__) >= version.parse('2'):
            self.assertEqual(features['sparse_1d'].get_shape().as_list(),
                             [None, 7])
            self.assertEqual(features['sparse_2d'].get_shape().as_list(),
                             [None, 2, 17])
        else:
            self.assertEqual(features['sparse_1d'].get_shape().as_list(),
                             [None, None])
            self.assertEqual(features['sparse_2d'].get_shape().as_list(),
                             [None, None, None])

    def test_batched_placeholders_from_typespecs(self):
        typespecs = {
            'dense_float':
            tf.TensorSpec(dtype=tf.float32, shape=[None, 2, 3]),
            'dense_string':
            tf.TensorSpec(shape=[None], dtype=tf.string),
            '_sparse_underscored':
            tf.SparseTensorSpec(dtype=tf.string, shape=[None, None, 17]),
            'ragged_string':
            tf.RaggedTensorSpec(dtype=tf.string,
                                ragged_rank=1,
                                shape=[None, None]),
            'ragged_multi_dimension':
            tf.RaggedTensorSpec(dtype=tf.int64,
                                ragged_rank=3,
                                shape=[None, None, None, None, 5]),
        }
        with tf.compat.v1.Graph().as_default():
            features = impl_helper.batched_placeholders_from_specs(typespecs)
        self.assertCountEqual(features.keys(), [
            'dense_float',
            'dense_string',
            '_sparse_underscored',
            'ragged_string',
            'ragged_multi_dimension',
        ])
        self.assertEqual(type(features['dense_float']), tf.Tensor)
        self.assertEqual(features['dense_float'].get_shape().as_list(),
                         [None, 2, 3])
        self.assertEqual(features['dense_float'].dtype, tf.float32)

        self.assertEqual(type(features['dense_string']), tf.Tensor)
        self.assertEqual(features['dense_string'].get_shape().as_list(),
                         [None])
        self.assertEqual(features['dense_string'].dtype, tf.string)

        self.assertEqual(type(features['_sparse_underscored']),
                         tf.SparseTensor)
        # TODO(zoyahav): Change last dimension size to 17 once SparseTensors
        # propagate static dense_shape from typespec correctly.
        self.assertEqual(features['_sparse_underscored'].get_shape().as_list(),
                         [None, None, None])
        self.assertEqual(features['_sparse_underscored'].dtype, tf.string)

        self.assertEqual(type(features['ragged_string']), tf.RaggedTensor)
        self.assertEqual(features['ragged_string'].shape.as_list(),
                         [None, None])
        self.assertEqual(features['ragged_string'].ragged_rank, 1)
        self.assertEqual(features['ragged_string'].dtype, tf.string)

        self.assertEqual(type(features['ragged_multi_dimension']),
                         tf.RaggedTensor)
        self.assertEqual(features['ragged_multi_dimension'].shape.as_list(),
                         [None, None, None, None, 5])
        self.assertEqual(features['ragged_multi_dimension'].ragged_rank, 3)
        self.assertEqual(features['ragged_multi_dimension'].dtype, tf.int64)

    def test_batched_placeholders_from_specs_invalid_dtype(self):
        with self.assertRaisesRegexp(ValueError, 'had invalid dtype'):
            impl_helper.batched_placeholders_from_specs(
                {'f': tf.TensorSpec(dtype=tf.int32, shape=[None])})
        with self.assertRaisesRegexp(ValueError, 'had invalid dtype'):
            impl_helper.batched_placeholders_from_specs(
                {'f': tf.io.FixedLenFeature(dtype=tf.int32, shape=[None])})

    def test_batched_placeholders_from_specs_invalid_mixing(self):
        with self.assertRaisesRegexp(TypeError, 'Specs must be all'):
            impl_helper.batched_placeholders_from_specs({
                'f1':
                tf.TensorSpec(dtype=tf.int64, shape=[None]),
                'f2':
                tf.io.FixedLenFeature(dtype=tf.int64, shape=[None]),
            })

    @test_case.named_parameters(*test_case.cross_named_parameters(
        _ROUNDTRIP_CASES, [
            dict(testcase_name='eager_tensors', feed_eager_tensors=True),
            dict(testcase_name='session_run_values', feed_eager_tensors=False)
        ]))
    def test_to_instance_dicts(self, feature_spec, instances, record_batch,
                               feed_dict, feed_eager_tensors):
        del record_batch
        if feed_eager_tensors:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')
        schema = schema_utils.schema_from_feature_spec(feature_spec)
        feed_dict_local = (_eager_tensor_from_values(feed_dict)
                           if feed_eager_tensors else copy.copy(feed_dict))
        result = impl_helper.to_instance_dicts(schema, feed_dict_local)
        np.testing.assert_equal(instances, result)

    @test_case.named_parameters(*_TO_INSTANCE_DICT_ERROR_CASES)
    def test_to_instance_dicts_error(self,
                                     feature_spec,
                                     feed_dict,
                                     error_msg,
                                     error_type=ValueError):
        schema = schema_utils.schema_from_feature_spec(feature_spec)
        with self.assertRaisesRegexp(error_type, error_msg):
            impl_helper.to_instance_dicts(schema, feed_dict)

    @test_case.named_parameters(*test_case.cross_named_parameters(
        _ROUNDTRIP_CASES, [
            dict(testcase_name='eager_tensors', feed_eager_tensors=True),
            dict(testcase_name='session_run_values', feed_eager_tensors=False)
        ]))
    def test_convert_to_arrow(self, feature_spec, instances, record_batch,
                              feed_dict, feed_eager_tensors):
        del instances
        if feed_eager_tensors:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')
        schema = schema_utils.schema_from_feature_spec(feature_spec)
        converter = impl_helper.make_tensor_to_arrow_converter(schema)
        feed_dict_local = (_eager_tensor_from_values(feed_dict)
                           if feed_eager_tensors else copy.copy(feed_dict))
        arrow_columns, arrow_schema = impl_helper.convert_to_arrow(
            schema, converter, feed_dict_local)
        actual = pa.RecordBatch.from_arrays(arrow_columns, schema=arrow_schema)
        expected = pa.RecordBatch.from_arrays(list(record_batch.values()),
                                              names=list(record_batch.keys()))
        np.testing.assert_equal(actual.to_pydict(), expected.to_pydict())

    @test_case.named_parameters(*_CONVERT_TO_ARROW_ERROR_CASES)
    def test_convert_to_arrow_error(self,
                                    feature_spec,
                                    feed_dict,
                                    error_msg,
                                    error_type=ValueError):
        schema = schema_utils.schema_from_feature_spec(feature_spec)
        converter = impl_helper.make_tensor_to_arrow_converter(schema)
        with self.assertRaisesRegexp(error_type, error_msg):
            impl_helper.convert_to_arrow(schema, converter, feed_dict)

    @test_case.named_parameters(
        dict(testcase_name='tf_compat_v1', force_tf_compat_v1=True),
        dict(testcase_name='native_tf2', force_tf_compat_v1=False))
    def test_analyze_in_place(self, force_tf_compat_v1):
        if not force_tf_compat_v1:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')

        def preprocessing_fn(inputs):
            return {'x_add_1': inputs['x'] + 1}

        feature_spec = {'x': tf.io.FixedLenFeature([], tf.int64)}
        type_spec = {
            'x': tf.TensorSpec(dtype=tf.int64, shape=[
                None,
            ])
        }
        output_path = os.path.join(self.get_temp_dir(), self._testMethodName)
        impl_helper.analyze_in_place(preprocessing_fn, force_tf_compat_v1,
                                     feature_spec, type_spec, output_path)

        tft_output = TFTransformOutput(output_path)
        expected_value = np.array([2], dtype=np.int64)
        if force_tf_compat_v1:
            with tf.Graph().as_default() as graph:
                with tf.compat.v1.Session(graph=graph).as_default():
                    transformed_features = tft_output.transform_raw_features(
                        {'x': tf.constant([1], dtype=tf.int64)})
                    transformed_value = transformed_features['x_add_1'].eval()
        else:
            transformed_features = tft_output.transform_raw_features(
                {'x': tf.constant([1], dtype=tf.int64)})
            transformed_value = transformed_features['x_add_1'].numpy()
        self.assertEqual(transformed_value, expected_value)

        transformed_feature_spec = tft_output.transformed_feature_spec()
        expected_feature_spec = {
            'x_add_1': tf.io.FixedLenFeature([], tf.int64)
        }
        self.assertEqual(transformed_feature_spec, expected_feature_spec)

    @test_case.named_parameters(
        dict(testcase_name='tf_compat_v1', force_tf_compat_v1=True),
        dict(testcase_name='native_tf2', force_tf_compat_v1=False))
    def test_analyze_in_place_with_analyzers_raises_error(
            self, force_tf_compat_v1):
        if not force_tf_compat_v1:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')

        def preprocessing_fn(inputs):
            return {'x_add_1': analyzers.mean(inputs['x'])}

        feature_spec = {'x': tf.io.FixedLenFeature([], tf.int64)}
        type_spec = {
            'x': tf.TensorSpec(dtype=tf.int64, shape=[
                None,
            ])
        }
        output_path = os.path.join(self.get_temp_dir(), self._testMethodName)
        with self.assertRaisesRegexp(RuntimeError,
                                     'analyzers found when tracing'):
            impl_helper.analyze_in_place(preprocessing_fn, force_tf_compat_v1,
                                         feature_spec, type_spec, output_path)

    @test_case.named_parameters(
        dict(testcase_name='_3d',
             sparse_value=tf.compat.v1.SparseTensorValue(
                 indices=np.array([[0, 0, 1], [0, 1, 2], [1, 1, 1]]),
                 values=np.array([0, 1, 2]),
                 dense_shape=np.array([2, 2, 3])),
             expected_indices=[[np.array([0, 1]),
                                np.array([1, 2])],
                               [np.array([1]), np.array([1])]],
             expected_values=[np.array([0, 1]),
                              np.array([2])]),
        dict(testcase_name='_4d',
             sparse_value=tf.compat.v1.SparseTensorValue(
                 indices=np.array([[0, 0, 0, 1], [0, 1, 0, 2], [1, 1, 1, 1]]),
                 values=np.array([0, 1, 2]),
                 dense_shape=np.array([2, 2, 2, 3])),
             expected_indices=[[
                 np.array([0, 1]),
                 np.array([0, 0]),
                 np.array([1, 2])
             ], [np.array([1]), np.array([1]),
                 np.array([1])]],
             expected_values=[np.array([0, 1]),
                              np.array([2])]),
    )
    def test_decompose_sparse_batch(self, sparse_value, expected_indices,
                                    expected_values):
        indices, values = impl_helper._decompose_sparse_batch(sparse_value)
        self.assertLen(indices, len(expected_indices))
        self.assertLen(values, len(expected_values))
        for idx, (a, b) in enumerate(zip(expected_indices, indices)):
            self.assertAllEqual(
                a, b, 'Indices are different at index {}'.format(idx))
        for idx, (a, b) in enumerate(zip(expected_values, values)):
            self.assertAllEqual(a, b,
                                'Values are different at index {}'.format(idx))

    def test_get_num_values_per_instance_in_sparse_batch(self):
        batch_indices = np.array([[idx % 4, 0, 1, 2] for idx in range(100)])
        num_values = impl_helper._get_num_values_per_instance_in_sparse_batch(
            batch_indices, 27)
        expected_num_values = [25, 25, 25, 25] + [0] * 23
        self.assertEqual(expected_num_values, num_values)

    @test_case.named_parameters(
        dict(
            testcase_name='_3d',
            ragged_tensor=tf.compat.v1.ragged.RaggedTensorValue(
                values=tf.compat.v1.ragged.RaggedTensorValue(
                    values=tf.compat.v1.ragged.RaggedTensorValue(
                        values=np.array([10., 20., 30.]),
                        row_splits=np.array([0, 0, 1, 3])),  # row_lengths2
                    row_splits=np.array([0, 1, 1, 3])),  # row_lengths1
                row_splits=np.array([0, 2, 3])),  # batch dimension
            # pytype: disable=attribute-error
            spec=tf.io.RaggedFeature(  # pylint: disable=g-long-ternary
                tf.float32,
                value_key='ragged_3d_val',
                partitions=[
                    tf.io.RaggedFeature.RowLengths('ragged_3d_row_lengths1'),
                    tf.io.RaggedFeature.RowLengths('ragged_3d_row_lengths2'),
                ]) if common_types.is_ragged_feature_available() else None,
            # pytype: enable=attribute-error
            expected_components={
                'ragged_3d_val':
                [np.array([], dtype=np.float32),
                 np.array([10., 20., 30.])],
                'ragged_3d_row_lengths1': [np.array([1, 0]),
                                           np.array([2])],
                'ragged_3d_row_lengths2': [np.array([0]),
                                           np.array([1, 2])],
            },
        ),
        dict(
            testcase_name='_4d',
            ragged_tensor=tf.compat.v1.ragged.RaggedTensorValue(
                values=tf.compat.v1.ragged.RaggedTensorValue(
                    values=tf.compat.v1.ragged.RaggedTensorValue(
                        values=tf.compat.v1.ragged.RaggedTensorValue(
                            values=np.array([b'a', b'b', b'c', b'd']),
                            row_splits=np.array([0, 1, 1, 3,
                                                 4])),  # row_lengths3
                        row_splits=np.array([0, 2, 2, 4])),  # row_lengths2
                    row_splits=np.array([0, 1, 1, 3])),  # row_lengths1
                row_splits=np.array([0, 2, 2, 3])),  # batch dimension
            # pytype: disable=attribute-error
            spec=tf.io.RaggedFeature(  # pylint: disable=g-long-ternary
                tf.float32,
                value_key='ragged_4d_val',
                partitions=[
                    tf.io.RaggedFeature.RowLengths('ragged_4d_row_lengths1'),
                    tf.io.RaggedFeature.RowLengths('ragged_4d_row_lengths2'),
                    tf.io.RaggedFeature.RowLengths('ragged_4d_row_lengths3'),
                ]) if common_types.is_ragged_feature_available() else None,
            # pytype: enable=attribute-error
            expected_components={
                'ragged_4d_val': [
                    np.array([b'a']),
                    np.array([], dtype=object),
                    np.array([b'b', b'c', b'd'])
                ],
                'ragged_4d_row_lengths1':
                [np.array([1, 0]),
                 np.array([]), np.array([2])],
                'ragged_4d_row_lengths2':
                [np.array([2]), np.array([]),
                 np.array([0, 2])],
                'ragged_4d_row_lengths3':
                [np.array([1, 0]),
                 np.array([]),
                 np.array([2, 1])],
            },
        ))
    def test_handle_ragged_batch(self, ragged_tensor, spec,
                                 expected_components):
        test_case.skip_if_not_tf2('RaggedFeature is not available in TF 1.x')
        result = impl_helper._handle_ragged_batch(ragged_tensor,
                                                  spec,
                                                  name='ragged')
        np.testing.assert_equal(result, expected_components)
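
The `_decompose_sparse_batch` expectations exercised earlier in this example
amount to splitting a batched SparseTensorValue along its first (batch)
dimension. A minimal sketch under that reading, written from the test data
rather than the library's private helper:

import numpy as np

def _decompose_sparse_batch_sketch(sparse_value):
    indices = np.asarray(sparse_value.indices)
    values = np.asarray(sparse_value.values)
    instance_indices, instance_values = [], []
    for i in range(int(sparse_value.dense_shape[0])):
        mask = indices[:, 0] == i
        rest = indices[mask][:, 1:]  # Drop the batch dimension.
        # One array per remaining sparse dimension, matching the test's
        # expected_indices layout.
        instance_indices.append([rest[:, d] for d in range(rest.shape[1])])
        instance_values.append(values[mask])
    return instance_indices, instance_values
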
Example #5
class ImplHelperTest(test_case.TransformTestCase):
    def test_batched_placeholders_from_feature_spec(self):
        feature_spec = {
            'fixed_len_float':
            tf.io.FixedLenFeature([2, 3], tf.float32),
            'fixed_len_string':
            tf.io.FixedLenFeature([], tf.string),
            '_var_len_underscored':
            tf.io.VarLenFeature(tf.string),
            'var_len_int':
            tf.io.VarLenFeature(tf.int64),
            'sparse_1d':
            tf.io.SparseFeature('1d_idx', '1d_value', tf.int64, 7),
            'sparse_2d':
            tf.io.SparseFeature(['2d_idx0', '2d_idx1'], '2d_value', tf.int64,
                                [2, 17]),
        }
        with tf.compat.v1.Graph().as_default():
            features = impl_helper.batched_placeholders_from_specs(
                feature_spec)
        self.assertCountEqual(features.keys(), [
            'fixed_len_float',
            'fixed_len_string',
            'var_len_int',
            '_var_len_underscored',
            'sparse_1d',
            'sparse_2d',
        ])
        self.assertEqual(type(features['fixed_len_float']), tf.Tensor)
        self.assertEqual(features['fixed_len_float'].get_shape().as_list(),
                         [None, 2, 3])
        self.assertEqual(type(features['fixed_len_string']), tf.Tensor)
        self.assertEqual(features['fixed_len_string'].get_shape().as_list(),
                         [None])
        self.assertEqual(type(features['var_len_int']), tf.SparseTensor)
        self.assertEqual(features['var_len_int'].get_shape().as_list(),
                         [None, None])
        self.assertEqual(type(features['_var_len_underscored']),
                         tf.SparseTensor)
        self.assertEqual(
            features['_var_len_underscored'].get_shape().as_list(),
            [None, None])
        self.assertEqual(type(features['sparse_1d']), tf.SparseTensor)
        self.assertEqual(type(features['sparse_2d']), tf.SparseTensor)
        if version.parse(tf.__version__) >= version.parse('2'):
            self.assertEqual(features['sparse_1d'].get_shape().as_list(),
                             [None, 7])
            self.assertEqual(features['sparse_2d'].get_shape().as_list(),
                             [None, 2, 17])
        else:
            self.assertEqual(features['sparse_1d'].get_shape().as_list(),
                             [None, None])
            self.assertEqual(features['sparse_2d'].get_shape().as_list(),
                             [None, None, None])

    def test_batched_placeholders_from_typespecs(self):
        typespecs = {
            'dense_float':
            tf.TensorSpec(dtype=tf.float32, shape=[None, 2, 3]),
            'dense_string':
            tf.TensorSpec(shape=[None], dtype=tf.string),
            '_sparse_underscored':
            tf.SparseTensorSpec(dtype=tf.string, shape=[None, None, 17]),
            'ragged_string':
            tf.RaggedTensorSpec(dtype=tf.string,
                                ragged_rank=1,
                                shape=[None, None]),
            'ragged_multi_dimension':
            tf.RaggedTensorSpec(dtype=tf.int64,
                                ragged_rank=3,
                                shape=[None, None, None, None, 5]),
        }
        with tf.compat.v1.Graph().as_default():
            features = impl_helper.batched_placeholders_from_specs(typespecs)
        self.assertCountEqual(features.keys(), [
            'dense_float',
            'dense_string',
            '_sparse_underscored',
            'ragged_string',
            'ragged_multi_dimension',
        ])
        self.assertEqual(type(features['dense_float']), tf.Tensor)
        self.assertEqual(features['dense_float'].get_shape().as_list(),
                         [None, 2, 3])
        self.assertEqual(features['dense_float'].dtype, tf.float32)

        self.assertEqual(type(features['dense_string']), tf.Tensor)
        self.assertEqual(features['dense_string'].get_shape().as_list(),
                         [None])
        self.assertEqual(features['dense_string'].dtype, tf.string)

        self.assertEqual(type(features['_sparse_underscored']),
                         tf.SparseTensor)
        # TODO(zoyahav): Change last dimension size to 17 once SparseTensors
        # propagate static dense_shape from typespec correctly.
        self.assertEqual(features['_sparse_underscored'].get_shape().as_list(),
                         [None, None, None])
        self.assertEqual(features['_sparse_underscored'].dtype, tf.string)

        self.assertEqual(type(features['ragged_string']), tf.RaggedTensor)
        self.assertEqual(features['ragged_string'].shape.as_list(),
                         [None, None])
        self.assertEqual(features['ragged_string'].ragged_rank, 1)
        self.assertEqual(features['ragged_string'].dtype, tf.string)

        self.assertEqual(type(features['ragged_multi_dimension']),
                         tf.RaggedTensor)
        self.assertEqual(features['ragged_multi_dimension'].shape.as_list(),
                         [None, None, None, None, 5])
        self.assertEqual(features['ragged_multi_dimension'].ragged_rank, 3)
        self.assertEqual(features['ragged_multi_dimension'].dtype, tf.int64)

    def test_batched_placeholders_from_specs_invalid_dtype(self):
        with self.assertRaisesRegexp(ValueError, 'had invalid dtype'):
            impl_helper.batched_placeholders_from_specs(
                {'f': tf.TensorSpec(dtype=tf.int32, shape=[None])})
        with self.assertRaisesRegexp(ValueError, 'had invalid dtype'):
            impl_helper.batched_placeholders_from_specs(
                {'f': tf.io.FixedLenFeature(dtype=tf.int32, shape=[None])})

    def test_batched_placeholders_from_specs_invalid_mixing(self):
        with self.assertRaisesRegexp(TypeError, 'Specs must be all'):
            impl_helper.batched_placeholders_from_specs({
                'f1':
                tf.TensorSpec(dtype=tf.int64, shape=[None]),
                'f2':
                tf.io.FixedLenFeature(dtype=tf.int64, shape=[None]),
            })

    @test_case.named_parameters(*test_case.cross_named_parameters(
        _ROUNDTRIP_CASES, [
            dict(testcase_name='eager_tensors', feed_eager_tensors=True),
            dict(testcase_name='session_run_values', feed_eager_tensors=False)
        ]))
    def test_to_instance_dicts(self, feature_spec, instances, feed_dict,
                               feed_eager_tensors):
        if feed_eager_tensors:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')
        schema = schema_utils.schema_from_feature_spec(feature_spec)
        feed_dict_local = copy.copy(feed_dict)
        if feed_eager_tensors:
            for key, value in six.iteritems(feed_dict_local):
                if isinstance(value, tf.compat.v1.SparseTensorValue):
                    feed_dict_local[key] = tf.sparse.SparseTensor.from_value(
                        value)
                else:
                    feed_dict_local[key] = tf.constant(value)
        result = impl_helper.to_instance_dicts(schema, feed_dict_local)
        np.testing.assert_equal(instances, result)

    @test_case.named_parameters(*_TO_INSTANCE_DICT_ERROR_CASES)
    def test_to_instance_dicts_error(self,
                                     feature_spec,
                                     feed_dict,
                                     error_msg,
                                     error_type=ValueError):
        schema = schema_utils.schema_from_feature_spec(feature_spec)
        with self.assertRaisesRegexp(error_type, error_msg):
            impl_helper.to_instance_dicts(schema, feed_dict)

    @test_case.named_parameters(*test_case.cross_named_parameters(
        _ROUNDTRIP_CASES, [
            dict(testcase_name='eager_tensors', feed_eager_tensors=True),
            dict(testcase_name='session_run_values', feed_eager_tensors=False)
        ]))
    def test_convert_to_arrow(self, feature_spec, instances, feed_dict,
                              feed_eager_tensors):
        if feed_eager_tensors:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')
        schema = schema_utils.schema_from_feature_spec(feature_spec)
        converter = impl_helper.make_tensor_to_arrow_converter(schema)
        feed_dict_local = copy.copy(feed_dict)
        if feed_eager_tensors:
            for key, value in six.iteritems(feed_dict_local):
                if isinstance(value, tf.compat.v1.SparseTensorValue):
                    feed_dict_local[key] = tf.sparse.SparseTensor.from_value(
                        value)
                else:
                    feed_dict_local[key] = tf.constant(value)
        arrow_columns, arrow_schema = impl_helper.convert_to_arrow(
            schema, converter, feed_dict_local)
        record_batch = pa.RecordBatch.from_arrays(arrow_columns, arrow_schema)

        # Merge and flatten expected instance dicts.
        expected = collections.defaultdict(list)
        for instance_dict in instances:
            for key, value in instance_dict.items():
                expected[key].append(np.ravel(value))
        actual = record_batch.to_pydict()
        self.assertEqual(len(actual), len(expected))
        for key, expected_value in expected.items():
            # Floating-point error breaks exact equality for some float
            # values, while approximate equality testing fails on strings.
            if np.issubdtype(expected_value[0].dtype, np.number):
                self.assertAllClose(actual[key], expected_value)
            else:
                np.testing.assert_equal(actual[key], expected_value)

    @test_case.named_parameters(*_CONVERT_TO_ARROW_ERROR_CASES)
    def test_convert_to_arrow_error(self,
                                    feature_spec,
                                    feed_dict,
                                    error_msg,
                                    error_type=ValueError):
        schema = schema_utils.schema_from_feature_spec(feature_spec)
        converter = impl_helper.make_tensor_to_arrow_converter(schema)
        with self.assertRaisesRegexp(error_type, error_msg):
            impl_helper.convert_to_arrow(schema, converter, feed_dict)

    @test_case.named_parameters(
        dict(testcase_name='tf_compat_v1', force_tf_compat_v1=True),
        dict(testcase_name='native_tf2', force_tf_compat_v1=False))
    def test_analyze_in_place(self, force_tf_compat_v1):
        if not force_tf_compat_v1:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')

        def preprocessing_fn(inputs):
            return {'x_add_1': inputs['x'] + 1}

        feature_spec = {'x': tf.io.FixedLenFeature([], tf.int64)}
        type_spec = {
            'x': tf.TensorSpec(dtype=tf.int64, shape=[
                None,
            ])
        }
        output_path = os.path.join(self.get_temp_dir(), self._testMethodName)
        impl_helper.analyze_in_place(preprocessing_fn, force_tf_compat_v1,
                                     feature_spec, type_spec, output_path)

        tft_output = TFTransformOutput(output_path)
        expected_value = np.array([2], dtype=np.int64)
        if force_tf_compat_v1:
            with tf.Graph().as_default() as graph:
                with tf.compat.v1.Session(graph=graph).as_default():
                    transformed_features = tft_output.transform_raw_features(
                        {'x': tf.constant([1], dtype=tf.int64)})
                    transformed_value = transformed_features['x_add_1'].eval()
        else:
            transformed_features = tft_output.transform_raw_features(
                {'x': tf.constant([1], dtype=tf.int64)})
            transformed_value = transformed_features['x_add_1'].numpy()
        self.assertEqual(transformed_value, expected_value)

        transformed_feature_spec = tft_output.transformed_feature_spec()
        expected_feature_spec = {
            'x_add_1': tf.io.FixedLenFeature([], tf.int64)
        }
        self.assertEqual(transformed_feature_spec, expected_feature_spec)

    @test_case.named_parameters(
        dict(testcase_name='tf_compat_v1', force_tf_compat_v1=True),
        dict(testcase_name='native_tf2', force_tf_compat_v1=False))
    def test_analyze_in_place_with_analyzers_raises_error(
            self, force_tf_compat_v1):
        if not force_tf_compat_v1:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')

        def preprocessing_fn(inputs):
            return {'x_add_1': analyzers.mean(inputs['x'])}

        feature_spec = {'x': tf.io.FixedLenFeature([], tf.int64)}
        type_spec = {
            'x': tf.TensorSpec(dtype=tf.int64, shape=[
                None,
            ])
        }
        output_path = os.path.join(self.get_temp_dir(), self._testMethodName)
        with self.assertRaisesRegexp(RuntimeError,
                                     'analyzers found when tracing'):
            impl_helper.analyze_in_place(preprocessing_fn, force_tf_compat_v1,
                                         feature_spec, type_spec, output_path)

    @test_case.named_parameters(
        dict(testcase_name='_3d',
             sparse_value=tf.compat.v1.SparseTensorValue(
                 indices=np.array([[0, 0, 1], [0, 1, 2], [1, 1, 1]]),
                 values=np.array([0, 1, 2]),
                 dense_shape=np.array([2, 2, 3])),
             expected_indices=[[np.array([0, 1]),
                                np.array([1, 2])],
                               [np.array([1]), np.array([1])]],
             expected_values=[np.array([0, 1]),
                              np.array([2])]),
        dict(testcase_name='_4d',
             sparse_value=tf.compat.v1.SparseTensorValue(
                 indices=np.array([[0, 0, 0, 1], [0, 1, 0, 2], [1, 1, 1, 1]]),
                 values=np.array([0, 1, 2]),
                 dense_shape=np.array([2, 2, 2, 3])),
             expected_indices=[[
                 np.array([0, 1]),
                 np.array([0, 0]),
                 np.array([1, 2])
             ], [np.array([1]), np.array([1]),
                 np.array([1])]],
             expected_values=[np.array([0, 1]),
                              np.array([2])]),
    )
    def test_decompose_sparse_batch(self, sparse_value, expected_indices,
                                    expected_values):
        indices, values = impl_helper._decompose_sparse_batch(sparse_value)
        self.assertLen(indices, len(expected_indices))
        self.assertLen(values, len(expected_values))
        for idx, (a, b) in enumerate(zip(expected_indices, indices)):
            self.assertAllEqual(
                a, b, 'Indices are different at index {}'.format(idx))
        for idx, (a, b) in enumerate(zip(expected_values, values)):
            self.assertAllEqual(a, b,
                                'Values are different at index {}'.format(idx))

    def test_get_num_values_per_instance_in_sparse_batch(self):
        batch_indices = np.array([[idx % 4, 0, 1, 2] for idx in range(100)])
        num_values = impl_helper._get_num_values_per_instance_in_sparse_batch(
            batch_indices, 27)
        expected_num_values = [25, 25, 25, 25] + [0] * 23
        self.assertEqual(expected_num_values, num_values)
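
# The counting helper exercised just above can be reproduced with a single
# bincount. A hypothetical sketch consistent with the expected
# [25, 25, 25, 25] + [0] * 23 result for batch_size=27:
import numpy as np

def _num_values_per_instance_sketch(batch_indices, batch_size):
    # Column 0 of batch_indices holds each value's instance (batch) index.
    return np.bincount(batch_indices[:, 0], minlength=batch_size).tolist()
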
class AnalysisGraphBuilderTest(test_case.TransformTestCase):
    @test_case.named_parameters(*test_case.cross_named_parameters(
        _ANALYZE_TEST_CASES, [
            dict(testcase_name='tf_compat_v1', use_tf_compat_v1=True),
            dict(testcase_name='tf2', use_tf_compat_v1=False)
        ]))
    def test_build(self, feature_spec, preprocessing_fn,
                   expected_dot_graph_str, expected_dot_graph_str_tf2,
                   use_tf_compat_v1):
        if not use_tf_compat_v1:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')
        specs = (feature_spec if use_tf_compat_v1 else
                 impl_helper.get_type_specs_from_feature_specs(feature_spec))
        graph, structured_inputs, structured_outputs = (
            impl_helper.trace_preprocessing_function(
                preprocessing_fn,
                specs,
                use_tf_compat_v1=use_tf_compat_v1,
                base_temp_dir=os.path.join(self.get_temp_dir(),
                                           self._testMethodName)))
        transform_fn_future, unused_cache = analysis_graph_builder.build(
            graph, structured_inputs, structured_outputs)

        dot_string = nodes.get_dot_graph([transform_fn_future]).to_string()
        self.WriteRenderedDotFile(dot_string)
        self.assertMultiLineEqual(
            msg='Result dot graph is:\n{}'.format(dot_string),
            first=dot_string,
            second=(expected_dot_graph_str
                    if use_tf_compat_v1 else expected_dot_graph_str_tf2))

    @test_case.named_parameters(*test_case.cross_named_parameters(
        [
            dict(
                testcase_name='one_dataset_cached_single_phase',
                preprocessing_fn=_preprocessing_fn_with_one_analyzer,
                full_dataset_keys=['a', 'b'],
                cached_dataset_keys=['a'],
                expected_dataset_keys=['b'],
            ),
            dict(
                testcase_name='all_datasets_cached_single_phase',
                preprocessing_fn=_preprocessing_fn_with_one_analyzer,
                full_dataset_keys=['a', 'b'],
                cached_dataset_keys=['a', 'b'],
                expected_dataset_keys=[],
            ),
            dict(
                testcase_name='mixed_single_phase',
                preprocessing_fn=lambda d: dict(  # pylint: disable=g-long-lambda
                    list(
                        _preprocessing_fn_with_chained_ptransforms(d).items())
                    + list(_preprocessing_fn_with_one_analyzer(d).items())),
                full_dataset_keys=['a', 'b'],
                cached_dataset_keys=['a', 'b'],
                expected_dataset_keys=['a', 'b'],
            ),
            dict(
                testcase_name='multi_phase',
                preprocessing_fn=_preprocessing_fn_with_two_phases,
                full_dataset_keys=['a', 'b'],
                cached_dataset_keys=['a', 'b'],
                expected_dataset_keys=['a', 'b'],
            )
        ],
        [
            dict(testcase_name='tf_compat_v1', use_tf_compat_v1=True),
            dict(testcase_name='tf2', use_tf_compat_v1=False)
        ]))
    def test_get_analysis_dataset_keys(self, preprocessing_fn,
                                       full_dataset_keys, cached_dataset_keys,
                                       expected_dataset_keys,
                                       use_tf_compat_v1):
        if not use_tf_compat_v1:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')
        full_dataset_keys = [
            analysis_graph_builder.analyzer_cache.DatasetKey(k)
            for k in full_dataset_keys
        ]
        # We force all dataset keys with entries in the cache dict to have a
        # cache hit.
        mocked_cache_entry_key = b'M'
        input_cache = {
            key: {
                mocked_cache_entry_key: 'C'
            }
            for key in cached_dataset_keys
        }
        feature_spec = {'x': tf.io.FixedLenFeature([], tf.float32)}
        specs = (feature_spec if use_tf_compat_v1 else
                 impl_helper.get_type_specs_from_feature_specs(feature_spec))
        with mock.patch(
                'tensorflow_transform.beam.analysis_graph_builder.'
                'analyzer_cache.make_cache_entry_key',
                return_value=mocked_cache_entry_key):
            dataset_keys = (analysis_graph_builder.get_analysis_dataset_keys(
                preprocessing_fn,
                specs,
                full_dataset_keys,
                input_cache,
                force_tf_compat_v1=use_tf_compat_v1))

        dot_string = nodes.get_dot_graph(
            [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
        self.WriteRenderedDotFile(dot_string)
        self.assertCountEqual(expected_dataset_keys, dataset_keys)
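        # Note on the expectations above: with return_value=mocked_cache_entry_key,
        # every dataset listed in input_cache is a cache hit, so the
        # single-analyzer cases only re-analyze the uncached datasets. The
        # 'mixed' and 'multi_phase' cases still expect all dataset keys,
        # presumably because chained PTransform analyzers and multi-phase
        # analyses cannot be fully served from this cache.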

    @test_case.named_parameters(
        dict(testcase_name='tf_compat_v1', use_tf_compat_v1=True),
        dict(testcase_name='tf2', use_tf_compat_v1=False))
    def test_get_analysis_cache_entry_keys(self, use_tf_compat_v1):
        if not use_tf_compat_v1:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')
        full_dataset_keys = ['a', 'b']

        def preprocessing_fn(inputs):
            return {'x': tft.scale_to_0_1(inputs['x'])}

        mocked_cache_entry_key = 'A'

        def mocked_make_cache_entry_key(_):
            return mocked_cache_entry_key

        feature_spec = {'x': tf.io.FixedLenFeature([], tf.float32)}
        specs = (feature_spec if use_tf_compat_v1 else
                 impl_helper.get_type_specs_from_feature_specs(feature_spec))
        with mock.patch(
                'tensorflow_transform.beam.analysis_graph_builder.'
                'analyzer_cache.make_cache_entry_key',
                side_effect=mocked_make_cache_entry_key):
            cache_entry_keys = (
                analysis_graph_builder.get_analysis_cache_entry_keys(
                    preprocessing_fn,
                    specs,
                    full_dataset_keys,
                    force_tf_compat_v1=use_tf_compat_v1))

        dot_string = nodes.get_dot_graph(
            [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
        self.WriteRenderedDotFile(dot_string)
        self.assertCountEqual(cache_entry_keys, [mocked_cache_entry_key])
class SchemaInferenceTest(test_case.TransformTestCase):
    def _get_schema(self,
                    preprocessing_fn,
                    use_compat_v1,
                    inputs=None,
                    input_signature=None,
                    create_session=False):
        if inputs is None:
            inputs = {}
        if input_signature is None:
            input_signature = {}
        if use_compat_v1:
            with tf.compat.v1.Graph().as_default() as graph:
                # Convert eager tensors to graph tensors.
                inputs_copy = {
                    k: tf.constant(v, input_signature[k].dtype)
                    for k, v in inputs.items()
                }
                tensors = preprocessing_fn(inputs_copy)
                if create_session:
                    # Create a session to actually evaluate the annotations and extract
                    # the output schema with annotations applied.
                    with tf.compat.v1.Session(graph=graph) as session:
                        schema = schema_inference.infer_feature_schema(
                            tensors, graph, session)
                else:
                    schema = schema_inference.infer_feature_schema(
                        tensors, graph)
        else:
            tf_func = tf.function(
                preprocessing_fn,
                input_signature=[input_signature]).get_concrete_function()
            tensors = tf.nest.pack_sequence_as(
                structure=tf_func.structured_outputs,
                flat_sequence=tf_func.outputs,
                expand_composites=True)
            metadata_fn = schema_inference.get_traced_metadata_fn(
                tensor_replacement_map={},
                preprocessing_fn=preprocessing_fn,
                input_signature=input_signature,
                base_temp_dir=os.path.join(self.get_temp_dir(),
                                           self._testMethodName),
                evaluate_schema_overrides=create_session)
            schema = schema_inference.infer_feature_schema_v2(
                tensors,
                metadata_fn.get_concrete_function(),
                evaluate_schema_overrides=create_session)
        return schema
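
    # Both branches above yield the same kind of schema_pb2.Schema: the
    # compat_v1 branch builds a v1 graph (optionally creating a Session so
    # deferred schema overrides can be evaluated), while the v2 branch traces
    # preprocessing_fn with tf.function and infers the schema from the traced
    # outputs.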

    # pylint: disable=g-long-lambda
    @test_case.named_parameters(*test_case.cross_named_parameters([
        dict(testcase_name='fixed_len_int',
             make_tensors_fn=_make_tensors,
             feature_spec={'x': tf.io.FixedLenFeature([], tf.int64)}),
        dict(testcase_name='fixed_len_string',
             make_tensors_fn=_make_tensors,
             feature_spec={'x': tf.io.FixedLenFeature([], tf.string)}),
        dict(testcase_name='fixed_len_float',
             make_tensors_fn=_make_tensors,
             feature_spec={'x': tf.io.FixedLenFeature([], tf.float32)}),
        dict(testcase_name='override',
             make_tensors_fn=_make_tensors_with_override,
             feature_spec={'x': tf.io.FixedLenFeature([], tf.int64)},
             domains={'x': schema_pb2.IntDomain(is_categorical=True)}),
        dict(testcase_name='override_with_session',
             make_tensors_fn=_make_tensors_with_override,
             feature_spec={'x': tf.io.FixedLenFeature([], tf.int64)},
             domains={
                 'x': schema_pb2.IntDomain(min=5, max=6, is_categorical=True)
             },
             create_session=True)
    ], [
        dict(testcase_name='compat_v1', use_compat_v1=True),
        dict(testcase_name='v2', use_compat_v1=False)
    ]))
    # pylint: enable=g-long-lambda
    def test_infer_feature_schema(self,
                                  make_tensors_fn,
                                  feature_spec,
                                  use_compat_v1,
                                  domains=None,
                                  create_session=False):
        if not use_compat_v1:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')
        x_val = '0' if feature_spec['x'].dtype == tf.string else 0
        inputs = {'x': [x_val]}
        input_signature = {
            'x': tf.TensorSpec([None], dtype=feature_spec['x'].dtype)
        }
        schema = self._get_schema(make_tensors_fn,
                                  use_compat_v1,
                                  inputs=inputs,
                                  input_signature=input_signature,
                                  create_session=create_session)
        expected_schema = schema_utils.schema_from_feature_spec(
            feature_spec, domains)
        self.assertEqual(schema, expected_schema)
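
    # The _make_tensors* helpers used above are defined earlier in the module.
    # A plausible sketch of their shape (an assumed reconstruction, not copied
    # from the source): _make_tensors passes its input through unchanged, while
    # the override variant attaches a schema override via
    # schema_inference.set_tensor_schema_override:
    #
    #   def _make_tensors(inputs):
    #       return {'x': tf.identity(inputs['x'])}
    #
    #   def _make_tensors_with_override(inputs):
    #       x = tf.identity(inputs['x'])
    #       schema_inference.set_tensor_schema_override(
    #           x, tf.constant(5, tf.int64), tf.constant(6, tf.int64))
    #       return {'x': x}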

    @test_case.named_parameters(
        dict(testcase_name='compat_v1', use_compat_v1=True),
        dict(testcase_name='v2', use_compat_v1=False))
    def test_infer_feature_schema_bad_rank(self, use_compat_v1):
        if not use_compat_v1:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')
        inputs = {'x': 0}
        input_signature = {'x': tf.TensorSpec([], dtype=tf.float32)}
        with self.assertRaises(ValueError):
            self._get_schema(_make_tensors,
                             use_compat_v1,
                             inputs=inputs,
                             input_signature=input_signature)

    @unittest.skipIf(not common.IS_ANNOTATIONS_PB_AVAILABLE,
                     'Schema annotations are not available')
    @test_case.named_parameters(
        dict(testcase_name='compat_v1', use_compat_v1=True),
        dict(testcase_name='v2', use_compat_v1=False))
    def test_vocab_annotation(self, use_compat_v1):
        if not use_compat_v1:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')

        def preprocessing_fn(_):
            analyzers._maybe_annotate_vocab_metadata(
                'file1', tf.constant(100, dtype=tf.int64))
            analyzers._maybe_annotate_vocab_metadata(
                'file2', tf.constant(200, dtype=tf.int64))
            return {
                'foo': tf.convert_to_tensor([0, 1, 2, 3], dtype=tf.int64),
            }

        schema = self._get_schema(preprocessing_fn,
                                  use_compat_v1,
                                  create_session=True)
        self.assertLen(schema.annotation.extra_metadata, 2)
        sizes = {}
        for annotation in schema.annotation.extra_metadata:
            message = annotations_pb2.VocabularyMetadata()
            annotation.Unpack(message)
            sizes[message.file_name] = message.unfiltered_vocabulary_size
        self.assertDictEqual(sizes, {'file1': 100, 'file2': 200})

    @unittest.skipIf(not common.IS_ANNOTATIONS_PB_AVAILABLE,
                     'Schema annotations are not available')
    @test_case.named_parameters(
        dict(testcase_name='compat_v1', use_compat_v1=True),
        dict(testcase_name='v2', use_compat_v1=False))
    def test_bucketization_annotation(self, use_compat_v1):
        if not use_compat_v1:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')

        def preprocessing_fn(_):
            inputs = {
                'foo': tf.convert_to_tensor([0, 1, 2, 3]),
                'bar': tf.convert_to_tensor([0, 2, 0, 2]),
            }
            boundaries_foo = tf.expand_dims(tf.convert_to_tensor([.5, 1.5]),
                                            axis=0)
            boundaries_bar = tf.expand_dims(tf.convert_to_tensor([.1, .2]),
                                            axis=0)
            outputs = {}
            # tft.apply_buckets will annotate the feature in the output schema to
            # indicate the bucket boundaries that were applied.
            outputs['Bucketized_foo'] = mappers.apply_buckets(
                inputs['foo'], boundaries_foo)
            outputs['Bucketized_bar'] = mappers.apply_buckets(
                inputs['bar'], boundaries_bar)
            return outputs

        schema = self._get_schema(preprocessing_fn,
                                  use_compat_v1,
                                  create_session=True)
        self.assertLen(schema.feature, 2)
        for feature in schema.feature:
            self.assertLen(feature.annotation.extra_metadata, 1)
            for annotation in feature.annotation.extra_metadata:
                # Extract the annotated message and validate its contents.
                message = annotations_pb2.BucketBoundaries()
                annotation.Unpack(message)
                if feature.name == 'Bucketized_foo':
                    self.assertAllClose(list(message.boundaries), [.5, 1.5])
                elif feature.name == 'Bucketized_bar':
                    self.assertAllClose(list(message.boundaries), [.1, .2])
                else:
                    raise RuntimeError('Unexpected features in schema')

    @unittest.skipIf(not common.IS_ANNOTATIONS_PB_AVAILABLE,
                     'Schema annotations are not available')
    @test_case.named_parameters(
        dict(testcase_name='compat_v1', use_compat_v1=True),
        dict(testcase_name='v2', use_compat_v1=False))
    def test_global_annotation(self, use_compat_v1):
        if not use_compat_v1:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')

        def preprocessing_fn(_):
            # Annotate an arbitrary proto at the schema level (not sure what
            # global schema boundaries would mean, but hey, I'm just a test).
            boundaries = tf.constant([[1.0]])
            message_type = annotations_pb2.BucketBoundaries.DESCRIPTOR.full_name
            sizes = tf.expand_dims([tf.size(boundaries)], axis=0)
            message_proto = tf.raw_ops.EncodeProto(
                sizes=sizes,
                values=[tf.cast(boundaries, tf.float32)],
                field_names=['boundaries'],
                message_type=message_type)[0]
            type_url = os.path.join('type.googleapis.com', message_type)
            schema_inference.annotate(type_url, message_proto)
            return {
                'foo': tf.convert_to_tensor([0, 1, 2, 3], dtype=tf.int64),
                'bar': tf.convert_to_tensor([0, 2, 0, 2], dtype=tf.int64),
            }

        schema = self._get_schema(preprocessing_fn,
                                  use_compat_v1,
                                  create_session=True)
        self.assertLen(schema.annotation.extra_metadata, 1)
        for annotation in schema.annotation.extra_metadata:
            # Extract the annotated message and validate its contents.
            message = annotations_pb2.BucketBoundaries()
            annotation.Unpack(message)
            self.assertAllClose(list(message.boundaries), [1])
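
    # The Unpack above succeeds because schema_inference.annotate stores the
    # serialized message in an Any whose type_url names the descriptor. A
    # minimal sketch of the same round trip with plain protobuf (assumed usage
    # of google.protobuf.any_pb2; `serialized` is a hypothetical stand-in for
    # the message bytes):
    #
    #   from google.protobuf import any_pb2
    #   any_proto = any_pb2.Any(type_url=type_url, value=serialized)
    #   decoded = annotations_pb2.BucketBoundaries()
    #   assert any_proto.Unpack(decoded)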

    @test_case.named_parameters(
        dict(testcase_name='compat_v1', use_compat_v1=True),
        dict(testcase_name='v2', use_compat_v1=False))
    def test_infer_feature_schema_with_ragged_tensor(self, use_compat_v1):
        if not use_compat_v1:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')

        def preprocessing_fn(_):
            return {
                'foo': tf.RaggedTensor.from_row_splits(
                    values=tf.constant([3, 1, 4, 1, 5, 9, 2, 6], tf.int64),
                    row_splits=[0, 4, 4, 7, 8, 8]),
            }

        schema = self._get_schema(preprocessing_fn,
                                  use_compat_v1,
                                  create_session=True)
        expected_schema_ascii = """
          feature {
            name: "foo"
            type: INT
            annotation {
              tag: "ragged_tensor"
            }
          }
        """
        expected_schema = text_format.Parse(expected_schema_ascii,
                                            schema_pb2.Schema())
        schema_utils_legacy.set_generate_legacy_feature_spec(
            expected_schema, False)
        self.assertProtoEquals(expected_schema, schema)
        with self.assertRaisesRegex(ValueError,
                                    'Feature "foo" had tag "ragged_tensor"'):
            schema_utils.schema_as_feature_spec(schema)
Example #8
0
class ImplHelperTest(test_case.TransformTestCase):
    def test_batched_placeholders_from_feature_spec(self):
        feature_spec = {
            'fixed_len_float': tf.io.FixedLenFeature([2, 3], tf.float32),
            'fixed_len_string': tf.io.FixedLenFeature([], tf.string),
            '_var_len_underscored': tf.io.VarLenFeature(tf.string),
            'var_len_int': tf.io.VarLenFeature(tf.int64)
        }
        with tf.compat.v1.Graph().as_default():
            features = impl_helper.batched_placeholders_from_specs(
                feature_spec)
        self.assertCountEqual(features.keys(), [
            'fixed_len_float', 'fixed_len_string', 'var_len_int',
            '_var_len_underscored'
        ])
        self.assertEqual(type(features['fixed_len_float']), tf.Tensor)
        self.assertEqual(features['fixed_len_float'].get_shape().as_list(),
                         [None, 2, 3])
        self.assertEqual(type(features['fixed_len_string']), tf.Tensor)
        self.assertEqual(features['fixed_len_string'].get_shape().as_list(),
                         [None])
        self.assertEqual(type(features['var_len_int']), tf.SparseTensor)
        self.assertEqual(features['var_len_int'].get_shape().as_list(),
                         [None, None])
        self.assertEqual(type(features['_var_len_underscored']),
                         tf.SparseTensor)
        self.assertEqual(
            features['_var_len_underscored'].get_shape().as_list(),
            [None, None])

    def test_batched_placeholders_from_typespecs(self):
        typespecs = {
            'dense_float':
            tf.TensorSpec(dtype=tf.float32, shape=[None, 2, 3]),
            'dense_string':
            tf.TensorSpec(shape=[None], dtype=tf.string),
            '_sparse_underscored':
            tf.SparseTensorSpec(dtype=tf.string, shape=[None, None]),
            'ragged_string':
            tf.RaggedTensorSpec(dtype=tf.string,
                                ragged_rank=1,
                                shape=[None, None]),
            'ragged_multi_dimension':
            tf.RaggedTensorSpec(dtype=tf.int64,
                                ragged_rank=3,
                                shape=[None, None, None, None, 5]),
        }
        with tf.compat.v1.Graph().as_default():
            features = impl_helper.batched_placeholders_from_specs(typespecs)
        self.assertCountEqual(features.keys(), [
            'dense_float',
            'dense_string',
            '_sparse_underscored',
            'ragged_string',
            'ragged_multi_dimension',
        ])
        self.assertEqual(type(features['dense_float']), tf.Tensor)
        self.assertEqual(features['dense_float'].get_shape().as_list(),
                         [None, 2, 3])
        self.assertEqual(features['dense_float'].dtype, tf.float32)

        self.assertEqual(type(features['dense_string']), tf.Tensor)
        self.assertEqual(features['dense_string'].get_shape().as_list(),
                         [None])
        self.assertEqual(features['dense_string'].dtype, tf.string)

        self.assertEqual(type(features['_sparse_underscored']),
                         tf.SparseTensor)
        self.assertEqual(features['_sparse_underscored'].get_shape().as_list(),
                         [None, None])
        self.assertEqual(features['_sparse_underscored'].dtype, tf.string)

        self.assertEqual(type(features['ragged_string']), tf.RaggedTensor)
        self.assertEqual(features['ragged_string'].shape.as_list(),
                         [None, None])
        self.assertEqual(features['ragged_string'].ragged_rank, 1)
        self.assertEqual(features['ragged_string'].dtype, tf.string)

        self.assertEqual(type(features['ragged_multi_dimension']),
                         tf.RaggedTensor)
        self.assertEqual(features['ragged_multi_dimension'].shape.as_list(),
                         [None, None, None, None, 5])
        self.assertEqual(features['ragged_multi_dimension'].ragged_rank, 3)
        self.assertEqual(features['ragged_multi_dimension'].dtype, tf.int64)

    def test_batched_placeholders_from_specs_invalid_dtype(self):
        with self.assertRaisesRegex(ValueError, 'had invalid dtype'):
            impl_helper.batched_placeholders_from_specs(
                {'f': tf.TensorSpec(dtype=tf.int32, shape=[None])})
        with self.assertRaisesRegex(ValueError, 'had invalid dtype'):
            impl_helper.batched_placeholders_from_specs(
                {'f': tf.io.FixedLenFeature(dtype=tf.int32, shape=[None])})

    def test_batched_placeholders_from_specs_invalid_mixing(self):
        with self.assertRaisesRegex(TypeError, 'Specs must be all'):
            impl_helper.batched_placeholders_from_specs({
                'f1':
                tf.TensorSpec(dtype=tf.int64, shape=[None]),
                'f2':
                tf.io.FixedLenFeature(dtype=tf.int64, shape=[None]),
            })
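
    # Taken together, the placeholder tests above pin down roughly this
    # mapping (a sketch of the expected behaviour under assumed dtype rules,
    # not impl_helper's actual implementation):
    #
    #   def _placeholder_for(spec):
    #       if spec.dtype not in (tf.int64, tf.float32, tf.string):
    #           raise ValueError('spec had invalid dtype')
    #       if isinstance(spec, tf.io.FixedLenFeature):
    #           return tf.compat.v1.placeholder(spec.dtype,
    #                                           [None] + list(spec.shape))
    #       if isinstance(spec, tf.io.VarLenFeature):
    #           return tf.compat.v1.sparse_placeholder(spec.dtype, [None, None])
    #       # tf.TensorSpec / tf.SparseTensorSpec / tf.RaggedTensorSpec are
    #       # handled analogously; mixing feature specs with type specs in one
    #       # call raises TypeError ('Specs must be all...').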

    @test_case.named_parameters(*test_case.cross_named_parameters(
        (_ROUNDTRIP_CASES + _MAKE_FEED_DICT_CASES), [
            dict(testcase_name='eager_tensors', produce_eager_tensors=True),
            dict(testcase_name='feed_values', produce_eager_tensors=False)
        ]))
    def test_make_feed_list(self, feature_spec, instances, feed_dict,
                            produce_eager_tensors):
        if produce_eager_tensors:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')
        schema = schema_utils.schema_from_feature_spec(feature_spec)
        feature_names = list(feature_spec.keys())
        expected_feed_list = [feed_dict[key] for key in feature_names]
        evaluated_feed_list = impl_helper.make_feed_list(
            feature_names,
            schema,
            instances,
            produce_eager_tensors=produce_eager_tensors)
        np.testing.assert_equal(
            evaluated_feed_list if not produce_eager_tensors else
            _get_value_from_eager_tensors(evaluated_feed_list),
            expected_feed_list)

    @test_case.named_parameters(*_MAKE_FEED_LIST_ERROR_CASES)
    def test_make_feed_list_error(self,
                                  feature_spec,
                                  instances,
                                  error_msg,
                                  error_type=ValueError):
        with tf.compat.v1.Graph().as_default():
            tensors = tf.io.parse_example(
                serialized=tf.compat.v1.placeholder(tf.string, [None]),
                features=feature_spec)
            schema = schema_utils.schema_from_feature_spec(feature_spec)
            with self.assertRaisesRegex(error_type, error_msg):
                impl_helper.make_feed_list(tensors, schema, instances)

    @test_case.named_parameters(*test_case.cross_named_parameters(
        _ROUNDTRIP_CASES, [
            dict(testcase_name='eager_tensors', feed_eager_tensors=True),
            dict(testcase_name='session_run_values', feed_eager_tensors=False)
        ]))
    def test_to_instance_dicts(self, feature_spec, instances, feed_dict,
                               feed_eager_tensors):
        if feed_eager_tensors:
            test_case.skip_if_not_tf2('Tensorflow 2.x required')
        schema = schema_utils.schema_from_feature_spec(feature_spec)
        feed_dict_local = copy.copy(feed_dict)
        if feed_eager_tensors:
            for key, value in feed_dict_local.items():
                if isinstance(value, tf.compat.v1.SparseTensorValue):
                    feed_dict_local[key] = tf.sparse.SparseTensor.from_value(
                        value)
                else:
                    feed_dict_local[key] = tf.constant(value)
        np.testing.assert_equal(
            instances, impl_helper.to_instance_dicts(schema, feed_dict_local))
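
    # For orientation: an "instance dict" maps each feature name to one
    # example's value, so a VarLenFeature batch fed as a SparseTensorValue
    # round trips to something like [{'x': [3, 4]}, {'x': [5]}]
    # (illustrative values, not taken from _ROUNDTRIP_CASES).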

    @test_case.named_parameters(*_TO_INSTANCE_DICT_ERROR_CASES)
    def test_to_instance_dicts_error(self,
                                     feature_spec,
                                     feed_dict,
                                     error_msg,
                                     error_type=ValueError):
        schema = schema_utils.schema_from_feature_spec(feature_spec)
        with self.assertRaisesRegex(error_type, error_msg):
            impl_helper.to_instance_dicts(schema, feed_dict)

    def test_copy_tensors_produces_different_tensors(self):
        with tf.compat.v1.Graph().as_default():
            tensors = {
                'dense':
                tf.compat.v1.placeholder(tf.int64, (None, ),
                                         name='my_dense_input'),
                'sparse':
                tf.compat.v1.sparse_placeholder(tf.int64,
                                                name='my_sparse_input'),
                'ragged':
                tf.compat.v1.ragged.placeholder(tf.int64,
                                                ragged_rank=2,
                                                name='my_ragged_input')
            }
            copied_tensors = impl_helper.copy_tensors(tensors)

            self.assertNotEqual(tensors['dense'], copied_tensors['dense'])
            self.assertNotEqual(tensors['sparse'].indices,
                                copied_tensors['sparse'].indices)
            self.assertNotEqual(tensors['sparse'].values,
                                copied_tensors['sparse'].values)
            self.assertNotEqual(tensors['sparse'].dense_shape,
                                copied_tensors['sparse'].dense_shape)
            self.assertNotEqual(tensors['ragged'].values,
                                copied_tensors['ragged'].values)
            self.assertNotEqual(tensors['ragged'].row_splits,
                                copied_tensors['ragged'].row_splits)

    def test_copy_tensors_produces_equivalent_tensors(self):
        with tf.compat.v1.Graph().as_default():
            tensors = {
                'dense':
                tf.compat.v1.placeholder(tf.int64, (None, ),
                                         name='my_dense_input'),
                'sparse':
                tf.compat.v1.sparse_placeholder(tf.int64,
                                                name='my_sparse_input'),
                'ragged':
                tf.compat.v1.ragged.placeholder(tf.int64,
                                                ragged_rank=1,
                                                name='my_ragged_input')
            }
            copied_tensors = impl_helper.copy_tensors(tensors)

            with tf.compat.v1.Session() as session:
                dense_value = [1, 2]
                sparse_value = tf.compat.v1.SparseTensorValue(
                    indices=[[0, 0], [0, 2], [1, 1]],
                    values=[3, 4, 5],
                    dense_shape=[2, 3])
                ragged_value = tf.compat.v1.ragged.RaggedTensorValue(
                    values=np.array([3, 4, 5], dtype=np.int64),
                    row_splits=np.array([0, 2, 3], dtype=np.int64))
                sample_tensors = session.run(
                    copied_tensors,
                    feed_dict={
                        tensors['dense']: dense_value,
                        tensors['sparse']: sparse_value,
                        tensors['ragged']: ragged_value,
                    })
                self.assertAllEqual(sample_tensors['dense'], dense_value)
                self.assertAllEqual(sample_tensors['sparse'].indices,
                                    sparse_value.indices)
                self.assertAllEqual(sample_tensors['sparse'].values,
                                    sparse_value.values)
                self.assertAllEqual(sample_tensors['sparse'].dense_shape,
                                    sparse_value.dense_shape)
                self.assertAllEqual(sample_tensors['ragged'].values,
                                    ragged_value.values)
                self.assertAllEqual(sample_tensors['ragged'].row_splits,
                                    ragged_value.row_splits)
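

# A plausible sketch of the copy behaviour the two tests above pin down:
# fresh tensor objects (via tf.identity) that still evaluate to the same
# values. This is an assumed reconstruction for illustration, not
# impl_helper.copy_tensors' actual code.
def _copy_tensor_sketch(tensor):
    if isinstance(tensor, tf.SparseTensor):
        return tf.SparseTensor(indices=tf.identity(tensor.indices),
                               values=tf.identity(tensor.values),
                               dense_shape=tf.identity(tensor.dense_shape))
    if isinstance(tensor, tf.RaggedTensor):
        # Recurse so nested ragged values (ragged_rank > 1) get copied too.
        return tf.RaggedTensor.from_row_splits(
            _copy_tensor_sketch(tensor.values),
            tf.identity(tensor.row_splits),
            validate=False)
    return tf.identity(tensor)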