def test_schema_equality(self):
  """Schemas compare equal only when all of their column schemas match."""

  def bool_column():
    # Fixed-representation bool column of shape [1] with default False.
    return sch.ColumnSchema(
        tf.bool, [1], sch.FixedColumnRepresentation(False))

  def list_column(dtype):
    # List-representation column of the given dtype with shape None.
    return sch.ColumnSchema(dtype, None, sch.ListColumnRepresentation())

  baseline = sch.Schema(column_schemas={
      'fixed_bool_with_default': bool_column(),
      'var_float': list_column(tf.float32),
  })
  same_columns = sch.Schema(column_schemas={
      'fixed_bool_with_default': bool_column(),
      'var_float': list_column(tf.float32),
  })
  # Differs from the baseline only in the dtype of 'var_float'.
  other_dtype = sch.Schema(column_schemas={
      'fixed_bool_with_default': bool_column(),
      'var_float': list_column(tf.float64),
  })
  # Differs from the baseline by lacking the 'var_float' column.
  fewer_columns = sch.Schema(column_schemas={
      'fixed_bool_with_default': bool_column(),
  })

  self.assertEqual(baseline, same_columns)
  self.assertNotEqual(baseline, other_dtype)
  self.assertNotEqual(baseline, fewer_columns)
def test_column_representation_equality(self):
  """Representations are equal only for the same kind and same arguments."""

  def make_sparse(idx2_flag):
    # Sparse representation over 'val' with index fields 'idx1' and 'idx2';
    # only the second positional argument of the 'idx2' field varies.
    return sch.SparseColumnRepresentation('val', [
        sch.SparseIndexField('idx1', False),
        sch.SparseIndexField('idx2', idx2_flag),
    ])

  fixed_a = sch.FixedColumnRepresentation(1.1)
  fixed_b = sch.FixedColumnRepresentation(1.1)
  fixed_no_default = sch.FixedColumnRepresentation()

  list_a = sch.ListColumnRepresentation()
  list_b = sch.ListColumnRepresentation()

  sparse_a = make_sparse(True)
  sparse_b = make_sparse(True)
  sparse_other_flag = make_sparse(False)

  # Fixed representations.
  self.assertEqual(fixed_a, fixed_b)
  self.assertNotEqual(fixed_a, fixed_no_default)
  self.assertNotEqual(fixed_a, list_a)
  self.assertNotEqual(fixed_a, sparse_a)

  # List representations.
  self.assertEqual(list_a, list_b)
  self.assertNotEqual(list_a, sparse_a)

  # Sparse representations.
  self.assertEqual(sparse_a, sparse_b)
  self.assertNotEqual(sparse_a, sparse_other_flag)
def _from_feature_dict(feature_dict):
  """Translate a JSON feature dict into a `ColumnSchema`.

  Args:
    feature_dict: A dict decoded from the JSON form of a feature. It must
      contain 'domain' and 'parsingOptions' -> 'tfOptions'; 'fixedShape' and
      'valueCount' are optional.

  Returns:
    A `sch.ColumnSchema` built from the parsed domain, axes and
    representation.

  Raises:
    ValueError: If 'tfOptions' sets neither 'fixedLenFeature' nor
      'varLenFeature'.
  """
  domain = _from_domain_dict(feature_dict['domain'])

  axes = []
  if 'fixedShape' in feature_dict:
    # A scalar feature has a fixed shape with no axes, and the 'axis' key may
    # then be absent from the JSON, so a missing key means "zero axes" rather
    # than an error.
    # int() is needed because protobuf JSON encodes int64 as string
    axes = [sch.Axis(int(axis.get('size')))
            for axis in feature_dict['fixedShape'].get('axis', [])]
  elif 'valueCount' in feature_dict:
    # Value_count always means a 1-D feature of unknown size.
    # We don't support value_count.min and value_count.max yet.
    axes.append(sch.Axis(None))

  tf_options = feature_dict['parsingOptions']['tfOptions']
  if tf_options.get('fixedLenFeature') is not None:
    fixed_len_feature = tf_options['fixedLenFeature']
    # Precedence is int, then string, then float; None when no default is set.
    if 'intDefaultValue' in fixed_len_feature:
      # int() is needed because protobuf JSON encodes int64 as string
      default_value = int(fixed_len_feature['intDefaultValue'])
    elif 'stringDefaultValue' in fixed_len_feature:
      default_value = fixed_len_feature['stringDefaultValue']
    else:
      default_value = fixed_len_feature.get('floatDefaultValue')
    representation = sch.FixedColumnRepresentation(default_value)
  elif tf_options.get('varLenFeature') is not None:
    representation = sch.ListColumnRepresentation()
  else:
    raise ValueError('Could not interpret tfOptions: {}'.format(tf_options))

  return sch.ColumnSchema(domain, axes, representation)
def test_schema_equality(self):
  """Schemas compare equal only when all of their column schemas match."""

  def logical(dtype, size):
    # Logical column schema with the dtype's domain and a single axis.
    return sch.LogicalColumnSchema(
        sch.dtype_to_domain(dtype), sch.LogicalShape([sch.Axis(size)]))

  def bool_column():
    return sch.ColumnSchema(
        logical(tf.bool, 1), sch.FixedColumnRepresentation(False))

  def list_column(dtype):
    return sch.ColumnSchema(
        logical(dtype, None), sch.ListColumnRepresentation())

  baseline = sch.Schema(column_schemas={
      'fixed_bool_with_default': bool_column(),
      'var_float': list_column(tf.float32),
  })
  same_columns = sch.Schema(column_schemas={
      'fixed_bool_with_default': bool_column(),
      'var_float': list_column(tf.float32),
  })
  # Differs from the baseline only in the dtype of 'var_float'.
  other_dtype = sch.Schema(column_schemas={
      'fixed_bool_with_default': bool_column(),
      'var_float': list_column(tf.float64),
  })
  # Differs from the baseline by lacking the 'var_float' column.
  fewer_columns = sch.Schema(column_schemas={
      'fixed_bool_with_default': bool_column(),
  })

  self.assertEqual(baseline, same_columns)
  self.assertNotEqual(baseline, other_dtype)
  self.assertNotEqual(baseline, fewer_columns)
def get_manually_created_schema():
  """Return a test schema assembled directly from the Schema classes."""
  column_schemas = {}

  # FixedLenFeatures
  column_schemas['fixed_categorical_int_with_range'] = sch.ColumnSchema(
      sch.IntDomain(tf.int64, -5, 10, True), [],
      sch.FixedColumnRepresentation())
  fixed_columns = (
      ('fixed_int', tf.int64),
      ('fixed_float', tf.float32),
      ('fixed_string', tf.string),
  )
  for name, dtype in fixed_columns:
    column_schemas[name] = sch.ColumnSchema(
        dtype, [5], sch.FixedColumnRepresentation())

  # VarLenFeatures
  var_columns = (
      ('var_int', tf.int64),
      ('var_float', tf.float32),
      ('var_string', tf.string),
  )
  for name, dtype in var_columns:
    column_schemas[name] = sch.ColumnSchema(
        dtype, None, sch.ListColumnRepresentation())

  return sch.Schema(column_schemas)
def test_infer_column_schema_from_tensor(self):
  """Checks the column schemas inferred for a dense and a sparse tensor."""
  # A [2, 2] float32 constant is expected to give a shape-[2] fixed column.
  dense_tensor = tf.constant(
      [[1., 2.], [3., 4.]], dtype=tf.float32, shape=[2, 2])
  inferred = sch.infer_column_schema_from_tensor(dense_tensor)
  self.assertEqual(
      sch.ColumnSchema(tf.float32, [2], sch.FixedColumnRepresentation()),
      inferred)

  # A string sparse placeholder is expected to give a shape-[None] list
  # column.
  sparse_tensor = tf.sparse_placeholder(tf.string)
  inferred = sch.infer_column_schema_from_tensor(sparse_tensor)
  self.assertEqual(
      sch.ColumnSchema(tf.string, [None], sch.ListColumnRepresentation()),
      inferred)
def _make_transformed_schema(shape):
  """Build a schema with three int64 columns that all use `shape`.

  Args:
    shape: Shape passed unchanged to every column schema.

  Returns:
    A `sch.Schema` with 'transformed_a' and 'transformed_label' as fixed
    columns and 'transformed_b' as a list column.
  """
  representation_by_column = (
      ('transformed_a', sch.FixedColumnRepresentation),
      ('transformed_b', sch.ListColumnRepresentation),
      ('transformed_label', sch.FixedColumnRepresentation),
  )
  schema = sch.Schema()
  for name, make_representation in representation_by_column:
    schema.column_schemas[name] = sch.ColumnSchema(
        tf.int64, shape, make_representation())
  return schema
def test_schema_equality(self):
  """Schemas compare equal only when all of their column schemas match."""

  def int_column():
    # Fixed-representation int64 column of shape [2].
    return sch.ColumnSchema(
        tf.int64, [2], sch.FixedColumnRepresentation())

  def list_column(dtype):
    # List-representation column of the given dtype with shape None.
    return sch.ColumnSchema(dtype, None, sch.ListColumnRepresentation())

  baseline = sch.Schema(column_schemas={
      'fixed_int': int_column(),
      'var_float': list_column(tf.float32),
  })
  same_columns = sch.Schema(column_schemas={
      'fixed_int': int_column(),
      'var_float': list_column(tf.float32),
  })
  # Differs from the baseline only in the dtype of 'var_float'.
  other_dtype = sch.Schema(column_schemas={
      'fixed_int': int_column(),
      'var_float': list_column(tf.string),
  })
  # Differs from the baseline by lacking the 'var_float' column.
  fewer_columns = sch.Schema(column_schemas={
      'fixed_int': int_column(),
  })

  self.assertEqual(baseline, same_columns)
  self.assertNotEqual(baseline, other_dtype)
  self.assertNotEqual(baseline, fewer_columns)
def test_infer_column_schema_from_tensor(self):
  """Checks the column schemas inferred for a dense and a sparse tensor."""

  def logical(dtype, size):
    # Logical column schema with the dtype's domain and a single axis.
    return sch.LogicalColumnSchema(
        sch.dtype_to_domain(dtype), sch.LogicalShape([sch.Axis(size)]))

  # A [2, 2] float32 constant is expected to give a single axis of size 2
  # with a fixed representation.
  dense_tensor = tf.constant(
      [[1., 2.], [3., 4.]], dtype=tf.float32, shape=[2, 2])
  inferred = sch.infer_column_schema_from_tensor(dense_tensor)
  self.assertEqual(
      sch.ColumnSchema(logical(tf.float32, 2),
                       sch.FixedColumnRepresentation()),
      inferred)

  # A string sparse placeholder is expected to give a single axis of unknown
  # size with a list representation.
  sparse_tensor = tf.sparse_placeholder(tf.string)
  inferred = sch.infer_column_schema_from_tensor(sparse_tensor)
  self.assertEqual(
      sch.ColumnSchema(logical(tf.string, None),
                       sch.ListColumnRepresentation()),
      inferred)
def _create_output_metadata(features_config, min_value, max_value):
  """Constructs a custom DatasetMetadata.

  The target and every numeric feature become scalar float32 fixed columns,
  the id feature becomes an int64 list column, and all categorical features
  share one categorical int64 column schema bounded by the given range.

  Args:
    features_config: Features configuration mock.
    min_value: Minimum value for IntDomain.
    max_value: Maximum value for IntDomain.

  Returns:
    A `tft.tf_metadata.dataset_metadata.DatasetMetadata` object.
  """

  def scalar_float_column():
    return dataset_schema.ColumnSchema(
        tf.float32, [], dataset_schema.FixedColumnRepresentation())

  columns = {}
  columns[features_config.TARGET_FEATURE] = scalar_float_column()
  columns[features_config.ID_FEATURE] = dataset_schema.ColumnSchema(
      tf.int64, [None], dataset_schema.ListColumnRepresentation())

  for feature in features_config.NUMERIC_FEATURES:
    columns[utils.make_transformed_key(feature)] = scalar_float_column()

  # A single schema instance is reused for every categorical feature.
  categorical_domain = dataset_schema.IntDomain(
      tf.int64, min_value, max_value, is_categorical=True)
  shared_categorical_column = dataset_schema.ColumnSchema(
      categorical_domain, [], dataset_schema.FixedColumnRepresentation())
  for feature in features_config.CATEGORICAL_FEATURES:
    columns[utils.make_transformed_key(feature)] = shared_categorical_column

  return dataset_metadata.DatasetMetadata(columns)
import unittest
from tensorflow.python.framework import test_util
from tensorflow.python.lib.io import file_io

# Metadata fixture with two fixed-representation columns of shape (1, 3, 2)
# (one of them with a default of 123.4) and one list-representation column of
# shape (None,).
# NOTE(review): the key is spelled 'list_columm'; confirm nothing depends on
# that spelling before renaming it.
_TEST_METADATA = dataset_metadata.DatasetMetadata({
    'fixed_column': dataset_schema.ColumnSchema(
        tf.string, (1, 3, 2), dataset_schema.FixedColumnRepresentation()),
    'fixed_column_with_default': dataset_schema.ColumnSchema(
        tf.float32, (1, 3, 2), dataset_schema.FixedColumnRepresentation(123.4)),
    'list_columm': dataset_schema.ColumnSchema(
        tf.float32, (None, ), dataset_schema.ListColumnRepresentation())
})

# Same columns as _TEST_METADATA, except that the middle dimension of
# 'fixed_column_with_default' is a `futures.Future('a')` instead of 3.
_TEST_METADATA_WITH_FUTURES = dataset_metadata.DatasetMetadata({
    'fixed_column': dataset_schema.ColumnSchema(
        tf.string, (1, 3, 2), dataset_schema.FixedColumnRepresentation()),
    'fixed_column_with_default': dataset_schema.ColumnSchema(
        tf.float32, (1, futures.Future('a'), 2),
        dataset_schema.FixedColumnRepresentation(123.4)),
    'list_columm': dataset_schema.ColumnSchema(
        tf.float32, (None, ), dataset_schema.ListColumnRepresentation())
})
def get_test_schema():
  """Return the schema derived from `test_feature_spec`."""
  schema = sch.from_feature_spec(test_feature_spec)
  return schema


# Column schemas used by `get_manually_created_schema`.
_COLUMN_SCHEMAS = {
    # FixedLenFeatures
    'fixed_categorical_int_with_range': sch.ColumnSchema(
        sch.IntDomain(tf.int64, -5, 10, True), [],
        sch.FixedColumnRepresentation()),
}
_COLUMN_SCHEMAS.update(
    (name, sch.ColumnSchema(dtype, [5], sch.FixedColumnRepresentation()))
    for name, dtype in (
        ('fixed_int', tf.int64),
        ('fixed_float', tf.float32),
        ('fixed_string', tf.string),
    ))
# VarLenFeatures
_COLUMN_SCHEMAS.update(
    (name, sch.ColumnSchema(dtype, None, sch.ListColumnRepresentation()))
    for name, dtype in (
        ('var_int', tf.int64),
        ('var_float', tf.float32),
        ('var_string', tf.string),
    ))


def get_manually_created_schema():
  """Return a test schema assembled directly from the Schema classes."""
  manual_schema = sch.Schema(_COLUMN_SCHEMAS)
  return manual_schema
def get_manually_created_schema():
  """Return a test schema assembled directly from the Schema classes.

  The schema holds fixed, variable-length and sparse columns over bool, int64,
  float32 and string dtypes.
  """
  fixed = sch.FixedColumnRepresentation
  var_len = sch.ListColumnRepresentation

  def sparse(value_field, index_field, index_flag):
    # Sparse representation with a single index field.
    return sch.SparseColumnRepresentation(
        value_field, [sch.SparseIndexField(index_field, index_flag)])

  schema = sch.Schema()
  columns = [
      # FixedLenFeatures
      ('fixed_bool_with_default',
       sch.ColumnSchema(tf.bool, [1], fixed(default_value=False))),
      ('fixed_bool_without_default',
       sch.ColumnSchema(tf.bool, [5], fixed())),
      ('fixed_int_with_default',
       sch.ColumnSchema(tf.int64, [1], fixed(default_value=0))),
      ('fixed_categorical_int_with_range',
       sch.ColumnSchema(sch.IntDomain(tf.int64, -5, 10, True), [1],
                        fixed(0))),
      ('fixed_categorical_int_with_vocab',
       sch.ColumnSchema(
           sch.IntDomain(tf.int64, vocabulary_file='test_filename'), [1],
           fixed(0))),
      ('fixed_int_without_default',
       sch.ColumnSchema(tf.int64, [5], fixed())),
      ('fixed_float_with_default',
       sch.ColumnSchema(tf.float32, [1], fixed(default_value=0.0))),
      ('fixed_float_without_default',
       sch.ColumnSchema(tf.float32, [5], fixed())),
      ('fixed_string_with_default',
       sch.ColumnSchema(tf.string, [1], fixed(default_value='default'))),
      ('fixed_string_without_default',
       sch.ColumnSchema(tf.string, [5], fixed())),
      ('3d_fixed_int_without_default',
       sch.ColumnSchema(tf.int64, [5, 6, 7], fixed())),
      # VarLenFeatures
      ('var_bool', sch.ColumnSchema(tf.bool, None, var_len())),
      ('var_int', sch.ColumnSchema(tf.int64, None, var_len())),
      ('var_float', sch.ColumnSchema(tf.float32, None, var_len())),
      ('var_string', sch.ColumnSchema(tf.string, None, var_len())),
      # SparseFeatures
      ('sparse_bool',
       sch.ColumnSchema(
           tf.bool, [15],
           sparse('sparse_bool_value', 'sparse_bool_index', True))),
      ('sparse_int',
       sch.ColumnSchema(
           tf.int64, [150],
           sparse('sparse_int_value', 'sparse_int_index', False))),
      ('sparse_float',
       sch.ColumnSchema(
           tf.float32, [1500],
           sparse('sparse_float_value', 'sparse_float_index', False))),
      ('sparse_string',
       sch.ColumnSchema(
           tf.string, [15000],
           sparse('sparse_string_value', 'sparse_string_index', True))),
  ]
  for name, column_schema in columns:
    schema.column_schemas[name] = column_schema
  return schema
def get_manually_created_schema():
  """Return a test schema assembled directly from the Schema classes.

  The schema holds fixed, variable-length and sparse columns over bool, int64,
  float32 and string dtypes.
  """
  # This verbose setup may be replaced with convenience methods in the future.
  fixed = sch.FixedColumnRepresentation
  var_len = sch.ListColumnRepresentation

  def logical(dtype, *sizes):
    # Logical column schema with the dtype's domain and one axis per size.
    return sch.LogicalColumnSchema(
        sch.dtype_to_domain(dtype),
        sch.LogicalShape([sch.Axis(size) for size in sizes]))

  def sparse(value_field, index_field, index_flag):
    # Sparse representation with a single index field.
    return sch.SparseColumnRepresentation(
        value_field, [sch.SparseIndexField(index_field, index_flag)])

  schema = sch.Schema()
  columns = [
      # FixedLenFeatures
      ('fixed_bool_with_default',
       sch.ColumnSchema(logical(tf.bool, 1), fixed(False))),
      ('fixed_bool_without_default',
       sch.ColumnSchema(logical(tf.bool, 5), fixed())),
      ('fixed_int_with_default',
       sch.ColumnSchema(logical(tf.int64, 1), fixed(0))),
      ('fixed_int_without_default',
       sch.ColumnSchema(logical(tf.int64, 5), fixed())),
      ('fixed_float_with_default',
       sch.ColumnSchema(logical(tf.float32, 1), fixed(0.0))),
      ('fixed_float_without_default',
       sch.ColumnSchema(logical(tf.float32, 5), fixed())),
      ('fixed_string_with_default',
       sch.ColumnSchema(logical(tf.string, 1), fixed('default'))),
      ('fixed_string_without_default',
       sch.ColumnSchema(logical(tf.string, 5), fixed())),
      ('3d_fixed_int_without_default',
       sch.ColumnSchema(logical(tf.int64, 5, 6, 7), fixed())),
      # VarLenFeatures
      ('var_bool', sch.ColumnSchema(logical(tf.bool, None), var_len())),
      ('var_int', sch.ColumnSchema(logical(tf.int64, None), var_len())),
      ('var_float', sch.ColumnSchema(logical(tf.float32, None), var_len())),
      ('var_string', sch.ColumnSchema(logical(tf.string, None), var_len())),
      # SparseFeatures
      ('sparse_bool',
       sch.ColumnSchema(
           logical(tf.bool, 15),
           sparse('sparse_bool_value', 'sparse_bool_index', True))),
      ('sparse_int',
       sch.ColumnSchema(
           logical(tf.int64, 150),
           sparse('sparse_int_value', 'sparse_int_index', False))),
      ('sparse_float',
       sch.ColumnSchema(
           logical(tf.float32, 1500),
           sparse('sparse_float_value', 'sparse_float_index', False))),
      ('sparse_string',
       sch.ColumnSchema(
           logical(tf.string, 15000),
           sparse('sparse_string_value', 'sparse_string_index', True))),
  ]
  for name, column_schema in columns:
    schema.column_schemas[name] = column_schema
  return schema
import tensorflow_transform as tft
from tensorflow_transform.beam.tft_beam_io import beam_metadata_io
from tensorflow_transform.beam.tft_beam_io import transform_fn_io
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.tf_metadata import metadata_io

import unittest
from tensorflow.python.framework import test_util
from tensorflow.python.lib.io import file_io

# Metadata fixture with a string fixed column of shape (3,) and a float32
# list column of shape (None,).
# NOTE(review): the key is spelled 'list_columm'; confirm nothing depends on
# that spelling before renaming it.
_TEST_METADATA_COMPLETE = dataset_metadata.DatasetMetadata({
    'fixed_column': dataset_schema.ColumnSchema(
        tf.string, (3,), dataset_schema.FixedColumnRepresentation()),
    'list_columm': dataset_schema.ColumnSchema(
        tf.float32, (None,), dataset_schema.ListColumnRepresentation())
})

# Same column names as _TEST_METADATA_COMPLETE, but 'list_columm' uses an
# int64 IntDomain whose min_value and max_value are both 0.
_TEST_METADATA = dataset_metadata.DatasetMetadata({
    'fixed_column': dataset_schema.ColumnSchema(
        tf.string, (3,), dataset_schema.FixedColumnRepresentation()),
    # zeros will be overridden
    'list_columm': dataset_schema.ColumnSchema(
        dataset_schema.IntDomain(tf.int64, min_value=0, max_value=0),
        (None,), dataset_schema.ListColumnRepresentation())
})


class BeamMetadataIoTest(test_util.TensorFlowTestCase):

  def testReadTransformFn(self):