def testInferFeatureSchema(self):
  columns = {
      'a': api._InputColumn(tf.placeholder(tf.float32, (None,)), None),
      'b': api._InputColumn(tf.placeholder(tf.string, (1, 2, 3)), None),
      'c': api._InputColumn(tf.placeholder(tf.int64, None), None)
  }
  schema = impl_helper.infer_feature_schema(columns)
  expected_schema = sch.Schema(column_schemas={
      'a': sch.ColumnSchema(
          sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float32),
                                  sch.LogicalShape([])),
          sch.FixedColumnRepresentation()),
      'b': sch.ColumnSchema(
          sch.LogicalColumnSchema(
              sch.dtype_to_domain(tf.string),
              sch.LogicalShape([sch.Axis(2), sch.Axis(3)])),
          sch.FixedColumnRepresentation()),
      'c': sch.ColumnSchema(
          sch.LogicalColumnSchema(sch.dtype_to_domain(tf.int64),
                                  sch.LogicalShape(None)),
          sch.FixedColumnRepresentation())
  })
  self.assertEqual(schema, expected_schema)
def test_logical_shape_equality(self):
  s1 = sch.LogicalShape([sch.Axis(1), sch.Axis(2)])
  s2 = sch.LogicalShape([sch.Axis(1), sch.Axis(2)])
  s3 = sch.LogicalShape([sch.Axis(0)])
  s4 = sch.LogicalShape(None)
  self.assertEqual(s1, s2)
  self.assertNotEqual(s1, s3)
  self.assertNotEqual(s3, s4)
def preprocessing_fn(inputs):
  sparse_sum = tft.map(lambda x: tf.sparse_reduce_sum(x, axis=1),
                       inputs['sparse'])
  sparse_copy = tft.map(
      lambda y: tf.SparseTensor(y.indices, y.values, y.dense_shape),
      inputs['sparse'])
  varlen_copy = tft.map(
      lambda y: tf.SparseTensor(y.indices, y.values, y.dense_shape),
      inputs['varlen'])

  sparse_copy.schema = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float32),
                              sch.LogicalShape([sch.Axis(10)])),
      sch.SparseColumnRepresentation(
          'val_copy', [sch.SparseIndexField('idx_copy', False)]))

  return {
      'fixed': sparse_sum,  # Schema should be inferred.
      'sparse': inputs['sparse'],  # Schema manually attached above.
      'varlen': inputs['varlen'],  # Schema should be inferred.
      'sparse_copy': sparse_copy,  # Schema should propagate from input.
      'varlen_copy': varlen_copy  # Schema should propagate from input.
  }
def test_infer_column_schema_from_tensor(self):
  dense = tf.constant([[1., 2.], [3., 4.]], dtype=tf.float32, shape=[2, 2])
  column_schema = sch.infer_column_schema_from_tensor(dense)
  # The leading (batch) dimension is dropped, leaving one axis of size 2.
  expected_column_schema = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float32),
                              sch.LogicalShape([sch.Axis(2)])),
      sch.FixedColumnRepresentation())
  self.assertEqual(expected_column_schema, column_schema)

  varlen = tf.sparse_placeholder(tf.string)
  column_schema = sch.infer_column_schema_from_tensor(varlen)
  # A SparseTensor of unknown shape is treated as a variable-length list.
  expected_column_schema = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.string),
                              sch.LogicalShape([sch.Axis(None)])),
      sch.ListColumnRepresentation())
  self.assertEqual(expected_column_schema, column_schema)
def _from_sparse_feature_dict(feature_dict):
  """Translate a JSON sparse feature dict into a ColumnSchema."""
  # Assume there is only one value column.
  value_feature = feature_dict['valueFeature'][0]
  domain = _to_domain(value_feature['domain'])

  index_feature_dicts = feature_dict['indexFeature']
  # int() is needed because protobuf JSON encodes int64 as string.
  axes = [sch.Axis(int(index_feature_dict['size']))
          for index_feature_dict in index_feature_dicts]
  shape = sch.LogicalShape(axes)
  logical_column = sch.LogicalColumnSchema(domain, shape)

  value_field_name = value_feature['name']
  index_fields = [
      sch.SparseIndexField(index_feature_dict['name'],
                           index_feature_dict['isSorted'])
      for index_feature_dict in index_feature_dicts]
  representation = sch.SparseColumnRepresentation(value_field_name,
                                                  index_fields)
  return sch.ColumnSchema(logical_column, representation)
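For reference, a minimal sketch of the sparse feature dict this helper expects. The field values here are hypothetical, and the exact 'domain' payload depends on _to_domain, which is defined elsewhere:

# Hypothetical example input; the 'domain' payload is a placeholder.
sparse_feature_dict = {
    'valueFeature': [{'name': 'sparse_int_value', 'domain': ...}],
    'indexFeature': [{
        'name': 'sparse_int_index',
        'size': '150',  # protobuf JSON encodes int64 as a string
        'isSorted': False
    }]
}
column_schema = _from_sparse_feature_dict(sparse_feature_dict)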
def test_logical_column_schema_equality(self):
  c1 = sch.LogicalColumnSchema(
      sch.dtype_to_domain(tf.int64),
      sch.LogicalShape([sch.Axis(5), sch.Axis(6), sch.Axis(7)]))
  c2 = sch.LogicalColumnSchema(
      sch.dtype_to_domain(tf.int64),
      sch.LogicalShape([sch.Axis(5), sch.Axis(6), sch.Axis(7)]))
  c3 = sch.LogicalColumnSchema(
      sch.dtype_to_domain(tf.int32),
      sch.LogicalShape([sch.Axis(5), sch.Axis(6), sch.Axis(7)]))
  c4 = sch.LogicalColumnSchema(
      sch.dtype_to_domain(tf.int64), sch.LogicalShape(None))
  self.assertEqual(c1, c2)
  self.assertNotEqual(c1, c3)
  self.assertNotEqual(c3, c4)
def _make_transformed_schema():
  schema = sch.Schema()
  schema.column_schemas['transformed_a'] = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.int64),
                              sch.LogicalShape([sch.Axis(1)])),
      sch.FixedColumnRepresentation())
  schema.column_schemas['transformed_b'] = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.int64),
                              sch.LogicalShape([sch.Axis(1)])),
      sch.FixedColumnRepresentation())
  schema.column_schemas['transformed_label'] = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.int64),
                              sch.LogicalShape([sch.Axis(1)])),
      sch.FixedColumnRepresentation())
  return schema
def test_column_schema_equality(self):
  c1 = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                              sch.LogicalShape([sch.Axis(1)])),
      sch.FixedColumnRepresentation(False))
  c2 = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                              sch.LogicalShape([sch.Axis(1)])),
      sch.FixedColumnRepresentation(False))
  c3 = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                              sch.LogicalShape([sch.Axis(1)])),
      sch.FixedColumnRepresentation())
  c4 = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                              sch.LogicalShape([sch.Axis(2)])),
      sch.FixedColumnRepresentation())
  self.assertEqual(c1, c2)
  self.assertNotEqual(c1, c3)
  self.assertNotEqual(c3, c4)
def _from_feature_dict(feature_dict):
  """Translate a JSON feature dict into a `ColumnSchema`."""
  domain = _to_domain(feature_dict['domain'])

  axes = []
  if 'fixedShape' in feature_dict:
    for axis in feature_dict['fixedShape']['axis']:
      # int() is needed because protobuf JSON encodes int64 as string.
      axes.append(sch.Axis(int(axis.get('size'))))
  elif 'valueCount' in feature_dict:
    # A value_count always means a 1-D feature of unknown size.
    # We don't support value_count.min and value_count.max yet.
    axes.append(sch.Axis(None))
  shape = sch.LogicalShape(axes)
  logical_column = sch.LogicalColumnSchema(domain, shape)

  tf_options = feature_dict['parsingOptions']['tfOptions']
  if tf_options.get('fixedLenFeature') is not None:
    default_value = None
    try:
      # int() is needed because protobuf JSON encodes int64 as string.
      default_value = int(tf_options['fixedLenFeature']['intDefaultValue'])
    except KeyError:
      try:
        default_value = tf_options['fixedLenFeature']['stringDefaultValue']
      except KeyError:
        try:
          default_value = tf_options['fixedLenFeature']['floatDefaultValue']
        except KeyError:
          pass
    representation = sch.FixedColumnRepresentation(default_value)
  elif tf_options.get('varLenFeature') is not None:
    representation = sch.ListColumnRepresentation()
  else:
    raise ValueError('Could not interpret tfOptions: {}'.format(tf_options))

  return sch.ColumnSchema(logical_column, representation)
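Similarly, a sketch of a fixed-length feature dict that this helper would accept, again with hypothetical values and a placeholder 'domain' payload:

# Hypothetical example input for a fixed-length int64 feature with a default.
feature_dict = {
    'domain': ...,
    'fixedShape': {'axis': [{'size': '1'}]},  # int64 encoded as a string
    'parsingOptions': {
        'tfOptions': {'fixedLenFeature': {'intDefaultValue': '0'}}
    }
}
column_schema = _from_feature_dict(feature_dict)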
def test_schema_equality(self):
  schema1 = sch.Schema(column_schemas={
      'fixed_bool_with_default': sch.ColumnSchema(
          sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                                  sch.LogicalShape([sch.Axis(1)])),
          sch.FixedColumnRepresentation(False)),
      'var_float': sch.ColumnSchema(
          sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float32),
                                  sch.LogicalShape([sch.Axis(None)])),
          sch.ListColumnRepresentation())
  })
  schema2 = sch.Schema(column_schemas={
      'fixed_bool_with_default': sch.ColumnSchema(
          sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                                  sch.LogicalShape([sch.Axis(1)])),
          sch.FixedColumnRepresentation(False)),
      'var_float': sch.ColumnSchema(
          sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float32),
                                  sch.LogicalShape([sch.Axis(None)])),
          sch.ListColumnRepresentation())
  })
  # schema3 differs from schema1 only in the dtype of 'var_float'.
  schema3 = sch.Schema(column_schemas={
      'fixed_bool_with_default': sch.ColumnSchema(
          sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                                  sch.LogicalShape([sch.Axis(1)])),
          sch.FixedColumnRepresentation(False)),
      'var_float': sch.ColumnSchema(
          sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float64),
                                  sch.LogicalShape([sch.Axis(None)])),
          sch.ListColumnRepresentation())
  })
  # schema4 is missing the 'var_float' column entirely.
  schema4 = sch.Schema(column_schemas={
      'fixed_bool_with_default': sch.ColumnSchema(
          sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                                  sch.LogicalShape([sch.Axis(1)])),
          sch.FixedColumnRepresentation(False))
  })
  self.assertEqual(schema1, schema2)
  self.assertNotEqual(schema1, schema3)
  self.assertNotEqual(schema1, schema4)
raw_data = [{
    'x': 1,
    'y': 1,
    's': 'hello'
}, {
    'x': 2,
    'y': 2,
    's': 'world'
}, {
    'x': 3,
    'y': 3,
    's': 'hello'
}]

raw_data_metadata = dataset_metadata.DatasetMetadata(
    dataset_schema.Schema({
        's': dataset_schema.ColumnSchema(
            dataset_schema.LogicalColumnSchema(
                dataset_schema.Domain(tf.string),
                dataset_schema.LogicalShape([])),
            dataset_schema.FixedColumnRepresentation()),
        'y': dataset_schema.ColumnSchema(
            dataset_schema.LogicalColumnSchema(
                dataset_schema.Domain(tf.float32),
                dataset_schema.LogicalShape([])),
            dataset_schema.FixedColumnRepresentation()),
        'x': dataset_schema.ColumnSchema(
            dataset_schema.LogicalColumnSchema(
                dataset_schema.Domain(tf.float32),
                dataset_schema.LogicalShape([])),
            dataset_schema.FixedColumnRepresentation())
    }))
def get_manually_created_schema():
  """Provide a test schema built from scratch using the Schema classes."""
  schema = sch.Schema()

  # This verbose stuff may be replaced with convenience methods in the future.

  # FixedLenFeatures
  schema.column_schemas['fixed_bool_with_default'] = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                              sch.LogicalShape([sch.Axis(1)])),
      sch.FixedColumnRepresentation(False))
  schema.column_schemas['fixed_bool_without_default'] = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                              sch.LogicalShape([sch.Axis(5)])),
      sch.FixedColumnRepresentation())
  schema.column_schemas['fixed_int_with_default'] = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.int64),
                              sch.LogicalShape([sch.Axis(1)])),
      sch.FixedColumnRepresentation(0))
  schema.column_schemas['fixed_int_without_default'] = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.int64),
                              sch.LogicalShape([sch.Axis(5)])),
      sch.FixedColumnRepresentation())
  schema.column_schemas['fixed_float_with_default'] = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float32),
                              sch.LogicalShape([sch.Axis(1)])),
      sch.FixedColumnRepresentation(0.0))
  schema.column_schemas['fixed_float_without_default'] = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float32),
                              sch.LogicalShape([sch.Axis(5)])),
      sch.FixedColumnRepresentation())
  schema.column_schemas['fixed_string_with_default'] = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.string),
                              sch.LogicalShape([sch.Axis(1)])),
      sch.FixedColumnRepresentation('default'))
  schema.column_schemas['fixed_string_without_default'] = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.string),
                              sch.LogicalShape([sch.Axis(5)])),
      sch.FixedColumnRepresentation())
  schema.column_schemas['3d_fixed_int_without_default'] = sch.ColumnSchema(
      sch.LogicalColumnSchema(
          sch.dtype_to_domain(tf.int64),
          sch.LogicalShape([sch.Axis(5), sch.Axis(6), sch.Axis(7)])),
      sch.FixedColumnRepresentation())

  # VarLenFeatures
  schema.column_schemas['var_bool'] = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                              sch.LogicalShape([sch.Axis(None)])),
      sch.ListColumnRepresentation())
  schema.column_schemas['var_int'] = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.int64),
                              sch.LogicalShape([sch.Axis(None)])),
      sch.ListColumnRepresentation())
  schema.column_schemas['var_float'] = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float32),
                              sch.LogicalShape([sch.Axis(None)])),
      sch.ListColumnRepresentation())
  schema.column_schemas['var_string'] = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.string),
                              sch.LogicalShape([sch.Axis(None)])),
      sch.ListColumnRepresentation())

  # SparseFeatures
  schema.column_schemas['sparse_bool'] = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                              sch.LogicalShape([sch.Axis(15)])),
      sch.SparseColumnRepresentation(
          'sparse_bool_value',
          [sch.SparseIndexField('sparse_bool_index', True)]))
  schema.column_schemas['sparse_int'] = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.int64),
                              sch.LogicalShape([sch.Axis(150)])),
      sch.SparseColumnRepresentation(
          'sparse_int_value',
          [sch.SparseIndexField('sparse_int_index', False)]))
  schema.column_schemas['sparse_float'] = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float32),
                              sch.LogicalShape([sch.Axis(1500)])),
      sch.SparseColumnRepresentation(
          'sparse_float_value',
          [sch.SparseIndexField('sparse_float_index', False)]))
  schema.column_schemas['sparse_string'] = sch.ColumnSchema(
      sch.LogicalColumnSchema(sch.dtype_to_domain(tf.string),
                              sch.LogicalShape([sch.Axis(15000)])),
      sch.SparseColumnRepresentation(
          'sparse_string_value',
          [sch.SparseIndexField('sparse_string_index', True)]))

  return schema
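A quick usage sketch: columns stored in column_schemas support direct equality comparison, which is what the equality tests above rely on.

# Minimal sanity check: an identically constructed column compares equal.
expected = sch.ColumnSchema(
    sch.LogicalColumnSchema(sch.dtype_to_domain(tf.bool),
                            sch.LogicalShape([sch.Axis(1)])),
    sch.FixedColumnRepresentation(False))
assert (get_manually_created_schema()
        .column_schemas['fixed_bool_with_default'] == expected)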
def transform_data(train_data_file, eval_data_file,
                   transformed_train_data_base, transformed_eval_data_base,
                   transformed_metadata_dir):
  """Transform the cleaned data and write out as a TFRecord of Example protos.

  Read in the cleaned data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical
  data from strings to int64 indices by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data.
    eval_data_file: File containing evaluation data.
    transformed_train_data_base: Base filename for transformed training data
        shards.
    transformed_eval_data_base: Base filename for transformed evaluation data
        shards.
    transformed_metadata_dir: Directory where metadata for transformed data
        should be written.
  """
  raw_data_schema = {
      key: dataset_schema.ColumnSchema(
          dataset_schema.LogicalColumnSchema(
              dataset_schema.Domain(tf.string),
              dataset_schema.LogicalShape([])),
          dataset_schema.FixedColumnRepresentation())
      for key in CATEGORICAL_COLUMNS
  }
  raw_data_schema.update({
      key: dataset_schema.ColumnSchema(
          dataset_schema.LogicalColumnSchema(
              dataset_schema.Domain(tf.float32),
              dataset_schema.LogicalShape([])),
          dataset_schema.FixedColumnRepresentation())
      for key in NUMERIC_COLUMNS
  })
  raw_data_schema[LABEL_COLUMN] = dataset_schema.ColumnSchema(
      dataset_schema.LogicalColumnSchema(dataset_schema.Domain(tf.string),
                                         dataset_schema.LogicalShape([])),
      dataset_schema.FixedColumnRepresentation())
  raw_data_schema = dataset_schema.Schema(raw_data_schema)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_data_schema)

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    outputs = {}

    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_COLUMNS:
      outputs[key] = tft.scale_to_0_1(inputs[key])

    # For all categorical columns except the label column, we use
    # tft.string_to_int which computes the set of unique values and uses this
    # to convert the strings to indices.
    for key in CATEGORICAL_COLUMNS:
      outputs[key] = tft.string_to_int(inputs[key])

    # Update outputs of both kinds to convert from shape (batch,), i.e. a
    # batch of scalars, to shape (batch, 1), i.e. a batch of vectors of
    # length 1.  This is needed so the output can be easily wrapped in
    # `FeatureColumn`s.
    for key in NUMERIC_COLUMNS + CATEGORICAL_COLUMNS:
      outputs[key] = tft.map(lambda x: tf.expand_dims(x, -1), outputs[key])

    # For the label column we provide the mapping from string to index.
    def convert_label(label):
      table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
      return table.lookup(label)
    outputs[LABEL_COLUMN] = tft.map(convert_label, inputs[LABEL_COLUMN])

    return outputs

  # The "with" block will create a pipeline, and run that pipeline at the
  # exit of the block.
  with beam.Pipeline() as p:
    # Create a coder to read the census data with the schema.  To do this we
    # need to list all columns in order since the schema doesn't specify the
    # order of columns in the csv.
    ordered_columns = [
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'label'
    ]
    converter = csv_coder.CsvCoder(ordered_columns, raw_data_schema)

    # Read in raw data and convert using the CSV converter.  Note that we
    # apply some Beam transformations here, which will not be encoded in the
    # TF graph since we don't do them from within tf.Transform's methods
    # (AnalyzeDataset, TransformDataset etc.).  These transformations are
    # just to get data into a format that the CSV converter can read, in
    # particular removing empty lines and removing spaces after commas.
    raw_data = (
        p
        | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
        | 'FilterTrainData' >> beam.Filter(lambda line: line)
        | 'FixCommasTrainData' >> beam.Map(
            lambda line: line.replace(', ', ','))
        | 'DecodeTrainData' >> beam.Map(converter.decode))

    # Combine data and schema into a dataset tuple.  Note that we already
    # used the schema to read the CSV data, but we also need it to interpret
    # raw_data.
    raw_dataset = (raw_data, raw_data_metadata)
    transformed_dataset, transform_fn = (
        raw_dataset | beam_impl.AnalyzeAndTransformDataset(
            preprocessing_fn, output_dir=os.path.join(tempfile.mkdtemp())))
    transformed_data, transformed_metadata = transformed_dataset

    _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
        transformed_train_data_base,
        coder=example_proto_coder.ExampleProtoCoder(
            transformed_metadata.schema))

    # Now apply the transform function to the eval data.  In this case we
    # also remove the header line from the CSV file and the trailing period
    # at the end of each line.
    raw_eval_data = (
        p
        | 'ReadEvalData' >> textio.ReadFromText(eval_data_file)
        | 'FilterEvalData' >> beam.Filter(
            lambda line: line and line != '|1x3 Cross validator')
        | 'FixCommasEvalData' >> beam.Map(
            lambda line: line.replace(', ', ','))
        | 'RemoveTrailingPeriodsEvalData' >> beam.Map(lambda line: line[:-1])
        | 'DecodeEvalData' >> beam.Map(converter.decode))

    raw_eval_dataset = (raw_eval_data, raw_data_metadata)
    transformed_eval_dataset = (
        (raw_eval_dataset, transform_fn) | beam_impl.TransformDataset())
    # We don't need the transformed data schema; it's the same as before.
    transformed_eval_data, _ = transformed_eval_dataset

    _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
        transformed_eval_data_base,
        coder=example_proto_coder.ExampleProtoCoder(
            transformed_metadata.schema))

    _ = (transformed_metadata
         | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
             transformed_metadata_dir, pipeline=p))
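A hedged sketch of how this entry point might be invoked; the census CSV filenames and the output paths below are placeholders, not part of the original code:

# Hypothetical invocation; all input and output paths are placeholders.
if __name__ == '__main__':
  transform_data(
      train_data_file='adult.data',
      eval_data_file='adult.test',
      transformed_train_data_base='/tmp/census_transformed/train',
      transformed_eval_data_base='/tmp/census_transformed/eval',
      transformed_metadata_dir='/tmp/census_transformed/metadata')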