def encode():
    """Generate data, transform it and write it out encoded as ELWC.

    Builds and runs a Beam pipeline on the DirectRunner that:
      * creates a PCollection from ``generate_data(100)``,
      * analyzes and transforms it with ``preprocessing_fn``,
      * encodes every transformed element with an ``ELWCProtoCoder`` and
        writes the result as TFRecord files (prefix ``./output/data``,
        suffix ``.tfrecords``),
      * writes the transform function to ``./output``.
    """
    output_path = "./output"

    pipeline_options = PipelineOptions()
    pipeline_options.view_as(StandardOptions).runner = "DirectRunner"

    with beam.Pipeline(options=pipeline_options) as pipeline:
        with tft_beam.Context(temp_dir="./tmp"):
            generated = generate_data(100)
            raw_pcoll = pipeline | beam.Create(generated)

            transformed_dataset, transform_fn = (
                (raw_pcoll, raw_metadata)
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

            coder = ELWCProtoCoder(context_specs, examples_specs)
            # Only the data half of the (data, metadata) pair is used below.
            transformed_pcoll, _ = transformed_dataset

            _ = (
                transformed_pcoll
                | beam.Map(coder.encode)
                | beam.io.WriteToTFRecord(
                    file_path_prefix="{}/data".format(output_path),
                    file_name_suffix=".tfrecords"))

            _ = transform_fn | tft_beam.WriteTransformFn(output_path)
def testPreprocessingFn(self):
    """Run the taxi preprocessing_fn and compare its schema to the golden one."""
    testdata = self._testdata_path
    raw_schema = io_utils.parse_pbtxt_file(
        os.path.join(testdata, 'schema_gen/schema.pbtxt'),
        schema_pb2.Schema())
    feature_spec = taxi_utils._get_raw_feature_spec(raw_schema)

    working_dir = self.get_temp_dir()
    transform_graph_path = os.path.join(working_dir, 'transform_graph')
    transformed_examples_path = os.path.join(
        working_dir, 'transformed_examples')

    # Run a very simplified version of the executor logic.
    # TODO(kestert): Replace with tft_unit.assertAnalyzeAndTransformResults.
    # Generate a legacy `DatasetMetadata` object.  A future version of
    # Transform will accept the `Schema` proto directly.
    legacy_metadata = dataset_metadata.DatasetMetadata(
        schema_utils.schema_from_feature_spec(feature_spec))
    train_source = tf_example_record.TFExampleRecord(
        file_pattern=os.path.join(testdata, 'csv_example_gen/Split-train/*'),
        telemetry_descriptors=['Tests'],
        schema=legacy_metadata.schema)

    with beam.Pipeline() as pipeline:
        with tft_beam.Context(temp_dir=os.path.join(working_dir, 'tmp')):
            raw_examples = (
                pipeline | 'ReadTrainData' >> train_source.BeamSource())
            transformed_dataset, transform_fn = (
                (raw_examples, train_source.TensorAdapterConfig())
                | 'AnalyzeAndTransform' >>
                tft_beam.AnalyzeAndTransformDataset(
                    taxi_utils.preprocessing_fn))
            transformed_examples, transformed_metadata = transformed_dataset

            # WriteTransformFn writes transform_fn and metadata to the
            # subdirectories tensorflow_transform.SAVED_MODEL_DIR and
            # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
            _ = (
                transform_fn
                | 'WriteTransformFn' >>
                tft_beam.WriteTransformFn(transform_graph_path))

            encoder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)
            _ = (
                transformed_examples
                | 'EncodeTrainData' >> beam.Map(encoder.encode)
                | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                    os.path.join(transformed_examples_path,
                                 'Split-train/transformed_examples.gz'),
                    coder=beam.coders.BytesCoder()))

    # Verify the output matches the golden output.
    # NOTE: we don't verify that transformed examples match golden output.
    golden_schema = io_utils.parse_pbtxt_file(
        os.path.join(
            testdata,
            'transform/transform_graph/transformed_metadata/schema.pbtxt'),
        schema_pb2.Schema())
    actual_schema = io_utils.parse_pbtxt_file(
        os.path.join(transform_graph_path,
                     'transformed_metadata/schema.pbtxt'),
        schema_pb2.Schema())

    # Clear annotations so we only have to test the main schema.
    actual_schema.ClearField('annotation')
    for feature in actual_schema.feature:
        feature.ClearField('annotation')
    self.assertEqual(actual_schema, golden_schema)
def test_train(self):
    """Tests case where training data is passed."""
    tft_tmp_dir = os.path.join(self.test_dir, 'tmp')

    with self.pipeline as p, tft_beam.Context(temp_dir=tft_tmp_dir):
        # Only the rows of the TRAIN split are fed into the pipeline.
        train_df = self.pre_tft_df[self.pre_tft_df.split == 'TRAIN']
        dataset = self._get_dataset(p, train_df)
        preprocessing_fn = functools.partial(
            beam_pipeline._preprocessing_fn,
            schema_map=self.schema.pre_tft_schema_map)
        transform_fn = beam_pipeline._transform_and_write_tfr(
            dataset,
            self.tfr_writer,
            preprocessing_fn=preprocessing_fn,
            metadata=self.pre_tft_metadata,
            label='Train')
        _ = transform_fn | tft_beam.WriteTransformFn(self.test_dir)

    # The pipeline has run once the `with` block exits; inspect its output.
    for subdir in ('transform_fn', 'transformed_metadata'):
        self.assertTrue(
            os.path.isdir(os.path.join(self.test_dir, subdir)))

    self.assertTrue(glob.glob(os.path.join(self.test_dir, 'train*.gz')))

    # No validation or test shards may be produced from TRAIN-only input.
    for pattern in ('validation*.gz', 'test*.gz'):
        self.assertFalse(glob.glob(os.path.join(self.test_dir, pattern)))
def _main(argv=None):
    """Analyze raw examples and write the resulting transform function.

    Args:
      argv: Argument list handed to ``parse_known_args``.  Arguments the
        parser does not know are forwarded to Beam as pipeline options.
    """
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    required_flags = (
        '--raw_examples_path',
        '--raw_examples_schema_path',
        '--preprocessing_module_path',
        '--transform_fn_dir',
    )
    for flag in required_flags:
        parser.add_argument(flag, required=True)
    known_args, pipeline_args = parser.parse_known_args(argv)

    schema = load_schema(known_args.raw_examples_schema_path)
    examples_coder = tft.coders.ExampleProtoCoder(schema)
    examples_metadata = dataset_metadata.DatasetMetadata(schema)

    # The preprocessing_fn is loaded from a user-supplied module file.
    preprocessing_module = load_module_from_file_path(
        'tft_preprocessing', known_args.preprocessing_module_path)
    preprocessing_fn = preprocessing_module.preprocessing_fn

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as pipeline:
        with tft_beam.Context(temp_dir=get_beam_temp_dir(pipeline_options)):
            examples = (
                pipeline
                | 'ReadRawExamples' >> beam.io.ReadFromTFRecord(
                    known_args.raw_examples_path, coder=examples_coder))

            transform_fn = (
                (examples, examples_metadata)
                | tft_beam.AnalyzeDataset(preprocessing_fn))

            _ = (
                transform_fn
                | 'WriteTransformFn' >> tft_beam.WriteTransformFn(
                    known_args.transform_fn_dir))
def transform_tft(train_data, test_data, working_dir):
    """Transform train and test data with tf.Transform and write TFRecords.

    The transform function is analyzed on the training data only and then
    re-applied to the test data.

    Args:
      train_data: Indexable training data; ``train_data[0][0].shape`` is used
        as the shape of the image feature.
      test_data: Test data, formatted the same way as ``train_data``.
      working_dir: Directory receiving the transformed TFRecord files and the
        transform function.
    """
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(StandardOptions).runner = 'DirectRunner'

    with beam.Pipeline(options=pipeline_options) as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            image_shape = train_data[0][0].shape

            raw_train = (
                pipeline
                | 'ReadTrainData' >> beam.Create(train_data)
                | 'CreateTrainData' >> beam.Map(lambda data: format(data)))

            feature_spec = {
                IMAGE_KEY: tf.FixedLenFeature(list(image_shape), tf.float32),
                LABEL_KEY: tf.FixedLenFeature([], tf.int64),
            }
            raw_metadata = dataset_metadata.DatasetMetadata(
                dataset_schema.from_feature_spec(feature_spec))

            # Analyze and transform the training data in a single step.
            (transformed_train, transformed_metadata), transform_fn = (
                (raw_train, raw_metadata)
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

            example_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            _ = (
                transformed_train
                | 'EncodeTrainData' >> beam.Map(example_coder.encode)
                | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE),
                    file_name_suffix='.tfrecords'))

            raw_test = (
                pipeline
                | 'ReadTestData' >> beam.Create(test_data)
                | 'CreateTestData' >> beam.Map(lambda data: format(data)))

            # The transformed schema is the same as for the training data,
            # so the metadata returned here is discarded.
            transformed_test, _ = (
                ((raw_test, raw_metadata), transform_fn)
                | tft_beam.TransformDataset())

            _ = (
                transformed_test
                | 'EncodeTestData' >> beam.Map(example_coder.encode)
                | 'WriteTestData' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE),
                    file_name_suffix='.tfrecords'))

            _ = (
                transform_fn
                | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
def transformed_data(working_dir):
    """Process the data and generate the transform_fn.

    Args:
      working_dir: Directory the transform function is written to.

    Returns:
      The transformed training data produced by AnalyzeAndTransformDataset.
    """

    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        # No top_k limit is applied to either vocabulary.
        return {
            "x": tft.compute_and_apply_vocabulary(
                inputs["x"], default_value=0, name="vocab"),
            "y": tft.compute_and_apply_vocabulary(
                inputs["y"], default_value=0, name="label"),
        }

    # `path_transform` is the temporary directory used by tf.Transform.
    with tft_beam.Context(temp_dir=path_transform):
        dataset, transform_fn = (
            (xys, DATA_STRING_FEATURE_SPEC)
            | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
        # The transformed metadata is not needed by this function.
        transformed_train_data, _ = dataset
        _ = transform_fn | tft_beam.WriteTransformFn(working_dir)

    return transformed_train_data
def transform_train_and_eval(pipeline, train_data, eval_data, data_source,
                             transform_dir, output_dir, schema):
    """Analyzes and transforms data.

    The transform function is always computed from the training data; it is
    written to `transform_dir` and then applied to both the training and the
    evaluation data.

    Args:
      pipeline: Beam Pipeline instance.
      train_data: Training data, forwarded to `ReadData`.
      eval_data: Evaluation data, forwarded to `ReadData`.
      data_source: Input data source, forwarded to `ReadData`.  The original
        documentation states it expects either `csv` or `bigquery`.
      transform_dir: Directory the transform function is written to.
      output_dir: Directory to write transformed output.
      schema: A text-serialized TensorFlow metadata schema for the input data.
    """
    mode_keys = tf.estimator.ModeKeys

    raw_train = (
        pipeline
        | 'ReadTrainData' >> ReadData(
            train_data, data_source, schema, mode_keys.TRAIN))
    raw_eval = (
        pipeline
        | 'ReadEvalData' >> ReadData(
            eval_data, data_source, schema, mode_keys.EVAL))

    # The dataset schema is always built in TRAIN mode, for both datasets.
    parsed_schema = utils.make_dataset_schema(schema, mode=mode_keys.TRAIN)
    input_metadata = dataset_metadata.DatasetMetadata(parsed_schema)

    logger.info('Creating new transform model.')
    transform_fn = (
        (raw_train, input_metadata)
        | ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

    _ = (
        transform_fn
        | ('WriteTransformFn' >> tft_beam.WriteTransformFn(transform_dir)))

    _ = (
        raw_train
        | 'TransformAndWriteTraining' >> transform_and_write(
            input_metadata, output_dir, transform_fn, _TRAIN_PREFIX))

    _ = (
        raw_eval
        | 'TransformAndWriteEval' >> transform_and_write(
            input_metadata, output_dir, transform_fn, _EVAL_PREFIX))
def test_single_phase_mixed_analyzer_run_once(self):
    """Analyze two spans where only span-0 has pre-populated cache entries.

    Checks the transformed output, the relative sizes of the per-span output
    caches, and the pipeline's metric counters.
    """
    span_0_key = 'span-0'
    span_1_key = 'span-1'

    def preprocessing_fn(inputs):
        """Mix vocabulary, bucketize, min and mean analyzers in one phase."""
        integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])

        # The bucketize result is deliberately discarded; only the analyzer
        # it adds to the graph matters here.
        _ = tft.bucketize(inputs['x'], 2, name='bucketize')

        return {
            'integerized_s':
                integerized_s,
            'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
            'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
            'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
        }

    # Run AnalyzeAndTransform on some input data and compare with expected
    # output.
    input_data = [{'x': 12, 'y': 1, 's': 'd'}, {'x': 10, 'y': 1, 's': 'c'}]
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            'x': tf.io.FixedLenFeature([], tf.float32),
            'y': tf.io.FixedLenFeature([], tf.float32),
            's': tf.io.FixedLenFeature([], tf.string),
        }))
    input_data_dict = {
        span_0_key: [{
            'x': -2,
            'y': 1,
            's': 'b',
        }, {
            'x': 4,
            'y': -4,
            's': 'b',
        }],
        span_1_key: input_data,
    }

    with _TestPipeline() as p:
        flat_data = p | 'CreateInputData' >> beam.Create(
            list(itertools.chain(*input_data_dict.values())))

        # Input cache: span-0 comes with four cache entries (x and y, each
        # with a mean_and_var and a min entry); span-1 starts out empty.
        cache_dict = {
            span_0_key: {
                b'__v0__CacheableCombineAccumulate[x_1/mean_and_var]-.\xc4t>ZBv\xea\xa5SU\xf4\x065\xc6\x1c\x81W\xf9\x1b':
                    p | 'CreateA' >> beam.Create([b'[2.0, 1.0, 9.0, 0.0]']),
                b'__v0__CacheableCombineAccumulate[x/x]-\x95\xc5w\x88\x85\x8b5V\xc9\x00\xe0\x0f\x03\x1a\xdaL\x9d\xd5\xb3\xe3':
                    p | 'CreateB' >> beam.Create([b'[2.0, 4.0]']),
                b'__v0__CacheableCombineAccumulate[y_1/mean_and_var]-E^\xb7VZ\xeew4rm\xab\xa3\xa4k|J\x80ck\x16':
                    p | 'CreateC' >> beam.Create([b'[2.0, -1.5, 6.25, 0.0]']),
                b'__v0__CacheableCombineAccumulate[y/y]-\xdf\x1ey\x03\x1c\x96\xd5'
                b' e\x9bJ\xa1\xd2\xfc\x9c\x03\x0fM \xdb':
                    p | 'CreateD' >> beam.Create([b'[4.0, 1.0]']),
            },
            span_1_key: {},
        }

        transform_fn, cache_output = (
            (flat_data, input_data_dict, cache_dict, input_metadata)
            | 'Analyze' >>
            (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
        _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
            self._cache_dir)

        # Only span-1 is transformed; the analysis above covered both spans.
        transformed_dataset = ((
            (input_data_dict[span_1_key], input_metadata), transform_fn)
                               | 'Transform' >> beam_impl.TransformDataset())

        dot_string = nodes.get_dot_graph([analysis_graph_builder._ANALYSIS_GRAPH
                                         ]).to_string()
        self.WriteRenderedDotFile(dot_string)

        # The output cache should not have entries for the cache that is present
        # in the input cache.
        self.assertEqual(
            len(cache_output[span_0_key]),
            len(cache_output[span_1_key]) - 4)

        transformed_data, unused_transformed_metadata = transformed_dataset

        expected_transformed = [
            {
                'x_mean': 6.0,
                'x_min': -2.0,
                'y_mean': -0.25,
                'y_min': -4.0,
                'integerized_s': 1,
            },
            {
                'x_mean': 6.0,
                'x_min': -2.0,
                'y_mean': -0.25,
                'y_min': -4.0,
                'integerized_s': 2,
            },
        ]
        beam_test_util.assert_that(transformed_data,
                                   beam_test_util.equal_to(expected_transformed))

        transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
        _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)

    # NOTE(review): the metric checks are placed after the `with` block on the
    # assumption that metrics are only available once the pipeline has run.
    # 4 from analyzing 2 spans, and 2 from transform.
    self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 6)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 4)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 8)
    self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 2)
def test_preprocessing_fn(self):
    """Run the taxi preprocessing_fn and compare its schema to the golden one."""
    testdata = self._testdata_path
    raw_schema = io_utils.parse_pbtxt_file(
        os.path.join(testdata, 'schema_gen/schema.pbtxt'),
        schema_pb2.Schema())
    feature_spec = taxi_utils._get_raw_feature_spec(raw_schema)

    working_dir = self.get_temp_dir()
    transform_output_path = os.path.join(working_dir, 'transform_output')
    transformed_examples_path = os.path.join(working_dir,
                                             'transformed_examples')

    # Run a very simplified version of the executor logic.
    # TODO(kestert): Replace with tft_unit.assertAnalyzeAndTransformResults.
    # Generate a legacy `DatasetMetadata` object.  A future version of
    # Transform will accept the `Schema` proto directly.
    legacy_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(feature_spec))
    decoder = tft.coders.ExampleProtoCoder(legacy_metadata.schema)

    with beam.Pipeline() as pipeline:
        with tft_beam.Context(temp_dir=os.path.join(working_dir, 'tmp')):
            raw_examples = (
                pipeline
                | 'ReadTrainData' >> beam.io.ReadFromTFRecord(
                    os.path.join(testdata, 'csv_example_gen/train/*'),
                    coder=beam.coders.BytesCoder(),
                    # TODO(b/114938612): Eventually remove this override.
                    validate=False)
                | 'DecodeTrainData' >> beam.Map(decoder.decode))

            transformed_dataset, transform_fn = (
                (raw_examples, legacy_metadata)
                | 'AnalyzeAndTransform' >>
                tft_beam.AnalyzeAndTransformDataset(
                    taxi_utils.preprocessing_fn))
            transformed_examples, transformed_metadata = transformed_dataset

            # WriteTransformFn writes transform_fn and metadata to the
            # subdirectories tensorflow_transform.SAVED_MODEL_DIR and
            # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
            _ = (
                transform_fn
                | 'WriteTransformFn' >>
                tft_beam.WriteTransformFn(transform_output_path))

            encoder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)
            _ = (
                transformed_examples
                | 'EncodeTrainData' >> beam.Map(encoder.encode)
                | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                    os.path.join(transformed_examples_path,
                                 'train/transformed_examples.gz'),
                    coder=beam.coders.BytesCoder()))

    # Verify the output matches the golden output.
    # NOTE: we don't verify that transformed examples match golden output.
    golden_schema = io_utils.parse_pbtxt_file(
        os.path.join(
            testdata,
            'transform/transform_output/transformed_metadata/schema.pbtxt'),
        schema_pb2.Schema())
    actual_schema = io_utils.parse_pbtxt_file(
        os.path.join(transform_output_path,
                     'transformed_metadata/schema.pbtxt'),
        schema_pb2.Schema())

    # Clear annotations so we only have to test the main schema.
    for feature in actual_schema.feature:
        feature.ClearField('annotation')
    self.assertEqual(actual_schema, golden_schema)
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None):
    """The main tf.transform method which analyzes and transforms data.

    Args:
      input_handle: BigQuery table name to process specified as DATASET.TABLE
        or path to csv file with input data.
      outfile_prefix: Filename prefix for emitted transformed examples.
      working_dir: Directory in which transformed examples and transform
        function will be emitted.  Also used as the tf.Transform temp dir.
      schema_file: A file path that contains a text-serialized TensorFlow
        metadata schema of the input data.
      transform_dir: Directory in which the transform output is located.  If
        provided, this will load the transform_fn from disk instead of
        computing it over the data.  Hint: this is useful for transforming
        eval data.
      max_rows: Number of rows to query from BigQuery.
      pipeline_args: Additional DataflowRunner or DirectRunner args passed to
        the beam pipeline.
    """

    def preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

        Args:
          inputs: map from feature keys to raw not-yet-transformed features.

        Returns:
          Map from string feature key to transformed feature operations.
        """
        transformed = {}

        for name in taxi.DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the
            # mean.
            transformed[taxi.transformed_name(name)] = (
                transform.scale_to_z_score(_fill_in_missing(inputs[name])))

        for name in taxi.VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            transformed[taxi.transformed_name(name)] = (
                transform.compute_and_apply_vocabulary(
                    _fill_in_missing(inputs[name]),
                    top_k=taxi.VOCAB_SIZE,
                    num_oov_buckets=taxi.OOV_SIZE))

        for name in taxi.BUCKET_FEATURE_KEYS:
            transformed[taxi.transformed_name(name)] = transform.bucketize(
                _fill_in_missing(inputs[name]), taxi.FEATURE_BUCKET_COUNT)

        for name in taxi.CATEGORICAL_FEATURE_KEYS:
            transformed[taxi.transformed_name(name)] = _fill_in_missing(
                inputs[name])

        # Was this passenger a big tipper?
        fare = _fill_in_missing(inputs[taxi.FARE_KEY])
        tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
        fare_is_nan = tf.is_nan(fare)
        label_when_nan = tf.cast(tf.zeros_like(fare), tf.int64)
        # Test if the tip was > 20% of the fare.
        label_big_tip = tf.cast(
            tf.greater(tips, tf.multiply(fare, tf.constant(0.2))), tf.int64)
        transformed[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
            fare_is_nan, label_when_nan, label_big_tip)

        return transformed

    schema = taxi.read_schema(schema_file)
    raw_feature_spec = taxi.get_raw_feature_spec(schema)
    raw_data_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(raw_feature_spec))

    with beam.Pipeline(argv=pipeline_args) as pipeline:
        with tft_beam.Context(temp_dir=working_dir):
            reads_csv = input_handle.lower().endswith('csv')
            if not reads_csv:
                query = taxi.make_sql(input_handle, max_rows, for_eval=False)
                raw_data = (
                    pipeline
                    | 'ReadBigQuery' >> beam.io.Read(
                        beam.io.BigQuerySource(
                            query=query, use_standard_sql=True)))
                decode_transform = beam.Map(
                    taxi.clean_raw_data_dict,
                    raw_feature_spec=raw_feature_spec)
            else:
                csv_coder = taxi.make_csv_coder(schema)
                raw_data = (
                    pipeline
                    | 'ReadFromText' >> beam.io.ReadFromText(
                        input_handle, skip_header_lines=1))
                decode_transform = beam.Map(csv_coder.decode)

            if transform_dir is not None:
                # Re-use a transform function computed by an earlier run.
                transform_fn = pipeline | tft_beam.ReadTransformFn(
                    transform_dir)
            else:
                analysis_input = (
                    raw_data | 'DecodeForAnalyze' >> decode_transform)
                transform_fn = (
                    (analysis_input, raw_data_metadata)
                    | ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))
                _ = (
                    transform_fn
                    | ('WriteTransformFn' >>
                       tft_beam.WriteTransformFn(working_dir)))

            # Shuffling the data before materialization will improve Training
            # effectiveness downstream. Here we shuffle the raw_data (as
            # opposed to decoded data) since it has a compact representation.
            shuffled = (
                raw_data | 'RandomizeData' >> beam.transforms.Reshuffle())
            decoded = shuffled | 'DecodeForTransform' >> decode_transform

            transformed_data, transformed_metadata = (
                ((decoded, raw_data_metadata), transform_fn)
                | 'Transform' >> tft_beam.TransformDataset())

            example_coder = example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema)
            _ = (
                transformed_data
                | 'SerializeExamples' >> beam.Map(example_coder.encode)
                | 'WriteExamples' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, outfile_prefix),
                    file_name_suffix='.gz'))
def test_single_phase_run_twice(self):
    """Run the same single-phase analysis twice, re-using the written cache.

    The first pipeline starts with an empty input cache and writes its
    analysis cache to `self._cache_dir`.  The second pipeline reads that
    cache back and is expected to return no new cache output while still
    producing the same transformed data.
    """
    span_0_key = 'span-0'
    span_1_key = 'span-1'

    def preprocessing_fn(inputs):
        """Use vocabulary, bucketize, min and mean analyzers in one phase."""

        # Results of these two analyzers are deliberately discarded; only
        # the analyzers they add to the graph matter here.
        _ = tft.vocabulary(inputs['s'])

        _ = tft.bucketize(inputs['x'], 2, name='bucketize')

        return {
            'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
            'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
            'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
        }

    input_data = [{'x': 12, 'y': 1, 's': 'b'}, {'x': 10, 'y': 1, 's': 'c'}]
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            'x': tf.io.FixedLenFeature([], tf.float32),
            'y': tf.io.FixedLenFeature([], tf.float32),
            's': tf.io.FixedLenFeature([], tf.string),
        }))
    input_data_dict = {
        span_0_key: [{
            'x': -2,
            'y': 1,
            's': 'a',
        }, {
            'x': 4,
            'y': -4,
            's': 'a',
        }],
        span_1_key: input_data,
    }

    # First run: empty input cache, the analysis cache is written to disk.
    with beam_impl.Context(temp_dir=self.get_temp_dir()):
        with beam.Pipeline() as p:

            flat_data = p | 'CreateInputData' >> beam.Create(
                list(itertools.chain(*input_data_dict.values())))

            # wrap each value in input_data_dict as a pcoll.
            input_data_pcoll_dict = {}
            for a, b in six.iteritems(input_data_dict):
                input_data_pcoll_dict[a] = p | a >> beam.Create(b)

            transform_fn_1, cache_output = (
                (flat_data, input_data_pcoll_dict, {}, input_metadata)
                | 'Analyze' >>
                (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
            _ = (cache_output | 'WriteCache' >>
                 analyzer_cache.WriteAnalysisCacheToFS(self._cache_dir))

            transformed_dataset = (
                ((input_data_pcoll_dict[span_1_key], input_metadata),
                 transform_fn_1)
                | 'Transform' >> beam_impl.TransformDataset())

            del input_data_pcoll_dict
            transformed_data, unused_transformed_metadata = transformed_dataset

            expected_transformed_data = [
                {
                    'x_mean': 6.0,
                    'x_min': -2.0,
                    'y_mean': -0.25,
                    'y_min': -4.0,
                },
                {
                    'x_mean': 6.0,
                    'x_min': -2.0,
                    'y_mean': -0.25,
                    'y_min': -4.0,
                },
            ]
            beam_test_util.assert_that(
                transformed_data,
                beam_test_util.equal_to(expected_transformed_data),
                label='first')

            transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn_1')
            _ = transform_fn_1 | tft_beam.WriteTransformFn(
                transform_fn_dir)

            # Every span must have produced six cache entries.
            for key in input_data_dict:
                self.assertIn(key, cache_output)
                self.assertEqual(6, len(cache_output[key]))

    # Second run: the cache written above is read back and used as input.
    with beam_impl.Context(temp_dir=self.get_temp_dir()):
        with beam.Pipeline() as p:

            flat_data = p | 'CreateInputData' >> beam.Create(
                list(itertools.chain(*input_data_dict.values())))

            # wrap each value in input_data_dict as a pcoll.
            input_data_pcoll_dict = {}
            for a, b in six.iteritems(input_data_dict):
                input_data_pcoll_dict[a] = p | a >> beam.Create(b)

            input_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
                self._cache_dir, list(input_data_dict.keys()))

            transform_fn_2, second_output_cache = (
                (flat_data, input_data_pcoll_dict, input_cache, input_metadata)
                | 'AnalyzeAgain' >>
                (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

            dot_string = nodes.get_dot_graph(
                [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
            self.WriteRenderedDotFile(dot_string)

            # NOTE(review): unlike the first run, this passes the raw list
            # from input_data_dict rather than the wrapped PCollection;
            # confirm this is intended.
            transformed_dataset = (
                ((input_data_dict[span_1_key], input_metadata), transform_fn_2)
                | 'TransformAgain' >> beam_impl.TransformDataset())
            transformed_data, unused_transformed_metadata = transformed_dataset
            beam_test_util.assert_that(
                transformed_data,
                beam_test_util.equal_to(expected_transformed_data),
                label='second')

            # Nothing new should have been written to the cache this time.
            self.assertFalse(second_output_cache)
def transform_data(working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

    Read in the shuffled train and test examples from disk, and transform
    them using a preprocessing pipeline that splits reviews into tokens, maps
    tokens to int64 vocabulary indices and computes tf-idf weights.

    Args:
      working_dir: Directory to read shuffled data from and write transformed
        data and metadata to.
    """

    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        # Here tf.compat.v1.string_split behaves differently from
        # tf.strings.split.
        tokens = tf.compat.v1.string_split(inputs[REVIEW_KEY], DELIMITERS)
        token_ids = tft.compute_and_apply_vocabulary(tokens, top_k=VOCAB_SIZE)
        # Add one for the oov bucket created by
        # compute_and_apply_vocabulary.
        bow_indices, weights = tft.tfidf(token_ids, VOCAB_SIZE + 1)
        return {
            REVIEW_KEY: bow_indices,
            REVIEW_WEIGHT_KEY: weights,
            LABEL_KEY: inputs[LABEL_KEY]
        }

    tft_temp_dir = os.path.join(working_dir, TRANSFORM_TEMP_DIR)

    with beam.Pipeline() as pipeline, tft_beam.Context(temp_dir=tft_temp_dir):
        train_source = tfxio.TFExampleRecord(
            file_pattern=os.path.join(
                working_dir, SHUFFLED_TRAIN_DATA_FILEBASE + '*'),
            schema=SCHEMA)
        train_data = (
            pipeline | 'TFXIORead[Train]' >> train_source.BeamSource())

        test_source = tfxio.TFExampleRecord(
            file_pattern=os.path.join(
                working_dir, SHUFFLED_TEST_DATA_FILEBASE + '*'),
            schema=SCHEMA)
        test_data = (
            pipeline | 'TFXIORead[Test]' >> test_source.BeamSource())

        # Transformed metadata is not necessary for encoding.
        # The TFXIO output format is chosen for improved performance.
        (transformed_train, _), transform_fn = (
            (train_data, train_source.TensorAdapterConfig())
            | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
                preprocessing_fn, output_record_batches=True))

        transformed_test, _ = (
            ((test_data, test_source.TensorAdapterConfig()), transform_fn)
            | 'Transform' >>
            tft_beam.TransformDataset(output_record_batches=True))

        # Extract transformed RecordBatches, encode and write them to the
        # given directory.
        coder = tfxio.RecordBatchToExamplesEncoder()

        _ = (
            transformed_train
            | 'EncodeTrainData' >>
            beam.FlatMapTuple(lambda batch, _: coder.encode(batch))
            | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))

        _ = (
            transformed_test
            | 'EncodeTestData' >>
            beam.FlatMapTuple(lambda batch, _: coder.encode(batch))
            | 'WriteTestData' >> beam.io.WriteToTFRecord(
                os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

        # Will write a SavedModel and metadata to two subdirectories of
        # working_dir, given by tft.TRANSFORM_FN_DIR and
        # tft.TRANSFORMED_METADATA_DIR respectively.
        _ = (
            transform_fn
            | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
def transform_data(train_data_file, test_data_file, working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

    Read in the data using the CSV reader, and transform it using a
    preprocessing pipeline that scales numeric data and converts categorical
    data from strings to int64 values indices, by creating a vocabulary for
    each category.

    Args:
      train_data_file: File containing training data
      test_data_file: File containing test data
      working_dir: Directory to write transformed data and metadata to
    """
    # The schema doesn't specify the order of columns in the csv, so the
    # coder needs every column listed in file order.
    ordered_columns = [
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'label'
    ]

    # The "with" block will create a pipeline, and run that pipeline at the
    # exit of the block.
    with apache_beam.Pipeline() as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            # Coder used to read the census data with the schema.
            csv_coder = tft.coders.CsvCoder(ordered_columns,
                                            RAW_DATA_METADATA.schema)

            # Read in raw data and convert using the CSV coder.  The Beam
            # transformations applied here will not be encoded in the TF
            # graph since they are not done from within tf.Transform's
            # methods (AnalyzeDataset, TransformDataset etc.).  They only get
            # the data into a format that the CSV coder can read, in
            # particular removing spaces after commas.
            #
            # MapAndFilterErrors is used instead of Map to filter out decode
            # errors in converter.decode, which should only occur for the
            # trailing blank line.
            raw_train = (
                pipeline
                | 'ReadTrainData' >>
                apache_beam.io.ReadFromText(train_data_file)
                | 'FixCommasTrainData' >>
                apache_beam.Map(lambda line: line.replace(', ', ','))
                | 'DecodeTrainData' >> MapAndFilterErrors(csv_coder.decode))

            # Combine data and schema into a dataset tuple.  The schema was
            # already used to read the CSV data, but it is also needed to
            # interpret the raw data.
            (transformed_train, transformed_metadata), transform_fn = (
                (raw_train, RAW_DATA_METADATA)
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

            # A coder between TF Examples and tf.Transform datasets.
            # Used to encode a tf.transform encoded dict as tf.Example.
            example_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            _ = (
                transformed_train
                | 'EncodeTrainData' >> apache_beam.Map(example_coder.encode)
                | 'WriteTrainData' >> apache_beam.io.WriteToTFRecord(
                    os.path.join(working_dir,
                                 TRANSFORMED_TRAIN_DATA_FILEBASE)))

            # Now apply the transform function to the test data.  Here the
            # trailing period at the end of each line is removed, and the
            # header line that is present in the test data file is skipped.
            raw_test = (
                pipeline
                | 'ReadTestData' >> apache_beam.io.ReadFromText(
                    test_data_file, skip_header_lines=1)
                | 'FixCommasTestData' >>
                apache_beam.Map(lambda line: line.replace(', ', ','))
                | 'RemoveTrailingPeriodsTestData' >>
                apache_beam.Map(lambda line: line[:-1])
                | 'DecodeTestData' >> MapAndFilterErrors(csv_coder.decode))

            # The transformed data schema is the same as before, so the
            # metadata returned here is discarded.
            transformed_test, _ = (
                ((raw_test, RAW_DATA_METADATA), transform_fn)
                | tft_beam.TransformDataset())

            _ = (
                transformed_test
                | 'EncodeTestData' >> apache_beam.Map(example_coder.encode)
                | 'WriteTestData' >> apache_beam.io.WriteToTFRecord(
                    os.path.join(working_dir,
                                 TRANSFORMED_TEST_DATA_FILEBASE)))

            # Will write a SavedModel and metadata to working_dir, which can
            # then be read by the tft.TFTransformOutput class.
            _ = (
                transform_fn
                | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
def build_pipeline(df: pd.DataFrame,
                   job_label: str,
                   runner: str,
                   project: str,
                   region: str,
                   output_dir: str,
                   compression: str,
                   num_shards: int,
                   dataflow_options: dict,
                   integer_label: bool) -> beam.Pipeline:
    """Builds the TFRecorder Beam Pipeline.

    The pipeline is constructed and returned; this function does not run it.

    Args:
      df: Pandas DataFrame
      job_label: User description for the beam job.
      runner: Beam Runner: (e.g. DataflowRunner, DirectRunner).
      project: GCP project ID (if DataflowRunner)
      region: GCP compute region (if DataflowRunner)
      output_dir: GCS or Local Path for output.
      compression: gzip or None.
      num_shards: Number of shards.
      dataflow_options: Dataflow Runner Options (optional)
      integer_label: Flags if label is already an integer.

    Returns:
      beam.Pipeline

    Note: These inputs must be validated upstream (by
    client.create_tfrecord())
    """
    job_name = _get_job_name(job_label)
    job_dir = _get_job_dir(output_dir, job_name)
    pipeline_options = _get_pipeline_options(
        runner, job_name, job_dir, project, region, dataflow_options)

    # The pipeline is not used as a context manager here because it is
    # returned to the caller instead of being run in place.
    p = beam.Pipeline(options=pipeline_options)

    with tft_beam.Context(temp_dir=os.path.join(job_dir, 'tft_tmp')):
        csv_coder = tft.coders.CsvCoder(constants.IMAGE_CSV_COLUMNS,
                                        constants.IMAGE_CSV_METADATA.schema)
        extract_images_fn = beam_image.ExtractImagesDoFn(
            constants.IMAGE_URI_KEY)
        to_csv_rows = ToCSVRows()

        # Each element in the image_csv_data PCollection will be a dict
        # including the image_csv_columns and the image features created
        # from extract_images_fn.
        image_csv_data = (
            p
            | 'ReadFromDataFrame' >> beam.Create(df.values.tolist())
            | 'ToCSVRows' >> beam.ParDo(to_csv_rows)
            | 'DecodeCSV' >> beam.Map(csv_coder.decode)
            | 'ReadImage' >> beam.ParDo(extract_images_fn))

        # Split the dataset into train, validation, test and discarded
        # partitions.
        partitions = (
            image_csv_data
            | 'SplitDataset' >> beam.Partition(
                _partition_fn, len(constants.SPLIT_VALUES)))
        train_data, val_data, test_data, discard_data = partitions

        raw_metadata = constants.RAW_METADATA

        # TensorFlow Transform applied to all datasets.
        preprocessing_fn = functools.partial(
            _preprocessing_fn, integer_label=integer_label)

        # The transform function is analyzed on the training split only.
        (transformed_train, transformed_metadata), transform_fn = (
            (train_data, raw_metadata)
            | 'AnalyzeAndTransformTrain' >>
            tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

        example_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)

        transformed_val, _ = (
            ((val_data, raw_metadata), transform_fn)
            | 'TransformVal' >> tft_beam.TransformDataset())

        transformed_test, _ = (
            ((test_data, raw_metadata), transform_fn)
            | 'TransformTest' >> tft_beam.TransformDataset())

        # Sinks for TFRecords and metadata.
        tfr_writer = functools.partial(
            _get_write_to_tfrecord,
            output_dir=job_dir,
            compress=compression,
            num_shards=num_shards)

        _ = (
            transformed_train
            | 'EncodeTrainData' >> beam.Map(example_coder.encode)
            | 'WriteTrainData' >> tfr_writer(prefix='train'))

        _ = (
            transformed_val
            | 'EncodeValData' >> beam.Map(example_coder.encode)
            | 'WriteValData' >> tfr_writer(prefix='val'))

        _ = (
            transformed_test
            | 'EncodeTestData' >> beam.Map(example_coder.encode)
            | 'WriteTestData' >> tfr_writer(prefix='test'))

        _ = (
            discard_data
            | 'DiscardDataWriter' >> beam.io.WriteToText(
                os.path.join(job_dir, 'discarded-data')))

        # Output transform function and metadata
        _ = (
            transform_fn
            | 'WriteTransformFn' >> tft_beam.WriteTransformFn(job_dir))

        # Output metadata schema
        _ = (
            transformed_metadata
            | 'WriteMetadata' >> tft_beam.WriteMetadata(job_dir, pipeline=p))

    return p
def transform_data(train_data_file, test_data_file, working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

    Read in the data using the parquet io, and transform it using a
    preprocessing pipeline that bucketizes numeric data and converts
    categorical data from strings to int64 indices, by creating a vocabulary
    for each category. The transform function is analyzed on the training
    data and then re-applied to the test data.

    Args:
        train_data_file: File containing training data.
        test_data_file: File containing test data.
        working_dir: Directory to write transformed data and metadata to.
    """
    numerical_feats = [
        "startCountTotal", "purchaseCountTotal", "globalStartCountTotal",
        "globalPurchaseCountTotal"
    ]
    categorical_feats = ["country", "sourceGameId", "platform"]

    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        # Work on a shallow copy so the derived (crossed) features below do
        # not leak into the dict the caller handed us.
        features = dict(inputs)
        outputs = {}

        # Bucketize into 20 buckets, then rescale the bucket index.
        for key in numerical_feats:
            outputs[key] = tf.cast(
                tft.bucketize(features[key], 20), tf.float32) / 20.0 - 0.5

        outputs["campaignCost_mod"] = features["campaignCost"] / 100.0

        # Crossed string features: "<sourceGameId>_<zone>" and
        # "<sourceGameId>_<campaignId>".
        features["game_zone"] = tf.string_join(
            [features["sourceGameId"], features["zone"]], separator="_")
        features["game_campaignId"] = tf.string_join(
            [features["sourceGameId"], features["campaignId"]],
            separator="_")

        # Values below the frequency threshold are left out of the
        # vocabulary and mapped to the default index 0.
        for key in categorical_feats + ["game_zone", "game_campaignId"]:
            vocab = tft.vocabulary(
                features[key], vocab_filename=key, frequency_threshold=100)
            outputs[key] = tft.apply_vocabulary(
                features[key], vocab, default_value=0)

        # Label and key are passed through unchanged.
        outputs["label"] = features["label"]
        outputs["key"] = features["key"]

        return outputs

    # Input schema definition
    RAW_DATA_METADATA = gather_raw_metadata(
        numerical_feats + ["campaignCost"],
        categorical_feats + ["zone", "campaignId", "key"])

    # Pipeline args. The GCS staging/temp locations stay commented out
    # because the data is currently read from a local file.
    pipeline_args = [
        '--runner=DirectRunner',
        '--project=unity-ads-ds-prd',
        # '--staging_location=gs://unity-ads-ds-prd-users/villew/promo/staging',
        # '--temp_location=gs://unity-ads-ds-prd-users/villew/promo/temp',
        '--job_name=transform-promo-data-to-tf-records'
    ]
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # create a beam pipeline
    with beam.Pipeline(options=pipeline_options) as pipeline:
        # NOTE(review): the directory created by mkdtemp() is never removed
        # here; confirm whether cleanup is handled elsewhere.
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            raw_data = (
                pipeline
                | 'ReadTrainData' >> beam.io.ReadFromParquet(train_data_file))

            # Combine data and schema into a dataset tuple.
            raw_dataset = (raw_data, RAW_DATA_METADATA)
            transformed_dataset, transform_fn = (
                raw_dataset
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            # write to tf record
            _ = (
                transformed_data
                | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
                | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, "train_tfrecord")))

            # Now apply transform function to test data.
            raw_test_data = (
                pipeline
                | 'ReadTestData' >> beam.io.ReadFromParquet(test_data_file))

            raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)

            transformed_test_dataset = (
                (raw_test_dataset, transform_fn)
                | tft_beam.TransformDataset())

            # Don't need transformed data schema, it's the same as before.
            transformed_test_data, _ = transformed_test_dataset

            _ = (
                transformed_test_data
                | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
                | 'WriteTestData' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, "test_tfrecord")))

            # Will write a SavedModel and metadata to working_dir, which can
            # then be read by the tft.TFTransformOutput class.
            _ = (
                transform_fn
                | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None):
    """The main tf.transform method which analyzes and transforms data.

    Args:
        input_handle: BigQuery table name to process specified as
            DATASET.TABLE or path to csv file with input data.
        outfile_prefix: Filename prefix for emitted transformed examples.
        working_dir: Directory in which transformed examples and transform
            function will be emitted.
        schema_file: A file path that contains a text-serialized TensorFlow
            metadata schema of the input data.
        transform_dir: Directory in which the transform output is located.
            If provided, this will load the transform_fn from disk instead
            of computing it over the data. Hint: this is useful for
            transforming eval data.
        max_rows: Number of rows to query from BigQuery.
        pipeline_args: Additional DataflowRunner or DirectRunner args passed
            to the beam pipeline.
    """

    def transform_ngrams(text, ngram_range):
        """Splits `text` on single spaces and returns its n-grams."""
        # NOTE: wrapping the input in tf.Print here was observed to make the
        # output concatenate with itself. The result is a SparseTensor (made
        # up of three tensors), so it cannot be printed meaningfully either.
        tokens = tf.string_split(text, delimiter=" ")
        return transform.ngrams(
            tokens, ngram_range=ngram_range, separator=' ')

    def preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

        https://cloud.google.com/solutions/machine-learning/data-preprocessing-for-ml-with-tf-transform-pt2

        Args:
            inputs: map from feature keys to raw not-yet-transformed
                features.

        Returns:
            Map from string feature key to transformed feature operations.
        """
        outputs = {}

        for dense_key in taxi.DENSE_FLOAT_FEATURE_KEYS:
            print('processing key', dense_key)
            print('input:', inputs[dense_key])
            # Preserve this feature as a dense float, setting nan's to the
            # mean.
            z_scored = transform.scale_to_z_score(
                _fill_in_missing(inputs[dense_key]))
            outputs[taxi.transformed_name(dense_key)] = z_scored

        for vocab_key in taxi.VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            vocab_ids = transform.compute_and_apply_vocabulary(
                _fill_in_missing(inputs[vocab_key]),
                top_k=taxi.VOCAB_SIZE,
                num_oov_buckets=taxi.OOV_SIZE)
            outputs[taxi.transformed_name(vocab_key)] = vocab_ids

        for ngram_key in taxi.FEATURE_NGRAM:
            # Extract ngrams and build a vocab over them.
            ngrams = transform_ngrams(
                _fill_in_missing(inputs[ngram_key]), taxi.NGRAM_RANGE)
            ngram_ids = transform.compute_and_apply_vocabulary(
                ngrams,
                top_k=taxi.VOCAB_SIZE,
                num_oov_buckets=taxi.OOV_SIZE)
            outputs[taxi.transformed_name(ngram_key)] = ngram_ids

        for bucket_key in taxi.BUCKET_FEATURE_KEYS:
            bucketized = transform.bucketize(
                _fill_in_missing(inputs[bucket_key]),
                taxi.FEATURE_BUCKET_COUNT)
            outputs[taxi.transformed_name(bucket_key)] = bucketized

        for categorical_key in taxi.CATEGORICAL_FEATURE_KEYS:
            filled = _fill_in_missing(inputs[categorical_key])
            outputs[taxi.transformed_name(categorical_key)] = filled

        # Was this passenger a big tipper?
        taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
        tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
        fare_is_nan = tf.is_nan(taxi_fare)
        zero_label = tf.cast(tf.zeros_like(taxi_fare), tf.int64)
        # Test if the tip was > 20% of the fare.
        tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
        big_tipper = tf.cast(tf.greater(tips, tip_threshold), tf.int64)
        label = tf.where(fare_is_nan, zero_label, big_tipper)
        outputs[taxi.transformed_name(taxi.LABEL_KEY)] = label

        return outputs

    schema = taxi.read_schema(schema_file)
    raw_feature_spec = taxi.get_raw_feature_spec(schema)
    raw_data_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(raw_feature_spec))

    with beam.Pipeline(argv=pipeline_args) as pipeline, \
            tft_beam.Context(temp_dir=working_dir):
        reads_csv = input_handle.lower().endswith('csv')
        if not reads_csv:
            query = taxi.make_sql(input_handle, max_rows, for_eval=False)
            bigquery_source = beam.io.BigQuerySource(
                query=query, use_standard_sql=True)
            raw_data = (
                pipeline | 'ReadBigQuery' >> beam.io.Read(bigquery_source))
            decode_transform = beam.Map(
                taxi.clean_raw_data_dict, raw_feature_spec=raw_feature_spec)
        else:
            csv_coder = taxi.make_csv_coder(schema, input_handle.lower())
            csv_source = beam.io.ReadFromText(
                input_handle, skip_header_lines=1)
            raw_data = pipeline | 'ReadFromText' >> csv_source
            decode_transform = beam.Map(csv_coder.decode)

        if transform_dir is not None:
            # Reuse a transform function that was computed earlier.
            transform_fn = pipeline | tft_beam.ReadTransformFn(transform_dir)
        else:
            # Compute the transform function over the data and persist it.
            decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
            analyze = 'Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)
            transform_fn = (decoded_data, raw_data_metadata) | analyze
            write_fn = (
                'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
            _ = transform_fn | write_fn

        # Shuffling the data before materialization will improve Training
        # effectiveness downstream. Here we shuffle the raw_data (as opposed
        # to decoded data) since it has a compact representation.
        shuffled_data = (
            raw_data | 'RandomizeData' >> beam.transforms.Reshuffle())
        decoded_data = (
            shuffled_data | 'DecodeForTransform' >> decode_transform)

        transformed_dataset = (
            ((decoded_data, raw_data_metadata), transform_fn)
            | 'Transform' >> tft_beam.TransformDataset())
        transformed_data, transformed_metadata = transformed_dataset

        coder = example_proto_coder.ExampleProtoCoder(
            transformed_metadata.schema)
        examples_path = os.path.join(working_dir, outfile_prefix)
        _ = (
            transformed_data
            | 'SerializeExamples' >> beam.Map(coder.encode)
            | 'WriteExamples' >> beam.io.WriteToTFRecord(
                examples_path, file_name_suffix='.gz'))
def compute_gci_vocab(input_handle,
                      working_dir,
                      schema_file,
                      transform_dir=None,
                      max_rows=None,
                      pipeline_args=None):
    """Analyzes CSV data to build the 'gci' and 'label' vocabularies.

    Only the analysis phase runs: the input is read and decoded, the
    preprocessing function is analyzed over it, and the resulting transform
    function (with its vocabulary files) is written to `working_dir`. No
    transformed examples are emitted.

    Args:
        input_handle: Path to the csv file with input data; its first line
            is skipped as a header.
        working_dir: Directory in which the transform function is written;
            also used as the tf.Transform temp directory.
        schema_file: A file path passed to `read_schema` to load the schema
            of the input data.
        transform_dir: Accepted for signature compatibility; not used by
            this function.
        max_rows: Accepted for signature compatibility; not used by this
            function.
        pipeline_args: Additional DataflowRunner or DirectRunner args passed
            to the beam pipeline.
    """

    def preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

        Args:
            inputs: map from feature keys to raw not-yet-transformed
                features.

        Returns:
            Map from string feature key to transformed feature operations.
            Only the 'gci' key is populated.
        """
        outputs = {}
        column_names, _, column_types, unused_columns = (
            setcolumn_list_original())

        # String-typed columns are the vocabulary candidates.
        vocab_feature_keys = [
            name for name, column_type in zip(column_names, column_types)
            if column_type is tf.string
        ]

        outputs['gci'] = tf.expand_dims(_fill_in_missing(inputs['gci']), 1)
        for key in vocab_feature_keys:
            if key in unused_columns:
                continue
            if 'gci' in key:
                # Stack every gci-like column along axis 0 so that a single
                # vocabulary covers all of them.
                # NOTE(review): if 'gci' itself is a string column outside
                # the unused list it is concatenated a second time here —
                # confirm that is intended.
                appendlist = tf.expand_dims(_fill_in_missing(inputs[key]), 1)
                outputs['gci'] = tf.concat([appendlist, outputs['gci']], 0)

        # The vocabularies are produced as side outputs of the analysis.
        transform.vocabulary(outputs['gci'], vocab_filename='gci')
        transform.vocabulary(inputs['LAT_LON_10'], vocab_filename='label')

        return outputs

    schema = read_schema(schema_file)
    raw_feature_spec = get_raw_feature_spec(schema)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

    with beam.Pipeline(argv=pipeline_args) as pipeline:
        with tft_beam.Context(temp_dir=working_dir):
            csv_coder = make_csv_coder(schema)
            raw_data = (
                pipeline
                | 'ReadFromText' >> beam.io.ReadFromText(
                    input_handle, skip_header_lines=1))
            decode_transform = beam.Map(csv_coder.decode)

            decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
            transform_fn = (
                (decoded_data, raw_data_metadata)
                | ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

            _ = (
                transform_fn
                | ('WriteTransformFn'
                   >> tft_beam.WriteTransformFn(working_dir)))
def test_single_phase_run_twice(self):
    """Runs the same analysis twice: first without, then with, a cache.

    The first pipeline analyzes both spans with an empty input cache and
    writes the produced cache to `self._cache_dir`. The second pipeline
    reads that cache back and repeats the analysis. Both runs must yield the
    same transformed data and vocabulary; the counter assertions pin how
    many instances were processed and how many cache entries were encoded
    or decoded in each run.
    """
    span_0_key = 'span-0'
    span_1_key = 'span-1'

    def preprocessing_fn(inputs):

        # These two analyzers do not feed any output; they are part of the
        # analysis graph only ('vocab1' is read back further below).
        _ = tft.vocabulary(inputs['s'], vocab_filename='vocab1')

        _ = tft.bucketize(inputs['x'], 2, name='bucketize')

        return {
            'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
            'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
            'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            's_integerized':
                tft.compute_and_apply_vocabulary(
                    inputs['s'],
                    labels=inputs['label'],
                    use_adjusted_mutual_info=True),
        }

    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            'x': tf.io.FixedLenFeature([], tf.float32),
            'y': tf.io.FixedLenFeature([], tf.float32),
            's': tf.io.FixedLenFeature([], tf.string),
            'label': tf.io.FixedLenFeature([], tf.int64),
        }))
    # Raw rows keyed by span: four rows in span-0, two rows in span-1.
    input_data_dict = {
        span_0_key: [{
            'x': -2,
            'y': 1,
            's': 'a',
            'label': 0,
        }, {
            'x': 4,
            'y': -4,
            's': 'a',
            'label': 1,
        }, {
            'x': 5,
            'y': 11,
            's': 'a',
            'label': 1,
        }, {
            'x': 1,
            'y': -4,
            's': u'ȟᎥ𝒋ǩľḿꞑȯ𝘱𝑞𝗋𝘴'.encode('utf-8'),
            'label': 1,
        }],
        span_1_key: [{
            'x': 12,
            'y': 1,
            's': u'ȟᎥ𝒋ǩľḿꞑȯ𝘱𝑞𝗋𝘴'.encode('utf-8'),
            'label': 0
        }, {
            'x': 10,
            'y': 1,
            's': 'c',
            'label': 1
        }],
    }
    expected_vocabulary_contents = np.array(
        [b'a', u'ȟᎥ𝒋ǩľḿꞑȯ𝘱𝑞𝗋𝘴'.encode('utf-8'), b'c'],
        dtype=object)
    # First run: no input cache ({}); the analysis output cache is written
    # to self._cache_dir.
    with _TestPipeline() as p:
        flat_data = p | 'CreateInputData' >> beam.Create(
            list(itertools.chain(*input_data_dict.values())))

        # wrap each value in input_data_dict as a pcoll.
        input_data_pcoll_dict = {}
        for a, b in six.iteritems(input_data_dict):
            input_data_pcoll_dict[a] = p | a >> beam.Create(b)

        transform_fn_1, cache_output = (
            (flat_data, input_data_pcoll_dict, {}, input_metadata)
            | 'Analyze' >>
            (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
        _ = (
            cache_output
            | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
                self._cache_dir))

        # Only span-1 is transformed.
        transformed_dataset = ((
            (input_data_pcoll_dict[span_1_key], input_metadata),
            transform_fn_1)
                               | 'Transform' >> beam_impl.TransformDataset())

        del input_data_pcoll_dict
        transformed_data, unused_transformed_metadata = transformed_dataset

        # One expected row per span-1 input row.
        expected_transformed_data = [
            {
                'x_mean': 5.0,
                'x_min': -2.0,
                'y_mean': 1.0,
                'y_min': -4.0,
                's_integerized': 0,
            },
            {
                'x_mean': 5.0,
                'x_min': -2.0,
                'y_mean': 1.0,
                'y_min': -4.0,
                's_integerized': 2,
            },
        ]
        beam_test_util.assert_that(
            transformed_data,
            beam_test_util.equal_to(expected_transformed_data),
            label='first')

        transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn_1')
        _ = transform_fn_1 | tft_beam.WriteTransformFn(transform_fn_dir)

        # Every span must have produced 7 cache entries.
        for key in input_data_dict:
            self.assertIn(key, cache_output)
            self.assertEqual(7, len(cache_output[key]))

    # The checks below read what the first pipeline wrote and its metrics.
    tf_transform_output = tft.TFTransformOutput(transform_fn_dir)
    vocab1_path = tf_transform_output.vocabulary_file_by_name('vocab1')
    self.AssertVocabularyContents(vocab1_path, expected_vocabulary_contents)

    # 4 from analyzing 2 spans, and 2 from transform.
    self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 8)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 0)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 14)
    self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 2)

    # Second run: same inputs, with the cache written by the first run
    # supplied as input cache.
    with _TestPipeline() as p:
        flat_data = p | 'CreateInputData' >> beam.Create(
            list(itertools.chain(*input_data_dict.values())))

        # wrap each value in input_data_dict as a pcoll.
        input_data_pcoll_dict = {}
        for a, b in six.iteritems(input_data_dict):
            input_data_pcoll_dict[a] = p | a >> beam.Create(b)

        input_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
            self._cache_dir, list(input_data_dict.keys()))

        transform_fn_2, second_output_cache = (
            (flat_data, input_data_pcoll_dict, input_cache, input_metadata)
            | 'AnalyzeAgain' >>
            (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

        # Render the analysis graph as a dot file.
        dot_string = nodes.get_dot_graph([analysis_graph_builder._ANALYSIS_GRAPH
                                         ]).to_string()
        self.WriteRenderedDotFile(dot_string)

        # NOTE(review): unlike the first run, this passes the raw Python
        # list from input_data_dict rather than the PCollection stored in
        # input_data_pcoll_dict — confirm this is intentional.
        transformed_dataset = ((
            (input_data_dict[span_1_key], input_metadata), transform_fn_2)
                               | 'TransformAgain' >> beam_impl.TransformDataset())
        transformed_data, unused_transformed_metadata = transformed_dataset
        beam_test_util.assert_that(
            transformed_data,
            beam_test_util.equal_to(expected_transformed_data),
            label='second')

        transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn_2')
        _ = transform_fn_2 | tft_beam.WriteTransformFn(transform_fn_dir)

    tf_transform_output = tft.TFTransformOutput(transform_fn_dir)
    vocab1_path = tf_transform_output.vocabulary_file_by_name('vocab1')
    self.AssertVocabularyContents(vocab1_path, expected_vocabulary_contents)

    # Nothing new may have been emitted as cache by the second run.
    self.assertFalse(second_output_cache)

    # Only 2 from transform.
    self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 2)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 14)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 0)

    # The root CreateSavedModel is optimized away because the data doesn't get
    # processed at all (only cache).
    self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 1)
def test_non_frequency_vocabulary_merge(self):
    """This test compares vocabularies produced with and without cache.

    Three vocabularies are computed over the same column: one with labels
    and `use_adjusted_mutual_info=False`, one with labels and
    `use_adjusted_mutual_info=True`, and one with weights. They are computed
    once per span through `AnalyzeDatasetWithCache` and once over the
    flattened data through `AnalyzeDataset`. The vocabulary files written by
    both runs must be identical line for line.
    """

    mi_vocab_name = 'mutual_information_vocab'
    adjusted_mi_vocab_name = 'adjusted_mutual_information_vocab'
    weighted_frequency_vocab_name = 'weighted_frequency_vocab'

    def preprocessing_fn(inputs):
        # The analyzers are only needed for their vocabulary files; the
        # inputs are returned untouched.
        _ = tft.vocabulary(
            inputs['s'],
            labels=inputs['label'],
            store_frequency=True,
            vocab_filename=mi_vocab_name,
            min_diff_from_avg=0.1,
            use_adjusted_mutual_info=False)

        _ = tft.vocabulary(
            inputs['s'],
            labels=inputs['label'],
            store_frequency=True,
            vocab_filename=adjusted_mi_vocab_name,
            min_diff_from_avg=1.0,
            use_adjusted_mutual_info=True)

        _ = tft.vocabulary(
            inputs['s'],
            weights=inputs['weight'],
            store_frequency=True,
            vocab_filename=weighted_frequency_vocab_name,
            use_adjusted_mutual_info=False)
        return inputs

    span_0_key = 'span-0'
    span_1_key = 'span-1'

    input_data = [
        dict(s='a', weight=1, label=1),
        dict(s='a', weight=0.5, label=1),
        dict(s='b', weight=0.75, label=1),
        dict(s='b', weight=1, label=0),
    ]
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            's': tf.io.FixedLenFeature([], tf.string),
            'label': tf.io.FixedLenFeature([], tf.int64),
            'weight': tf.io.FixedLenFeature([], tf.float32),
        }))
    # Both spans hold the same four rows.
    input_data_dict = {
        span_0_key: input_data,
        span_1_key: input_data,
    }

    # Run 1: per-span analysis through the cache-aware analyzer, starting
    # from an empty input cache.
    with _TestPipeline() as p:
        flat_data = p | 'CreateInputData' >> beam.Create(
            list(itertools.chain(*input_data_dict.values())))

        # wrap each value in input_data_dict as a pcoll.
        input_data_pcoll_dict = {}
        for a, b in six.iteritems(input_data_dict):
            input_data_pcoll_dict[a] = p | a >> beam.Create(b)

        transform_fn_with_cache, output_cache = (
            (flat_data, input_data_pcoll_dict, {}, input_metadata) |
            (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
        transform_fn_with_cache_dir = os.path.join(self.base_test_dir,
                                                   'transform_fn_with_cache')
        _ = transform_fn_with_cache | tft_beam.WriteTransformFn(
            transform_fn_with_cache_dir)

        # Expected cache entries per span, keyed by cache key.
        # NOTE(review): the byte suffix of each key looks like a digest of
        # the analyzer definition — confirm before changing
        # preprocessing_fn.
        expected_accumulators = {
            b'__v0__VocabularyAccumulate[vocabulary]-\xd3\xe0p\x82\xb1\xa0z\xa3S\xd7N8@\x8f\xa2\xd7\xa1\x9e\xac;':
                [
                    b'["a", [2, [0.0, 1.0], [0.0, 0.0], 1.0]]',
                    b'["b", [2, [0.5, 0.5], [0.0, 0.0], 1.0]]'
                ],
            b'__v0__VocabularyAccumulate[vocabulary_1]-A\xc7_0\xee\xff\x88@E<\xde\xcb\x8d\xff5\xebyZZ\x8d':
                [
                    b'["a", [2, [0.0, 1.0], [0.0, 0.0], 1.0]]',
                    b'["b", [2, [0.5, 0.5], [0.0, 0.0], 1.0]]'
                ],
            b"__v0__VocabularyAccumulate[vocabulary_2]-\x97\x1c>\x851\x94'\xdc\xdf\xfd\xcc\x86\xb7\xb8\xe1\xe8*\x89B\t":
                [b'["a", 1.5]', b'["b", 1.75]'],
        }
        spans = [span_0_key, span_1_key]
        self.assertCountEqual(output_cache.keys(), spans)
        for span in spans:
            self.assertCountEqual(output_cache[span].keys(),
                                  expected_accumulators.keys())
            # Each assert_that needs a unique label within the pipeline,
            # hence the span/index pair.
            for idx, (key,
                      value) in enumerate(six.iteritems(expected_accumulators)):
                beam_test_util.assert_that(
                    output_cache[span][key],
                    beam_test_util.equal_to(value),
                    label='AssertCache[{}][{}]'.format(span, idx))

    # 4 from analysis on each of the input spans.
    self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 8)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 0)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 6)
    self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 2)

    # Run 2: the same eight rows analyzed in one go, without any cache.
    with _TestPipeline() as p:
        flat_data = p | 'CreateInputData' >> beam.Create(input_data * 2)

        transform_fn_no_cache = ((flat_data, input_metadata) |
                                 (beam_impl.AnalyzeDataset(preprocessing_fn)))

        transform_fn_no_cache_dir = os.path.join(self.base_test_dir,
                                                 'transform_fn_no_cache')
        _ = transform_fn_no_cache | tft_beam.WriteTransformFn(
            transform_fn_no_cache_dir)

    # 4 from analysis on each of the input spans.
    self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 8)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 0)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 0)
    self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 2)

    tft_output_cache = tft.TFTransformOutput(transform_fn_with_cache_dir)
    tft_output_no_cache = tft.TFTransformOutput(transform_fn_no_cache_dir)

    # Compare the vocabulary files of both runs line by line.
    for vocab_filename in (mi_vocab_name, adjusted_mi_vocab_name,
                           weighted_frequency_vocab_name):
        cache_path = tft_output_cache.vocabulary_file_by_name(vocab_filename)
        no_cache_path = tft_output_no_cache.vocabulary_file_by_name(
            vocab_filename)
        with tf.io.gfile.GFile(cache_path, 'rb') as f1, tf.io.gfile.GFile(
            no_cache_path, 'rb') as f2:
            self.assertEqual(
                f1.readlines(), f2.readlines(),
                'vocab with cache != vocab without cache for: {}'.format(
                    vocab_filename))
'add': tf.FixedLenFeature([], tf.boolean), 'line_length': tf.FixedLenFeature([], tf.int32]))) input_data_metadata = dataset_metadata.DatasetMetadata(input_data_schema) with beam.Pipeline() as pipeline: with tft_beam.Context(temp_dir=tempfile.mkdtemp()): columns = text_fields + ["commented", "add", "line_length"] converter = tft.coders.CsvCoder(columns, input_data_schema) input_data = ( pipeline | 'ReadInputData' >> beam.io.ReadFromText(train_data_file) | 'CleanInputData' >> MapAndFilterErrors(converter.decode)) input_dataset = (input_data, input_data_metadata) transformed_dataset, transform_fn = ( input_dataset | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)) transformed_data, transfored_metadata = transformed_dataset transformed_data_coder = tft.coders.ExampleProtoCoder( transformed_metadata.schema) # Write the resulting data out _ = ( transformed_data | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode) | 'WriteTrainData' >> beam.io.WriteToTFRecord( os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE))) # We'll use the transform function later too _ = ( transform_fn | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
def _RunBeamImpl(self, inputs: Mapping[Text, Any],
                 outputs: Mapping[Text, Any], preprocessing_fn: Any,
                 input_dataset_metadata: dataset_metadata.DatasetMetadata,
                 raw_examples_data_format: Text, transform_output_path: Text,
                 compute_statistics: bool,
                 materialize_output_paths: Sequence[Text]) -> _Status:
    """Perform data preprocessing inside the pipeline from `_CreatePipeline`.

    The method analyzes the "analyze and transform" datasets (using the
    analysis cache when one is available), writes the raw metadata and the
    transform function to `transform_output_path`, and then optionally
    computes pre-/post-transform statistics and materializes the
    transformed examples.

    NOTE(review): the original summary said "with FlumeC++ runner"; the
    runner actually used is whatever `self._CreatePipeline` configures —
    confirm.

    Args:
      inputs: A dictionary of labelled input values.
      outputs: A dictionary of labelled output values.
      preprocessing_fn: The tf.Transform preprocessing_fn.
      input_dataset_metadata: A DatasetMetadata object for the input data.
      raw_examples_data_format: A string describing the raw data format.
      transform_output_path: An absolute path to write the output to.
      compute_statistics: A bool indicating whether or not compute statistics.
      materialize_output_paths: Paths to materialized outputs.

    Raises:
      RuntimeError: If reset() is not being invoked between two run().
      ValueError: If the schema is empty.
      NOTE(review): neither exception is raised directly in this body;
      presumably they originate in the helpers called here — confirm.

    Returns:
      Status of the execution.
    """
    # Pull the labelled values this method needs out of the two mappings.
    raw_examples_file_format = common.GetSoleValue(
        inputs, labels.EXAMPLES_FILE_FORMAT_LABEL, strict=False)
    analyze_and_transform_data_paths = common.GetValues(
        inputs, labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL)
    transform_only_data_paths = common.GetValues(
        inputs, labels.TRANSFORM_ONLY_DATA_PATHS_LABEL)
    stats_use_tfdv = common.GetSoleValue(inputs,
                                         labels.TFT_STATISTICS_USE_TFDV_LABEL)
    per_set_stats_output_paths = common.GetValues(
        outputs, labels.PER_SET_STATS_OUTPUT_PATHS_LABEL)
    temp_path = common.GetSoleValue(outputs, labels.TEMP_OUTPUT_LABEL)

    input_cache_dir = common.GetSoleValue(
        inputs, labels.CACHE_INPUT_PATH_LABEL, strict=False)
    output_cache_dir = common.GetSoleValue(
        outputs, labels.CACHE_OUTPUT_PATH_LABEL, strict=False)

    tf.logging.info('Analyze and transform data patterns: %s',
                    list(enumerate(analyze_and_transform_data_paths)))
    tf.logging.info('Transform data patterns: %s',
                    list(enumerate(transform_only_data_paths)))
    tf.logging.info('Transform materialization output paths: %s',
                    list(enumerate(materialize_output_paths)))
    tf.logging.info('Transform output path: %s', transform_output_path)

    feature_spec = schema_utils.schema_as_feature_spec(
        _GetSchemaProto(input_dataset_metadata)).feature_spec
    try:
        analyze_input_columns = tft.get_analyze_input_columns(
            preprocessing_fn, feature_spec)
        transform_input_columns = (
            tft.get_transform_input_columns(preprocessing_fn, feature_spec))
    except AttributeError:
        # If using TFT 1.12, fall back to assuming all features are used.
        analyze_input_columns = feature_spec.keys()
        transform_input_columns = feature_spec.keys()
    # Use the same dataset (same columns) for AnalyzeDataset and computing
    # pre-transform stats so that the data will only be read once for these
    # two operations.
    if compute_statistics:
        analyze_input_columns = list(
            set(list(analyze_input_columns) + list(transform_input_columns)))
    # Identity (not equality) check against the module-level schema object:
    # in that case the metadata is used as-is, otherwise it is narrowed to
    # the columns each phase reads.
    if input_dataset_metadata.schema is _RAW_EXAMPLE_SCHEMA:
        analyze_input_dataset_metadata = input_dataset_metadata
        transform_input_dataset_metadata = input_dataset_metadata
    else:
        analyze_input_dataset_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec(
                {feature: feature_spec[feature]
                 for feature in analyze_input_columns}))
        transform_input_dataset_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec(
                {feature: feature_spec[feature]
                 for feature in transform_input_columns}))

    # True only when no per-set stats, materialization or output cache was
    # requested.
    can_process_jointly = not bool(per_set_stats_output_paths or
                                   materialize_output_paths or output_cache_dir)
    analyze_data_list = self._MakeDatasetList(
        analyze_and_transform_data_paths, raw_examples_file_format,
        raw_examples_data_format, analyze_input_dataset_metadata,
        can_process_jointly)
    transform_data_list = self._MakeDatasetList(
        list(analyze_and_transform_data_paths) +
        list(transform_only_data_paths), raw_examples_file_format,
        raw_examples_data_format, transform_input_dataset_metadata,
        can_process_jointly)

    desired_batch_size = self._GetDesiredBatchSize(raw_examples_data_format)

    with self._CreatePipeline(outputs) as p:
        with tft_beam.Context(
            temp_dir=temp_path,
            desired_batch_size=desired_batch_size,
            passthrough_keys={_TRANSFORM_INTERNAL_FEATURE_FOR_KEY},
            use_deep_copy_optimization=True):
            # pylint: disable=expression-not-assigned
            # pylint: disable=no-value-for-parameter
            _ = (
                p | self._IncrementColumnUsageCounter(
                    len(feature_spec.keys()), len(analyze_input_columns),
                    len(transform_input_columns)))

            (new_analyze_data_dict, input_cache, flat_data_required) = (
                p | self._OptimizeRun(input_cache_dir, output_cache_dir,
                                      analyze_data_list, feature_spec,
                                      preprocessing_fn, self._GetCacheSource()))

            # Removing unneeded datasets if they won't be needed for
            # materialization. This means that these datasets won't be included in
            # the statistics computation or profiling either.
            if not materialize_output_paths:
                analyze_data_list = [
                    d for d in new_analyze_data_dict.values() if d is not None
                ]

            analyze_decode_fn = (
                self._GetDecodeFunction(raw_examples_data_format,
                                        analyze_input_dataset_metadata.schema))

            # Read and decode each analysis dataset; the resulting
            # PCollections are stored on the dataset object itself.
            for (idx, dataset) in enumerate(analyze_data_list):
                dataset.encoded = (
                    p | 'ReadAnalysisDataset[{}]'.format(idx) >>
                    self._ReadExamples(dataset))
                dataset.decoded = (
                    dataset.encoded
                    | 'DecodeAnalysisDataset[{}]'.format(idx) >>
                    self._DecodeInputs(analyze_decode_fn))

            # Map each key to its decoded data; keys whose dataset is None
            # are kept with a None value.
            input_analysis_data = {}
            for key, dataset in six.iteritems(new_analyze_data_dict):
                if dataset is None:
                    input_analysis_data[key] = None
                else:
                    input_analysis_data[key] = dataset.decoded

            if flat_data_required:
                flat_input_analysis_data = (
                    [dataset.decoded for dataset in analyze_data_list]
                    | 'FlattenAnalysisDatasets' >> beam.Flatten(pipeline=p))
            else:
                flat_input_analysis_data = None

            if input_cache:
                tf.logging.info('Analyzing data with cache.')

            # The analysis itself is unconditional; only the log line above
            # depends on whether an input cache exists.
            transform_fn, cache_output = (
                (flat_input_analysis_data, input_analysis_data, input_cache,
                 input_dataset_metadata)
                | 'AnalyzeDataset' >> tft_beam.AnalyzeDatasetWithCache(
                    preprocessing_fn, pipeline=p))

            # Write the raw/input metadata.
            (input_dataset_metadata
             | 'WriteMetadata' >> tft_beam.WriteMetadata(
                 os.path.join(transform_output_path,
                              tft.TFTransformOutput.RAW_METADATA_DIR), p))

            # WriteTransformFn writes transform_fn and metadata to subdirectories
            # tensorflow_transform.SAVED_MODEL_DIR and
            # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
            (transform_fn
             | 'WriteTransformFn' >>
             tft_beam.WriteTransformFn(transform_output_path))

            if output_cache_dir is not None and cache_output is not None:
                # TODO(b/37788560): Possibly make this part of the beam graph.
                tf.io.gfile.makedirs(output_cache_dir)
                # NOTE(review): this message is logged even when
                # input_cache_dir is None.
                tf.logging.info('Using existing cache in: %s', input_cache_dir)
                if input_cache_dir is not None:
                    # Only copy cache that is relevant to this iteration. This is
                    # assuming that this pipeline operates on rolling ranges, so those
                    # cache entries may also be relevant for future iterations.
                    for span_cache_dir in input_analysis_data:
                        full_span_cache_dir = os.path.join(input_cache_dir,
                                                           span_cache_dir)
                        if tf.io.gfile.isdir(full_span_cache_dir):
                            self._CopyCache(full_span_cache_dir,
                                            os.path.join(output_cache_dir,
                                                         span_cache_dir))

                (cache_output
                 | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
                     p, output_cache_dir, sink=self._GetCacheSink()))

            if compute_statistics or materialize_output_paths:
                # Do not compute pre-transform stats if the input format is raw proto,
                # as StatsGen would treat any input as tf.Example.
                if (compute_statistics and
                    not self._IsDataFormatProto(raw_examples_data_format)):
                    # Aggregated feature stats before transformation.
                    pre_transform_feature_stats_path = os.path.join(
                        transform_output_path,
                        tft.TFTransformOutput.PRE_TRANSFORM_FEATURE_STATS_PATH)

                    schema_proto = _GetSchemaProto(analyze_input_dataset_metadata)
                    ([
                        dataset.decoded if stats_use_tfdv else dataset.encoded
                        for dataset in analyze_data_list
                    ]
                     | 'FlattenPreTransformAnalysisDatasets' >>
                     beam.Flatten(pipeline=p)
                     | 'GenerateAggregatePreTransformAnalysisStats' >>
                     self._GenerateStats(
                         pre_transform_feature_stats_path,
                         schema_proto,
                         use_deep_copy_optimization=True,
                         use_tfdv=stats_use_tfdv))

                transform_decode_fn = (
                    self._GetDecodeFunction(raw_examples_data_format,
                                            transform_input_dataset_metadata.schema))
                # transform_data_list is a superset of analyze_data_list, we pay the
                # cost to read the same dataset (analyze_data_list) again here to
                # prevent certain beam runner from doing large temp materialization.
                for (idx, dataset) in enumerate(transform_data_list):
                    dataset.encoded = (
                        p
                        | 'ReadTransformDataset[{}]'.format(idx) >>
                        self._ReadExamples(dataset))
                    dataset.decoded = (
                        dataset.encoded
                        | 'DecodeTransformDataset[{}]'.format(idx) >>
                        self._DecodeInputs(transform_decode_fn))
                    (dataset.transformed,
                     metadata) = (((dataset.decoded,
                                    transform_input_dataset_metadata),
                                   transform_fn)
                                  | 'TransformDataset[{}]'.format(idx) >>
                                  tft_beam.TransformDataset())

                    # The encoded form is needed for materialization and for
                    # stats computed without TFDV.
                    if materialize_output_paths or not stats_use_tfdv:
                        dataset.transformed_and_encoded = (
                            dataset.transformed
                            | 'EncodeTransformedDataset[{}]'.format(idx) >>
                            beam.ParDo(self._EncodeAsExamples(), metadata))

                if compute_statistics:
                    # Aggregated feature stats after transformation.
                    _, metadata = transform_fn
                    post_transform_feature_stats_path = os.path.join(
                        transform_output_path,
                        tft.TFTransformOutput.POST_TRANSFORM_FEATURE_STATS_PATH)

                    # TODO(b/70392441): Retain tf.Metadata (e.g., IntDomain) in
                    # schema. Currently input dataset schema only contains dtypes,
                    # and other metadata is dropped due to roundtrip to tensors.
                    transformed_schema_proto = _GetSchemaProto(metadata)

                    # NOTE(review): unlike the other Flatten calls in this
                    # method, this one is not given pipeline=p — confirm.
                    ([(dataset.transformed
                       if stats_use_tfdv else dataset.transformed_and_encoded)
                      for dataset in transform_data_list]
                     | 'FlattenPostTransformAnalysisDatasets' >> beam.Flatten()
                     | 'GenerateAggregatePostTransformAnalysisStats' >>
                     self._GenerateStats(
                         post_transform_feature_stats_path,
                         transformed_schema_proto,
                         use_tfdv=stats_use_tfdv))

                    # Per-set stats reuse transformed_schema_proto, which is
                    # only defined inside this compute_statistics branch.
                    if per_set_stats_output_paths:
                        assert len(transform_data_list) == len(
                            per_set_stats_output_paths)
                        # TODO(b/67632871): Remove duplicate stats gen compute that is
                        # done both on a flattened view of the data, and on each span
                        # below.
                        bundles = zip(transform_data_list,
                                      per_set_stats_output_paths)
                        for (idx, (dataset, output_path)) in enumerate(bundles):
                            if stats_use_tfdv:
                                data = dataset.transformed
                            else:
                                data = dataset.transformed_and_encoded
                            (data
                             | 'GeneratePostTransformStats[{}]'.format(idx) >>
                             self._GenerateStats(
                                 output_path,
                                 transformed_schema_proto,
                                 use_tfdv=stats_use_tfdv))

                if materialize_output_paths:
                    assert len(transform_data_list) == len(
                        materialize_output_paths)
                    # Datasets and output paths are paired by position.
                    bundles = zip(transform_data_list, materialize_output_paths)
                    for (idx, (dataset, output_path)) in enumerate(bundles):
                        (dataset.transformed_and_encoded
                         | 'Materialize[{}]'.format(idx) >> self._WriteExamples(
                             raw_examples_file_format, output_path))

    return _Status.OK()
def transform_data(train_data_file, test_data_file, working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

    The train and test CSV files are read as raw byte lines, cleaned up with
    plain Beam steps and parsed by a CSV TFXIO. The training split is analyzed
    and transformed by a preprocessing function that scales numeric columns
    to [0, 1], replaces categorical strings with vocabulary indices and
    one-hot encodes the label. The resulting transform function is then
    applied to the test split, and both splits plus the transform function
    are written under `working_dir`.

    Args:
      train_data_file: File containing training data
      test_data_file: File containing test data
      working_dir: Directory to write transformed data and metadata to
    """

    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        # Columns that are not rewritten below pass through unchanged, so the
        # result starts out as a shallow copy of the inputs.
        transformed = inputs.copy()

        # Rescale every required numeric column into the range [0, 1].
        for name in NUMERIC_FEATURE_KEYS:
            transformed[name] = tft.scale_to_0_1(inputs[name])

        for name in OPTIONAL_NUMERIC_FEATURE_KEYS:
            # Optional columns are SparseTensors. Rebuild them with a dense
            # shape of [batch, 1] so a missing value can be filled with 0.
            column = inputs[name]
            one_wide = tf.sparse.SparseTensor(
                column.indices, column.values, [column.dense_shape[0], 1])
            filled = tf.sparse.to_dense(sp_input=one_wide, default_value=0.)
            # Collapse the batch of size-1 vectors into a batch of scalars.
            scalars = tf.squeeze(filled, axis=1)
            transformed[name] = tft.scale_to_0_1(scalars)

        # Each categorical column is whitespace-stripped and then replaced by
        # its vocabulary index, with one out-of-vocabulary bucket. The
        # vocabulary file is named after the column.
        for name in CATEGORICAL_FEATURE_KEYS:
            stripped = tf.strings.strip(inputs[name])
            transformed[name] = tft.compute_and_apply_vocabulary(
                stripped, num_oov_buckets=1, vocab_filename=name)

        # The label gets an explicit string -> index table; unknown strings
        # map to -1.
        table_keys = ['>50K', '<=50K']
        num_classes = len(table_keys)
        table_values = tf.cast(tf.range(num_classes), tf.int64)
        initializer = tf.lookup.KeyValueTensorInitializer(
            keys=table_keys,
            values=table_values,
            key_dtype=tf.string,
            value_dtype=tf.int64)
        table = tf.lookup.StaticHashTable(initializer, default_value=-1)

        # Remove periods from the label text (the test data carries trailing
        # periods when read with tf.data), then trim whitespace before the
        # lookup.
        label_text = tf.strings.regex_replace(inputs[LABEL_KEY], r'\.', '')
        label_text = tf.strings.strip(label_text)
        label_ids = table.lookup(label_text)
        one_hot_label = tf.one_hot(
            indices=label_ids,
            depth=num_classes,
            on_value=1.0,
            off_value=0.0)
        transformed[LABEL_KEY] = tf.reshape(one_hot_label, [-1, num_classes])

        return transformed

    def drop_space_after_commas(line):
        """Turn ', ' separators into ',' in a raw CSV byte line."""
        return line.replace(b', ', b',')

    # Leaving the outer "with" block is what actually runs the pipeline.
    with beam.Pipeline() as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            # The schema does not record the CSV column order, so the TFXIO
            # is given the ordered column names explicitly.
            # BeamRecordCsvTFXIO is used because its .BeamSource() accepts a
            # PCollection[bytes], which lets the records be patched first
            # (see "FixCommasTrainData" below). Without that need,
            # tfxio.CsvTFXIO could both read and parse the files:
            #   csv_tfxio = tfxio.CsvTFXIO(...)
            #   raw_data = (
            #       pipeline | 'ToRecordBatches' >> csv_tfxio.BeamSource())
            csv_tfxio = tfxio.BeamRecordCsvTFXIO(
                physical_format='text',
                column_names=ORDERED_CSV_COLUMNS,
                schema=SCHEMA)

            # The Beam steps applied before the TFXIO source are not part of
            # the TF graph, because they happen outside tf.Transform's own
            # methods (AnalyzeDataset, TransformDataset etc.). They only
            # massage the text into something the CSV TFXIO can parse.
            train_lines = (
                pipeline
                | 'ReadTrainData' >> beam.io.ReadFromText(
                    train_data_file, coder=beam.coders.BytesCoder()))
            train_lines = (
                train_lines
                | 'FixCommasTrainData' >> beam.Map(drop_space_after_commas))
            train_records = (
                train_lines | 'DecodeTrainData' >> csv_tfxio.BeamSource())

            # A dataset is the data paired with the config needed to
            # interpret it.
            train_dataset = (train_records, csv_tfxio.TensorAdapterConfig())
            (train_examples, train_metadata), transform_fn = (
                train_dataset
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            example_coder = tft.coders.ExampleProtoCoder(
                train_metadata.schema)

            train_output = os.path.join(
                working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)
            _ = (
                train_examples
                | 'EncodeTrainData' >> beam.Map(example_coder.encode)
                | 'WriteTrainData' >> beam.io.WriteToTFRecord(train_output))

            # The test file has a header line to skip, and the last
            # character of every line is dropped (a trailing period).
            test_lines = (
                pipeline
                | 'ReadTestData' >> beam.io.ReadFromText(
                    test_data_file,
                    skip_header_lines=1,
                    coder=beam.coders.BytesCoder()))
            test_lines = (
                test_lines
                | 'FixCommasTestData' >> beam.Map(drop_space_after_commas)
                | 'RemoveTrailingPeriodsTestData' >> beam.Map(
                    lambda line: line[:-1]))
            test_records = (
                test_lines | 'DecodeTestData' >> csv_tfxio.BeamSource())

            test_dataset = (test_records, csv_tfxio.TensorAdapterConfig())
            # The transformed schema equals the training one, so only the
            # data half of the result is kept.
            test_examples, _ = (
                (test_dataset, transform_fn) | tft_beam.TransformDataset())

            test_output = os.path.join(
                working_dir, TRANSFORMED_TEST_DATA_FILEBASE)
            _ = (
                test_examples
                | 'EncodeTestData' >> beam.Map(example_coder.encode)
                | 'WriteTestData' >> beam.io.WriteToTFRecord(test_output))

            # Writes a SavedModel and metadata to working_dir, which can
            # then be read by the tft.TFTransformOutput class.
            _ = (
                transform_fn
                | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
def transform_data(train_data_file, test_data_file, working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

    The train and test CSV files are read as text lines, cleaned up with
    plain Beam steps and decoded by a CSV coder. The training split is
    analyzed and transformed by a preprocessing function that scales numeric
    columns to [0, 1], computes a vocabulary for each categorical column and
    maps the label string to an int64 index. The resulting transform function
    is then applied to the test split, and both splits plus the transform
    function are written under `working_dir`.

    Args:
      train_data_file: File containing training data
      test_data_file: File containing test data
      working_dir: Directory to write transformed data and metadata to
    """

    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        # Columns that are not rewritten below pass through unchanged, so the
        # result starts out as a shallow copy of the inputs.
        transformed = inputs.copy()

        # Rescale every required numeric column into the range [0, 1].
        for name in NUMERIC_FEATURE_KEYS:
            transformed[name] = tft.scale_to_0_1(transformed[name])

        for name in OPTIONAL_NUMERIC_FEATURE_KEYS:
            # Optional columns are SparseTensors; densify them to shape
            # [batch, 1], filling a missing value with 0.
            column = transformed[name]
            filled = tf.compat.v1.sparse_to_dense(
                column.indices,
                [column.dense_shape[0], 1],
                column.values,
                default_value=0.)
            # Collapse the batch of size-1 vectors into a batch of scalars.
            scalars = tf.squeeze(filled, axis=1)
            transformed[name] = tft.scale_to_0_1(scalars)

        # For the categorical columns only a vocabulary is generated; the
        # feature itself is left untouched. The vocabulary is meant to be
        # used in the trainer, by means of a feature column, to convert the
        # string to an integer id.
        for name in CATEGORICAL_FEATURE_KEYS:
            tft.vocabulary(inputs[name], vocab_filename=name)

        # The label gets an explicit string -> index table; unknown strings
        # map to -1.
        table_keys = ['>50K', '<=50K']
        table_values = tf.cast(tf.range(len(table_keys)), tf.int64)
        initializer = tf.lookup.KeyValueTensorInitializer(
            keys=table_keys,
            values=table_values,
            key_dtype=tf.string,
            value_dtype=tf.int64)
        table = tf.lookup.StaticHashTable(initializer, default_value=-1)
        transformed[LABEL_KEY] = table.lookup(transformed[LABEL_KEY])

        return transformed

    def drop_space_after_commas(line):
        """Turn ', ' separators into ',' in a raw CSV text line."""
        return line.replace(', ', ',')

    # Leaving the outer "with" block is what actually runs the pipeline.
    with beam.Pipeline() as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            # The schema does not record the CSV column order, so the coder
            # is given every column name in file order.
            ordered_columns = [
                'age',
                'workclass',
                'fnlwgt',
                'education',
                'education-num',
                'marital-status',
                'occupation',
                'relationship',
                'race',
                'sex',
                'capital-gain',
                'capital-loss',
                'hours-per-week',
                'native-country',
                'label',
            ]
            converter = tft.coders.CsvCoder(
                ordered_columns, RAW_DATA_METADATA.schema)

            # The Beam steps applied before decoding are not part of the TF
            # graph, because they happen outside tf.Transform's own methods
            # (AnalyzeDataset, TransformDataset etc.). They only massage the
            # text into something the CSV converter can read.
            #
            # MapAndFilterErrors is used instead of Map so that decode errors
            # in converter.decode, which should only occur for the trailing
            # blank line, are filtered out.
            train_lines = (
                pipeline
                | 'ReadTrainData' >> beam.io.ReadFromText(train_data_file))
            train_lines = (
                train_lines
                | 'FixCommasTrainData' >> beam.Map(drop_space_after_commas))
            train_rows = (
                train_lines
                | 'DecodeTrainData' >> MapAndFilterErrors(converter.decode))

            # A dataset is the data paired with the metadata needed to
            # interpret it.
            train_dataset = (train_rows, RAW_DATA_METADATA)
            (train_examples, train_metadata), transform_fn = (
                train_dataset
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            example_coder = tft.coders.ExampleProtoCoder(
                train_metadata.schema)

            train_output = os.path.join(
                working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)
            _ = (
                train_examples
                | 'EncodeTrainData' >> beam.Map(example_coder.encode)
                | 'WriteTrainData' >> beam.io.WriteToTFRecord(train_output))

            # The test file has a header line to skip, and the last
            # character of every line is dropped (a trailing period).
            test_lines = (
                pipeline
                | 'ReadTestData' >> beam.io.ReadFromText(
                    test_data_file, skip_header_lines=1))
            test_lines = (
                test_lines
                | 'FixCommasTestData' >> beam.Map(drop_space_after_commas)
                | 'RemoveTrailingPeriodsTestData' >> beam.Map(
                    lambda line: line[:-1]))
            test_rows = (
                test_lines
                | 'DecodeTestData' >> MapAndFilterErrors(converter.decode))

            test_dataset = (test_rows, RAW_DATA_METADATA)
            # The transformed schema equals the training one, so only the
            # data half of the result is kept.
            test_examples, _ = (
                (test_dataset, transform_fn) | tft_beam.TransformDataset())

            test_output = os.path.join(
                working_dir, TRANSFORMED_TEST_DATA_FILEBASE)
            _ = (
                test_examples
                | 'EncodeTestData' >> beam.Map(example_coder.encode)
                | 'WriteTestData' >> beam.io.WriteToTFRecord(test_output))

            # Writes a SavedModel and metadata to working_dir, which can
            # then be read by the tft.TFTransformOutput class.
            _ = (
                transform_fn
                | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
def transform_data(input_features, preprocessing_fn, pipeline_args,
                   train_data_file, cv_data_file, test_data_file,
                   working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

    The training split is read from parquet, then analyzed and transformed
    with `preprocessing_fn`. The transform function produced by that analysis
    is applied to the cross-validation and test splits. All three transformed
    splits and the transform function itself are written under `working_dir`.

    Args:
      input_features: Description of the raw input features; handed to
        `_get_raw_metadata` to build the raw dataset metadata.
      preprocessing_fn: The tf.Transform preprocessing function to apply.
      pipeline_args: Arguments used to construct the Beam `PipelineOptions`.
      train_data_file: Parquet file(s) containing training data
      cv_data_file: Parquet file(s) containing cross-validation data
      test_data_file: Parquet file(s) containing test data
      working_dir: Directory to write transformed data and metadata to
    """
    # Input schema definition.
    raw_metadata = _get_raw_metadata(input_features)

    # Beam options come from the caller's pipeline arguments.
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True

    # (input path, suffix for the Beam step labels, output file name) for the
    # splits that are only transformed, never analyzed.
    transform_only_splits = (
        (cv_data_file, "-cv", "cv_tfrecord"),
        (test_data_file, "-test", "test_tfrecord"),
    )

    with beam.Pipeline(options=options) as pipeline:
        # Needs to be a GCS location if the process is running on Dataflow,
        # otherwise it can't share model files. Falls back to a local temp
        # directory when no 'temp_location' option is set.
        configured_temp = options.get_all_options().get('temp_location')
        temp_dir = configured_temp or tempfile.mkdtemp()

        with tft_beam.Context(temp_dir=temp_dir):
            train_rows = (
                pipeline
                | 'ReadTrainData' >> beam.io.ReadFromParquet(train_data_file))

            # A dataset is the data paired with its schema.
            (train_examples, train_metadata), transform_fn = (
                (train_rows, raw_metadata)
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            example_coder = tft.coders.ExampleProtoCoder(
                train_metadata.schema)

            # Write the transformed training split as TFRecords.
            _ = (
                train_examples
                | 'EncodeTrainData' >> beam.Map(example_coder.encode)
                | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, "train_tfrecord")))

            # Apply the already-computed transform function to the remaining
            # splits. Step labels carry the split suffix to stay unique.
            for data_path, prefix, output_filename in transform_only_splits:
                split_rows = (
                    pipeline
                    | 'ReadData' + prefix >> beam.io.ReadFromParquet(
                        data_path))
                # The transformed schema equals the training one, so only
                # the data half of the result is kept.
                split_examples, _ = (
                    ((split_rows, raw_metadata), transform_fn)
                    | 'Transform' + prefix >> tft_beam.TransformDataset())
                _ = (
                    split_examples
                    | 'EncodeData' + prefix >> beam.Map(example_coder.encode)
                    | 'WriteData' + prefix >> beam.io.WriteToTFRecord(
                        os.path.join(working_dir, output_filename)))

            # Writes a SavedModel and metadata to working_dir, which can
            # then be read by the tft.TFTransformOutput class.
            _ = (
                transform_fn
                | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
def test_single_phase_mixed_analyzer_run_once(self):
    """Analyze two spans with span-0 partly cached, then transform span-1.

    Pre-built accumulator cache entries are supplied for span-0 and an empty
    cache for span-1. The transformed span-1 rows must carry the min/mean
    statistics of both spans combined.
    """
    span_0_key = 'span-0'
    span_1_key = 'span-1'

    def preprocessing_fn(inputs):
        integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])

        _ = tft.bucketize(inputs['x'], 2, name='bucketize')

        # NOTE: keep min before mean for each feature. The cache keys used
        # below ('...--x-x--', '...--x_1-mean_and_var--') appear to encode
        # the name scopes created in this order.
        outputs = {'integerized_s': integerized_s}
        for feature in ('x', 'y'):
            column = inputs[feature]
            outputs[feature + '_min'] = (
                tft.min(column, name=feature) + tf.zeros_like(column))
            outputs[feature + '_mean'] = (
                tft.mean(column, name=feature) + tf.zeros_like(column))
        return outputs

    # Run AnalyzeAndTransform on some input data and compare with expected
    # output.
    input_data = [{'x': 12, 'y': 1, 's': 'c'}, {'x': 10, 'y': 1, 's': 'c'}]
    feature_spec = {
        'x': tf.io.FixedLenFeature([], tf.float32),
        'y': tf.io.FixedLenFeature([], tf.float32),
        's': tf.io.FixedLenFeature([], tf.string),
    }
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(feature_spec))
    input_data_dict = {
        span_0_key: [
            {'x': -2, 'y': 1, 's': 'b'},
            {'x': 4, 'y': -4, 's': 'b'},
        ],
        span_1_key: input_data,
    }

    # TODO(b/37788560): Get these names programmatically.
    # (cache key, Beam step label, serialized accumulator) for span-0.
    span_0_cache_entries = (
        ('__v0__CacheableCombineAccumulate--x_1-mean_and_var--', 'CreateA',
         b'[2.0, 1.0, 9.0, 0.0]'),
        ('__v0__CacheableCombineAccumulate--x-x--', 'CreateB',
         b'[2.0, 4.0]'),
        ('__v0__CacheableCombineAccumulate--y_1-mean_and_var--', 'CreateC',
         b'[2.0, -1.5, 6.25, 0.0]'),
        ('__v0__CacheableCombineAccumulate--y-y--', 'CreateD',
         b'[4.0, 1.0]'),
    )

    with beam_impl.Context(temp_dir=self.get_temp_dir()):
        with beam.Pipeline() as p:
            all_rows = list(
                itertools.chain.from_iterable(input_data_dict.values()))
            flat_data = p | 'CreateInputData' >> beam.Create(all_rows)

            cache_dict = {
                span_0_key: {
                    cache_key: p | label >> beam.Create([payload])
                    for cache_key, label, payload in span_0_cache_entries
                },
                span_1_key: {},
            }

            analysis_inputs = (
                flat_data, input_data_dict, cache_dict, input_metadata)
            transform_fn, cache_output = (
                analysis_inputs
                | 'Analyze' >> beam_impl.AnalyzeDatasetWithCache(
                    preprocessing_fn))
            _ = (
                cache_output
                | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
                    self._cache_dir))

            span_1_dataset = (input_data_dict[span_1_key], input_metadata)
            transformed_dataset = (
                (span_1_dataset, transform_fn)
                | 'Transform' >> beam_impl.TransformDataset())

            # Render the analysis graph that was just built.
            analysis_graph = analysis_graph_builder._ANALYSIS_GRAPH
            dot_string = nodes.get_dot_graph([analysis_graph]).to_string()
            self.WriteRenderedDotFile(dot_string)

            transformed_data, unused_transformed_metadata = (
                transformed_dataset)

            # Both span-1 rows share the same values: 's' is always 'c' and
            # the statistics are constants broadcast to every row.
            expected_row = {
                'x_mean': 6.0,
                'x_min': -2.0,
                'y_mean': -0.25,
                'y_min': -4.0,
                'integerized_s': 0,
            }
            expected_transformed = [expected_row, dict(expected_row)]
            beam_test_util.assert_that(
                transformed_data,
                beam_test_util.equal_to(expected_transformed))

            transform_fn_dir = os.path.join(
                self.base_test_dir, 'transform_fn')
            _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)
def test_single_phase_mixed_analyzer_run_once(self):
    """Analyze two spans with span-0 partly cached on disk, transform span-1.

    Accumulator cache entries for span-0 are written to the input cache
    directory up front. The transformed span-1 rows must carry the min/mean
    statistics of both spans combined.
    """
    cache_location = self._make_cache_location()

    span_0_key = 'span-0'
    span_1_key = 'span-1'

    # (cache key, accumulator) pairs seeded for span-0.
    seeded_accumulators = (
        ('__v0__CacheableCombineAccumulate--x_1-mean_and_var--',
         [2.0, 1.0, 9.0]),
        ('__v0__CacheableCombineAccumulate--x-x--', [2.0, 4.0]),
        ('__v0__CacheableCombineAccumulate--y_1-mean_and_var--',
         [2.0, -1.5, 6.25]),
        ('__v0__CacheableCombineAccumulate--y-y--', [4.0, 1.0]),
    )
    for cache_key, accumulator in seeded_accumulators:
        _write_cache(cache_key, span_0_key, accumulator,
                     cache_location.input_cache_dir)

    def preprocessing_fn(inputs):
        integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])

        _ = tft.bucketize(inputs['x'], 2, name='bucketize')

        # NOTE: keep min before mean for each feature. The cache keys seeded
        # above ('...--x-x--', '...--x_1-mean_and_var--') appear to encode
        # the name scopes created in this order.
        outputs = {'integerized_s': integerized_s}
        for feature in ('x', 'y'):
            column = inputs[feature]
            outputs[feature + '_min'] = (
                tft.min(column, name=feature) + tf.zeros_like(column))
            outputs[feature + '_mean'] = (
                tft.mean(column, name=feature) + tf.zeros_like(column))
        return outputs

    # Run AnalyzeAndTransform on some input data and compare with expected
    # output.
    input_data = [{'x': 12, 'y': 1, 's': 'c'}, {'x': 10, 'y': 1, 's': 'c'}]
    feature_spec = {
        'x': tf.FixedLenFeature([], tf.float32),
        'y': tf.FixedLenFeature([], tf.float32),
        's': tf.FixedLenFeature([], tf.string),
    }
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(feature_spec))
    input_data_dict = {
        span_0_key: [
            {'x': -2, 'y': 1, 's': 'b'},
            {'x': 4, 'y': -4, 's': 'b'},
        ],
        span_1_key: input_data,
    }

    with beam_impl.Context(temp_dir=self.get_temp_dir()):
        flat_data = input_data_dict.values() | 'Flatten' >> beam.Flatten()

        analysis_inputs = (flat_data, input_data_dict, input_metadata)
        transform_fn = (
            analysis_inputs
            | beam_impl.AnalyzeDatasetWithCache(
                preprocessing_fn, cache_location))

        span_1_dataset = (input_data_dict[span_1_key], input_metadata)
        transformed_dataset = (
            (span_1_dataset, transform_fn) | beam_impl.TransformDataset())

    transformed_data, unused_transformed_metadata = transformed_dataset

    # Both span-1 rows share the same values: 's' is always 'c' and the
    # statistics are constants broadcast to every row.
    expected_row = {
        'x_mean': 6.0,
        'x_min': -2.0,
        'y_mean': -0.25,
        'y_min': -4.0,
        'integerized_s': 0,
    }
    expected_transformed_data = [expected_row, dict(expected_row)]
    self.assertDataCloseOrEqual(transformed_data, expected_transformed_data)

    transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
    _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)
def test_non_frequency_vocabulary_merge(self):
    """This test compares vocabularies produced with and without cache."""
    mi_vocab_name = 'mutual_information_vocab'
    adjusted_mi_vocab_name = 'adjusted_mutual_information_vocab'
    weighted_frequency_vocab_name = 'weighted_frequency_vocab'
    vocab_names = (
        mi_vocab_name,
        adjusted_mi_vocab_name,
        weighted_frequency_vocab_name,
    )

    def preprocessing_fn(inputs):
        # NOTE: keep the three vocabulary calls in this order. The expected
        # accumulator keys below ('vocabulary', 'vocabulary_1',
        # 'vocabulary_2') appear to follow creation order.
        tokens = inputs['s']
        _ = tft.vocabulary(
            tokens,
            labels=inputs['label'],
            store_frequency=True,
            vocab_filename=mi_vocab_name,
            min_diff_from_avg=0.1,
            use_adjusted_mutual_info=False)

        _ = tft.vocabulary(
            tokens,
            labels=inputs['label'],
            store_frequency=True,
            vocab_filename=adjusted_mi_vocab_name,
            min_diff_from_avg=1.0,
            use_adjusted_mutual_info=True)

        _ = tft.vocabulary(
            tokens,
            weights=inputs['weight'],
            store_frequency=True,
            vocab_filename=weighted_frequency_vocab_name,
            use_adjusted_mutual_info=False)
        return inputs

    span_0_key = 'span-0'
    span_1_key = 'span-1'

    input_data = [
        {'s': 'a', 'weight': 1, 'label': 1},
        {'s': 'a', 'weight': 0.5, 'label': 1},
        {'s': 'b', 'weight': 0.75, 'label': 1},
        {'s': 'b', 'weight': 1, 'label': 0},
    ]
    feature_spec = {
        's': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64),
        'weight': tf.io.FixedLenFeature([], tf.float32),
    }
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(feature_spec))
    # Both spans hold the same rows.
    input_data_dict = {
        span_0_key: input_data,
        span_1_key: input_data,
    }

    with beam_impl.Context(temp_dir=self.get_temp_dir()):
        flat_data = input_data_dict.values() | 'Flatten' >> beam.Flatten()

        # Cached run: starts from an empty input cache and returns the
        # per-span accumulators it produced.
        analysis_inputs = (flat_data, input_data_dict, {}, input_metadata)
        transform_fn_with_cache, output_cache = (
            analysis_inputs
            | beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))

        label_accumulators = [
            b'["a", [2, 1.0, 0.0, 1.0]]',
            b'["b", [2, 0.5, 0.0, 1.0]]',
        ]
        expected_accumulators = {
            '__v0__VocabularyAccumulate--vocabulary--':
                list(label_accumulators),
            '__v0__VocabularyAccumulate--vocabulary_1--':
                list(label_accumulators),
            '__v0__VocabularyAccumulate--vocabulary_2--':
                [b'["a", 1.5]', b'["b", 1.75]'],
        }

        spans = [span_0_key, span_1_key]
        self.assertCountEqual(output_cache.keys(), spans)
        for span in spans:
            span_cache = output_cache[span]
            self.assertCountEqual(
                span_cache.keys(), expected_accumulators.keys())
            for cache_key, accumulators in expected_accumulators.items():
                self.assertCountEqual(span_cache[cache_key], accumulators)

        # Uncached run over the same rows: the data appears once per span,
        # hence the doubling.
        transform_fn_no_cache = (
            (input_data * 2, input_metadata)
            | beam_impl.AnalyzeDataset(preprocessing_fn))

    transform_fn_with_cache_dir = os.path.join(
        self.base_test_dir, 'transform_fn_with_cache')
    _ = transform_fn_with_cache | tft_beam.WriteTransformFn(
        transform_fn_with_cache_dir)

    transform_fn_no_cache_dir = os.path.join(
        self.base_test_dir, 'transform_fn_no_cache')
    _ = transform_fn_no_cache | tft_beam.WriteTransformFn(
        transform_fn_no_cache_dir)

    tft_output_cache = tft.TFTransformOutput(transform_fn_with_cache_dir)
    tft_output_no_cache = tft.TFTransformOutput(transform_fn_no_cache_dir)

    # Every vocabulary file must be identical between the two runs.
    for vocab_filename in vocab_names:
        cache_path = tft_output_cache.vocabulary_file_by_name(vocab_filename)
        no_cache_path = tft_output_no_cache.vocabulary_file_by_name(
            vocab_filename)
        with tf.io.gfile.GFile(cache_path, 'rb') as f1:
            with tf.io.gfile.GFile(no_cache_path, 'rb') as f2:
                self.assertEqual(
                    f1.readlines(), f2.readlines(),
                    'vocab with cache != vocab without cache for: {}'.format(
                        vocab_filename))
def test_single_phase_run_twice(self):
    """Run the same cached analysis twice, chaining output cache to input.

    The first run must write six cache entries for every span. The second
    run reads the first run's output cache as its input cache, must yield
    the same transformed data, and must not create an output cache
    directory.
    """
    cache_location = self._make_cache_location('input_cache_1',
                                               'output_cache_1')

    span_0_key = 'span-0'
    span_1_key = 'span-1'

    def preprocessing_fn(inputs):
        _ = tft.vocabulary(inputs['s'])

        _ = tft.bucketize(inputs['x'], 2, name='bucketize')

        # Min is requested before mean for each feature, as in the original
        # analyzer call order.
        outputs = {}
        for feature in ('x', 'y'):
            column = inputs[feature]
            outputs[feature + '_min'] = (
                tft.min(column, name=feature) + tf.zeros_like(column))
            outputs[feature + '_mean'] = (
                tft.mean(column, name=feature) + tf.zeros_like(column))
        return outputs

    input_data = [{'x': 12, 'y': 1, 's': 'b'}, {'x': 10, 'y': 1, 's': 'c'}]
    feature_spec = {
        'x': tf.FixedLenFeature([], tf.float32),
        'y': tf.FixedLenFeature([], tf.float32),
        's': tf.FixedLenFeature([], tf.string),
    }
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(feature_spec))
    input_data_dict = {
        span_0_key: [
            {'x': -2, 'y': 1, 's': 'a'},
            {'x': 4, 'y': -4, 's': 'a'},
        ],
        span_1_key: input_data,
    }

    def analyze_and_transform(location):
        """Analyze all spans using `location`, then transform span-1."""
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            flat_data = (
                input_data_dict.values() | 'Flatten' >> beam.Flatten())

            fitted_fn = (
                (flat_data, input_data_dict, input_metadata)
                | beam_impl.AnalyzeDatasetWithCache(
                    preprocessing_fn, location))

            span_1_dataset = (input_data_dict[span_1_key], input_metadata)
            span_1_transformed = (
                (span_1_dataset, fitted_fn) | beam_impl.TransformDataset())

        span_1_rows, unused_transformed_metadata = span_1_transformed
        return fitted_fn, span_1_rows

    # The statistics are constants broadcast to every row, so both span-1
    # rows are identical.
    expected_row = {
        'x_mean': 6.0,
        'x_min': -2.0,
        'y_mean': -0.25,
        'y_min': -4.0,
    }
    expected_transformed_data = [expected_row, dict(expected_row)]

    # First run: nothing usable in the input cache.
    transform_fn, transformed_data = analyze_and_transform(cache_location)
    self.assertDataCloseOrEqual(transformed_data, expected_transformed_data)

    transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
    _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)

    for key in input_data_dict:
        key_cache_dir = os.path.join(cache_location.output_cache_dir, key)
        self.assertTrue(tf.gfile.IsDirectory(key_cache_dir))
        self.assertEqual(len(tf.gfile.ListDirectory(key_cache_dir)), 6)

    # Second run: the first run's output cache becomes the input cache.
    cache_location = self._make_cache_location('output_cache_1',
                                               'output_cache_2')

    transform_fn, transformed_data = analyze_and_transform(cache_location)
    self.assertDataCloseOrEqual(transformed_data, expected_transformed_data)

    self.assertFalse(tf.gfile.IsDirectory(cache_location.output_cache_dir))
def transform_data(working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

    The shuffled train and test TFRecords are read from `working_dir` and
    decoded. The training split is analyzed and transformed by a
    preprocessing function that tokenizes each review, maps tokens to
    vocabulary indices and computes tf-idf weights. The resulting transform
    function is applied to the test split, and both splits plus the
    transform function are written back under `working_dir`.

    Args:
      working_dir: Directory to read shuffled data from and write transformed
        data and metadata to.
    """

    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        # tf.compat.v1.string_split is used deliberately: it behaves
        # differently from tf.strings.split.
        tokens = tf.compat.v1.string_split(inputs[REVIEW_KEY], DELIMITERS)
        token_ids = tft.compute_and_apply_vocabulary(tokens, top_k=VOCAB_SIZE)
        # One more than VOCAB_SIZE to account for the oov bucket created by
        # compute_and_apply_vocabulary.
        bow_indices, bow_weights = tft.tfidf(token_ids, VOCAB_SIZE + 1)
        return {
            REVIEW_KEY: bow_indices,
            REVIEW_WEIGHT_KEY: bow_weights,
            LABEL_KEY: inputs[LABEL_KEY],
        }

    transform_temp_dir = os.path.join(working_dir, TRANSFORM_TEMP_DIR)
    train_pattern = os.path.join(
        working_dir, SHUFFLED_TRAIN_DATA_FILEBASE + '*')
    test_pattern = os.path.join(
        working_dir, SHUFFLED_TEST_DATA_FILEBASE + '*')
    train_output = os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)
    test_output = os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)

    with beam.Pipeline() as pipeline:
        with tft_beam.Context(temp_dir=transform_temp_dir):
            raw_coder = tft.coders.ExampleProtoCoder(RAW_DATA_METADATA.schema)

            train_data = (
                pipeline
                | 'ReadTrain' >> beam.io.ReadFromTFRecord(train_pattern)
                | 'DecodeTrain' >> beam.Map(raw_coder.decode))

            test_data = (
                pipeline
                | 'ReadTest' >> beam.io.ReadFromTFRecord(test_pattern)
                | 'DecodeTest' >> beam.Map(raw_coder.decode))

            # Fit the transform on the training split and apply it there.
            train_dataset = (train_data, RAW_DATA_METADATA)
            transformed_train_dataset, transform_fn = (
                train_dataset
                | 'AnalyzeAndTransform' >>
                tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_train_data, transformed_metadata = (
                transformed_train_dataset)
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            # Apply the fitted transform to the test split; its metadata is
            # not needed.
            test_dataset = (test_data, RAW_DATA_METADATA)
            transformed_test_data, _ = (
                (test_dataset, transform_fn)
                | 'Transform' >> tft_beam.TransformDataset())

            _ = (
                transformed_train_data
                | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
                | 'WriteTrainData' >> beam.io.WriteToTFRecord(train_output))

            _ = (
                transformed_test_data
                | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
                | 'WriteTestData' >> beam.io.WriteToTFRecord(test_output))

            # Will write a SavedModel and metadata to two subdirectories of
            # working_dir, given by tft.TRANSFORM_FN_DIR and
            # tft.TRANSFORMED_METADATA_DIR respectively.
            _ = (
                transform_fn
                | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
def _RunBeamImpl(self, inputs, outputs, preprocessing_fn,
                 input_dataset_metadata, raw_examples_data_format,
                 transform_output_path, compute_statistics,
                 materialize_output_paths):
  """Build and run the Beam pipeline that analyzes and transforms the data.

  The pipeline obtained from `self._CreatePipeline(outputs)` reads and decodes
  the analysis datasets, runs `tft_beam.AnalyzeDataset` over them, and writes
  the raw metadata and the transform function under `transform_output_path`.
  When statistics or materialization are requested, every dataset is read
  again, transformed, and then fed to the statistics generation and/or written
  to its materialization path.

  Args:
    inputs: A dictionary of labelled input values.
    outputs: A dictionary of labelled output values.
    preprocessing_fn: The tf.Transform preprocessing_fn.
    input_dataset_metadata: A DatasetMetadata object for the input data.
    raw_examples_data_format: A string describing the raw data format.
    transform_output_path: An absolute path to write the output to.
    compute_statistics: A bool indicating whether or not compute statistics.
    materialize_output_paths: Paths to materialized outputs.

  Raises:
    AssertionError: If the number of per-set statistics paths or of
      materialization paths differs from the number of datasets to transform.
    NOTE(review): earlier documentation of this method also listed
      RuntimeError (reset() not invoked between two run()) and ValueError
      (empty schema); neither is raised directly in this body, so they
      presumably originate in helpers -- confirm.

  Returns:
    Status of the execution.
  """
  file_format = common.GetSoleValue(
      inputs, labels.EXAMPLES_FILE_FORMAT_LABEL, strict=False)
  analyze_paths = common.GetValues(
      inputs, labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL)
  transform_only_paths = common.GetValues(
      inputs, labels.TRANSFORM_ONLY_DATA_PATHS_LABEL)
  use_tfdv = common.GetSoleValue(
      inputs, labels.TFT_STATISTICS_USE_TFDV_LABEL)
  per_set_stats_paths = common.GetValues(
      outputs, labels.PER_SET_STATS_OUTPUT_PATHS_LABEL)
  temp_dir = common.GetSoleValue(outputs, labels.TEMP_OUTPUT_LABEL)

  for message, paths in (
      ('Analyze and transform data patterns: %s', analyze_paths),
      ('Transform data patterns: %s', transform_only_paths),
      ('Transform materialization output paths: %s',
       materialize_output_paths),
  ):
    tf.logging.info(message, list(enumerate(paths)))
  tf.logging.info('Transform output path: %s', transform_output_path)

  feature_spec = input_dataset_metadata.schema.as_feature_spec()
  try:
    analyze_input_columns = tft.get_analyze_input_columns(
        preprocessing_fn, feature_spec)
    transform_input_columns = tft.get_transform_input_columns(
        preprocessing_fn, feature_spec)
  except AttributeError:
    # If using TFT 1.12, fall back to assuming all features are used.
    analyze_input_columns = feature_spec.keys()
    transform_input_columns = feature_spec.keys()

  # Use the same dataset (same columns) for AnalyzeDataset and computing
  # pre-transform stats so that the data will only be read once for these
  # two operations.
  if compute_statistics:
    combined_columns = (
        list(analyze_input_columns) + list(transform_input_columns))
    analyze_input_columns = list(set(combined_columns))

  def narrowed_schema(columns):
    """Return a schema restricted to the given feature names."""
    return dataset_schema.from_feature_spec(
        {feature: feature_spec[feature] for feature in columns})

  analyze_metadata = copy.deepcopy(input_dataset_metadata)
  transform_metadata = copy.deepcopy(input_dataset_metadata)
  if input_dataset_metadata.schema is not _RAW_EXAMPLE_SCHEMA:
    analyze_metadata.schema = narrowed_schema(analyze_input_columns)
    transform_metadata.schema = narrowed_schema(transform_input_columns)

  can_process_jointly = (
      not per_set_stats_paths and not materialize_output_paths)
  analyze_data_list = self._MakeDatasetList(
      analyze_paths, file_format, raw_examples_data_format,
      analyze_metadata, can_process_jointly)
  transform_data_list = self._MakeDatasetList(
      list(analyze_paths) + list(transform_only_paths), file_format,
      raw_examples_data_format, transform_metadata, can_process_jointly)

  desired_batch_size = self._GetDesiredBatchSize(raw_examples_data_format)

  with self._CreatePipeline(outputs) as pipeline, tft_beam.Context(
      temp_dir=temp_dir,
      desired_batch_size=desired_batch_size,
      passthrough_keys={_TRANSFORM_INTERNAL_FEATURE_FOR_KEY},
      use_deep_copy_optimization=True):
    # pylint: disable=expression-not-assigned
    # pylint: disable=no-value-for-parameter

    analyze_decode_fn = self._GetDecodeFunction(
        raw_examples_data_format, analyze_metadata.schema)

    # Read and decode every analysis dataset; the resulting PCollections are
    # stored on the dataset objects themselves.
    for index, dataset in enumerate(analyze_data_list):
      dataset.encoded = (
          pipeline
          | 'ReadAnalysisDataset[{}]'.format(index) >>
          self._ReadExamples(dataset))
      dataset.decoded = (
          dataset.encoded
          | 'DecodeAnalysisDataset[{}]'.format(index) >>
          self._DecodeInputs(analyze_decode_fn))

    decoded_analysis_datasets = [
        dataset.decoded for dataset in analyze_data_list
    ]
    input_analysis_data = (
        decoded_analysis_datasets
        | 'FlattenAnalysisDatasets' >> beam.Flatten())
    transform_fn = (
        (input_analysis_data, input_dataset_metadata)
        | 'AnalyzeDataset' >> tft_beam.AnalyzeDataset(preprocessing_fn))

    # Write the raw/input metadata.
    raw_metadata_path = os.path.join(
        transform_output_path, tft.TFTransformOutput.RAW_METADATA_DIR)
    (input_dataset_metadata
     | 'WriteMetadata' >> tft_beam.WriteMetadata(raw_metadata_path, pipeline))

    # WriteTransformFn writes transform_fn and metadata to subdirectories
    # tensorflow_transform.SAVED_MODEL_DIR and
    # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
    (transform_fn
     | 'WriteTransformFn' >> tft_beam.WriteTransformFn(transform_output_path))

    if compute_statistics or materialize_output_paths:
      # Do not compute pre-transform stats if the input format is raw proto,
      # as StatsGen would treat any input as tf.Example.
      if (compute_statistics and
          not self._IsDataFormatProto(raw_examples_data_format)):
        # Aggregated feature stats before transformation.
        pre_transform_feature_stats_path = os.path.join(
            transform_output_path,
            tft.TFTransformOutput.PRE_TRANSFORM_FEATURE_STATS_PATH)

        # TODO(b/70392441): Retain tf.Metadata (e.g., IntDomain) in
        # schema. Currently input dataset schema only contains dtypes,
        # and other metadata is dropped due to roundtrip to tensors.
        schema_proto = schema_utils.schema_from_feature_spec(
            analyze_metadata.schema.as_feature_spec())

        # The stats generation consumes decoded data when TFDV is used and
        # the encoded records otherwise.
        if use_tfdv:
          pre_stats_inputs = [
              dataset.decoded for dataset in analyze_data_list
          ]
        else:
          pre_stats_inputs = [
              dataset.encoded for dataset in analyze_data_list
          ]
        flattened_pre_stats_inputs = (
            pre_stats_inputs
            | 'FlattenPreTransformAnalysisDatasets' >> beam.Flatten())
        (flattened_pre_stats_inputs
         | 'GenerateAggregatePreTransformAnalysisStats' >>
         self._GenerateStats(
             pre_transform_feature_stats_path,
             schema_proto,
             use_deep_copy_optimization=True,
             use_tfdv=use_tfdv))

      transform_decode_fn = self._GetDecodeFunction(
          raw_examples_data_format, transform_metadata.schema)

      # transform_data_list is a superset of analyze_data_list, we pay the
      # cost to read the same dataset (analyze_data_list) again here to
      # prevent certain beam runner from doing large temp materialization.
      for index, dataset in enumerate(transform_data_list):
        dataset.encoded = (
            pipeline
            | 'ReadTransformDataset[{}]'.format(index) >>
            self._ReadExamples(dataset))
        dataset.decoded = (
            dataset.encoded
            | 'DecodeTransformDataset[{}]'.format(index) >>
            self._DecodeInputs(transform_decode_fn))
        decoded_dataset = (dataset.decoded, transform_metadata)
        dataset.transformed, metadata = (
            (decoded_dataset, transform_fn)
            | 'TransformDataset[{}]'.format(index) >>
            tft_beam.TransformDataset())

        # Encoded output is needed for materialization and for the
        # non-TFDV statistics path.
        if materialize_output_paths or not use_tfdv:
          dataset.transformed_and_encoded = (
              dataset.transformed
              | 'EncodeTransformedDataset[{}]'.format(index) >>
              beam.ParDo(self._EncodeAsExamples(), metadata))

      if compute_statistics:
        # Aggregated feature stats after transformation.
        _, metadata = transform_fn
        post_transform_feature_stats_path = os.path.join(
            transform_output_path,
            tft.TFTransformOutput.POST_TRANSFORM_FEATURE_STATS_PATH)

        # TODO(b/70392441): Retain tf.Metadata (e.g., IntDomain) in
        # schema. Currently input dataset schema only contains dtypes,
        # and other metadata is dropped due to roundtrip to tensors.
        transformed_schema_proto = schema_utils.schema_from_feature_spec(
            metadata.schema.as_feature_spec())

        if use_tfdv:
          post_stats_inputs = [
              dataset.transformed for dataset in transform_data_list
          ]
        else:
          post_stats_inputs = [
              dataset.transformed_and_encoded
              for dataset in transform_data_list
          ]
        flattened_post_stats_inputs = (
            post_stats_inputs
            | 'FlattenPostTransformAnalysisDatasets' >> beam.Flatten())
        (flattened_post_stats_inputs
         | 'GenerateAggregatePostTransformAnalysisStats' >>
         self._GenerateStats(
             post_transform_feature_stats_path,
             transformed_schema_proto,
             use_tfdv=use_tfdv))

        if per_set_stats_paths:
          assert len(transform_data_list) == len(per_set_stats_paths)
          # TODO(b/67632871): Remove duplicate stats gen compute that is
          # done both on a flattened view of the data, and on each span
          # below.
          for index, (dataset, stats_path) in enumerate(
              zip(transform_data_list, per_set_stats_paths)):
            data = (
                dataset.transformed
                if use_tfdv else dataset.transformed_and_encoded)
            (data
             | 'GeneratePostTransformStats[{}]'.format(index) >>
             self._GenerateStats(
                 stats_path, transformed_schema_proto, use_tfdv=use_tfdv))

      if materialize_output_paths:
        assert len(transform_data_list) == len(materialize_output_paths)
        for index, (dataset, materialize_path) in enumerate(
            zip(transform_data_list, materialize_output_paths)):
          (dataset.transformed_and_encoded
           | 'Materialize[{}]'.format(index) >>
           self._WriteExamples(file_format, materialize_path))

  return _Status.OK()