Example 1
    def testWriteTransformFn(self):
        path = os.path.join(self.get_temp_dir(), 'output')

        with beam.Pipeline() as pipeline:
            # Create an empty directory for the source saved model dir.
            saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
            file_io.recursive_create_dir(saved_model_dir)
            saved_model_dir_pcoll = (
                pipeline
                | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
            metadata = beam_metadata_io.BeamDatasetMetadata(
                _TEST_METADATA_WITH_FUTURES, {
                    'a': pipeline | 'CreateA' >> beam.Create([3]),
                })

            _ = ((saved_model_dir_pcoll, metadata)
                 | transform_fn_io.WriteTransformFn(path))

        transformed_metadata_dir = os.path.join(
            path, transform_fn_io.TRANSFORMED_METADATA_DIR)
        metadata = metadata_io.read_metadata(transformed_metadata_dir)
        self.assertEqual(metadata, _TEST_METADATA)

        transform_fn_dir = os.path.join(path, transform_fn_io.TRANSFORM_FN_DIR)
        self.assertTrue(file_io.file_exists(transform_fn_dir))
        self.assertTrue(file_io.is_directory(transform_fn_dir))
Example 2
def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--cloud', type=str,
        help="Set to 'y' to run the pipeline on Cloud Dataflow.")
    args = parser.parse_args(argv)  # Parse the command-line arguments.
    if args.cloud == 'y':
        pipeline_options = get_cloud_pipeline_options()
    else:
        pipeline_options = beam.pipeline.PipelineOptions(
            flags=[], **{'project': 'iotpubsub-1536350750202'})
    with beam_impl.Context(temp_dir="gs://relation_extraction/beam"):
        p = beam.Pipeline(options=pipeline_options)
        train_data, test_data = (p | "Read from bigquery" >> ReadBigQuery())

        (test_data | "test it" >> beam.Map(printy))
        # Pair the raw train data with its metadata to form a dataset tuple.
        train_data = (train_data, train_metadata)
        train_dataset, transform_fn = (
            train_data
            | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
        # Apply the same transform_fn to the test dataset.
        test_data = (test_data, train_metadata)
        test_data, _ = (
            (test_data, transform_fn)
            | 'Transform test data' >> beam_impl.TransformDataset())
        train_data, transformed_metadata = train_dataset
        transformed_data_coder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)
        _ = (train_data
             | 'Encode train data to save it' >> beam.Map(transformed_data_coder.encode)
             | 'Write the train data to tfrecords' >> tfrecordio.WriteToTFRecord(
                 os.path.join('gs://relation_extraction/beam/Train', 'TRAIN')))
        _ = (test_data
             | 'Encode test data to save it' >> beam.Map(transformed_data_coder.encode)
             | 'Write the test data to tfrecords' >> tfrecordio.WriteToTFRecord(
                 os.path.join('gs://relation_extraction/beam/Test', 'TEST')))
        _ = (transform_fn
             | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(
                 'gs://relation_extraction/beam/'))

        p.run().wait_until_finish()
Example 3
  def testWriteTransformFn(self):
    transform_output_dir = os.path.join(self.get_temp_dir(), 'output')

    with beam.Pipeline() as pipeline:
      # Create an empty directory for the source saved model dir.
      saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
      file_io.recursive_create_dir(saved_model_dir)
      saved_model_dir_pcoll = (
          pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
      # Combine test metadata with a dict of PCollections resolving futures.
      deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
          [test_metadata.COMPLETE_METADATA])
      metadata = beam_metadata_io.BeamDatasetMetadata(
          test_metadata.INCOMPLETE_METADATA, deferred_metadata)

      _ = ((saved_model_dir_pcoll, metadata)
           | transform_fn_io.WriteTransformFn(transform_output_dir))

    # Test reading with TFTransformOutput
    tf_transform_output = tft.TFTransformOutput(transform_output_dir)
    metadata = tf_transform_output.transformed_metadata
    self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)

    transform_fn_dir = tf_transform_output.transform_savedmodel_dir
    self.assertTrue(file_io.file_exists(transform_fn_dir))
    self.assertTrue(file_io.is_directory(transform_fn_dir))
Example 4
    def testWriteTransformFn(self):
        path = os.path.join(self.get_temp_dir(), 'output')

        with beam.Pipeline() as pipeline:
            # Create an empty directory for the source saved model dir.
            saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
            file_io.recursive_create_dir(saved_model_dir)
            saved_model_dir_pcoll = (
                pipeline
                | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
            metadata = _TEST_METADATA
            deferred_metadata = (
                pipeline
                | 'CreateEmptyProperties' >> beam.Create([_FUTURES_DICT]))

            _ = ((saved_model_dir_pcoll, (metadata, deferred_metadata))
                 | transform_fn_io.WriteTransformFn(path))

        transformed_metadata_dir = os.path.join(path, 'transformed_metadata')
        metadata = metadata_io.read_metadata(transformed_metadata_dir)
        self.assertEqual(metadata, _TEST_METADATA)

        transform_fn_dir = os.path.join(path, 'transform_fn')
        self.assertTrue(file_io.file_exists(transform_fn_dir))
        self.assertTrue(file_io.is_directory(transform_fn_dir))
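The hard-coded 'transformed_metadata' and 'transform_fn' directory names in this example are the same values that Example 1 reads from the transform_fn_io module constants. A quick sanity check (a sketch; the constant values are an assumption about the installed tensorflow_transform release):

from tensorflow_transform.beam.tft_beam_io import transform_fn_io

# Both constants back the hard-coded subdirectory names used above.
assert transform_fn_io.TRANSFORM_FN_DIR == 'transform_fn'
assert transform_fn_io.TRANSFORMED_METADATA_DIR == 'transformed_metadata'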
Example 5
    def testWriteTransformFn(self):
        transform_output_dir = os.path.join(self.get_temp_dir(), 'output')

        with beam.Pipeline() as pipeline:
            # Create an empty directory for the source saved model dir.
            saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
            file_io.recursive_create_dir(saved_model_dir)
            saved_model_dir_pcoll = (
                pipeline
                | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
            metadata = beam_metadata_io.BeamDatasetMetadata(
                _TEST_METADATA_WITH_FUTURES, {
                    'a': pipeline | 'CreateA' >> beam.Create([3]),
                })

            _ = ((saved_model_dir_pcoll, metadata)
                 | transform_fn_io.WriteTransformFn(transform_output_dir))

        # Test reading with TFTransformOutput
        tf_transform_output = tft.TFTransformOutput(transform_output_dir)
        metadata = tf_transform_output.transformed_metadata
        self.assertEqual(metadata, _TEST_METADATA)

        transform_fn_dir = tf_transform_output.transform_savedmodel_dir
        self.assertTrue(file_io.file_exists(transform_fn_dir))
        self.assertTrue(file_io.is_directory(transform_fn_dir))
Example 6
def run(pipeline_options, known_args):
  pipeline = beam.Pipeline(options=pipeline_options)

  with impl.Context(known_args.transform_temp_dir):
    articles = (
        pipeline
        | 'Get Paths' >> beam.Create(get_paths(known_args.file_pattern))
        | 'Get Articles' >> beam.Map(get_articles)
        | 'Get Article' >> beam.FlatMap(lambda x: x)
    )

    dataset = (articles, get_metadata())

    transform_fn = (
        dataset
        | 'Analyse dataset' >> impl.AnalyzeDataset(preprocess_fn)
    )

    transformed_data_with_meta = (
        (dataset, transform_fn)
        | 'Transform dataset' >> impl.TransformDataset()
    )

    transformed_data, transformed_metadata = transformed_data_with_meta

    transform_fn | 'Export Transform Fn' >> transform_fn_io.WriteTransformFn(
        known_args.transform_export_dir)

    (
        transformed_data
        | 'Convert to Insertable data' >> beam.Map(to_bq_row)
        | 'Write to BigQuery table' >> beam.io.WriteToBigQuery(
            project=known_args.bq_project,
            dataset=known_args.bq_dataset,
            table=known_args.bq_table,
            schema=get_bigquery_schema(),
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
    )

    if known_args.enable_tfrecord:
      transformed_data | 'Write TFRecords' >> beam.io.tfrecordio.WriteToTFRecord(
          file_path_prefix='{0}/{1}'.format(known_args.tfrecord_export_dir, 'reuter'),
          file_name_suffix='.tfrecords',
          coder=tft_coders.example_proto_coder.ExampleProtoCoder(transformed_metadata.schema))

    if known_args.enable_debug:
      transformed_data | 'Debug Output' >> beam.io.textio.WriteToText(
          file_path_prefix=known_args.debug_output_prefix, file_name_suffix='.txt')


  job = pipeline.run()

  if pipeline_options.get_all_options()['runner'] == 'DirectRunner':
    job.wait_until_finish()
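If the pipeline above runs with enable_tfrecord set, the records it writes can later be parsed with the transformed schema that WriteTransformFn exported to transform_export_dir. A minimal sketch, assuming a TF 1.x runtime; the concrete paths are placeholders standing in for the known_args values:

import tensorflow as tf
import tensorflow_transform as tft

transform_export_dir = '/tmp/transform_export'  # placeholder for known_args.transform_export_dir
tfrecord_pattern = '/tmp/tfrecord_export/reuter*.tfrecords'  # placeholder for the TFRecord output

# Recover the transformed feature spec from the exported transform output.
tf_transform_output = tft.TFTransformOutput(transform_export_dir)
feature_spec = tf_transform_output.transformed_feature_spec()

# Build a tf.data pipeline over the TFRecords written by the Beam job.
dataset = (
    tf.data.TFRecordDataset(tf.gfile.Glob(tfrecord_pattern))
    .map(lambda record: tf.parse_single_example(record, feature_spec))
    .batch(32))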
Example 7
def main(argv=None):
    """Run preprocessing as a Dataflow pipeline.
    Args:
        argv (list): list of arguments
    """
    args = parse_arguments(sys.argv if argv is None else argv)

    if args.cloud:
        pipeline_options = get_cloud_pipeline_options()
    else:
        pipeline_options = None

    p = beam.Pipeline(options=pipeline_options)
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        # read data and join by key
        raw_data_input = (p
                          | 'ReadInputData' >> beam.io.ReadFromText(
                              TRAIN_INPUT_DATA, skip_header_lines=1)
                          | 'ParseInputCSV' >> beam.Map(converter_input.decode)
                          | 'ExtractBatchKeyIn' >> beam.Map(extract_batchkey))

        raw_data_output = (
            p
            | 'ReadOutputData' >> beam.io.ReadFromText(TRAIN_OUTPUT_DATA,
                                                       skip_header_lines=1)
            | 'ParseOutputCSV' >> beam.Map(converter_output.decode)
            | 'ExtractBatchKeyOut' >> beam.Map(extract_batchkey))

        raw_data = ((raw_data_input, raw_data_output)
                    | 'JoinData' >> beam.CoGroupByKey()
                    | 'RemoveKeys' >> beam.FlatMap(remove_keys))

        # analyse and transform dataset
        raw_dataset = (raw_data, input_metadata)
        transformed_dataset, transform_fn = (
            raw_dataset
            | 'AnalyzeAndTransform' >>
            beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
        transformed_data, transformed_metadata = transformed_dataset

        # save data and serialize TransformFn
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)
        _ = (transformed_data
             | 'EncodeData' >> beam.Map(transformed_data_coder.encode)
             | 'WriteData' >> tfrecordio.WriteToTFRecord(
                 os.path.join(TFRECORD_DIR, 'records')))
        _ = (transform_fn
             |
             "WriteTransformFn" >> transform_fn_io.WriteTransformFn(MODEL_DIR))

        p.run().wait_until_finish()
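The SavedModel that WriteTransformFn emits under MODEL_DIR can be re-applied to raw feature tensors at serving or training time. A minimal sketch in TF 1.x graph mode; MODEL_DIR and the 'some_raw_feature' placeholder are assumptions, since the real raw feature spec comes from input_metadata in the pipeline above:

import tensorflow as tf
import tensorflow_transform as tft

MODEL_DIR = '/tmp/model_dir'  # placeholder for the MODEL_DIR used above
tf_transform_output = tft.TFTransformOutput(MODEL_DIR)

# Raw feature tensors; in practice these must match the pipeline's input_metadata.
raw_features = {'some_raw_feature': tf.placeholder(tf.float32, shape=[None])}
transformed_features = tf_transform_output.transform_raw_features(raw_features)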
Example 8
    def build_graph(self):
        # Move a percentage of the train data to `.PPGRAPH_EXT` files, used for graph building.
        # num_lines = 0
        # for i in range(DATASET_NUM_SHARDS):
        #     _fname = '{}-{:05}-of-{:05}'.format(self.train_fname_out, i, self.config.DATASET_NUM_SHARDS)
        #     num_lines += sum(1 for _ in open(_fname))
        #     _fname_marked = '{}-{:05}-of-{:05}.{}'.format(self.train_fname_out, i, self.config.DATASET_NUM_SHARDS,
        #                                                   PPGRAPH_EXT)
        #     shutil.move(_fname, _fname_marked)
        #     if num_lines >= self.config.PPGRAPH_MAX_SAMPLES:
        #         break

        # Set up the preprocessing pipeline for analyzing the dataset. The analyze call is not
        # combined with the transform call because the transform call will be parallelized later.
        # When combined, this process ran on a single core and tended to cause OOM issues.
        pipeline = beam.Pipeline(runner=DirectRunner())

        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            # TODO: consider building the graph from only the train data (or a percentage of it).
            raw_train_data = (
                pipeline
                | 'ReadTrainDataFile' >> textio.ReadFromText(
                    'data/features' + '*' + 'shard' + '*', skip_header_lines=0)
                | 'DecodeTrainDataCSV' >> MapAndFilterErrors(
                    tft_coders.CsvCoder(
                        self.data_formatter.get_ordered_columns(),
                        self.data_formatter.get_raw_data_metadata().schema).
                    decode))

            # Combine data and schema into a dataset tuple.  Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data.
            # That is when to use vocabulary, scale_to_0_1 or sparse_to_dense ...
            transform_fn = (
                (raw_train_data, self.data_formatter.get_raw_data_metadata())
                | beam_impl.AnalyzeDataset(
                    PreprocessingFunction().transform_to_tfrecord))

            # Write SavedModel and metadata to two subdirectories of working_dir, given by
            # `transform_fn_io.TRANSFORM_FN_DIR` and `transform_fn_io.TRANSFORMED_METADATA_DIR` respectively.
            _ = (transform_fn
                 | 'WriteTransformGraph' >>
                 transform_fn_io.WriteTransformFn(TARGET_DIR))  # working dir

        # Run the Beam preprocessing pipeline.
        st = time.time()
        result = pipeline.run()
        result.wait_until_finish()
        self.logger.info(
            'Transformation graph built and written in {:.2f} sec'.format(
                time.time() - st))
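The transform step that build_graph deliberately defers could later run in its own pipeline by reading the analysis graph back from TARGET_DIR. A minimal sketch with placeholders: the 'x' feature spec and in-memory rows stand in for data decoded the same way as raw_train_data above, and TARGET_DIR must match the directory used by WriteTransformGraph:

import apache_beam as beam
import tensorflow as tf
from tensorflow_transform.beam import impl as beam_impl
from tensorflow_transform.beam.tft_beam_io import transform_fn_io
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema

TARGET_DIR = '/tmp/target_dir'  # placeholder for the working dir used above

# Placeholder raw metadata; in practice this is data_formatter.get_raw_data_metadata().
raw_metadata = dataset_metadata.DatasetMetadata(
    dataset_schema.from_feature_spec({'x': tf.FixedLenFeature([], tf.float32)}))

with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir='/tmp/tft_tmp'):
        raw_data = pipeline | 'CreateRawData' >> beam.Create([{'x': 1.0}, {'x': 2.0}])
        transform_fn = (
            pipeline
            | 'ReadTransformFn' >> transform_fn_io.ReadTransformFn(TARGET_DIR))
        transformed_data, transformed_metadata = (
            ((raw_data, raw_metadata), transform_fn)
            | 'Transform' >> beam_impl.TransformDataset())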
Example 9
def run_tft_pipeline(args):
    """
    This is where all the data we have available in our database is processed and 
    transformed into Tensorflow tfrecords for later training and testing.

    The code runs in distributed manner automatically in the engine choosen by
    the `runner` argument in input.
    """
    pipeline_options = build_pipeline_options(args)
    temp_tft_folder = (tempfile.mkdtemp(
        dir='/tmp/') if not args.tft_temp else args.tft_temp)
    tft_transform_folder = (tempfile.mkdtemp(
        dir='/tmp/') if not args.tft_transform else args.tft_transform)

    with beam.Pipeline(options=pipeline_options) as pipeline:
        with beam_impl.Context(temp_dir=temp_tft_folder):

            train_data = read_input_data(args, pipeline, 'train')

            write_total_distinct_keys_to_file(train_data, args.nitems_filename,
                                              'sku')

            train_dataset = (train_data, metadata.RAW_DATA_METADATA)
            (train_data, transformed_train_metadata), transform_fn = (
                train_dataset
                | beam_impl.AnalyzeAndTransformDataset(preprocess_fn))

            _ = (transform_fn
                 | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(tft_transform_folder))

            train_data = aggregate_transformed_data(train_data, 'train')

            write_tfrecords(train_data, metadata.OUTPUT_TRAIN_SCHEMA,
                            args.output_train_filename, 'output train')

            test_data = read_input_data(args, pipeline, 'test')

            test_dataset = (test_data, metadata.RAW_DATA_METADATA)

            (test_data,
             _) = ((test_dataset, transform_fn) | beam_impl.TransformDataset())

            test_data = aggregate_transformed_data(test_data, 'test')

            test_data = aggregate_final_test_data(train_data, test_data)

            write_tfrecords(test_data, metadata.OUTPUT_TEST_SCHEMA,
                            args.output_test_filename, 'output test')
Example 10
    def testWriteTransformFnIsRetryable(self):
        tft.test_case.skip_if_external_environment(
            'Retries are currently not available on this environment.')
        original_copy_tree_to_unique_temp_dir = (
            transform_fn_io._copy_tree_to_unique_temp_dir)

        def mock_copy_tree_to_unique_temp_dir(source, base_temp_dir_path):
            """Mocks transform_fn_io._copy_tree to fail the first time it is called by this test, thus forcing a retry which should succeed."""
            global _COPY_TREE_TO_UNIQUE_TEMP_DIR_CALLED
            if not _COPY_TREE_TO_UNIQUE_TEMP_DIR_CALLED:
                _COPY_TREE_TO_UNIQUE_TEMP_DIR_CALLED = True
                original_copy_tree_to_unique_temp_dir(source,
                                                      base_temp_dir_path)
                raise ArithmeticError('Some error')
            return original_copy_tree_to_unique_temp_dir(
                source, base_temp_dir_path)

        with self._makeTestPipeline() as pipeline:
            transform_output_dir = os.path.join(self.get_temp_dir(), 'output')
            # Create an empty directory for the source saved model dir.
            saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
            file_io.recursive_create_dir(saved_model_dir)
            saved_model_path = os.path.join(saved_model_dir, 'saved_model')
            with file_io.FileIO(saved_model_path, mode='w') as f:
                f.write('some content')
            saved_model_dir_pcoll = (
                pipeline
                | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
            # Combine test metadata with a dict of PCollections resolving futures.
            deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
                [test_metadata.COMPLETE_METADATA])
            metadata = beam_metadata_io.BeamDatasetMetadata(
                test_metadata.INCOMPLETE_METADATA, deferred_metadata)
            with mock.patch.object(transform_fn_io,
                                   '_copy_tree_to_unique_temp_dir',
                                   mock_copy_tree_to_unique_temp_dir):
                _ = ((saved_model_dir_pcoll, metadata)
                     | transform_fn_io.WriteTransformFn(transform_output_dir))

        # Test reading with TFTransformOutput
        tf_transform_output = tft.TFTransformOutput(transform_output_dir)
        metadata = tf_transform_output.transformed_metadata
        self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)

        transform_fn_dir = tf_transform_output.transform_savedmodel_dir
        self.assertTrue(file_io.file_exists(transform_fn_dir))
        self.assertTrue(file_io.is_directory(transform_fn_dir))
        # Check temp directory created by failed run was cleaned up.
        self.assertEqual(2, len(file_io.list_directory(transform_output_dir)))
Example 11
  def testWriteTransformFnIsIdempotent(self):
    transform_output_dir = os.path.join(self.get_temp_dir(), 'output')

    def mock_write_metadata_expand(unused_self, unused_metadata):
      raise ArithmeticError('Some error')

    with beam.Pipeline() as pipeline:
      # Create an empty directory for the source saved model dir.
      saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
      saved_model_dir_pcoll = (
          pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))

      with mock.patch.object(transform_fn_io.beam_metadata_io.WriteMetadata,
                             'expand', mock_write_metadata_expand):
        with self.assertRaisesRegexp(ArithmeticError, 'Some error'):
          _ = ((saved_model_dir_pcoll, object())
               | transform_fn_io.WriteTransformFn(transform_output_dir))

    self.assertFalse(file_io.file_exists(transform_output_dir))
Example 12
    def testTransformFnExportAndImportRoundtrip(self):
        transform_fn_dir = os.path.join(self.get_temp_dir(),
                                        'export_transform_fn')
        metadata_dir = os.path.join(self.get_temp_dir(), 'export_metadata')

        with beam.Pipeline() as p:

            def preprocessing_fn(inputs):
                return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

            metadata = self.toMetadata(
                {'x': tf.FixedLenFeature((), tf.float32, 0)})
            columns = p | 'CreateTrainingData' >> beam.Create([{
                'x': v
            } for v in [4, 1, 5, 2]])
            with beam_impl.Context(temp_dir=self.get_temp_dir()):
                _, transform_fn = (
                    (columns, metadata)
                    | 'Analyze and Transform' >>
                    beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

            _ = transform_fn | transform_fn_io.WriteTransformFn(
                transform_fn_dir)
            _ = metadata | beam_metadata_io.WriteMetadata(metadata_dir,
                                                          pipeline=p)

        with beam.Pipeline() as p:
            transform_fn = p | transform_fn_io.ReadTransformFn(transform_fn_dir)
            metadata = p | beam_metadata_io.ReadMetadata(metadata_dir)
            # Run transform_columns on some eval dataset.
            eval_data = p | 'CreateEvalData' >> beam.Create([{
                'x': v
            } for v in [6, 3]])
            transformed_eval_data, _ = (
                ((eval_data, metadata), transform_fn)
                | 'Transform' >> beam_impl.TransformDataset())
            expected_transformed_eval_data = [{
                'x_scaled': v
            } for v in [1.25, 0.5]]
            beam_test_util.assert_that(
                transformed_eval_data,
                beam_test_util.equal_to(expected_transformed_eval_data))
Example 13
def data_transformation(EPOCHS: int, STEPS: int, BATCH_SIZE: int, HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0]
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "column_names" not in _kale_directory_file_names:
        raise ValueError("column_names" + " does not exists in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "column_names"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "column_names" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    column_names = _kale_resource_load(os.path.join(
        _kale_data_directory, _kale_load_file_name))

    if "schema" not in _kale_directory_file_names:
        raise ValueError("schema" + " does not exists in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "schema"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "schema" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    schema = _kale_resource_load(os.path.join(
        _kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import os
    import shutil
    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io
    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(
        DATA_DIR, 'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = ['trip_start_hour',
                                'trip_start_day', 'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = [
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract', 'payment_type', 'company',
                          'pickup_community_area', 'dropoff_community_area']

    # allow nan values in these features.
    OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude', 'pickup_census_tract', 'dropoff_census_tract',
                         'company', 'trip_seconds', 'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    def to_dense(tensor):
        """Takes as input a SparseTensor and return a Tensor with correct default value
        Args:
          tensor: tf.SparseTensor
        Returns:
          tf.Tensor with default value
        """
        if not isinstance(tensor, tf.sparse.SparseTensor):
            return tensor
        if tensor.dtype == tf.string:
            default_value = ''
        elif tensor.dtype == tf.float32:
            default_value = 0.0
        elif tensor.dtype == tf.int32:
            default_value = 0
        else:
            raise ValueError(f"Tensor type not recognized: {tensor.dtype}")

        return tf.squeeze(tf.sparse_to_dense(tensor.indices,
                                             [tensor.dense_shape[0], 1],
                                             tensor.values, default_value=default_value), axis=1)
        # TODO: Update to below version
        # return tf.squeeze(tf.sparse.to_dense(tensor, default_value=default_value), axis=1)

    def preprocess_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.
        Args:
          inputs: map from feature keys to raw not-yet-transformed features.
        Returns:
          Map from string feature key to transformed feature operations.
        """
        outputs = {}
        for key in DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[key] = tft.scale_to_z_score(to_dense(inputs[key]))

        for key in VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            if inputs[key].dtype == tf.string:
                vocab_tensor = to_dense(inputs[key])
            else:
                vocab_tensor = tf.as_string(to_dense(inputs[key]))
            outputs[key] = tft.compute_and_apply_vocabulary(
                vocab_tensor, vocab_filename='vocab_' + key,
                top_k=VOCAB_SIZE, num_oov_buckets=OOV_SIZE)

        for key in BUCKET_FEATURE_KEYS:
            outputs[key] = tft.bucketize(
                to_dense(inputs[key]), FEATURE_BUCKET_COUNT)

        for key in CATEGORICAL_FEATURE_KEYS:
            outputs[key] = tf.cast(to_dense(inputs[key]), tf.int64)

        taxi_fare = to_dense(inputs[FARE_KEY])
        taxi_tip = to_dense(inputs[LABEL_KEY])
        # Test if the tip was > 20% of the fare.
        tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
        outputs[LABEL_KEY] = tf.logical_and(
            tf.logical_not(tf.math.is_nan(taxi_fare)),
            tf.greater(taxi_tip, tip_threshold))

        for key in outputs:
            if outputs[key].dtype == tf.bool:
                outputs[key] = tft.compute_and_apply_vocabulary(tf.as_string(outputs[key]),
                                                                vocab_filename='vocab_' + key)

        return outputs
    trns_output = os.path.join(DATA_DIR, "transformed")
    if os.path.exists(trns_output):
        shutil.rmtree(trns_output)

    tft_input_metadata = dataset_metadata.DatasetMetadata(schema)

    runner = 'DirectRunner'
    with beam.Pipeline(runner, options=None) as p:
        with beam_impl.Context(temp_dir=os.path.join(trns_output, 'tmp')):
            converter = CsvCoder(column_names, tft_input_metadata.schema)

            # READ TRAIN DATA
            train_data = (
                p
                | 'ReadTrainData' >> textio.ReadFromText(TRAIN_DATA, skip_header_lines=1)
                | 'DecodeTrainData' >> beam.Map(converter.decode))

            # TRANSFORM TRAIN DATA (and get transform_fn function)
            transformed_dataset, transform_fn = (
                (train_data, tft_input_metadata) | beam_impl.AnalyzeAndTransformDataset(preprocess_fn))
            transformed_data, transformed_metadata = transformed_dataset

            # SAVE TRANSFORMED TRAIN DATA
            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                os.path.join(trns_output, 'train'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            # READ EVAL DATA
            eval_data = (
                p
                | 'ReadEvalData' >> textio.ReadFromText(EVALUATION_DATA, skip_header_lines=1)
                | 'DecodeEvalData' >> beam.Map(converter.decode))

            # TRANSFORM EVAL DATA (using previously created transform_fn function)
            eval_dataset = (eval_data, tft_input_metadata)
            transformed_eval_data, transformed_metadata = (
                (eval_dataset, transform_fn) | beam_impl.TransformDataset())

            # SAVE EVAL DATA
            _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
                os.path.join(trns_output, 'eval'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            # SAVE transform_fn FUNCTION FOR LATER USE
            # TODO: check what the transform function (transform_fn) from the previous step contains
            _ = (transform_fn | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(trns_output))

            # SAVE TRANSFORMED METADATA
            metadata_io.write_metadata(
                metadata=tft_input_metadata,
                path=os.path.join(trns_output, 'metadata'))

    # -----------------------DATA SAVING START---------------------------------
    if "trns_output" in locals():
        _kale_resource_save(trns_output, os.path.join(
            _kale_data_directory, "trns_output"))
    else:
        print("_kale_resource_save: `trns_output` not found.")
Example 14
def transform_data(working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 value indices.

  Args:
    working_dir: Directory to read shuffled data from and write transformed data
        and metadata to.
  """

    with beam.Pipeline() as pipeline:
        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            coder = tft.coders.ExampleProtoCoder(RAW_DATA_METADATA.schema)
            train_data = (pipeline
                          | 'ReadTrain' >> tfrecordio.ReadFromTFRecord(
                              os.path.join(working_dir,
                                           SHUFFLED_TRAIN_DATA_FILEBASE + '*'))
                          | 'DecodeTrain' >> beam.Map(coder.decode))

            test_data = (pipeline
                         | 'ReadTest' >> tfrecordio.ReadFromTFRecord(
                             os.path.join(working_dir,
                                          SHUFFLED_TEST_DATA_FILEBASE + '*'))
                         | 'DecodeTest' >> beam.Map(coder.decode))

            def preprocessing_fn(inputs):
                """Preprocess input columns into transformed columns."""
                review = inputs[REVIEW_KEY]

                review_tokens = tf.string_split(review, DELIMITERS)
                review_indices = tft.string_to_int(review_tokens,
                                                   top_k=VOCAB_SIZE)
                # Add one for the oov bucket created by string_to_int.
                review_bow_indices, review_weight = tft.tfidf(
                    review_indices, VOCAB_SIZE + 1)
                return {
                    REVIEW_KEY: review_bow_indices,
                    REVIEW_WEIGHT_KEY: review_weight,
                    LABEL_KEY: inputs[LABEL_KEY]
                }

            (transformed_train_data, transformed_metadata), transform_fn = (
                (train_data, RAW_DATA_METADATA)
                | 'AnalyzeAndTransform' >>
                beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            transformed_test_data, _ = (
                ((test_data, RAW_DATA_METADATA), transform_fn)
                | 'Transform' >> beam_impl.TransformDataset())

            _ = (transformed_train_data
                 | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
                 | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                     os.path.join(working_dir,
                                  TRANSFORMED_TRAIN_DATA_FILEBASE)))

            _ = (
                transformed_test_data
                | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
                | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

            # Will write a SavedModel and metadata to two subdirectories of
            # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and
            # transform_fn_io.TRANSFORMED_METADATA_DIR respectively.
            _ = (transform_fn
                 | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(working_dir))
Example 15
def tftransform(
    pipeline_args,  # type: List[str]
    temp_location,  # type: str
    schema_file,  # type: str
    output_dir,  # type: str
    preprocessing_fn,  # type: Any
    training_data=None,  # type: Union[None, str]
    evaluation_data=None,  # type: Union[None, str]
    transform_fn_dir=None,  # type: Union[None, str]
    compression_type=None  # type: str
):  # type: (...) -> PipelineState
    """
    Generic tf.transform pipeline that takes tf.{example, record} training and evaluation
    datasets and outputs transformed data together with transform function Saved Model.

    :param pipeline_args: un-parsed Dataflow arguments
    :param temp_location: temporary location for dataflow job working dir
    :param schema_file: path to the raw feature schema text file
    :param output_dir: output dir for transformed data and function
    :param preprocessing_fn: tf.transform preprocessing function
    :param training_data: path to the training data
    :param evaluation_data: path to the evaluation data
    :param transform_fn_dir: dir to previously saved transformation function to apply
    :param compression_type: compression type for writing of tf.records
    :return: final state of the Beam pipeline
    """
    assert_not_empty_string(temp_location)
    assert_not_empty_string(schema_file)
    assert_not_empty_string(output_dir)
    assert_not_none(preprocessing_fn)

    if compression_type is None:
        compression_type = CompressionTypes.AUTO

    raw_feature_spec = schema_txt_file_to_feature_spec(schema_file)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)
    raw_data_coder = ExampleProtoCoder(raw_data_metadata.schema)

    transformed_train_output_dir = os.path.join(output_dir, "training")
    transformed_eval_output_dir = os.path.join(output_dir, "evaluation")

    if not any(i.startswith("--job_name") for i in pipeline_args):
        pipeline_args.append("--job_name=tf-transform-{}-{}".format(
            getpass.getuser(), int(time.time())))

    pipeline = beam.Pipeline(argv=pipeline_args)
    with beam_impl.Context(temp_dir=temp_location):
        if training_data is not None:
            # if training data is provided, transform_fn_dir will be ignored
            if transform_fn_dir is not None:
                warnings.warn(
                    "Transform_fn_dir is ignored because training_data is provided"
                )

            transform_fn_output = os.path.join(output_dir, "transform_fn",
                                               "saved_model.pb")
            if FileSystems.exists(transform_fn_output):
                raise ValueError("Transform fn already exists at %s!" %
                                 transform_fn_output)

            # compute the transform_fn and apply to the training data
            raw_train_data = (pipeline
                              | "ReadTrainData" >> tfrecordio.ReadFromTFRecord(
                                  training_data, coder=raw_data_coder))

            ((transformed_train_data, transformed_train_metadata),
             transform_fn) = (
                 (raw_train_data, raw_data_metadata)
                 | ("AnalyzeAndTransformTrainData" >>
                    beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
             )  # noqa: E501

            _ = (  # noqa: F841
                transform_fn
                | "WriteTransformFn" >>
                transform_fn_io.WriteTransformFn(output_dir))

            transformed_train_coder = ExampleProtoCoder(
                transformed_train_metadata.schema)
            _ = (  # noqa: F841
                transformed_train_data
                | "WriteTransformedTrainData" >> tfrecordio.WriteToTFRecord(
                    os.path.join(transformed_train_output_dir,
                                 "part"),  # noqa: E501
                    coder=transformed_train_coder,  # noqa: E501
                    compression_type=compression_type,  # noqa: E501
                    file_name_suffix=".tfrecords"))  # noqa: E501
        else:
            if transform_fn_dir is None:
                raise ValueError(
                    "Either training_data or transform_fn_dir needs to be provided"
                )
            # load the transform_fn
            transform_fn = pipeline | transform_fn_io.ReadTransformFn(
                transform_fn_dir)

        if evaluation_data is not None:
            # if evaluation_data exists, apply the transform_fn to the evaluation data
            raw_eval_data = (pipeline
                             | "ReadEvalData" >> tfrecordio.ReadFromTFRecord(
                                 evaluation_data, coder=raw_data_coder))

            (transformed_eval_data, transformed_eval_metadata) = (
                ((raw_eval_data, raw_data_metadata), transform_fn)
                | "TransformEvalData" >> beam_impl.TransformDataset())

            transformed_eval_coder = ExampleProtoCoder(
                transformed_eval_metadata.schema)
            _ = (  # noqa: F841
                transformed_eval_data
                | "WriteTransformedEvalData" >> tfrecordio.WriteToTFRecord(
                    os.path.join(transformed_eval_output_dir,
                                 "part"),  # noqa: E501
                    coder=transformed_eval_coder,  # noqa: E501
                    compression_type=compression_type,  # noqa: E501
                    file_name_suffix=".tfrecords"))  # noqa: E501
    result = pipeline.run().wait_until_finish()

    return result
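A hypothetical invocation of the tftransform() helper above; the paths, file patterns, and the toy preprocessing_fn are placeholders, and the schema file is assumed to exist in the text-serialized format the helper expects:

import tensorflow_transform as tft

def preprocessing_fn(inputs):
    # Placeholder: scale a hypothetical numeric feature 'x' to [0, 1].
    return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

state = tftransform(
    pipeline_args=['--runner=DirectRunner'],
    temp_location='/tmp/tft-temp',
    schema_file='/tmp/schema.txt',           # assumed to exist
    output_dir='/tmp/tft-output',
    preprocessing_fn=preprocessing_fn,
    training_data='/tmp/train-*.tfrecords',  # assumed tf.Example records
    evaluation_data='/tmp/eval-*.tfrecords')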
Example 16
def transform_data(train_neg_filepattern, train_pos_filepattern,
                   test_neg_filepattern, test_pos_filepattern,
                   transformed_train_filebase, transformed_test_filebase,
                   transform_fn_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 value indices.

  Args:
    train_neg_filepattern: Filepattern for training data negative examples
    train_pos_filepattern: Filepattern for training data positive examples
    test_neg_filepattern: Filepattern for test data negative examples
    test_pos_filepattern: Filepattern for test data positive examples
    transformed_train_filebase: Base filename for transformed training data
        shards
    transformed_test_filebase: Base filename for transformed test data shards
    transform_fn_dir: Directory where metadata for transform function should be
        written
  """

  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # pylint: disable=no-value-for-parameter
      train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData(
          (train_neg_filepattern, train_pos_filepattern))
      # pylint: disable=no-value-for-parameter
      test_data = pipeline | 'ReadTest' >> ReadAndShuffleData(
          (test_neg_filepattern, test_pos_filepattern))

      metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
          REVIEW_COLUMN: dataset_schema.ColumnSchema(
              tf.string, [], dataset_schema.FixedColumnRepresentation()),
          LABEL_COLUMN: dataset_schema.ColumnSchema(
              tf.int64, [], dataset_schema.FixedColumnRepresentation()),
      }))

      def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        review = inputs[REVIEW_COLUMN]

        review_tokens = tf.string_split(review, DELIMITERS)
        review_indices = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE)
        # Add one for the oov bucket created by string_to_int.
        review_bow_indices, review_weight = tft.tfidf(review_indices,
                                                      VOCAB_SIZE + 1)
        return {
            REVIEW_COLUMN: review_bow_indices,
            REVIEW_WEIGHT: review_weight,
            LABEL_COLUMN: inputs[LABEL_COLUMN]
        }

      (transformed_train_data, transformed_metadata), transform_fn = (
          (train_data, metadata)
          | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(
              preprocessing_fn))

      transformed_test_data, _ = (
          ((test_data, metadata), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      _ = (
          transformed_train_data
          | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
              transformed_train_filebase,
              coder=example_proto_coder.ExampleProtoCoder(
                  transformed_metadata.schema)))

      _ = (
          transformed_test_data
          | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
              transformed_test_filebase,
              coder=example_proto_coder.ExampleProtoCoder(
                  transformed_metadata.schema)))

      _ = (transform_fn
           | 'WriteTransformFn' >>
           transform_fn_io.WriteTransformFn(transform_fn_dir))
Example 17
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None):
    """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """
    def preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
        outputs = {}
        for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
                _fill_in_missing(inputs[key]))

        for key in taxi.VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            outputs[taxi.transformed_name(
                key)] = transform.compute_and_apply_vocabulary(
                    _fill_in_missing(inputs[key]),
                    top_k=taxi.VOCAB_SIZE,
                    num_oov_buckets=taxi.OOV_SIZE)

        for key in taxi.BUCKET_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = transform.bucketize(
                _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

        for key in taxi.CATEGORICAL_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

        # Was this passenger a big tipper?
        taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
        tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
        outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
            tf.is_nan(taxi_fare),
            tf.cast(tf.zeros_like(taxi_fare), tf.int64),
            # Test if the tip was > 20% of the fare.
            tf.cast(tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
                    tf.int64))

        return outputs

    schema = taxi.read_schema(schema_file)
    raw_feature_spec = taxi.get_raw_feature_spec(schema)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

    with beam.Pipeline(argv=pipeline_args) as pipeline:
        with beam_impl.Context(temp_dir=working_dir):
            if input_handle.lower().endswith('csv'):
                csv_coder = taxi.make_csv_coder(schema)
                raw_data = (pipeline
                            | 'ReadFromText' >> beam.io.ReadFromText(
                                input_handle, skip_header_lines=1)
                            | 'ParseCSV' >> beam.Map(csv_coder.decode))
            else:
                query = taxi.make_sql(input_handle, max_rows, for_eval=False)
                raw_data = (
                    pipeline
                    | 'ReadBigQuery' >> beam.io.Read(
                        beam.io.BigQuerySource(query=query,
                                               use_standard_sql=True))
                    |
                    'CleanData' >> beam.Map(taxi.clean_raw_data_dict,
                                            raw_feature_spec=raw_feature_spec))

            if transform_dir is None:
                transform_fn = (
                    (raw_data, raw_data_metadata)
                    |
                    ('Analyze' >> beam_impl.AnalyzeDataset(preprocessing_fn)))

                _ = (transform_fn
                     | ('WriteTransformFn' >>
                        transform_fn_io.WriteTransformFn(working_dir)))
            else:
                transform_fn = pipeline | transform_fn_io.ReadTransformFn(
                    transform_dir)

            # Shuffling the data before materialization will improve Training
            # effectiveness downstream.
            shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle(
            )

            (transformed_data, transformed_metadata) = (
                ((shuffled_data, raw_data_metadata), transform_fn)
                | 'Transform' >> beam_impl.TransformDataset())

            coder = example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema)
            _ = (transformed_data
                 | 'SerializeExamples' >> beam.Map(coder.encode)
                 | 'WriteExamples' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir, outfile_prefix),
                     file_name_suffix='.gz'))
Example 18
  def assertAnalyzeAndTransformResults(self,
                                       input_data,
                                       input_metadata,
                                       preprocessing_fn,
                                       expected_data=None,
                                       expected_metadata=None,
                                       only_check_core_metadata=False,
                                       expected_vocab_file_contents=None,
                                       expected_asset_file_contents=None,
                                       test_data=None,
                                       desired_batch_size=None):
    """Assert that input data and metadata is transformed as expected.

    This methods asserts transformed data and transformed metadata match
    with expected_data and expected_metadata.

    Args:
      input_data: A sequence of dicts whose values are
          either strings, lists of strings, numeric types or a pair of those.
      input_metadata: DatasetMetadata describing input_data.
      preprocessing_fn: A function taking a dict of tensors and returning
          a dict of tensors.
      expected_data: (optional) A dataset with the same type constraints as
          input_data, but representing the output after transformation.
          If supplied, transformed data is asserted to be equal.
      expected_metadata: (optional) DatasetMetadata describing the transformed
          data. If supplied, transformed metadata is asserted to be equal.
      only_check_core_metadata: A boolean indicating whether all elements of
          the transformed metadata are asserted to be equal to the expected
          metadata. If True, only transformed feature names, dtypes and
          representations are asserted.
      expected_vocab_file_contents: (optional) A dictionary from vocab filenames
          to their expected content as a list of text lines.  Values should be
          the expected result of calling f.readlines() on the given asset files.
      expected_asset_file_contents: deprecated.  Use
          expected_vocab_file_contents.
      test_data: (optional) If this is provided then instead of calling
          AnalyzeAndTransformDataset with input_data, this function will call
          AnalyzeDataset with input_data and TransformDataset with test_data.
          Note that this is the case even if input_data and test_data are equal.
          test_data should also conform to input_metadata.
      desired_batch_size: (optional) A batch size to batch elements by. If not
          provided, a batch size will be computed automatically.
    Raises:
      AssertionError: if the expected data does not match the results of
          transforming input_data according to preprocessing_fn, or
          (if provided) if the expected metadata does not match.
      ValueError: if expected_vocab_file_contents and
          expected_asset_file_contents are both set.
    """
    if (expected_vocab_file_contents is not None and
        expected_asset_file_contents is not None):
      raise ValueError('only one of expected_vocab_file_contents and '
                       'expected_asset_file_contents should be set')
    elif expected_asset_file_contents is not None:
      tf.logging.warn('expected_asset_file_contents is deprecated, use '
                      'expected_vocab_file_contents')

    expected_vocab_file_contents = (
        expected_vocab_file_contents or expected_asset_file_contents or {})
    del expected_asset_file_contents

    # Note: we don't separately test AnalyzeDataset and TransformDataset as
    # AnalyzeAndTransformDataset currently simply composes these two
    # transforms.  If in future versions of the code, the implementation
    # differs, we should also run AnalyzeDataset and TransformDataset composed.
    temp_dir = self.get_temp_dir()
    with beam_impl.Context(
        temp_dir=temp_dir, desired_batch_size=desired_batch_size):
      if test_data is None:
        (transformed_data, transformed_metadata), transform_fn = (
            (input_data, input_metadata)
            | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      else:
        transform_fn = ((input_data, input_metadata)
                        | beam_impl.AnalyzeDataset(preprocessing_fn))
        transformed_data, transformed_metadata = (
            ((test_data, input_metadata), transform_fn)
            | beam_impl.TransformDataset())

      # Write transform_fn so we can test its assets
      if expected_vocab_file_contents:
        _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

    if expected_data is not None:
      self.assertDataCloseOrEqual(expected_data, transformed_data)

    if expected_metadata:
      # Now that the pipeline has run, transformed_metadata.deferred_metadata
      # should be a list containing a single DatasetMetadata with the full
      # metadata.
      assert len(transformed_metadata.deferred_metadata) == 1
      transformed_metadata = transformed_metadata.deferred_metadata[0]

      if only_check_core_metadata:
        # preprocessing_fn may add metadata to column schema only relevant to
        # internal implementation such as vocabulary_file. As such, only check
        # feature names, dtypes and representations are as expected.
        self.assertSameElements(
            transformed_metadata.schema.column_schemas.keys(),
            expected_metadata.schema.column_schemas.keys())
        for k, v in six.iteritems(transformed_metadata.schema.column_schemas):
          expected_schema = expected_metadata.schema.column_schemas[k]
          self.assertEqual(expected_schema.representation, v.representation,
                           "representation doesn't match for feature '%s'" % k)
          self.assertEqual(expected_schema.domain.dtype, v.domain.dtype,
                           "dtype doesn't match for feature '%s'" % k)
      else:
        # Check the entire DatasetMetadata is as expected.
        # Use extra assertEqual for schemas, since full metadata assertEqual
        # error message is not conducive to debugging.
        self.assertEqual(expected_metadata.schema.column_schemas,
                         transformed_metadata.schema.column_schemas)
        self.assertEqual(expected_metadata, transformed_metadata)

    tf_transform_output = tft.TFTransformOutput(temp_dir)
    for filename, file_contents in six.iteritems(expected_vocab_file_contents):
      full_filename = tf_transform_output.vocabulary_file_by_name(filename)
      with tf.gfile.Open(full_filename) as f:
        self.assertEqual(f.readlines(), file_contents)
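A hypothetical test method built on the helper above; it would live in a subclass of the test case that defines assertAnalyzeAndTransformResults, with tf, tft, dataset_metadata and dataset_schema imported at module level as in the other examples. The 'x' feature spec and scale_to_0_1 preprocessing are placeholders:

  def testScaleTo01(self):
    input_data = [{'x': 1.0}, {'x': 2.0}, {'x': 3.0}]
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(
            {'x': tf.FixedLenFeature([], tf.float32)}))

    def preprocessing_fn(inputs):
      return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

    # scale_to_0_1 maps the min to 0.0 and the max to 1.0.
    expected_data = [{'x_scaled': 0.0}, {'x_scaled': 0.5}, {'x_scaled': 1.0}]
    self.assertAnalyzeAndTransformResults(
        input_data, input_metadata, preprocessing_fn, expected_data)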
Example 19
    def assertAnalyzeAndTransformResults(self,
                                         input_data,
                                         input_metadata,
                                         preprocessing_fn,
                                         expected_data=None,
                                         expected_metadata=None,
                                         only_check_core_metadata=False,
                                         expected_vocab_file_contents=None,
                                         expected_asset_file_contents=None,
                                         test_data=None,
                                         desired_batch_size=None,
                                         beam_pipeline=None):
        """Assert that input data and metadata is transformed as expected.

    This methods asserts transformed data and transformed metadata match
    with expected_data and expected_metadata.

    Args:
      input_data: A sequence of dicts whose values are
          either strings, lists of strings, numeric types or a pair of those.
      input_metadata: DatasetMetadata describing input_data.
      preprocessing_fn: A function taking a dict of tensors and returning
          a dict of tensors.
      expected_data: (optional) A dataset with the same type constraints as
          input_data, but representing the output after transformation.
          If supplied, transformed data is asserted to be equal.
      expected_metadata: (optional) DatasetMetadata describing the transformed
          data. If supplied, transformed metadata is asserted to be equal.
      only_check_core_metadata: A boolean indicating whether only the core
          transformed metadata is checked. If True, only the transformed
          feature names, dtypes and representations are asserted; otherwise
          the full transformed metadata is asserted to equal expected_metadata.
      expected_vocab_file_contents: (optional) A dictionary from vocab filenames
          to their expected content as a list of text lines or a list of tuples
          of frequency and text. Values should be the expected result of calling
          f.readlines() on the given asset files.
      expected_asset_file_contents: deprecated.  Use
          expected_vocab_file_contents.
      test_data: (optional) If this is provided then instead of calling
          AnalyzeAndTransformDataset with input_data, this function will call
          AnalyzeDataset with input_data and TransformDataset with test_data.
          Note that this is the case even if input_data and test_data are equal.
          test_data should also conform to input_metadata.
      desired_batch_size: (optional) A batch size to batch elements by. If not
          provided, a batch size will be computed automatically.
      beam_pipeline: (optional) A Beam Pipeline to use in this test.
    Raises:
      AssertionError: if the expected data does not match the results of
          transforming input_data according to preprocessing_fn, or
          (if provided) if the expected metadata does not match.
      ValueError: if expected_vocab_file_contents and
          expected_asset_file_contents are both set.
    """
        if (expected_vocab_file_contents is not None
                and expected_asset_file_contents is not None):
            raise ValueError('only one of expected_vocab_file_contents and '
                             'expected_asset_file_contents should be set')
        elif expected_asset_file_contents is not None:
            tf.logging.warn('expected_asset_file_contents is deprecated, use '
                            'expected_vocab_file_contents')

        expected_vocab_file_contents = (expected_vocab_file_contents
                                        or expected_asset_file_contents or {})
        del expected_asset_file_contents

        # Note: we don't separately test AnalyzeDataset and TransformDataset as
        # AnalyzeAndTransformDataset currently simply composes these two
        # transforms.  If in future versions of the code, the implementation
        # differs, we should also run AnalyzeDataset and TransformDataset composed.
        temp_dir = tempfile.mkdtemp(prefix=self._testMethodName,
                                    dir=self.get_temp_dir())
        with beam_pipeline or beam.Pipeline(
                runner=self._makeRunner()) as pipeline:
            with beam_impl.Context(temp_dir=temp_dir,
                                   desired_batch_size=desired_batch_size):
                input_data = pipeline | 'CreateInput' >> beam.Create(
                    input_data)
                if test_data is None:
                    (transformed_data, transformed_metadata), transform_fn = (
                        (input_data, input_metadata)
                        |
                        beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
                else:
                    transform_fn = ((input_data, input_metadata)
                                    |
                                    beam_impl.AnalyzeDataset(preprocessing_fn))
                    test_data = pipeline | 'CreateTest' >> beam.Create(
                        test_data)
                    transformed_data, transformed_metadata = (
                        ((test_data, input_metadata), transform_fn)
                        | beam_impl.TransformDataset())

                # Write transform_fn so we can test its assets
                _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

                if expected_data is not None:
                    transformed_data_coder = tft.coders.ExampleProtoCoder(
                        transformed_metadata.schema)

                    transformed_data_path = os.path.join(
                        temp_dir, 'transformed_data')
                    _ = (transformed_data
                         | beam.Map(transformed_data_coder.encode)
                         | beam.io.tfrecordio.WriteToTFRecord(
                             transformed_data_path, shard_name_template=''))

        if expected_data is not None:
            examples = tf.python_io.tf_record_iterator(
                path=transformed_data_path)
            transformed_data = [
                transformed_data_coder.decode(x) for x in examples
            ]
            self.assertDataCloseOrEqual(expected_data, transformed_data)

        tf_transform_output = tft.TFTransformOutput(temp_dir)
        if expected_metadata:
            transformed_metadata = tf_transform_output.transformed_metadata

            if only_check_core_metadata:
                # preprocessing_fn may add metadata to the column schema that is only
                # relevant to the internal implementation (e.g. vocabulary_file). As
                # such, only check that feature names, dtypes and representations are
                # as expected.
                self.assertSameElements(
                    transformed_metadata.schema.column_schemas.keys(),
                    expected_metadata.schema.column_schemas.keys())
                for k, v in six.iteritems(
                        transformed_metadata.schema.column_schemas):
                    expected_schema = expected_metadata.schema.column_schemas[
                        k]
                    self.assertEqual(
                        expected_schema.representation, v.representation,
                        "representation doesn't match for feature '%s'" % k)
                    self.assertEqual(
                        expected_schema.domain.dtype, v.domain.dtype,
                        "dtype doesn't match for feature '%s'" % k)
            else:
                # Check the entire DatasetMetadata is as expected.
                # Use extra assertEqual for schemas, since full metadata assertEqual
                # error message is not conducive to debugging.
                self.assertEqual(expected_metadata.schema.column_schemas,
                                 transformed_metadata.schema.column_schemas)
                self.assertEqual(expected_metadata, transformed_metadata)

        for filename, file_contents in six.iteritems(
                expected_vocab_file_contents):
            full_filename = tf_transform_output.vocabulary_file_by_name(
                filename)
            with tf.gfile.Open(full_filename) as f:
                file_lines = f.readlines()

                # Store frequency case.
                if isinstance(file_contents[0], tuple):
                    word_and_frequency_list = []
                    for content in file_lines:
                        frequency, word = content.split(' ', 1)
                        word_and_frequency_list.append(
                            (word.strip('\n'), float(frequency.strip('\n'))))
                    file_words, file_frequencies = zip(*word_and_frequency_list)
                    expected_words, expected_frequencies = zip(*file_contents)
                    self.assertAllEqual(file_words, expected_words)
                    np.testing.assert_almost_equal(file_frequencies,
                                                   expected_frequencies)
                else:
                    file_lines = [
                        content.strip('\n') for content in file_lines
                    ]
                    self.assertAllEqual(file_lines, file_contents)
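    # The helper above is typically driven by small unit tests in the same
    # class.  The following is only an illustrative sketch: the feature name
    # 'x', the toy values and _toy_preprocessing_fn are hypothetical, and it
    # assumes `tf`, `tft`, `dataset_metadata` and `dataset_schema` are imported
    # as in the surrounding test module (TF1-era APIs).
    def testScaleToZeroOneSketch(self):
        def _toy_preprocessing_fn(inputs):
            # Full-pass analyzer: rescale 'x' to [0, 1] over the whole dataset.
            return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

        input_data = [{'x': 1.0}, {'x': 2.0}, {'x': 3.0}]
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec(
                {'x': tf.FixedLenFeature([], tf.float32)}))
        expected_data = [{'x_scaled': 0.0}, {'x_scaled': 0.5}, {'x_scaled': 1.0}]
        self.assertAnalyzeAndTransformResults(
            input_data, input_metadata, _toy_preprocessing_fn, expected_data)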
def preprocess(in_test_mode):
  import os
  import os.path
  import tempfile
  from apache_beam.io import tfrecordio
  from tensorflow_transform.coders import example_proto_coder
  from tensorflow_transform.tf_metadata import dataset_metadata
  from tensorflow_transform.tf_metadata import dataset_schema
  from tensorflow_transform.beam import tft_beam_io
  from tensorflow_transform.beam.tft_beam_io import transform_fn_io

  job_name = 'preprocess-taxi-features' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')    
  if in_test_mode:
    import shutil
    print('Launching local job ... hang on')
    OUTPUT_DIR = './preproc_tft'
    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
    EVERY_N = 100000
  else:
    print('Launching Dataflow job {} ... hang on'.format(job_name))
    OUTPUT_DIR = 'gs://{0}/taxifare/preproc_tft/'.format(BUCKET)
    import subprocess
    subprocess.call('gsutil rm -r {}'.format(OUTPUT_DIR).split())
    EVERY_N = 10000
    
  options = {
    'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
    'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
    'job_name': job_name,
    'project': PROJECT,
    'max_num_workers': 24,
    'teardown_policy': 'TEARDOWN_ALWAYS',
    'no_save_main_session': True,
    'requirements_file': 'requirements.txt'
  }
  opts = beam.pipeline.PipelineOptions(flags=[], **options)
  if in_test_mode:
    RUNNER = 'DirectRunner'
  else:
    RUNNER = 'DataflowRunner'

  # set up metadata
  raw_data_schema = {
    colname : dataset_schema.ColumnSchema(tf.string, [], dataset_schema.FixedColumnRepresentation())
                   for colname in 'dayofweek,key'.split(',')
  }
  raw_data_schema.update({
      colname : dataset_schema.ColumnSchema(tf.float32, [], dataset_schema.FixedColumnRepresentation())
                   for colname in 'fare_amount,pickuplon,pickuplat,dropofflon,dropofflat'.split(',')
    })
  raw_data_schema.update({
      colname : dataset_schema.ColumnSchema(tf.int64, [], dataset_schema.FixedColumnRepresentation())
                   for colname in 'hourofday,passengers'.split(',')
    })
  raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema(raw_data_schema))

  # run Beam  
  with beam.Pipeline(RUNNER, options=opts) as p:
    with beam_impl.Context(temp_dir=os.path.join(OUTPUT_DIR, 'tmp')):
      # save the raw data metadata
      _ = (raw_data_metadata
        | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
            os.path.join(OUTPUT_DIR, 'metadata/rawdata_metadata'),
            pipeline=p))
      
      # analyze and transform training       
      raw_data = (p 
        | 'train_read' >> beam.io.Read(beam.io.BigQuerySource(query=create_query(1, EVERY_N), use_standard_sql=True))
        | 'train_filter' >> beam.Filter(is_valid))

      raw_dataset = (raw_data, raw_data_metadata)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocess_tft))
      transformed_data, transformed_metadata = transformed_dataset
      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'train'),
          file_name_suffix='.gz',
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))
      
      # transform eval data
      raw_test_data = (p 
        | 'eval_read' >> beam.io.Read(beam.io.BigQuerySource(query=create_query(2, EVERY_N), use_standard_sql=True))
        | 'eval_filter' >> beam.Filter(is_valid))
      
      raw_test_dataset = (raw_test_data, raw_data_metadata)
      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      transformed_test_data, _ = transformed_test_dataset
      _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'eval'),
          file_name_suffix='.gz',
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))
      _ = (transform_fn
           | 'WriteTransformFn' >>
           transform_fn_io.WriteTransformFn(os.path.join(OUTPUT_DIR, 'metadata')))
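# Sketch (not part of the original script): the artifacts written above can
# later be consumed at training time through tft.TFTransformOutput.  The
# metadata_dir below is assumed to be the same directory passed to
# WriteTransformFn, and the tf.io / tf.data calls assume a TF 1.14+ API.
def make_transformed_input_fn_sketch(metadata_dir, tfrecord_pattern, batch_size=512):
  import tensorflow as tf
  import tensorflow_transform as tft

  tf_transform_output = tft.TFTransformOutput(metadata_dir)
  feature_spec = tf_transform_output.transformed_feature_spec()

  def input_fn():
    files = tf.io.gfile.glob(tfrecord_pattern)
    dataset = (
        tf.data.TFRecordDataset(files, compression_type='GZIP')
        .batch(batch_size)
        .map(lambda records: tf.io.parse_example(records, feature_spec)))
    return dataset

  return input_fn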
Esempio n. 21
0
def run(input_feature_spec,
        labels,
        feature_extraction,
        feature_scaling=None,
        eval_percent=20.0,
        beam_options=None,
        work_dir=None):
    """Runs the whole preprocessing step.

  This runs the feature extraction PTransform, validates that the data conforms
  to the schema provided, normalizes the features, and splits the dataset into
  a training and evaluation dataset.
  """

    # Populate optional arguments
    if not feature_scaling:
        feature_scaling = lambda inputs: inputs

    # Type checking
    if not isinstance(labels, list):
        raise ValueError('`labels` must be list(str). '
                         'Given: {} {}'.format(labels, type(labels)))

    if not isinstance(feature_extraction, beam.PTransform):
        raise ValueError('`feature_extraction` must be {}. '
                         'Given: {} {}'.format(beam.PTransform,
                                               feature_extraction,
                                               type(feature_extraction)))

    if not callable(feature_scaling):
        raise ValueError('`feature_scaling` must be callable. '
                         'Given: {} {}'.format(feature_scaling,
                                               type(feature_scaling)))

    if beam_options and not isinstance(beam_options, PipelineOptions):
        raise ValueError('`beam_options` must be {}. '
                         'Given: {} {}'.format(PipelineOptions, beam_options,
                                               type(beam_options)))

    if not work_dir:
        work_dir = tempfile.mkdtemp(prefix='tensorflow-preprocessing')

    tft_temp_dir = os.path.join(work_dir, 'tft-temp')
    train_dataset_dir = os.path.join(work_dir, 'train-dataset')
    eval_dataset_dir = os.path.join(work_dir, 'eval-dataset')

    transform_fn_dir = os.path.join(work_dir, transform_fn_io.TRANSFORM_FN_DIR)
    if tf.io.gfile.exists(transform_fn_dir):
        tf.io.gfile.rmtree(transform_fn_dir)

    # [START dataflow_molecules_create_pipeline]
    # Build and run a Beam Pipeline
    with beam.Pipeline(options=beam_options) as p, \
         beam_impl.Context(temp_dir=tft_temp_dir):
        # [END dataflow_molecules_create_pipeline]

        # [START dataflow_molecules_feature_extraction]
        # Transform and validate the input data matches the input schema
        dataset = (
            p
            | 'Feature extraction' >> feature_extraction
            # [END dataflow_molecules_feature_extraction]
            # [START dataflow_molecules_validate_inputs]
            | 'Validate inputs' >> beam.ParDo(
                ValidateInputData(input_feature_spec)))
        # [END dataflow_molecules_validate_inputs]

        # [START dataflow_molecules_analyze_and_transform_dataset]
        # Apply the tf.Transform preprocessing_fn
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec(input_feature_spec))

        dataset_and_metadata, transform_fn = (
            (dataset, input_metadata)
            | 'Feature scaling' >>
            beam_impl.AnalyzeAndTransformDataset(feature_scaling))
        dataset, metadata = dataset_and_metadata
        # [END dataflow_molecules_analyze_and_transform_dataset]

        # [START dataflow_molecules_split_to_train_and_eval_datasets]
        # Split the dataset into a training set and an evaluation set
        assert 0 < eval_percent < 100, 'eval_percent must be in the range (0-100)'
        train_dataset, eval_dataset = (
            dataset
            | 'Split dataset' >> beam.Partition(
                lambda elem, _: int(random.uniform(0, 100) < eval_percent), 2))
        # [END dataflow_molecules_split_to_train_and_eval_datasets]

        # [START dataflow_molecules_write_tfrecords]
        # Write the datasets as TFRecords
        coder = example_proto_coder.ExampleProtoCoder(metadata.schema)

        train_dataset_prefix = os.path.join(train_dataset_dir, 'part')
        _ = (train_dataset
             | 'Write train dataset' >> tfrecordio.WriteToTFRecord(
                 train_dataset_prefix, coder))

        eval_dataset_prefix = os.path.join(eval_dataset_dir, 'part')
        _ = (eval_dataset
             | 'Write eval dataset' >> tfrecordio.WriteToTFRecord(
                 eval_dataset_prefix, coder))

        # Write the transform_fn
        _ = (transform_fn
             |
             'Write transformFn' >> transform_fn_io.WriteTransformFn(work_dir))
        # [END dataflow_molecules_write_tfrecords]

    return PreprocessData(input_feature_spec, labels,
                          train_dataset_prefix + '*',
                          eval_dataset_prefix + '*')
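# PreprocessData is returned above but not defined in this snippet.  A minimal
# value container compatible with the call above might look like this (the
# field names are inferred from the constructor arguments, so treat them as
# assumptions):
import collections

PreprocessData = collections.namedtuple('PreprocessData', [
    'input_feature_spec',
    'labels',
    'train_files_pattern',
    'eval_files_pattern',
])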
Esempio n. 22
0
def transform_data(train_data_file, test_data_file, working_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 indices by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
  """

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    # Since we are modifying some features and leaving others unchanged, we
    # start by setting `outputs` to a copy of `inputs`.
    outputs = inputs.copy()

    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_FEATURE_KEYS:
      outputs[key] = tft.scale_to_0_1(outputs[key])

    # For all categorical columns except the label column, we generate a
    # vocabulary but do not modify the feature.  This vocabulary is instead
    # used in the trainer, by means of a feature column, to convert the feature
    # from a string to an integer id.
    for key in CATEGORICAL_FEATURE_KEYS:
      tft.vocabulary(inputs[key], vocab_filename=key)

    # For the label column we provide the mapping from string to index.
    table = tf.contrib.lookup.index_table_from_tensor(['>50K', '<=50K'])
    outputs[LABEL_KEY] = table.lookup(outputs[LABEL_KEY])

    return outputs

  # The "with" block will create a pipeline, and run that pipeline at the exit
  # of the block.
  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # Create a coder to read the census data with the schema.  To do this we
      # need to list all columns in order since the schema doesn't specify the
      # order of columns in the csv.
      ordered_columns = [
          'age', 'workclass', 'fnlwgt', 'education', 'education-num',
          'marital-status', 'occupation', 'relationship', 'race', 'sex',
          'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
          'label'
      ]
      converter = tft.coders.CsvCoder(ordered_columns, RAW_DATA_METADATA.schema)

      # Read in raw data and convert using CSV converter.  Note that we apply
      # some Beam transformations here, which will not be encoded in the TF
      # graph since we don't do them from within tf.Transform's methods
      # (AnalyzeDataset, TransformDataset etc.).  These transformations are just
      # to get data into a format that the CSV converter can read, in particular
      # removing spaces after commas.
      #
      # We use MapAndFilterErrors instead of Map to filter out decode errors in
      # convert.decode which should only occur for the trailing blank line.
      raw_data = (
          pipeline
          | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
          | 'FixCommasTrainData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'DecodeTrainData' >> MapAndFilterErrors(converter.decode))

      # Combine data and schema into a dataset tuple.  Note that we already used
      # the schema to read the CSV data, but we also need it to interpret
      # raw_data.
      raw_dataset = (raw_data, RAW_DATA_METADATA)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset
      transformed_data_coder = tft.coders.ExampleProtoCoder(
          transformed_metadata.schema)

      _ = (
          transformed_data
          | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))

      # Now apply transform function to test data.  In this case we remove the
      # trailing period at the end of each line, and also ignore the header line
      # that is present in the test data file.
      raw_test_data = (
          pipeline
          | 'ReadTestData' >> textio.ReadFromText(test_data_file,
                                                  skip_header_lines=1)
          | 'FixCommasTestData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'RemoveTrailingPeriodsTestData' >> beam.Map(lambda line: line[:-1])
          | 'DecodeTestData' >> MapAndFilterErrors(converter.decode))

      raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)

      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      # Don't need transformed data schema, it's the same as before.
      transformed_test_data, _ = transformed_test_dataset

      _ = (
          transformed_test_data
          | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

      # Will write a SavedModel and metadata to two subdirectories of
      # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and
      # transform_fn_io.TRANSFORMED_METADATA_DIR respectively.
      _ = (
          transform_fn
          | 'WriteTransformFn' >>
          transform_fn_io.WriteTransformFn(working_dir))
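# MapAndFilterErrors is used above but not defined in this snippet.  A minimal
# sketch of the idea: behave like beam.Map, but drop elements for which the
# mapped function raises and count them in a Beam metric (the counter
# namespace and name are assumptions).
class MapAndFilterErrors(beam.PTransform):
  """Like beam.Map but filters out elements for which fn raises."""

  class _MapAndFilterErrorsDoFn(beam.DoFn):

    def __init__(self, fn):
      self._fn = fn
      # Counter to surface how many elements were dropped.
      self._bad_elements_counter = beam.metrics.Metrics.counter(
          'census_example', 'bad_elements')

    def process(self, element):
      try:
        yield self._fn(element)
      except Exception:  # pylint: disable=broad-except
        # Swallow the error and count the bad element instead of failing.
        self._bad_elements_counter.inc(1)

  def __init__(self, fn):
    super(MapAndFilterErrors, self).__init__()
    self._fn = fn

  def expand(self, pcoll):
    return pcoll | beam.ParDo(self._MapAndFilterErrorsDoFn(self._fn))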
Esempio n. 23
0
     # write validation dataset
     _ = (
         test
         | 'Encode & write test -> TFRecords' >>
         tfrecordio.WriteToTFRecord(
             file_path_prefix=os.path.join(args.data_dir, 'tfrecords',
                                           args.output_dir,
                                           EVAL_FILES_PATTERN),
             coder=transformed_data_coder,
             file_name_suffix='.gz',
             num_shards=1,
             compression_type=beam.io.filesystem.CompressionTypes.GZIP))
     # write the transform_fn
     _ = (
         transform_fn
         | 'Write transformFn' >> transform_fn_io.WriteTransformFn(
             os.path.join(args.data_dir, 'tfrecords', args.output_dir)))
 else:
     predictions = (prepared_samples
                    | 'Predict' >> beam.ParDo(
                        Predict(model_dir=os.path.join(
                            args.data_dir, 'models', args.model_dir))))
     _ = predictions | 'Print predictions' >> beam.Map(print)
     '''
     _ = (
         predictions
         | 'Write to BQ' >> beam.io.WriteToBigQuery(
             table=PRED_TABLE,
             schema={
                 'fields': [
                     {'name': 'item_number', 'type': 'INTEGER', 'mode': 'REQUIRED'},
                     {'name': 'pred_date', 'type': 'DATE', 'mode': 'REQUIRED'},
Esempio n. 24
0
def run_transform(output_dir,
                  schema,
                  train_data_file,
                  eval_data_file,
                  project,
                  mode,
                  preprocessing_fn=None):
    """Writes a tft transform fn, and metadata files.
  Args:
    output_dir: output folder
    schema: a list of column description dicts consumed by make_tft_input_metadata.
    train_data_file: training data file pattern.
    eval_data_file: eval data file pattern.
    project: the project to run dataflow in.
    mode: 'local' to run with the DirectRunner, or 'cloud' to run on Dataflow.
    preprocessing_fn: a function used to preprocess the raw data. If not
                      specified, a function will be automatically inferred
                      from the schema.
  """

    tft_input_metadata = make_tft_input_metadata(schema)
    temp_dir = os.path.join(output_dir, 'tmp')
    preprocessing_fn = preprocessing_fn or make_preprocessing_fn(schema)

    if mode == 'local':
        pipeline_options = None
        runner = 'DirectRunner'
    elif mode == 'cloud':
        options = {
            'job_name': ('pipeline-tft-' +
                         datetime.datetime.now().strftime('%y%m%d-%H%M%S')),
            'temp_location': temp_dir,
            'project': project,
            'extra_packages': [
                'gs://ml-pipeline-playground/tensorflow-transform-0.6.0.dev0.tar.gz'
            ]
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        runner = 'DataflowRunner'
    else:
        raise ValueError("Invalid mode %s." % mode)

    with beam.Pipeline(runner, options=pipeline_options) as p:
        with beam_impl.Context(temp_dir=temp_dir):
            names = [x['name'] for x in schema]
            converter = CsvCoder(names, tft_input_metadata.schema)
            train_data = (
                p
                | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
                | 'DecodeTrainData' >> beam.Map(converter.decode))

            train_dataset = (train_data, tft_input_metadata)
            transformed_dataset, transform_fn = (
                train_dataset
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset

            # Write the transformed_metadata and transform_fn folders
            _ = (transform_fn | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(output_dir))

            # Write the raw_metadata
            metadata_io.write_metadata(metadata=tft_input_metadata,
                                       path=os.path.join(
                                           output_dir, 'metadata'))

            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                os.path.join(output_dir, 'train'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            eval_data = (p
                         |
                         'ReadEvalData' >> textio.ReadFromText(eval_data_file)
                         | 'DecodeEvalData' >> beam.Map(converter.decode))

            eval_dataset = (eval_data, tft_input_metadata)

            transformed_eval_dataset = ((eval_dataset, transform_fn)
                                        | beam_impl.TransformDataset())
            transformed_eval_data, transformed_metadata = transformed_eval_dataset

            _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
                os.path.join(output_dir, 'eval'),
                coder=ExampleProtoCoder(transformed_metadata.schema))
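# make_preprocessing_fn is referenced above but not defined in this snippet.
# A rough sketch of the idea, assuming each schema entry is a dict with 'name'
# and 'type' keys and that 'NUMBER' and 'CATEGORY' are the only types handled
# (those field names, type strings and the tft analyzer choices are
# assumptions, not the original implementation):
import tensorflow_transform as tft


def make_preprocessing_fn_sketch(schema):

    def preprocessing_fn(inputs):
        outputs = {}
        for col_schema in schema:
            name = col_schema['name']
            if col_schema['type'] == 'NUMBER':
                # Full-pass analyzer: standardize numeric columns.
                outputs[name] = tft.scale_to_z_score(inputs[name])
            elif col_schema['type'] == 'CATEGORY':
                # Map categorical strings to integer ids via a vocabulary.
                outputs[name] = tft.compute_and_apply_vocabulary(
                    inputs[name], vocab_filename='vocab_' + name)
            else:
                # Pass through anything we do not know how to transform.
                outputs[name] = inputs[name]
        return outputs

    return preprocessing_fn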
Esempio n. 25
0
    def assertAnalyzeAndTransformResults(self,
                                         input_data,
                                         input_metadata,
                                         preprocessing_fn,
                                         expected_data=None,
                                         expected_metadata=None,
                                         expected_vocab_file_contents=None,
                                         expected_asset_file_contents=None,
                                         test_data=None,
                                         desired_batch_size=None,
                                         beam_pipeline=None,
                                         temp_dir=None,
                                         force_tf_compat_v1=True):
        """Assert that input data and metadata is transformed as expected.

    This methods asserts transformed data and transformed metadata match
    with expected_data and expected_metadata.

    Args:
      input_data: Input data formatted in one of two ways:
        * A sequence of dicts whose values are one of:
          strings, lists of strings, numeric types or a pair of those.
          Must have at least one key so that we can infer the batch size, or
        * A sequence of pa.RecordBatch.
      input_metadata: One of -
        * DatasetMetadata describing input_data if `input_data` are dicts.
        * TensorAdapterConfig otherwise.
      preprocessing_fn: A function taking a dict of tensors and returning
          a dict of tensors.
      expected_data: (optional) A dataset with the same type constraints as
          input_data, but representing the output after transformation.
          If supplied, transformed data is asserted to be equal.
      expected_metadata: (optional) DatasetMetadata describing the transformed
          data. If supplied, transformed metadata is asserted to be equal.
      expected_vocab_file_contents: (optional) A dictionary from vocab filenames
          to their expected content as a list of text lines or a list of tuples
          of frequency and text. Values should be the expected result of calling
          f.readlines() on the given asset files.
      expected_asset_file_contents: deprecated.  Use
          expected_vocab_file_contents.
      test_data: (optional) If this is provided then instead of calling
          AnalyzeAndTransformDataset with input_data, this function will call
          AnalyzeDataset with input_data and TransformDataset with test_data.
          Note that this is the case even if input_data and test_data are equal.
          test_data should also conform to input_metadata.
      desired_batch_size: (optional) A batch size to batch elements by. If not
          provided, a batch size will be computed automatically.
      beam_pipeline: (optional) A Beam Pipeline to use in this test.
      temp_dir: If set, it is used as output directory, else a new unique
          directory is created.
      force_tf_compat_v1: A `Boolean`. If `True`, TFT's public APIs use
          Tensorflow in compat.v1 mode.
    Raises:
      AssertionError: if the expected data does not match the results of
          transforming input_data according to preprocessing_fn, or
          (if provided) if the expected metadata does not match.
      ValueError: if expected_vocab_file_contents and
          expected_asset_file_contents are both set.
    """
        if (expected_vocab_file_contents is not None
                and expected_asset_file_contents is not None):
            raise ValueError('only one of expected_vocab_file_contents and '
                             'expected_asset_file_contents should be set')
        elif expected_asset_file_contents is not None:
            tf.compat.v1.logging.warn(
                'expected_asset_file_contents is deprecated, use '
                'expected_vocab_file_contents')

        expected_vocab_file_contents = (expected_vocab_file_contents
                                        or expected_asset_file_contents or {})
        del expected_asset_file_contents

        # Note: we don't separately test AnalyzeDataset and TransformDataset as
        # AnalyzeAndTransformDataset currently simply composes these two
        # transforms.  If in future versions of the code, the implementation
        # differs, we should also run AnalyzeDataset and TransformDataset composed.
        temp_dir = temp_dir or tempfile.mkdtemp(prefix=self._testMethodName,
                                                dir=self.get_temp_dir())
        with beam_pipeline or self._makeTestPipeline() as pipeline:
            with beam_impl.Context(temp_dir=temp_dir,
                                   desired_batch_size=desired_batch_size,
                                   force_tf_compat_v1=force_tf_compat_v1):
                input_data = pipeline | 'CreateInput' >> beam.Create(
                    input_data, reshuffle=False)
                if test_data is None:
                    (transformed_data, transformed_metadata), transform_fn = (
                        (input_data, input_metadata)
                        |
                        beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
                else:
                    transform_fn = ((input_data, input_metadata)
                                    |
                                    beam_impl.AnalyzeDataset(preprocessing_fn))
                    test_data = pipeline | 'CreateTest' >> beam.Create(
                        test_data)
                    transformed_data, transformed_metadata = (
                        ((test_data, input_metadata), transform_fn)
                        | beam_impl.TransformDataset())

                # Write transform_fn so we can test its assets
                _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

                if expected_data is not None:
                    transformed_data_coder = tft.coders.ExampleProtoCoder(
                        transformed_metadata.schema)

                    transformed_data_path = os.path.join(
                        temp_dir, 'transformed_data')
                    _ = (transformed_data
                         | beam.Map(transformed_data_coder.encode)
                         | beam.io.tfrecordio.WriteToTFRecord(
                             transformed_data_path, shard_name_template=''))

        # TODO(ebreck) Log transformed_data somewhere.
        if expected_data is not None:
            examples = tf.compat.v1.python_io.tf_record_iterator(
                path=transformed_data_path)
            shapes = {
                f.name:
                [s.size for s in f.shape.dim] if f.HasField('shape') else [-1]
                for f in transformed_metadata.schema.feature
            }
            transformed_data = [
                _format_example_as_numpy_dict(e, shapes) for e in examples
            ]
            self.assertDataCloseOrEqual(expected_data, transformed_data)

        tf_transform_output = tft.TFTransformOutput(temp_dir)
        if expected_metadata:
            # Make a copy with no annotations.
            transformed_schema = schema_pb2.Schema()
            transformed_schema.CopyFrom(
                tf_transform_output.transformed_metadata.schema)
            transformed_schema.ClearField('annotation')
            for feature in transformed_schema.feature:
                feature.ClearField('annotation')
            self.assertEqual(expected_metadata.schema, transformed_schema)

        for filename, file_contents in six.iteritems(
                expected_vocab_file_contents):
            full_filename = tf_transform_output.vocabulary_file_by_name(
                filename)
            self.AssertVocabularyContents(full_filename, file_contents)
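    # AssertVocabularyContents is used above but not shown here.  A sketch of
    # what it might do, mirroring the inline vocabulary checks in the older
    # variants of this helper elsewhere in this file (assumes `tf` and
    # `numpy as np` are imported; file_contents tuples are (token, frequency)):
    def AssertVocabularyContents(self, vocab_filename, file_contents):
        with tf.io.gfile.GFile(vocab_filename, 'rb') as f:
            file_lines = f.readlines()

        if file_contents and isinstance(file_contents[0], tuple):
            # Each vocabulary line is "<frequency> <token>".
            word_and_frequency_list = []
            for line in file_lines:
                frequency, token = line.split(b' ', 1)
                word_and_frequency_list.append(
                    (token.strip(b'\n'), float(frequency.strip(b'\n'))))
            actual_tokens, actual_frequencies = zip(*word_and_frequency_list)
            expected_tokens, expected_frequencies = zip(*file_contents)
            self.assertAllEqual(actual_tokens, expected_tokens)
            np.testing.assert_almost_equal(actual_frequencies,
                                           expected_frequencies)
        else:
            file_lines = [line.strip(b'\n') for line in file_lines]
            self.assertAllEqual(file_lines, file_contents)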
Esempio n. 26
0
def run(input_schema,
        labels,
        preprocessing_ptransform,
        full_pass_preprocessing_fn=None,
        eval_percent=20.0,
        beam_options=None,
        temp_dir=None,
        tft_temp_dir=None,
        train_dataset_dir=None,
        eval_dataset_dir=None):
    """Runs the whole preprocessing step.

  This runs the preprocessing PTransform, validates that the data conforms to
  the schema provided, does the full-pass preprocessing step and generates the
  input functions needed to train and evaluate the TensorFlow model.
  """

    # Populate optional arguments
    if not full_pass_preprocessing_fn:
        full_pass_preprocessing_fn = lambda inputs: inputs

    if not temp_dir:
        temp_dir = tempfile.mkdtemp(prefix='tensorflow_model')

    if not tft_temp_dir:
        tft_temp_dir = os.path.join(temp_dir, 'tft_temp')

    if not train_dataset_dir:
        train_dataset_dir = os.path.join(temp_dir, 'train_dataset')

    if not eval_dataset_dir:
        eval_dataset_dir = os.path.join(temp_dir, 'eval_dataset')

    # Type checking
    if not isinstance(labels, list):
        raise ValueError('`labels` must be list(str). '
                         'Given: {} {}'.format(labels, type(labels)))

    if not isinstance(preprocessing_ptransform, beam.PTransform):
        raise ValueError('`preprocessing_ptransform` must be {}. '
                         'Given: {} {}'.format(beam.PTransform,
                                               preprocessing_ptransform,
                                               type(preprocessing_ptransform)))

    if not callable(full_pass_preprocessing_fn):
        raise ValueError('`full_pass_preprocessing_fn` must be callable. '
                         'Given: {} {}'.format(
                             full_pass_preprocessing_fn,
                             type(full_pass_preprocessing_fn)))

    if beam_options and not isinstance(beam_options, PipelineOptions):
        raise ValueError('`beam_options` must be {}. '
                         'Given: {} {}'.format(PipelineOptions, beam_options,
                                               type(beam_options)))

    if tf.gfile.Exists(temp_dir):
        tf.gfile.DeleteRecursively(temp_dir)

    # Build and run a Beam Pipeline
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.Schema(input_schema))

    with beam.Pipeline(options=beam_options) as p, \
         beam_impl.Context(temp_dir=tft_temp_dir):

        # Transform and validate the input data matches the input schema
        dataset = (p
                   | 'Preprocessing' >> preprocessing_ptransform
                   | 'ValidateInputData' >> beam.ParDo(
                       ValidateInputData(input_metadata)))

        # Apply the tf.Transform preprocessing_fn
        dataset_and_metadata, transform_fn = (
            (dataset, input_metadata)
            | 'FullPassPreprocessing' >>
            beam_impl.AnalyzeAndTransformDataset(full_pass_preprocessing_fn))

        dataset, metadata = dataset_and_metadata

        # Split the dataset into a training set and an evaluation set
        assert 0 < eval_percent < 100, 'eval_percent must be in the range (0-100)'
        train_dataset, eval_dataset = (
            dataset
            | 'SplitDataset' >> beam.Partition(
                lambda elem, _: int(random.uniform(0, 100) < eval_percent), 2))

        # Write the datasets as TFRecords
        coder = example_proto_coder.ExampleProtoCoder(metadata.schema)

        train_dataset_prefix = os.path.join(train_dataset_dir, 'part')
        _ = (train_dataset
             | 'WriteTrainDataset' >> tfrecordio.WriteToTFRecord(
                 train_dataset_prefix, coder))

        eval_dataset_prefix = os.path.join(eval_dataset_dir, 'part')
        _ = (eval_dataset
             | 'WriteEvalDataset' >> tfrecordio.WriteToTFRecord(
                 eval_dataset_prefix, coder))

        # Write the transform_fn
        _ = (transform_fn
             |
             'WriteTransformFn' >> transform_fn_io.WriteTransformFn(temp_dir))

    return PreprocessData(
        labels, input_metadata.schema.as_feature_spec(),
        metadata.schema.as_feature_spec(),
        os.path.join(temp_dir, transform_fn_io.TRANSFORM_FN_DIR),
        train_dataset_prefix + '*', eval_dataset_prefix + '*')
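# ValidateInputData is referenced above but not defined in this snippet.  A
# minimal sketch of the idea, assuming the metadata exposes a feature spec via
# schema.as_feature_spec() as used in the return value above (the error
# messages are illustrative):
class ValidateInputData(beam.DoFn):
    """Raises on elements that do not carry every feature in the input schema."""

    def __init__(self, input_metadata):
        super(ValidateInputData, self).__init__()
        self.feature_names = set(input_metadata.schema.as_feature_spec().keys())

    def process(self, elem):
        if not isinstance(elem, dict):
            raise ValueError('Element must be a dict(str, value). '
                             'Given: {} {}'.format(elem, type(elem)))
        missing = self.feature_names - set(elem.keys())
        if missing:
            raise ValueError(
                'Element is missing features: {}'.format(sorted(missing)))
        yield elem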
Esempio n. 27
0
    def assertAnalyzeAndTransformResults(self,
                                         input_data,
                                         input_metadata,
                                         preprocessing_fn,
                                         expected_data=None,
                                         expected_metadata=None,
                                         expected_vocab_file_contents=None,
                                         expected_asset_file_contents=None,
                                         test_data=None,
                                         desired_batch_size=None,
                                         beam_pipeline=None,
                                         temp_dir=None):
        """Assert that input data and metadata is transformed as expected.

    This methods asserts transformed data and transformed metadata match
    with expected_data and expected_metadata.

    Args:
      input_data: A sequence of dicts whose values are
          either strings, lists of strings, numeric types or a pair of those.
      input_metadata: DatasetMetadata describing input_data.
      preprocessing_fn: A function taking a dict of tensors and returning
          a dict of tensors.
      expected_data: (optional) A dataset with the same type constraints as
          input_data, but representing the output after transformation.
          If supplied, transformed data is asserted to be equal.
      expected_metadata: (optional) DatasetMetadata describing the transformed
          data. If supplied, transformed metadata is asserted to be equal.
      expected_vocab_file_contents: (optional) A dictionary from vocab filenames
          to their expected content as a list of text lines or a list of tuples
          of frequency and text. Values should be the expected result of calling
          f.readlines() on the given asset files.
      expected_asset_file_contents: deprecated.  Use
          expected_vocab_file_contents.
      test_data: (optional) If this is provided then instead of calling
          AnalyzeAndTransformDataset with input_data, this function will call
          AnalyzeDataset with input_data and TransformDataset with test_data.
          Note that this is the case even if input_data and test_data are equal.
          test_data should also conform to input_metadata.
      desired_batch_size: (optional) A batch size to batch elements by. If not
          provided, a batch size will be computed automatically.
      beam_pipeline: (optional) A Beam Pipeline to use in this test.
      temp_dir: If set, it is used as output directory, else a new unique
          directory is created.
    Raises:
      AssertionError: if the expected data does not match the results of
          transforming input_data according to preprocessing_fn, or
          (if provided) if the expected metadata does not match.
      ValueError: if expected_vocab_file_contents and
          expected_asset_file_contents are both set.
    """
        if (expected_vocab_file_contents is not None
                and expected_asset_file_contents is not None):
            raise ValueError('only one of expected_vocab_file_contents and '
                             'expected_asset_file_contents should be set')
        elif expected_asset_file_contents is not None:
            tf.compat.v1.logging.warn(
                'expected_asset_file_contents is deprecated, use '
                'expected_vocab_file_contents')

        expected_vocab_file_contents = (expected_vocab_file_contents
                                        or expected_asset_file_contents or {})
        del expected_asset_file_contents

        # Note: we don't separately test AnalyzeDataset and TransformDataset as
        # AnalyzeAndTransformDataset currently simply composes these two
        # transforms.  If in future versions of the code, the implementation
        # differs, we should also run AnalyzeDataset and TransformDataset composed.
        temp_dir = temp_dir or tempfile.mkdtemp(prefix=self._testMethodName,
                                                dir=self.get_temp_dir())
        with beam_pipeline or self._makeTestPipeline() as pipeline:
            with beam_impl.Context(temp_dir=temp_dir,
                                   desired_batch_size=desired_batch_size):
                input_data = pipeline | 'CreateInput' >> beam.Create(
                    input_data)
                if test_data is None:
                    (transformed_data, transformed_metadata), transform_fn = (
                        (input_data, input_metadata)
                        |
                        beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
                else:
                    transform_fn = ((input_data, input_metadata)
                                    |
                                    beam_impl.AnalyzeDataset(preprocessing_fn))
                    test_data = pipeline | 'CreateTest' >> beam.Create(
                        test_data)
                    transformed_data, transformed_metadata = (
                        ((test_data, input_metadata), transform_fn)
                        | beam_impl.TransformDataset())

                # Write transform_fn so we can test its assets
                _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

                if expected_data is not None:
                    transformed_data_coder = tft.coders.ExampleProtoCoder(
                        transformed_metadata.schema)

                    transformed_data_path = os.path.join(
                        temp_dir, 'transformed_data')
                    _ = (transformed_data
                         | beam.Map(transformed_data_coder.encode)
                         | beam.io.tfrecordio.WriteToTFRecord(
                             transformed_data_path, shard_name_template=''))

        # TODO(ebreck) Log transformed_data somewhere.
        if expected_data is not None:
            examples = tf.compat.v1.python_io.tf_record_iterator(
                path=transformed_data_path)
            transformed_data = [
                transformed_data_coder.decode(x) for x in examples
            ]
            self.assertDataCloseOrEqual(expected_data, transformed_data)

        tf_transform_output = tft.TFTransformOutput(temp_dir)
        if expected_metadata:
            self.assertEqual(expected_metadata,
                             tf_transform_output.transformed_metadata)

        for filename, file_contents in six.iteritems(
                expected_vocab_file_contents):
            full_filename = tf_transform_output.vocabulary_file_by_name(
                filename)
            with tf.io.gfile.GFile(full_filename, 'rb') as f:
                file_lines = f.readlines()

                # Store frequency case.
                if isinstance(file_contents[0], tuple):
                    word_and_frequency_list = []
                    for content in file_lines:
                        frequency, word = content.split(b' ', 1)
                        word_and_frequency_list.append(
                            (word.strip(b'\n'), float(frequency.strip(b'\n'))))
                    expected_words, expected_frequency = zip(
                        *word_and_frequency_list)
                    actual_words, actual_frequency = zip(*file_contents)
                    self.assertAllEqual(expected_words, actual_words)
                    np.testing.assert_almost_equal(expected_frequency,
                                                   actual_frequency)
                else:
                    file_lines = [
                        content.strip(b'\n') for content in file_lines
                    ]
                    self.assertAllEqual(file_lines, file_contents)
Esempio n. 28
0
    def assertAnalyzeAndTransformResults(self,
                                         input_data,
                                         input_metadata,
                                         preprocessing_fn,
                                         expected_data=None,
                                         expected_metadata=None,
                                         expected_vocab_file_contents=None,
                                         expected_asset_file_contents=None,
                                         test_data=None,
                                         desired_batch_size=None,
                                         beam_pipeline=None,
                                         temp_dir=None,
                                         use_tfxio=False,
                                         input_data_is_tfxio_format=False):
        """Assert that input data and metadata is transformed as expected.

    This methods asserts transformed data and transformed metadata match
    with expected_data and expected_metadata.

    Args:
      input_data: A sequence of dicts whose values are
          either strings, lists of strings, numeric types or a pair of those.
      input_metadata: DatasetMetadata describing input_data.
      preprocessing_fn: A function taking a dict of tensors and returning
          a dict of tensors.
      expected_data: (optional) A dataset with the same type constraints as
          input_data, but representing the output after transformation.
          If supplied, transformed data is asserted to be equal.
      expected_metadata: (optional) DatasetMetadata describing the transformed
          data. If supplied, transformed metadata is asserted to be equal.
      expected_vocab_file_contents: (optional) A dictionary from vocab filenames
          to their expected content as a list of text lines or a list of tuples
          of frequency and text. Values should be the expected result of calling
          f.readlines() on the given asset files.
      expected_asset_file_contents: deprecated.  Use
          expected_vocab_file_contents.
      test_data: (optional) If this is provided then instead of calling
          AnalyzeAndTransformDataset with input_data, this function will call
          AnalyzeDataset with input_data and TransformDataset with test_data.
          Note that this is the case even if input_data and test_data are equal.
          test_data should also conform to input_metadata.
      desired_batch_size: (optional) A batch size to batch elements by. If not
          provided, a batch size will be computed automatically.
      beam_pipeline: (optional) A Beam Pipeline to use in this test.
      temp_dir: If set, it is used as output directory, else a new unique
          directory is created.
      use_tfxio: If True, invoke AnalyzeAndTransformDataset using the new API
          that accepts standardized inputs (Arrow `RecordBatch`es). Otherwise
          use the old API that accepts Dicts.
      input_data_is_tfxio_format: If True, `input_data` and `test_data` are
          Arrow `RecordBatch`es and the `input_metadata` is
          `tfxio.tensor_adapter.TensorAdapterConfig`. Otherwise the input data
          is a list of Dicts and input_metadata is a `DatasetMetadata`.
    Raises:
      AssertionError: if the expected data does not match the results of
          transforming input_data according to preprocessing_fn, or
          (if provided) if the expected metadata does not match.
      ValueError: if expected_vocab_file_contents and
          expected_asset_file_contents are both set.
    """
        if (expected_vocab_file_contents is not None
                and expected_asset_file_contents is not None):
            raise ValueError('only one of expected_vocab_file_contents and '
                             'expected_asset_file_contents should be set')
        elif expected_asset_file_contents is not None:
            tf.compat.v1.logging.warn(
                'expected_asset_file_contents is deprecated, use '
                'expected_vocab_file_contents')

        expected_vocab_file_contents = (expected_vocab_file_contents
                                        or expected_asset_file_contents or {})
        del expected_asset_file_contents

        if not use_tfxio and input_data_is_tfxio_format:
            raise ValueError('Unable to feed TFXIO input format to the old, '
                             'non-TFXIO API.')
        compatibility_tfxio_needed = use_tfxio and not input_data_is_tfxio_format
        # Note: we don't separately test AnalyzeDataset and TransformDataset as
        # AnalyzeAndTransformDataset currently simply composes these two
        # transforms.  If in future versions of the code, the implementation
        # differs, we should also run AnalyzeDataset and TransformDataset composed.
        temp_dir = temp_dir or tempfile.mkdtemp(prefix=self._testMethodName,
                                                dir=self.get_temp_dir())
        with beam_pipeline or self._makeTestPipeline() as pipeline:
            with beam_impl.Context(temp_dir=temp_dir,
                                   desired_batch_size=desired_batch_size,
                                   use_tfxio=use_tfxio):
                input_data = pipeline | 'CreateInput' >> beam.Create(
                    input_data, reshuffle=False)
                if compatibility_tfxio_needed:
                    legacy_input_metadata = input_metadata
                    input_data, input_metadata = self.convert_to_tfxio_api_inputs(
                        input_data, input_metadata, label='input_data')
                if test_data is None:
                    (transformed_data, transformed_metadata), transform_fn = (
                        (input_data, input_metadata)
                        |
                        beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
                else:
                    transform_fn = ((input_data, input_metadata)
                                    |
                                    beam_impl.AnalyzeDataset(preprocessing_fn))
                    test_data = pipeline | 'CreateTest' >> beam.Create(
                        test_data)
                    if compatibility_tfxio_needed:
                        test_data, _ = self.convert_to_tfxio_api_inputs(
                            test_data,
                            legacy_input_metadata,
                            label='test_data')
                    transformed_data, transformed_metadata = (
                        ((test_data, input_metadata), transform_fn)
                        | beam_impl.TransformDataset())

                # Write transform_fn so we can test its assets
                _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

                if expected_data is not None:
                    transformed_data_coder = tft.coders.ExampleProtoCoder(
                        transformed_metadata.schema)

                    transformed_data_path = os.path.join(
                        temp_dir, 'transformed_data')
                    _ = (transformed_data
                         | beam.Map(transformed_data_coder.encode)
                         | beam.io.tfrecordio.WriteToTFRecord(
                             transformed_data_path, shard_name_template=''))

        # TODO(ebreck) Log transformed_data somewhere.
        if expected_data is not None:
            examples = tf.compat.v1.python_io.tf_record_iterator(
                path=transformed_data_path)
            transformed_data = [
                transformed_data_coder.decode(x) for x in examples
            ]
            self.assertDataCloseOrEqual(expected_data, transformed_data)

        tf_transform_output = tft.TFTransformOutput(temp_dir)
        if expected_metadata:
            # Make a copy with no annotations.
            transformed_schema = schema_pb2.Schema()
            transformed_schema.CopyFrom(
                tf_transform_output.transformed_metadata.schema)
            transformed_schema.ClearField('annotation')
            for feature in transformed_schema.feature:
                feature.ClearField('annotation')
            self.assertEqual(expected_metadata.schema, transformed_schema)

        for filename, file_contents in six.iteritems(
                expected_vocab_file_contents):
            full_filename = tf_transform_output.vocabulary_file_by_name(
                filename)
            self.AssertVocabularyContents(full_filename, file_contents)
Esempio n. 29
0
            _ = norm_ts_windows_eval_data | 'Write TFrecords - eval' >> beam.io.tfrecordio.WriteToTFRecord(
                file_path_prefix=eval_tfrecord_path,
                file_name_suffix=".tfrecords",
                coder=example_proto_coder.ExampleProtoCoder(
                    norm_ts_windows_eval_metadata.schema))

            # Dump raw eval set for further tensorflow model analysis
            _ = ts_windows_eval | 'Write TFrecords - eval raw' >> beam.io.tfrecordio.WriteToTFRecord(
                file_path_prefix=eval_raw_tfrecord_path,
                file_name_suffix=".tfrecords",
                coder=example_proto_coder.ExampleProtoCoder(
                    ts_windows_schema.schema))

            # Dump transformation graph
            _ = transform_fn | 'Dump Transform Function Graph' >> transform_fn_io.WriteTransformFn(
                known_args.tft_artifacts_dir)

    # Dump parameters to be forwarded to the next pipeline step
    with open("/train_tfrecord_path.txt", "w") as f:
        f.write(train_tfrecord_path + '-*')

    with open("/eval_tfrecord_path.txt", "w") as f:
        f.write(eval_tfrecord_path + '-*')

    with open("/eval_raw_tfrecord_path.txt", "w") as f:
        f.write(eval_raw_tfrecord_path + '*')

    with open("/znorm_stats.txt", "w") as f:
        json.dump(znorm_stats, f)

    with open("/n_areas.txt", "w") as f:
Example no. 30
0
def preprocess(query, in_test_mode):
  import os
  import os.path
  import datetime
  import tempfile
  from apache_beam.io import tfrecordio
  from tensorflow_transform.coders import example_proto_coder
  from tensorflow_transform.tf_metadata import dataset_metadata
  from tensorflow_transform.tf_metadata import dataset_schema
  from tensorflow_transform.beam.tft_beam_io import transform_fn_io
  # beam, beam_impl, tf, BUCKET, PROJECT, preprocess_tft and cleanup are assumed
  # to be imported/defined at module level in the surrounding script.

  job_name = 'preprocess-babyweight-features' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')    
  if in_test_mode:
    import shutil
    print('Launching local job ... hang on')
    OUTPUT_DIR = './preproc_tft'
    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
  else:
    print('Launching Dataflow job {} ... hang on'.format(job_name))
    OUTPUT_DIR = 'gs://{0}/babyweight/preproc_tft/'.format(BUCKET)
    import subprocess
    subprocess.call('gsutil rm -r {}'.format(OUTPUT_DIR).split())
    
  options = {
    'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
    'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
    'job_name': job_name,
    'project': PROJECT,
    'max_num_workers': 24,
    'teardown_policy': 'TEARDOWN_ALWAYS',
    'no_save_main_session': True,
    'requirements_file': 'requirements.txt'
  }
  opts = beam.pipeline.PipelineOptions(flags=[], **options)
  if in_test_mode:
    RUNNER = 'DirectRunner'
  else:
    RUNNER = 'DataflowRunner'

  # set up metadata  
  raw_data_schema = {
    colname : dataset_schema.ColumnSchema(tf.string, [], dataset_schema.FixedColumnRepresentation())
                   for colname in 'key,is_male,mother_race,mother_married,cigarette_use,alcohol_use'.split(',')
  }
  raw_data_schema.update({
      colname : dataset_schema.ColumnSchema(tf.float32, [], dataset_schema.FixedColumnRepresentation())
                   for colname in 'weight_pounds,mother_age,plurality,gestation_weeks'.split(',')
    })
  raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema(raw_data_schema))

  def read_rawdata(p, step, test_mode):
    if step == 'train':
        selquery = 'SELECT * FROM ({}) WHERE ABS(MOD(hashmonth, 4)) < 3'.format(query)
    else:
        selquery = 'SELECT * FROM ({}) WHERE ABS(MOD(hashmonth, 4)) = 3'.format(query)
    if test_mode:  # use the parameter rather than the enclosing in_test_mode
        selquery = selquery + ' LIMIT 100'
    # print('Processing {} data from {}'.format(step, selquery))
    return (p 
          | '{}_read'.format(step) >> beam.io.Read(beam.io.BigQuerySource(query=selquery, use_standard_sql=True))
          | '{}_cleanup'.format(step) >> beam.FlatMap(cleanup)
                   )
  
  # run Beam  
  with beam.Pipeline(RUNNER, options=opts) as p:
    with beam_impl.Context(temp_dir=os.path.join(OUTPUT_DIR, 'tmp')):

      # analyze and transform training       
      raw_data = read_rawdata(p, 'train', in_test_mode)
      raw_dataset = (raw_data, raw_data_metadata)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocess_tft))
      transformed_data, transformed_metadata = transformed_dataset
      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'train'),
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))
      
      # transform eval data
      raw_test_data = read_rawdata(p, 'eval', in_test_mode)
      raw_test_dataset = (raw_test_data, raw_data_metadata)
      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      transformed_test_data, _ = transformed_test_dataset
      _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'eval'),
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))
      _ = (transform_fn
           | 'WriteTransformFn' >>
           transform_fn_io.WriteTransformFn(os.path.join(OUTPUT_DIR, 'metadata')))

  # The `with` block above runs the pipeline and waits for it to finish on exit,
  # so no explicit p.run() call is needed here.
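A hedged sketch of driving the function above; the query string is a placeholder and must return the columns declared in raw_data_schema plus the hashmonth column that read_rawdata filters on:

# Hypothetical driver for preprocess(); the table name and column list are placeholders.
query = """
SELECT weight_pounds, is_male, mother_age, mother_race, plurality,
       gestation_weeks, mother_married, cigarette_use, alcohol_use,
       key, hashmonth
FROM `my_project.my_dataset.natality_with_hashmonth`
"""
preprocess(query, in_test_mode=True)  # DirectRunner run; output lands in ./preproc_tft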