Example #1
def run_transformation_pipeline(args, options):

    options = beam.pipeline.PipelineOptions(flags=[], **options)

    print("Sink transformed data files: {}".format(args.transform_test_prefix))
    print("Sink transform artefacts directory: {}".format(
        params.TRANSFORM_ARTIFACTS_DIR))

    print("Temporary directory: {}".format(params.TEMP_DIR))
    print("")

    with beam.Pipeline(runner, options=options) as pipeline:
        with impl.Context(params.TEMP_DIR):

            raw_metadata = featurizer.create_raw_metadata()
            converter = tft_coders.csv_coder.CsvCoder(
                column_names=metadata.RAW_FEATURE_NAMES,
                delimiter=params.RAW_DATA_DELIMITER,
                schema=raw_metadata.schema)

            ###### analyze & transform train #########################################################
            if runner == 'DirectRunner':
                print("Transform training data....")

            step = 'train'

            # Read raw train data from csv files
            raw_train_data = (
                pipeline
                | '{} - Read Raw Data'.format(step) >>
                beam.io.textio.ReadFromText(args.raw_train_file)
                | '{} - Remove Empty Rows'.format(step) >>
                beam.Filter(lambda line: line)
                | '{} - FixCommasAndRemoveFiledTestData'.format(step) >>
                beam.Map(fix_comma_and_filter_third_column)
                | '{} - Decode CSV Data'.format(step) >> MapAndFilterErrors(
                    converter.decode))

            # create a train dataset from the data and schema
            raw_train_dataset = (raw_train_data, raw_metadata)

            # analyze and transform raw_train_dataset to produce transformed_train_dataset and transform_fn
            transformed_train_dataset, transform_fn = (
                raw_train_dataset
                | '{} - Analyze & Transform'.format(step) >>
                impl.AnalyzeAndTransformDataset(preprocess))

            # get data and schema separately from the transformed_train_dataset
            transformed_train_data, transformed_metadata = transformed_train_dataset

            ###### transform test ##################################################################

            if runner == 'DirectRunner':
                print("Transform test data....")

            step = 'test'

            raw_test_data = (
                pipeline
                | '{} - Read Raw Data'.format(step) >>
                beam.io.textio.ReadFromText(args.raw_test_file)
                | '{} - Remove Empty Lines'.format(step) >>
                beam.Filter(lambda line: line)
                | '{} - FixCommasAndRemoveFiledTestData'.format(step) >>
                beam.Map(fix_comma_and_filter_third_column)
                | '{} - Decode CSV Data'.format(step) >> MapAndFilterErrors(
                    converter.decode))

            # create a test dataset from the data and schema
            raw_test_dataset = (raw_test_data, raw_metadata)

            # transform test data based on produced transform_fn (from analyzing train_data)
            transformed_test_dataset = (
                (raw_test_dataset, transform_fn)
                | '{} - Transform'.format(step) >> impl.TransformDataset())

            # get data from the transformed_test_dataset
            transformed_test_data, _ = transformed_test_dataset

            # write transformed test data to sink
            _ = (transformed_test_data
                 | '{} - Write Transformed Data'.format(step) >>
                 beam.io.tfrecordio.WriteToTFRecord(
                     file_path_prefix=args.transform_test_prefix,
                     file_name_suffix=".tfrecords",
                     coder=tft_coders.example_proto_coder.ExampleProtoCoder(
                         transformed_metadata.schema)))

    if runner == 'DataflowRunner':
        pipeline.run()
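
The pipeline above relies on a MapAndFilterErrors helper that is not defined in this snippet. A minimal sketch of such a transform, assuming it only applies the decode function and silently drops elements that raise, could look like this:

import apache_beam as beam


class MapAndFilterErrors(beam.PTransform):
    """Applies `fn` to each element and drops any element for which `fn` raises."""

    class _MapAndFilterErrorsDoFn(beam.DoFn):

        def __init__(self, fn):
            self._fn = fn

        def process(self, element):
            try:
                yield self._fn(element)
            except Exception:  # pylint: disable=broad-except
                pass  # drop undecodable rows instead of failing the pipeline

    def __init__(self, fn):
        self._fn = fn

    def expand(self, pcoll):
        return pcoll | beam.ParDo(self._MapAndFilterErrorsDoFn(self._fn))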
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
    """Run pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline
    training_data: file paths to input csv files.
    eval_data: file paths to input csv files.
    predict_data: file paths to input csv files.
    output_dir: file path to where to write all the output files.
    frequency_threshold: frequency threshold to use for categorical values.
  """
    # 1) The schema can be either defined in-memory or read from a configuration
    #    file, in this case we are creating the schema in-memory.
    input_schema = criteo.make_input_schema()

    # 2) Configure the coder to map the source file column names to a dictionary
    #    of key -> tensor_proto with the appropriate type derived from the
    #    input_schema.
    coder = criteo.make_tsv_coder(input_schema)

    # 3) Read from text using the coder.
    train_data = (pipeline
                  | 'ReadTrainingData' >> beam.io.ReadFromText(training_data)
                  | 'ParseTrainingCsv' >> beam.Map(coder.decode))

    evaluate_data = (pipeline
                     | 'ReadEvalData' >> beam.io.ReadFromText(eval_data)
                     | 'ParseEvalCsv' >> beam.Map(coder.decode))

    input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)
    _ = (input_metadata
         | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
             os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
             pipeline=pipeline))

    preprocessing_fn = criteo.make_preprocessing_fn(frequency_threshold)
    (train_dataset, train_metadata), transform_fn = (
        (train_data, input_metadata)
        | 'AnalyzeAndTransform' >>
        tft.AnalyzeAndTransformDataset(preprocessing_fn))

    # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
    # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
    # path_constants.TRANSFORMED_METADATA_DIR.
    _ = (transform_fn
         | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

    # TODO(b/34231369) Remember to eventually also save the statistics.

    (evaluate_dataset,
     evaluate_metadata) = (((evaluate_data, input_metadata), transform_fn)
                           | 'TransformEval' >> tft.TransformDataset())

    train_coder = coders.ExampleProtoCoder(train_metadata.schema)
    _ = (
        train_dataset
        | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
        | 'ShuffleTraining' >> _Shuffle()  # pylint: disable=no-value-for-parameter
        | 'WriteTraining' >>
        beam.io.WriteToTFRecord(os.path.join(
            output_dir, path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
                                file_name_suffix='.tfrecord.gz'))

    evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
    _ = (
        evaluate_dataset
        | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
        | 'ShuffleEval' >> _Shuffle()  # pylint: disable=no-value-for-parameter
        | 'WriteEval' >>
        beam.io.WriteToTFRecord(os.path.join(
            output_dir, path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
                                file_name_suffix='.tfrecord.gz'))

    if predict_data:
        predict_mode = tf.contrib.learn.ModeKeys.INFER
        predict_schema = criteo.make_input_schema(mode=predict_mode)
        tsv_coder = criteo.make_tsv_coder(predict_schema, mode=predict_mode)
        predict_coder = coders.ExampleProtoCoder(predict_schema)
        serialized_examples = (
            pipeline
            | 'ReadPredictData' >> beam.io.ReadFromText(predict_data)
            | 'ParsePredictCsv' >> beam.Map(tsv_coder.decode)
            # TODO(b/35194257) Obviate the need for this explicit serialization.
            | 'EncodePredictData' >> beam.Map(predict_coder.encode))
        _ = (serialized_examples
             | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.tfrecord.gz'))
        _ = (serialized_examples
             | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
             | 'WritePredictDataAsText' >> beam.io.WriteToText(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.txt'))
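
_encode_as_b64_json is referenced above but not defined in this snippet. A hedged sketch, assuming the usual base64-in-JSON convention for serialized examples (the 'b64' key name is an assumption):

import base64
import json


def _encode_as_b64_json(serialized_example):
    # Wrap the serialized tf.Example bytes in a JSON object so the text output
    # can be fed to a prediction service that expects base64-encoded input.
    return json.dumps(
        {'b64': base64.b64encode(serialized_example).decode('utf-8')})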
Example #3
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   max_rows=None,
                   pipeline_args=None):
    """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as
      DATASET.TABLE or path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples
    working_dir: Directory in which transformed examples and transform
      function will be emitted.
    max_rows: Number of rows to query from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """
    def preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
        outputs = {}
        for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[key] = transform.scale_to_z_score(inputs[key])

        for key in taxi.VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            outputs[key] = transform.string_to_int(
                inputs[key],
                top_k=taxi.VOCAB_SIZE,
                num_oov_buckets=taxi.OOV_SIZE)

        for key in taxi.BUCKET_FEATURE_KEYS:
            outputs[key] = transform.bucketize(inputs[key],
                                               taxi.FEATURE_BUCKET_COUNT)

        for key in taxi.CATEGORICAL_FEATURE_KEYS:
            outputs[key] = inputs[key]

        # Was this passenger a big tipper?
        def convert_label(label):
            taxi_fare = inputs[taxi.FARE_KEY]
            return tf.where(
                tf.is_nan(taxi_fare),
                tf.cast(tf.zeros_like(taxi_fare), tf.int64),
                # Test if the tip was > 20% of the fare.
                tf.cast(
                    tf.greater(label, tf.multiply(taxi_fare,
                                                  tf.constant(0.2))),
                    tf.int64))

        outputs[taxi.LABEL_KEY] = transform.apply_function(
            convert_label, inputs[taxi.LABEL_KEY])

        return outputs

    raw_feature_spec = taxi.get_raw_feature_spec()
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

    with beam.Pipeline(argv=pipeline_args) as pipeline:
        with beam_impl.Context(temp_dir=working_dir):
            if input_handle.lower().endswith('csv'):
                csv_coder = taxi.make_csv_coder()
                raw_data = (pipeline
                            | 'ReadFromText' >> beam.io.ReadFromText(
                                input_handle, skip_header_lines=1)
                            | 'ParseCSV' >> beam.Map(csv_coder.decode))
            else:
                query = taxi.make_sql(input_handle, max_rows, for_eval=False)
                raw_data = (pipeline
                            | 'ReadBigQuery' >> beam.io.Read(
                                beam.io.BigQuerySource(query=query,
                                                       use_standard_sql=True)))

            raw_data |= 'CleanData' >> beam.Map(taxi.clean_raw_data_dict)

            transform_fn = (
                (raw_data, raw_data_metadata)
                | 'Analyze' >> beam_impl.AnalyzeDataset(preprocessing_fn))

            _ = (transform_fn
                 | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(working_dir))

            # Shuffling the data before materialization will improve training
            # effectiveness downstream.
            shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle(
            )

            (transformed_data, transformed_metadata) = (
                ((shuffled_data, raw_data_metadata), transform_fn)
                | 'Transform' >> beam_impl.TransformDataset())

            coder = example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema)
            _ = (
                transformed_data
                | 'SerializeExamples' >> beam.Map(coder.encode)
                | 'WriteExamples' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, outfile_prefix),
                    compression_type=beam.io.filesystem.CompressionTypes.GZIP))
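
A minimal invocation of transform_data with a local CSV and the DirectRunner (the paths and prefix below are illustrative, not part of the original example):

transform_data(
    input_handle='data/taxi_data.csv',       # local CSV path (illustrative)
    outfile_prefix='train_transformed',
    working_dir='/tmp/taxi_tft_output',      # illustrative working directory
    pipeline_args=['--runner=DirectRunner'])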
def preprocess(query, in_test_mode):
  import os
  import os.path
  import tempfile
  from apache_beam.io import tfrecordio
  from tensorflow_transform.coders import example_proto_coder
  from tensorflow_transform.tf_metadata import dataset_metadata
  from tensorflow_transform.tf_metadata import dataset_schema
  from tensorflow_transform.beam.tft_beam_io import transform_fn_io

  job_name = 'preprocess-babyweight-features' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')    
  if in_test_mode:
    import shutil
    print('Launching local job ... hang on')
    OUTPUT_DIR = './preproc_tft'
    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
  else:
    print('Launching Dataflow job {} ... hang on'.format(job_name))
    OUTPUT_DIR = 'gs://{0}/babyweight/preproc_tft/'.format(BUCKET)
    import subprocess
    subprocess.call('gsutil rm -r {}'.format(OUTPUT_DIR).split())
    
  options = {
    'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
    'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
    'job_name': job_name,
    'project': PROJECT,
    'max_num_workers': 24,
    'teardown_policy': 'TEARDOWN_ALWAYS',
    'no_save_main_session': True,
    'requirements_file': 'requirements.txt'
  }
  opts = beam.pipeline.PipelineOptions(flags=[], **options)
  if in_test_mode:
    RUNNER = 'DirectRunner'
  else:
    RUNNER = 'DataflowRunner'

  # set up metadata  
  raw_data_schema = {
    colname : dataset_schema.ColumnSchema(tf.string, [], dataset_schema.FixedColumnRepresentation())
                   for colname in 'key,is_male,mother_race,mother_married,cigarette_use,alcohol_use'.split(',')
  }
  raw_data_schema.update({
      colname : dataset_schema.ColumnSchema(tf.float32, [], dataset_schema.FixedColumnRepresentation())
                   for colname in 'weight_pounds,mother_age,plurality,gestation_weeks'.split(',')
    })
  raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema(raw_data_schema))

  def read_rawdata(p, step, test_mode):
    if step == 'train':
        selquery = 'SELECT * FROM ({}) WHERE ABS(MOD(hashmonth, 4)) < 3'.format(query)
    else:
        selquery = 'SELECT * FROM ({}) WHERE ABS(MOD(hashmonth, 4)) = 3'.format(query)
    if in_test_mode:
        selquery = selquery + ' LIMIT 100'
    #print 'Processing {} data from {}'.format(step, selquery)
    return (p 
          | '{}_read'.format(step) >> beam.io.Read(beam.io.BigQuerySource(query=selquery, use_standard_sql=True))
          | '{}_cleanup'.format(step) >> beam.FlatMap(cleanup)
                   )
  
  # run Beam  
  with beam.Pipeline(RUNNER, options=opts) as p:
    with beam_impl.Context(temp_dir=os.path.join(OUTPUT_DIR, 'tmp')):

      # analyze and transform training       
      raw_data = read_rawdata(p, 'train', in_test_mode)
      raw_dataset = (raw_data, raw_data_metadata)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocess_tft))
      transformed_data, transformed_metadata = transformed_dataset
      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'train'),
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))
      
      # transform eval data
      raw_test_data = read_rawdata(p, 'eval', in_test_mode)
      raw_test_dataset = (raw_test_data, raw_data_metadata)
      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      transformed_test_data, _ = transformed_test_dataset
      _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'eval'),
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))
      _ = (transform_fn
           | 'WriteTransformFn' >>
           transform_fn_io.WriteTransformFn(os.path.join(OUTPUT_DIR, 'metadata')))

  job = p.run()
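
preprocess expects a BigQuery query that exposes a hashmonth column, since read_rawdata splits train and eval on it. A hedged launch example with an illustrative standard-SQL query over the public natality sample:

query = """
SELECT
  weight_pounds, is_male, mother_age, mother_race, plurality, gestation_weeks,
  mother_married, cigarette_use, alcohol_use,
  ABS(FARM_FINGERPRINT(CONCAT(CAST(year AS STRING), CAST(month AS STRING)))) AS hashmonth
FROM
  `bigquery-public-data.samples.natality`
WHERE year > 2000
"""
preprocess(query, in_test_mode=True)  # local DirectRunner run, LIMIT 100 per split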
Example #5
    def test_single_phase_run_twice(self):

        cache_location = self._make_cache_location('input_cache_1',
                                                   'output_cache_1')

        span_0_key = 'span-0'
        span_1_key = 'span-1'

        def preprocessing_fn(inputs):

            _ = tft.vocabulary(inputs['s'])

            _ = tft.bucketize(inputs['x'], 2, name='bucketize')

            return {
                'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
                'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            }

        input_data = [{'x': 12, 'y': 1, 's': 'b'}, {'x': 10, 'y': 1, 's': 'c'}]
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec({
                'x':
                tf.FixedLenFeature([], tf.float32),
                'y':
                tf.FixedLenFeature([], tf.float32),
                's':
                tf.FixedLenFeature([], tf.string),
            }))
        input_data_dict = {
            span_0_key: [{
                'x': -2,
                'y': 1,
                's': 'a',
            }, {
                'x': 4,
                'y': -4,
                's': 'a',
            }],
            span_1_key:
            input_data,
        }
        with beam_impl.Context(temp_dir=self.get_temp_dir()):

            flat_data = input_data_dict.values() | 'Flatten' >> beam.Flatten()

            transform_fn = ((flat_data, input_data_dict, input_metadata) |
                            (beam_impl.AnalyzeDatasetWithCache(
                                preprocessing_fn, cache_location)))

        transformed_dataset = ((
            (input_data_dict[span_1_key], input_metadata), transform_fn)
                               | beam_impl.TransformDataset())

        transformed_data, unused_transformed_metadata = transformed_dataset

        expected_transformed_data = [
            {
                'x_mean': 6.0,
                'x_min': -2.0,
                'y_mean': -0.25,
                'y_min': -4.0,
            },
            {
                'x_mean': 6.0,
                'x_min': -2.0,
                'y_mean': -0.25,
                'y_min': -4.0,
            },
        ]
        self.assertDataCloseOrEqual(transformed_data,
                                    expected_transformed_data)

        transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
        _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)

        for key in input_data_dict:
            key_cache_dir = os.path.join(cache_location.output_cache_dir, key)
            self.assertTrue(tf.gfile.IsDirectory(key_cache_dir))
            self.assertEqual(len(tf.gfile.ListDirectory(key_cache_dir)), 6)

        cache_location = self._make_cache_location('output_cache_1',
                                                   'output_cache_2')

        with beam_impl.Context(temp_dir=self.get_temp_dir()):

            flat_data = input_data_dict.values() | 'Flatten' >> beam.Flatten()

            transform_fn = ((flat_data, input_data_dict, input_metadata) |
                            (beam_impl.AnalyzeDatasetWithCache(
                                preprocessing_fn, cache_location)))

        transformed_dataset = ((
            (input_data_dict[span_1_key], input_metadata), transform_fn)
                               | beam_impl.TransformDataset())
        transformed_data, unused_transformed_metadata = transformed_dataset
        self.assertDataCloseOrEqual(transformed_data,
                                    expected_transformed_data)

        self.assertFalse(tf.gfile.IsDirectory(cache_location.output_cache_dir))
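
_make_cache_location is a test fixture that is not shown here; the test only touches its output_cache_dir attribute directly, so the attribute names below are assumptions. A hypothetical implementation, where the method sits on the same test class as test_single_phase_run_twice, could be as small as:

import collections
import os

_CacheLocation = collections.namedtuple(
    '_CacheLocation', ['input_cache_dir', 'output_cache_dir'])


def _make_cache_location(self, input_cache_dir_name, output_cache_dir_name):
    # Hypothetical test-class method: both cache directories live under the
    # test's base directory.
    return _CacheLocation(
        input_cache_dir=os.path.join(self.base_test_dir, input_cache_dir_name),
        output_cache_dir=os.path.join(self.base_test_dir, output_cache_dir_name))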
Example #6
  def assertAnalyzeAndTransformResults(self,
                                       input_data,
                                       input_metadata,
                                       preprocessing_fn,
                                       expected_data=None,
                                       expected_metadata=None,
                                       expected_vocab_file_contents=None,
                                       test_data=None,
                                       desired_batch_size=None,
                                       beam_pipeline=None,
                                       temp_dir=None,
                                       force_tf_compat_v1=True,
                                       output_record_batches=False):
    """Assert that input data and metadata is transformed as expected.

    This methods asserts transformed data and transformed metadata match
    with expected_data and expected_metadata.

    Args:
      input_data: Input data formatted in one of two ways:
        * A sequence of dicts whose values are one of:
          strings, lists of strings, numeric types or a pair of those.
          Must have at least one key so that we can infer the batch size, or
        * A sequence of pa.RecordBatch.
      input_metadata: One of -
        * DatasetMetadata describing input_data if `input_data` are dicts.
        * TensorAdapterConfig otherwise.
      preprocessing_fn: A function taking a dict of tensors and returning
          a dict of tensors.
      expected_data: (optional) A dataset with the same type constraints as
          input_data, but representing the output after transformation.
          If supplied, transformed data is asserted to be equal.
      expected_metadata: (optional) DatasetMetadata describing the transformed
          data. If supplied, transformed metadata is asserted to be equal.
      expected_vocab_file_contents: (optional) A dictionary from vocab filenames
          to their expected content as a list of text lines or a list of tuples
          of frequency and text. Values should be the expected result of calling
          f.readlines() on the given asset files.
      test_data: (optional) If this is provided then instead of calling
          AnalyzeAndTransformDataset with input_data, this function will call
          AnalyzeDataset with input_data and TransformDataset with test_data.
          Note that this is the case even if input_data and test_data are equal.
          test_data should also conform to input_metadata.
      desired_batch_size: (optional) A batch size to batch elements by. If not
          provided, a batch size will be computed automatically.
      beam_pipeline: (optional) A Beam Pipeline to use in this test.
      temp_dir: If set, it is used as output directory, else a new unique
          directory is created.
      force_tf_compat_v1: A bool. If `True`, TFT's public APIs use Tensorflow
          in compat.v1 mode.
      output_record_batches: (optional) A bool. If `True`, `TransformDataset`
          and `AnalyzeAndTransformDataset` output `pyarrow.RecordBatch`es;
          otherwise, they output instance dicts.
    Raises:
      AssertionError: if the expected data does not match the results of
          transforming input_data according to preprocessing_fn, or
          (if provided) if the expected metadata does not match.
    """

    expected_vocab_file_contents = expected_vocab_file_contents or {}

    # Note: we don't separately test AnalyzeDataset and TransformDataset as
    # AnalyzeAndTransformDataset currently simply composes these two
    # transforms.  If in future versions of the code, the implementation
    # differs, we should also run AnalyzeDataset and TransformDataset composed.
    temp_dir = temp_dir or tempfile.mkdtemp(
        prefix=self._testMethodName, dir=self.get_temp_dir())
    with beam_pipeline or self._makeTestPipeline() as pipeline:
      with beam_impl.Context(
          temp_dir=temp_dir,
          desired_batch_size=desired_batch_size,
          force_tf_compat_v1=force_tf_compat_v1):
        input_data = pipeline | 'CreateInput' >> beam.Create(input_data,
                                                             reshuffle=False)
        if test_data is None:
          (transformed_data, transformed_metadata), transform_fn = (
              (input_data, input_metadata)
              | beam_impl.AnalyzeAndTransformDataset(
                  preprocessing_fn,
                  output_record_batches=output_record_batches))
        else:
          transform_fn = ((input_data, input_metadata)
                          | beam_impl.AnalyzeDataset(preprocessing_fn))
          test_data = pipeline | 'CreateTest' >> beam.Create(test_data)
          transformed_data, transformed_metadata = (
              ((test_data, input_metadata), transform_fn)
              | beam_impl.TransformDataset(
                  output_record_batches=output_record_batches))

        # Write transform_fn so we can test its assets
        _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

        transformed_data_path = os.path.join(temp_dir, 'transformed_data')
        if expected_data is not None:
          if output_record_batches:

            def record_batch_to_examples(data_batch):
              # Ignore unary pass-through features.
              record_batch, _ = data_batch
              return example_coder.RecordBatchToExamples(record_batch)

            encode_ptransform = beam.FlatMap(record_batch_to_examples)
          else:
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)
            encode_ptransform = beam.Map(transformed_data_coder.encode)

          _ = (
              transformed_data
              | encode_ptransform
              | beam.io.tfrecordio.WriteToTFRecord(
                  transformed_data_path, shard_name_template=''))

    # TODO(ebreck) Log transformed_data somewhere.
    if expected_data is not None:
      examples = tf.compat.v1.python_io.tf_record_iterator(
          path=transformed_data_path)
      shapes = {
          f.name:
          [s.size for s in f.shape.dim] if f.HasField('shape') else [-1]
          for f in transformed_metadata.schema.feature
      }
      transformed_data = [
          _format_example_as_numpy_dict(e, shapes) for e in examples
      ]
      self.assertDataCloseOrEqual(expected_data, transformed_data)

    tf_transform_output = tft.TFTransformOutput(temp_dir)
    if expected_metadata:
      # Make a copy with no annotations.
      transformed_schema = schema_pb2.Schema()
      transformed_schema.CopyFrom(
          tf_transform_output.transformed_metadata.schema)
      transformed_schema.ClearField('annotation')
      for feature in transformed_schema.feature:
        feature.ClearField('annotation')

      # assertProtoEqual has a size limit on the length of the
      # serialized as text strings. Therefore, we first try to use
      # assertProtoEqual, if that fails we try to use assertEqual, if that fails
      # as well then we raise the exception from assertProtoEqual.
      try:
        compare.assertProtoEqual(self, expected_metadata.schema,
                                 transformed_schema)
      except AssertionError as compare_exception:
        try:
          self.assertEqual(expected_metadata.schema, transformed_schema)
        except AssertionError:
          raise compare_exception

    for filename, file_contents in six.iteritems(expected_vocab_file_contents):
      full_filename = tf_transform_output.vocabulary_file_by_name(filename)
      self.AssertVocabularyContents(full_filename, file_contents)
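
A hedged sketch of how a test in the same class might use this helper; the feature name, the scale_to_0_1 analyzer and the metadata construction (which mirrors Example #5) are illustrative:

  def test_scale_to_0_1(self):
    input_data = [{'x': 4.0}, {'x': 1.0}, {'x': 5.0}, {'x': 2.0}]
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(
            {'x': tf.FixedLenFeature([], tf.float32)}))

    def preprocessing_fn(inputs):
      return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

    # min(x) == 1 and max(x) == 5, so each value maps to (x - 1) / 4.
    expected_data = [{'x_scaled': 0.75}, {'x_scaled': 0.0},
                     {'x_scaled': 1.0}, {'x_scaled': 0.25}]
    self.assertAnalyzeAndTransformResults(
        input_data, input_metadata, preprocessing_fn, expected_data)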
Example #7
def transform_data(working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 values indices.

  Args:
    working_dir: Directory to read shuffled data from and write transformed data
        and metadata to.
  """

    with beam.Pipeline() as pipeline:
        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            coder = tft.coders.ExampleProtoCoder(RAW_DATA_METADATA.schema)
            train_data = (pipeline
                          | 'ReadTrain' >> tfrecordio.ReadFromTFRecord(
                              os.path.join(working_dir,
                                           SHUFFLED_TRAIN_DATA_FILEBASE + '*'))
                          | 'DecodeTrain' >> beam.Map(coder.decode))

            test_data = (pipeline
                         | 'ReadTest' >> tfrecordio.ReadFromTFRecord(
                             os.path.join(working_dir,
                                          SHUFFLED_TEST_DATA_FILEBASE + '*'))
                         | 'DecodeTest' >> beam.Map(coder.decode))

            def preprocessing_fn(inputs):
                """Preprocess input columns into transformed columns."""
                review = inputs[REVIEW_KEY]

                review_tokens = tf.string_split(review, DELIMITERS)
                review_indices = tft.compute_and_apply_vocabulary(
                    review_tokens, top_k=VOCAB_SIZE)
                # Add one for the oov bucket created by compute_and_apply_vocabulary.
                review_bow_indices, review_weight = tft.tfidf(
                    review_indices, VOCAB_SIZE + 1)
                return {
                    REVIEW_KEY: review_bow_indices,
                    REVIEW_WEIGHT_KEY: review_weight,
                    LABEL_KEY: inputs[LABEL_KEY]
                }

            (transformed_train_data, transformed_metadata), transform_fn = (
                (train_data, RAW_DATA_METADATA)
                | 'AnalyzeAndTransform' >>
                beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            transformed_test_data, _ = (
                ((test_data, RAW_DATA_METADATA), transform_fn)
                | 'Transform' >> beam_impl.TransformDataset())

            _ = (transformed_train_data
                 | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
                 | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                     os.path.join(working_dir,
                                  TRANSFORMED_TRAIN_DATA_FILEBASE)))

            _ = (
                transformed_test_data
                | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
                | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

            # Will write a SavedModel and metadata to two subdirectories of
            # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and
            # transform_fn_io.TRANSFORMED_METADATA_DIR respectively.
            _ = (transform_fn
                 | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(working_dir))
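
transform_data above assumes several module-level constants that are not part of this snippet. Plausible values, based on the sentiment-analysis example this appears to come from, are sketched below; treat them as assumptions rather than the original definitions:

VOCAB_SIZE = 20000                  # vocabulary size for the review tokens
DELIMITERS = '.,!?() '              # token delimiters passed to tf.string_split
REVIEW_KEY = 'review'
REVIEW_WEIGHT_KEY = 'review_weight'
LABEL_KEY = 'label'
SHUFFLED_TRAIN_DATA_FILEBASE = 'train_shuffled'
SHUFFLED_TEST_DATA_FILEBASE = 'test_shuffled'
TRANSFORMED_TRAIN_DATA_FILEBASE = 'train_transformed'
TRANSFORMED_TEST_DATA_FILEBASE = 'test_transformed'
# RAW_DATA_METADATA would be a dataset_metadata.DatasetMetadata describing the
# raw review (string) and label features.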
def preprocess(in_test_mode):
  import os
  import os.path
  import tempfile
  from apache_beam.io import tfrecordio
  from tensorflow_transform.coders import example_proto_coder
  from tensorflow_transform.tf_metadata import dataset_metadata
  from tensorflow_transform.tf_metadata import dataset_schema
  from tensorflow_transform.beam import tft_beam_io
  from tensorflow_transform.beam.tft_beam_io import transform_fn_io

  job_name = 'preprocess-taxi-features' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')    
  if in_test_mode:
    import shutil
    print('Launching local job ... hang on')
    OUTPUT_DIR = './preproc_tft'
    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
    EVERY_N = 100000
  else:
    print('Launching Dataflow job {} ... hang on'.format(job_name))
    OUTPUT_DIR = 'gs://{0}/taxifare/preproc_tft/'.format(BUCKET)
    import subprocess
    subprocess.call('gsutil rm -r {}'.format(OUTPUT_DIR).split())
    EVERY_N = 10000
    
  options = {
    'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
    'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
    'job_name': job_name,
    'project': PROJECT,
    'max_num_workers': 24,
    'teardown_policy': 'TEARDOWN_ALWAYS',
    'no_save_main_session': True,
    'requirements_file': 'requirements.txt'
  }
  opts = beam.pipeline.PipelineOptions(flags=[], **options)
  if in_test_mode:
    RUNNER = 'DirectRunner'
  else:
    RUNNER = 'DataflowRunner'

  # set up metadata
  raw_data_schema = {
    colname : dataset_schema.ColumnSchema(tf.string, [], dataset_schema.FixedColumnRepresentation())
                   for colname in 'dayofweek,key'.split(',')
  }
  raw_data_schema.update({
      colname : dataset_schema.ColumnSchema(tf.float32, [], dataset_schema.FixedColumnRepresentation())
                   for colname in 'fare_amount,pickuplon,pickuplat,dropofflon,dropofflat'.split(',')
    })
  raw_data_schema.update({
      colname : dataset_schema.ColumnSchema(tf.int64, [], dataset_schema.FixedColumnRepresentation())
                   for colname in 'hourofday,passengers'.split(',')
    })
  raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema(raw_data_schema))

  # run Beam  
  with beam.Pipeline(RUNNER, options=opts) as p:
    with beam_impl.Context(temp_dir=os.path.join(OUTPUT_DIR, 'tmp')):
      # save the raw data metadata
      _ = (raw_data_metadata
        | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
            os.path.join(OUTPUT_DIR, 'metadata/rawdata_metadata'),
            pipeline=p))
      
      # analyze and transform training       
      raw_data = (p 
        | 'train_read' >> beam.io.Read(beam.io.BigQuerySource(query=create_query(1, EVERY_N), use_standard_sql=True))
        | 'train_filter' >> beam.Filter(is_valid))

      raw_dataset = (raw_data, raw_data_metadata)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocess_tft))
      transformed_data, transformed_metadata = transformed_dataset
      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'train'),
          file_name_suffix='.gz',
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))
      
      # transform eval data
      raw_test_data = (p 
        | 'eval_read' >> beam.io.Read(beam.io.BigQuerySource(query=create_query(2, EVERY_N), use_standard_sql=True))
        | 'eval_filter' >> beam.Filter(is_valid))
      
      raw_test_dataset = (raw_test_data, raw_data_metadata)
      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      transformed_test_data, _ = transformed_test_dataset
      _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'eval'),
          file_name_suffix='.gz',
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))
      _ = (transform_fn
           | 'WriteTransformFn' >>
           transform_fn_io.WriteTransformFn(os.path.join(OUTPUT_DIR, 'metadata')))
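
As with the babyweight variant earlier, this job can be smoke-tested locally before submitting to Dataflow. PROJECT and BUCKET (and the create_query/is_valid helpers) are assumed to be defined elsewhere in the notebook; the values here are placeholders:

PROJECT = 'my-gcp-project'    # placeholder
BUCKET = 'my-gcs-bucket'      # placeholder

preprocess(in_test_mode=True)  # local DirectRunner run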
Example #9
def write_to_tfrecord(args):
    """
    This function is supposed to be called as a script.
    """
    # Decode arguments
    current_index, num_shards, train_split_fname_out, eval_split_fname_out, \
    exp_log_data_file_train_tfrecord, exp_log_data_file_eval_tfrecord, working_dir, data_formatter_module_path = args

    # num_shards = "32"
    current_index, num_shards = int(current_index), int(num_shards)

    split_train_file_pattern = '{}-{:05}-of-{:05}'.format(
        train_split_fname_out, current_index, num_shards) + '*'
    split_eval_file_pattern = '{}-{:05}-of-{:05}'.format(
        eval_split_fname_out, current_index, num_shards)

    log.info('exp_log_data_file_train_tfrecord {}'.format(
        exp_log_data_file_train_tfrecord))
    log.info('exp_log_data_file_eval_tfrecord {}'.format(
        exp_log_data_file_eval_tfrecord))
    log.info('split_train_file_pattern {}'.format(split_train_file_pattern))
    log.info('split_eval_file_pattern {}'.format(split_eval_file_pattern))

    data_formatter = import_from_uri(
        data_formatter_module_path).DataFormatter()

    # Set up the preprocessing pipeline.
    pipeline = beam.Pipeline(runner=DirectRunner())

    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        # Read the raw data files: CSV rows ordered according to the
        # `data_formatter`, which are then decoded into a cleaned-up format.
        raw_train_data = (
            pipeline
            | 'ReadTrainDataFile' >> textio.ReadFromText(
                split_train_file_pattern, skip_header_lines=0)
            | 'DecodeTrainDataCSV' >> MapAndFilterErrors(
                tft_coders.CsvCoder(
                    data_formatter.get_features_and_targets(),
                    data_formatter.get_features_metadata().schema).decode))

        raw_eval_data = (
            pipeline
            | 'ReadEvalDataFile' >> textio.ReadFromText(
                split_eval_file_pattern, skip_header_lines=0)
            | 'DecodeEvalDataCSV' >> MapAndFilterErrors(
                tft_coders.CsvCoder(
                    data_formatter.get_features_and_targets(),
                    data_formatter.get_features_metadata().schema).decode))

        # Examples in tf-example format (for model analysis purposes).
        # raw_feature_spec = data_formatter.RAW_DATA_METADATA.schema.as_feature_spec()
        # raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
        # coder = example_proto_coder.ExampleProtoCoder(raw_schema)
        #
        # _ = (
        #         raw_eval_data
        #         | 'ToSerializedTFExample' >> beam.Map(coder.encode)
        #         | 'WriteAnalysisTFRecord' >> tfrecordio.WriteToTFRecord(
        #     '{}-{:05}-of-{:05}'.format(analysis_fname_out, i, num_shards),
        #     shard_name_template='', num_shards=1)
        # )

        # Write SavedModel and metadata to two subdirectories of working_dir, given by
        # `transform_fn_io.TRANSFORM_FN_DIR` and `transform_fn_io.TRANSFORMED_METADATA_DIR` respectively.
        transform_fn = (pipeline
                        | 'ReadTransformGraph' >>
                        transform_fn_io.ReadTransformFn(working_dir))

        # Applies the transformation `transform_fn` to the raw train dataset
        (transformed_train_data, transformed_metadata) = (
            ((raw_train_data, data_formatter.get_features_metadata()),
             transform_fn)
            | 'TransformTrainData' >> beam_impl.TransformDataset())

        # Applies the transformation `transform_fn` to the raw eval dataset
        (transformed_eval_data, transformed_metadata) = (
            ((raw_eval_data, data_formatter.get_features_metadata()),
             transform_fn)
            | 'TransformEvalData' >> beam_impl.TransformDataset())

        # The schema of the transformed data is used to build an ExampleProtoCoder,
        # which encodes the transformed examples into the TFRecord (TF binary)
        # files written below.
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)

        _ = (transformed_train_data
             | 'EncodeTrainDataTransform' >> MapAndFilterErrors(
                 transformed_data_coder.encode)
             | 'WriteTrainDataTFRecord' >>
             tfrecordio.WriteToTFRecord('{}-{:05}-of-{:05}'.format(
                 exp_log_data_file_train_tfrecord, current_index, num_shards),
                                        shard_name_template='',
                                        num_shards=1))

        _ = (transformed_eval_data
             | 'EncodeEvalDataTransform' >> MapAndFilterErrors(
                 transformed_data_coder.encode)
             | 'WriteEvalDataTFRecord' >>
             tfrecordio.WriteToTFRecord('{}-{:05}-of-{:05}'.format(
                 exp_log_data_file_eval_tfrecord, current_index, num_shards),
                                        shard_name_template='',
                                        num_shards=1))

    result = pipeline.run()
    result.wait_until_finish()
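
The script packs its eight positional arguments into a single tuple, mirroring the unpacking at the top of write_to_tfrecord. A hypothetical invocation (all paths and the formatter module are illustrative):

args = (
    '0',                                # current_index: which shard to process
    '32',                               # num_shards
    'gs://my-bucket/splits/train',      # train_split_fname_out
    'gs://my-bucket/splits/eval',       # eval_split_fname_out
    'gs://my-bucket/tfrecords/train',   # exp_log_data_file_train_tfrecord
    'gs://my-bucket/tfrecords/eval',    # exp_log_data_file_eval_tfrecord
    'gs://my-bucket/working_dir',       # working_dir holding the transform_fn
    'my_package.data_formatter',        # data_formatter_module_path
)
write_to_tfrecord(args)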
Example #10
    def assertAnalyzeAndTransformResults(self,
                                         input_data,
                                         input_metadata,
                                         preprocessing_fn,
                                         expected_data=None,
                                         expected_metadata=None,
                                         only_check_core_metadata=False):
        """Assert that input data and metadata is transformed as expected.

    This methods asserts transformed data and transformed metadata match
    with expected_data and expected_metadata.

    Args:
      input_data: A sequence of dicts whose values are
          either strings, lists of strings, numeric types or a pair of those.
      input_metadata: DatasetMetadata describing input_data.
      preprocessing_fn: A function taking a dict of tensors and returning
          a dict of tensors.
      expected_data: (optional) A dataset with the same type constraints as
          input_data, but representing the output after transformation.
          If supplied, transformed data is asserted to be equal.
      expected_metadata: (optional) DatasetMetadata describing the transformed
          data. If supplied, transformed metadata is asserted to be equal.
      only_check_core_metadata: A boolean. If True, only the transformed
          feature names, dtypes and representations are asserted to match
          expected_metadata; otherwise the full transformed metadata is
          compared.
    Raises:
      AssertionError: if the expected data does not match the results of
          transforming input_data according to preprocessing_fn, or
          (if provided) if the expected metadata does not match.
    """
        # Note: we don't separately test AnalyzeDataset and TransformDataset as
        # AnalyzeAndTransformDataset currently simply composes these two
        # transforms.  If in future versions of the code, the implementation
        # differs, we should also run AnalyzeDataset and TransformDataset composed.
        #
        # Also, the dataset_metadata that is returned along with
        # `transformed_data` is incomplete as it does not contain the deferred
        # components, so we instead inspect the metadata returned along with the
        # transform function.
        temp_dir = self.get_temp_dir()
        with beam_impl.Context(temp_dir=temp_dir):
            transform_fn, transformed_metadata = (
                (input_data, input_metadata)
                |
                'AnalyzeDataset' >> beam_impl.AnalyzeDataset(preprocessing_fn))
            transformed_data, _ = (
                ((input_data, input_metadata),
                 (transform_fn, transformed_metadata))
                | 'TransformDataset' >> beam_impl.TransformDataset())

        if expected_data:
            self.assertDataCloseOrEqual(expected_data, transformed_data)

        if not expected_metadata:
            return

        transformed_metadata = self._resolveDeferredMetadata(
            transformed_metadata)

        if only_check_core_metadata:
            # preprocessing_fn may add metadata to column schema only relevant to
            # internal implementation such as vocabulary_file. As such, only check
            # feature names, dtypes and representations are as expected.
            self.assertSameElements(
                transformed_metadata.schema.column_schemas.keys(),
                expected_metadata.schema.column_schemas.keys())

            for k, v in transformed_metadata.schema.column_schemas.iteritems():
                expected_schema = expected_metadata.schema.column_schemas[k]

                self.assertEqual(
                    expected_schema.representation, v.representation,
                    "representation doesn't match for feature '%s'" % k)
                self.assertEqual(expected_schema.domain.dtype, v.domain.dtype,
                                 "dtype doesn't match for feature '%s'" % k)

        else:
            # Check the entire DatasetMetadata is as expected.
            # Use extra assertEqual for schemas, since full metadata assertEqual
            # error message is not conducive to debugging.
            self.assertEqual(expected_metadata.schema.column_schemas,
                             transformed_metadata.schema.column_schemas)

            self.assertEqual(expected_metadata, transformed_metadata)
def data_transformation(EPOCHS: int, STEPS: int, BATCH_SIZE: int, HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0]
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "schema" not in _kale_directory_file_names:
        raise ValueError("schema" + " does not exists in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "schema"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "schema" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    schema = _kale_resource_load(os.path.join(
        _kale_data_directory, _kale_load_file_name))

    if "column_names" not in _kale_directory_file_names:
        raise ValueError("column_names" + " does not exists in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "column_names"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "column_names" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    column_names = _kale_resource_load(os.path.join(
        _kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import os
    import shutil
    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io
    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(
        DATA_DIR, 'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = ['trip_start_hour',
                                'trip_start_day', 'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = [
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract', 'payment_type', 'company',
                          'pickup_community_area', 'dropoff_community_area']

    # allow nan values in these features.
    OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude', 'pickup_census_tract', 'dropoff_census_tract',
                         'company', 'trip_seconds', 'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    def to_dense(tensor):
        """Takes as input a SparseTensor and return a Tensor with correct default value
        Args:
          tensor: tf.SparseTensor
        Returns:
          tf.Tensor with default value
        """
        if not isinstance(tensor, tf.sparse.SparseTensor):
            return tensor
        if tensor.dtype == tf.string:
            default_value = ''
        elif tensor.dtype == tf.float32:
            default_value = 0.0
        elif tensor.dtype == tf.int32:
            default_value = 0
        else:
            raise ValueError(f"Tensor type not recognized: {tensor.dtype}")

        return tf.squeeze(tf.sparse_to_dense(tensor.indices,
                                             [tensor.dense_shape[0], 1],
                                             tensor.values, default_value=default_value), axis=1)
        # TODO: Update to below version
        # return tf.squeeze(tf.sparse.to_dense(tensor, default_value=default_value), axis=1)

    def preprocess_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.
        Args:
          inputs: map from feature keys to raw not-yet-transformed features.
        Returns:
          Map from string feature key to transformed feature operations.
        """
        outputs = {}
        for key in DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[key] = tft.scale_to_z_score(to_dense(inputs[key]))

        for key in VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            if inputs[key].dtype == tf.string:
                vocab_tensor = to_dense(inputs[key])
            else:
                vocab_tensor = tf.as_string(to_dense(inputs[key]))
            outputs[key] = tft.compute_and_apply_vocabulary(
                vocab_tensor, vocab_filename='vocab_' + key,
                top_k=VOCAB_SIZE, num_oov_buckets=OOV_SIZE)

        for key in BUCKET_FEATURE_KEYS:
            outputs[key] = tft.bucketize(
                to_dense(inputs[key]), FEATURE_BUCKET_COUNT)

        for key in CATEGORICAL_FEATURE_KEYS:
            outputs[key] = tf.cast(to_dense(inputs[key]), tf.int64)

        taxi_fare = to_dense(inputs[FARE_KEY])
        taxi_tip = to_dense(inputs[LABEL_KEY])
        # Test if the tip was > 20% of the fare.
        tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
        outputs[LABEL_KEY] = tf.logical_and(
            tf.logical_not(tf.math.is_nan(taxi_fare)),
            tf.greater(taxi_tip, tip_threshold))

        for key in outputs:
            if outputs[key].dtype == tf.bool:
                outputs[key] = tft.compute_and_apply_vocabulary(tf.as_string(outputs[key]),
                                                                vocab_filename='vocab_' + key)

        return outputs
    trns_output = os.path.join(DATA_DIR, "transformed")
    if os.path.exists(trns_output):
        shutil.rmtree(trns_output)

    tft_input_metadata = dataset_metadata.DatasetMetadata(schema)

    runner = 'DirectRunner'
    with beam.Pipeline(runner, options=None) as p:
        with beam_impl.Context(temp_dir=os.path.join(trns_output, 'tmp')):
            converter = CsvCoder(column_names, tft_input_metadata.schema)

            # READ TRAIN DATA
            train_data = (
                p
                | 'ReadTrainData' >> textio.ReadFromText(TRAIN_DATA, skip_header_lines=1)
                | 'DecodeTrainData' >> beam.Map(converter.decode))

            # TRANSFORM TRAIN DATA (and get transform_fn function)
            transformed_dataset, transform_fn = (
                (train_data, tft_input_metadata) | beam_impl.AnalyzeAndTransformDataset(preprocess_fn))
            transformed_data, transformed_metadata = transformed_dataset

            # SAVE TRANSFORMED TRAIN DATA
            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                os.path.join(trns_output, 'train'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            # READ EVAL DATA
            eval_data = (
                p
                | 'ReadEvalData' >> textio.ReadFromText(EVALUATION_DATA, skip_header_lines=1)
                | 'DecodeEvalData' >> beam.Map(converter.decode))

            # TRANSFORM EVAL DATA (using previously created transform_fn function)
            eval_dataset = (eval_data, tft_input_metadata)
            transformed_eval_data, transformed_metadata = (
                (eval_dataset, transform_fn) | beam_impl.TransformDataset())

            # SAVE EVAL DATA
            _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
                os.path.join(trns_output, 'eval'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            # SAVE transform_fn FUNCTION FOR LATER USE
            # TODO: check out what is the transform function (transform_fn) that came from previous step
            _ = (transform_fn | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(trns_output))

            # SAVE TRANSFORMED METADATA
            metadata_io.write_metadata(
                metadata=tft_input_metadata,
                path=os.path.join(trns_output, 'metadata'))

    # -----------------------DATA SAVING START---------------------------------
    if "trns_output" in locals():
        _kale_resource_save(trns_output, os.path.join(
            _kale_data_directory, "trns_output"))
    else:
        print("_kale_resource_save: `trns_output` not found.")
Example #12
def run(flags, pipeline_args):
    """Run Apache Beam pipeline to generate TFRecords for Survival Analysis"""
    options = PipelineOptions(flags=[], **pipeline_args)
    options.view_as(WorkerOptions).machine_type = flags.machine_type
    temp_dir = os.path.join(flags.output_dir, 'tmp')
    runner = 'DataflowRunner' if flags.cloud else 'DirectRunner'

    files = tf.gfile.Glob(flags.input_dir + "*")
    if not flags.cloud:
        # If running locally for testing, process fewer files.
        files = files[0:20]

    logging.warning("Number of files: " + str(len(files)))
    labels = get_labels_array(
        "gs://columbia-dl-storage-bucket/ADNI_t1_list_with_fsstatus_20190111.csv"
    )

    with beam.Pipeline(runner, options=options) as p:
        with tft_beam.Context(temp_dir=temp_dir):

            input_metadata = dataset_metadata.DatasetMetadata(
                dataset_schema.from_feature_spec(features.RAW_FEATURE_SPEC))

            filenames = (p | 'Create filenames' >> beam.Create(files))
            nii = (filenames | 'Read NII' >> beam.Map(read_nii))
            nii_with_labels = (
                nii
                | 'Get Label' >> beam.FlatMap(lambda x: read_label(x, labels)))

            raw_train, raw_eval, raw_test = (
                nii_with_labels | 'RandomlySplitData' >> randomly_split(
                    train_size=.7, validation_size=.15, test_size=.15))

            raw_train = raw_train | 'FlattenTrain' >> beam.FlatMap(
                lambda x: x[1])
            raw_eval = (raw_eval
                        | 'FlattenEval' >> beam.FlatMap(lambda x: x[1]))
            raw_test = (raw_test
                        | 'FlattenTest' >> beam.FlatMap(lambda x: x[1]))

            raw_train | 'CountLabelFreq' >> extractAndCount(flags.output_dir)

            dataset_and_metadata, transform_fn = (
                (raw_train, input_metadata)
                | 'TransformData' >> tft_beam.AnalyzeAndTransformDataset(
                    features.preprocess))
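            # NOTE: the AnalyzeDataset step below re-runs the analysis pass and
            # overwrites the transform_fn already produced by
            # AnalyzeAndTransformDataset above, so the analyzers run twice.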
            transform_fn = (
                (raw_train, input_metadata)
                |
                'AnalyzeTrain' >> tft_beam.AnalyzeDataset(features.preprocess))
            _ = (transform_fn
                 | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(
                     flags.output_dir))
            for dataset_type, dataset in [('Train', raw_train),
                                          ('Eval', raw_eval),
                                          ('Predict', raw_test)]:

                transform_label = 'Transform{}'.format(dataset_type)
                t, metadata = (((dataset, input_metadata), transform_fn)
                               |
                               transform_label >> tft_beam.TransformDataset())
                if dataset_type == 'Train':
                    _ = (metadata
                         | 'WriteMetadata' >>
                         tft_beam_io.WriteMetadata(os.path.join(
                             flags.output_dir, 'transformed_metadata'),
                                                   pipeline=p))
                write_label = 'Write{}TFRecord'.format(dataset_type)
                _ = t | write_label >> WriteTFRecord(
                    dataset_type, flags.output_dir, metadata)
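
WriteTFRecord and extractAndCount are helper transforms that are not shown in this example. A minimal sketch of what a WriteTFRecord stand-in might look like (the output layout under output_dir is an assumption, not taken from the original):

import os
import apache_beam as beam
import tensorflow_transform as tft


@beam.ptransform_fn
def WriteTFRecord(pcoll, dataset_type, output_dir, metadata):
    # Hypothetical stand-in: encode transformed examples with the transformed
    # schema and write them out as sharded TFRecord files.
    coder = tft.coders.ExampleProtoCoder(metadata.schema)
    return (pcoll
            | 'Encode' >> beam.Map(coder.encode)
            | 'Write' >> beam.io.tfrecordio.WriteToTFRecord(
                os.path.join(output_dir, dataset_type.lower(), 'part'),
                file_name_suffix='.tfrecord'))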
Example #13
def preprocess(pipeline, args):
    input_metadata = metadata_io.read_metadata(
        os.path.join(args.analyze_output_dir, RAW_METADATA_DIR))

    schema = json.loads(
        file_io.read_file_to_string(
            os.path.join(args.analyze_output_dir, SCHEMA_FILE)).decode())
    features = json.loads(
        file_io.read_file_to_string(
            os.path.join(args.analyze_output_dir, FEATURES_FILE)).decode())

    column_names = [col['name'] for col in schema]

    exclude_outputs = None
    if not args.target:
        for name, transform in six.iteritems(features):
            if transform['transform'] == TARGET_TRANSFORM:
                target_name = name
                column_names.remove(target_name)
                exclude_outputs = [target_name]
                del input_metadata.schema.column_schemas[target_name]
                break

    if args.csv_file_pattern:
        coder = coders.CsvCoder(column_names,
                                input_metadata.schema,
                                delimiter=',')
        raw_data = (
            pipeline
            | 'ReadCsvData' >> beam.io.ReadFromText(args.csv_file_pattern)
            | 'ParseCsvData' >> beam.Map(coder.decode))
    else:
        columns = ', '.join(column_names)
        query = 'SELECT {columns} FROM `{table}`'.format(
            columns=columns, table=args.bigquery_table)
        raw_data = (
            pipeline
            | 'ReadBigQueryData' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))

    # Note that prepare_image_transforms does not make embeddings, it just reads
    # the image files and converts them to byte strings. tft.TransformDataset()
    # will apply the saved model that makes the image embeddings.
    image_columns = image_transform_columns(features)
    raw_data = (raw_data
                | 'PreprocessTransferredLearningTransformations' >> beam.Map(
                    prepare_image_transforms, image_columns))

    if args.shuffle:
        raw_data = raw_data | 'ShuffleData' >> shuffle()

    transform_fn = (pipeline
                    | 'ReadTransformFn' >> tft_beam_io.ReadTransformFn(
                        args.analyze_output_dir))

    (transformed_data,
     transform_metadata) = (((raw_data, input_metadata), transform_fn)
                            | 'ApplyTensorflowPreprocessingGraph' >>
                            tft.TransformDataset(exclude_outputs))

    tfexample_coder = coders.ExampleProtoCoder(transform_metadata.schema)
    _ = (transformed_data
         | 'SerializeExamples' >> beam.Map(tfexample_coder.encode)
         | 'WriteExamples' >> beam.io.WriteToTFRecord(
             os.path.join(args.output_dir, args.output_filename_prefix),
             file_name_suffix='.tfrecord.gz'))
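
This preprocess function expects an already-constructed pipeline object. A hedged sketch of a driver (the DirectRunner choice and the args object are assumptions; args would be parsed elsewhere):

import apache_beam as beam

# Hypothetical driver: build a pipeline, attach the preprocessing steps above,
# then run it to completion.
pipeline = beam.Pipeline('DirectRunner')
preprocess(pipeline, args)
pipeline.run().wait_until_finish()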
Example #14
def tftransform(
    pipeline_args,  # type: List[str]
    temp_location,  # type: str
    schema_file,  # type: str
    output_dir,  # type: str
    preprocessing_fn,  # type: Any
    training_data=None,  # type: Union[None, str]
    evaluation_data=None,  # type: Union[None, str]
    transform_fn_dir=None,  # type: Union[None, str]
    compression_type=None  # type: Union[None, str]
):  # type: (...) -> PipelineState
    """
    Generic tf.transform pipeline that takes tf.{example, record} training and evaluation
    datasets and outputs transformed data together with transform function Saved Model.

    :param pipeline_args: un-parsed Dataflow arguments
    :param temp_location: temporary location for dataflow job working dir
    :param schema_file: path to the raw feature schema text file
    :param output_dir: output dir for transformed data and function
    :param preprocessing_fn: tf.transform preprocessing function
    :param training_data: path to the training data
    :param evaluation_data: path to the evaluation data
    :param transform_fn_dir: dir to previously saved transformation function to apply
    :param compression_type: compression type for writing of tf.records
    :return: the final state of the Beam pipeline
    """
    assert_not_empty_string(temp_location)
    assert_not_empty_string(schema_file)
    assert_not_empty_string(output_dir)
    assert_not_none(preprocessing_fn)

    if compression_type is None:
        compression_type = CompressionTypes.AUTO

    raw_feature_spec = schema_txt_file_to_feature_spec(schema_file)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)
    raw_data_coder = ExampleProtoCoder(raw_data_metadata.schema)

    transformed_train_output_dir = os.path.join(output_dir, "training")
    transformed_eval_output_dir = os.path.join(output_dir, "evaluation")

    if not any(i.startswith("--job_name") for i in pipeline_args):
        pipeline_args.append("--job_name=tf-transform-{}-{}".format(
            getpass.getuser(), int(time.time())))

    pipeline = beam.Pipeline(argv=pipeline_args)
    with beam_impl.Context(temp_dir=temp_location):
        if training_data is not None:
            # if training data is provided, transform_fn_dir will be ignored
            if transform_fn_dir is not None:
                warnings.warn(
                    "Transform_fn_dir is ignored because training_data is provided"
                )

            transform_fn_output = os.path.join(output_dir, "transform_fn",
                                               "saved_model.pb")
            if FileSystems.exists(transform_fn_output):
                raise ValueError("Transform fn already exists at %s!" %
                                 transform_fn_output)

            # compute the transform_fn and apply to the training data
            raw_train_data = (pipeline
                              | "ReadTrainData" >> tfrecordio.ReadFromTFRecord(
                                  training_data, coder=raw_data_coder))

            ((transformed_train_data, transformed_train_metadata),
             transform_fn) = (
                 (raw_train_data, raw_data_metadata)
                 | ("AnalyzeAndTransformTrainData" >>
                    beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
             )  # noqa: E501

            _ = (  # noqa: F841
                transform_fn
                | "WriteTransformFn" >>
                transform_fn_io.WriteTransformFn(output_dir))

            transformed_train_coder = ExampleProtoCoder(
                transformed_train_metadata.schema)
            _ = (  # noqa: F841
                transformed_train_data
                | "WriteTransformedTrainData" >> tfrecordio.WriteToTFRecord(
                    os.path.join(transformed_train_output_dir,
                                 "part"),  # noqa: E501
                    coder=transformed_train_coder,  # noqa: E501
                    compression_type=compression_type,  # noqa: E501
                    file_name_suffix=".tfrecords"))  # noqa: E501
        else:
            if transform_fn_dir is None:
                raise ValueError(
                    "Either training_data or transformed_fn needs to be provided"
                )
            # load the transform_fn
            transform_fn = pipeline | transform_fn_io.ReadTransformFn(
                transform_fn_dir)

        if evaluation_data is not None:
            # if evaluation_data exists, apply the transform_fn to the evaluation data
            raw_eval_data = (pipeline
                             | "ReadEvalData" >> tfrecordio.ReadFromTFRecord(
                                 evaluation_data, coder=raw_data_coder))

            (transformed_eval_data, transformed_eval_metadata) = (
                ((raw_eval_data, raw_data_metadata), transform_fn)
                | "TransformEvalData" >> beam_impl.TransformDataset())

            transformed_eval_coder = ExampleProtoCoder(
                transformed_eval_metadata.schema)
            _ = (  # noqa: F841
                transformed_eval_data
                | "WriteTransformedEvalData" >> tfrecordio.WriteToTFRecord(
                    os.path.join(transformed_eval_output_dir,
                                 "part"),  # noqa: E501
                    coder=transformed_eval_coder,  # noqa: E501
                    compression_type=compression_type,  # noqa: E501
                    file_name_suffix=".tfrecords"))  # noqa: E501
    result = pipeline.run().wait_until_finish()

    return result
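
A hedged usage sketch for the tftransform helper above; the runner, paths, and my_preprocessing_fn are placeholders rather than values from the original:

# Hypothetical invocation with a local runner; all paths are placeholders.
state = tftransform(
    pipeline_args=['--runner=DirectRunner'],
    temp_location='/tmp/tft-temp',
    schema_file='/tmp/schema.txt',
    output_dir='/tmp/tft-output',
    preprocessing_fn=my_preprocessing_fn,  # assumed to be defined elsewhere
    training_data='/tmp/train*.tfrecords',
    evaluation_data='/tmp/eval*.tfrecords')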
Example #15
    def test_caching_vocab_for_integer_categorical(self):

        span_0_key = 'span-0'
        span_1_key = 'span-1'

        def preprocessing_fn(inputs):
            return {
                'x_vocab':
                tft.compute_and_apply_vocabulary(inputs['x'],
                                                 frequency_threshold=2)
            }

        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec({
                'x':
                tf.FixedLenFeature([], tf.int64),
            }))
        input_data_dict = {
            span_0_key: [{
                'x': -2,
            }, {
                'x': -4,
            }, {
                'x': -1,
            }, {
                'x': 4,
            }],
            span_1_key: [{
                'x': -2,
            }, {
                'x': -1,
            }, {
                'x': 6,
            }, {
                'x': 7,
            }],
        }
        expected_transformed_data = [{
            'x_vocab': 0,
        }, {
            'x_vocab': 1,
        }, {
            'x_vocab': -1,
        }, {
            'x_vocab': -1,
        }]
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            with beam.Pipeline() as p:

                flat_data = p | 'CreateInputData' >> beam.Create(
                    list(itertools.chain(*input_data_dict.values())))

                # TODO(b/37788560): Get these names programmatically.
                cache_dict = {
                    span_0_key: {
                        '__v0__VocabularyAccumulate--compute_and_apply_vocabulary-vocabulary--':
                        p | 'CreateB' >> beam.Create(
                            [b'[-2, 2]', b'[-4, 1]', b'[-1, 1]', b'[4, 1]']),
                    },
                    span_1_key: {},
                }
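                # span-0's vocabulary accumulator is supplied from the cache
                # entry above, so only span-1 needs to be analyzed from the
                # raw data.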

                transform_fn, cache_output = (
                    (flat_data, input_data_dict, cache_dict, input_metadata)
                    | 'Analyze' >>
                    (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
                _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
                    self._cache_dir)

                transformed_dataset = (
                    ((input_data_dict[span_1_key], input_metadata),
                     transform_fn)
                    | 'Transform' >> beam_impl.TransformDataset())

                transformed_data, _ = transformed_dataset

                beam_test_util.assert_that(
                    transformed_data,
                    beam_test_util.equal_to(expected_transformed_data),
                    label='first')
Example #16
    def testTransformSparseColumns(self):
        # Define a transform that takes a sparse column and a varlen column, and
        # returns a combination of dense, sparse, and varlen columns.
        def preprocessing_fn(inputs):
            sparse_sum = tft.map(lambda x: tf.sparse_reduce_sum(x, axis=1),
                                 inputs['sparse'])
            sparse_copy = tft.map(
                lambda y: tf.SparseTensor(y.indices, y.values, y.dense_shape),
                inputs['sparse'])
            varlen_copy = tft.map(
                lambda y: tf.SparseTensor(y.indices, y.values, y.dense_shape),
                inputs['varlen'])

            sparse_copy.schema = sch.ColumnSchema(
                sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float32),
                                        sch.LogicalShape([sch.Axis(10)])),
                sch.SparseColumnRepresentation(
                    'val_copy', [sch.SparseIndexField('idx_copy', False)]))

            return {
                'fixed': sparse_sum,  # Schema should be inferred.
                'sparse': inputs['sparse'],  # Schema manually attached above.
                'varlen': inputs['varlen'],  # Schema should be inferred.
                'sparse_copy':
                sparse_copy,  # Schema should propagate from input.
                'varlen_copy':
                varlen_copy  # Schema should propagate from input.
            }

        # Run AnalyzeAndTransform on some input data and compare with expected
        # output.
        input_metadata = self.toMetadata({
            'sparse':
            tf.SparseFeature('idx', 'val', tf.float32, 10),
            'varlen':
            tf.VarLenFeature(tf.float32),
        })
        input_data = [{
            'idx': [0, 1],
            'val': [0., 1.],
            'varlen': [0., 1.]
        }, {
            'idx': [2, 3],
            'val': [2., 3.],
            'varlen': [3., 4., 5.]
        }, {
            'idx': [4, 5],
            'val': [4., 5.],
            'varlen': [6., 7.]
        }]
        transformed_dataset, transform_fn = (
            (input_data, input_metadata)
            | beam_impl.AnalyzeAndTransformDataset(
                preprocessing_fn, os.path.join(self.get_temp_dir(), 'sparse')))

        expected_transformed_metadata = self.toMetadata({
            'fixed':
            tf.FixedLenFeature(None, tf.float32, None),
            'sparse':
            tf.SparseFeature('idx', 'val', tf.float32, 10),
            'varlen':
            tf.VarLenFeature(tf.float32),
            'sparse_copy':
            tf.SparseFeature('idx_copy', 'val_copy', tf.float32, 10),
            'varlen_copy':
            tf.VarLenFeature(tf.float32)
        })
        expected_transformed_data = [{
            'fixed': 1.0,
            'idx': [0, 1],
            'val': [0., 1.],
            'varlen': [0., 1.],
            'idx_copy': [0, 1],
            'val_copy': [0., 1.],
            'varlen_copy': [0., 1.]
        }, {
            'fixed': 5.0,
            'idx': [2, 3],
            'val': [2., 3.],
            'varlen': [3., 4., 5.],
            'idx_copy': [2, 3],
            'val_copy': [2., 3.],
            'varlen_copy': [3., 4., 5.]
        }, {
            'fixed': 9.0,
            'idx': [4, 5],
            'val': [4., 5.],
            'varlen': [6., 7.],
            'idx_copy': [4, 5],
            'val_copy': [4., 5.],
            'varlen_copy': [6., 7.]
        }]
        self.assertDatasetsEqual(
            transformed_dataset,
            (expected_transformed_data, expected_transformed_metadata))

        # Take the transform function and use TransformDataset to apply it to
        # some eval data, and compare with expected output.
        eval_data = [{
            'idx': [0],
            'val': [9.],
            'varlen': [9.]
        }, {
            'idx': [],
            'val': [],
            'varlen': []
        }, {
            'idx': [2, 4],
            'val': [8., 7.],
            'varlen': [8., 7.]
        }]
        transformed_eval_dataset = (((eval_data, input_metadata), transform_fn)
                                    | beam_impl.TransformDataset())

        expected_transformed_eval_values = [{
            'fixed': 9.,
            'idx': [0],
            'val': [9.],
            'varlen': [9.],
            'idx_copy': [0],
            'val_copy': [9.],
            'varlen_copy': [9.]
        }, {
            'fixed': 0.,
            'idx': [],
            'val': [],
            'varlen': [],
            'idx_copy': [],
            'val_copy': [],
            'varlen_copy': []
        }, {
            'fixed': 15.,
            'idx': [2, 4],
            'val': [8., 7.],
            'varlen': [8., 7.],
            'idx_copy': [2, 4],
            'val_copy': [8., 7.],
            'varlen_copy': [8., 7.]
        }]
        self.assertDatasetsEqual(
            transformed_eval_dataset,
            (expected_transformed_eval_values, expected_transformed_metadata))
Example #17
def transform_data(train_data_file, test_data_file, working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 indices by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
  """
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        # Since we are modifying some features and leaving others unchanged, we
        # start by setting `outputs` to a copy of `inputs`.
        outputs = inputs.copy()

        # Scale numeric columns to have range [0, 1].
        for key in NUMERIC_FEATURE_KEYS:
            outputs[key] = tft.scale_to_0_1(outputs[key])

        # For all categorical columns except the label column, we generate a
        # vocabulary but do not modify the feature.  This vocabulary is instead
        # used in the trainer, by means of a feature column, to convert the feature
        # from a string to an integer id.
        for key in CATEGORICAL_FEATURE_KEYS:
            tft.uniques(inputs[key], vocab_filename=key)

        # For the label column we provide the mapping from string to index.
        def convert_label(label):
            table = lookup.index_table_from_tensor(['>50K', '<=50K'])
            return table.lookup(label)

        outputs[LABEL_KEY] = tft.apply_function(convert_label,
                                                outputs[LABEL_KEY])

        return outputs

    # The "with" block will create a pipeline, and run that pipeline at the exit
    # of the block.
    with beam.Pipeline() as pipeline:
        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            # Create a coder to read the census data with the schema.  To do this we
            # need to list all columns in order since the schema doesn't specify the
            # order of columns in the csv.
            ordered_columns = [
                'age', 'workclass', 'fnlwgt', 'education', 'education-num',
                'marital-status', 'occupation', 'relationship', 'race', 'sex',
                'capital-gain', 'capital-loss', 'hours-per-week',
                'native-country', 'label'
            ]
            converter = csv_coder.CsvCoder(ordered_columns,
                                           RAW_DATA_METADATA.schema)

            # Read in raw data and convert using CSV converter.  Note that we apply
            # some Beam transformations here, which will not be encoded in the TF
            # graph since we don't do them from within tf.Transform's methods
            # (AnalyzeDataset, TransformDataset etc.).  These transformations are just
            # to get data into a format that the CSV converter can read, in particular
            # removing spaces after commas.
            #
            # We use MapAndFilterErrors instead of Map to filter out decode errors in
            # converter.decode, which should only occur for the trailing blank line.
            raw_data = (
                pipeline
                | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
                | 'FixCommasTrainData' >>
                beam.Map(lambda line: line.replace(', ', ','))
                | 'DecodeTrainData' >> MapAndFilterErrors(converter.decode))

            # Combine data and schema into a dataset tuple.  Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data.
            raw_dataset = (raw_data, RAW_DATA_METADATA)
            transformed_dataset, transform_fn = (
                raw_dataset
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset
            transformed_data_coder = example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema)

            _ = (transformed_data
                 | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
                 | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                     os.path.join(working_dir,
                                  TRANSFORMED_TRAIN_DATA_FILEBASE)))

            # Now apply transform function to test data.  In this case we remove the
            # trailing period at the end of each line, and also ignore the header line
            # that is present in the test data file.
            raw_test_data = (pipeline
                             | 'ReadTestData' >> textio.ReadFromText(
                                 test_data_file, skip_header_lines=1)
                             | 'FixCommasTestData' >>
                             beam.Map(lambda line: line.replace(', ', ','))
                             | 'RemoveTrailingPeriodsTestData' >>
                             beam.Map(lambda line: line[:-1])
                             | 'DecodeTestData' >> beam.Map(converter.decode))

            raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)

            transformed_test_dataset = ((raw_test_dataset, transform_fn)
                                        | beam_impl.TransformDataset())
            # Don't need transformed data schema, it's the same as before.
            transformed_test_data, _ = transformed_test_dataset

            _ = (
                transformed_test_data
                | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
                | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

            # Will write a SavedModel and metadata to two subdirectories of
            # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and
            # transform_fn_io.TRANSFORMED_METADATA_DIR respectively.
            _ = (transform_fn
                 | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(working_dir))
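
A hedged sketch of how a trainer might later consume the artifacts written by this pipeline, assuming a tensorflow_transform version that provides tft.TFTransformOutput and that working_dir and TRANSFORMED_TRAIN_DATA_FILEBASE are the same values used above:

import os
import tensorflow as tf
import tensorflow_transform as tft

# Hypothetical trainer-side input: parse the transformed TFRecords using the
# schema written next to the transform_fn by WriteTransformFn.
tft_output = tft.TFTransformOutput(working_dir)
feature_spec = tft_output.transformed_feature_spec()

files = tf.io.gfile.glob(
    os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE + '*'))
dataset = tf.data.TFRecordDataset(files).map(
    lambda serialized: tf.io.parse_single_example(serialized, feature_spec))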
Example #18
def run_transform(output_dir,
                  schema,
                  train_data_file,
                  eval_data_file,
                  project,
                  mode,
                  preprocessing_fn=None):
    """Writes a tft transform fn, and metadata files.
  Args:
    output_dir: output folder
    schema: schema list.
    train_data_file: training data file pattern.
    eval_data_file: eval data file pattern.
    project: the project to run dataflow in.
    mode: 'local' or 'cloud'; whether the job runs locally or on Dataflow.
    preprocessing_fn: a function used to preprocess the raw data. If not
                      specified, a function will be automatically inferred
                      from the schema.
  """

    tft_input_metadata = make_tft_input_metadata(schema)
    temp_dir = os.path.join(output_dir, 'tmp')
    preprocessing_fn = preprocessing_fn or make_preprocessing_fn(schema)

    if mode == 'local':
        pipeline_options = None
        runner = 'DirectRunner'
    elif mode == 'cloud':
        options = {
            'job_name':
            'pipeline-tft-' +
            datetime.datetime.now().strftime('%y%m%d-%H%M%S'),
            'temp_location':
            temp_dir,
            'project':
            project,
            'extra_packages': [
                'gs://ml-pipeline-playground/tensorflow-transform-0.6.0.dev0.tar.gz'
            ]
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        runner = 'DataflowRunner'
    else:
        raise ValueError("Invalid mode %s." % mode)

    with beam.Pipeline(runner, options=pipeline_options) as p:
        with beam_impl.Context(temp_dir=temp_dir):
            names = [x['name'] for x in schema]
            converter = CsvCoder(names, tft_input_metadata.schema)
            train_data = (
                p
                | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
                | 'DecodeTrainData' >> beam.Map(converter.decode))

            train_dataset = (train_data, tft_input_metadata)
            transformed_dataset, transform_fn = (
                train_dataset
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset

            # Writes the transformed_metadata and transform_fn folders
            _ = (transform_fn | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(output_dir))

            # Write the raw_metadata
            metadata_io.write_metadata(metadata=tft_input_metadata,
                                       path=os.path.join(
                                           output_dir, 'metadata'))

            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                os.path.join(output_dir, 'train'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            eval_data = (p
                         |
                         'ReadEvalData' >> textio.ReadFromText(eval_data_file)
                         | 'DecodeEvalData' >> beam.Map(converter.decode))

            eval_dataset = (eval_data, tft_input_metadata)

            transformed_eval_dataset = ((eval_dataset, transform_fn)
                                        | beam_impl.TransformDataset())
            transformed_eval_data, transformed_metadata = transformed_eval_dataset

            _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
                os.path.join(output_dir, 'eval'),
                coder=ExampleProtoCoder(transformed_metadata.schema))
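
make_tft_input_metadata and make_preprocessing_fn are not shown in this example. A hedged sketch of what a make_tft_input_metadata helper could look like, assuming each schema entry is a dict with 'name' and 'type' keys (the 'NUMBER' type tag is an assumption):

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata, dataset_schema


def make_tft_input_metadata(schema):
    # Hypothetical helper: build raw-data metadata from a list of column dicts.
    feature_spec = {}
    for col in schema:
        dtype = tf.float32 if col['type'] == 'NUMBER' else tf.string
        feature_spec[col['name']] = tf.FixedLenFeature([], dtype)
    return dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(feature_spec))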
Example #19
def transform_data(train_neg_filepattern, train_pos_filepattern,
                   test_neg_filepattern, test_pos_filepattern,
                   transformed_train_filebase, transformed_test_filebase,
                   transformed_metadata_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 indices.

  Args:
    train_neg_filepattern: Filepattern for training data negative examples
    train_pos_filepattern: Filepattern for training data positive examples
    test_neg_filepattern: Filepattern for test data negative examples
    test_pos_filepattern: Filepattern for test data positive examples
    transformed_train_filebase: Base filename for transformed training data
        shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data
        should be written
  """

    with beam.Pipeline() as pipeline:
        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            # pylint: disable=no-value-for-parameter
            train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData(
                (train_neg_filepattern, train_pos_filepattern))
            # pylint: disable=no-value-for-parameter
            test_data = pipeline | 'ReadTest' >> ReadAndShuffleData(
                (test_neg_filepattern, test_pos_filepattern))

            metadata = dataset_metadata.DatasetMetadata(
                dataset_schema.Schema({
                    REVIEW_COLUMN:
                    dataset_schema.ColumnSchema(
                        tf.string, [],
                        dataset_schema.FixedColumnRepresentation()),
                    LABEL_COLUMN:
                    dataset_schema.ColumnSchema(
                        tf.int64, [],
                        dataset_schema.FixedColumnRepresentation()),
                }))

            def preprocessing_fn(inputs):
                """Preprocess input columns into transformed columns."""
                review = inputs[REVIEW_COLUMN]

                review_tokens = tf.string_split(review, DELIMITERS)
                review_indices = tft.string_to_int(review_tokens,
                                                   top_k=VOCAB_SIZE)
                # Add one for the oov bucket created by string_to_int.
                review_bow_indices, review_weight = tft.tfidf(
                    review_indices, VOCAB_SIZE + 1)
                return {
                    REVIEW_COLUMN: review_bow_indices,
                    REVIEW_WEIGHT: review_weight,
                    LABEL_COLUMN: inputs[LABEL_COLUMN]
                }

            (transformed_train_data, transformed_metadata), transform_fn = (
                (train_data, metadata)
                | 'AnalyzeAndTransform' >>
                beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

            transformed_test_data, _ = (
                ((test_data, metadata), transform_fn)
                | 'Transform' >> beam_impl.TransformDataset())

            _ = (transformed_train_data
                 | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                     transformed_train_filebase,
                     coder=example_proto_coder.ExampleProtoCoder(
                         transformed_metadata.schema)))

            _ = (transformed_test_data
                 | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
                     transformed_test_filebase,
                     coder=example_proto_coder.ExampleProtoCoder(
                         transformed_metadata.schema)))

            _ = (transformed_metadata
                 | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
                     transformed_metadata_dir, pipeline=pipeline))
Example #20
    def assertAnalyzeAndTransformResults(self,
                                         input_data,
                                         input_metadata,
                                         preprocessing_fn,
                                         expected_data=None,
                                         expected_metadata=None,
                                         expected_vocab_file_contents=None,
                                         expected_asset_file_contents=None,
                                         test_data=None,
                                         desired_batch_size=None,
                                         beam_pipeline=None,
                                         temp_dir=None,
                                         use_tfxio=False,
                                         input_data_is_tfxio_format=False):
        """Assert that input data and metadata is transformed as expected.

    This methods asserts transformed data and transformed metadata match
    with expected_data and expected_metadata.

    Args:
      input_data: A sequence of dicts whose values are
          either strings, lists of strings, numeric types or a pair of those.
      input_metadata: DatasetMetadata describing input_data.
      preprocessing_fn: A function taking a dict of tensors and returning
          a dict of tensors.
      expected_data: (optional) A dataset with the same type constraints as
          input_data, but representing the output after transformation.
          If supplied, transformed data is asserted to be equal.
      expected_metadata: (optional) DatasetMetadata describing the transformed
          data. If supplied, transformed metadata is asserted to be equal.
      expected_vocab_file_contents: (optional) A dictionary from vocab filenames
          to their expected content as a list of text lines or a list of tuples
          of frequency and text. Values should be the expected result of calling
          f.readlines() on the given asset files.
      expected_asset_file_contents: deprecated.  Use
          expected_vocab_file_contents.
      test_data: (optional) If this is provided then instead of calling
          AnalyzeAndTransformDataset with input_data, this function will call
          AnalyzeDataset with input_data and TransformDataset with test_data.
          Note that this is the case even if input_data and test_data are equal.
          test_data should also conform to input_metadata.
      desired_batch_size: (optional) A batch size to batch elements by. If not
          provided, a batch size will be computed automatically.
      beam_pipeline: (optional) A Beam Pipeline to use in this test.
      temp_dir: If set, it is used as output directory, else a new unique
          directory is created.
      use_tfxio: If True, invoke AnalyzeAndTransformDataset using the new API
          that accepts standardized inputs (Arrow `RecordBatch`es). Otherwise
          use the old API that accepts Dicts.
      input_data_is_tfxio_format: If True, `input_data` and `test_data` are
          Arrow `RecordBatch`es and the `input_metadata` is
          `tfxio.tensor_adapter.TensorAdapterConfig`. Otherwise the input data
          is a list of Dicts and input_metadata is a `DatasetMetadata`.
    Raises:
      AssertionError: if the expected data does not match the results of
          transforming input_data according to preprocessing_fn, or
          (if provided) if the expected metadata does not match.
      ValueError: if expected_vocab_file_contents and
          expected_asset_file_contents are both set.
    """
        if (expected_vocab_file_contents is not None
                and expected_asset_file_contents is not None):
            raise ValueError('only one of expected_vocab_file_contents and '
                             'expected_asset_file_contents should be set')
        elif expected_asset_file_contents is not None:
            tf.compat.v1.logging.warn(
                'expected_asset_file_contents is deprecated, use '
                'expected_vocab_file_contents')

        expected_vocab_file_contents = (expected_vocab_file_contents
                                        or expected_asset_file_contents or {})
        del expected_asset_file_contents

        if not use_tfxio and input_data_is_tfxio_format:
            raise ValueError('Unable to feed TFXIO input format to the old, '
                             'non-TFXIO API.')
        compatibility_tfxio_needed = use_tfxio and not input_data_is_tfxio_format
        # Note: we don't separately test AnalyzeDataset and TransformDataset as
        # AnalyzeAndTransformDataset currently simply composes these two
        # transforms.  If in future versions of the code, the implementation
        # differs, we should also run AnalyzeDataset and TransformDataset composed.
        temp_dir = temp_dir or tempfile.mkdtemp(prefix=self._testMethodName,
                                                dir=self.get_temp_dir())
        with beam_pipeline or self._makeTestPipeline() as pipeline:
            with beam_impl.Context(temp_dir=temp_dir,
                                   desired_batch_size=desired_batch_size,
                                   use_tfxio=use_tfxio):
                input_data = pipeline | 'CreateInput' >> beam.Create(
                    input_data, reshuffle=False)
                if compatibility_tfxio_needed:
                    legacy_input_metadata = input_metadata
                    input_data, input_metadata = self.convert_to_tfxio_api_inputs(
                        input_data, input_metadata, label='input_data')
                if test_data is None:
                    (transformed_data, transformed_metadata), transform_fn = (
                        (input_data, input_metadata)
                        |
                        beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
                else:
                    transform_fn = ((input_data, input_metadata)
                                    |
                                    beam_impl.AnalyzeDataset(preprocessing_fn))
                    test_data = pipeline | 'CreateTest' >> beam.Create(
                        test_data)
                    if compatibility_tfxio_needed:
                        test_data, _ = self.convert_to_tfxio_api_inputs(
                            test_data,
                            legacy_input_metadata,
                            label='test_data')
                    transformed_data, transformed_metadata = (
                        ((test_data, input_metadata), transform_fn)
                        | beam_impl.TransformDataset())

                # Write transform_fn so we can test its assets
                _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

                if expected_data is not None:
                    transformed_data_coder = tft.coders.ExampleProtoCoder(
                        transformed_metadata.schema)

                    transformed_data_path = os.path.join(
                        temp_dir, 'transformed_data')
                    _ = (transformed_data
                         | beam.Map(transformed_data_coder.encode)
                         | beam.io.tfrecordio.WriteToTFRecord(
                             transformed_data_path, shard_name_template=''))

        # TODO(ebreck) Log transformed_data somewhere.
        if expected_data is not None:
            examples = tf.compat.v1.python_io.tf_record_iterator(
                path=transformed_data_path)
            transformed_data = [
                transformed_data_coder.decode(x) for x in examples
            ]
            self.assertDataCloseOrEqual(expected_data, transformed_data)

        tf_transform_output = tft.TFTransformOutput(temp_dir)
        if expected_metadata:
            # Make a copy with no annotations.
            transformed_schema = schema_pb2.Schema()
            transformed_schema.CopyFrom(
                tf_transform_output.transformed_metadata.schema)
            transformed_schema.ClearField('annotation')
            for feature in transformed_schema.feature:
                feature.ClearField('annotation')
            self.assertEqual(expected_metadata.schema, transformed_schema)

        for filename, file_contents in six.iteritems(
                expected_vocab_file_contents):
            full_filename = tf_transform_output.vocabulary_file_by_name(
                filename)
            self.AssertVocabularyContents(full_filename, file_contents)
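
A minimal sketch of a test built on the assertion helper above; the scaling preprocessing_fn, feature spec, and expected values are illustrative rather than taken from the original:

    def test_scale_to_0_1_sketch(self):
        # Hypothetical test on a subclass that mixes in the helper above.
        def preprocessing_fn(inputs):
            return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec({
                'x': tf.FixedLenFeature([], tf.float32),
            }))
        self.assertAnalyzeAndTransformResults(
            input_data=[{'x': 1.0}, {'x': 3.0}],
            input_metadata=input_metadata,
            preprocessing_fn=preprocessing_fn,
            expected_data=[{'x_scaled': 0.0}, {'x_scaled': 1.0}])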
Example #21
def preprocess(p, output_dir, check_path, data_size, bq_table, split_data_path,
               project_id):
    """Main processing pipeline reading, processing and storing processed data.

  Performs the following operations:
    - reads data from BigQuery
    - adds hash key value to each row
    - scales data
    - shuffles and splits data in train / validation / test sets
    - oversamples train data
    - stores data as TFRecord
    - splits and stores test data into labels and features files

  Args:
    p: PCollection, initial pipeline.
    output_dir: string, path to directory to store output.
    check_path: string, path to directory to store data checks.
    data_size: tuple of float, ratio of data going respectively to train,
      validation and test sets.
    bq_table: string, name of table to read data from.
    split_data_path: string, path to directory to store train, validation and
      test raw datasets.
    project_id: string, GCP project id.

  Raises:
    ValueError: No test dataset found in pipeline output.
  """

    train_size, validation_size, test_size = data_size

    data = (p |
            'ReadData' >> read_data(bq_table=bq_table, project_id=project_id))

    _ = data | 'StoreData' >> beam.io.WriteToText(
        posixpath.join(output_dir, check_path, 'processed_data.txt'))

    split_data = (
        data |
        'RandomlySplitData' >> randomly_split(train_size=train_size,
                                              validation_size=validation_size,
                                              test_size=test_size))

    for k in split_data:
        split_data[k] |= 'AddHash_{}'.format(k.name) >> beam.ParDo(
            AddHash(),
            label_column=constants.LABEL_COLUMN,
            key_column=constants.KEY_COLUMN,
            dtype=k)

    # Splits test data into features pipeline and labels pipeline.
    if DatasetType.TEST not in split_data:
        raise ValueError('No test dataset found in pipeline output.')
    test_data = (split_data.pop(DatasetType.TEST)
                 | 'SplitFeaturesLabels' >> split_features_labels(
                     constants.LABEL_COLUMN, constants.KEY_COLUMN))

    # Stores test data features and labels pipeline separately.
    for k in test_data:
        _ = (test_data[k]
             | 'ParseJsonToString_{}'.format(k) >> beam.Map(json.dumps)
             | 'StoreSplitData_{}'.format(k) >> beam.io.WriteToText(
                 posixpath.join(
                     output_dir, split_data_path,
                     'split_data_{}_{}.txt'.format(DatasetType.TEST.name, k))))

    meta_data = dataset_metadata.DatasetMetadata(make_input_schema())

    transform_fn = (
        (split_data[DatasetType.TRAIN], meta_data)
        | 'AnalyzeTrainDataset' >> beam_impl.AnalyzeDataset(preprocessing_fn))

    _ = (transform_fn
         | 'WriteTransformFn' >> tft.beam.tft_beam_io.WriteTransformFn(
             posixpath.join(output_dir, constants.PATH_INPUT_TRANSFORMATION)))
    _ = (meta_data
         | 'WriteInputMetadata' >> tft.beam.tft_beam_io.WriteMetadata(
             posixpath.join(output_dir, constants.PATH_INPUT_SCHEMA),
             pipeline=p))

    transformed_metadata, transformed_data = {}, {}
    for k in [DatasetType.TRAIN, DatasetType.VAL]:
        transformed_data[k], transformed_metadata[k] = (
            ((split_data[k], meta_data), transform_fn)
            | 'Transform{}'.format(k) >> beam_impl.TransformDataset())

    transformed_data[DatasetType.TRAIN] = (
        transformed_data[DatasetType.TRAIN]
        | 'OverSampleTraining' >> oversampling())

    for k in transformed_data:
        _ = (transformed_data[k]
             | 'ShuffleData{}'.format(k) >> shuffle_data()
             | 'StoreData{}'.format(k) >> store_transformed_data(
                 schema=transformed_metadata[k],
                 path=posixpath.join(output_dir,
                                     constants.PATH_TRANSFORMED_DATA_SPLIT[k]),
                 name=DatasetType(k).name))

    for k in transformed_data:
        _ = (transformed_data[k] | 'CheckSize{}'.format(k.name) >> check_size(
            name=DatasetType(k).name,
            path=posixpath.join(output_dir, check_path, k.name)))
Example #22
def preprocess_data(train_neg_file_pattern,
                    train_pos_file_pattern,
                    test_neg_file_pattern,
                    test_pos_file_pattern,
                    transformed_train_file_pattern,
                    transformed_test_file_pattern,
                    transformed_metadata_dir,
                    raw_metadata_dir,
                    transform_func_dir,
                    temp_dir,
                    vocab_size,
                    delimiters):
    """Transform the data and write out as a TFRecord of Example protos.
    Read in the data from the positive and negative examples on disk, and
    transform it using a preprocessing pipeline that removes punctuation,
    tokenizes and maps tokens to int64 indices.

    Args:
    train_neg_file_pattern: Filepattern for training data negative examples
    train_pos_file_pattern: Filepattern for training data positive examples
    test_neg_file_pattern: Filepattern for test data negative examples
    test_pos_file_pattern: Filepattern for test data positive examples
    transformed_train_file_pattern: Base filename for transformed training data shards
    transformed_test_file_pattern: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data should be written
    raw_metadata_dir: Directory where metadata for the raw data should be written
    transform_func_dir: Directory where the transform function should be written
    temp_dir: Temporary location for pipeline work
    vocab_size: Vocabulary size passed to the preprocessing function
    delimiters: Token delimiters passed to the preprocessing function
    """
    # For reference, the raw metadata used below (const.RAW_METADATA) is built
    # along these lines:
    #
    # raw_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
    #     REVIEW_COLUMN: dataset_schema.ColumnSchema(tf.string, [], dataset_schema.FixedColumnRepresentation()),
    #     LABEL_COLUMN: dataset_schema.ColumnSchema(tf.int64, [], dataset_schema.FixedColumnRepresentation()),
    # }))
    pipeline_name = 'DataflowRunner'
    options = {
        'job_name': ('cloud-ml-hazmat-preprocess-{}'.format(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))),
        'temp_location': temp_dir,
        'project': "stone-outpost-636",
        'max_num_workers': 8
    }
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
    #with beam.Pipeline(pipeline_name, options=pipeline_options) as pipeline:
    #    with beam_impl.Context(temp_dir=temp_dir):
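    # NOTE: pipeline_name and pipeline_options above are currently unused; the
    # commented-out lines show how to switch the local pipeline below over to
    # Dataflow.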
    with beam.Pipeline() as pipeline:
        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):

            train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData((train_neg_file_pattern, train_pos_file_pattern))
            test_data = pipeline | 'ReadTest' >> ReadAndShuffleData((test_neg_file_pattern, test_pos_file_pattern))
            preprocessing_fn = generate_preprocessing_fn(vocab_size, delimiters)

            (transformed_train_data, transformed_metadata), transform_fn = ((train_data, const.RAW_METADATA)
              | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

            _ = (transform_fn | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(transform_func_dir))

            transformed_test_data, _ = (((test_data, const.RAW_METADATA), transform_fn)
              | 'Transform' >> beam_impl.TransformDataset())

            _ = (transformed_train_data
              | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(transformed_train_file_pattern,
                  coder=example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)))

            _ = (transformed_test_data
              | 'WriteTestData' >> tfrecordio.WriteToTFRecord(transformed_test_file_pattern,
                  coder=example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)))

            _ = (transformed_metadata
              | 'WriteTransformedMetadata' >> beam_metadata_io.WriteMetadata(transformed_metadata_dir, pipeline=pipeline))

            _ = (const.RAW_METADATA
              | 'WriteRawMetadata' >> beam_metadata_io.WriteMetadata(raw_metadata_dir, pipeline=pipeline))
Example #23
    def test_single_phase_mixed_analyzer_run_once(self):
        cache_location = self._make_cache_location()

        span_0_key = 'span-0'
        span_1_key = 'span-1'

        _write_cache('__v0__CacheableCombineAccumulate--x_1-mean_and_var--',
                     span_0_key, [2.0, 1.0, 9.0],
                     cache_location.input_cache_dir)
        _write_cache('__v0__CacheableCombineAccumulate--x-x--', span_0_key,
                     [2.0, 4.0], cache_location.input_cache_dir)
        _write_cache('__v0__CacheableCombineAccumulate--y_1-mean_and_var--',
                     span_0_key, [2.0, -1.5, 6.25],
                     cache_location.input_cache_dir)
        _write_cache('__v0__CacheableCombineAccumulate--y-y--', span_0_key,
                     [4.0, 1.0], cache_location.input_cache_dir)

        def preprocessing_fn(inputs):

            integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])

            _ = tft.bucketize(inputs['x'], 2, name='bucketize')

            return {
                'integerized_s':
                integerized_s,
                'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
                'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            }

        # Run AnalyzeAndTransform on some input data and compare with expected
        # output.
        input_data = [{'x': 12, 'y': 1, 's': 'c'}, {'x': 10, 'y': 1, 's': 'c'}]
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec({
                'x':
                tf.FixedLenFeature([], tf.float32),
                'y':
                tf.FixedLenFeature([], tf.float32),
                's':
                tf.FixedLenFeature([], tf.string),
            }))
        input_data_dict = {
            span_0_key: [{
                'x': -2,
                'y': 1,
                's': 'b',
            }, {
                'x': 4,
                'y': -4,
                's': 'b',
            }],
            span_1_key:
            input_data,
        }
        with beam_impl.Context(temp_dir=self.get_temp_dir()):

            flat_data = input_data_dict.values() | 'Flatten' >> beam.Flatten()

            transform_fn = ((flat_data, input_data_dict, input_metadata) |
                            (beam_impl.AnalyzeDatasetWithCache(
                                preprocessing_fn, cache_location)))

        transformed_dataset = ((
            (input_data_dict[span_1_key], input_metadata), transform_fn)
                               | beam_impl.TransformDataset())

        transformed_data, unused_transformed_metadata = transformed_dataset

        expected_transformed_data = [
            {
                'x_mean': 6.0,
                'x_min': -2.0,
                'y_mean': -0.25,
                'y_min': -4.0,
                'integerized_s': 0,
            },
            {
                'x_mean': 6.0,
                'x_min': -2.0,
                'y_mean': -0.25,
                'y_min': -4.0,
                'integerized_s': 0,
            },
        ]
        self.assertDataCloseOrEqual(transformed_data,
                                    expected_transformed_data)

        transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
        _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)
Example #24
    def assertAnalyzeAndTransformResults(self,
                                         input_data,
                                         input_metadata,
                                         preprocessing_fn,
                                         expected_data=None,
                                         expected_metadata=None,
                                         only_check_core_metadata=False,
                                         expected_asset_file_contents=None,
                                         test_data=None,
                                         desired_batch_size=None):
        """Assert that input data and metadata is transformed as expected.

    This methods asserts transformed data and transformed metadata match
    with expected_data and expected_metadata.

    Args:
      input_data: A sequence of dicts whose values are
          either strings, lists of strings, numeric types or a pair of those.
      input_metadata: DatasetMetadata describing input_data.
      preprocessing_fn: A function taking a dict of tensors and returning
          a dict of tensors.
      expected_data: (optional) A dataset with the same type constraints as
          input_data, but representing the output after transformation.
          If supplied, transformed data is asserted to be equal.
      expected_metadata: (optional) DatasetMetadata describing the transformed
          data. If supplied, transformed metadata is asserted to be equal.
      only_check_core_metadata: A boolean to indicate if all elements in
          the transformed metadata is asserted to be equal to expected metadata.
          If True, only transformed feature names, dtypes and representations
          are asserted.
      expected_asset_file_contents: (optional) A dictionary from asset filenames
          to their expected content as a list of text lines.  Values should be
          the expected result of calling f.readlines() on the given asset files.
          Asset filenames are relative to the saved model's asset directory.
      test_data: (optional) If this is provided then instead of calling
          AnalyzeAndTransformDataset with input_data, this function will call
          AnalyzeDataset with input_data and TransformDataset with test_data.
          Note that this is the case even if input_data and test_data are equal.
          test_data should also conform to input_metadata.
      desired_batch_size: (optional) A batch size to batch elements by. If not
          provided, a batch size will be computed automatically.
    Raises:
      AssertionError: if the expected data does not match the results of
          transforming input_data according to preprocessing_fn, or
          (if provided) if the expected metadata does not match.
    """
        if expected_asset_file_contents is None:
            expected_asset_file_contents = {}
        # Note: we don't separately test AnalyzeDataset and TransformDataset as
        # AnalyzeAndTransformDataset currently simply composes these two
        # transforms.  If in future versions of the code, the implementation
        # differs, we should also run AnalyzeDataset and TransformDataset composed.
        temp_dir = self.get_temp_dir()
        with beam_impl.Context(temp_dir=temp_dir,
                               desired_batch_size=desired_batch_size):
            if test_data is None:
                (transformed_data, transformed_metadata), transform_fn = (
                    (input_data, input_metadata)
                    | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            else:
                transform_fn = ((input_data, input_metadata)
                                | beam_impl.AnalyzeDataset(preprocessing_fn))
                transformed_data, transformed_metadata = (
                    ((test_data, input_metadata), transform_fn)
                    | beam_impl.TransformDataset())

            # Write transform_fn so we can test its assets
            if expected_asset_file_contents:
                _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

        if expected_data is not None:
            self.assertDataCloseOrEqual(expected_data, transformed_data)

        if expected_metadata:
            transformed_metadata = self._resolveDeferredMetadata(
                transformed_metadata)

            if only_check_core_metadata:
                # preprocessing_fn may add metadata to the column schema that
                # is only relevant to the internal implementation, such as
                # vocabulary_file. As such, only check that feature names,
                # dtypes and representations are as expected.
                self.assertSameElements(
                    transformed_metadata.schema.column_schemas.keys(),
                    expected_metadata.schema.column_schemas.keys())
                for k, v in six.iteritems(
                        transformed_metadata.schema.column_schemas):
                    expected_schema = expected_metadata.schema.column_schemas[
                        k]
                    self.assertEqual(
                        expected_schema.representation, v.representation,
                        "representation doesn't match for feature '%s'" % k)
                    self.assertEqual(
                        expected_schema.domain.dtype, v.domain.dtype,
                        "dtype doesn't match for feature '%s'" % k)
            else:
                # Check the entire DatasetMetadata is as expected.
                # Use extra assertEqual for schemas, since full metadata assertEqual
                # error message is not conducive to debugging.
                self.assertEqual(expected_metadata.schema.column_schemas,
                                 transformed_metadata.schema.column_schemas)
                self.assertEqual(expected_metadata, transformed_metadata)

        for filename, file_contents in six.iteritems(
                expected_asset_file_contents):
            full_filename = os.path.join(temp_dir,
                                         transform_fn_io.TRANSFORM_FN_DIR,
                                         'assets', filename)
            with tf.gfile.Open(full_filename) as f:
                self.assertEqual(f.readlines(), file_contents)
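A hedged usage sketch of the assertion helper above, assuming it is exposed as assertAnalyzeAndTransformResults (as in the later example below); the feature name, schema construction and expected values are illustrative, not part of the original test module.

    # Hypothetical usage sketch; not part of the original test module.
    def test_scale_to_0_1(self):
        def preprocessing_fn(inputs):
            return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

        input_data = [{'x': 0.0}, {'x': 2.0}, {'x': 4.0}]
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.Schema({
                'x':
                dataset_schema.ColumnSchema(
                    tf.float32, [],
                    dataset_schema.FixedColumnRepresentation()),
            }))
        expected_data = [{'x_scaled': 0.0}, {'x_scaled': 0.5},
                         {'x_scaled': 1.0}]
        self.assertAnalyzeAndTransformResults(input_data, input_metadata,
                                              preprocessing_fn, expected_data)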
Example #25
0
def transform_data(train_data_file, test_data_file, transformed_train_filebase,
                   transformed_test_filebase, transformed_metadata_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical
  data from strings to int64 indices by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    transformed_train_filebase: Base filename for transformed training data
        shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data
        should be written
  """
    raw_data_schema = {
        key:
        dataset_schema.ColumnSchema(tf.string, [],
                                    dataset_schema.FixedColumnRepresentation())
        for key in CATEGORICAL_COLUMNS
    }
    raw_data_schema.update({
        key:
        dataset_schema.ColumnSchema(tf.float32, [],
                                    dataset_schema.FixedColumnRepresentation())
        for key in NUMERIC_COLUMNS
    })
    raw_data_schema[LABEL_COLUMN] = dataset_schema.ColumnSchema(
        tf.string, [], dataset_schema.FixedColumnRepresentation())
    raw_data_schema = dataset_schema.Schema(raw_data_schema)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_data_schema)

    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        outputs = {}

        # Scale numeric columns to have range [0, 1].
        for key in NUMERIC_COLUMNS:
            outputs[key] = tft.scale_to_0_1(inputs[key])

        # For all categorical columns except the label column, we use
        # tft.string_to_int which computes the set of unique values and uses this
        # to convert the strings to indices.
        for key in CATEGORICAL_COLUMNS:
            outputs[key] = tft.string_to_int(inputs[key])

        # For the label column we provide the mapping from string to index.
        def convert_label(label):
            table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
            return table.lookup(label)

        outputs[LABEL_COLUMN] = tft.map(convert_label, inputs[LABEL_COLUMN])

        return outputs

    # The "with" block will create a pipeline, and run that pipeline at the exit
    # of the block.
    with beam.Pipeline() as pipeline:
        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            # Create a coder to read the census data with the schema.  To do this we
            # need to list all columns in order since the schema doesn't specify the
            # order of columns in the csv.
            ordered_columns = [
                'age', 'workclass', 'fnlwgt', 'education', 'education-num',
                'marital-status', 'occupation', 'relationship', 'race', 'sex',
                'capital-gain', 'capital-loss', 'hours-per-week',
                'native-country', 'label'
            ]
            converter = csv_coder.CsvCoder(ordered_columns, raw_data_schema)

            # Read in raw data and convert using CSV converter.  Note that we apply
            # some Beam transformations here, which will not be encoded in the TF
            # graph since we don't do them from within tf.Transform's methods
            # (AnalyzeDataset, TransformDataset etc.).  These transformations are
            # just to get data into a format that the CSV converter can read, in
            # particular removing empty lines and removing spaces after commas.
            raw_data = (pipeline
                        |
                        'ReadTrainData' >> textio.ReadFromText(train_data_file)
                        | 'FilterTrainData' >> beam.Filter(lambda line: line)
                        | 'FixCommasTrainData' >>
                        beam.Map(lambda line: line.replace(', ', ','))
                        | 'DecodeTrainData' >> beam.Map(converter.decode))

            # Combine data and schema into a dataset tuple.  Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data.
            raw_dataset = (raw_data, raw_data_metadata)
            transformed_dataset, transform_fn = (
                raw_dataset
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset

            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                transformed_train_filebase,
                coder=example_proto_coder.ExampleProtoCoder(
                    transformed_metadata.schema))

            # Now apply transform function to test data.  In this case we also remove
            # the header line from the CSV file and the trailing period at the end of
            # each line.
            raw_test_data = (
                pipeline
                | 'ReadTestData' >> textio.ReadFromText(test_data_file)
                | 'FilterTestData' >> beam.Filter(
                    lambda line: line and line != '|1x3 Cross validator')
                | 'FixCommasTestData' >>
                beam.Map(lambda line: line.replace(', ', ','))
                | 'RemoveTrailingPeriodsTestData' >>
                beam.Map(lambda line: line[:-1])
                | 'DecodeTestData' >> beam.Map(converter.decode))

            raw_test_dataset = (raw_test_data, raw_data_metadata)

            transformed_test_dataset = ((raw_test_dataset, transform_fn)
                                        | beam_impl.TransformDataset())
            # We don't need the transformed data schema; it's the same as before.
            transformed_test_data, _ = transformed_test_dataset

            _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
                transformed_test_filebase,
                coder=example_proto_coder.ExampleProtoCoder(
                    transformed_metadata.schema))

            _ = (transformed_metadata
                 | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
                     transformed_metadata_dir, pipeline=pipeline))
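A hedged driver sketch for transform_data; the census file names and the output directory below are placeholders, not part of the original example.

# Hypothetical driver for transform_data; paths are placeholders.
def main():
    output_dir = tempfile.mkdtemp()
    transform_data(
        train_data_file='adult.data',
        test_data_file='adult.test',
        transformed_train_filebase=os.path.join(output_dir,
                                                'train_transformed'),
        transformed_test_filebase=os.path.join(output_dir,
                                               'test_transformed'),
        transformed_metadata_dir=os.path.join(output_dir,
                                              'transformed_metadata'))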
Example #26
0
    def test_single_phase_mixed_analyzer_run_once(self):
        span_0_key = 'span-0'
        span_1_key = 'span-1'

        def preprocessing_fn(inputs):

            integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])

            _ = tft.bucketize(inputs['x'], 2, name='bucketize')

            return {
                'integerized_s':
                integerized_s,
                'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
                'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            }

        # Run AnalyzeAndTransform on some input data and compare with expected
        # output.
        input_data = [{'x': 12, 'y': 1, 's': 'c'}, {'x': 10, 'y': 1, 's': 'c'}]
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec({
                'x':
                tf.io.FixedLenFeature([], tf.float32),
                'y':
                tf.io.FixedLenFeature([], tf.float32),
                's':
                tf.io.FixedLenFeature([], tf.string),
            }))
        input_data_dict = {
            span_0_key: [{
                'x': -2,
                'y': 1,
                's': 'b',
            }, {
                'x': 4,
                'y': -4,
                's': 'b',
            }],
            span_1_key:
            input_data,
        }

        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            with beam.Pipeline() as p:

                flat_data = p | 'CreateInputData' >> beam.Create(
                    list(itertools.chain(*input_data_dict.values())))

                # TODO(b/37788560): Get these names programmatically.
                cache_dict = {
                    span_0_key: {
                        '__v0__CacheableCombineAccumulate--x_1-mean_and_var--':
                        p | 'CreateA' >> beam.Create([b'[2.0, 1.0, 9.0]']),
                        '__v0__CacheableCombineAccumulate--x-x--':
                        p | 'CreateB' >> beam.Create([b'[2.0, 4.0]']),
                        '__v0__CacheableCombineAccumulate--y_1-mean_and_var--':
                        p | 'CreateC' >> beam.Create([b'[2.0, -1.5, 6.25]']),
                        '__v0__CacheableCombineAccumulate--y-y--':
                        p | 'CreateD' >> beam.Create([b'[4.0, 1.0]']),
                    },
                    span_1_key: {},
                }

                transform_fn, cache_output = (
                    (flat_data, input_data_dict, cache_dict, input_metadata)
                    | 'Analyze' >>
                    (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
                _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
                    self._cache_dir)

                transformed_dataset = (
                    ((input_data_dict[span_1_key], input_metadata),
                     transform_fn)
                    | 'Transform' >> beam_impl.TransformDataset())

                dot_string = nodes.get_dot_graph(
                    [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
                self.WriteRenderedDotFile(dot_string)

                transformed_data, unused_transformed_metadata = transformed_dataset

                expected_transformed = [
                    {
                        'x_mean': 6.0,
                        'x_min': -2.0,
                        'y_mean': -0.25,
                        'y_min': -4.0,
                        'integerized_s': 0,
                    },
                    {
                        'x_mean': 6.0,
                        'x_min': -2.0,
                        'y_mean': -0.25,
                        'y_min': -4.0,
                        'integerized_s': 0,
                    },
                ]
                beam_test_util.assert_that(
                    transformed_data,
                    beam_test_util.equal_to(expected_transformed))

                transform_fn_dir = os.path.join(self.base_test_dir,
                                                'transform_fn')
                _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)
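A hedged follow-up sketch: once the pipeline above has run to completion, the transform_fn written to transform_fn_dir could be reloaded and applied to raw feature tensors, assuming tft.TFTransformOutput and its transform_raw_features method are available in the tf.Transform version these tests target.

        # Hypothetical follow-up sketch; runs after the pipeline has finished.
        tf_transform_output = tft.TFTransformOutput(transform_fn_dir)
        raw_features = {
            'x': tf.constant([11.0]),
            'y': tf.constant([1.0]),
            's': tf.constant(['c']),
        }
        # In TF 1.x these are graph tensors that still need a session run.
        transformed_features = tf_transform_output.transform_raw_features(
            raw_features)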
Example #27
0
    # TFTransform based preprocessing.
    raw_metadata = dataset_metadata.DatasetMetadata(
        schema=movielens.make_examples_schema())
    _ = (raw_metadata
         | 'WriteRawMetadata' >> tft_beam_io.WriteMetadata(
             os.path.join(args.output_dir, 'raw_metadata'), pipeline))

    preprocessing_fn = movielens.make_preprocessing_fn()
    train_features_transformed, transform_fn = (
        (train_data, raw_metadata)
        | 'AnalyzeAndTransform' >>
        tft.AnalyzeAndTransformDataset(preprocessing_fn))

    eval_features_transformed = (((eval_data, raw_metadata), transform_fn)
                                 | 'TransformEval' >> tft.TransformDataset())

    train_dataset_transformed, train_metadata = train_features_transformed
    training_coder = tft_coders.ExampleProtoCoder(train_metadata.schema)
    _ = (
        train_dataset_transformed
        | 'EncodeTraining' >> beam.Map(training_coder.encode)
        | 'ShuffleTraining' >> (_Shuffle())  # pylint: disable=no-value-for-parameter
        | 'WriteTraining' >> beam.io.WriteToTFRecord(
            os.path.join(args.output_dir, 'features_train'),
            file_name_suffix='.tfrecord.gz'))
    _ = (train_metadata
         | 'WriteTransformedMetadata' >> tft_beam_io.WriteMetadata(
             os.path.join(args.output_dir, 'transformed_metadata'), pipeline))

    eval_dataset_transformed, eval_metadata = eval_features_transformed
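The snippet above is cut off before the eval split is written; a continuation would plausibly mirror the training write, as in the hedged sketch below (the step names and the features_eval output prefix are assumptions).

    # Hypothetical continuation mirroring the training write; names assumed.
    eval_coder = tft_coders.ExampleProtoCoder(eval_metadata.schema)
    _ = (
        eval_dataset_transformed
        | 'EncodeEval' >> beam.Map(eval_coder.encode)
        | 'ShuffleEval' >> (_Shuffle())  # pylint: disable=no-value-for-parameter
        | 'WriteEval' >> beam.io.WriteToTFRecord(
            os.path.join(args.output_dir, 'features_eval'),
            file_name_suffix='.tfrecord.gz'))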
Example #28
0
    def test_single_phase_run_twice(self):

        span_0_key = 'span-0'
        span_1_key = 'span-1'

        def preprocessing_fn(inputs):

            _ = tft.vocabulary(inputs['s'])

            _ = tft.bucketize(inputs['x'], 2, name='bucketize')

            return {
                'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
                'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
                'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            }

        input_data = [{'x': 12, 'y': 1, 's': 'b'}, {'x': 10, 'y': 1, 's': 'c'}]
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec({
                'x':
                tf.io.FixedLenFeature([], tf.float32),
                'y':
                tf.io.FixedLenFeature([], tf.float32),
                's':
                tf.io.FixedLenFeature([], tf.string),
            }))
        input_data_dict = {
            span_0_key: [{
                'x': -2,
                'y': 1,
                's': 'a',
            }, {
                'x': 4,
                'y': -4,
                's': 'a',
            }],
            span_1_key:
            input_data,
        }
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            with beam.Pipeline() as p:

                flat_data = p | 'CreateInputData' >> beam.Create(
                    list(itertools.chain(*input_data_dict.values())))

                # This is needed due to b/123895600.
                for a, b in six.iteritems(input_data_dict):
                    input_data_dict[a] = p | a >> beam.Create(b)

                transform_fn, cache_output = (
                    (flat_data, input_data_dict, {}, input_metadata)
                    | 'Analyze' >>
                    (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
                _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
                    self._cache_dir)

                transformed_dataset = (
                    ((input_data_dict[span_1_key], input_metadata),
                     transform_fn)
                    | 'Transform' >> beam_impl.TransformDataset())

                transformed_data, unused_transformed_metadata = transformed_dataset

                expected_transformed_data = [
                    {
                        'x_mean': 6.0,
                        'x_min': -2.0,
                        'y_mean': -0.25,
                        'y_min': -4.0,
                    },
                    {
                        'x_mean': 6.0,
                        'x_min': -2.0,
                        'y_mean': -0.25,
                        'y_min': -4.0,
                    },
                ]
                beam_test_util.assert_that(
                    transformed_data,
                    beam_test_util.equal_to(expected_transformed_data),
                    label='first')

                transform_fn_dir = os.path.join(self.base_test_dir,
                                                'transform_fn')
                _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)

                for key in input_data_dict:
                    self.assertIn(key, cache_output)
                    self.assertEqual(6, len(cache_output[key]))

                transform_fn, second_output_cache = (
                    (flat_data, input_data_dict, cache_output, input_metadata)
                    | 'AnalyzeAgain' >>
                    (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

                dot_string = nodes.get_dot_graph(
                    [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
                self.WriteRenderedDotFile(dot_string)

                transformed_dataset = (
                    ((input_data_dict[span_1_key], input_metadata),
                     transform_fn)
                    | 'TransformAgain' >> beam_impl.TransformDataset())
        transformed_data, unused_transformed_metadata = transformed_dataset
        beam_test_util.assert_that(
            transformed_data,
            beam_test_util.equal_to(expected_transformed_data),
            label='second')

        self.assertFalse(second_output_cache)
Example #29
0
    def assertAnalyzeAndTransformResults(self,
                                         input_data,
                                         input_metadata,
                                         preprocessing_fn,
                                         expected_data=None,
                                         expected_metadata=None,
                                         expected_vocab_file_contents=None,
                                         expected_asset_file_contents=None,
                                         test_data=None,
                                         desired_batch_size=None,
                                         beam_pipeline=None,
                                         temp_dir=None):
        """Assert that input data and metadata is transformed as expected.

    This method asserts that the transformed data and transformed metadata
    match expected_data and expected_metadata.

    Args:
      input_data: A sequence of dicts whose values are
          either strings, lists of strings, numeric types or a pair of those.
      input_metadata: DatasetMetadata describing input_data.
      preprocessing_fn: A function taking a dict of tensors and returning
          a dict of tensors.
      expected_data: (optional) A dataset with the same type constraints as
          input_data, but representing the output after transformation.
          If supplied, transformed data is asserted to be equal.
      expected_metadata: (optional) DatasetMetadata describing the transformed
          data. If supplied, transformed metadata is asserted to be equal.
      expected_vocab_file_contents: (optional) A dictionary from vocab filenames
          to their expected content, either as a list of text lines or as a list
          of (text, frequency) tuples. Values should be the expected result of
          calling f.readlines() on the given asset files.
      expected_asset_file_contents: deprecated.  Use
          expected_vocab_file_contents.
      test_data: (optional) If this is provided then instead of calling
          AnalyzeAndTransformDataset with input_data, this function will call
          AnalyzeDataset with input_data and TransformDataset with test_data.
          Note that this is the case even if input_data and test_data are equal.
          test_data should also conform to input_metadata.
      desired_batch_size: (optional) A batch size to batch elements by. If not
          provided, a batch size will be computed automatically.
      beam_pipeline: (optional) A Beam Pipeline to use in this test.
      temp_dir: If set, it is used as output directory, else a new unique
          directory is created.
    Raises:
      AssertionError: if the expected data does not match the results of
          transforming input_data according to preprocessing_fn, or
          (if provided) if the expected metadata does not match.
      ValueError: if expected_vocab_file_contents and
          expected_asset_file_contents are both set.
    """
        if (expected_vocab_file_contents is not None
                and expected_asset_file_contents is not None):
            raise ValueError('only one of expected_vocab_file_contents and '
                             'expected_asset_file_contents should be set')
        elif expected_asset_file_contents is not None:
            tf.logging.warn('expected_asset_file_contents is deprecated, use '
                            'expected_vocab_file_contents')

        expected_vocab_file_contents = (expected_vocab_file_contents
                                        or expected_asset_file_contents or {})
        del expected_asset_file_contents

        # Note: we don't separately test AnalyzeDataset and TransformDataset as
        # AnalyzeAndTransformDataset currently simply composes these two
        # transforms.  If the implementation differs in future versions of the
        # code, we should also run AnalyzeDataset and TransformDataset composed.
        temp_dir = temp_dir or tempfile.mkdtemp(prefix=self._testMethodName,
                                                dir=self.get_temp_dir())
        with beam_pipeline or beam.Pipeline(
                runner=self._makeRunner()) as pipeline:
            with beam_impl.Context(temp_dir=temp_dir,
                                   desired_batch_size=desired_batch_size):
                input_data = pipeline | 'CreateInput' >> beam.Create(
                    input_data)
                if test_data is None:
                    (transformed_data, transformed_metadata), transform_fn = (
                        (input_data, input_metadata)
                        |
                        beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
                else:
                    transform_fn = ((input_data, input_metadata)
                                    |
                                    beam_impl.AnalyzeDataset(preprocessing_fn))
                    test_data = pipeline | 'CreateTest' >> beam.Create(
                        test_data)
                    transformed_data, transformed_metadata = (
                        ((test_data, input_metadata), transform_fn)
                        | beam_impl.TransformDataset())

                # Write transform_fn so we can test its assets
                _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

                if expected_data is not None:
                    transformed_data_coder = tft.coders.ExampleProtoCoder(
                        transformed_metadata.schema)

                    transformed_data_path = os.path.join(
                        temp_dir, 'transformed_data')
                    _ = (transformed_data
                         | beam.Map(transformed_data_coder.encode)
                         | beam.io.tfrecordio.WriteToTFRecord(
                             transformed_data_path, shard_name_template=''))

        # TODO(ebreck) Log transformed_data somewhere.
        if expected_data is not None:
            examples = tf.python_io.tf_record_iterator(
                path=transformed_data_path)
            transformed_data = [
                transformed_data_coder.decode(x) for x in examples
            ]
            self.assertDataCloseOrEqual(expected_data, transformed_data)

        tf_transform_output = tft.TFTransformOutput(temp_dir)
        if expected_metadata:
            self.assertEqual(expected_metadata,
                             tf_transform_output.transformed_metadata)

        for filename, file_contents in six.iteritems(
                expected_vocab_file_contents):
            full_filename = tf_transform_output.vocabulary_file_by_name(
                filename)
            with tf.gfile.Open(full_filename, 'rb') as f:
                file_lines = f.readlines()

                # Store frequency case.
                if isinstance(file_contents[0], tuple):
                    word_and_frequency_list = []
                    for content in file_lines:
                        frequency, word = content.split(b' ', 1)
                        word_and_frequency_list.append(
                            (word.strip(b'\n'), float(frequency.strip(b'\n'))))
                    actual_words, actual_frequency = zip(
                        *word_and_frequency_list)
                    expected_words, expected_frequency = zip(*file_contents)
                    self.assertAllEqual(expected_words, actual_words)
                    np.testing.assert_almost_equal(actual_frequency,
                                                   expected_frequency)
                else:
                    file_lines = [
                        content.strip(b'\n') for content in file_lines
                    ]
                    self.assertAllEqual(file_lines, file_contents)
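A hedged usage sketch of the helper above exercising expected_vocab_file_contents; the feature name, vocab filename and expected contents are illustrative and not part of the original module.

    # Hypothetical usage sketch; not part of the original test module.
    def test_vocabulary_contents(self):
        def preprocessing_fn(inputs):
            return {
                's_integerized':
                tft.compute_and_apply_vocabulary(inputs['s'],
                                                 vocab_filename='s_vocab'),
            }

        input_data = [{'s': 'a'}, {'s': 'b'}, {'s': 'a'}]
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec({
                's': tf.io.FixedLenFeature([], tf.string),
            }))
        # 'a' is more frequent than 'b', so it gets index 0.
        expected_data = [{'s_integerized': 0}, {'s_integerized': 1},
                         {'s_integerized': 0}]
        self.assertAnalyzeAndTransformResults(
            input_data,
            input_metadata,
            preprocessing_fn,
            expected_data,
            expected_vocab_file_contents={'s_vocab': [b'a', b'b']})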
Example #30
0
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
    """Run pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline
    training_data: the name of the table to train on.
    eval_data: the name of the table to evaluate on.
    predict_data: the name of the table to predict on.
    output_dir: file path to where to write all the output files.
    frequency_threshold: frequency threshold to use for categorical values.
  """

    # 1) The schema can be either defined in-memory or read from a configuration
    #    file, in this case we are creating the schema in-memory.
    input_schema = reddit.make_input_schema()

    # 2) Read from BigQuery or from CSV.
    train_data = pipeline | 'ReadTrainingData' >> _ReadData(training_data)
    evaluate_data = pipeline | 'ReadEvalData' >> _ReadData(eval_data)

    # TODO(b/33688220) should the transform functions take shuffle as an optional
    # argument?
    # TODO(b/33688275) Should the transform functions have more user friendly
    # names?
    input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)

    _ = (input_metadata
         | 'WriteInputMetadata' >> io.WriteMetadata(os.path.join(
             output_dir, path_constants.RAW_METADATA_DIR),
                                                    pipeline=pipeline))

    preprocessing_fn = reddit.make_preprocessing_fn(frequency_threshold)
    (train_dataset, train_metadata), transform_fn = (
        (train_data, input_metadata)
        | 'AnalyzeAndTransform' >>
        tft.AnalyzeAndTransformDataset(preprocessing_fn))

    # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
    # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
    # path_constants.TRANSFORMED_METADATA_DIR.
    _ = (transform_fn | 'WriteTransformFn' >> io.WriteTransformFn(output_dir))

    (evaluate_dataset,
     evaluate_metadata) = (((evaluate_data, input_metadata), transform_fn)
                           | 'TransformEval' >> tft.TransformDataset())

    # pylint: disable=expression-not-assigned
    # TODO(b/34231369) Remember to eventually also save the statistics and the
    # metadata.

    train_coder = coders.ExampleProtoCoder(train_metadata.schema)
    (train_dataset
     | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
     | 'WriteTraining' >>
     beam.io.WriteToTFRecord(os.path.join(
         output_dir, path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
                             file_name_suffix='.tfrecord.gz'))

    evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
    (evaluate_dataset
     | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
     | 'WriteEval' >> beam.io.WriteToTFRecord(os.path.join(
         output_dir, path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
                                              file_name_suffix='.tfrecord.gz'))

    if predict_data:
        predict_mode = tf.contrib.learn.ModeKeys.INFER
        predict_schema = reddit.make_input_schema(mode=predict_mode)
        predict_coder = coders.ExampleProtoCoder(predict_schema)

        # TODO(b/35653662): Simplify once tf.transform 0.1.5 is released.
        def encode_predict_data(d):
            try:
                return predict_coder.encode(d)
            except Exception:  # pylint: disable=broad-except
                # Compatibility path for tf.transform < 0.1.5
                return predict_coder.encode({
                    k: v.encode('utf-8') if isinstance(v, unicode) else v
                    for k, v in d.items()
                })

        serialized_examples = (
            pipeline
            | 'ReadPredictData' >> _ReadData(predict_data, mode=predict_mode)
            # TODO(b/35194257) Obviate the need for this explicit
            # serialization.
            | 'EncodePredictData' >> beam.Map(encode_predict_data))
        _ = (serialized_examples
             | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.tfrecord.gz'))
        _ = (serialized_examples
             | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
             | 'WritePredictDataAsText' >> beam.io.WriteToText(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.txt'))
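A hedged driver sketch for preprocess; the BigQuery table names, output directory and frequency threshold are placeholders, and depending on the tf.Transform version an explicit Context with a temp_dir may also be required around the pipeline.

# Hypothetical driver for preprocess; argument values are placeholders.
def main():
    with beam.Pipeline() as pipeline:
        preprocess(
            pipeline=pipeline,
            training_data='project.dataset.reddit_train',
            eval_data='project.dataset.reddit_eval',
            predict_data='',  # empty string skips the predict branch
            output_dir='/tmp/reddit_preprocessed',
            frequency_threshold=5)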