Example #1
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
  """Run pre-processing step as a pipeline.

  Args:
    pipeline: the Beam pipeline to attach the pre-processing steps to.
    training_data: the name of the BigQuery table (or CSV file pattern) to
      train on.
    eval_data: the name of the BigQuery table (or CSV file pattern) to
      evaluate on.
    predict_data: the name of the BigQuery table (or CSV file pattern) to
      predict on.
    output_dir: directory path where all the output files are written.
    frequency_threshold: frequency threshold to use for categorical values.
  """

  # 1) The schema can either be defined in-memory or read from a configuration
  #    file; in this case we create the schema in-memory (see the illustrative
  #    sketch below).
  input_schema = reddit.make_input_schema()
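  # For illustration only, assuming the tf.transform tf_metadata API of the
  # time (hypothetical feature name, not the sample's real spec), an in-memory
  # schema is built from a feature spec along the lines of:
  #   dataset_schema.from_feature_spec(
  #       {'score': tf.FixedLenFeature(shape=[], dtype=tf.int64)})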

  # 2) Read from BigQuery or from CSV.
  train_data = pipeline | 'ReadTrainingData' >> _ReadData(training_data)
  evaluate_data = pipeline | 'ReadEvalData' >> _ReadData(eval_data)

  input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)

  _ = (input_metadata
       | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
           pipeline=pipeline))

  preprocessing_fn = reddit.make_preprocessing_fn(frequency_threshold)
  (train_dataset, train_metadata), transform_fn = (
      (train_data, input_metadata)
      | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
          preprocessing_fn))

  # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
  # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
  # path_constants.TRANSFORMED_METADATA_DIR.
  _ = (transform_fn
       | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))
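  # Assuming the usual tf.transform directory constants, the resulting layout
  # is roughly:
  #   <output_dir>/transform_fn/...
  #   <output_dir>/transformed_metadata/...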

  (evaluate_dataset, evaluate_metadata) = (
      ((evaluate_data, input_metadata), transform_fn)
      | 'TransformEval' >> tft.TransformDataset())

  # pylint: disable=expression-not-assigned
  # TODO(b/34231369) Remember to eventually also save the statistics and the
  # metadata.

  train_coder = coders.ExampleProtoCoder(train_metadata.schema)
  (train_dataset
   | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
   | 'ShuffleTraining' >> _Shuffle()  # pylint: disable=no-value-for-parameter
   | 'WriteTraining' >> beam.io.WriteToTFRecord(
       os.path.join(output_dir,
                    path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
       file_name_suffix='.tfrecord.gz'))

  evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
  (evaluate_dataset
   | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
   | 'ShuffleEval' >> _Shuffle()  # pylint: disable=no-value-for-parameter
   | 'WriteEval' >> beam.io.WriteToTFRecord(
       os.path.join(output_dir,
                    path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
       file_name_suffix='.tfrecord.gz'))

  if predict_data:
    predict_mode = tf.contrib.learn.ModeKeys.INFER
    predict_schema = reddit.make_input_schema(mode=predict_mode)
    predict_coder = coders.ExampleProtoCoder(predict_schema)

    serialized_examples = (pipeline
                           | 'ReadPredictData' >> _ReadData(
                               predict_data, mode=predict_mode)
                           # TODO(b/35194257) Obviate the need for this explicit
                           # serialization.
                           | 'EncodePredictData' >> beam.Map(
                               predict_coder.encode))
    _ = (serialized_examples
         | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))
    _ = (serialized_examples
         | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
         | 'WritePredictDataAsText' >> beam.io.WriteToText(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.txt'))
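
Note: _Shuffle is referenced above but not defined in this snippet. Below is a
minimal sketch of a plausible implementation, assuming it is a ptransform_fn
that randomizes record order by grouping on random keys (this body is an
assumption, not part of the sample):

import random

import apache_beam as beam


@beam.ptransform_fn
def _Shuffle(pcoll):  # pylint: disable=invalid-name
  """Shuffles a PCollection by pairing each element with a random key."""
  return (pcoll
          | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
          | 'GroupByRandom' >> beam.GroupByKey()
          | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))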
Example #2
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
  """Run pre-processing step as a pipeline.

  Args:
    pipeline: the Beam pipeline to attach the pre-processing steps to.
    training_data: the name of the BigQuery table (or CSV file pattern) to
      train on.
    eval_data: the name of the BigQuery table (or CSV file pattern) to
      evaluate on.
    predict_data: the name of the BigQuery table (or CSV file pattern) to
      predict on.
    output_dir: directory path where all the output files are written.
    frequency_threshold: frequency threshold to use for categorical values.
  """
  work_dir = os.path.join(output_dir, path_constants.TEMP_DIR)

  # 1) The schema can either be defined in-memory or read from a configuration
  #    file; in this case we create the schema in-memory.
  input_schema = reddit.make_input_schema()

  # 2) Read from BigQuery or from CSV.
  train_data = pipeline | 'ReadTrainingData' >> _ReadData(training_data)
  evaluate_data = pipeline | 'ReadEvalData' >> _ReadData(eval_data)

  # TODO(b/33688220) should the transform functions take shuffle as an optional
  # argument?
  # TODO(b/33688275) Should the transform functions have more user friendly
  # names?
  input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)

  _ = (input_metadata
       | 'WriteInputMetadata' >> io.WriteMetadata(
           os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
           pipeline=pipeline))

  preprocessing_fn = reddit.make_preprocessing_fn(frequency_threshold)
  (train_dataset, train_metadata), transform_fn = (
      (train_data, input_metadata)
      | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
          preprocessing_fn, work_dir))

  # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
  # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
  # path_constants.TRANSFORMED_METADATA_DIR.
  _ = (transform_fn | 'WriteTransformFn' >> io.WriteTransformFn(output_dir))

  (evaluate_dataset, evaluate_metadata) = (
      ((evaluate_data, input_metadata), transform_fn)
      | 'TransformEval' >> tft.TransformDataset())

  # pylint: disable=expression-not-assigned
  # TODO(b/34231369) Remember to eventually also save the statistics and the
  # metadata.

  train_coder = coders.ExampleProtoCoder(train_metadata.schema)
  (train_dataset
   | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
   | 'WriteTraining' >> beam.io.WriteToTFRecord(
       os.path.join(output_dir,
                    path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
       file_name_suffix='.tfrecord.gz'))

  evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
  (evaluate_dataset
   | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
   | 'WriteEval' >> beam.io.WriteToTFRecord(
       os.path.join(output_dir,
                    path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
       file_name_suffix='.tfrecord.gz'))

  if predict_data:
    predict_mode = tf.contrib.learn.ModeKeys.INFER
    predict_schema = reddit.make_input_schema(mode=predict_mode)
    predict_coder = coders.ExampleProtoCoder(predict_schema)

    # TODO(b/35653662): Simplify once tf.transform 0.1.5 is released.
    def encode_predict_data(d):
      try:
        return predict_coder.encode(d)
      except Exception:  # pylint: disable=broad-except
        # Compatibility path for tf.transform < 0.1.5
        return predict_coder.encode({
            k: v.encode('utf-8') if isinstance(v, unicode) else v
            for k, v in d.items()
        })

    serialized_examples = (pipeline
                           | 'ReadPredictData' >> _ReadData(
                               predict_data, mode=predict_mode)
                           # TODO(b/35194257) Obviate the need for this explicit
                           # serialization.
                           | 'EncodePredictData' >> beam.Map(
                               encode_predict_data))
    _ = (serialized_examples
         | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))
    _ = (serialized_examples
         | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
         | 'WritePredictDataAsText' >> beam.io.WriteToText(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.txt'))
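
Note: _encode_as_b64_json is referenced but not defined in this snippet. A
minimal sketch, assuming it wraps each serialized tf.Example in the
{"b64": ...} JSON envelope used for base64 batch-prediction input (an
assumption; the real helper may differ):

import base64
import json


def _encode_as_b64_json(serialized_example):
  """Wraps a serialized tf.Example in a base64 JSON envelope."""
  # decode('ascii') keeps the value a text string on Python 2 and 3 alike.
  return json.dumps(
      {'b64': base64.b64encode(serialized_example).decode('ascii')})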
Example #3
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
  """Run pre-processing step as a pipeline.

  Args:
    pipeline: the Beam pipeline to attach the pre-processing steps to.
    training_data: the name of the BigQuery table (or CSV file pattern) to
      train on.
    eval_data: the name of the BigQuery table (or CSV file pattern) to
      evaluate on.
    predict_data: the name of the BigQuery table (or CSV file pattern) to
      predict on.
    output_dir: directory path where all the output files are written.
    frequency_threshold: frequency threshold to use for categorical values.
  """

  # 1) The schema can either be defined in-memory or read from a configuration
  #    file; in this case we create the schema in-memory.
  input_schema = reddit.make_input_schema()

  # 2) Read from BigQuery or from CSV.
  train_data = pipeline | 'ReadTrainingData' >> _ReadData(training_data)
  evaluate_data = pipeline | 'ReadEvalData' >> _ReadData(eval_data)

  input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)

  _ = (input_metadata
       | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
           pipeline=pipeline))

  preprocessing_fn = reddit.make_preprocessing_fn(frequency_threshold)
  transform_fn = ((train_data, input_metadata)
                  | 'Analyze' >> tft.AnalyzeDataset(preprocessing_fn))

  # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
  # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
  # path_constants.TRANSFORMED_METADATA_DIR.
  _ = (transform_fn
       | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

  @beam.ptransform_fn
  def TransformAndWrite(pcoll, path):  # pylint: disable=invalid-name
    pcoll |= 'Shuffle' >> _Shuffle()  # pylint: disable=no-value-for-parameter
    (dataset, metadata) = (((pcoll, input_metadata), transform_fn)
                           | 'Transform' >> tft.TransformDataset())
    coder = coders.ExampleProtoCoder(metadata.schema)
    _ = (dataset
         | 'SerializeExamples' >> beam.Map(coder.encode)
         | 'WriteExamples' >> beam.io.WriteToTFRecord(
             os.path.join(output_dir, path), file_name_suffix='.tfrecord.gz'))

  _ = train_data | 'TransformAndWriteTraining' >> TransformAndWrite(  # pylint: disable=no-value-for-parameter
      path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX)

  _ = evaluate_data | 'TransformAndWriteEval' >> TransformAndWrite(  # pylint: disable=no-value-for-parameter
      path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX)

  # TODO(b/35300113) Remember to eventually also save the statistics.

  if predict_data:
    predict_mode = tf.contrib.learn.ModeKeys.INFER
    predict_schema = reddit.make_input_schema(mode=predict_mode)
    predict_coder = coders.ExampleProtoCoder(predict_schema)

    serialized_examples = (pipeline
                           | 'ReadPredictData' >> _ReadData(
                               predict_data, mode=predict_mode)
                           # TODO(b/35194257) Obviate the need for this explicit
                           # serialization.
                           | 'EncodePredictData' >> beam.Map(
                               predict_coder.encode))
    _ = (serialized_examples
         | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))
    _ = (serialized_examples
         | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
         | 'WritePredictDataAsText' >> beam.io.WriteToText(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.txt'))
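
Note: each variant expects to be composed into a caller-owned Beam pipeline.
A minimal, hypothetical driver is sketched below; the runner, table names,
bucket path, and threshold are placeholders, not taken from the samples:

import apache_beam as beam


def main():
  with beam.Pipeline('DirectRunner') as pipeline:
    preprocess(
        pipeline=pipeline,
        training_data='my_project:reddit.training',  # hypothetical table
        eval_data='my_project:reddit.eval',          # hypothetical table
        predict_data='my_project:reddit.predict',    # hypothetical table
        output_dir='gs://my-bucket/reddit/output',   # hypothetical path
        frequency_threshold=5)


if __name__ == '__main__':
  main()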