Code Example #1
def encode():
    """
    Creates a Beam pipeline that generates data, transforms it, and encodes it in the ELWC (ExampleListWithContext) format.
    """
    output_path = "./output"
    options = PipelineOptions()
    options.view_as(StandardOptions).runner = "DirectRunner"

    with beam.Pipeline(options=options) as pipeline:
        with tft_beam.Context(temp_dir="./tmp"):
            raw_data = generate_data(100)
            input_data = (pipeline | beam.Create(raw_data))

            transformed_data, transform_fn = (
                (input_data, raw_metadata)
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

            elwc_coder = ELWCProtoCoder(context_specs, examples_specs)
            data, metadata = transformed_data

            _ = (data | beam.Map(elwc_coder.encode) | beam.io.WriteToTFRecord(
                file_path_prefix="{}/data".format(output_path),
                file_name_suffix=".tfrecords"))

            _ = (transform_fn | tft_beam.WriteTransformFn(output_path))
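The names raw_metadata, context_specs, examples_specs, generate_data and preprocessing_fn are defined elsewhere in the original project. A minimal sketch of what the feature specs and metadata might look like (hypothetical names and fields, purely for illustration):

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils

# Hypothetical context-level and example-level feature specs for the ELWC coder.
context_specs = {'query_id': tf.io.FixedLenFeature([], tf.int64)}
examples_specs = {'relevance': tf.io.FixedLenFeature([], tf.int64)}

# Metadata describing the flat raw features fed to AnalyzeAndTransformDataset.
raw_metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec({**context_specs, **examples_specs}))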
Code Example #2
File: taxi_utils_test.py Project: jay90099/tfx
  def testPreprocessingFn(self):
    schema_file = os.path.join(self._testdata_path, 'schema_gen/schema.pbtxt')
    schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
    feature_spec = taxi_utils._get_raw_feature_spec(schema)
    working_dir = self.get_temp_dir()
    transform_graph_path = os.path.join(working_dir, 'transform_graph')
    transformed_examples_path = os.path.join(
        working_dir, 'transformed_examples')

    # Run very simplified version of executor logic.
    # TODO(kestert): Replace with tft_unit.assertAnalyzeAndTransformResults.
    # Generate legacy `DatasetMetadata` object.  Future version of Transform
    # will accept the `Schema` proto directly.
    legacy_metadata = dataset_metadata.DatasetMetadata(
        schema_utils.schema_from_feature_spec(feature_spec))
    tfxio = tf_example_record.TFExampleRecord(
        file_pattern=os.path.join(self._testdata_path,
                                  'csv_example_gen/Split-train/*'),
        telemetry_descriptors=['Tests'],
        schema=legacy_metadata.schema)
    with beam.Pipeline() as p:
      with tft_beam.Context(temp_dir=os.path.join(working_dir, 'tmp')):
        examples = p | 'ReadTrainData' >> tfxio.BeamSource()
        (transformed_examples, transformed_metadata), transform_fn = (
            (examples, tfxio.TensorAdapterConfig())
            | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
                taxi_utils.preprocessing_fn))

        # WriteTransformFn writes transform_fn and metadata to subdirectories
        # tensorflow_transform.SAVED_MODEL_DIR and
        # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
        # pylint: disable=expression-not-assigned
        (transform_fn
         |
         'WriteTransformFn' >> tft_beam.WriteTransformFn(transform_graph_path))

        encoder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)
        (transformed_examples
         | 'EncodeTrainData' >> beam.Map(encoder.encode)
         | 'WriteTrainData' >> beam.io.WriteToTFRecord(
             os.path.join(transformed_examples_path,
                          'Split-train/transformed_examples.gz'),
             coder=beam.coders.BytesCoder()))
        # pylint: enable=expression-not-assigned

    # Verify the output matches golden output.
    # NOTE: we don't verify that transformed examples match golden output.
    expected_transformed_schema = io_utils.parse_pbtxt_file(
        os.path.join(
            self._testdata_path,
            'transform/transform_graph/transformed_metadata/schema.pbtxt'),
        schema_pb2.Schema())
    transformed_schema = io_utils.parse_pbtxt_file(
        os.path.join(transform_graph_path, 'transformed_metadata/schema.pbtxt'),
        schema_pb2.Schema())
    # Clear annotations so we only have to test main schema.
    transformed_schema.ClearField('annotation')
    for feature in transformed_schema.feature:
      feature.ClearField('annotation')
    self.assertEqual(transformed_schema, expected_transformed_schema)
Code Example #3
def run():
    pipeline_options = PipelineOptions(['--runner=DirectRunner'])

    def preprocessing_fn(inputs):
        word = inputs['word']
        count = inputs['count']
        return {
            'word': word,
            'count': count,
            'count_normalized': tft.scale_to_0_1(count)
        }

    with beam.Pipeline(options=pipeline_options) as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            counts_data = (pipeline
                           | "Load" >> ReadFromText(INPUT_FILE)
                           | "CountWords" >> CountWordsTransform())

            (transformed_data, transformed_metadata), _ = (
                (counts_data, COUNTS_METADATA)
                | "AnalyzeAndTransform" >>
                tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

            output_column_names = ['word', 'count', 'count_normalized']
            transformed_data_coder = tft.coders.CsvCoder(
                output_column_names, transformed_metadata.schema)

            _ = (transformed_data
                 | "EncodeToCsv" >> beam.Map(transformed_data_coder.encode)
                 | "Save" >> WriteToText(OUTPUT_FILE))
Code Example #4
def _transform_and_write_tfr(
    dataset: pvalue.PCollection,
    tfr_writer: Callable[[], beam.io.tfrecordio.WriteToTFRecord],
    raw_metadata: types.BeamDatasetMetadata,
    preprocessing_fn: Optional[Callable] = None,
    transform_fn: Optional[types.TransformFn] = None,
    label: str = 'data'):
  """Applies TF Transform to dataset and outputs it as TFRecords."""

  dataset_metadata = (dataset, raw_metadata)

  if transform_fn:
    transformed_dataset, transformed_metadata = (
        (dataset_metadata, transform_fn)
        | f'Transform{label}' >> tft_beam.TransformDataset())
  else:
    if not preprocessing_fn:
      preprocessing_fn = lambda x: x
    (transformed_dataset, transformed_metadata), transform_fn = (
        dataset_metadata
        | f'AnalyzeAndTransform{label}' >>
        tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

  transformed_data_coder = tft.coders.ExampleProtoCoder(
      transformed_metadata.schema)
  _ = (
      transformed_dataset
      | f'Encode{label}' >> beam.Map(transformed_data_coder.encode)
      | f'Write{label}' >> tfr_writer(prefix=label.lower()))

  return transform_fn
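Although the annotation says Callable[[], ...], tfr_writer is invoked with a prefix keyword, so it is expected to build a WriteToTFRecord transform from a prefix. A hypothetical call site under that assumption (output_dir, dataset, raw_metadata and my_preprocessing_fn are placeholders):

def tfr_writer(prefix):
    return beam.io.WriteToTFRecord(
        file_path_prefix=os.path.join(output_dir, prefix),
        file_name_suffix='.tfrecord.gz')

transform_fn = _transform_and_write_tfr(
    dataset, tfr_writer, raw_metadata,
    preprocessing_fn=my_preprocessing_fn, label='Train')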
Code Example #5
File: generate_vocab.py Project: hanneshapke/text
  def run_metrics():
    """Creates a pipeline to measure wordpiece vocab metrics over a corpus."""

    metrics_pipeline = beam.Pipeline()

    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
      # Read raw data and convert to TF Transform encoded dict.
      raw_data = (
          metrics_pipeline
          | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
              data_file, coder=beam.coders.ProtoCoder(tf.train.Example))
          | 'DecodeInputData' >> beam.Map(example_converter.decode))

      # Apply transform to wordpiece-tokenize input.
      (metrics_transformed_data, _), _ = (
          (raw_data, raw_metadata)
          | 'WordpieceTokenizeInput' >> tft_beam.AnalyzeAndTransformDataset(
              utils.metrics_preprocessing_fn(FLAGS.vocab_file,
                                             FLAGS.text_key,
                                             FLAGS.language_code_key)))

      # Initialize the CSV coder. Aggregate values for each lang, calculate metrics,
      # and write the output to a CSV file.
      csv_converter = tft.coders.CsvCoder(columns, csv_schema)
      _ = (
          metrics_transformed_data
          | 'CompileTokenInfo' >> beam.ParDo(utils.CompileTokenizationInfo())
          | 'CombineStatsForLang' >> beam.CombineGlobally(utils.AggregateLang())
          | 'CalculateMetrics' >> beam.ParDo(utils.CalculateMetrics())
          | 'EncodeMetrics' >> beam.Map(csv_converter.encode)
          | 'WriteMetrics' >> beam.io.WriteToText(
              metrics_file, shard_name_template='', header=','.join(columns)))
    return metrics_pipeline
Code Example #6
def word_count(input_path, output_path, raw_metadata, min_token_frequency=2):
    """Returns a pipeline counting words and writing the output.

  Args:
    input_path: recordio file to read
    output_path: path in which to write the output
    raw_metadata: metadata of input tf.Examples
    min_token_frequency: the min frequency for a token to be included
  """

    lang_set = set(FLAGS.lang_set.split(','))

    # Create pipeline.
    pipeline = beam.Pipeline()

    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        converter = tft.coders.ExampleProtoCoder(raw_metadata.schema,
                                                 serialized=False)

        # Read raw data and convert to TF Transform encoded dict.
        raw_data = (
            pipeline
            | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
                input_path, coder=beam.coders.ProtoCoder(tf.train.Example))
            | 'DecodeInputData' >> beam.Map(converter.decode))

        # Apply TF Transform.
        (transformed_data, _), _ = (
            (raw_data, raw_metadata)
            |
            'FilterLangAndExtractToken' >> tft_beam.AnalyzeAndTransformDataset(
                utils.count_preprocessing_fn(FLAGS.text_key,
                                             FLAGS.language_code_key)))

        # Filter by languages.
        tokens = (
            transformed_data
            | 'FilterByLang' >> beam.ParDo(utils.FilterTokensByLang(lang_set)))

        # Calculate smoothing coefficients.
        coeffs = (tokens
                  | 'CalculateSmoothingCoefficients' >> beam.CombineGlobally(
                      utils.CalculateCoefficients(FLAGS.smoothing_exponent)))

        # Apply smoothing, aggregate counts, and sort words by count.
        _ = (tokens
             | 'ApplyExponentialSmoothing' >> beam.ParDo(
                 utils.ExponentialSmoothing(), beam.pvalue.AsSingleton(coeffs))
             | 'SumCounts' >> beam.CombinePerKey(sum)
             | 'FilterLowCounts' >> beam.ParDo(
                 utils.FilterByCount(FLAGS.max_word_length,
                                     min_token_frequency))
             |
             'MergeAndSortCounts' >> beam.CombineGlobally(utils.SortByCount())
             | 'Flatten' >> beam.FlatMap(lambda x: x)
             | 'FormatCounts' >> beam.Map(lambda tc: '%s\t%s' % (tc[0], tc[1]))
             | 'WriteSortedCount' >> beam.io.WriteToText(
                 output_path, shard_name_template=''))

    return pipeline
Code Example #7
def run(pipeline_options, known_args):
    global force_tf_compat_v1
    argv = None  # if None, uses sys.argv
    pipeline_options = PipelineOptions(argv)
    pipeline = beam.Pipeline(options=pipeline_options)

    if "universal-sentence-encoder" in MODEL_URL and int(
            MODEL_URL.split("/")[-1]) <= 2:
        # https://github.com/tensorflow/transform/issues/160
        force_tf_compat_v1 = True

    with tft_beam.Context(temp_dir=tempfile.mkdtemp(),
                          force_tf_compat_v1=force_tf_compat_v1):
        print("Context force_tf_compat_v1: {}".format(
            tft_beam.Context.get_use_tf_compat_v1()))
        articles = (
            pipeline
            | beam.Create([
                {
                    "id": "01",
                    "text": "To be, or not to be: that is the question: "
                },
                {
                    "id": "02",
                    "text": "Whether 'tis nobler in the mind to suffer "
                },
                {
                    "id": "03",
                    "text": "The slings and arrows of outrageous fortune, "
                },
                {
                    "id": "04",
                    "text": "Or to take arms against a sea of troubles, "
                },
            ]))

        articles_dataset = (articles, get_metadata())

        transformed_dataset, transform_fn = (
            articles_dataset
            | "Extract embeddings" >>
            tft_beam.AnalyzeAndTransformDataset(preprocess_fn))

        transformed_data, transformed_metadata = transformed_dataset

        _ = (transformed_data
             | "Print embeddings" >> beam.Map(print_pass)
             | "Write embeddings to TFRecords" >>
             beam.io.tfrecordio.WriteToTFRecord(
                 file_path_prefix="{0}".format(known_args.output_dir),
                 file_name_suffix=".tfrecords",
                 coder=tft_coders.example_proto_coder.ExampleProtoCoder(
                     transformed_metadata.schema),
                 num_shards=1))

    job = pipeline.run()
    if pipeline_options.get_all_options()["runner"] == "DirectRunner":
        job.wait_until_finish()
Code Example #8
def main():

    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
            (_RAW_DATA, _RAW_DATA_METADATA)
            | tft_beam.AnalyzeAndTransformDataset(_preprocessing_fn))

    transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable

    pprint.pprint(transformed_data)
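The snippet relies on module-level _RAW_DATA, _RAW_DATA_METADATA and _preprocessing_fn. A self-contained sketch of plausible definitions (the actual ones are not shown in the original):

import tensorflow as tf
import tensorflow_transform as tft
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils

_RAW_DATA = [{'x': 1.0}, {'x': 2.0}, {'x': 3.0}]

_RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec({
        'x': tf.io.FixedLenFeature([], tf.float32),
    }))

def _preprocessing_fn(inputs):
    # Scale the single numeric column to [0, 1].
    return {'x_scaled': tft.scale_to_0_1(inputs['x'])}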
Code Example #9
def data_transform():
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        transformed_dataset, transform_fn = (
            (dict_features, data_metadata)
            | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

    transformed_data, transformed_metadata = transformed_dataset

    for i in range(len(transformed_data)):
        print("Initial: ", dict_features[i])
        print("Transformed: ", transformed_data[i])
Code Example #10
def generate_skipgrams(data_uri,
                       feature_names,
                       vocabulary_size=10,
                       window_size=2,
                       negative_samples=0.,
                       save_path="temp"):
    def parse_tensor_f(x):
        xp = tf.io.parse_tensor(x, tf.int64)
        xp.set_shape([None])
        return {fname: xp[i] for i, fname in enumerate(feature_names)}

    raw_data = tf.data.TFRecordDataset(data_uri).map(
        parse_tensor_f).as_numpy_iterator()
    raw_data_schema = dataset_metadata.DatasetMetadata(
        schema_utils.schema_from_feature_spec({
            fname: tf.io.FixedLenFeature([], tf.int64)
            for fname in feature_names
        }))
    dataset = (raw_data, raw_data_schema)

    # Make the preprocessing_fn
    preprocessing_fn = make_preproc_func(vocabulary_size, window_size,
                                         negative_samples, feature_names)

    # Run the beam pipeline
    with beam.Pipeline() as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp(),
                              desired_batch_size=2):
            transformed_dataset, transform_fn = (
                dataset | "Make Skipgrams" >>
                tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            print('Transformed dataset:\n{}'.format(
                pprint.pformat(transformed_dataset)))

            # pylint: disable=unused-variable
            transformed_data, transformed_metadata = transformed_dataset
            saved_results = (
                transformed_data
                | "Write to TFRecord" >> beam.io.tfrecordio.WriteToTFRecord(
                    file_path_prefix=save_path,
                    file_name_suffix=".tfrecords",
                    coder=tft.coders.example_proto_coder.ExampleProtoCoder(
                        transformed_metadata.schema)))
            print('\nRaw data:\n{}\n'.format(pprint.pformat(raw_data)))
            print('Transformed data:\n{}'.format(
                pprint.pformat(transformed_data)))
            # Return the list of paths of tfrecords
            num_rows_saved = len(transformed_data)

    return saved_results, num_rows_saved
Code Example #11
    def run_vocab():
        """Creates a pipeline to generate wordpiece vocab over a corpus."""

        vocab_pipeline = beam.Pipeline()

        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            # Read raw data and convert to TF Transform encoded dict.
            raw_data = (
                vocab_pipeline
                | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
                    data_file, coder=beam.coders.ProtoCoder(tf.train.Example))
                | 'DecodeInputData' >> beam.Map(example_converter.decode))

            # Apply TF Transform.
            (transformed_data,
             _), _ = ((raw_data, raw_metadata)
                      | 'FilterLangAndExtractToken' >>
                      tft_beam.AnalyzeAndTransformDataset(
                          utils.count_preprocessing_fn(
                              FLAGS.text_key, FLAGS.language_code_key)))

            # Filter by languages.
            tokens = (transformed_data
                      | 'FilterByLang' >> beam.ParDo(
                          utils.FilterTokensByLang(lang_set)))

            # Calculate smoothing coefficients.
            coeffs = (
                tokens
                | 'CalculateSmoothingCoefficients' >> beam.CombineGlobally(
                    utils.CalculateCoefficients(FLAGS.smoothing_exponent)))

            # Apply smoothing, aggregate counts, and sort words by count.
            _ = (tokens
                 | 'ApplyExponentialSmoothing' >> beam.ParDo(
                     utils.ExponentialSmoothing(),
                     beam.pvalue.AsSingleton(coeffs))
                 | 'SumCounts' >> beam.CombinePerKey(sum)
                 | 'FilterLowCounts' >> beam.ParDo(
                     utils.FilterByCount(FLAGS.max_word_length,
                                         min_token_frequency))
                 | 'MergeAndSortCounts' >> beam.CombineGlobally(
                     utils.SortByCount())
                 | 'LearnVocab' >> beam.ParDo(utils.LearnVocab(params))
                 | 'Flatten' >> beam.FlatMap(lambda x: x + '\n')
                 | 'WriteVocab' >> beam.io.WriteToText(
                     vocab_file,
                     shard_name_template='',
                     append_trailing_newlines=False))
        return vocab_pipeline
Code Example #12
def transform_tft(train_data, test_data, working_dir):
    options = PipelineOptions()
    options.view_as(StandardOptions).runner = 'DirectRunner'
    with beam.Pipeline(options=options) as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            data_shape = train_data[0][0].shape
            raw_data = (
                pipeline | 'ReadTrainData' >> beam.Create(train_data)
                | 'CreateTrainData' >> beam.Map(lambda data: format(data)))
            raw_data_metadata = dataset_metadata.DatasetMetadata(
                dataset_schema.from_feature_spec({
                    IMAGE_KEY:
                    tf.FixedLenFeature(list(data_shape), tf.float32),
                    LABEL_KEY:
                    tf.FixedLenFeature([], tf.int64)
                }))
            raw_dataset = (raw_data, raw_data_metadata)
            transformed_dataset, transform_fn = (
                raw_dataset
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            _ = (
                transformed_data
                | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
                | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE),
                    file_name_suffix='.tfrecords'))

            raw_test_data = (
                pipeline | 'ReadTestData' >> beam.Create(test_data)
                | 'CreateTestData' >> beam.Map(lambda data: format(data)))
            raw_test_dataset = (raw_test_data, raw_data_metadata)

            transformed_test_dataset = ((raw_test_dataset, transform_fn)
                                        | tft_beam.TransformDataset())
            # Don't need transformed data schema, it's the same as before.
            transformed_test_data, _ = transformed_test_dataset

            _ = (transformed_test_data
                 | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
                 | 'WriteTestData' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE),
                     file_name_suffix='.tfrecords'))

            _ = (transform_fn |
                 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
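IMAGE_KEY, LABEL_KEY, the *_FILEBASE constants, preprocessing_fn and the format helper are module-level names; format here is presumably a local helper that turns an (image, label) pair into a feature dict rather than Python's built-in format. A sketch under that assumption:

import tensorflow as tf
import tensorflow_transform as tft

IMAGE_KEY = 'image'
LABEL_KEY = 'label'

def format(data):
    # Hypothetical: map an (image, label) tuple to the dict the schema expects.
    image, label = data
    return {IMAGE_KEY: image, LABEL_KEY: label}

def preprocessing_fn(inputs):
    # Hypothetical: scale pixel values to [0, 1] and pass the label through.
    return {
        IMAGE_KEY: tft.scale_to_0_1(inputs[IMAGE_KEY]),
        LABEL_KEY: inputs[LABEL_KEY],
    }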
Code Example #13
def transformed_data(working_dir):
    """数据处理与生成transform_fn"""
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        xi, yi = inputs["x"], inputs["y"]
        x_integerized = tft.compute_and_apply_vocabulary(xi, default_value=0, name="vocab")  # , top_k=VOCAB_SIZE)
        y_integerized = tft.compute_and_apply_vocabulary(yi, default_value=0, name="label")  # ,top_k=LABEL_SIZE
        return {"x": x_integerized, "y": y_integerized}

    # path_transform
    with tft_beam.Context(temp_dir=path_transform):
        transformed_dataset, transform_fn = ((xys, DATA_STRING_FEATURE_SPEC) | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
        transformed_train_data, transformed_metadata = transformed_dataset
        _ = (transform_fn | tft_beam.WriteTransformFn(working_dir))
    return transformed_train_data
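path_transform, xys and DATA_STRING_FEATURE_SPEC are defined elsewhere. Despite its name, DATA_STRING_FEATURE_SPEC is piped where AnalyzeAndTransformDataset expects dataset metadata, so a hypothetical reconstruction could look like this:

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils

DATA_STRING_FEATURE_SPEC = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec({
        'x': tf.io.FixedLenFeature([], tf.string),
        'y': tf.io.FixedLenFeature([], tf.string),
    }))

# xys would then be a list of instance dicts, e.g.:
xys = [{'x': 'hello world', 'y': 'greeting'},
       {'x': 'goodbye now', 'y': 'farewell'}]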
Code Example #14
    def pipeline(root):
        """Pipeline instantiation function.

    Args:
      root: Source pipeline from which to extend.
    """

        preprocessing_fn = compute_vocab_fn if FLAGS.vocab_gen_mode else apply_vocab_fn

        with tft_beam.Context(temp_dir=FLAGS.temp_dir):
            processed_lines = (
                root
                # Read in TSV data.
                | beam.io.ReadFromText(data_path)
                # Fill in missing elements with the defaults (zeros).
                | "FillMissing" >> beam.ParDo(FillMissing())
                # For numerical features, set negatives to zero. Then take log(x+1).
                | "NegsToZeroLog" >> beam.ParDo(NegsToZeroLog())
                # For categorical features, mod the values with vocab size.
                | "HexToIntModRange" >> beam.ParDo(HexToIntModRange()))

            # CSV reader: List the cols in order, as dataset schema is not ordered.
            ordered_columns = [
                LABEL_KEY
            ] + NUMERIC_FEATURE_KEYS + CATEGORICAL_FEATURE_KEYS
            converter = tft.coders.CsvCoder(ordered_columns,
                                            INPUT_METADATA.schema,
                                            delimiter=FLAGS.csv_delimeter)

            converted_data = (processed_lines
                              | "DecodeData" >> beam.Map(converter.decode))

            transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
                (converted_data, INPUT_METADATA)
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset

            if not FLAGS.vocab_gen_mode:
                # Write to CSV.
                transformed_csv_coder = tft.coders.CsvCoder(
                    ordered_columns,
                    transformed_metadata.schema,
                    delimiter=FLAGS.csv_delimeter)
                _ = (transformed_data
                     |
                     "EncodeDataCsv" >> beam.Map(transformed_csv_coder.encode)
                     | "WriteDataCsv" >> beam.io.WriteToText(output_path))
Code Example #15
def main():
  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    x = inputs['x']
    y = inputs['y']
    s = inputs['s']
    x_centered = x - tft.mean(x)
    y_normalized = tft.scale_to_0_1(y)
    s_integerized = tft.compute_and_apply_vocabulary(s)
    x_centered_times_y_normalized = (x_centered * y_normalized)
    return {
        'x_centered': x_centered,
        'y_normalized': y_normalized,
        'x_centered_times_y_normalized': x_centered_times_y_normalized,
        's_integerized': s_integerized
    }

  raw_data = [
      {'x': 1, 'y': 1, 's': 'hello'},
      {'x': 2, 'y': 2, 's': 'world'},
      {'x': 3, 'y': 3, 's': 'hello'}
  ]

  raw_data_metadata = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec({
          's': tf.io.FixedLenFeature([], tf.string),
          'y': tf.io.FixedLenFeature([], tf.float32),
          'x': tf.io.FixedLenFeature([], tf.float32),
      }))

  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
        (raw_data, raw_data_metadata) | tft_beam.AnalyzeAndTransformDataset(
            preprocessing_fn))

  transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable

  pprint.pprint(transformed_data)
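This example is self-contained apart from its imports; the set it needs (a standard combination in tf.Transform examples) is:

import pprint
import tempfile

import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils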
Code Example #16
def run_hub2emb(args):
    '''Runs the embedding generation pipeline'''

    options = beam.options.pipeline_options.PipelineOptions(**args)
    args = namedtuple("options", args.keys())(*args.values())

    raw_metadata = create_metadata()
    converter = tft.coders.CsvCoder(column_names=['text'],
                                    schema=raw_metadata.schema)

    with beam.Pipeline(args.runner, options=options) as pipeline:
        with tft_beam.Context(args.temporary_dir):
            # Read the sentences from the input file
            sentences = (
                pipeline
                | 'Read sentences from files' >>
                beam.io.ReadFromText(file_pattern='corpus/text.txt')
                # | 'Convert to dictionary' >> beam.Map(converter.decode)
            )

            sentences_dataset = (sentences, raw_metadata)
            preprocess_fn = make_preprocess_fn(args.module_url,
                                               args.random_projection_matrix)
            # Generate the embeddings for the sentence using the TF-Hub module
            embeddings_dataset, _ = (
                sentences_dataset
                | 'Extract embeddings' >>
                tft_beam.AnalyzeAndTransformDataset(preprocess_fn))

            embeddings, transformed_metadata = embeddings_dataset
            # Write the embeddings to TFRecords files
            embeddings | 'Write embeddings to TFRecords' >> beam.io.tfrecordio.WriteToTFRecord(
                file_path_prefix='{}/emb'.format(args.output_dir),
                file_name_suffix='.tfrecords',
                coder=tft.coders.ExampleProtoCoder(
                    transformed_metadata.schema))
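create_metadata() and make_preprocess_fn() are project functions that are not shown. Since the CsvCoder above is built over a single 'text' column, create_metadata() presumably returns metadata for that one string feature; a sketch under that assumption:

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils

def create_metadata():
    # One string column named 'text', matching the CsvCoder above.
    return dataset_metadata.DatasetMetadata(
        schema_utils.schema_from_feature_spec({
            'text': tf.io.FixedLenFeature([], tf.string),
        }))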
Code Example #17
def transform_data(train_data_file, test_data_file, working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 indices by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
  """
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        # Since we are modifying some features and leaving others unchanged, we
        # start by setting `outputs` to a copy of `inputs`.
        outputs = inputs.copy()

        # Scale numeric columns to have range [0, 1].
        for key in NUMERIC_FEATURE_KEYS:
            outputs[key] = tft.scale_to_0_1(outputs[key])

        for key in OPTIONAL_NUMERIC_FEATURE_KEYS:
            # This is a SparseTensor because it is optional. Here we fill in a default
            # value when it is missing.
            dense = tf.compat.v1.sparse_to_dense(
                outputs[key].indices, [outputs[key].dense_shape[0], 1],
                outputs[key].values,
                default_value=0.)
            # Reshaping from a batch of vectors of size 1 to a batch of scalars.
            dense = tf.squeeze(dense, axis=1)
            outputs[key] = tft.scale_to_0_1(dense)

        # For all categorical columns except the label column, we generate a
        # vocabulary but do not modify the feature.  This vocabulary is instead
        # used in the trainer, by means of a feature column, to convert the feature
        # from a string to an integer id.
        for key in CATEGORICAL_FEATURE_KEYS:
            tft.vocabulary(inputs[key], vocab_filename=key)

        # For the label column we provide the mapping from string to index.
        table_keys = ['>50K', '<=50K']
        initializer = tf.lookup.KeyValueTensorInitializer(
            keys=table_keys,
            values=tf.cast(tf.range(len(table_keys)), tf.int64),
            key_dtype=tf.string,
            value_dtype=tf.int64)
        table = tf.lookup.StaticHashTable(initializer, default_value=-1)
        outputs[LABEL_KEY] = table.lookup(outputs[LABEL_KEY])

        return outputs

    # The "with" block will create a pipeline, and run that pipeline at the exit
    # of the block.
    with beam.Pipeline() as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            # Create a coder to read the census data with the schema.  To do this we
            # need to list all columns in order since the schema doesn't specify the
            # order of columns in the csv.
            ordered_columns = [
                'age', 'workclass', 'fnlwgt', 'education', 'education-num',
                'marital-status', 'occupation', 'relationship', 'race', 'sex',
                'capital-gain', 'capital-loss', 'hours-per-week',
                'native-country', 'label'
            ]
            converter = tft.coders.CsvCoder(ordered_columns,
                                            RAW_DATA_METADATA.schema)

            # Read in raw data and convert using CSV converter.  Note that we apply
            # some Beam transformations here, which will not be encoded in the TF
            # graph since we don't do them from within tf.Transform's methods
            # (AnalyzeDataset, TransformDataset etc.).  These transformations are just
            # to get data into a format that the CSV converter can read, in particular
            # removing spaces after commas.
            #
            # We use MapAndFilterErrors instead of Map to filter out decode errors in
            # converter.decode, which should only occur for the trailing blank line.
            raw_data = (
                pipeline
                | 'ReadTrainData' >> beam.io.ReadFromText(train_data_file)
                | 'FixCommasTrainData' >>
                beam.Map(lambda line: line.replace(', ', ','))
                | 'DecodeTrainData' >> MapAndFilterErrors(converter.decode))

            # Combine data and schema into a dataset tuple.  Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data.
            raw_dataset = (raw_data, RAW_DATA_METADATA)
            transformed_dataset, transform_fn = (
                raw_dataset
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            _ = (transformed_data
                 | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
                 | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir,
                                  TRANSFORMED_TRAIN_DATA_FILEBASE)))

            # Now apply transform function to test data.  In this case we remove the
            # trailing period at the end of each line, and also ignore the header line
            # that is present in the test data file.
            raw_test_data = (
                pipeline
                | 'ReadTestData' >> beam.io.ReadFromText(test_data_file,
                                                         skip_header_lines=1)
                | 'FixCommasTestData' >>
                beam.Map(lambda line: line.replace(', ', ','))
                | 'RemoveTrailingPeriodsTestData' >>
                beam.Map(lambda line: line[:-1])
                | 'DecodeTestData' >> MapAndFilterErrors(converter.decode))

            raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)

            transformed_test_dataset = ((raw_test_dataset, transform_fn)
                                        | tft_beam.TransformDataset())
            # Don't need transformed data schema, it's the same as before.
            transformed_test_data, _ = transformed_test_dataset

            _ = (
                transformed_test_data
                | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
                | 'WriteTestData' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

            # Will write a SavedModel and metadata to working_dir, which can then
            # be read by the tft.TFTransformOutput class.
            _ = (transform_fn
                 |
                 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
Code Example #18
    'add': tf.FixedLenFeature([], tf.int64),  # boolean flag stored as int64 (tf.Example has no bool type)
    'line_length': tf.FixedLenFeature([], tf.int64)})

input_data_metadata = dataset_metadata.DatasetMetadata(input_data_schema)

with beam.Pipeline() as pipeline:
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        columns = text_fields + ["commented", "add", "line_length"]
        converter = tft.coders.CsvCoder(columns, input_data_schema)
        input_data = (
            pipeline
            | 'ReadInputData' >> beam.io.ReadFromText(train_data_file)
            | 'CleanInputData' >> MapAndFilterErrors(converter.decode))
        input_dataset = (input_data, input_data_metadata)
        transformed_dataset, transform_fn = (
            input_dataset | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
        transformed_data, transformed_metadata = transformed_dataset
        transformed_data_coder = tft.coders.ExampleProtoCoder(
          transformed_metadata.schema)

        # Write the resulting data out
        _ = (
          transformed_data
          | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTrainData' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))
        # We'll use the transform function later too
        _ = (
            transform_fn
            | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
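The top of this snippet is cut off: the imports, text_fields, preprocessing_fn, train_data_file, working_dir, MapAndFilterErrors, the *_FILEBASE constants and the opening of the input_data_schema expression are missing. A hypothetical opening consistent with the surviving tail (using the current schema_utils API; the original may have used an older one):

import os

import apache_beam as beam
import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils

text_fields = ['message']  # placeholder list of text columns
input_data_schema = schema_utils.schema_from_feature_spec({
    **{name: tf.io.FixedLenFeature([], tf.string) for name in text_fields},
    'commented': tf.io.FixedLenFeature([], tf.int64),
    'add': tf.io.FixedLenFeature([], tf.int64),
    'line_length': tf.io.FixedLenFeature([], tf.int64),
})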
Code Example #19
def generate_skipgram_beam(
    data_uri,
    feature_names,
    vocabulary_size=10,
    window_size=2,
    negative_samples=0.0,
    seed=None,
    temp_dir="/tmp",
    save_path="temp",
    beam_pipeline_args=None,
):
    """
    Generate Skipgrams with an Apache Beam pipeline.

    Parameters
    ----------
    data_uri : list(str)
        List of TFRecords that contains the tensors.
    feature_names : list(str), optional
        List of feature names, whose length must match the
        number of columns of features in the TFRecord.
        This helps determine the number of columns in the
        TFRecords.
    vocabulary_size : int, optional
        Size of skipgram vocabulary, by default 10
    window_size : int, optional
        Window size of skipgram, by default 2
    negative_samples : float, optional
        Fraction of negative samples of skipgram, by default 0.0
    seed : int, optional
        Random seed, by default None
    temp_dir : str, optional
        Directory to save temporary results used by the Beam
        pipeline, by default "/tmp"
    save_path : str, optional
        Output path name (without the .tfrecord extension),
        by default "temp"
    beam_pipeline_args: dict, optional.
        Pipeline options of Beam runner.
        The default is None.

    Returns
    -------
    saved_results: list(str)
        List of URIs / path to the TFRecord files.
    num_rows_saved: int
        Number of rows of the samples saved.
    """
    def parse_tensor_f(x):
        xp = tf.io.parse_tensor(x, tf.int64)  # dtype must match the int64 feature spec below
        xp.set_shape([None])
        return {fname: xp[i] for i, fname in enumerate(feature_names)}

    raw_data = tf.data.TFRecordDataset(data_uri).map(
        parse_tensor_f).as_numpy_iterator()
    raw_data_schema = dataset_metadata.DatasetMetadata(
        schema_utils.schema_from_feature_spec({
            fname: tf.io.FixedLenFeature([], tf.int64)
            for fname in feature_names
        }))
    dataset = (raw_data, raw_data_schema)

    # Make the preprocessing_fn
    preprocessing_fn = make_preproc_func(vocabulary_size, window_size,
                                         negative_samples, feature_names, seed)

    # Run the beam pipeline
    pipeline_options = (
        beam.options.pipeline_options.PipelineOptions.from_dictionary(
            beam_pipeline_args) if beam_pipeline_args is not None else None
    )  # None = DirectRunner, local mode

    with beam.Pipeline(
            options=pipeline_options) as Pipeline:  # options=pipeline_options
        with tft_beam.Context(temp_dir=temp_dir):
            # pylint: disable=unused-variable
            (
                transformed_dataset,
                transform_fn,
            ) = dataset | "Make Skipgrams " >> tft_beam.AnalyzeAndTransformDataset(
                preprocessing_fn)

            # pylint: disable=unused-variable
            transformed_data, transformed_metadata = transformed_dataset
            saved_results = (
                transformed_data
                | "Write to TFRecord" >> beam.io.tfrecordio.WriteToTFRecord(
                    file_path_prefix=save_path,
                    file_name_suffix=".tfrecords",
                    coder=tft.coders.example_proto_coder.ExampleProtoCoder(
                        transformed_metadata.schema),
                ))
            # print('\nRaw data:\n{}\n'.format(pprint.pformat(dataset)))
            # print('Transformed data:\n{}'.format(pprint.pformat(transformed_data)))
            # Return the list of paths of tfrecords
            num_rows_saved = len(transformed_data)

    return saved_results, num_rows_saved
Code Example #20
def calculate_metrics():
    """Returns a pipeline to compute wordpiece model stats given a vocab and corpus."""

    # Schema of input dataset.
    raw_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            'text':
            tf.FixedLenFeature([], tf.string),
            'language_code':
            tf.FixedLenFeature([], tf.string),
        }))

    # Schema to format metrics as CSV.
    csv_schema = dataset_schema.from_feature_spec({
        'lang':
        tf.FixedLenFeature([], tf.string),
        'sample_count':
        tf.FixedLenFeature([], tf.int64),
        'micro_drop_char_percent':
        tf.FixedLenFeature([], tf.string),
        'macro_drop_char_percent':
        tf.FixedLenFeature([], tf.string),
        'micro_compress_ratio':
        tf.FixedLenFeature([], tf.string),
        'macro_compress_ratio':
        tf.FixedLenFeature([], tf.string),
        'unweighted_en_wp_overlap_percent':
        tf.FixedLenFeature([], tf.string),
        'weighted_en_wp_overlap_percent':
        tf.FixedLenFeature([], tf.string),
    })

    columns = [
        'lang', 'sample_count', 'micro_drop_char_percent',
        'macro_drop_char_percent', 'micro_compress_ratio',
        'macro_compress_ratio', 'unweighted_en_wp_overlap_percent',
        'weighted_en_wp_overlap_percent'
    ]

    # Create pipeline.
    pipeline = beam.Pipeline()

    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        example_converter = tft.coders.ExampleProtoCoder(raw_metadata.schema,
                                                         serialized=False)
        csv_converter = tft.coders.CsvCoder(columns, csv_schema)

        # Read raw data and convert to TF Transform encoded dict.
        raw_data = (pipeline
                    | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
                        FLAGS.input_file,
                        coder=beam.coders.ProtoCoder(tf.train.Example))
                    | 'DecodeInputData' >> beam.Map(example_converter.decode))

        # Apply transform to wordpiece-tokenize input.
        (transformed_data, _), _ = (
            (raw_data, raw_metadata)
            | 'WordpieceTokenizeInput' >> tft_beam.AnalyzeAndTransformDataset(
                utils.metrics_preprocessing_fn(
                    FLAGS.vocab_file, FLAGS.text_key, FLAGS.language_code_key))
        )

        # Aggregate values for each lang, calculate metrics, and write to output.
        _ = (transformed_data
             |
             'CompileTokenInfo' >> beam.ParDo(utils.CompileTokenizationInfo())
             | 'CombineStatsForLang' >> beam.CombineGlobally(
                 utils.AggregateLang())
             | 'CalculateMetrics' >> beam.ParDo(utils.CalculateMetrics())
             | 'EncodeMetrics' >> beam.Map(csv_converter.encode)
             | 'WriteMetrics' >> beam.io.WriteToText(FLAGS.output_file,
                                                     shard_name_template='',
                                                     header=','.join(columns)))

    return pipeline
Code Example #21
def transform_data(train_data_file, test_data_file, working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 indices by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
  """

    # The "with" block will create a pipeline, and run that pipeline at the exit
    # of the block.
    with apache_beam.Pipeline() as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            # Create a coder to read the census data with the schema.  To do this we
            # need to list all columns in order since the schema doesn't specify the
            # order of columns in the csv.
            ordered_columns = [
                'age', 'workclass', 'fnlwgt', 'education', 'education-num',
                'marital-status', 'occupation', 'relationship', 'race', 'sex',
                'capital-gain', 'capital-loss', 'hours-per-week',
                'native-country', 'label'
            ]
            converter = tft.coders.CsvCoder(ordered_columns,
                                            RAW_DATA_METADATA.schema)

            # Read in raw data and convert using CSV converter.  Note that we apply
            # some Beam transformations here, which will not be encoded in the TF
            # graph since we don't do them from within tf.Transform's methods
            # (AnalyzeDataset, TransformDataset etc.).  These transformations are just
            # to get data into a format that the CSV converter can read, in particular
            # removing spaces after commas.
            #
            # We use MapAndFilterErrors instead of Map to filter out decode errors in
            # converter.decode, which should only occur for the trailing blank line.
            raw_data = (
                pipeline
                |
                'ReadTrainData' >> apache_beam.io.ReadFromText(train_data_file)
                | 'FixCommasTrainData' >>
                apache_beam.Map(lambda line: line.replace(', ', ','))
                | 'DecodeTrainData' >> MapAndFilterErrors(converter.decode))

            # Combine data and schema into a dataset tuple.  Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data.
            raw_dataset = (raw_data, RAW_DATA_METADATA)
            transformed_dataset, transform_fn = (
                raw_dataset
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset
            # A coder between TF Examples and tf.Transform datasets.
            # Used to encode a tf.transform encoded dict as tf.Example.
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            _ = (transformed_data
                 | 'EncodeTrainData' >> apache_beam.Map(
                     transformed_data_coder.encode)
                 | 'WriteTrainData' >> apache_beam.io.WriteToTFRecord(
                     os.path.join(working_dir,
                                  TRANSFORMED_TRAIN_DATA_FILEBASE)))

            # Now apply transform function to test data.  In this case we remove the
            # trailing period at the end of each line, and also ignore the header line
            # that is present in the test data file.
            raw_test_data = (
                pipeline
                | 'ReadTestData' >> apache_beam.io.ReadFromText(
                    test_data_file, skip_header_lines=1)
                | 'FixCommasTestData' >>
                apache_beam.Map(lambda line: line.replace(', ', ','))
                | 'RemoveTrailingPeriodsTestData' >>
                apache_beam.Map(lambda line: line[:-1])
                | 'DecodeTestData' >> MapAndFilterErrors(converter.decode))

            raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)

            transformed_test_dataset = ((raw_test_dataset, transform_fn)
                                        | tft_beam.TransformDataset())
            # Don't need transformed data schema, it's the same as before.
            transformed_test_data, _ = transformed_test_dataset

            _ = (
                transformed_test_data
                | 'EncodeTestData' >> apache_beam.Map(
                    transformed_data_coder.encode)
                | 'WriteTestData' >> apache_beam.io.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

            # Will write a SavedModel and metadata to working_dir, which can then
            # be read by the tft.TFTransformOutput class.
            _ = (transform_fn
                 |
                 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
Code Example #22
    def test_preprocessing_fn(self):
        schema_file = os.path.join(self._testdata_path,
                                   'schema_gen/schema.pbtxt')
        schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
        feature_spec = taxi_utils._get_raw_feature_spec(schema)
        working_dir = self.get_temp_dir()
        transform_output_path = os.path.join(working_dir, 'transform_output')
        transformed_examples_path = os.path.join(working_dir,
                                                 'transformed_examples')

        # Run very simplified version of executor logic.
        # TODO(kestert): Replace with tft_unit.assertAnalyzeAndTransformResults.
        # Generate legacy `DatasetMetadata` object.  Future version of Transform
        # will accept the `Schema` proto directly.
        legacy_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec(feature_spec))
        decoder = tft.coders.ExampleProtoCoder(legacy_metadata.schema)
        with beam.Pipeline() as p:
            with tft_beam.Context(temp_dir=os.path.join(working_dir, 'tmp')):
                examples = (
                    p
                    | 'ReadTrainData' >> beam.io.ReadFromTFRecord(
                        os.path.join(self._testdata_path,
                                     'csv_example_gen/train/*'),
                        coder=beam.coders.BytesCoder(),
                        # TODO(b/114938612): Eventually remove this override.
                        validate=False)
                    | 'DecodeTrainData' >> beam.Map(decoder.decode))
                (transformed_examples, transformed_metadata), transform_fn = (
                    (examples, legacy_metadata)
                    | 'AnalyzeAndTransform' >>
                    tft_beam.AnalyzeAndTransformDataset(
                        taxi_utils.preprocessing_fn))

                # WriteTransformFn writes transform_fn and metadata to subdirectories
                # tensorflow_transform.SAVED_MODEL_DIR and
                # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
                # pylint: disable=expression-not-assigned
                (transform_fn
                 | 'WriteTransformFn' >>
                 tft_beam.WriteTransformFn(transform_output_path))

                encoder = tft.coders.ExampleProtoCoder(
                    transformed_metadata.schema)
                (transformed_examples
                 | 'EncodeTrainData' >> beam.Map(encoder.encode)
                 | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                     os.path.join(transformed_examples_path,
                                  'train/transformed_examples.gz'),
                     coder=beam.coders.BytesCoder()))
                # pylint: enable=expression-not-assigned

        # Verify the output matches golden output.
        # NOTE: we don't verify that transformed examples match golden output.
        expected_transformed_schema = io_utils.parse_pbtxt_file(
            os.path.join(
                self._testdata_path,
                'transform/transform_output/transformed_metadata/schema.pbtxt'
            ), schema_pb2.Schema())
        transformed_schema = io_utils.parse_pbtxt_file(
            os.path.join(transform_output_path,
                         'transformed_metadata/schema.pbtxt'),
            schema_pb2.Schema())
        # Clear annotations so we only have to test main schema.
        for feature in transformed_schema.feature:
            feature.ClearField('annotation')
        self.assertEqual(transformed_schema, expected_transformed_schema)
Code Example #23
def transform_data(working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 values indices.

  Args:
    working_dir: Directory to read shuffled data from and write transformed data
        and metadata to.
  """

    with beam.Pipeline() as pipeline:
        with tft_beam.Context(
                temp_dir=os.path.join(working_dir, TRANSFORM_TEMP_DIR)):
            tfxio_train_data = tfxio.TFExampleRecord(file_pattern=os.path.join(
                working_dir, SHUFFLED_TRAIN_DATA_FILEBASE + '*'),
                                                     schema=SCHEMA)
            train_data = (pipeline |
                          'TFXIORead[Train]' >> tfxio_train_data.BeamSource())

            tfxio_test_data = tfxio.TFExampleRecord(file_pattern=os.path.join(
                working_dir, SHUFFLED_TEST_DATA_FILEBASE + '*'),
                                                    schema=SCHEMA)
            test_data = (pipeline
                         | 'TFXIORead[Test]' >> tfxio_test_data.BeamSource())

            def preprocessing_fn(inputs):
                """Preprocess input columns into transformed columns."""
                review = inputs[REVIEW_KEY]

                # Here tf.compat.v1.string_split behaves differently from
                # tf.strings.split.
                review_tokens = tf.compat.v1.string_split(review, DELIMITERS)
                review_indices = tft.compute_and_apply_vocabulary(
                    review_tokens, top_k=VOCAB_SIZE)
                # Add one for the oov bucket created by compute_and_apply_vocabulary.
                review_bow_indices, review_weight = tft.tfidf(
                    review_indices, VOCAB_SIZE + 1)
                return {
                    REVIEW_KEY: review_bow_indices,
                    REVIEW_WEIGHT_KEY: review_weight,
                    LABEL_KEY: inputs[LABEL_KEY]
                }

            # Transformed metadata is not necessary for encoding.
            # The TFXIO output format is chosen for improved performance.
            (transformed_train_data, _), transform_fn = (
                (train_data, tfxio_train_data.TensorAdapterConfig())
                | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
                    preprocessing_fn, output_record_batches=True))

            transformed_test_data, _ = (
                ((test_data, tfxio_test_data.TensorAdapterConfig()),
                 transform_fn)
                | 'Transform' >>
                tft_beam.TransformDataset(output_record_batches=True))

            # Extract transformed RecordBatches, encode and write them to the given
            # directory.
            coder = tfxio.RecordBatchToExamplesEncoder()
            _ = (transformed_train_data
                 | 'EncodeTrainData' >>
                 beam.FlatMapTuple(lambda batch, _: coder.encode(batch))
                 | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir,
                                  TRANSFORMED_TRAIN_DATA_FILEBASE)))

            _ = (
                transformed_test_data
                | 'EncodeTestData' >>
                beam.FlatMapTuple(lambda batch, _: coder.encode(batch))
                | 'WriteTestData' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

            # Will write a SavedModel and metadata to two subdirectories of
            # working_dir, given by tft.TRANSFORM_FN_DIR and
            # tft.TRANSFORMED_METADATA_DIR respectively.
            _ = (transform_fn
                 |
                 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
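The tfxio module used here (TFExampleRecord, RecordBatchToExamplesEncoder) typically comes from tfx_bsl in the upstream tf.Transform examples, and SCHEMA, DELIMITERS, VOCAB_SIZE, the feature keys and the *_FILEBASE constants are module-level. The import this function most likely relies on (an assumption, since the header is not shown):

from tfx_bsl.public import tfxio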
Code Example #24
def transform_data(data):
    """
    :param data: A list of raw data.
    :return: A numpy array of arrays of integers.
    """
    with tft_beam.Context(temp_dir="temp/"):
        raw_data_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec({
                # early_slack_count, midday_slack_count and late_slack_count are counts
                # of Slack messages sent at different times of the day.
                'early_slack_count':
                tensorflow.FixedLenFeature([], tensorflow.int64),
                'midday_slack_count':
                tensorflow.FixedLenFeature([], tensorflow.int64),
                'late_slack_count':
                tensorflow.FixedLenFeature([], tensorflow.int64),
                # negative_emoji, positive_emoji and neutral_emoji capture the sentiment
                # of the emojis sent.
                'negative_emoji':
                tensorflow.FixedLenFeature([], tensorflow.int64),
                'positive_emoji':
                tensorflow.FixedLenFeature([], tensorflow.int64),
                'neutral_emoji':
                tensorflow.FixedLenFeature([], tensorflow.int64),
                # Github count
                'github_count':
                tensorflow.FixedLenFeature([], tensorflow.int64),
                # weekday
                'weekday':
                tensorflow.FixedLenFeature([], tensorflow.int64),
                'event_rating_ratio':
                tensorflow.FixedLenFeature([], tensorflow.int64),
                'temperature':
                tensorflow.FixedLenFeature([], tensorflow.int64),
                'precipitation':
                tensorflow.FixedLenFeature([], tensorflow.int64),
                'slack_negative_ratio':
                tensorflow.FixedLenFeature([], tensorflow.int64),
            }))

        transformed_dataset, transform_fn = (
            (data, raw_data_metadata)
            | tft_beam.AnalyzeAndTransformDataset(preprocess))
        transformed_data, transformed_metadata = transformed_dataset

    # TODO: There should be an easier way to do this.
    retransformed_data = []
    for trans in transformed_data:
        current = [
            trans["early_slack_count_normalized"],
            trans["midday_slack_count_normalized"],
            trans["late_slack_count_normalized"],
            trans["negative_emoji_normalized"],
            trans["neutral_emoji_normalized"],
            trans["positive_emoji_normalized"],
            trans["github_count_normalized"], trans["weekday"],
            trans["event_rating_normalized"], trans["temperature_normalized"],
            trans["precipitation_normalized"],
            trans["slack_negative_normalized"]
        ]

        retransformed_data.append(current)

    return array(retransformed_data)
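The `preprocess` function handed to AnalyzeAndTransformDataset is not included in the snippet. A minimal sketch of what it presumably does, assuming each count feature is simply scaled to [0, 1] with tft.scale_to_0_1 and weekday is passed through unchanged; the exact transforms are assumptions inferred from the "*_normalized" keys read back above.

import tensorflow as tf
import tensorflow_transform as tft


def preprocess(inputs):
    # Hypothetical preprocessing_fn producing the keys consumed above.
    outputs = {'weekday': inputs['weekday']}
    renamed = {
        'early_slack_count': 'early_slack_count_normalized',
        'midday_slack_count': 'midday_slack_count_normalized',
        'late_slack_count': 'late_slack_count_normalized',
        'negative_emoji': 'negative_emoji_normalized',
        'neutral_emoji': 'neutral_emoji_normalized',
        'positive_emoji': 'positive_emoji_normalized',
        'github_count': 'github_count_normalized',
        'event_rating_ratio': 'event_rating_normalized',
        'temperature': 'temperature_normalized',
        'precipitation': 'precipitation_normalized',
        'slack_negative_ratio': 'slack_negative_normalized',
    }
    for raw_key, out_key in renamed.items():
        # Scale each raw int64 feature to the [0, 1] range.
        outputs[out_key] = tft.scale_to_0_1(
            tf.cast(inputs[raw_key], tf.float32))
    return outputs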
Code Example #25
0
def transform_data(train_data_file, test_data_file, working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 value indices, by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
  """
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        # Since we are modifying some features and leaving others unchanged, we
        # start by setting `outputs` to a copy of `inputs`.
        outputs = inputs.copy()

        # Scale numeric columns to have range [0, 1].
        for key in NUMERIC_FEATURE_KEYS:
            outputs[key] = tft.scale_to_0_1(inputs[key])

        for key in OPTIONAL_NUMERIC_FEATURE_KEYS:
            # This is a SparseTensor because it is optional. Here we fill in a default
            # value when it is missing.
            sparse = tf.sparse.SparseTensor(inputs[key].indices,
                                            inputs[key].values,
                                            [inputs[key].dense_shape[0], 1])
            dense = tf.sparse.to_dense(sp_input=sparse, default_value=0.)
            # Reshape from a batch of vectors of size 1 to a batch of scalars.
            dense = tf.squeeze(dense, axis=1)
            outputs[key] = tft.scale_to_0_1(dense)

        # For all categorical columns except the label column, we generate a
        # vocabulary but do not modify the feature.  This vocabulary is instead
        # used in the trainer, by means of a feature column, to convert the feature
        # from a string to an integer id.
        for key in CATEGORICAL_FEATURE_KEYS:
            outputs[key] = tft.compute_and_apply_vocabulary(tf.strings.strip(
                inputs[key]),
                                                            num_oov_buckets=1,
                                                            vocab_filename=key)

        # For the label column we provide the mapping from string to index.
        table_keys = ['>50K', '<=50K']
        initializer = tf.lookup.KeyValueTensorInitializer(
            keys=table_keys,
            values=tf.cast(tf.range(len(table_keys)), tf.int64),
            key_dtype=tf.string,
            value_dtype=tf.int64)
        table = tf.lookup.StaticHashTable(initializer, default_value=-1)
        # Remove trailing periods from the test data labels (present when the
        # data is read with tf.data).
        label_str = tf.strings.regex_replace(inputs[LABEL_KEY], r'\.', '')
        label_str = tf.strings.strip(label_str)
        data_labels = table.lookup(label_str)
        transformed_label = tf.one_hot(indices=data_labels,
                                       depth=len(table_keys),
                                       on_value=1.0,
                                       off_value=0.0)
        outputs[LABEL_KEY] = tf.reshape(transformed_label,
                                        [-1, len(table_keys)])

        return outputs

    # The "with" block will create a pipeline, and run that pipeline at the exit
    # of the block.
    with beam.Pipeline() as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            # Create a TFXIO to read the census data with the schema. To do this we
            # need to list all columns in order since the schema doesn't specify the
            # order of columns in the csv.
            # We first read CSV files and use BeamRecordCsvTFXIO whose .BeamSource()
            # accepts a PCollection[bytes] because we need to patch the records first
            # (see "FixCommasTrainData" below). Otherwise, tfxio.CsvTFXIO can be used
            # to both read the CSV files and parse them to TFT inputs:
            # csv_tfxio = tfxio.CsvTFXIO(...)
            # raw_data = (pipeline | 'ToRecordBatches' >> csv_tfxio.BeamSource())
            csv_tfxio = tfxio.BeamRecordCsvTFXIO(
                physical_format='text',
                column_names=ORDERED_CSV_COLUMNS,
                schema=SCHEMA)

            # Read in raw data and convert using CSV TFXIO.  Note that we apply
            # some Beam transformations here, which will not be encoded in the TF
            # graph since we don't do them from within tf.Transform's methods
            # (AnalyzeDataset, TransformDataset etc.).  These transformations are just
            # to get data into a format that the CSV TFXIO can read, in particular
            # removing spaces after commas.
            raw_data = (pipeline
                        | 'ReadTrainData' >> beam.io.ReadFromText(
                            train_data_file, coder=beam.coders.BytesCoder())
                        | 'FixCommasTrainData' >>
                        beam.Map(lambda line: line.replace(b', ', b','))
                        | 'DecodeTrainData' >> csv_tfxio.BeamSource())

            # Combine data and schema into a dataset tuple.  Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data.
            raw_dataset = (raw_data, csv_tfxio.TensorAdapterConfig())
            transformed_dataset, transform_fn = (
                raw_dataset
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            _ = (transformed_data
                 | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
                 | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir,
                                  TRANSFORMED_TRAIN_DATA_FILEBASE)))

            # Now apply transform function to test data.  In this case we remove the
            # trailing period at the end of each line, and also ignore the header line
            # that is present in the test data file.
            raw_test_data = (pipeline
                             | 'ReadTestData' >> beam.io.ReadFromText(
                                 test_data_file,
                                 skip_header_lines=1,
                                 coder=beam.coders.BytesCoder())
                             | 'FixCommasTestData' >>
                             beam.Map(lambda line: line.replace(b', ', b','))
                             | 'RemoveTrailingPeriodsTestData' >>
                             beam.Map(lambda line: line[:-1])
                             | 'DecodeTestData' >> csv_tfxio.BeamSource())

            raw_test_dataset = (raw_test_data, csv_tfxio.TensorAdapterConfig())

            transformed_test_dataset = ((raw_test_dataset, transform_fn)
                                        | tft_beam.TransformDataset())
            # Don't need transformed data schema, it's the same as before.
            transformed_test_data, _ = transformed_test_dataset

            _ = (
                transformed_test_data
                | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
                | 'WriteTestData' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

            # Will write a SavedModel and metadata to working_dir, which can then
            # be read by the tft.TFTransformOutput class.
            _ = (transform_fn
                 |
                 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
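The module-level constants this example relies on (ORDERED_CSV_COLUMNS, NUMERIC_FEATURE_KEYS, OPTIONAL_NUMERIC_FEATURE_KEYS, CATEGORICAL_FEATURE_KEYS, LABEL_KEY, SCHEMA) are not part of the snippet. A hedged sketch of typical definitions for the UCI Census (Adult) data follows; the exact column lists and dtypes are assumptions, not taken from this code.

import tensorflow as tf
from tensorflow_transform.tf_metadata import schema_utils

# Assumed column order of the Census (Adult) CSV files.
ORDERED_CSV_COLUMNS = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'label'
]
CATEGORICAL_FEATURE_KEYS = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country'
]
NUMERIC_FEATURE_KEYS = [
    'age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week'
]
OPTIONAL_NUMERIC_FEATURE_KEYS = ['education-num']
LABEL_KEY = 'label'

RAW_DATA_FEATURE_SPEC = dict(
    [(name, tf.io.FixedLenFeature([], tf.string))
     for name in CATEGORICAL_FEATURE_KEYS] +
    [(name, tf.io.FixedLenFeature([], tf.float32))
     for name in NUMERIC_FEATURE_KEYS] +
    [(name, tf.io.VarLenFeature(tf.float32))
     for name in OPTIONAL_NUMERIC_FEATURE_KEYS] +
    [(LABEL_KEY, tf.io.FixedLenFeature([], tf.string))])

# Schema handed to BeamRecordCsvTFXIO above.
SCHEMA = schema_utils.schema_from_feature_spec(RAW_DATA_FEATURE_SPEC)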
Code Example #26
0
def transform_data(working_dir):
    """Read raw train/test TFRecords from working_dir, transform them with
    tf.Transform, and write the transformed Example protos back to working_dir.
    """

    with beam.Pipeline() as pipeline:
        with tft_beam.Context(
                temp_dir=os.path.join(working_dir, TRANSFORM_TEMP_DIR)):
            train_coder = tft.coders.ExampleProtoCoder(
                TRAIN_RAW_DATA_METADATA.schema)
            test_coder = tft.coders.ExampleProtoCoder(
                TEST_RAW_DATA_METADATA.schema)

            train_data = (pipeline
                          | 'Read Train' >> beam.io.ReadFromTFRecord(
                              os.path.join(working_dir,
                                           TFRECORD_TRAIN_DATA_FILEBASE + '*'))
                          | 'Decode Train' >> beam.Map(train_coder.decode))

            test_data = (pipeline
                         | 'Read Test' >> beam.io.ReadFromTFRecord(
                             os.path.join(working_dir,
                                          TFRECORD_TEST_DATA_FILEBASE + '*'))
                         | 'Decode Test' >> beam.Map(test_coder.decode))

            def preprocessing_fn_train(inputs):
                """Preprocess input columns into transformed columns."""
                context = inputs['Context']
                utterance = inputs['Utterance']
                vocab = tf.concat([context, utterance], 0)

                context_tokens = tf.compat.v1.string_split(context, DELIMITERS)
                utterance_tokens = tf.compat.v1.string_split(
                    utterance, DELIMITERS)
                vocab_tokens = tf.compat.v1.string_split(vocab, DELIMITERS)

                vocab_mapping_file_path = tft.vocabulary(
                    vocab_tokens, vocab_filename='anantvir_train_vocab')

                mapped_context = tft.apply_vocabulary(
                    context_tokens,
                    deferred_vocab_filename_tensor=vocab_mapping_file_path)
                print(mapped_context)

                mapped_utterance = tft.apply_vocabulary(
                    utterance_tokens,
                    deferred_vocab_filename_tensor=vocab_mapping_file_path)

                return {
                    'Context': mapped_context,
                    'Utterance': mapped_utterance,
                }

            def preprocessing_fn_test(inputs):
                """Preprocess input columns into transformed columns."""
                context = inputs['Context']
                ground_truth_utterance = inputs['Ground Truth Utterance']
                distractor_0 = inputs['Distractor_0']
                distractor_1 = inputs['Distractor_1']
                distractor_2 = inputs['Distractor_2']
                distractor_3 = inputs['Distractor_3']
                distractor_4 = inputs['Distractor_4']
                distractor_5 = inputs['Distractor_5']
                distractor_6 = inputs['Distractor_6']
                distractor_7 = inputs['Distractor_7']
                distractor_8 = inputs['Distractor_8']
                vocab = tf.concat([
                    context, ground_truth_utterance, distractor_0,
                    distractor_1, distractor_2, distractor_3, distractor_4,
                    distractor_5, distractor_6, distractor_7, distractor_8
                ], 0)

                context_tokens = tf.compat.v1.string_split(context, DELIMITERS)
                ground_truth_utterance_tokens = tf.compat.v1.string_split(
                    ground_truth_utterance, DELIMITERS)
                distractor_0_tokens = tf.compat.v1.string_split(
                    distractor_0, DELIMITERS)
                distractor_1_tokens = tf.compat.v1.string_split(
                    distractor_1, DELIMITERS)
                distractor_2_tokens = tf.compat.v1.string_split(
                    distractor_2, DELIMITERS)
                distractor_3_tokens = tf.compat.v1.string_split(
                    distractor_3, DELIMITERS)
                distractor_4_tokens = tf.compat.v1.string_split(
                    distractor_4, DELIMITERS)
                distractor_5_tokens = tf.compat.v1.string_split(
                    distractor_5, DELIMITERS)
                distractor_6_tokens = tf.compat.v1.string_split(
                    distractor_6, DELIMITERS)
                distractor_7_tokens = tf.compat.v1.string_split(
                    distractor_7, DELIMITERS)
                distractor_8_tokens = tf.compat.v1.string_split(
                    distractor_8, DELIMITERS)

                vocab_tokens = tf.compat.v1.string_split(vocab, DELIMITERS)

                vocab_mapping_file_path = tft.vocabulary(
                    vocab_tokens, vocab_filename='anantvir_test_vocab')

                mapped_context = tft.apply_vocabulary(
                    context_tokens,
                    deferred_vocab_filename_tensor=vocab_mapping_file_path)
                mapped_ground_truth_utterance = tft.apply_vocabulary(
                    ground_truth_utterance_tokens,
                    deferred_vocab_filename_tensor=vocab_mapping_file_path)
                mapped_distractor_0 = tft.apply_vocabulary(
                    distractor_0_tokens,
                    deferred_vocab_filename_tensor=vocab_mapping_file_path)
                mapped_distractor_1 = tft.apply_vocabulary(
                    distractor_1_tokens,
                    deferred_vocab_filename_tensor=vocab_mapping_file_path)
                mapped_distractor_2 = tft.apply_vocabulary(
                    distractor_2_tokens,
                    deferred_vocab_filename_tensor=vocab_mapping_file_path)
                mapped_distractor_3 = tft.apply_vocabulary(
                    distractor_3_tokens,
                    deferred_vocab_filename_tensor=vocab_mapping_file_path)
                mapped_distractor_4 = tft.apply_vocabulary(
                    distractor_4_tokens,
                    deferred_vocab_filename_tensor=vocab_mapping_file_path)
                mapped_distractor_5 = tft.apply_vocabulary(
                    distractor_5_tokens,
                    deferred_vocab_filename_tensor=vocab_mapping_file_path)
                mapped_distractor_6 = tft.apply_vocabulary(
                    distractor_6_tokens,
                    deferred_vocab_filename_tensor=vocab_mapping_file_path)
                mapped_distractor_7 = tft.apply_vocabulary(
                    distractor_7_tokens,
                    deferred_vocab_filename_tensor=vocab_mapping_file_path)
                mapped_distractor_8 = tft.apply_vocabulary(
                    distractor_8_tokens,
                    deferred_vocab_filename_tensor=vocab_mapping_file_path)

                return {
                    'Context': mapped_context,
                    'Ground Truth Utterance': mapped_ground_truth_utterance,
                    'Distractor_0': mapped_distractor_0,
                    'Distractor_1': mapped_distractor_1,
                    'Distractor_2': mapped_distractor_2,
                    'Distractor_3': mapped_distractor_3,
                    'Distractor_4': mapped_distractor_4,
                    'Distractor_5': mapped_distractor_5,
                    'Distractor_6': mapped_distractor_6,
                    'Distractor_7': mapped_distractor_7,
                    'Distractor_8': mapped_distractor_8,
                }

            # train_transform_fn = (
            #     # data, metadata = dataset
            #     (train_data, TRAIN_RAW_DATA_METADATA)
            #     | 'Analyze' >> tft_beam.AnalyzeDataset(
            #         preprocessing_fn_train))

            (transformed_train_data,
             transformed_train_metadata), train_transform_fn = (
                 (train_data, TRAIN_RAW_DATA_METADATA)
                 | 'AnalyzeAndTransformTrain' >>
                 tft_beam.AnalyzeAndTransformDataset(preprocessing_fn_train))

            # https://stackoverflow.com/questions/46406419/collecting-output-from-apache-beam-pipeline-and-displaying-it-to-console

            def print_row(row):
                #raw_inputs = row['Context']
                #padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(raw_inputs,padding='post')
                print(row)

            _ = (transformed_train_data | 'print' >> beam.Map(print_row))

            transformed_train_data_coder = tft.coders.ExampleProtoCoder(
                transformed_train_metadata.schema)

            (transformed_test_data,
             transformed_test_metadata), test_transform_fn = (
                 (test_data, TEST_RAW_DATA_METADATA)
                 | 'AnalyzeAndTransformTest' >>
                 tft_beam.AnalyzeAndTransformDataset(preprocessing_fn_test))
            transformed_test_data_coder = tft.coders.ExampleProtoCoder(
                transformed_test_metadata.schema)

            _ = (transformed_train_data
                 | 'EncodeTrainData' >> beam.Map(
                     transformed_train_data_coder.encode)
                 | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir,
                                  TRANSFORMED_TRAIN_DATA_FILEBASE)))

            _ = (
                transformed_test_data
                | 'EncodeTestData' >> beam.Map(
                    transformed_test_data_coder.encode)
                | 'WriteTestData' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))
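TRAIN_RAW_DATA_METADATA and TEST_RAW_DATA_METADATA, referenced above, are not defined in the snippet. A minimal sketch, assuming every column is a scalar string feature; the feature lists are inferred from the keys used in the two preprocessing functions.

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils

TRAIN_RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec({
        'Context': tf.io.FixedLenFeature([], tf.string),
        'Utterance': tf.io.FixedLenFeature([], tf.string),
    }))

_TEST_FEATURE_SPEC = {
    'Context': tf.io.FixedLenFeature([], tf.string),
    'Ground Truth Utterance': tf.io.FixedLenFeature([], tf.string),
}
_TEST_FEATURE_SPEC.update({
    'Distractor_{}'.format(i): tf.io.FixedLenFeature([], tf.string)
    for i in range(9)})

TEST_RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(_TEST_FEATURE_SPEC))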
Code Example #27
0
def build_pipeline(df: pd.DataFrame, job_label: str, runner: str, project: str,
                   region: str, output_dir: str, compression: str,
                   num_shards: int, dataflow_options: dict,
                   integer_label: bool) -> beam.Pipeline:
    """Runs TFRecorder Beam Pipeline.

  Args:
    df: Pandas DataFrame
    job_label: User description for the beam job.
    runner: Beam Runner: (e.g. DataflowRunner, DirectRunner).
    project: GCP project ID (if DataflowRunner)
    region: GCP compute region (if DataflowRunner)
    output_dir: GCS or Local Path for output.
    compression: gzip or None.
    num_shards: Number of shards.
    dataflow_options: Dataflow Runner Options (optional)
    integer_label: Flags if label is already an integer.

  Returns:
    beam.Pipeline

  Note: These inputs must be validated upstream (by client.create_tfrecord())
  """

    job_name = _get_job_name(job_label)
    job_dir = _get_job_dir(output_dir, job_name)
    options = _get_pipeline_options(runner, job_name, job_dir, project, region,
                                    dataflow_options)

    #with beam.Pipeline(runner, options=options) as p:
    p = beam.Pipeline(options=options)
    with tft_beam.Context(temp_dir=os.path.join(job_dir, 'tft_tmp')):

        converter = tft.coders.CsvCoder(constants.IMAGE_CSV_COLUMNS,
                                        constants.IMAGE_CSV_METADATA.schema)

        extract_images_fn = beam_image.ExtractImagesDoFn(
            constants.IMAGE_URI_KEY)
        flatten_rows = ToCSVRows()

        # Each element in the image_csv_data PCollection will be a dict
        # including the image_csv_columns and the image features created from
        # extract_images_fn.
        image_csv_data = (
            p
            | 'ReadFromDataFrame' >> beam.Create(df.values.tolist())
            | 'ToCSVRows' >> beam.ParDo(flatten_rows)
            | 'DecodeCSV' >> beam.Map(converter.decode)
            | 'ReadImage' >> beam.ParDo(extract_images_fn))

        # Split dataset into train and validation.
        train_data, val_data, test_data, discard_data = (
            image_csv_data | 'SplitDataset' >> beam.Partition(
                _partition_fn, len(constants.SPLIT_VALUES)))

        train_dataset = (train_data, constants.RAW_METADATA)
        val_dataset = (val_data, constants.RAW_METADATA)
        test_dataset = (test_data, constants.RAW_METADATA)

        # TensorFlow Transform applied to all datasets.
        preprocessing_fn = functools.partial(_preprocessing_fn,
                                             integer_label=integer_label)
        transformed_train_dataset, transform_fn = (
            train_dataset
            | 'AnalyzeAndTransformTrain' >>
            tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

        transformed_train_data, transformed_metadata = transformed_train_dataset
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)

        transformed_val_data, _ = (
            (val_dataset, transform_fn)
            | 'TransformVal' >> tft_beam.TransformDataset())

        transformed_test_data, _ = (
            (test_dataset, transform_fn)
            | 'TransformTest' >> tft_beam.TransformDataset())

        # Sinks for TFRecords and metadata.
        tfr_writer = functools.partial(_get_write_to_tfrecord,
                                       output_dir=job_dir,
                                       compress=compression,
                                       num_shards=num_shards)

        _ = (transformed_train_data
             | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
             | 'WriteTrainData' >> tfr_writer(prefix='train'))

        _ = (transformed_val_data
             | 'EncodeValData' >> beam.Map(transformed_data_coder.encode)
             | 'WriteValData' >> tfr_writer(prefix='val'))

        _ = (transformed_test_data
             | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
             | 'WriteTestData' >> tfr_writer(prefix='test'))

        _ = (discard_data
             | 'DiscardDataWriter' >> beam.io.WriteToText(
                 os.path.join(job_dir, 'discarded-data')))

        # Output transform function and metadata
        _ = (transform_fn
             | 'WriteTransformFn' >> tft_beam.WriteTransformFn(job_dir))

        # Output metadata schema
        _ = (transformed_metadata
             | 'WriteMetadata' >> tft_beam.WriteMetadata(job_dir, pipeline=p))

    return p
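The `_partition_fn` passed to beam.Partition above is not shown. A hedged sketch of one plausible implementation, assuming each decoded row carries a 'split' column and that constants.SPLIT_VALUES lists the split names in partition order; both names and values here are assumptions for illustration.

SPLIT_VALUES = ['TRAIN', 'VALIDATION', 'TEST', 'DISCARD']  # assumed ordering


def _partition_fn(element, num_partitions):
    # Route each row to the partition matching its 'split' column; anything
    # unrecognized falls through to the last (discard) partition.
    split = element.get('split', 'DISCARD')
    if split in SPLIT_VALUES[:num_partitions]:
        return SPLIT_VALUES.index(split)
    return num_partitions - 1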
Code Example #28
0
def transform_data(train_data_file, test_data_file, working_dir):
    """Transform the data and write out as a TFRecord of Example protos.
    Read in the data using the parquet io, and transform it using a
    preprocessing pipeline that scales numeric data and converts categorical data
    from strings to int64 value indices, by creating a vocabulary for each
    category.
    Args:
      train_data_file: File containing training data
      test_data_file: File containing test data
      working_dir: Directory to write transformed data and metadata to
    """

    numerical_feats = [
        "startCountTotal", "purchaseCountTotal", "globalStartCountTotal",
        "globalPurchaseCountTotal"
    ]

    categorical_feats = ["country", "sourceGameId", "platform"]

    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        outputs = {}

        for key in numerical_feats:
            outputs[key] = tf.cast(tft.bucketize(inputs[key], 20),
                                   tf.float32) / 20.0 - 0.5

        outputs["campaignCost_mod"] = inputs["campaignCost"] / 100.0

        inputs["game_zone"] = tf.string_join(
            [inputs["sourceGameId"], inputs["zone"]], separator="_")
        inputs["game_campaignId"] = tf.string_join(
            [inputs["sourceGameId"], inputs["campaignId"]], separator="_")

        for key in categorical_feats + ["game_zone", "game_campaignId"]:
            vocab = tft.vocabulary(inputs[key],
                                   vocab_filename=key,
                                   frequency_threshold=100)
            outputs[key] = tft.apply_vocabulary(inputs[key],
                                                vocab,
                                                default_value=0)

        outputs["label"] = inputs["label"]
        outputs["key"] = inputs["key"]

        return outputs

    # Input schema definition
    RAW_DATA_METADATA = gather_raw_metadata(
        numerical_feats + ["campaignCost"],
        categorical_feats + ["zone", "campaignId", "key"])

    # pipeline args to read from gcs, currently unused because reading local file
    pipeline_args = [
        '--runner=DirectRunner',
        '--project=unity-ads-ds-prd',
        #     '--staging_location=gs://unity-ads-ds-prd-users/villew/promo/staging',
        #     '--temp_location=gs://unity-ads-ds-prd-users/villew/promo/temp',
        '--job_name=transform-promo-data-to-tf-records'
    ]
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # create a beam pipeline
    with beam.Pipeline(options=pipeline_options) as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            raw_data = (
                pipeline
                | 'ReadTrainData' >> beam.io.ReadFromParquet(train_data_file))

            # Combine data and schema into a dataset tuple.
            raw_dataset = (raw_data, RAW_DATA_METADATA)
            transformed_dataset, transform_fn = (
                raw_dataset
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            # write to tf record
            _ = (transformed_data
                 | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
                 | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir, "train_tfrecord")))

            # Now apply transform function to test data.
            raw_test_data = (
                pipeline
                | 'ReadTestData' >> beam.io.ReadFromParquet(test_data_file))

            raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)

            transformed_test_dataset = ((raw_test_dataset, transform_fn)
                                        | tft_beam.TransformDataset())

            # Don't need transformed data schema, it's the same as before.
            transformed_test_data, _ = transformed_test_dataset

            _ = (transformed_test_data
                 | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
                 | 'WriteTestData' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir, "test_tfrecord")))

            # Will write a SavedModel and metadata to working_dir, which can then
            # be read by the tft.TFTransformOutput class.
            _ = (transform_fn
                 |
                 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
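gather_raw_metadata is called above but not defined in the snippet. A minimal sketch, assuming numeric features are float scalars, categorical features are string scalars, and a float 'label' column is added because preprocessing_fn passes it through; the dtypes are assumptions.

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils


def gather_raw_metadata(numerical_feats, categorical_feats):
    # Build a raw-data schema from the two feature-name lists.
    feature_spec = {key: tf.io.FixedLenFeature([], tf.float32)
                    for key in numerical_feats}
    feature_spec.update({key: tf.io.FixedLenFeature([], tf.string)
                         for key in categorical_feats})
    feature_spec['label'] = tf.io.FixedLenFeature([], tf.float32)  # assumed dtype
    return dataset_metadata.DatasetMetadata(
        schema_utils.schema_from_feature_spec(feature_spec))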
Code Example #29
0
def transform_data(working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 value indices.

  Args:
    working_dir: Directory to read shuffled data from and write transformed data
        and metadata to.
  """

    with beam.Pipeline() as pipeline:
        with tft_beam.Context(
                temp_dir=os.path.join(working_dir, TRANSFORM_TEMP_DIR)):
            coder = tft.coders.ExampleProtoCoder(RAW_DATA_METADATA.schema)
            train_data = (pipeline
                          | 'ReadTrain' >> beam.io.ReadFromTFRecord(
                              os.path.join(working_dir,
                                           SHUFFLED_TRAIN_DATA_FILEBASE + '*'))
                          | 'DecodeTrain' >> beam.Map(coder.decode))

            test_data = (pipeline
                         | 'ReadTest' >> beam.io.ReadFromTFRecord(
                             os.path.join(working_dir,
                                          SHUFFLED_TEST_DATA_FILEBASE + '*'))
                         | 'DecodeTest' >> beam.Map(coder.decode))

            def preprocessing_fn(inputs):
                """Preprocess input columns into transformed columns."""
                review = inputs[REVIEW_KEY]

                # Here tf.compat.v1.string_split behaves differently from
                # tf.strings.split.
                review_tokens = tf.compat.v1.string_split(review, DELIMITERS)
                review_indices = tft.compute_and_apply_vocabulary(
                    review_tokens, top_k=VOCAB_SIZE)
                # Add one for the oov bucket created by compute_and_apply_vocabulary.
                review_bow_indices, review_weight = tft.tfidf(
                    review_indices, VOCAB_SIZE + 1)
                return {
                    REVIEW_KEY: review_bow_indices,
                    REVIEW_WEIGHT_KEY: review_weight,
                    LABEL_KEY: inputs[LABEL_KEY]
                }

            (transformed_train_data, transformed_metadata), transform_fn = (
                (train_data, RAW_DATA_METADATA)
                | 'AnalyzeAndTransform' >>
                tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            transformed_test_data, _ = (
                ((test_data, RAW_DATA_METADATA), transform_fn)
                | 'Transform' >> tft_beam.TransformDataset())

            _ = (transformed_train_data
                 | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
                 | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir,
                                  TRANSFORMED_TRAIN_DATA_FILEBASE)))

            _ = (
                transformed_test_data
                | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
                | 'WriteTestData' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

            # Will write a SavedModel and metadata to two subdirectories of
            # working_dir, given by tft.TRANSFORM_FN_DIR and
            # tft.TRANSFORMED_METADATA_DIR respectively.
            _ = (transform_fn
                 |
                 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
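This example assumes several module-level constants (REVIEW_KEY, REVIEW_WEIGHT_KEY, LABEL_KEY, DELIMITERS, VOCAB_SIZE, RAW_DATA_METADATA and the file-base names) that are not shown. A hedged sketch of typical definitions, modelled on the public tf.Transform sentiment example; the concrete values are assumptions.

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils

REVIEW_KEY = 'review'
REVIEW_WEIGHT_KEY = 'review_weight'
LABEL_KEY = 'label'
DELIMITERS = '.,!?() '
VOCAB_SIZE = 20000

SHUFFLED_TRAIN_DATA_FILEBASE = 'train_shuffled'
SHUFFLED_TEST_DATA_FILEBASE = 'test_shuffled'
TRANSFORMED_TRAIN_DATA_FILEBASE = 'train_transformed'
TRANSFORMED_TEST_DATA_FILEBASE = 'test_transformed'
TRANSFORM_TEMP_DIR = 'tft_temp'

RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec({
        REVIEW_KEY: tf.io.FixedLenFeature([], tf.string),
        LABEL_KEY: tf.io.FixedLenFeature([], tf.int64),
    }))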
Code Example #30
0
def transform_data(input_features, preprocessing_fn, pipeline_args,
                   train_data_file, cv_data_file, test_data_file, working_dir):
    """Transform the data and write out as a TFRecord of Example protos.
    Read in the data using the parquet io, and transform it using a
    preprocessing pipeline that scales numeric data and converts categorical data
    from strings to int64 values indices, by creating a vocabulary for each
    category.
    Args:
      train_data_file: File containing training data
      test_data_file: File containing test data
      feature_config: named tuple with feature types
      working_dir: Directory to write transformed data and metadata to
    """

    # Input schema definition
    RAW_DATA_METADATA = _get_raw_metadata(input_features)

    # pipeline args to read from gcs, currently unused because reading local file
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # create a beam pipeline
    with beam.Pipeline(options=pipeline_options) as pipeline:
        # Needs to be GCS location if the process is running on Dataflow, otherwise it can't share model files
        temp_dir = pipeline_options.get_all_options().get(
            'temp_location') or tempfile.mkdtemp()
        with tft_beam.Context(temp_dir=temp_dir):
            raw_data = (
                pipeline
                | 'ReadTrainData' >> beam.io.ReadFromParquet(train_data_file))

            # Combine data and schema into a dataset tuple.
            raw_dataset = (raw_data, RAW_DATA_METADATA)
            transformed_dataset, transform_fn = (
                raw_dataset
                | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            # write to tf record
            _ = (transformed_data
                 | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
                 | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir, "train_tfrecord")))

            def encode_data(data_path, prefix, output_filename):
                # Apply transform function to test data.
                raw_data = (
                    pipeline
                    |
                    'ReadData' + prefix >> beam.io.ReadFromParquet(data_path))

                raw_dataset = (raw_data, RAW_DATA_METADATA)

                transformed_dataset = (
                    (raw_dataset, transform_fn)
                    | 'Transform' + prefix >> tft_beam.TransformDataset())

                # Don't need transformed data schema, it's the same as before.
                transformed_data, _ = transformed_dataset

                _ = (transformed_data
                     | 'EncodeData' + prefix >> beam.Map(
                         transformed_data_coder.encode)
                     | 'WriteData' + prefix >> beam.io.WriteToTFRecord(
                         os.path.join(working_dir, output_filename)))

            encode_data(cv_data_file, "-cv", "cv_tfrecord")
            encode_data(test_data_file, "-test", "test_tfrecord")

            # Will write a SavedModel and metadata to working_dir, which can then
            # be read by the tft.TFTransformOutput class.
            _ = (transform_fn
                 |
                 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
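_get_raw_metadata is referenced but not included here. A minimal sketch under the assumption that input_features maps feature names to TensorFlow dtypes; the real project may use a different configuration object.

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata, schema_utils


def _get_raw_metadata(input_features):
    # Build scalar FixedLenFeature specs for every raw input feature.
    feature_spec = {name: tf.io.FixedLenFeature([], dtype)
                    for name, dtype in input_features.items()}
    return dataset_metadata.DatasetMetadata(
        schema_utils.schema_from_feature_spec(feature_spec))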