Beispiel #1
0
    def testNumericAnalyzersWithSparseInputs(self):
        # Each numeric analyzer (min, max, sum, size, mean) is applied to the
        # variable-length column 'a'; analyzing the dataset is expected to
        # raise a TypeError every time.

        def broadcast_to_batch(in_tensor, value):
            # Repeat the analyzer result once per row of the incoming batch.
            num_rows = tf.shape(in_tensor)[0]
            return tf.ones([num_rows], value.dtype) * value

        def make_preprocessing_fn(analyzer_name):
            # Build a preprocessing_fn whose single output column is named
            # after the analyzer it exercises.
            def preprocessing_fn(inputs):
                analyzer = getattr(tft, analyzer_name)
                column = inputs['a']
                return {
                    analyzer_name:
                    tft.map(broadcast_to_batch, column, analyzer(column))
                }

            return preprocessing_fn

        sparse_rows = [{'a': [4, 5, 6]}, {'a': [1, 2]}]
        sparse_metadata = self.toMetadata({'a': tf.VarLenFeature(tf.int64)})
        sparse_dataset = (sparse_rows, sparse_metadata)

        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            for analyzer_name in ('min', 'max', 'sum', 'size', 'mean'):
                with self.assertRaises(TypeError):
                    _ = sparse_dataset | beam_impl.AnalyzeDataset(
                        make_preprocessing_fn(analyzer_name))
def build_pipeline(p, flags):
    """Attaches the read, split, analyze, transform and write stages to `p`.

    Rows are queried from BigQuery, passed through `_generate_fake_data`,
    `append_lifetime_duration`, `append_label` and
    `combine_censorship_duration`, then randomly split 70/15/15 into train,
    eval and test sets. The transform function is analyzed on the train
    split only and written under `flags.output_dir`; every split is then
    transformed and handed to `write_tfrecord`. Transformed metadata is
    written once, from the train split.

    Args:
      p: Beam pipeline the stages are attached to.
      flags: Object providing `bq_table`, `project_id` and `output_dir`.
    """
    bq_source = beam.io.BigQuerySource(
        query=query.get_query(flags.bq_table),
        project=flags.project_id,
        use_standard_sql=True)
    queried_rows = p | 'QueryTable' >> beam.io.Read(bq_source)
    # omit 'Generate data' step if working with real data
    generated_rows = (
        queried_rows | 'Generate data' >> beam.Map(_generate_fake_data))
    rows_with_lifetime = (
        generated_rows
        | 'Extract lifetime ' >> beam.Map(append_lifetime_duration))
    rows_with_label = (
        rows_with_lifetime | 'Extract label' >> beam.Map(append_label))
    raw_data = (
        rows_with_label
        | 'Generate label array' >> beam.Map(combine_censorship_duration))

    split_transform = randomly_split(
        train_size=.7, validation_size=.15, test_size=.15)
    raw_train, raw_eval, raw_test = (
        raw_data | 'RandomlySplitData' >> split_transform)

    raw_metadata = features.get_raw_dataset_metadata()
    # Analysis only ever sees the train split.
    transform_fn = (
        (raw_train, raw_metadata)
        | 'AnalyzeTrain' >> tft_beam.AnalyzeDataset(features.preprocess_fn))
    _ = (transform_fn
         | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(
             flags.output_dir))

    named_splits = (('Train', raw_train), ('Eval', raw_eval),
                    ('Test', raw_test))
    for split_name, split_rows in named_splits:
        transformed_rows, transformed_metadata = (
            ((split_rows, raw_metadata), transform_fn)
            | ('Transform' + split_name) >> tft_beam.TransformDataset())
        if split_name == 'Train':
            metadata_dir = os.path.join(flags.output_dir,
                                        'transformed_metadata')
            _ = (transformed_metadata
                 | 'WriteMetadata' >> tft_beam_io.WriteMetadata(
                     metadata_dir, pipeline=p))
        _ = (transformed_rows
             | ('Write%sTFRecord' % split_name) >> write_tfrecord(
                 split_name, flags.output_dir, transformed_metadata))
Beispiel #3
0
    def testTransformWithExcludedOutputs(self):
        # Analyzes a preprocessing_fn with two scaled outputs, then applies
        # the resulting transform to eval data that has no 'y' column while
        # excluding the 'y_scaled' output.

        def preprocessing_fn(inputs):
            scaled = {}
            for column in ('x', 'y'):
                scaled[column + '_scaled'] = tft.scale_to_0_1(inputs[column])
            return scaled

        def scalar_float_feature(default_value):
            return tf.FixedLenFeature((), tf.float32, default_value)

        train_rows = [{'x': 5, 'y': 1}, {'x': 1, 'y': 2}]
        train_metadata = self.toMetadata({
            'x': scalar_float_feature(0),
            'y': scalar_float_feature(0)
        })
        train_dataset = (train_rows, train_metadata)
        with beam_impl.Context(temp_dir=self.get_temp_dir()):
            transform_fn = (
                train_dataset | beam_impl.AnalyzeDataset(preprocessing_fn))

        # The eval rows deliberately omit 'y'.
        eval_rows = [{'x': 6}]
        eval_metadata = self.toMetadata({'x': scalar_float_feature(0)})
        transform_without_y = beam_impl.TransformDataset(
            exclude_outputs=['y_scaled'])
        transformed_eval_dataset = (
            ((eval_rows, eval_metadata), transform_fn) | transform_without_y)

        expected_rows = [{'x_scaled': 1.25}]
        expected_metadata = self.toMetadata(
            {'x_scaled': scalar_float_feature(None)})
        self.assertDatasetsEqual(transformed_eval_dataset,
                                 (expected_rows, expected_metadata))
Beispiel #4
0
    def testAnalyzeBeforeTransform(self):
        # Checks that a transform_fn produced by AnalyzeAndTransformDataset,
        # and one produced by AnalyzeDataset alone, both give the same
        # results when applied to separate eval data via TransformDataset.

        def preprocessing_fn(inputs):
            return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

        def as_scaled_rows(values):
            return [{'x_scaled': value} for value in values]

        # Analyze and transform the training rows in one step.
        train_rows = [{'x': value} for value in (4, 1, 5, 2)]
        raw_metadata = self.toMetadata(
            {'x': tf.FixedLenFeature((), tf.float32, 0)})
        analyze_and_transform_dir = os.path.join(
            self.get_temp_dir(), 'analyze_before_transform_at')
        transformed_dataset, transform_fn = (
            (train_rows, raw_metadata)
            | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn,
                                                   analyze_and_transform_dir))

        scaled_metadata = self.toMetadata(
            {'x_scaled': tf.FixedLenFeature((), tf.float32, None)})
        self.assertDatasetsEqual(
            transformed_dataset,
            (as_scaled_rows([0.75, 0.0, 1.0, 0.25]), scaled_metadata))

        # Apply the transform function to eval data and compare with the
        # expected output.
        eval_dataset = ([{'x': 6}, {'x': 3}], raw_metadata)
        expected_eval_dataset = (as_scaled_rows([1.25, 0.5]), scaled_metadata)
        transformed_eval_dataset = (
            (eval_dataset, transform_fn) | beam_impl.TransformDataset())
        self.assertDatasetsEqual(transformed_eval_dataset,
                                 expected_eval_dataset)

        # Repeat the eval check with a transform_fn generated by
        # AnalyzeDataset instead of AnalyzeAndTransformDataset.
        analyze_dir = os.path.join(self.get_temp_dir(),
                                   'analyze_before_transform_a')
        transform_fn = (
            (train_rows, raw_metadata)
            | beam_impl.AnalyzeDataset(preprocessing_fn, analyze_dir))
        transformed_eval_dataset = (
            (eval_dataset, transform_fn) | beam_impl.TransformDataset())
        self.assertDatasetsEqual(transformed_eval_dataset,
                                 expected_eval_dataset)
def run(pipeline_options, known_args):
  """Builds and launches the article preprocessing pipeline.

  Articles are loaded from the paths returned by `get_paths`, analyzed and
  transformed with `preprocess_fn`, and written to BigQuery. The transform
  function is exported to `known_args.transform_export_dir`. TFRecord and
  text outputs are added only when `enable_tfrecord` / `enable_debug` are
  set. The call waits for the job to finish only when the 'runner' pipeline
  option equals 'DirectRunner'.

  Args:
    pipeline_options: Beam pipeline options used to create the pipeline.
    known_args: Parsed arguments providing the paths, BigQuery destination
      and feature switches read below.
  """
  pipeline = beam.Pipeline(options=pipeline_options)

  with impl.Context(known_args.transform_temp_dir):
    paths = pipeline | 'Get Paths' >> beam.Create(
        get_paths(known_args.file_pattern))
    article_batches = paths | 'Get Articles' >> beam.Map(get_articles)
    # Flatten each per-path collection into individual articles.
    articles = article_batches | 'Get Article' >> beam.FlatMap(lambda x: x)

    dataset = (articles, get_metadata())

    transform_fn = (
        dataset | 'Analyse dataset' >> impl.AnalyzeDataset(preprocess_fn))
    transformed_data, transformed_metadata = (
        (dataset, transform_fn)
        | 'Transform dataset' >> impl.TransformDataset())

    _ = transform_fn | 'Export Transform Fn' >> (
        transform_fn_io.WriteTransformFn(known_args.transform_export_dir))

    bq_rows = (
        transformed_data | 'Convert to Insertable data' >> beam.Map(to_bq_row))
    _ = bq_rows | 'Write to BigQuery table' >> beam.io.WriteToBigQuery(
        project=known_args.bq_project,
        dataset=known_args.bq_dataset,
        table=known_args.bq_table,
        schema=get_bigquery_schema(),
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    if known_args.enable_tfrecord:
      tfrecord_prefix = '{0}/{1}'.format(
          known_args.tfrecord_export_dir, 'reuter')
      example_coder = tft_coders.example_proto_coder.ExampleProtoCoder(
          transformed_metadata.schema)
      _ = transformed_data | 'Write TFRecords' >> (
          beam.io.tfrecordio.WriteToTFRecord(
              file_path_prefix=tfrecord_prefix,
              file_name_suffix='.tfrecords',
              coder=example_coder))

    if known_args.enable_debug:
      _ = transformed_data | 'Debug Output' >> beam.io.textio.WriteToText(
          file_path_prefix=known_args.debug_output_prefix,
          file_name_suffix='.txt')

  job = pipeline.run()

  runner_name = pipeline_options.get_all_options()['runner']
  if runner_name == 'DirectRunner':
    job.wait_until_finish()
Beispiel #6
0
def main():
    """Runs a three-row in-memory tf.Transform example and prints the result.

    The raw rows are analyzed and then transformed with `preprocessing_fn`
    inside a Beam context backed by a fresh temporary directory; the
    transformed rows are pretty-printed.
    """

    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        centered_x = inputs['x'] - tft.mean(inputs['x'])
        normalized_y = tft.scale_to_0_1(inputs['y'])
        integerized_s = tft.string_to_int(inputs['s'])
        centered_x_times_normalized_y = centered_x * normalized_y
        return {
            'x_centered': centered_x,
            'y_normalized': normalized_y,
            'x_centered_times_y_normalized': centered_x_times_normalized_y,
            's_integerized': integerized_s
        }

    def scalar_column(dtype):
        # Schema entry for a fixed-representation column with shape [].
        return dataset_schema.ColumnSchema(
            dtype, [], dataset_schema.FixedColumnRepresentation())

    numbers = (1, 2, 3)
    words = ('hello', 'world', 'hello')
    raw_data = [{
        'x': number,
        'y': number,
        's': word
    } for number, word in zip(numbers, words)]

    raw_data_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.Schema({
            's': scalar_column(tf.string),
            'y': scalar_column(tf.float32),
            'x': scalar_column(tf.float32)
        }))

    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        raw_dataset = (raw_data, raw_data_metadata)
        transform_fn = raw_dataset | beam_impl.AnalyzeDataset(preprocessing_fn)
        transformed_dataset = (
            (raw_dataset, transform_fn) | beam_impl.TransformDataset())

    # Only the transformed rows are printed; the metadata is not used.
    transformed_data, _ = transformed_dataset

    pprint.pprint(transformed_data)
Beispiel #7
0
    def build_graph(self):
        """Analyzes the feature shards and writes the transform graph.

        Builds a DirectRunner pipeline that reads the shard files as text,
        decodes each line as CSV using the data formatter's columns and
        schema, analyzes the decoded rows with
        `PreprocessingFunction().transform_to_tfrecord`, and writes the
        resulting transform function under `TARGET_DIR`. The pipeline is then
        run to completion and the elapsed time is logged.
        """
        # NOTE: an earlier draft first moved a capped number of train shards
        # to `PPGRAPH_EXT` files used only for graph building; that step is
        # currently disabled.

        # Only the analyze step is set up here. It is kept apart from the
        # transform step so that the transform can be parallelized later;
        # running both in one process was single-core and prone to OOM.
        pipeline = beam.Pipeline(runner=DirectRunner())

        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            # todo: maybe, I should only use train data (or percentage of train data) to build the graph
            shard_lines = (
                pipeline
                | 'ReadTrainDataFile' >> textio.ReadFromText(
                    'data/features' + '*' + 'shard' + '*',
                    skip_header_lines=0))
            csv_coder = tft_coders.CsvCoder(
                self.data_formatter.get_ordered_columns(),
                self.data_formatter.get_raw_data_metadata().schema)
            raw_train_data = (
                shard_lines
                | 'DecodeTrainDataCSV' >> MapAndFilterErrors(csv_coder.decode))

            # Pair the rows with their schema. The schema was already used to
            # decode the CSV, but the analyzer also needs it to interpret the
            # raw rows (e.g. for vocabulary, scale_to_0_1, sparse_to_dense).
            train_dataset = (raw_train_data,
                             self.data_formatter.get_raw_data_metadata())
            analyze = beam_impl.AnalyzeDataset(
                PreprocessingFunction().transform_to_tfrecord)
            transform_fn = train_dataset | analyze

            # Writes the SavedModel and metadata to two subdirectories of the
            # working dir, named by `transform_fn_io.TRANSFORM_FN_DIR` and
            # `transform_fn_io.TRANSFORMED_METADATA_DIR` respectively.
            _ = (transform_fn
                 | 'WriteTransformGraph' >> transform_fn_io.WriteTransformFn(
                     TARGET_DIR))

        # Run the Beam preprocessing pipeline and time it.
        started_at = time.time()
        result = pipeline.run()
        result.wait_until_finish()
        elapsed = time.time() - started_at
        self.logger.info(
            'Transformation graph built and written in {:.2f} sec'.format(
                elapsed))
Beispiel #8
0
def make_transform_graph(output_dir, schema, features):
    """Builds and writes a tft transform function plus metadata files.

    A local DirectRunner pipeline analyzes an empty dataset with the
    preprocessing function from `make_preprocessing_fn`, writes the
    transform function under `output_dir`, and writes the raw input metadata
    to the `RAW_METADATA_DIR` subdirectory.

    Args:
      output_dir: output folder
      schema: schema list
      features: features dict
    """
    stats_path = os.path.join(output_dir, STATS_FILE)
    tft_input_metadata = dataset_metadata.DatasetMetadata(
        schema=make_tft_input_schema(schema, stats_path))
    preprocessing_fn = make_preprocessing_fn(output_dir, features)

    # Per the original note, preprocessing_fn uses no analyzers, so a local
    # Beam job is enough to build and write the transform function.
    temp_dir = os.path.join(output_dir, 'tmp')
    with beam.Pipeline('DirectRunner', options=None) as p, \
            tft_impl.Context(temp_dir=temp_dir):
        # Nothing is transformed here, so an empty collection suffices.
        empty_rows = p | beam.Create([])

        analyze = 'BuildTransformFn' >> tft_impl.AnalyzeDataset(
            preprocessing_fn)
        transform_fn = (empty_rows, tft_input_metadata) | analyze

        # Writes the transformed_metadata and transform_fn folders.
        _ = (transform_fn
             | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

        # Write the raw_metadata.
        raw_metadata_path = os.path.join(output_dir, RAW_METADATA_DIR)
        metadata_io.write_metadata(metadata=tft_input_metadata,
                                   path=raw_metadata_path)
def preprocess(p, output_dir, check_path, data_size, bq_table, split_data_path,
               project_id):
    """Builds the pipeline that reads, splits, transforms and stores the data.

    Stages, in construction order:
      - read rows via `read_data` and store them as text under `check_path`
      - randomly split the rows by `data_size`
      - apply `AddHash` to every split
      - split the test set into parts via `split_features_labels` and store
        each part as JSON text under `split_data_path`
      - analyze the train split with `preprocessing_fn`; write the transform
        function and the input metadata
      - transform the train and validation splits
      - oversample the transformed train split
      - shuffle and store each transformed split, and run `check_size` on it

    Args:
      p: Beam pipeline the stages are attached to (also passed as
        `pipeline=` when writing the input metadata).
      output_dir: string, path to directory to store output.
      check_path: string, path to directory to store data checks.
      data_size: tuple of float, ratio of data going respectively to train,
        validation and test sets.
      bq_table: string, name of table to read data from.
      split_data_path: string, path to directory to store train, validation
        and test raw datasets.
      project_id: string, GCP project id.

    Raises:
      ValueError: No test dataset found in pipeline output.
    """
    train_size, validation_size, test_size = data_size

    data = p | 'ReadData' >> read_data(
        bq_table=bq_table, project_id=project_id)

    processed_data_path = posixpath.join(output_dir, check_path,
                                         'processed_data.txt')
    _ = data | 'StoreData' >> beam.io.WriteToText(processed_data_path)

    splitter = randomly_split(train_size=train_size,
                              validation_size=validation_size,
                              test_size=test_size)
    split_data = data | 'RandomlySplitData' >> splitter

    for dataset_type in split_data:
        add_hash = beam.ParDo(AddHash(),
                              label_column=constants.LABEL_COLUMN,
                              key_column=constants.KEY_COLUMN,
                              dtype=dataset_type)
        split_data[dataset_type] |= (
            'AddHash_{}'.format(dataset_type.name) >> add_hash)

    # The test split leaves `split_data` here and is stored on its own.
    if DatasetType.TEST not in split_data:
        raise ValueError('No test dataset found in pipeline output.')
    raw_test_data = split_data.pop(DatasetType.TEST)
    test_data = (raw_test_data
                 | 'SplitFeaturesLabels' >> split_features_labels(
                     constants.LABEL_COLUMN, constants.KEY_COLUMN))

    # Each part of the split test data is stored in its own text file.
    for part in test_data:
        part_path = posixpath.join(
            output_dir, split_data_path,
            'split_data_{}_{}.txt'.format(DatasetType.TEST.name, part))
        _ = (test_data[part]
             | 'ParseJsonToString_{}'.format(part) >> beam.Map(json.dumps)
             | 'StoreSplitData_{}'.format(part) >> beam.io.WriteToText(
                 part_path))

    meta_data = dataset_metadata.DatasetMetadata(make_input_schema())

    # Analysis only ever sees the train split.
    train_dataset = (split_data[DatasetType.TRAIN], meta_data)
    transform_fn = (
        train_dataset
        | 'AnalyzeTrainDataset' >> beam_impl.AnalyzeDataset(preprocessing_fn))

    transform_fn_path = posixpath.join(output_dir,
                                       constants.PATH_INPUT_TRANSFORMATION)
    _ = (transform_fn
         | 'WriteTransformFn' >> tft.beam.tft_beam_io.WriteTransformFn(
             transform_fn_path))
    input_schema_path = posixpath.join(output_dir,
                                       constants.PATH_INPUT_SCHEMA)
    _ = (meta_data
         | 'WriteInputMetadata' >> tft.beam.tft_beam_io.WriteMetadata(
             input_schema_path, pipeline=p))

    transformed_data = {}
    transformed_metadata = {}
    for dataset_type in (DatasetType.TRAIN, DatasetType.VAL):
        rows, metadata = (
            ((split_data[dataset_type], meta_data), transform_fn)
            | 'Transform{}'.format(dataset_type) >>
            beam_impl.TransformDataset())
        transformed_data[dataset_type] = rows
        transformed_metadata[dataset_type] = metadata

    transformed_data[DatasetType.TRAIN] = (
        transformed_data[DatasetType.TRAIN]
        | 'OverSampleTraining' >> oversampling())

    for dataset_type in transformed_data:
        store_path = posixpath.join(
            output_dir, constants.PATH_TRANSFORMED_DATA_SPLIT[dataset_type])
        _ = (transformed_data[dataset_type]
             | 'ShuffleData{}'.format(dataset_type) >> shuffle_data()
             | 'StoreData{}'.format(dataset_type) >> store_transformed_data(
                 schema=transformed_metadata[dataset_type],
                 path=store_path,
                 name=DatasetType(dataset_type).name))

    for dataset_type in transformed_data:
        size_check_path = posixpath.join(output_dir, check_path,
                                         dataset_type.name)
        _ = (transformed_data[dataset_type]
             | 'CheckSize{}'.format(dataset_type.name) >> check_size(
                 name=DatasetType(dataset_type).name, path=size_check_path))
Beispiel #10
0
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
    """Attaches the pre-processing stages to `pipeline`.

    The raw input metadata and the transform function (analyzed on the
    training data) are written under `output_dir`. Training and eval data
    are shuffled, transformed and written as gzipped TFRecords. When
    `predict_data` is truthy, it is read in INFER mode, serialized, and
    written both as TFRecords and as base64 JSON text.

    Args:
      pipeline: beam pipeline
      training_data: the name of the table to train on.
      eval_data: the name of the table to evaluate on.
      predict_data: the name of the table to predict on.
      output_dir: file path to where to write all the output files.
      frequency_threshold: frequency threshold to use for categorical values.
    """
    # 1) The schema can be defined in-memory or read from a configuration
    #    file; here it is created in-memory.
    input_schema = reddit.make_input_schema()

    # 2) Read from BigQuery or from CSV.
    train_data = pipeline | 'ReadTrainingData' >> _ReadData(training_data)
    evaluate_data = pipeline | 'ReadEvalData' >> _ReadData(eval_data)

    input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)

    raw_metadata_dir = os.path.join(output_dir,
                                    path_constants.RAW_METADATA_DIR)
    _ = (input_metadata
         | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
             raw_metadata_dir, pipeline=pipeline))

    preprocessing_fn = reddit.make_preprocessing_fn(frequency_threshold)
    train_dataset = (train_data, input_metadata)
    transform_fn = (
        train_dataset | 'Analyze' >> tft.AnalyzeDataset(preprocessing_fn))

    # WriteTransformFn writes transform_fn and metadata to fixed
    # subdirectories of output_dir, given by path_constants.TRANSFORM_FN_DIR
    # and path_constants.TRANSFORMED_METADATA_DIR.
    _ = (transform_fn
         | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

    @beam.ptransform_fn
    def TransformAndWrite(pcoll, path):  # pylint: disable=invalid-name
        # Shuffle, transform and serialize `pcoll`, then write it below
        # output_dir using `path` as the file prefix.
        shuffled = pcoll | 'Shuffle' >> _Shuffle()  # pylint: disable=no-value-for-parameter
        transformed_rows, transformed_metadata = (
            ((shuffled, input_metadata), transform_fn)
            | 'Transform' >> tft.TransformDataset())
        example_coder = coders.ExampleProtoCoder(transformed_metadata.schema)
        serialized = (
            transformed_rows
            | 'SerializeExamples' >> beam.Map(example_coder.encode))
        _ = serialized | 'WriteExamples' >> beam.io.WriteToTFRecord(
            os.path.join(output_dir, path), file_name_suffix='.tfrecord.gz')

    write_jobs = (
        ('TransformAndWriteTraining', train_data,
         path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
        ('TransformAndWriteEval', evaluate_data,
         path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
    )
    for step_label, rows, file_prefix in write_jobs:
        _ = rows | step_label >> TransformAndWrite(file_prefix)  # pylint: disable=no-value-for-parameter

    # TODO(b/35300113) Remember to eventually also save the statistics.

    if not predict_data:
        return

    predict_mode = tf.contrib.learn.ModeKeys.INFER
    predict_schema = reddit.make_input_schema(mode=predict_mode)
    predict_coder = coders.ExampleProtoCoder(predict_schema)

    raw_predict_rows = (
        pipeline
        | 'ReadPredictData' >> _ReadData(predict_data, mode=predict_mode))
    # TODO(b/35194257) Obviate the need for this explicit serialization.
    serialized_examples = (
        raw_predict_rows
        | 'EncodePredictData' >> beam.Map(predict_coder.encode))

    _ = (serialized_examples
         | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
             os.path.join(
                 output_dir,
                 path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))
    _ = (serialized_examples
         | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
         | 'WritePredictDataAsText' >> beam.io.WriteToText(
             os.path.join(
                 output_dir,
                 path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.txt'))
Beispiel #11
0
    def assertAnalyzeAndTransformResults(self,
                                         input_data,
                                         input_metadata,
                                         preprocessing_fn,
                                         expected_data=None,
                                         expected_metadata=None,
                                         expected_vocab_file_contents=None,
                                         expected_asset_file_contents=None,
                                         test_data=None,
                                         desired_batch_size=None,
                                         beam_pipeline=None,
                                         temp_dir=None,
                                         use_tfxio=False,
                                         input_data_is_tfxio_format=False):
        """Assert that input data and metadata is transformed as expected.

    This method asserts transformed data and transformed metadata match
    with expected_data and expected_metadata.

    Args:
      input_data: A sequence of dicts whose values are
          either strings, lists of strings, numeric types or a pair of those.
      input_metadata: DatasetMetadata describing input_data.
      preprocessing_fn: A function taking a dict of tensors and returning
          a dict of tensors.
      expected_data: (optional) A dataset with the same type constraints as
          input_data, but representing the output after transformation.
          If supplied, transformed data is asserted to be equal.
      expected_metadata: (optional) DatasetMetadata describing the transformed
          data. If supplied, transformed metadata is asserted to be equal.
      expected_vocab_file_contents: (optional) A dictionary from vocab filenames
          to their expected content as a list of text lines or a list of tuples
          of frequency and text. Values should be the expected result of calling
          f.readlines() on the given asset files.
      expected_asset_file_contents: deprecated.  Use
          expected_vocab_file_contents.
      test_data: (optional) If this is provided then instead of calling
          AnalyzeAndTransformDataset with input_data, this function will call
          AnalyzeDataset with input_data and TransformDataset with test_data.
          Note that this is the case even if input_data and test_data are equal.
          test_data should also conform to input_metadata.
      desired_batch_size: (optional) A batch size to batch elements by. If not
          provided, a batch size will be computed automatically.
      beam_pipeline: (optional) A Beam Pipeline to use in this test.
      temp_dir: If set, it is used as output directory, else a new unique
          directory is created.
      use_tfxio: If True, invoke AnalyzeAndTransformDataset using the new API
          that accepts standardized inputs (Arrow `RecordBatch`es). Otherwise
          use the old API that accepts Dicts.
      input_data_is_tfxio_format: If True, `input_data` and `test_data` are
          Arrow `RecordBatch`es and the `input_metadata` is
          `tfxio.tensor_adapter.TensorAdapterConfig`. Otherwise the input data
          is a list of Dicts and input_metadata is a `DatasetMetadata`.
    Raises:
      AssertionError: if the expected data does not match the results of
          transforming input_data according to preprocessing_fn, or
          (if provided) if the expected metadata does not match.
      ValueError: if expected_vocab_file_contents and
          expected_asset_file_contents are both set, or if
          input_data_is_tfxio_format is set without use_tfxio.
    """
        if (expected_vocab_file_contents is not None
                and expected_asset_file_contents is not None):
            # Name both mutually exclusive arguments in the message.
            raise ValueError('only one of expected_vocab_file_contents and '
                             'expected_asset_file_contents should be set')
        elif expected_asset_file_contents is not None:
            tf.compat.v1.logging.warn(
                'expected_asset_file_contents is deprecated, use '
                'expected_vocab_file_contents')

        # From here on only expected_vocab_file_contents is consulted; the
        # deprecated argument is folded into it and then dropped.
        expected_vocab_file_contents = (expected_vocab_file_contents
                                        or expected_asset_file_contents or {})
        del expected_asset_file_contents

        if not use_tfxio and input_data_is_tfxio_format:
            raise ValueError('Unable to feed TFXIO input format to the old, '
                             'non-TFXIO API.')
        # Dict-format inputs must be converted when the TFXIO API is requested.
        compatibility_tfxio_needed = use_tfxio and not input_data_is_tfxio_format
        # Note: we don't separately test AnalyzeDataset and TransformDataset as
        # AnalyzeAndTransformDataset currently simply composes these two
        # transforms.  If in future versions of the code, the implementation
        # differs, we should also run AnalyzeDataset and TransformDataset composed.
        temp_dir = temp_dir or tempfile.mkdtemp(prefix=self._testMethodName,
                                                dir=self.get_temp_dir())
        with beam_pipeline or self._makeTestPipeline() as pipeline:
            with beam_impl.Context(temp_dir=temp_dir,
                                   desired_batch_size=desired_batch_size,
                                   use_tfxio=use_tfxio):
                input_data = pipeline | 'CreateInput' >> beam.Create(
                    input_data, reshuffle=False)
                if compatibility_tfxio_needed:
                    # Keep the original metadata: it is needed again below to
                    # convert test_data.
                    legacy_input_metadata = input_metadata
                    input_data, input_metadata = self.convert_to_tfxio_api_inputs(
                        input_data, input_metadata, label='input_data')
                if test_data is None:
                    (transformed_data, transformed_metadata), transform_fn = (
                        (input_data, input_metadata)
                        |
                        beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
                else:
                    transform_fn = ((input_data, input_metadata)
                                    |
                                    beam_impl.AnalyzeDataset(preprocessing_fn))
                    test_data = pipeline | 'CreateTest' >> beam.Create(
                        test_data)
                    if compatibility_tfxio_needed:
                        test_data, _ = self.convert_to_tfxio_api_inputs(
                            test_data,
                            legacy_input_metadata,
                            label='test_data')
                    transformed_data, transformed_metadata = (
                        ((test_data, input_metadata), transform_fn)
                        | beam_impl.TransformDataset())

                # Write transform_fn so we can test its assets
                _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

                if expected_data is not None:
                    transformed_data_coder = tft.coders.ExampleProtoCoder(
                        transformed_metadata.schema)

                    # The transformed rows are written to a single file that
                    # is read back once the pipeline context has exited.
                    transformed_data_path = os.path.join(
                        temp_dir, 'transformed_data')
                    _ = (transformed_data
                         | beam.Map(transformed_data_coder.encode)
                         | beam.io.tfrecordio.WriteToTFRecord(
                             transformed_data_path, shard_name_template=''))

        # TODO(ebreck) Log transformed_data somewhere.
        if expected_data is not None:
            examples = tf.compat.v1.python_io.tf_record_iterator(
                path=transformed_data_path)
            transformed_data = [
                transformed_data_coder.decode(x) for x in examples
            ]
            self.assertDataCloseOrEqual(expected_data, transformed_data)

        tf_transform_output = tft.TFTransformOutput(temp_dir)
        if expected_metadata:
            # Make a copy with no annotations.
            transformed_schema = schema_pb2.Schema()
            transformed_schema.CopyFrom(
                tf_transform_output.transformed_metadata.schema)
            transformed_schema.ClearField('annotation')
            for feature in transformed_schema.feature:
                feature.ClearField('annotation')
            self.assertEqual(expected_metadata.schema, transformed_schema)

        for filename, file_contents in six.iteritems(
                expected_vocab_file_contents):
            full_filename = tf_transform_output.vocabulary_file_by_name(
                filename)
            self.AssertVocabularyContents(full_filename, file_contents)
Beispiel #12
0
    def assertAnalyzeAndTransformResults(self,
                                         input_data,
                                         input_metadata,
                                         preprocessing_fn,
                                         expected_data=None,
                                         expected_metadata=None,
                                         expected_vocab_file_contents=None,
                                         expected_asset_file_contents=None,
                                         test_data=None,
                                         desired_batch_size=None,
                                         beam_pipeline=None,
                                         temp_dir=None,
                                         force_tf_compat_v1=True):
        """Assert that input data and metadata is transformed as expected.

        This method asserts that the transformed data and transformed metadata
        match expected_data and expected_metadata.

        Args:
          input_data: Input data formatted in one of two ways:
            * A sequence of dicts whose values are one of:
              strings, lists of strings, numeric types or a pair of those.
              Must have at least one key so that we can infer the batch size,
              or
            * A sequence of pa.RecordBatch.
          input_metadata: One of -
            * DatasetMetadata describing input_data if `input_data` are dicts.
            * TensorAdapterConfig otherwise.
          preprocessing_fn: A function taking a dict of tensors and returning
            a dict of tensors.
          expected_data: (optional) A dataset with the same type constraints
            as input_data, but representing the output after transformation.
            If supplied, transformed data is asserted to be equal.
          expected_metadata: (optional) DatasetMetadata describing the
            transformed data. If supplied, transformed metadata is asserted to
            be equal.
          expected_vocab_file_contents: (optional) A dictionary from vocab
            filenames to their expected content as a list of text lines or a
            list of tuples of frequency and text. Values should be the
            expected result of calling f.readlines() on the given asset files.
          expected_asset_file_contents: deprecated.  Use
            expected_vocab_file_contents.
          test_data: (optional) If this is provided then instead of calling
            AnalyzeAndTransformDataset with input_data, this function will
            call AnalyzeDataset with input_data and TransformDataset with
            test_data. Note that this is the case even if input_data and
            test_data are equal. test_data should also conform to
            input_metadata.
          desired_batch_size: (optional) A batch size to batch elements by. If
            not provided, a batch size will be computed automatically.
          beam_pipeline: (optional) A Beam Pipeline to use in this test.
          temp_dir: If set, it is used as output directory, else a new unique
            directory is created.
          force_tf_compat_v1: A `Boolean`. If `True`, TFT's public APIs use
            Tensorflow in compat.v1 mode.

        Raises:
          AssertionError: if the expected data does not match the results of
            transforming input_data according to preprocessing_fn, or
            (if provided) if the expected metadata does not match.
          ValueError: if expected_vocab_file_contents and
            expected_asset_file_contents are both set.
        """
        if (expected_vocab_file_contents is not None
                and expected_asset_file_contents is not None):
            # The message must name both mutually exclusive arguments.
            raise ValueError('only one of expected_vocab_file_contents and '
                             'expected_asset_file_contents should be set')
        elif expected_asset_file_contents is not None:
            tf.compat.v1.logging.warn(
                'expected_asset_file_contents is deprecated, use '
                'expected_vocab_file_contents')

        # Collapse the deprecated alias into the supported argument so the
        # rest of the method only deals with one name.
        expected_vocab_file_contents = (expected_vocab_file_contents
                                        or expected_asset_file_contents or {})
        del expected_asset_file_contents

        # Note: we don't separately test AnalyzeDataset and TransformDataset as
        # AnalyzeAndTransformDataset currently simply composes these two
        # transforms.  If in future versions of the code, the implementation
        # differs, we should also run AnalyzeDataset and TransformDataset composed.
        temp_dir = temp_dir or tempfile.mkdtemp(prefix=self._testMethodName,
                                                dir=self.get_temp_dir())
        with beam_pipeline or self._makeTestPipeline() as pipeline:
            with beam_impl.Context(temp_dir=temp_dir,
                                   desired_batch_size=desired_batch_size,
                                   force_tf_compat_v1=force_tf_compat_v1):
                input_data = pipeline | 'CreateInput' >> beam.Create(
                    input_data, reshuffle=False)
                if test_data is None:
                    # Analyze and transform the same dataset in one step.
                    (transformed_data, transformed_metadata), transform_fn = (
                        (input_data, input_metadata)
                        |
                        beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
                else:
                    # Analyze input_data, then apply the result to test_data.
                    transform_fn = ((input_data, input_metadata)
                                    |
                                    beam_impl.AnalyzeDataset(preprocessing_fn))
                    test_data = pipeline | 'CreateTest' >> beam.Create(
                        test_data)
                    transformed_data, transformed_metadata = (
                        ((test_data, input_metadata), transform_fn)
                        | beam_impl.TransformDataset())

                # Write transform_fn so we can test its assets
                _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

                if expected_data is not None:
                    transformed_data_coder = tft.coders.ExampleProtoCoder(
                        transformed_metadata.schema)

                    # A single unsharded file, so it can be read back by path
                    # once the pipeline has finished.
                    transformed_data_path = os.path.join(
                        temp_dir, 'transformed_data')
                    _ = (transformed_data
                         | beam.Map(transformed_data_coder.encode)
                         | beam.io.tfrecordio.WriteToTFRecord(
                             transformed_data_path, shard_name_template=''))

        # TODO(ebreck) Log transformed_data somewhere.
        if expected_data is not None:
            examples = tf.compat.v1.python_io.tf_record_iterator(
                path=transformed_data_path)
            # Per-feature shape taken from the transformed schema; features
            # without a `shape` field get [-1].
            shapes = {
                f.name:
                [s.size for s in f.shape.dim] if f.HasField('shape') else [-1]
                for f in transformed_metadata.schema.feature
            }
            transformed_data = [
                _format_example_as_numpy_dict(e, shapes) for e in examples
            ]
            self.assertDataCloseOrEqual(expected_data, transformed_data)

        tf_transform_output = tft.TFTransformOutput(temp_dir)
        if expected_metadata:
            # Make a copy with no annotations.
            transformed_schema = schema_pb2.Schema()
            transformed_schema.CopyFrom(
                tf_transform_output.transformed_metadata.schema)
            transformed_schema.ClearField('annotation')
            for feature in transformed_schema.feature:
                feature.ClearField('annotation')
            self.assertEqual(expected_metadata.schema, transformed_schema)

        for filename, file_contents in six.iteritems(
                expected_vocab_file_contents):
            full_filename = tf_transform_output.vocabulary_file_by_name(
                filename)
            self.AssertVocabularyContents(full_filename, file_contents)
Beispiel #13
0
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None):
    """Run the tf.transform pipeline that analyzes and transforms the data.

    Args:
      input_handle: BigQuery table name to process, specified as
        DATASET.TABLE, or a path to a csv file with input data.
      outfile_prefix: Filename prefix for emitted transformed examples.
      working_dir: Directory in which transformed examples and the transform
        function will be emitted. It is also passed as the tf.transform
        temp_dir.
      schema_file: A file path that contains a text-serialized TensorFlow
        metadata schema of the input data.
      transform_dir: Directory in which the transform output is located. If
        provided, the transform_fn is loaded from disk instead of being
        computed over the data. Hint: this is useful for transforming eval
        data.
      max_rows: Number of rows to query from BigQuery.
      pipeline_args: Additional DataflowRunner or DirectRunner args passed to
        the beam pipeline.
    """
    def preprocessing_fn(inputs):
        """Build the transformed features from the raw input features.

        Args:
          inputs: Map from feature keys to raw not-yet-transformed features.

        Returns:
          Map from string feature key to transformed feature operations.
        """
        transformed = {}

        # Dense floats: fill in missing values, then scale to a z-score.
        for name in taxi.DENSE_FLOAT_FEATURE_KEYS:
            filled = _fill_in_missing(inputs[name])
            scaled = transform.scale_to_z_score(filled)
            transformed[taxi.transformed_name(name)] = scaled

        # Vocabulary features: build a vocabulary and map values through it.
        for name in taxi.VOCAB_FEATURE_KEYS:
            filled = _fill_in_missing(inputs[name])
            transformed[taxi.transformed_name(name)] = (
                transform.compute_and_apply_vocabulary(
                    filled,
                    top_k=taxi.VOCAB_SIZE,
                    num_oov_buckets=taxi.OOV_SIZE))

        # Bucket features: bucketize into a fixed number of buckets.
        for name in taxi.BUCKET_FEATURE_KEYS:
            filled = _fill_in_missing(inputs[name])
            transformed[taxi.transformed_name(name)] = transform.bucketize(
                filled, taxi.FEATURE_BUCKET_COUNT)

        # Categorical features pass through with missing values filled in.
        for name in taxi.CATEGORICAL_FEATURE_KEYS:
            transformed[taxi.transformed_name(name)] = _fill_in_missing(
                inputs[name])

        # Label: was this passenger a big tipper (tip > 20% of the fare)?
        # Rows whose fare is NaN get label 0.
        fare = _fill_in_missing(inputs[taxi.FARE_KEY])
        tip = _fill_in_missing(inputs[taxi.LABEL_KEY])
        fare_is_nan = tf.is_nan(fare)
        zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
        big_tip_label = tf.cast(
            tf.greater(tip, tf.multiply(fare, tf.constant(0.2))), tf.int64)
        transformed[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
            fare_is_nan, zero_label, big_tip_label)

        return transformed

    schema = taxi.read_schema(schema_file)
    raw_feature_spec = taxi.get_raw_feature_spec(schema)
    raw_data_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(raw_feature_spec))

    with beam.Pipeline(argv=pipeline_args) as pipeline, \
            beam_impl.Context(temp_dir=working_dir):
        # Source: a csv file when the handle ends in 'csv', else BigQuery.
        if input_handle.lower().endswith('csv'):
            csv_coder = taxi.make_csv_coder(schema)
            text_lines = pipeline | 'ReadFromText' >> beam.io.ReadFromText(
                input_handle, skip_header_lines=1)
            raw_data = text_lines | 'ParseCSV' >> beam.Map(csv_coder.decode)
        else:
            query = taxi.make_sql(input_handle, max_rows, for_eval=False)
            bigquery_rows = pipeline | 'ReadBigQuery' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True))
            raw_data = bigquery_rows | 'CleanData' >> beam.Map(
                taxi.clean_raw_data_dict, raw_feature_spec=raw_feature_spec)

        if transform_dir is not None:
            # Reuse a transform_fn that was computed by an earlier run.
            transform_fn = pipeline | transform_fn_io.ReadTransformFn(
                transform_dir)
        else:
            # Compute the transform_fn over this data and persist it.
            transform_fn = (
                (raw_data, raw_data_metadata)
                | 'Analyze' >> beam_impl.AnalyzeDataset(preprocessing_fn))
            _ = transform_fn | (
                'WriteTransformFn' >>
                transform_fn_io.WriteTransformFn(working_dir))

        # Shuffling the data before materialization will improve training
        # effectiveness downstream.
        shuffled_data = (
            raw_data | 'RandomizeData' >> beam.transforms.Reshuffle())

        transformed_data, transformed_metadata = (
            ((shuffled_data, raw_data_metadata), transform_fn)
            | 'Transform' >> beam_impl.TransformDataset())

        # Serialize the transformed rows and write them under working_dir.
        example_coder = example_proto_coder.ExampleProtoCoder(
            transformed_metadata.schema)
        serialized = transformed_data | 'SerializeExamples' >> beam.Map(
            example_coder.encode)
        _ = serialized | 'WriteExamples' >> beam.io.WriteToTFRecord(
            os.path.join(working_dir, outfile_prefix),
            file_name_suffix='.gz')
Beispiel #14
0
  def test_non_frequency_vocabulary_merge(self):
    """Checks that vocabularies match whether or not the cache is used."""

    mi_vocab_name = 'mutual_information_vocab'
    adjusted_mi_vocab_name = 'adjusted_mutual_information_vocab'
    weighted_frequency_vocab_name = 'weighted_frequency_vocab'
    vocab_names = (mi_vocab_name, adjusted_mi_vocab_name,
                   weighted_frequency_vocab_name)

    def preprocessing_fn(inputs):
      # Two label-based vocabularies (without / with
      # use_adjusted_mutual_info) followed by one weighted vocabulary. The
      # call order is kept fixed because the expected cache keys below are
      # tied to the generated analyzer names.
      tokens = inputs['s']
      _ = tft.vocabulary(
          tokens,
          labels=inputs['label'],
          store_frequency=True,
          vocab_filename=mi_vocab_name,
          min_diff_from_avg=0.1,
          use_adjusted_mutual_info=False)
      _ = tft.vocabulary(
          tokens,
          labels=inputs['label'],
          store_frequency=True,
          vocab_filename=adjusted_mi_vocab_name,
          min_diff_from_avg=1.0,
          use_adjusted_mutual_info=True)
      _ = tft.vocabulary(
          tokens,
          weights=inputs['weight'],
          store_frequency=True,
          vocab_filename=weighted_frequency_vocab_name,
          use_adjusted_mutual_info=False)
      return inputs

    def check_counters(pipeline, expected_counts):
      # Compares each named pipeline counter against its expected value.
      for counter_name, expected_value in expected_counts:
        self.assertEqual(
            _get_counter_value(pipeline.metrics, counter_name),
            expected_value)

    span_0_key = 'span-0'
    span_1_key = 'span-1'
    spans = [span_0_key, span_1_key]

    input_data = [
        {'s': 'a', 'weight': 1, 'label': 1},
        {'s': 'a', 'weight': 0.5, 'label': 1},
        {'s': 'b', 'weight': 0.75, 'label': 1},
        {'s': 'b', 'weight': 1, 'label': 0},
    ]
    feature_spec = {
        's': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64),
        'weight': tf.io.FixedLenFeature([], tf.float32),
    }
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(feature_spec))
    # Both spans carry the same four rows.
    input_data_dict = {span_0_key: input_data, span_1_key: input_data}

    with _TestPipeline() as p:
      all_rows = list(
          itertools.chain.from_iterable(input_data_dict.values()))
      flat_data = p | 'CreateInputData' >> beam.Create(all_rows)

      # Wrap each span's rows in its own PCollection, labelled by span key.
      input_data_pcoll_dict = {
          span: p | span >> beam.Create(rows)
          for span, rows in six.iteritems(input_data_dict)
      }

      analysis_inputs = (flat_data, input_data_pcoll_dict, {}, input_metadata)
      transform_fn_with_cache, output_cache = (
          analysis_inputs
          | beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))
      transform_fn_with_cache_dir = os.path.join(self.base_test_dir,
                                                 'transform_fn_with_cache')
      _ = transform_fn_with_cache | tft_beam.WriteTransformFn(
          transform_fn_with_cache_dir)

      expected_accumulators = {
          b'__v0__VocabularyAccumulate[vocabulary]-\xd3\xe0p\x82\xb1\xa0z\xa3S\xd7N8@\x8f\xa2\xd7\xa1\x9e\xac;':
              [
                  b'["a", [2, [0.0, 1.0], [0.0, 0.0], 1.0]]',
                  b'["b", [2, [0.5, 0.5], [0.0, 0.0], 1.0]]'
              ],
          b'__v0__VocabularyAccumulate[vocabulary_1]-A\xc7_0\xee\xff\x88@E<\xde\xcb\x8d\xff5\xebyZZ\x8d':
              [
                  b'["a", [2, [0.0, 1.0], [0.0, 0.0], 1.0]]',
                  b'["b", [2, [0.5, 0.5], [0.0, 0.0], 1.0]]'
              ],
          b"__v0__VocabularyAccumulate[vocabulary_2]-\x97\x1c>\x851\x94'\xdc\xdf\xfd\xcc\x86\xb7\xb8\xe1\xe8*\x89B\t":
              [b'["a", 1.5]', b'["b", 1.75]'],
      }
      self.assertCountEqual(output_cache.keys(), spans)
      for span in spans:
        span_cache = output_cache[span]
        self.assertCountEqual(span_cache.keys(),
                              expected_accumulators.keys())
        for idx, cache_key in enumerate(expected_accumulators):
          beam_test_util.assert_that(
              span_cache[cache_key],
              beam_test_util.equal_to(expected_accumulators[cache_key]),
              label='AssertCache[{}][{}]'.format(span, idx))

    # 4 from analysis on each of the input spans; every cache entry is newly
    # encoded and none is decoded.
    check_counters(p, (
        ('num_instances', 8),
        ('cache_entries_decoded', 0),
        ('cache_entries_encoded', 6),
        ('saved_models_created', 2),
    ))

    with _TestPipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(input_data * 2)

      transform_fn_no_cache = (
          (flat_data, input_metadata)
          | beam_impl.AnalyzeDataset(preprocessing_fn))

      transform_fn_no_cache_dir = os.path.join(self.base_test_dir,
                                               'transform_fn_no_cache')
      _ = transform_fn_no_cache | tft_beam.WriteTransformFn(
          transform_fn_no_cache_dir)

    # The same 8 instances are analyzed, with no cache entries encoded or
    # decoded.
    check_counters(p, (
        ('num_instances', 8),
        ('cache_entries_decoded', 0),
        ('cache_entries_encoded', 0),
        ('saved_models_created', 2),
    ))

    tft_output_cache = tft.TFTransformOutput(transform_fn_with_cache_dir)
    tft_output_no_cache = tft.TFTransformOutput(transform_fn_no_cache_dir)

    # Every vocabulary file must be identical between the two runs.
    for vocab_filename in vocab_names:
      cache_path = tft_output_cache.vocabulary_file_by_name(vocab_filename)
      no_cache_path = tft_output_no_cache.vocabulary_file_by_name(
          vocab_filename)
      with tf.io.gfile.GFile(cache_path, 'rb') as cached_file:
        with tf.io.gfile.GFile(no_cache_path, 'rb') as uncached_file:
          self.assertEqual(
              cached_file.readlines(), uncached_file.readlines(),
              'vocab with cache != vocab without cache for: {}'.format(
                  vocab_filename))
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   setup_file,
                   ts1,
                   ts2,
                   project=None,
                   max_rows=None,
                   mode=None,
                   stage=None,
                   preprocessing_fn=None):
    """Run the tf.transform pipeline that analyzes and transforms the data.

    Args:
      input_handle: BigQuery table name to process, specified as
        DATASET.TABLE, or a path to a csv file with input data. Any handle
        containing 'csv' (case-insensitive) is read as a text file.
      outfile_prefix: Filename prefix for emitted transformed examples.
      working_dir: Directory in which transformed examples and the transform
        function will be emitted. Its 'tmp' subdirectory is used as the temp
        location.
      setup_file: Value of the 'setup_file' pipeline option in 'cloud' mode.
      ts1: Forwarded to make_sql when querying BigQuery (presumably a time
        bound of the query -- confirm against make_sql).
      ts2: Forwarded to make_sql when querying BigQuery (presumably a time
        bound of the query -- confirm against make_sql).
      project: Value of the 'project' pipeline option.
      max_rows: Number of rows to query from BigQuery.
      mode: 'local' to use the DirectRunner or 'cloud' to use the
        DataFlowRunner.
      stage: Name of the stage being processed; defaults to 'train'. Used in
        the cloud job name, in the BigQuery query and in the name of the csv
        dump of queried data.
      preprocessing_fn: Optional tf.transform preprocessing callback. When
        omitted, the default preprocessing defined in this function is used.

    Raises:
      ValueError: if mode is neither 'local' nor 'cloud'.
    """
    def def_preprocessing_fn(inputs):
        """Default tf.transform callback for preprocessing inputs.

        Args:
          inputs: Map from feature keys to raw not-yet-transformed features.

        Returns:
          Map from string feature key to transformed feature operations.
        """
        transformed = {}

        # Dense floats are scaled to a z-score.
        for name in ts.DENSE_FLOAT_FEATURE_KEYS:
            transformed[name] = transform.scale_to_z_score(inputs[name])

        # Vocabulary features are mapped to integer ids.
        for name in ts.VOCAB_FEATURE_KEYS:
            transformed[name] = transform.string_to_int(
                inputs[name],
                top_k=ts.VOCAB_SIZE,
                num_oov_buckets=ts.OOV_SIZE)

        # Bucket features are bucketized into a fixed number of buckets.
        for name in ts.BUCKET_FEATURE_KEYS:
            transformed[name] = transform.bucketize(
                inputs[name], ts.FEATURE_BUCKET_COUNT)

        # Categorical features pass through unchanged.
        for name in ts.CATEGORICAL_FEATURE_KEYS:
            transformed[name] = inputs[name]

        def convert_label(label):
            # Label is 1 when `label` is more than 20% of the fare, and 0
            # otherwise or when the fare is NaN.
            fare = inputs[ts.FARE_KEY]
            fare_is_nan = tf.is_nan(fare)
            zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
            above_threshold = tf.cast(
                tf.greater(label, tf.multiply(fare, tf.constant(0.2))),
                tf.int64)
            return tf.where(fare_is_nan, zero_label, above_threshold)

        transformed[ts.LABEL_KEY] = transform.apply_function(
            convert_label, inputs[ts.LABEL_KEY])

        return transformed

    if not preprocessing_fn:
        preprocessing_fn = def_preprocessing_fn

    print('ts1 %s, ts2 %s' % (ts1, ts2))
    raw_feature_spec = ts.get_raw_feature_spec()
    raw_data_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(raw_feature_spec))
    temp_dir = os.path.join(working_dir, 'tmp')
    if stage is None:
        stage = 'train'

    # Pick the runner and its options from the execution mode.
    if mode == 'local':
        runner = 'DirectRunner'
        options = {'project': project}
    elif mode == 'cloud':
        runner = 'DataFlowRunner'
        options = {
            'job_name': 'tft-' + stage + '-' + str(uuid.uuid4()),
            'temp_location': temp_dir,
            'project': project,
            'save_main_session': True,
            'setup_file': setup_file
        }
    else:
        raise ValueError("Invalid mode %s." % mode)
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)

    with beam.Pipeline(runner, options=pipeline_options) as pipeline:
        with beam_impl.Context(temp_dir=temp_dir):
            csv_coder = ts.make_csv_coder()
            reads_csv = 'csv' in input_handle.lower()

            if reads_csv:
                text_lines = (
                    pipeline
                    | 'ReadFromText' >> beam.io.ReadFromText(
                        input_handle, skip_header_lines=1))
                source_rows = (
                    text_lines | 'ParseCSV' >> beam.Map(csv_coder.decode))
            else:
                query = make_sql(input_handle,
                                 ts1,
                                 ts2,
                                 stage,
                                 max_rows=max_rows,
                                 for_eval=False)
                source_rows = (
                    pipeline
                    | 'ReadBigQuery' >> beam.io.Read(
                        beam.io.BigQuerySource(query=query,
                                               use_standard_sql=True)))

            raw_data = source_rows | 'CleanData' >> beam.Map(
                ts.clean_raw_data_dict)

            # Analyze the cleaned data and persist the resulting transform_fn.
            transform_fn = (
                (raw_data, raw_data_metadata)
                | 'Analyze' >> beam_impl.AnalyzeDataset(preprocessing_fn))
            _ = transform_fn | (
                'WriteTransformFn' >>
                transform_fn_io.WriteTransformFn(working_dir))

            # Shuffling the data before materialization will improve training
            # effectiveness downstream.
            shuffled_data = (
                raw_data | 'RandomizeData' >> beam.transforms.Reshuffle())

            transformed_data, transformed_metadata = (
                ((shuffled_data, raw_data_metadata), transform_fn)
                | 'Transform' >> beam_impl.TransformDataset())

            if not reads_csv:
                # Data came from BigQuery: also dump the cleaned rows as a
                # single-shard csv named after the stage.
                csv_dump_path = os.path.join(working_dir,
                                             '{}.csv'.format(stage))
                _ = (raw_data
                     | beam.Map(csv_coder.encode)
                     | beam.io.WriteToText(csv_dump_path, num_shards=1))

            # Serialize the transformed rows and write them gzip-compressed.
            example_coder = example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema)
            serialized = transformed_data | 'SerializeExamples' >> beam.Map(
                example_coder.encode)
            _ = serialized | 'WriteExamples' >> beam.io.WriteToTFRecord(
                os.path.join(working_dir, outfile_prefix),
                compression_type=beam.io.filesystem.CompressionTypes.GZIP)
Beispiel #16
0
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold, delimiter):
    """Run pre-processing step as a pipeline.

    Analyzes the training data, writes the resulting transform function, and
    writes transformed training and eval examples as gzipped TFRecords. If
    predict_data is given, it is additionally written both as TFRecords and
    as text.

    Args:
      pipeline: beam pipeline
      training_data: file paths to input csv files.
      eval_data: file paths to input csv files.
      predict_data: file paths to input csv files. When falsy, no predict
        data is written.
      output_dir: file path to where to write all the output files.
      frequency_threshold: frequency threshold to use for categorical values.
      delimiter: the column delimiter for the CSV format.
    """
    # 1) The schema can be either defined in-memory or read from a configuration
    #    file, in this case we are creating the schema in-memory.
    input_schema = criteo.make_input_schema()

    # 2) Configure the coder to map the source file column names to a dictionary
    #    of key -> tensor_proto with the appropriate type derived from the
    #    input_schema.
    coder = criteo.make_csv_coder(input_schema, delimiter)

    # 3) Read from text using the coder.
    train_data = (pipeline
                  | 'ReadTrainingData' >> beam.io.ReadFromText(training_data)
                  | 'ParseTrainingCsv' >> beam.Map(coder.decode))

    evaluate_data = (pipeline
                     | 'ReadEvalData' >> beam.io.ReadFromText(eval_data)
                     | 'ParseEvalCsv' >> beam.Map(coder.decode))

    input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)
    _ = (input_metadata
         | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
             os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
             pipeline=pipeline))

    # Only the training data is analyzed; the resulting transform_fn is then
    # applied to both the training and the eval data below.
    preprocessing_fn = criteo.make_preprocessing_fn(frequency_threshold)
    transform_fn = ((train_data, input_metadata)
                    | 'Analyze' >> tft.AnalyzeDataset(preprocessing_fn))

    # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
    # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
    # path_constants.TRANSFORMED_METADATA_DIR.
    _ = (transform_fn
         | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

    @beam.ptransform_fn
    def TransformAndWrite(pcoll, path):  # pylint: disable=invalid-name
        """Shuffle, transform and write `pcoll` as TFRecords at `path`."""
        pcoll |= 'Shuffle' >> _Shuffle()  # pylint: disable=no-value-for-parameter
        (dataset, metadata) = (((pcoll, input_metadata), transform_fn)
                               | 'Transform' >> tft.TransformDataset())
        # Named distinctly from the enclosing function's CSV `coder`.
        example_coder = coders.ExampleProtoCoder(metadata.schema)
        _ = (dataset
             | 'SerializeExamples' >> beam.Map(example_coder.encode)
             | 'WriteExamples' >> beam.io.WriteToTFRecord(
                 os.path.join(output_dir, path),
                 file_name_suffix='.tfrecord.gz'))

    _ = train_data | 'TransformAndWriteTraining' >> TransformAndWrite(  # pylint: disable=no-value-for-parameter
        path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX)

    _ = evaluate_data | 'TransformAndWriteEval' >> TransformAndWrite(  # pylint: disable=no-value-for-parameter
        path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX)

    # TODO(b/35300113) Remember to eventually also save the statistics.

    if predict_data:
        predict_mode = tf.contrib.learn.ModeKeys.INFER
        predict_schema = criteo.make_input_schema(mode=predict_mode)
        # NOTE(review): unlike the training coder above, this coder is built
        # without `delimiter` -- confirm that make_csv_coder's default
        # delimiter matches the predict files.
        csv_coder = criteo.make_csv_coder(predict_schema, mode=predict_mode)
        predict_coder = coders.ExampleProtoCoder(predict_schema)
        serialized_examples = (
            pipeline
            | 'ReadPredictData' >> beam.io.ReadFromText(predict_data)
            | 'ParsePredictCsv' >> beam.Map(csv_coder.decode)
            # TODO(b/35194257) Obviate the need for this explicit serialization.
            | 'EncodePredictData' >> beam.Map(predict_coder.encode))
        # The serialized predict examples are written twice under the same
        # file prefix: once as gzipped TFRecords and once as text.
        _ = (serialized_examples
             | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.tfrecord.gz'))
        _ = (serialized_examples
             | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
             | 'WritePredictDataAsText' >> beam.io.WriteToText(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.txt'))
Beispiel #17
0
      movies_data=movies_sideinput,
      rating_threshold=args.eval_score_threshold,
      is_ranking_problem=(args.eval_type == RANKING),
      is_train=False,
      num_ranking_candidate_movie_ids=args.num_ranking_candidate_movie_ids)

  # TFTransform based preprocessing.
  raw_metadata = dataset_metadata.DatasetMetadata(
      schema=movielens.make_examples_schema())
  _ = (raw_metadata
       | 'WriteRawMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(args.output_dir, 'raw_metadata'), pipeline))

  preprocessing_fn = movielens.make_preprocessing_fn()
  transform_fn = ((train_data, raw_metadata)
                  | 'Analyze' >> tft.AnalyzeDataset(preprocessing_fn))

  _ = (transform_fn
       | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(args.output_dir))

  @beam.ptransform_fn
  def TransformAndWrite(pcoll, path):  # pylint: disable=invalid-name
    """Shuffle, transform and write `pcoll` as gzipped TFRecords at `path`.

    Args:
      pcoll: PCollection of raw examples described by `raw_metadata`.
      path: Output location relative to `args.output_dir`.
    """
    shuffled = pcoll | 'Shuffle' >> _Shuffle()  # pylint: disable=no-value-for-parameter
    transform_input = ((shuffled, raw_metadata), transform_fn)
    transformed, transformed_metadata = (
        transform_input | 'Transform' >> tft.TransformDataset())

    # Serialize each transformed row with the transformed schema.
    example_coder = tft_coders.ExampleProtoCoder(transformed_metadata.schema)
    serialized = transformed | 'SerializeExamples' >> beam.Map(
        example_coder.encode)
    _ = serialized | 'WriteExamples' >> beam.io.WriteToTFRecord(
        os.path.join(args.output_dir, path),
        file_name_suffix='.tfrecord.gz')
Beispiel #18
0
def run(flags, pipeline_args):
    """Run Apache Beam pipeline to generate TFRecords for Survival Analysis.

    The pipeline reads the input files, attaches labels, splits the records
    into train/eval/test sets, analyzes the training set once to obtain a
    transform function, and then applies that function to every split before
    writing each split out.

    Args:
      flags: Parsed options. This function reads `machine_type`,
        `output_dir`, `cloud` and `input_dir`.
      pipeline_args: Mapping that is expanded as keyword arguments into
        `PipelineOptions`.
    """
    options = PipelineOptions(flags=[], **pipeline_args)
    options.view_as(WorkerOptions).machine_type = flags.machine_type
    temp_dir = os.path.join(flags.output_dir, 'tmp')
    runner = 'DataflowRunner' if flags.cloud else 'DirectRunner'

    files = tf.gfile.Glob(flags.input_dir + "*")
    if not flags.cloud:
        # If running locally for testing, process fewer files.
        files = files[:20]

    logging.warning("Number of files: %d", len(files))
    labels = get_labels_array(
        "gs://columbia-dl-storage-bucket/ADNI_t1_list_with_fsstatus_20190111.csv"
    )

    with beam.Pipeline(runner, options=options) as p:
        with tft_beam.Context(temp_dir=temp_dir):

            input_metadata = dataset_metadata.DatasetMetadata(
                dataset_schema.from_feature_spec(features.RAW_FEATURE_SPEC))

            filenames = (p | 'Create filenames' >> beam.Create(files))
            nii = (filenames | 'Read NII' >> beam.Map(read_nii))
            nii_with_labels = (
                nii
                | 'Get Label' >> beam.FlatMap(lambda x: read_label(x, labels)))

            raw_train, raw_eval, raw_test = (
                nii_with_labels | 'RandomlySplitData' >> randomly_split(
                    train_size=.7, validation_size=.15, test_size=.15))

            # Each split element is indexed at [1] and flattened into
            # individual records.
            raw_train = raw_train | 'FlattenTrain' >> beam.FlatMap(
                lambda x: x[1])
            raw_eval = (raw_eval
                        | 'FlattenEval' >> beam.FlatMap(lambda x: x[1]))
            raw_test = (raw_test
                        | 'FlattenTest' >> beam.FlatMap(lambda x: x[1]))

            # The transform is applied for its side effect only.
            _ = raw_train | 'CountLabelFreq' >> extractAndCount(
                flags.output_dir)

            # A single analysis pass over the training data yields the
            # transform function; every split (Train included) is transformed
            # with it in the loop below, so no combined analyze-and-transform
            # stage is needed.
            transform_fn = (
                (raw_train, input_metadata)
                |
                'AnalyzeTrain' >> tft_beam.AnalyzeDataset(features.preprocess))
            _ = (transform_fn
                 | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(
                     flags.output_dir))
            for dataset_type, dataset in [('Train', raw_train),
                                          ('Eval', raw_eval),
                                          ('Predict', raw_test)]:

                transform_label = 'Transform{}'.format(dataset_type)
                t, metadata = (((dataset, input_metadata), transform_fn)
                               |
                               transform_label >> tft_beam.TransformDataset())
                if dataset_type == 'Train':
                    # The transformed metadata is written once, from the
                    # Train split.
                    _ = (metadata
                         | 'WriteMetadata' >>
                         tft_beam_io.WriteMetadata(os.path.join(
                             flags.output_dir, 'transformed_metadata'),
                                                   pipeline=p))
                write_label = 'Write{}TFRecord'.format(dataset_type)
                _ = t | write_label >> WriteTFRecord(
                    dataset_type, flags.output_dir, metadata)
Beispiel #19
0
  def assertAnalyzeAndTransformResults(self,
                                       input_data,
                                       input_metadata,
                                       preprocessing_fn,
                                       expected_data=None,
                                       expected_metadata=None,
                                       only_check_core_metadata=False,
                                       expected_vocab_file_contents=None,
                                       expected_asset_file_contents=None,
                                       test_data=None,
                                       desired_batch_size=None):
    """Assert that input data and metadata is transformed as expected.

    This method asserts transformed data and transformed metadata match
    with expected_data and expected_metadata.

    Args:
      input_data: A sequence of dicts whose values are
          either strings, lists of strings, numeric types or a pair of those.
      input_metadata: DatasetMetadata describing input_data.
      preprocessing_fn: A function taking a dict of tensors and returning
          a dict of tensors.
      expected_data: (optional) A dataset with the same type constraints as
          input_data, but representing the output after transformation.
          If supplied, transformed data is asserted to be equal.
      expected_metadata: (optional) DatasetMetadata describing the transformed
          data. If supplied, transformed metadata is asserted to be equal.
      only_check_core_metadata: A boolean to indicate if all elements in
          the transformed metadata is asserted to be equal to expected metadata.
          If True, only transformed feature names, dtypes and representations
          are asserted.
      expected_vocab_file_contents: (optional) A dictionary from vocab filenames
          to their expected content as a list of text lines.  Values should be
          the expected result of calling f.readlines() on the given asset files.
      expected_asset_file_contents: deprecated.  Use
          expected_vocab_file_contents.
      test_data: (optional) If this is provided then instead of calling
          AnalyzeAndTransformDataset with input_data, this function will call
          AnalyzeDataset with input_data and TransformDataset with test_data.
          Note that this is the case even if input_data and test_data are equal.
          test_data should also conform to input_metadata.
      desired_batch_size: (optional) A batch size to batch elements by. If not
          provided, a batch size will be computed automatically.
    Raises:
      AssertionError: if the expected data does not match the results of
          transforming input_data according to preprocessing_fn, or
          (if provided) if the expected metadata does not match.
      ValueError: if expected_vocab_file_contents and
          expected_asset_file_contents are both set.
    """
    if (expected_vocab_file_contents is not None and
        expected_asset_file_contents is not None):
      raise ValueError('only one of expected_vocab_file_contents and '
                       'expected_asset_file_contents should be set')
    elif expected_asset_file_contents is not None:
      tf.logging.warn('expected_asset_file_contents is deprecated, use '
                      'expected_vocab_file_contents')

    # Fold the deprecated argument into the current one.
    expected_vocab_file_contents = (
        expected_vocab_file_contents or expected_asset_file_contents or {})
    del expected_asset_file_contents

    # Note: we don't separately test AnalyzeDataset and TransformDataset as
    # AnalyzeAndTransformDataset currently simply composes these two
    # transforms.  If in future versions of the code, the implementation
    # differs, we should also run AnalyzeDataset and TransformDataset composed.
    temp_dir = self.get_temp_dir()
    with beam_impl.Context(
        temp_dir=temp_dir, desired_batch_size=desired_batch_size):
      if test_data is None:
        (transformed_data, transformed_metadata), transform_fn = (
            (input_data, input_metadata)
            | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      else:
        transform_fn = ((input_data, input_metadata)
                        | beam_impl.AnalyzeDataset(preprocessing_fn))
        transformed_data, transformed_metadata = (
            ((test_data, input_metadata), transform_fn)
            | beam_impl.TransformDataset())

      # Write transform_fn so we can test its assets
      if expected_vocab_file_contents:
        _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

    if expected_data is not None:
      self.assertDataCloseOrEqual(expected_data, transformed_data)

    if expected_metadata:
      # Now that the pipeline has run, transformed_metadata.deferred_metadata
      # should be a list containing a single DatasetMetadata with the full
      # metadata.
      assert len(transformed_metadata.deferred_metadata) == 1
      transformed_metadata = transformed_metadata.deferred_metadata[0]

      if only_check_core_metadata:
        # preprocessing_fn may add metadata to column schema only relevant to
        # internal implementation such as vocabulary_file. As such, only check
        # feature names, dtypes and representations are as expected.
        self.assertSameElements(
            transformed_metadata.schema.column_schemas.keys(),
            expected_metadata.schema.column_schemas.keys())
        # six.iteritems keeps this loop working on both Python 2 and 3.
        for k, v in six.iteritems(transformed_metadata.schema.column_schemas):
          expected_schema = expected_metadata.schema.column_schemas[k]
          self.assertEqual(expected_schema.representation, v.representation,
                           "representation doesn't match for feature '%s'" % k)
          self.assertEqual(expected_schema.domain.dtype, v.domain.dtype,
                           "dtype doesn't match for feature '%s'" % k)
      else:
        # Check the entire DatasetMetadata is as expected.
        # Use extra assertEqual for schemas, since full metadata assertEqual
        # error message is not conducive to debugging.
        self.assertEqual(expected_metadata.schema.column_schemas,
                         transformed_metadata.schema.column_schemas)
        self.assertEqual(expected_metadata, transformed_metadata)

    tf_transform_output = tft.TFTransformOutput(temp_dir)
    for filename, file_contents in six.iteritems(expected_vocab_file_contents):
      full_filename = tf_transform_output.vocabulary_file_by_name(filename)
      with tf.gfile.Open(full_filename) as f:
        self.assertEqual(f.readlines(), file_contents)
Beispiel #20
0
    def assertAnalyzeAndTransformResults(self,
                                         input_data,
                                         input_metadata,
                                         preprocessing_fn,
                                         expected_data=None,
                                         expected_metadata=None,
                                         only_check_core_metadata=False):
        """Assert that input data and metadata is transformed as expected.

        This method asserts transformed data and transformed metadata match
        with expected_data and expected_metadata.

        Args:
          input_data: A sequence of dicts whose values are
              either strings, lists of strings, numeric types or a pair of
              those.
          input_metadata: DatasetMetadata describing input_data.
          preprocessing_fn: A function taking a dict of tensors and returning
              a dict of tensors.
          expected_data: (optional) A dataset with the same type constraints as
              input_data, but representing the output after transformation.
              If supplied (i.e. not None), transformed data is asserted to be
              equal.
          expected_metadata: (optional) DatasetMetadata describing the
              transformed data. If supplied, transformed metadata is asserted
              to be equal.
          only_check_core_metadata: A boolean to indicate if all elements in
              the transformed metadata is asserted to be equal to expected
              metadata. If True, only transformed feature names, dtypes and
              representations are asserted.
        Raises:
          AssertionError: if the expected data does not match the results of
              transforming input_data according to preprocessing_fn, or
              (if provided) if the expected metadata does not match.
        """
        # Note: we don't separately test AnalyzeDataset and TransformDataset as
        # AnalyzeAndTransformDataset currently simply composes these two
        # transforms.  If in future versions of the code, the implementation
        # differs, we should also run AnalyzeDataset and TransformDataset
        # composed.
        #
        # Also, the dataset_metadata that is returned along with
        # `transformed_data` is incomplete as it does not contain the deferred
        # components, so we instead inspect the metadata returned along with the
        # transform function.
        temp_dir = self.get_temp_dir()
        with beam_impl.Context(temp_dir=temp_dir):
            transform_fn, transformed_metadata = (
                (input_data, input_metadata)
                |
                'AnalyzeDataset' >> beam_impl.AnalyzeDataset(preprocessing_fn))
            transformed_data, _ = (
                ((input_data, input_metadata),
                 (transform_fn, transformed_metadata))
                | 'TransformDataset' >> beam_impl.TransformDataset())

        # Compare against None so that an explicitly supplied empty dataset is
        # still checked rather than silently skipped.
        if expected_data is not None:
            self.assertDataCloseOrEqual(expected_data, transformed_data)

        if not expected_metadata:
            return

        transformed_metadata = self._resolveDeferredMetadata(
            transformed_metadata)

        if only_check_core_metadata:
            # preprocessing_fn may add metadata to column schema only relevant to
            # internal implementation such as vocabulary_file. As such, only check
            # feature names, dtypes and representations are as expected.
            self.assertSameElements(
                transformed_metadata.schema.column_schemas.keys(),
                expected_metadata.schema.column_schemas.keys())

            # .items() works on both Python 2 and Python 3 dicts.
            for k, v in transformed_metadata.schema.column_schemas.items():
                expected_schema = expected_metadata.schema.column_schemas[k]

                self.assertEqual(
                    expected_schema.representation, v.representation,
                    "representation doesn't match for feature '%s'" % k)
                self.assertEqual(expected_schema.domain.dtype, v.domain.dtype,
                                 "dtype doesn't match for feature '%s'" % k)

        else:
            # Check the entire DatasetMetadata is as expected.
            # Use extra assertEqual for schemas, since full metadata assertEqual
            # error message is not conducive to debugging.
            self.assertEqual(expected_metadata.schema.column_schemas,
                             transformed_metadata.schema.column_schemas)

            self.assertEqual(expected_metadata, transformed_metadata)
Beispiel #21
0
    def test_non_frequency_vocabulary_merge(self):
        """Vocabularies built with a cache must equal those built without one."""

        mi_vocab_name = 'mutual_information_vocab'
        adjusted_mi_vocab_name = 'adjusted_mutual_information_vocab'
        weighted_frequency_vocab_name = 'weighted_frequency_vocab'
        all_vocab_names = (mi_vocab_name, adjusted_mi_vocab_name,
                           weighted_frequency_vocab_name)

        def preprocessing_fn(inputs):
            # Three vocabularies over the same string feature: plain mutual
            # information, adjusted mutual information, and weighted
            # frequency. The call order is kept fixed.
            tokens = inputs['s']
            tft.vocabulary(
                tokens,
                labels=inputs['label'],
                store_frequency=True,
                vocab_filename=mi_vocab_name,
                min_diff_from_avg=0.1,
                use_adjusted_mutual_info=False)
            tft.vocabulary(
                tokens,
                labels=inputs['label'],
                store_frequency=True,
                vocab_filename=adjusted_mi_vocab_name,
                min_diff_from_avg=1.0,
                use_adjusted_mutual_info=True)
            tft.vocabulary(
                tokens,
                weights=inputs['weight'],
                store_frequency=True,
                vocab_filename=weighted_frequency_vocab_name,
                use_adjusted_mutual_info=False)
            return inputs

        spans = ['span-0', 'span-1']

        input_data = [
            {'s': 'a', 'weight': 1, 'label': 1},
            {'s': 'a', 'weight': 0.5, 'label': 1},
            {'s': 'b', 'weight': 0.75, 'label': 1},
            {'s': 'b', 'weight': 1, 'label': 0},
        ]
        feature_spec = {
            's': tf.io.FixedLenFeature([], tf.string),
            'label': tf.io.FixedLenFeature([], tf.int64),
            'weight': tf.io.FixedLenFeature([], tf.float32),
        }
        input_metadata = dataset_metadata.DatasetMetadata(
            dataset_schema.from_feature_spec(feature_spec))

        # Both spans carry the very same records.
        input_data_dict = {span: input_data for span in spans}

        expected_accumulators = {
            '__v0__VocabularyAccumulate--vocabulary--':
            [b'["a", [2, 1.0, 0.0, 1.0]]', b'["b", [2, 0.5, 0.0, 1.0]]'],
            '__v0__VocabularyAccumulate--vocabulary_1--':
            [b'["a", [2, 1.0, 0.0, 1.0]]', b'["b", [2, 0.5, 0.0, 1.0]]'],
            '__v0__VocabularyAccumulate--vocabulary_2--':
            [b'["a", 1.5]', b'["b", 1.75]'],
        }

        with beam_impl.Context(temp_dir=self.get_temp_dir()):

            flat_data = input_data_dict.values() | 'Flatten' >> beam.Flatten()

            analysis_inputs = (flat_data, input_data_dict, {}, input_metadata)
            transform_fn_with_cache, output_cache = (
                analysis_inputs
                | beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))

            # Every span must expose exactly the expected accumulator entries.
            self.assertCountEqual(output_cache.keys(), spans)
            for span in spans:
                span_cache = output_cache[span]
                self.assertCountEqual(span_cache.keys(),
                                      expected_accumulators.keys())
                for cache_key, expected_entries in (
                        expected_accumulators.items()):
                    self.assertCountEqual(span_cache[cache_key],
                                          expected_entries)

            # The uncached run sees the records of both spans concatenated.
            transform_fn_no_cache = (
                (input_data * 2, input_metadata)
                | beam_impl.AnalyzeDataset(preprocessing_fn))

        transform_fn_with_cache_dir = os.path.join(self.base_test_dir,
                                                   'transform_fn_with_cache')
        _ = transform_fn_with_cache | tft_beam.WriteTransformFn(
            transform_fn_with_cache_dir)

        transform_fn_no_cache_dir = os.path.join(self.base_test_dir,
                                                 'transform_fn_no_cache')
        _ = transform_fn_no_cache | tft_beam.WriteTransformFn(
            transform_fn_no_cache_dir)

        tft_output_cache = tft.TFTransformOutput(transform_fn_with_cache_dir)
        tft_output_no_cache = tft.TFTransformOutput(transform_fn_no_cache_dir)

        for vocab_filename in all_vocab_names:
            cache_path = tft_output_cache.vocabulary_file_by_name(
                vocab_filename)
            no_cache_path = tft_output_no_cache.vocabulary_file_by_name(
                vocab_filename)
            with tf.io.gfile.GFile(cache_path, 'rb') as cached_file:
                with tf.io.gfile.GFile(no_cache_path, 'rb') as uncached_file:
                    self.assertEqual(
                        cached_file.readlines(), uncached_file.readlines(),
                        'vocab with cache != vocab without cache for: {}'
                        .format(vocab_filename))
Beispiel #22
0
    def assertAnalyzeAndTransformResults(self,
                                         input_data,
                                         input_metadata,
                                         preprocessing_fn,
                                         expected_data=None,
                                         expected_metadata=None,
                                         only_check_core_metadata=False,
                                         expected_vocab_file_contents=None,
                                         expected_asset_file_contents=None,
                                         test_data=None,
                                         desired_batch_size=None,
                                         beam_pipeline=None):
        """Assert that input data and metadata is transformed as expected.

        This method asserts transformed data and transformed metadata match
        with expected_data and expected_metadata.

        Args:
          input_data: A sequence of dicts whose values are
              either strings, lists of strings, numeric types or a pair of
              those.
          input_metadata: DatasetMetadata describing input_data.
          preprocessing_fn: A function taking a dict of tensors and returning
              a dict of tensors.
          expected_data: (optional) A dataset with the same type constraints as
              input_data, but representing the output after transformation.
              If supplied, transformed data is asserted to be equal.
          expected_metadata: (optional) DatasetMetadata describing the
              transformed data. If supplied, transformed metadata is asserted
              to be equal.
          only_check_core_metadata: A boolean to indicate if all elements in
              the transformed metadata is asserted to be equal to expected
              metadata. If True, only transformed feature names, dtypes and
              representations are asserted.
          expected_vocab_file_contents: (optional) A dictionary from vocab
              filenames to their expected content, either as a list of text
              lines (compared with the file's lines after stripping newlines)
              or as a list of `(text, frequency)` tuples, which are compared
              against the `frequency text` lines of the file.
          expected_asset_file_contents: deprecated.  Use
              expected_vocab_file_contents.
          test_data: (optional) If this is provided then instead of calling
              AnalyzeAndTransformDataset with input_data, this function will
              call AnalyzeDataset with input_data and TransformDataset with
              test_data. Note that this is the case even if input_data and
              test_data are equal. test_data should also conform to
              input_metadata.
          desired_batch_size: (optional) A batch size to batch elements by. If
              not provided, a batch size will be computed automatically.
          beam_pipeline: (optional) A Beam Pipeline to use in this test.
        Raises:
          AssertionError: if the expected data does not match the results of
              transforming input_data according to preprocessing_fn, or
              (if provided) if the expected metadata does not match.
          ValueError: if expected_vocab_file_contents and
              expected_asset_file_contents are both set.
        """
        if (expected_vocab_file_contents is not None
                and expected_asset_file_contents is not None):
            raise ValueError('only one of expected_vocab_file_contents and '
                             'expected_asset_file_contents should be set')
        elif expected_asset_file_contents is not None:
            tf.logging.warn('expected_asset_file_contents is deprecated, use '
                            'expected_vocab_file_contents')

        # Fold the deprecated argument into the current one.
        expected_vocab_file_contents = (expected_vocab_file_contents
                                        or expected_asset_file_contents or {})
        del expected_asset_file_contents

        # Note: we don't separately test AnalyzeDataset and TransformDataset as
        # AnalyzeAndTransformDataset currently simply composes these two
        # transforms.  If in future versions of the code, the implementation
        # differs, we should also run AnalyzeDataset and TransformDataset
        # composed.
        temp_dir = tempfile.mkdtemp(prefix=self._testMethodName,
                                    dir=self.get_temp_dir())
        with beam_pipeline or beam.Pipeline(
                runner=self._makeRunner()) as pipeline:
            with beam_impl.Context(temp_dir=temp_dir,
                                   desired_batch_size=desired_batch_size):
                input_data = pipeline | 'CreateInput' >> beam.Create(
                    input_data)
                if test_data is None:
                    (transformed_data, transformed_metadata), transform_fn = (
                        (input_data, input_metadata)
                        |
                        beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
                else:
                    transform_fn = ((input_data, input_metadata)
                                    |
                                    beam_impl.AnalyzeDataset(preprocessing_fn))
                    test_data = pipeline | 'CreateTest' >> beam.Create(
                        test_data)
                    transformed_data, transformed_metadata = (
                        ((test_data, input_metadata), transform_fn)
                        | beam_impl.TransformDataset())

                # Write transform_fn so we can test its assets
                _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

                if expected_data is not None:
                    # The transformed examples are written to disk inside the
                    # pipeline and read back once the pipeline has finished.
                    transformed_data_coder = tft.coders.ExampleProtoCoder(
                        transformed_metadata.schema)

                    transformed_data_path = os.path.join(
                        temp_dir, 'transformed_data')
                    _ = (transformed_data
                         | beam.Map(transformed_data_coder.encode)
                         | beam.io.tfrecordio.WriteToTFRecord(
                             transformed_data_path, shard_name_template=''))

        if expected_data is not None:
            examples = tf.python_io.tf_record_iterator(
                path=transformed_data_path)
            transformed_data = [
                transformed_data_coder.decode(x) for x in examples
            ]
            self.assertDataCloseOrEqual(expected_data, transformed_data)

        tf_transform_output = tft.TFTransformOutput(temp_dir)
        if expected_metadata:
            transformed_metadata = tf_transform_output.transformed_metadata

            if only_check_core_metadata:
                # preprocessing_fn may add metadata to column schema only relevant to
                # internal implementation such as vocabulary_file. As such, only check
                # feature names, dtypes and representations are as expected.
                self.assertSameElements(
                    transformed_metadata.schema.column_schemas.keys(),
                    expected_metadata.schema.column_schemas.keys())
                # six.iteritems keeps this loop working on Python 2 and 3.
                for k, v in six.iteritems(
                        transformed_metadata.schema.column_schemas):
                    expected_schema = expected_metadata.schema.column_schemas[
                        k]
                    self.assertEqual(
                        expected_schema.representation, v.representation,
                        "representation doesn't match for feature '%s'" % k)
                    self.assertEqual(
                        expected_schema.domain.dtype, v.domain.dtype,
                        "dtype doesn't match for feature '%s'" % k)
            else:
                # Check the entire DatasetMetadata is as expected.
                # Use extra assertEqual for schemas, since full metadata assertEqual
                # error message is not conducive to debugging.
                self.assertEqual(expected_metadata.schema.column_schemas,
                                 transformed_metadata.schema.column_schemas)
                self.assertEqual(expected_metadata, transformed_metadata)

        for filename, file_contents in six.iteritems(
                expected_vocab_file_contents):
            full_filename = tf_transform_output.vocabulary_file_by_name(
                filename)
            with tf.gfile.Open(full_filename) as f:
                file_lines = f.readlines()

                # Store frequency case. The emptiness guard keeps an empty
                # expected vocabulary from failing on the [0] lookup.
                if file_contents and isinstance(file_contents[0], tuple):
                    actual_words = []
                    actual_frequencies = []
                    for content in file_lines:
                        frequency, word = content.split(' ', 1)
                        actual_words.append(word.strip('\n'))
                        actual_frequencies.append(
                            float(frequency.strip('\n')))
                    # Explicit lists instead of indexing zip(...), which is
                    # not subscriptable on Python 3.
                    expected_words = [entry[0] for entry in file_contents]
                    expected_frequencies = [
                        entry[1] for entry in file_contents
                    ]
                    self.assertAllEqual(actual_words, expected_words)
                    np.testing.assert_almost_equal(actual_frequencies,
                                                   expected_frequencies)
                else:
                    file_lines = [
                        content.strip('\n') for content in file_lines
                    ]
                    self.assertAllEqual(file_lines, file_contents)
Beispiel #23
0
    def assertAnalyzeAndTransformResults(self,
                                         input_data,
                                         input_metadata,
                                         preprocessing_fn,
                                         expected_data=None,
                                         expected_metadata=None,
                                         expected_vocab_file_contents=None,
                                         expected_asset_file_contents=None,
                                         test_data=None,
                                         desired_batch_size=None,
                                         beam_pipeline=None,
                                         temp_dir=None):
        """Assert that input data and metadata is transformed as expected.

    This methods asserts transformed data and transformed metadata match
    with expected_data and expected_metadata.

    Args:
      input_data: A sequence of dicts whose values are
          either strings, lists of strings, numeric types or a pair of those.
      input_metadata: DatasetMetadata describing input_data.
      preprocessing_fn: A function taking a dict of tensors and returning
          a dict of tensors.
      expected_data: (optional) A dataset with the same type constraints as
          input_data, but representing the output after transformation.
          If supplied, transformed data is asserted to be equal.
      expected_metadata: (optional) DatasetMetadata describing the transformed
          data. If supplied, transformed metadata is asserted to be equal.
      expected_vocab_file_contents: (optional) A dictionary from vocab filenames
          to their expected content as a list of text lines or a list of tuples
          of frequency and text. Values should be the expected result of calling
          f.readlines() on the given asset files.
      expected_asset_file_contents: deprecated.  Use
          expected_vocab_file_contents.
      test_data: (optional) If this is provided then instead of calling
          AnalyzeAndTransformDataset with input_data, this function will call
          AnalyzeDataset with input_data and TransformDataset with test_data.
          Note that this is the case even if input_data and test_data are equal.
          test_data should also conform to input_metadata.
      desired_batch_size: (optional) A batch size to batch elements by. If not
          provided, a batch size will be computed automatically.
      beam_pipeline: (optional) A Beam Pipeline to use in this test.
      temp_dir: If set, it is used as output directory, else a new unique
          directory is created.
    Raises:
      AssertionError: if the expected data does not match the results of
          transforming input_data according to preprocessing_fn, or
          (if provided) if the expected metadata does not match.
      ValueError: if expected_vocab_file_contents and
          expected_asset_file_contents are both set.
    """
        if (expected_vocab_file_contents is not None
                and expected_asset_file_contents is not None):
            raise ValueError('only one of expected_asset_file_contents and '
                             'expected_asset_file_contents should be set')
        elif expected_asset_file_contents is not None:
            tf.compat.v1.logging.warn(
                'expected_asset_file_contents is deprecated, use '
                'expected_vocab_file_contents')

        expected_vocab_file_contents = (expected_vocab_file_contents
                                        or expected_asset_file_contents or {})
        del expected_asset_file_contents

        # Note: we don't separately test AnalyzeDataset and TransformDataset as
        # AnalyzeAndTransformDataset currently simply composes these two
        # transforms.  If in future versions of the code, the implementation
        # differs, we should also run AnalyzeDataset and TransformDataset composed.
        temp_dir = temp_dir or tempfile.mkdtemp(prefix=self._testMethodName,
                                                dir=self.get_temp_dir())
        with beam_pipeline or self._makeTestPipeline() as pipeline:
            with beam_impl.Context(temp_dir=temp_dir,
                                   desired_batch_size=desired_batch_size):
                input_data = pipeline | 'CreateInput' >> beam.Create(
                    input_data)
                if test_data is None:
                    (transformed_data, transformed_metadata), transform_fn = (
                        (input_data, input_metadata)
                        |
                        beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
                else:
                    transform_fn = ((input_data, input_metadata)
                                    |
                                    beam_impl.AnalyzeDataset(preprocessing_fn))
                    test_data = pipeline | 'CreateTest' >> beam.Create(
                        test_data)
                    transformed_data, transformed_metadata = (
                        ((test_data, input_metadata), transform_fn)
                        | beam_impl.TransformDataset())

                # Write transform_fn so we can test its assets
                _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

                if expected_data is not None:
                    transformed_data_coder = tft.coders.ExampleProtoCoder(
                        transformed_metadata.schema)

                    transformed_data_path = os.path.join(
                        temp_dir, 'transformed_data')
                    _ = (transformed_data
                         | beam.Map(transformed_data_coder.encode)
                         | beam.io.tfrecordio.WriteToTFRecord(
                             transformed_data_path, shard_name_template=''))

        # TODO(ebreck) Log transformed_data somewhere.
        if expected_data is not None:
            examples = tf.compat.v1.python_io.tf_record_iterator(
                path=transformed_data_path)
            transformed_data = [
                transformed_data_coder.decode(x) for x in examples
            ]
            self.assertDataCloseOrEqual(expected_data, transformed_data)

        tf_transform_output = tft.TFTransformOutput(temp_dir)
        if expected_metadata:
            self.assertEqual(expected_metadata,
                             tf_transform_output.transformed_metadata)

        for filename, file_contents in six.iteritems(
                expected_vocab_file_contents):
            full_filename = tf_transform_output.vocabulary_file_by_name(
                filename)
            with tf.io.gfile.GFile(full_filename, 'rb') as f:
                file_lines = f.readlines()

                # Store frequency case.
                if isinstance(file_contents[0], tuple):
                    word_and_frequency_list = []
                    for content in file_lines:
                        frequency, word = content.split(b' ', 1)
                        word_and_frequency_list.append(
                            (word.strip(b'\n'), float(frequency.strip(b'\n'))))
                    expected_words, expected_frequency = zip(
                        *word_and_frequency_list)
                    actual_words, actual_frequency = zip(*file_contents)
                    self.assertAllEqual(expected_words, actual_words)
                    np.testing.assert_almost_equal(expected_frequency,
                                                   actual_frequency)
                else:
                    file_lines = [
                        content.strip(b'\n') for content in file_lines
                    ]
                    self.assertAllEqual(file_lines, file_contents)
Beispiel #24
0
  def assertAnalyzeAndTransformResults(self,
                                       input_data,
                                       input_metadata,
                                       preprocessing_fn,
                                       expected_data=None,
                                       expected_metadata=None,
                                       expected_vocab_file_contents=None,
                                       test_data=None,
                                       desired_batch_size=None,
                                       beam_pipeline=None,
                                       temp_dir=None,
                                       force_tf_compat_v1=False,
                                       output_record_batches=False):
    """Assert that input data and metadata is transformed as expected.

    This method asserts that the transformed data and transformed metadata
    match expected_data and expected_metadata.

    The transform function and (when expected_data is given) the encoded
    transformed data are written under temp_dir by the Beam pipeline; the
    assertions are then made against what is read back from temp_dir once the
    pipeline has finished.

    Args:
      input_data: Input data formatted in one of two ways:
        * A sequence of dicts whose values are one of:
          strings, lists of strings, numeric types or a pair of those.
          Must have at least one key so that we can infer the batch size, or
        * A sequence of pa.RecordBatch.
      input_metadata: One of -
        * DatasetMetadata describing input_data if `input_data` are dicts.
        * TensorAdapterConfig otherwise.
      preprocessing_fn: A function taking a dict of tensors and returning
          a dict of tensors.
      expected_data: (optional) A dataset with the same type constraints as
          input_data, but representing the output after transformation.
          If supplied, transformed data is asserted to be equal.
      expected_metadata: (optional) DatasetMetadata describing the transformed
          data. If supplied, transformed metadata is asserted to be equal
          (feature and schema level annotations are ignored).
      expected_vocab_file_contents: (optional) A dictionary from vocab filenames
          to their expected content as a list of text lines or a list of tuples
          of frequency and text. Values should be the expected result of calling
          f.readlines() on the given asset files.
      test_data: (optional) If this is provided then instead of calling
          AnalyzeAndTransformDataset with input_data, this function will call
          AnalyzeDataset with input_data and TransformDataset with test_data.
          Note that this is the case even if input_data and test_data are equal.
          test_data should also conform to input_metadata.
      desired_batch_size: (optional) A batch size to batch elements by. If not
          provided, a batch size will be computed automatically.
      beam_pipeline: (optional) A Beam Pipeline to use in this test.
      temp_dir: If set, it is used as output directory, else a new unique
          directory is created.
      force_tf_compat_v1: A bool. If `True`, TFT's public APIs use Tensorflow
          in compat.v1 mode.
      output_record_batches: (optional) A bool. If `True`, `TransformDataset`
          and `AnalyzeAndTransformDataset` output `pyarrow.RecordBatch`es;
          otherwise, they output instance dicts.
    Raises:
      AssertionError: if the expected data does not match the results of
          transforming input_data according to preprocessing_fn, or
          (if provided) if the expected metadata does not match.
    """

    expected_vocab_file_contents = expected_vocab_file_contents or {}

    # Note: we don't separately test AnalyzeDataset and TransformDataset as
    # AnalyzeAndTransformDataset currently simply composes these two
    # transforms.  If in future versions of the code, the implementation
    # differs, we should also run AnalyzeDataset and TransformDataset composed.
    temp_dir = temp_dir or tempfile.mkdtemp(
        prefix=self._testMethodName, dir=self.get_temp_dir())
    with beam_pipeline or self._makeTestPipeline() as pipeline:
      with beam_impl.Context(
          temp_dir=temp_dir,
          desired_batch_size=desired_batch_size,
          force_tf_compat_v1=force_tf_compat_v1):
        input_data = pipeline | 'CreateInput' >> beam.Create(input_data,
                                                             reshuffle=False)
        if test_data is None:
          (transformed_data, transformed_metadata), transform_fn = (
              (input_data, input_metadata)
              | beam_impl.AnalyzeAndTransformDataset(
                  preprocessing_fn,
                  output_record_batches=output_record_batches))
        else:
          transform_fn = ((input_data, input_metadata)
                          | beam_impl.AnalyzeDataset(preprocessing_fn))
          test_data = pipeline | 'CreateTest' >> beam.Create(test_data)
          transformed_data, transformed_metadata = (
              ((test_data, input_metadata), transform_fn)
              | beam_impl.TransformDataset(
                  output_record_batches=output_record_batches))

        # Write transform_fn so we can test its assets
        _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

        transformed_data_path = os.path.join(temp_dir, 'transformed_data')
        if expected_data is not None:
          if isinstance(transformed_metadata,
                        beam_metadata_io.BeamDatasetMetadata):
            deferred_schema = (
                transformed_metadata.deferred_metadata
                | 'GetDeferredSchema' >> beam.Map(lambda m: m.schema))
          else:
            # The schema is already available; wrap it in a PCollection so
            # both branches can be consumed the same way below. It must be
            # created in the pipeline bound by the enclosing `with` (which may
            # be the caller-supplied beam_pipeline) because it is later used
            # as a side input to a transform applied to transformed_data.
            deferred_schema = (
                pipeline | 'CreateDeferredSchema' >> beam.Create(
                    [transformed_metadata.schema]))

          if output_record_batches:
            # Since we are using a deferred schema, obtain a pcollection
            # containing the data coder that will be created from it.
            transformed_data_coder_pcol = (
                deferred_schema | 'RecordBatchToExamplesEncoder' >> beam.Map(
                    example_coder.RecordBatchToExamplesEncoder))

            encode_ptransform = 'EncodeRecordBatches' >> beam.FlatMap(
                _encode_transformed_data_batch,
                coder=beam.pvalue.AsSingleton(transformed_data_coder_pcol))
          else:
            # Since we are using a deferred schema, obtain a pcollection
            # containing the data coder that will be created from it.
            transformed_data_coder_pcol = (
                deferred_schema
                | 'ExampleProtoCoder' >> beam.Map(tft.coders.ExampleProtoCoder))
            encode_ptransform = 'EncodeExamples' >> beam.Map(
                lambda data, data_coder: data_coder.encode(data),
                data_coder=beam.pvalue.AsSingleton(transformed_data_coder_pcol))

          # shard_name_template='' makes the output a single file located
          # exactly at transformed_data_path, which is read back below.
          _ = (
              transformed_data
              | encode_ptransform
              | beam.io.tfrecordio.WriteToTFRecord(
                  transformed_data_path, shard_name_template=''))

    # The pipeline has run by the time its `with` block exits, so everything
    # below reads the materialized outputs from temp_dir.
    # TODO(ebreck) Log transformed_data somewhere.
    tf_transform_output = tft.TFTransformOutput(temp_dir)
    if expected_data is not None:
      examples = tf.compat.v1.python_io.tf_record_iterator(
          path=transformed_data_path)
      # Per-feature shape taken from the transformed schema; features without
      # a `shape` field are given [-1].
      shapes = {
          f.name:
          [s.size for s in f.shape.dim] if f.HasField('shape') else [-1]
          for f in tf_transform_output.transformed_metadata.schema.feature
      }
      transformed_data = [
          _format_example_as_numpy_dict(e, shapes) for e in examples
      ]
      self.assertDataCloseOrEqual(expected_data, transformed_data)

    if expected_metadata:
      # Make a copy with no annotations.
      transformed_schema = schema_pb2.Schema()
      transformed_schema.CopyFrom(
          tf_transform_output.transformed_metadata.schema)
      transformed_schema.ClearField('annotation')
      for feature in transformed_schema.feature:
        feature.ClearField('annotation')

      # assertProtoEqual has a size limit on the length of the
      # serialized as text strings. Therefore, we first try to use
      # assertProtoEqual, if that fails we try to use assertEqual, if that fails
      # as well then we raise the exception from assertProtoEqual.
      try:
        compare.assertProtoEqual(self, expected_metadata.schema,
                                 transformed_schema)
      except AssertionError as compare_exception:
        try:
          self.assertEqual(expected_metadata.schema, transformed_schema)
        except AssertionError:
          raise compare_exception

    for filename, file_contents in expected_vocab_file_contents.items():
      full_filename = tf_transform_output.vocabulary_file_by_name(filename)
      self.AssertVocabularyContents(full_filename, file_contents)