def run(argv=None):
  """Runs the sparse measurements prediction pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """
  pipeline_options = PipelineOptions(flags=argv)
  predict_options = pipeline_options.view_as(PredictOptions)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(
      WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED'

  with beam.Pipeline(options=pipeline_options) as p:
    examples = (p | 'ReadExamples' >> tfrecordio.ReadFromTFRecord(
        file_pattern=predict_options.input,
        compression_type=CompressionTypes.GZIP))

    predictions = examples | 'Predict' >> beam.ParDo(
        PredictDoFn(model_export_dir=predict_options.model))

    _ = predictions | 'WriteTableRows' >> beam.io.Write(
        beam.io.BigQuerySink(
            predict_options.output,
            schema=get_table_schema(),
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))
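
The PredictDoFn referenced above is defined elsewhere in this project. A minimal sketch of what such a DoFn might look like, assuming a TensorFlow 2 SavedModel export with a default serving signature (the class body, signature name, and output handling below are illustrative assumptions, not the original implementation):

import apache_beam as beam
import tensorflow as tf


class PredictDoFn(beam.DoFn):
  """Runs model inference on serialized tf.Example protos (illustrative sketch)."""

  def __init__(self, model_export_dir):
    self._model_export_dir = model_export_dir
    self._model = None

  def setup(self):
    # Load the SavedModel once per worker instead of once per element.
    self._model = tf.saved_model.load(self._model_export_dir)

  def process(self, serialized_example):
    # Assumed signature name and input key; the real export may differ.
    predict_fn = self._model.signatures['serving_default']
    outputs = predict_fn(examples=tf.constant([serialized_example]))
    # Emit plain Python values so downstream sinks (e.g. BigQuery) can handle them.
    yield {name: tensor.numpy().tolist() for name, tensor in outputs.items()}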
def run(argv=None):
    """Runs the revise preprocessed data pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """
    pipeline_options = PipelineOptions(flags=argv)
    revise_options = pipeline_options.view_as(ReviseOptions)
    cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    output_dir = os.path.join(
        revise_options.output,
        datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(
        WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED'
    cloud_options.staging_location = os.path.join(output_dir, 'tmp', 'staging')
    cloud_options.temp_location = os.path.join(output_dir, 'tmp')
    cloud_options.job_name = 'relabel-examples-%s' % (
        datetime.datetime.now().strftime('%y%m%d-%H%M%S'))

    metadata_query = str(
        Template(open(revise_options.metadata,
                      'r').read()).render(METADATA_QUERY_REPLACEMENTS))
    logging.info('metadata query : %s', metadata_query)

    with beam.Pipeline(options=pipeline_options) as p:
        # Gather our sample metadata into a python dictionary.
        samples_metadata = (
            p
            | 'ReadSampleMetadata' >> beam.io.Read(
                beam.io.BigQuerySource(query=metadata_query,
                                       use_standard_sql=True))
            | 'TableToDictionary' >> beam.CombineGlobally(
                util.TableToDictCombineFn(key_column=encoder.KEY_COLUMN)))

        # Read the tf.Example protos into a PCollection.
        examples = p | 'ReadExamples' >> tfrecordio.ReadFromTFRecord(
            file_pattern=revise_options.input,
            compression_type=CompressionTypes.GZIP)

        # Filter the TensorFlow Example Protocol Buffers.
        filtered_examples = (examples | 'ReviseExamples' >> beam.FlatMap(
            lambda example, samples_metadata: filter_and_revise_example(
                example, samples_metadata),
            beam.pvalue.AsSingleton(samples_metadata)))

        # Write the subset of tf.Example protos to Cloud Storage.
        _ = (filtered_examples
             | 'SerializeExamples' >>
             beam.Map(lambda example: example.SerializeToString())
             | 'WriteExamples' >> tfrecordio.WriteToTFRecord(
                 file_path_prefix=os.path.join(output_dir, 'examples'),
                 compression_type=CompressionTypes.GZIP,
                 file_name_suffix='.tfrecord.gz'))
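
The TableToDictCombineFn used in the ReadSampleMetadata branch collapses all BigQuery rows into a single Python dictionary that is then passed to ReviseExamples as a singleton side input. A minimal sketch of such a CombineFn, assuming each row arrives as a dict and key_column names the field to key on (this class is an illustration, not the project's util module):

import apache_beam as beam


class TableToDictCombineFn(beam.CombineFn):
    """Combines BigQuery row dicts into one dict keyed by `key_column` (illustrative)."""

    def __init__(self, key_column):
        self._key_column = key_column

    def create_accumulator(self):
        return {}

    def add_input(self, accumulator, row):
        accumulator[row[self._key_column]] = row
        return accumulator

    def merge_accumulators(self, accumulators):
        merged = {}
        for accumulator in accumulators:
            merged.update(accumulator)
        return merged

    def extract_output(self, accumulator):
        return accumulator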
def run_pipeline(beam_options):
    import tensorflow as tf

    with beam.Pipeline(options=beam_options) as p:
        (
            p
            | "Read files in" >> tfrecordio.ReadFromTFRecord(beam_options.input)
            | "Parse TF Examples from file" >> beam.Map(
                lambda row: tf.io.parse_example(row, feature_description))
            | "Replace TF tensors with native types" >> beam.Map(reformat_row)
            | "Write to Parquet" >> parquetio.WriteToParquet(
                beam_options.output,
                pa.schema(parquet_schema),
                num_shards=128))
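
This pipeline relies on a module-level feature_description, parquet_schema, and reformat_row helper. A minimal sketch of what those definitions might look like, assuming one string feature and one int64 feature (the feature names and types are illustrative, not taken from the original code):

import pyarrow as pa
import tensorflow as tf

# Illustrative feature spec for parsing the tf.Example records.
feature_description = {
    'id': tf.io.FixedLenFeature([], tf.string),
    'value': tf.io.FixedLenFeature([], tf.int64),
}

# Matching Arrow schema passed to WriteToParquet via pa.schema(parquet_schema).
parquet_schema = [
    ('id', pa.string()),
    ('value', pa.int64()),
]


def reformat_row(row):
    # Convert TF tensors into plain Python types so they can be written to Parquet.
    return {
        'id': row['id'].numpy().decode('utf-8'),
        'value': int(row['value'].numpy()),
    }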
Example #4
def transform_data(working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 values indices.

  Args:
    working_dir: Directory to read shuffled data from and write transformed data
        and metadata to.
  """

    with beam.Pipeline() as pipeline:
        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            coder = tft.coders.ExampleProtoCoder(RAW_DATA_METADATA.schema)
            train_data = (pipeline
                          | 'ReadTrain' >> tfrecordio.ReadFromTFRecord(
                              os.path.join(working_dir,
                                           SHUFFLED_TRAIN_DATA_FILEBASE + '*'))
                          | 'DecodeTrain' >> beam.Map(coder.decode))

            test_data = (pipeline
                         | 'ReadTest' >> tfrecordio.ReadFromTFRecord(
                             os.path.join(working_dir,
                                          SHUFFLED_TEST_DATA_FILEBASE + '*'))
                         | 'DecodeTest' >> beam.Map(coder.decode))

            def preprocessing_fn(inputs):
                """Preprocess input columns into transformed columns."""
                review = inputs[REVIEW_KEY]

                review_tokens = tf.string_split(review, DELIMITERS)
                review_indices = tft.string_to_int(review_tokens,
                                                   top_k=VOCAB_SIZE)
                # Add one for the oov bucket created by string_to_int.
                review_bow_indices, review_weight = tft.tfidf(
                    review_indices, VOCAB_SIZE + 1)
                return {
                    REVIEW_KEY: review_bow_indices,
                    REVIEW_WEIGHT_KEY: review_weight,
                    LABEL_KEY: inputs[LABEL_KEY]
                }

            (transformed_train_data, transformed_metadata), transform_fn = (
                (train_data, RAW_DATA_METADATA)
                | 'AnalyzeAndTransform' >>
                beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data_coder = tft.coders.ExampleProtoCoder(
                transformed_metadata.schema)

            transformed_test_data, _ = (
                ((test_data, RAW_DATA_METADATA), transform_fn)
                | 'Transform' >> beam_impl.TransformDataset())

            _ = (transformed_train_data
                 | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
                 | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                     os.path.join(working_dir,
                                  TRANSFORMED_TRAIN_DATA_FILEBASE)))

            _ = (
                transformed_test_data
                | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
                | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

            # Will write a SavedModel and metadata to two subdirectories of
            # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and
            # transform_fn_io.TRANSFORMED_METADATA_DIR respectively.
            _ = (transform_fn
                 | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(working_dir))
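
Example #4 depends on module-level constants such as RAW_DATA_METADATA, REVIEW_KEY, LABEL_KEY, VOCAB_SIZE, and DELIMITERS. A sketch of how the raw data metadata might be declared with the same (older) tf.Transform metadata API used in these examples, assuming a string review column and an int64 label (all values here are illustrative):

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema

REVIEW_KEY = 'review'
REVIEW_WEIGHT_KEY = 'review_weight'
LABEL_KEY = 'label'
VOCAB_SIZE = 20000  # illustrative vocabulary size
DELIMITERS = '.,!?() '  # illustrative token delimiters

RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(
    dataset_schema.from_feature_spec({
        REVIEW_KEY: tf.FixedLenFeature([], tf.string),
        LABEL_KEY: tf.FixedLenFeature([], tf.int64),
    }))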
Example #5
def tftransform(
    pipeline_args,  # type: List[str]
    temp_location,  # type: str
    schema_file,  # type: str
    output_dir,  # type: str
    preprocessing_fn,  # type: Any
    training_data=None,  # type: Union[None, str]
    evaluation_data=None,  # type: Union[None, str]
    transform_fn_dir=None,  # type: Union[None, str]
    compression_type=None  # type: Union[None, str]
):  # type: (...) -> PipelineState
    """
    Generic tf.transform pipeline that takes training and evaluation datasets of tf.Example
    records and outputs the transformed data together with the transform function SavedModel.

    :param pipeline_args: un-parsed Dataflow arguments
    :param temp_location: temporary location for dataflow job working dir
    :param schema_file: path to the raw feature schema text file
    :param output_dir: output dir for transformed data and function
    :param preprocessing_fn: tf.transform preprocessing function
    :param training_data: path to the training data
    :param evaluation_data: path to the evaluation data
    :param transform_fn_dir: dir to previously saved transformation function to apply
    :param compression_type: compression type for writing of tf.records
    :return: final state of the Beam pipeline
    """
    assert_not_empty_string(temp_location)
    assert_not_empty_string(schema_file)
    assert_not_empty_string(output_dir)
    assert_not_none(preprocessing_fn)

    if compression_type is None:
        compression_type = CompressionTypes.AUTO

    raw_feature_spec = schema_txt_file_to_feature_spec(schema_file)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)
    raw_data_coder = ExampleProtoCoder(raw_data_metadata.schema)

    transformed_train_output_dir = os.path.join(output_dir, "training")
    transformed_eval_output_dir = os.path.join(output_dir, "evaluation")

    if not any(i.startswith("--job_name") for i in pipeline_args):
        pipeline_args.append("--job_name=tf-transform-{}-{}".format(
            getpass.getuser(), int(time.time())))

    pipeline = beam.Pipeline(argv=pipeline_args)
    with beam_impl.Context(temp_dir=temp_location):
        if training_data is not None:
            # if training data is provided, transform_fn_dir will be ignored
            if transform_fn_dir is not None:
                warnings.warn(
                    "Transform_fn_dir is ignored because training_data is provided"
                )

            transform_fn_output = os.path.join(output_dir, "transform_fn",
                                               "saved_model.pb")
            if FileSystems.exists(transform_fn_output):
                raise ValueError("Transform fn already exists at %s!" %
                                 transform_fn_output)

            # compute the transform_fn and apply to the training data
            raw_train_data = (pipeline
                              | "ReadTrainData" >> tfrecordio.ReadFromTFRecord(
                                  training_data, coder=raw_data_coder))

            ((transformed_train_data, transformed_train_metadata),
             transform_fn) = (
                 (raw_train_data, raw_data_metadata)
                 | ("AnalyzeAndTransformTrainData" >>
                    beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
             )  # noqa: E501

            _ = (  # noqa: F841
                transform_fn
                | "WriteTransformFn" >>
                transform_fn_io.WriteTransformFn(output_dir))

            transformed_train_coder = ExampleProtoCoder(
                transformed_train_metadata.schema)
            _ = (  # noqa: F841
                transformed_train_data
                | "WriteTransformedTrainData" >> tfrecordio.WriteToTFRecord(
                    os.path.join(transformed_train_output_dir,
                                 "part"),  # noqa: E501
                    coder=transformed_train_coder,  # noqa: E501
                    compression_type=compression_type,  # noqa: E501
                    file_name_suffix=".tfrecords"))  # noqa: E501
        else:
            if transform_fn_dir is None:
                raise ValueError(
                    "Either training_data or transform_fn_dir needs to be provided"
                )
            # load the transform_fn
            transform_fn = pipeline | transform_fn_io.ReadTransformFn(
                transform_fn_dir)

        if evaluation_data is not None:
            # if evaluation_data exists, apply the transform_fn to the evaluation data
            raw_eval_data = (pipeline
                             | "ReadEvalData" >> tfrecordio.ReadFromTFRecord(
                                 evaluation_data, coder=raw_data_coder))

            (transformed_eval_data, transformed_eval_metadata) = (
                ((raw_eval_data, raw_data_metadata), transform_fn)
                | "TransformEvalData" >> beam_impl.TransformDataset())

            transformed_eval_coder = ExampleProtoCoder(
                transformed_eval_metadata.schema)
            _ = (  # noqa: F841
                transformed_eval_data
                | "WriteTransformedEvalData" >> tfrecordio.WriteToTFRecord(
                    os.path.join(transformed_eval_output_dir,
                                 "part"),  # noqa: E501
                    coder=transformed_eval_coder,  # noqa: E501
                    compression_type=compression_type,  # noqa: E501
                    file_name_suffix=".tfrecords"))  # noqa: E501
    result = pipeline.run().wait_until_finish()

    return result
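
A usage sketch for the helper above; the preprocessing function, runner, and paths are illustrative assumptions rather than part of the original example:

import tensorflow_transform as tft


def example_preprocessing_fn(inputs):
    # Pass features through and add a z-score-scaled copy of a numeric column.
    outputs = dict(inputs)
    outputs['x_scaled'] = tft.scale_to_z_score(inputs['x'])
    return outputs


state = tftransform(
    pipeline_args=['--runner=DirectRunner'],
    temp_location='/tmp/tft-tmp',                 # illustrative paths
    schema_file='gs://my-bucket/schema.txt',
    output_dir='gs://my-bucket/transformed',
    preprocessing_fn=example_preprocessing_fn,
    training_data='gs://my-bucket/train/part-*',
    evaluation_data='gs://my-bucket/eval/part-*')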
Example #6
def transform_data(shuffled_train_filepattern, shuffled_test_filepattern,
                   transformed_train_filebase, transformed_test_filebase,
                   transformed_metadata_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 values indices.

  Args:
    shuffled_train_filepattern: Base filename for shuffled training data shards
    shuffled_test_filepattern: Base filename for shuffled test data shards
    transformed_train_filebase: Base filename for transformed training data
        shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data
        should be written
  """

    with beam.Pipeline() as pipeline:
        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            train_data = (pipeline
                          | 'ReadTrain' >> tfrecordio.ReadFromTFRecord(
                              shuffled_train_filepattern,
                              coder=example_proto_coder.ExampleProtoCoder(
                                  RAW_DATA_METADATA.schema)))

            test_data = (pipeline | 'ReadTest' >> tfrecordio.ReadFromTFRecord(
                shuffled_test_filepattern,
                coder=example_proto_coder.ExampleProtoCoder(
                    RAW_DATA_METADATA.schema)))

            def preprocessing_fn(inputs):
                """Preprocess input columns into transformed columns."""
                review = inputs[REVIEW_COLUMN]

                review_tokens = tf.string_split(review, DELIMITERS)
                review_indices = tft.string_to_int(review_tokens,
                                                   top_k=VOCAB_SIZE)
                # Add one for the oov bucket created by string_to_int.
                review_bow_indices, review_weight = tft.tfidf(
                    review_indices, VOCAB_SIZE + 1)
                return {
                    REVIEW_COLUMN: review_bow_indices,
                    REVIEW_WEIGHT: review_weight,
                    LABEL_COLUMN: inputs[LABEL_COLUMN]
                }

            (transformed_train_data, transformed_metadata), transform_fn = (
                (train_data, RAW_DATA_METADATA)
                | 'AnalyzeAndTransform' >>
                beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

            transformed_test_data, _ = (
                ((test_data, RAW_DATA_METADATA), transform_fn)
                | 'Transform' >> beam_impl.TransformDataset())

            _ = (transformed_train_data
                 | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                     transformed_train_filebase,
                     coder=example_proto_coder.ExampleProtoCoder(
                         transformed_metadata.schema)))

            _ = (transformed_test_data
                 | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
                     transformed_test_filebase,
                     coder=example_proto_coder.ExampleProtoCoder(
                         transformed_metadata.schema)))

            _ = (transformed_metadata
                 | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
                     transformed_metadata_dir, pipeline=pipeline))
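
The metadata written by WriteMetadata in the final step can later be read back to recover the transformed feature spec, for example when building an input function for training. A minimal sketch using the same generation of the tf.Transform metadata API as these examples (the path is illustrative):

from tensorflow_transform.tf_metadata import metadata_io

# Re-load the metadata written to transformed_metadata_dir above.
transformed_metadata = metadata_io.read_metadata('/path/to/transformed_metadata')
transformed_feature_spec = transformed_metadata.schema.as_feature_spec()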