Example #1
    def test_end2end_read_write_read(self):
        path = os.path.join(self._new_tempdir(), 'result')
        with TestPipeline() as p:
            # Initial read to validate the pipeline doesn't fail before the file is
            # created.
            _ = p | ReadFromTFRecord(path + '-*', validate=False)
            expected_data = [self.create_inputs() for _ in range(0, 10)]
            _ = p | beam.Create(expected_data) | WriteToTFRecord(
                path, file_name_suffix='.gz')

        # Read the file back and compare.
        with TestPipeline() as p:
            actual_data = p | ReadFromTFRecord(path + '-*', validate=True)
            beam.assert_that(actual_data, beam.equal_to(expected_data))
Example #2
def run_tfma(slice_spec, input_csv, add_metrics_callbacks=None):
    """A simple wrapper function that runs tfma locally.

    A function that does extra transformations on the data and then run model analysis.

    Args:
        slice_spec: The slicing spec for how to slice the data.
        tf_run_id: An id to contruct the model directories with.
        tfma_run_id: An id to construct output directories with.
        input_csv: The evaluation data in csv format.
        add_metrics_callback: Optional list of callbacks for computing extra metrics.

    Returns:
        An EvalResult that can be used with TFMA visualization functions.
    """
    # Use the (single) exported eval model subdirectory under MODELS_DIR/eval.
    EVAL_MODEL_DIR = 'eval'
    eval_model_base_dir = os.path.join(params.Params.MODELS_DIR, EVAL_MODEL_DIR)
    my_eval_model_dir = os.path.join(eval_model_base_dir,
                                     next(os.walk(eval_model_base_dir))[1][0])
    print(my_eval_model_dir)

    tfma_out = os.path.join(params.Params.TFMA_OUT, args.run_id)
    display_only_data_location = input_csv
    with beam.Pipeline() as pipeline:
        result = (pipeline
                  | 'ReadFromTFRecords' >> ReadFromTFRecord(
                      params.Params.TRANSFORMED_EVAL_DATA_FILE_PREFIX + '-*')
                  | 'EvaluateAndWriteResults' >> tfma.EvaluateAndWriteResults(
                      eval_saved_model_path=my_eval_model_dir,
                      slice_spec=slice_spec,
                      output_path=tfma_out,
                      display_only_data_location=input_csv))

    return tfma.load_eval_result(output_path=tfma_out)
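
A minimal usage sketch for run_tfma, assuming the older top-level TFMA slicing API (tfma.SingleSliceSpec) that matches tfma.EvaluateAndWriteResults above; the feature name, CSV path, and the notebook rendering call are illustrative assumptions, not part of the original snippet.

# Hypothetical invocation: compute overall metrics plus per-hour slices.
slice_spec = [
    tfma.SingleSliceSpec(),  # overall (unsliced) metrics
    tfma.SingleSliceSpec(columns=['trip_start_hour']),  # assumed feature name
]
eval_result = run_tfma(slice_spec, input_csv='data/eval_data.csv')
# In a notebook, the result can then be visualized with the TFMA view helpers.
tfma.view.render_slicing_metrics(eval_result, slicing_column='trip_start_hour')
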
Example #3
 def test_process_gzip_auto(self):
     path = os.path.join(self._new_tempdir(), 'result.gz')
     self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
     with beam.Pipeline(DirectRunner()) as p:
         result = (p
                   | ReadFromTFRecord(
                       path, compression_type=fileio.CompressionTypes.AUTO))
         beam.assert_that(result, beam.equal_to(['foo', 'bar']))
Example #4
 def test_process_gzip(self):
     path = os.path.join(self._new_tempdir(), 'result')
     self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
     with TestPipeline() as p:
         result = (p
                   | ReadFromTFRecord(
                       path, compression_type=CompressionTypes.GZIP))
         beam.assert_that(result, beam.equal_to(['foo', 'bar']))
Example #5
 def test_process_gzip_auto(self):
   with TempDir() as temp_dir:
     path = temp_dir.create_temp_file('result.gz')
     _write_file_gzip(path, FOO_BAR_RECORD_BASE64)
     with TestPipeline() as p:
       result = (p
                 | ReadFromTFRecord(
                     path, compression_type=CompressionTypes.AUTO))
       assert_that(result, equal_to([b'foo', b'bar']))
Example #6
 def test_process_gzip(self):
   with TempDir() as temp_dir:
     path = temp_dir.create_temp_file('result')
     _write_file_gzip(path, FOO_BAR_RECORD_BASE64)
     with TestPipeline() as p:
       result = (p
                 | ReadFromTFRecord(
                     path,
                     coder=coders.BytesCoder(),
                     compression_type=CompressionTypes.GZIP,
                     validate=True))
       assert_that(result, equal_to([b'foo', b'bar']))
Example #7
  def test_end2end(self):
    file_path_prefix = os.path.join(self._new_tempdir(), 'result')

    # Generate a TFRecord file.
    with TestPipeline() as p:
      expected_data = [self.create_inputs() for _ in range(0, 10)]
      _ = p | beam.Create(expected_data) | WriteToTFRecord(file_path_prefix)

    # Read the file back and compare.
    with TestPipeline() as p:
      actual_data = p | ReadFromTFRecord(file_path_prefix + '-*')
      assert_that(actual_data, equal_to(expected_data))
Example #8
    def test_end2end_auto_compression_unsharded(self):
        file_path_prefix = os.path.join(self._new_tempdir(), 'result')

        # Generate a TFRecord file.
        with beam.Pipeline(DirectRunner()) as p:
            expected_data = [self.create_inputs() for _ in range(0, 10)]
            _ = p | beam.Create(expected_data) | WriteToTFRecord(
                file_path_prefix + '.gz', shard_name_template='')

        # Read the file back and compare.
        with beam.Pipeline(DirectRunner()) as p:
            actual_data = p | ReadFromTFRecord(file_path_prefix + '.gz')
            beam.assert_that(actual_data, beam.equal_to(expected_data))
Example #9
  def test_end2end_auto_compression_unsharded(self):
    with TempDir() as temp_dir:
      file_path_prefix = temp_dir.create_temp_file('result')

      # Generate a TFRecord file.
      with TestPipeline() as p:
        expected_data = [self.create_inputs() for _ in range(0, 10)]
        _ = p | beam.Create(expected_data) | WriteToTFRecord(
            file_path_prefix + '.gz', shard_name_template='')

      # Read the file back and compare.
      with TestPipeline() as p:
        actual_data = p | ReadFromTFRecord(file_path_prefix + '.gz')
        assert_that(actual_data, equal_to(expected_data))
Example #10
  def test_end2end_example_proto(self):
    file_path_prefix = os.path.join(self._new_tempdir(), 'result')

    example = tf.train.Example()
    example.features.feature['int'].int64_list.value.extend(range(3))
    example.features.feature['bytes'].bytes_list.value.extend(
        [b'foo', b'bar'])

    with TestPipeline() as p:
      _ = p | beam.Create([example]) | WriteToTFRecord(
          file_path_prefix, coder=beam.coders.ProtoCoder(example.__class__))

    # Read the file back and compare.
    with TestPipeline() as p:
      actual_data = (p | ReadFromTFRecord(
          file_path_prefix + '-*',
          coder=beam.coders.ProtoCoder(example.__class__)))
      assert_that(actual_data, equal_to([example]))
Example #11
def build_and_run_pipeline(pipeline_options, tfrecord_pattern, predict_dofn,
                           output_bq_table, bq_table_schema):
    """Build and run a Keras batch inference pipeline to BigQuery pipeline.

  Args:
    pipeline_options (beam.options.pipeline_options import PipelineOptions):
      Commandline arguments for this pipeline.
    tfrecord_pattern (str): A file glob pattern to read TFRecords from.
    predict_dofn (beam.DoFn): A DoFn that transforms TFExamples into
      dictionaries describing BigQuery rows.
    output_bq_table (str): A string of the form `project:dataset.table_name`.
      This table will be overwritten if it already exists.
    bq_table_schema (Union[str, TableSchema]): A BigQuery schema in the format
      used by `apache_beam.io.gcp.bigquery.WriteToBigQuery`.
  """
    with beam.Pipeline(options=pipeline_options) as p:
        _ = (p
             | ReadFromTFRecord(tfrecord_pattern,
                                coder=beam.coders.ProtoCoder(tf.train.Example))
             | beam.ParDo(predict_dofn)
             | WriteToBigQuery(
                 table=output_bq_table,
                 schema=bq_table_schema,
                 write_disposition=BigQueryDisposition.WRITE_TRUNCATE))
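
A hypothetical end-to-end invocation of build_and_run_pipeline; the DoFn body, TFRecord pattern, BigQuery table, and schema string below are assumptions made for illustration only.

import apache_beam as beam
import tensorflow as tf
from apache_beam.options.pipeline_options import PipelineOptions


class ExamplePredictDoFn(beam.DoFn):
    # Hypothetical DoFn: because ReadFromTFRecord above uses ProtoCoder,
    # each element arrives as a parsed tf.train.Example.
    def process(self, example):
        # A real implementation would run the Keras model here and build the
        # output row from its predictions.
        yield {'prediction': 0.0}


build_and_run_pipeline(
    pipeline_options=PipelineOptions(['--runner=DirectRunner']),
    tfrecord_pattern='data/examples-*.tfrecord',
    predict_dofn=ExamplePredictDoFn(),
    output_bq_table='my-project:my_dataset.predictions',
    bq_table_schema='prediction:FLOAT')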