def test_end2end_read_write_read(self):
  path = os.path.join(self._new_tempdir(), 'result')
  with TestPipeline() as p:
    # Initial read to validate the pipeline doesn't fail before the file is
    # created.
    _ = p | ReadFromTFRecord(path + '-*', validate=False)
    expected_data = [self.create_inputs() for _ in range(0, 10)]
    _ = p | beam.Create(expected_data) | WriteToTFRecord(
        path, file_name_suffix='.gz')

  # Read the file back and compare.
  with TestPipeline() as p:
    actual_data = p | ReadFromTFRecord(path + '-*', validate=True)
    beam.assert_that(actual_data, beam.equal_to(expected_data))
def run_tfma(slice_spec, input_csv, add_metrics_callbacks=None):
  """A simple wrapper function that runs tfma locally.

  A function that does extra transformations on the data and then runs model
  analysis.

  Args:
    slice_spec: The slicing spec for how to slice the data.
    input_csv: The evaluation data in csv format.
    add_metrics_callbacks: Optional list of callbacks for computing extra
      metrics.

  Returns:
    An EvalResult that can be used with TFMA visualization functions.
  """
  EVAL_MODEL_DIR = 'eval'
  eval_model_base_dir = os.path.join(params.Params.MODELS_DIR, EVAL_MODEL_DIR)
  # Use the first exported eval model subdirectory under the base directory.
  my_eval_model_dir = os.path.join(eval_model_base_dir,
                                   next(os.walk(eval_model_base_dir))[1][0])
  print(my_eval_model_dir)
  tfma_out = os.path.join(params.Params.TFMA_OUT, args.run_id)
  display_only_data_location = input_csv

  with beam.Pipeline() as pipeline:
    _ = (pipeline
         | 'ReadFromTFRecords' >> ReadFromTFRecord(
             params.Params.TRANSFORMED_EVAL_DATA_FILE_PREFIX + '-*')
         | 'EvaluateAndWriteResults' >> tfma.EvaluateAndWriteResults(
             eval_saved_model_path=my_eval_model_dir,
             slice_spec=slice_spec,
             add_metrics_callbacks=add_metrics_callbacks,
             output_path=tfma_out,
             display_only_data_location=display_only_data_location))

  return tfma.load_eval_result(output_path=tfma_out)
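# A hedged usage sketch (not part of the source): one way run_tfma might be
# called, assuming a TFMA version that exposes tfma.slicer.SingleSliceSpec and
# that the params/args globals used above are already configured. The slicing
# column 'trip_start_hour' and the CSV path are illustrative placeholders only.
import tensorflow_model_analysis as tfma

eval_result = run_tfma(
    slice_spec=[
        tfma.slicer.SingleSliceSpec(),  # overall (unsliced) metrics
        tfma.slicer.SingleSliceSpec(columns=['trip_start_hour']),
    ],
    input_csv='gs://my-bucket/eval/data.csv')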
def test_process_gzip_auto(self):
  path = os.path.join(self._new_tempdir(), 'result.gz')
  self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
  with beam.Pipeline(DirectRunner()) as p:
    result = (p
              | ReadFromTFRecord(
                  path, compression_type=fileio.CompressionTypes.AUTO))
    beam.assert_that(result, beam.equal_to(['foo', 'bar']))
def test_process_gzip(self):
  path = os.path.join(self._new_tempdir(), 'result')
  self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
  with TestPipeline() as p:
    result = (p
              | ReadFromTFRecord(
                  path, compression_type=CompressionTypes.GZIP))
    beam.assert_that(result, beam.equal_to(['foo', 'bar']))
def test_process_gzip_auto(self):
  with TempDir() as temp_dir:
    path = temp_dir.create_temp_file('result.gz')
    _write_file_gzip(path, FOO_BAR_RECORD_BASE64)
    with TestPipeline() as p:
      result = (p
                | ReadFromTFRecord(
                    path, compression_type=CompressionTypes.AUTO))
      assert_that(result, equal_to([b'foo', b'bar']))
def test_process_gzip(self):
  with TempDir() as temp_dir:
    path = temp_dir.create_temp_file('result')
    _write_file_gzip(path, FOO_BAR_RECORD_BASE64)
    with TestPipeline() as p:
      result = (p
                | ReadFromTFRecord(
                    path,
                    coder=coders.BytesCoder(),
                    compression_type=CompressionTypes.GZIP,
                    validate=True))
      assert_that(result, equal_to([b'foo', b'bar']))
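# A minimal sketch (an assumption, not the actual test helper) of the
# _write_file_gzip helper the two tests above rely on: it assumes
# FOO_BAR_RECORD_BASE64 is a base64-encoded string of already TFRecord-framed
# bytes, which it simply decodes and writes out gzip-compressed.
import base64
import gzip


def _write_file_gzip(path, base64_records):
  # Decode the pre-framed TFRecord bytes and compress them to disk.
  contents = base64.b64decode(base64_records)
  with gzip.open(path, 'wb') as f:
    f.write(contents)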
def test_end2end(self):
  file_path_prefix = os.path.join(self._new_tempdir(), 'result')

  # Generate a TFRecord file.
  with TestPipeline() as p:
    expected_data = [self.create_inputs() for _ in range(0, 10)]
    _ = p | beam.Create(expected_data) | WriteToTFRecord(file_path_prefix)

  # Read the file back and compare.
  with TestPipeline() as p:
    actual_data = p | ReadFromTFRecord(file_path_prefix + '-*')
    assert_that(actual_data, equal_to(expected_data))
def test_end2end_auto_compression_unsharded(self):
  file_path_prefix = os.path.join(self._new_tempdir(), 'result')

  # Generate a TFRecord file.
  with beam.Pipeline(DirectRunner()) as p:
    expected_data = [self.create_inputs() for _ in range(0, 10)]
    _ = p | beam.Create(expected_data) | WriteToTFRecord(
        file_path_prefix + '.gz', shard_name_template='')

  # Read the file back and compare.
  with beam.Pipeline(DirectRunner()) as p:
    actual_data = p | ReadFromTFRecord(file_path_prefix + '.gz')
    beam.assert_that(actual_data, beam.equal_to(expected_data))
def test_end2end_auto_compression_unsharded(self):
  with TempDir() as temp_dir:
    file_path_prefix = temp_dir.create_temp_file('result')

    # Generate a TFRecord file.
    with TestPipeline() as p:
      expected_data = [self.create_inputs() for _ in range(0, 10)]
      _ = p | beam.Create(expected_data) | WriteToTFRecord(
          file_path_prefix + '.gz', shard_name_template='')

    # Read the file back and compare.
    with TestPipeline() as p:
      actual_data = p | ReadFromTFRecord(file_path_prefix + '.gz')
      assert_that(actual_data, equal_to(expected_data))
def test_end2end_example_proto(self):
  file_path_prefix = os.path.join(self._new_tempdir(), 'result')

  example = tf.train.Example()
  example.features.feature['int'].int64_list.value.extend(range(3))
  example.features.feature['bytes'].bytes_list.value.extend(
      [b'foo', b'bar'])

  with TestPipeline() as p:
    _ = p | beam.Create([example]) | WriteToTFRecord(
        file_path_prefix, coder=beam.coders.ProtoCoder(example.__class__))

  # Read the file back and compare.
  with TestPipeline() as p:
    actual_data = (p | ReadFromTFRecord(
        file_path_prefix + '-*',
        coder=beam.coders.ProtoCoder(example.__class__)))
    assert_that(actual_data, equal_to([example]))
def build_and_run_pipeline(pipeline_options, tfrecord_pattern, predict_dofn,
                           output_bq_table, bq_table_schema):
  """Build and run a Keras batch inference to BigQuery pipeline.

  Args:
    pipeline_options (beam.options.pipeline_options.PipelineOptions):
      Command-line arguments for this pipeline.
    tfrecord_pattern (str): A file glob pattern to read TFRecords from.
    predict_dofn (beam.DoFn): A DoFn that transforms TFExamples into
      dictionaries describing BigQuery rows.
    output_bq_table (str): A string of the form `project:dataset.table_name`.
      This table will be overwritten if it already exists.
    bq_table_schema (Union[str, TableSchema]): A BigQuery schema in the format
      used by `apache_beam.io.gcp.bigquery.WriteToBigQuery`.
  """
  with beam.Pipeline(options=pipeline_options) as p:
    _ = (p
         | ReadFromTFRecord(tfrecord_pattern,
                            coder=beam.coders.ProtoCoder(tf.train.Example))
         | beam.ParDo(predict_dofn)
         | WriteToBigQuery(
             table=output_bq_table,
             schema=bq_table_schema,
             write_disposition=BigQueryDisposition.WRITE_TRUNCATE))
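# A hedged usage sketch (not part of the source): one way build_and_run_pipeline
# might be invoked. KerasPredictDoFn, the file pattern, table name, and schema
# are illustrative placeholders, not the project's real values.
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions


class KerasPredictDoFn(beam.DoFn):
  """Hypothetical DoFn that turns a tf.train.Example into a BigQuery row."""

  def process(self, example):
    features = example.features.feature
    # Emit a dictionary whose keys match the BigQuery schema below.
    yield {'id': features['id'].bytes_list.value[0].decode('utf-8')}


options = PipelineOptions(runner='DirectRunner')
build_and_run_pipeline(
    pipeline_options=options,
    tfrecord_pattern='gs://my-bucket/examples/part-*',
    predict_dofn=KerasPredictDoFn(),
    output_bq_table='my-project:my_dataset.predictions',
    bq_table_schema='id:STRING')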