def testPredictExtractorWithSequentialKerasModel(self):
  # Note that the input will be called 'test_input'.
  model = tf.keras.models.Sequential([
      tf.keras.layers.Dense(
          1, activation=tf.nn.sigmoid, input_shape=(2,), name='test')
  ])
  model.compile(
      optimizer=tf.keras.optimizers.Adam(lr=.001),
      loss=tf.keras.losses.binary_crossentropy,
      metrics=['accuracy'])

  train_features = {'test_input': [[0.0, 0.0], [1.0, 1.0]]}
  labels = [[1], [0]]
  example_weights = [1.0, 0.5]
  dataset = tf.data.Dataset.from_tensor_slices(
      (train_features, labels, example_weights))
  dataset = dataset.shuffle(buffer_size=1).repeat().batch(2)
  model.fit(dataset, steps_per_epoch=1)

  export_dir = self._getExportDir()
  model.save(export_dir, save_format='tf')

  eval_config = config.EvalConfig(model_specs=[config.ModelSpec()])
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
  schema = text_format.Parse(
      """
      tensor_representation_group {
        key: ""
        value {
          tensor_representation {
            key: "test"
            value {
              dense_tensor {
                column_name: "test"
                shape { dim { size: 2 } }
              }
            }
          }
        }
      }
      feature {
        name: "test"
        type: FLOAT
      }
      feature {
        name: "non_model_feature"
        type: INT
      }
      """, schema_pb2.Schema())
  tfx_io = test_util.InMemoryTFExampleRecord(
      schema=schema, raw_record_column_name=constants.BATCHED_INPUT_KEY)
  tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
      arrow_schema=tfx_io.ArrowSchema(),
      tensor_representations=tfx_io.TensorRepresentations())
  input_extractor = batched_input_extractor.BatchedInputExtractor(eval_config)
  predict_extractor = batched_predict_extractor_v2.BatchedPredictExtractor(
      eval_config=eval_config,
      eval_shared_model=eval_shared_model,
      tensor_adapter_config=tensor_adapter_config)

  # Notice that the features are 'test' but the model expects 'test_input'.
  # This tests that the PredictExtractor properly handles this case.
  examples = [
      self._makeExample(
          test=[0.0, 0.0],
          non_model_feature=0),  # should be ignored by model
      self._makeExample(
          test=[1.0, 1.0],
          non_model_feature=1),  # should be ignored by model
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create(
            [e.SerializeToString() for e in examples], reshuffle=False)
        | 'BatchExamples' >> tfx_io.BeamSource(batch_size=2)
        | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
        | input_extractor.stage_name >> input_extractor.ptransform
        | predict_extractor.stage_name >> predict_extractor.ptransform)
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        # We can't verify the actual predictions, but we can verify the keys.
        for item in got:
          self.assertIn(constants.BATCHED_PREDICTIONS_KEY, item)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
def TEST_BeamPipeline(graph_name, output_names, feed_dicts):
  """Execute the Beam Pipeline.

  Args:
    graph_name: the graph to execute
    output_names: the names of the outputs inside the graph
    feed_dicts: a list of {graph_name: {placeholder_input: value}}

  Returns:
    None. Saves the result into a file.
  """
  op_to_filename = {
      'main': './graphdefs/main_graph.pb',
      'remote_op_a': './graphdefs/graph_a.pb',
      'remote_op_b': './graphdefs/graph_b.pb',
  }
  op_to_outputs = {
      'main': ['Add'],
      'remote_op_b': ['Add_1'],
      'remote_op_a': ['embedding_lookup/Identity'],
  }
  test = GraphPartition(op_to_filename, op_to_outputs)
  test.partition()

  # Define your feed_dict again. These relations are stored inside PyFunc,
  # but we don't have the access. Structure:
  # {graph_name: {remote op name: {placeholder name inside subgraph: input name}}}
  op_to_remote_op_name_mapping = {
      'main': {
          'remote_op_a': {'ids_a': 'ids1'},
          'remote_op_b': {'ids_b1': 'ids1', 'ids_b2': 'ids2'},
          'remote_op_a_1': {'ids_a': 'FloorMod'},
      },
      'remote_op_b': {
          'remote_op_a': {'ids_a': 'FloorMod'},
          'remote_op_a_1': {'ids_a': 'ids_b2'},
      },
  }

  options = beam.options.pipeline_options.PipelineOptions()
  # The `with` block runs the pipeline and waits for completion on exit, so
  # an explicit run()/wait_until_finish() inside it would run it twice.
  with beam.Pipeline(options=options) as p:
    inputs = p | 'read' >> beam.Create(feed_dicts)
    outputs = inputs | 'Graph' >> ExecuteOneGraph(
        test.op_to_execution_bundles, op_to_remote_op_name_mapping, graph_name)

    class GetOutput(beam.DoFn):

      def process(self, element, graph_name, output_names):
        outputs = []
        for output_name in output_names:
          outputs.append(element[graph_name][output_name])
        yield outputs

    outputs = outputs | beam.ParDo(GetOutput(), graph_name, output_names)
    outputs | 'output' >> beam.io.WriteToText('./beam_experiment')
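# A hypothetical invocation of TEST_BeamPipeline, sketched for reference.
# The feed_dict layout follows the docstring above and the 'import/<name>:0'
# tensor-name convention used in TEST_Subgraph further below; the concrete
# values and the 'import/Add:0' output name are illustrative assumptions,
# not taken from the original code.
feed_dicts = [
    {'main': {'import/ids1:0': 3, 'import/ids2:0': 3}},
    {'main': {'import/ids1:0': 10, 'import/ids2:0': 10}},
]
# 'Add' mirrors op_to_outputs['main']; the 'import/' prefix is assumed.
TEST_BeamPipeline('main', ['import/Add:0'], feed_dicts)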
# !head -n 20 data/wordcount-00000-of-00001
# !rm -Rf data/wordco*
# !ls -la output

p2 = beam.Pipeline()

# list/array = []   # tuple = ()   # dictionary = {}
lines = (
    p2
    | beam.Create(['Using create transform ',
                   'to generate in memory data ',
                   'This is the 3rd line ',
                   'Thanks'])
    | beam.io.WriteToText('data/outCreate1')
)

p2.run()

# visualize output
# !head -n 20 data/outCreate1-00000-of-00001
# !cat data/both-00000-of-00001

p3 = beam.Pipeline()

lines3 = (
    p3
def test_single_phase_run_twice(self):

  span_0_key = 'span-0'
  span_1_key = 'span-1'

  def preprocessing_fn(inputs):

    _ = tft.vocabulary(inputs['s'])

    _ = tft.bucketize(inputs['x'], 2, name='bucketize')

    return {
        'x_min': tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
        'x_mean': tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
        'y_min': tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
        'y_mean': tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
    }

  input_data = [{'x': 12, 'y': 1, 's': 'b'}, {'x': 10, 'y': 1, 's': 'c'}]
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'x': tf.FixedLenFeature([], tf.float32),
          'y': tf.FixedLenFeature([], tf.float32),
          's': tf.FixedLenFeature([], tf.string),
      }))
  input_data_dict = {
      span_0_key: [{
          'x': -2,
          'y': 1,
          's': 'a',
      }, {
          'x': 4,
          'y': -4,
          's': 'a',
      }],
      span_1_key: input_data,
  }

  with beam_impl.Context(temp_dir=self.get_temp_dir()):
    with beam.Pipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))

      # This is needed due to b/123895600.
      for a, b in six.iteritems(input_data_dict):
        input_data_dict[a] = p | a >> beam.Create(b)

      transform_fn, cache_output = (
          (flat_data, input_data_dict, {}, input_metadata)
          | 'Analyze' >> (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
      _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
          self._cache_dir)

      transformed_dataset = (((input_data_dict[span_1_key], input_metadata),
                              transform_fn)
                             | 'Transform' >> beam_impl.TransformDataset())

      transformed_data, unused_transformed_metadata = transformed_dataset

      expected_transformed_data = [
          {
              'x_mean': 6.0,
              'x_min': -2.0,
              'y_mean': -0.25,
              'y_min': -4.0,
          },
          {
              'x_mean': 6.0,
              'x_min': -2.0,
              'y_mean': -0.25,
              'y_min': -4.0,
          },
      ]
      beam_test_util.assert_that(
          transformed_data,
          beam_test_util.equal_to(expected_transformed_data),
          label='first')

      transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
      _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)

      for key in input_data_dict:
        self.assertIn(key, cache_output)
        self.assertEqual(6, len(cache_output[key]))

      transform_fn, second_output_cache = (
          (flat_data, input_data_dict, cache_output, input_metadata)
          | 'AnalyzeAgain' >>
          (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

      dot_string = nodes.get_dot_graph(
          [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
      self.WriteRenderedDotFile(dot_string)

      transformed_dataset = (
          ((input_data_dict[span_1_key], input_metadata), transform_fn)
          | 'TransformAgain' >> beam_impl.TransformDataset())
      transformed_data, unused_transformed_metadata = transformed_dataset
      beam_test_util.assert_that(
          transformed_data,
          beam_test_util.equal_to(expected_transformed_data),
          label='second')

  self.assertFalse(second_output_cache)
    g_idx_dict['vwidth'] = int(lines[2].strip())
    g_idx_dict['id'] = lines[3].strip()

    f_sfd = open(
        os.path.join(cur_font_path,
                     '{}_{:02d}.sfd'.format(font_id, char_id)),
        mode='rb')
    g_idx_dict['sfd'] = f_sfd.read()
    g_idx_dict['binary_fp'] = lines[4].strip()
    # print(g_idx_dict)

    glyph_list.append(g_idx_dict)
    char_des.close()
    f_sfd.close()

  with open(glyph_list_path, 'wb') as f:
    pickle.dump(glyph_list, f)
  print("Processed all font files")
# else:
#   with open(glyph_list_path, 'rb') as f:
#     glyph_list = pickle.load(f)
#   print("Loaded processed font files")

# print('Submitting to beam ...')
with beam.Pipeline() as p:
  records = p | 'Read' >> beam.Create(glyph_list)
  _ = records | 'Write' >> beam.io.WriteToParquet(
      target_beam_parquetio_file_prefix,
      pyarrow.schema([('uni', pyarrow.int64()),
                      ('width', pyarrow.int64()),
                      ('vwidth', pyarrow.int64()),
                      ('sfd', pyarrow.string()),
                      ('id', pyarrow.string()),
                      ('binary_fp', pyarrow.string())]))
def testSquaredPearsonCorrelationWithWeights(self):
  computations = (
      squared_pearson_correlation.SquaredPearsonCorrelation().computations(
          example_weighted=True))
  metric = computations[0]

  example1 = {
      'labels': np.array([1.0]),
      'predictions': np.array([1.0]),
      'example_weights': np.array([1.0]),
  }
  example2 = {
      'labels': np.array([4.0]),
      'predictions': np.array([2.0]),
      'example_weights': np.array([2.0]),
  }
  example3 = {
      'labels': np.array([3.0]),
      'predictions': np.array([3.0]),
      'example_weights': np.array([3.0]),
  }
  example4 = {
      'labels': np.array([3.0]),
      'predictions': np.array([4.0]),
      'example_weights': np.array([4.0]),
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example1, example2, example3, example4])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeMetric' >> beam.CombinePerKey(metric.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        key = metric.keys[0]
        # Weighted examples (weight: prediction, label):
        #   1: prediction = 1, label = 1
        #   2: prediction = 2, label = 4
        #   3: prediction = 3, label = 3
        #   4: prediction = 4, label = 3
        #
        # pred_x_labels = 1x1x1 + 2x2x4 + 3x3x3 + 4x4x3 = 92
        # labels = 1x1 + 2x4 + 3x3 + 4x3 = 30
        # preds = 1x1 + 2x2 + 3x3 + 4x4 = 30
        # sq_labels = 1x1x1 + 2x4x4 + 3x3x3 + 4x3x3 = 96
        # sq_preds = 1x1x1 + 2x2x2 + 3x3x3 + 4x4x4 = 100
        # examples = 1 + 2 + 3 + 4 = 10
        #
        # r^2 = (92 - 30 * 30 / 10)^2 /
        #       ((100 - 30^2 / 10) * (96 - 30^2 / 10))
        #     = 4 / (10 * 6) = 0.06667
        self.assertDictElementsAlmostEqual(
            got_metrics, {key: 0.06667}, places=5)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
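# The comment arithmetic in the weighted test above can be checked directly.
# A standalone NumPy sketch (illustrative only, not part of the test suite)
# that recomputes the weighted squared Pearson correlation from the same
# sufficient statistics:
import numpy as np

w = np.array([1.0, 2.0, 3.0, 4.0])       # example_weights
preds = np.array([1.0, 2.0, 3.0, 4.0])   # predictions
labels = np.array([1.0, 4.0, 3.0, 3.0])  # labels

n = w.sum()                       # examples      = 10
sxy = np.sum(w * preds * labels)  # pred_x_labels = 92
sx = np.sum(w * preds)            # preds         = 30
sy = np.sum(w * labels)           # labels        = 30
sxx = np.sum(w * preds ** 2)      # sq_preds      = 100
syy = np.sum(w * labels ** 2)     # sq_labels     = 96

r2 = (sxy - sx * sy / n) ** 2 / ((sxx - sx ** 2 / n) * (syy - sy ** 2 / n))
print(round(r2, 5))  # 0.06667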
def expand(self, pcoll):
  p = pcoll.pipeline
  temp_location = p.options.view_as(GoogleCloudOptions).temp_location

  empty_pc = p | "ImpulseEmptyPC" >> beam.Create([])
  singleton_pc = p | "ImpulseSingleElementPC" >> beam.Create([None])

  load_job_name_pcv = pvalue.AsSingleton(
      singleton_pc | beam.Map(lambda _: _generate_load_job_name()))

  file_prefix_pcv = pvalue.AsSingleton(
      singleton_pc
      | "GenerateFilePrefix" >> beam.Map(
          file_prefix_generator(
              self._validate, self._custom_gcs_temp_location, temp_location)))

  destination_data_kv_pc = (
      pcoll
      | "RewindowIntoGlobal" >> self._window_fn()
      | "AppendDestination" >> beam.ParDo(
          bigquery_tools.AppendDestinationsFn(self.destination),
          *self.table_side_inputs))

  all_destination_file_pairs_pc = self._write_files(
      destination_data_kv_pc, file_prefix_pcv)

  grouped_files_pc = (
      all_destination_file_pairs_pc
      | "GroupFilesByTableDestinations" >> beam.GroupByKey())

  partitions = (
      grouped_files_pc
      | beam.ParDo(
          PartitionFiles(self.max_partition_size,
                         self.max_files_per_partition)).with_outputs(
                             PartitionFiles.MULTIPLE_PARTITIONS_TAG,
                             PartitionFiles.SINGLE_PARTITION_TAG))

  multiple_partitions_per_destination_pc = partitions[
      PartitionFiles.MULTIPLE_PARTITIONS_TAG]
  single_partition_per_destination_pc = partitions[
      PartitionFiles.SINGLE_PARTITION_TAG]

  # When using dynamic destinations, elements with both single as well as
  # multiple partitions are loaded into BigQuery using temporary tables to
  # ensure atomicity.
  if self.dynamic_destinations:
    all_partitions = (
        (multiple_partitions_per_destination_pc,
         single_partition_per_destination_pc)
        | "FlattenPartitions" >> beam.Flatten())
    destination_load_job_ids_pc, destination_copy_job_ids_pc = (
        self._load_data(
            all_partitions, empty_pc, load_job_name_pcv, singleton_pc))
  else:
    destination_load_job_ids_pc, destination_copy_job_ids_pc = (
        self._load_data(
            multiple_partitions_per_destination_pc,
            single_partition_per_destination_pc, load_job_name_pcv,
            singleton_pc))

  return {
      self.DESTINATION_JOBID_PAIRS: destination_load_job_ids_pc,
      self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
      self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
  }
def test_big_query_write_new_types(self):
  table_name = 'python_new_types_table'
  table_id = '{}.{}'.format(self.dataset_id, table_name)

  row_data = {
      'float': 0.33,
      'numeric': Decimal('10'),
      'bytes': base64.b64encode(b'\xab\xac').decode('utf-8'),
      'date': '3000-12-31',
      'time': '23:59:59',
      'datetime': '2018-12-31T12:44:31',
      'timestamp': '2018-12-31 12:44:31.744957 UTC',
      'geo': 'POINT(30 10)'
  }

  input_data = [row_data]
  # add rows with only one key value pair and None values for all other keys
  for key, value in row_data.items():
    input_data.append({key: value})

  table_schema = {
      "fields": [{
          "name": "float", "type": "FLOAT"
      }, {
          "name": "numeric", "type": "NUMERIC"
      }, {
          "name": "bytes", "type": "BYTES"
      }, {
          "name": "date", "type": "DATE"
      }, {
          "name": "time", "type": "TIME"
      }, {
          "name": "datetime", "type": "DATETIME"
      }, {
          "name": "timestamp", "type": "TIMESTAMP"
      }, {
          "name": "geo", "type": "GEOGRAPHY"
      }]
  }

  expected_row = (
      0.33,
      Decimal('10'),
      b'\xab\xac',
      datetime.date(3000, 12, 31),
      datetime.time(23, 59, 59),
      datetime.datetime(2018, 12, 31, 12, 44, 31),
      datetime.datetime(2018, 12, 31, 12, 44, 31, 744957, tzinfo=pytz.utc),
      'POINT(30 10)',
  )

  expected_data = [expected_row]
  # add rows with only one key value pair and None values for all other keys
  for i, value in enumerate(expected_row):
    row = [None] * len(expected_row)
    row[i] = value
    expected_data.append(tuple(row))

  pipeline_verifiers = [
      BigqueryFullResultMatcher(
          project=self.project,
          query='SELECT float, numeric, bytes, date, time, datetime, '
          'timestamp, geo FROM %s' % table_id,
          data=expected_data)
  ]

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=hc.all_of(*pipeline_verifiers))

  with beam.Pipeline(argv=args) as p:
    # pylint: disable=expression-not-assigned
    (p
     | 'create' >> beam.Create(input_data)
     | 'write' >> beam.io.WriteToBigQuery(
         table_id,
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))
def test_big_query_write_without_schema(self):
  table_name = 'python_no_schema_table'
  self.create_table(table_name)
  table_id = '{}.{}'.format(self.dataset_id, table_name)

  input_data = [{
      'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999'
  }, {
      'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00'
  }, {
      'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd',
      'date': '3000-12-31',
      'time': '23:59:59'
  }, {
      'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00'
  }]

  # bigquery io expects bytes to be base64 encoded values
  for row in input_data:
    row['bytes'] = base64.b64encode(row['bytes'])

  pipeline_verifiers = [
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT bytes, date, time FROM %s" % table_id,
          data=[(
              b'xyw',
              datetime.date(2011, 1, 1),
              datetime.time(23, 59, 59, 999999),
          ), (
              b'abc',
              datetime.date(2000, 1, 1),
              datetime.time(0, 0, 0),
          ), (
              b'\xe4\xbd\xa0\xe5\xa5\xbd',
              datetime.date(3000, 12, 31),
              datetime.time(23, 59, 59),
          ), (
              b'\xab\xac\xad',
              datetime.date(2000, 1, 1),
              datetime.time(0, 0, 0),
          )])
  ]

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=hc.all_of(*pipeline_verifiers))

  with beam.Pipeline(argv=args) as p:
    # pylint: disable=expression-not-assigned
    (p
     | 'create' >> beam.Create(input_data)
     | 'write' >> beam.io.WriteToBigQuery(
         table_id,
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
         temp_file_format=FileFormat.JSON))
def _build_pcollection(self, pipeline, filepaths, language):
  """Build PCollection of examples in the raw (text) form."""

  def _extract_content(filepath):
    """Extracts article content from a single WikiMedia XML file."""
    logging.info("generating examples from = %s", filepath)
    with open(filepath, "rb") as f:
      f = bz2.BZ2File(filename=f)
      if six.PY3:
        # Workaround due to:
        # https://github.com/tensorflow/tensorflow/issues/33563
        utf_f = codecs.getreader("utf-8")(f)
      else:
        utf_f = f

      # To clear root, to free-up more memory than just `elem.clear()`.
      context = etree.iterparse(utf_f, events=("end",))
      context = iter(context)
      unused_event, root = next(context)
      for unused_event, elem in context:
        if not elem.tag.endswith("page"):
          continue
        namespace = elem.tag[:-4]
        title = elem.find("./{0}title".format(namespace)).text
        ns = elem.find("./{0}ns".format(namespace)).text
        id_ = elem.find("./{0}id".format(namespace)).text

        # Filter pages that are not in the "main" namespace.
        if ns != "0":
          root.clear()
          continue

        raw_content = elem.find(
            "./{0}revision/{0}text".format(namespace)).text
        root.clear()

        # Filter redirects.
        if raw_content is None or raw_content.lower().startswith("#redirect"):
          beam.metrics.Metrics.counter(language, "filtered-redirects").inc()
          continue

        beam.metrics.Metrics.counter(language, "extracted-examples").inc()
        yield (id_, title, raw_content)

  def _clean_content(inputs):
    """Cleans raw wikicode to extract text."""
    id_, title, raw_content = inputs
    try:
      text = _parse_and_clean_wikicode(raw_content)
    except mwparserfromhell.parser.ParserError as e:
      beam.metrics.Metrics.counter(language, "parser-error").inc()
      logging.error("mwparserfromhell ParseError: %s", e)
      return

    if not text:
      beam.metrics.Metrics.counter(language, "empty-clean-examples").inc()
      return

    beam.metrics.Metrics.counter(language, "cleaned-examples").inc()
    yield id_, {"title": title, "text": text}

  return (pipeline
          | beam.Create(filepaths)
          | beam.FlatMap(_extract_content)
          | beam.transforms.Reshuffle()
          | beam.FlatMap(_clean_content))
def test_big_query_write(self):
  table_name = 'python_write_table'
  table_id = '{}.{}'.format(self.dataset_id, table_name)

  input_data = [
      {'number': 1, 'str': 'abc'},
      {'number': 2, 'str': 'def'},
      {'number': 3, 'str': u'你好'},
      {'number': 4, 'str': u'привет'},
  ]
  table_schema = {
      "fields": [{
          "name": "number", "type": "INTEGER"
      }, {
          "name": "str", "type": "STRING"
      }]
  }

  pipeline_verifiers = [
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT number, str FROM %s" % table_id,
          data=[
              (1, 'abc'),
              (2, 'def'),
              (3, u'你好'),
              (4, u'привет'),
          ])
  ]

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=hc.all_of(*pipeline_verifiers))

  with beam.Pipeline(argv=args) as p:
    # pylint: disable=expression-not-assigned
    (p
     | 'create' >> beam.Create(input_data)
     | 'write' >> beam.io.WriteToBigQuery(
         table_id,
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# beam-playground:
#   name: HelloBeam
#   description: Task from katas to create a simple pipeline that takes a
#     hardcoded input element "Hello Beam".
#   multifile: false
#   context_line: 30
#   categories:
#     - Testing
#     - Quickstart

import apache_beam as beam

from log_elements import LogElements

with beam.Pipeline() as p:

  (p | beam.Create(['Hello Beam'])
     | LogElements())
def testBatchSizeLimit(self):
  temp_export_dir = self._getExportDir()
  _, export_dir = (
      batch_size_limited_classifier.simple_batch_size_limited_classifier(
          None, temp_export_dir))
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
  eval_config = config.EvalConfig(model_specs=[config.ModelSpec()])
  schema = text_format.Parse(
      """
      feature {
        name: "classes"
        type: BYTES
      }
      feature {
        name: "scores"
        type: FLOAT
      }
      feature {
        name: "labels"
        type: BYTES
      }
      """, schema_pb2.Schema())
  tfx_io = test_util.InMemoryTFExampleRecord(
      schema=schema, raw_record_column_name=constants.BATCHED_INPUT_KEY)
  tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
      arrow_schema=tfx_io.ArrowSchema(),
      tensor_representations=tfx_io.TensorRepresentations())
  input_extractor = batched_input_extractor.BatchedInputExtractor(eval_config)
  predict_extractor = batched_predict_extractor_v2.BatchedPredictExtractor(
      eval_config=eval_config,
      eval_shared_model=eval_shared_model,
      tensor_adapter_config=tensor_adapter_config)

  examples = []
  for _ in range(4):
    examples.append(
        self._makeExample(classes='first', scores=0.0, labels='third'))

  with beam.Pipeline() as pipeline:
    predict_extracts = (
        pipeline
        | 'Create' >> beam.Create(
            [e.SerializeToString() for e in examples], reshuffle=False)
        | 'BatchExamples' >> tfx_io.BeamSource(batch_size=1)
        | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
        | input_extractor.stage_name >> input_extractor.ptransform
        | predict_extractor.stage_name >> predict_extractor.ptransform)

    def check_result(got):
      try:
        self.assertLen(got, 4)
        # We can't verify the actual predictions, but we can verify the keys.
        for item in got:
          self.assertIn(constants.BATCHED_PREDICTIONS_KEY, item)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(predict_extracts, check_result, label='result')
def testBatchSizeLimitWithKerasModel(self):
  input1 = tf.keras.layers.Input(shape=(1,), batch_size=1, name='input1')
  input2 = tf.keras.layers.Input(shape=(1,), batch_size=1, name='input2')

  inputs = [input1, input2]
  input_layer = tf.keras.layers.concatenate(inputs)

  def add_1(tensor):
    return tf.add_n([tensor, tf.constant(1.0, shape=(1, 2))])

  assert_layer = tf.keras.layers.Lambda(add_1)(input_layer)

  model = tf.keras.models.Model(inputs, assert_layer)
  model.compile(
      optimizer=tf.keras.optimizers.Adam(lr=.001),
      loss=tf.keras.losses.binary_crossentropy,
      metrics=['accuracy'])

  export_dir = self._getExportDir()
  model.save(export_dir, save_format='tf')

  eval_config = config.EvalConfig(model_specs=[config.ModelSpec()])
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
  schema = text_format.Parse(
      """
      tensor_representation_group {
        key: ""
        value {
          tensor_representation {
            key: "input1"
            value {
              dense_tensor {
                column_name: "input1"
                shape { dim { size: 1 } }
              }
            }
          }
          tensor_representation {
            key: "input2"
            value {
              dense_tensor {
                column_name: "input2"
                shape { dim { size: 1 } }
              }
            }
          }
        }
      }
      feature {
        name: "input1"
        type: FLOAT
      }
      feature {
        name: "input2"
        type: FLOAT
      }
      """, schema_pb2.Schema())
  tfx_io = test_util.InMemoryTFExampleRecord(
      schema=schema, raw_record_column_name=constants.BATCHED_INPUT_KEY)
  tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
      arrow_schema=tfx_io.ArrowSchema(),
      tensor_representations=tfx_io.TensorRepresentations())
  input_extractor = batched_input_extractor.BatchedInputExtractor(eval_config)
  predict_extractor = batched_predict_extractor_v2.BatchedPredictExtractor(
      eval_config=eval_config,
      eval_shared_model=eval_shared_model,
      tensor_adapter_config=tensor_adapter_config)

  examples = []
  for _ in range(4):
    examples.append(self._makeExample(input1=0.0, input2=1.0))

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    predict_extracts = (
        pipeline
        | 'Create' >> beam.Create(
            [e.SerializeToString() for e in examples], reshuffle=False)
        | 'BatchExamples' >> tfx_io.BeamSource(batch_size=1)
        | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
        | input_extractor.stage_name >> input_extractor.ptransform
        | predict_extractor.stage_name >> predict_extractor.ptransform)
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 4)
        # We can't verify the actual predictions, but we can verify the keys.
        for item in got:
          self.assertIn(constants.BATCHED_PREDICTIONS_KEY, item)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(predict_extracts, check_result, label='result')
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import apache_beam as beam

from log_elements import LogElements

with beam.Pipeline() as p:

  (p | beam.Create(range(1, 11))
     | beam.combiners.Top.Largest(2)
     | LogElements())
def test_big_query_write_temp_table_append_schema_update(self):
  """
  Test that schema update options are respected when appending to an existing
  table via temporary tables.

  _MAXIMUM_SOURCE_URIS and max_file_size are both set to 1 to force multiple
  load jobs and usage of temporary tables.
  """
  table_name = 'python_append_schema_update'
  self.create_table(table_name)
  table_id = '{}.{}'.format(self.dataset_id, table_name)

  input_data = [{
      "int64": num,
      "bool": True,
      "nested_field": {
          "fruit": "Apple"
      }
  } for num in range(1, 3)]

  table_schema = {
      "fields": [{
          "name": "int64", "type": "INT64"
      }, {
          "name": "bool", "type": "BOOL"
      }, {
          "name": "nested_field",
          "type": "RECORD",
          "mode": "REPEATED",
          "fields": [
              {
                  "name": "fruit", "type": "STRING", "mode": "NULLABLE"
              },
          ]
      }]
  }

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=BigqueryFullResultMatcher(
          project=self.project,
          query="""
              SELECT bytes, date, time, int64, bool, fruit
              FROM %s,
              UNNEST(nested_field) as nested_field
              ORDER BY int64
              """ % table_id,
          data=[(None, None, None, num, True, "Apple")
                for num in range(1, 3)]))

  with beam.Pipeline(argv=args) as p:
    # pylint: disable=expression-not-assigned
    (p
     | 'create' >> beam.Create(input_data)
     | 'write' >> beam.io.WriteToBigQuery(
         table_id,
         schema=table_schema,
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
         max_file_size=1,  # bytes
         method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
         additional_bq_parameters={
             'schemaUpdateOptions': ['ALLOW_FIELD_ADDITION']
         }))
def testSquaredPearsonCorrelationWithoutWeights(self):
  computations = (
      squared_pearson_correlation.SquaredPearsonCorrelation().computations())
  metric = computations[0]

  example1 = {
      'labels': np.array([2.0]),
      'predictions': np.array([1.0]),
      'example_weights': np.array([1.0]),
  }
  example2 = {
      'labels': np.array([1.0]),
      'predictions': np.array([2.0]),
      'example_weights': np.array([1.0]),
  }
  example3 = {
      'labels': np.array([2.0]),
      'predictions': np.array([3.0]),
      'example_weights': np.array([1.0]),
  }
  example4 = {
      'labels': np.array([3.0]),
      'predictions': np.array([4.0]),
      'example_weights': np.array([1.0]),
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example1, example2, example3, example4])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeMetric' >> beam.CombinePerKey(metric.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        key = metric.keys[0]
        # 1: prediction = 1, label = 2
        # 2: prediction = 2, label = 1
        # 3: prediction = 3, label = 2
        # 4: prediction = 4, label = 3
        #
        # pred_x_labels = 2 + 2 + 6 + 12 = 22
        # labels = 2 + 1 + 2 + 3 = 8
        # preds = 1 + 2 + 3 + 4 = 10
        # sq_labels = 4 + 1 + 4 + 9 = 18
        # sq_preds = 1 + 4 + 9 + 16 = 30
        # examples = 4
        #
        # r^2 = (22 - 8 * 10 / 4)^2 /
        #       ((30 - 10^2 / 4) * (18 - 8^2 / 4))
        #     = 4 / (5 * 2) = 0.4
        self.assertDictElementsAlmostEqual(got_metrics, {key: 0.4}, places=5)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
def run(self, pipeline: pipeline_pb2.Pipeline) -> None:
  """Deploys given logical pipeline on Beam.

  Args:
    pipeline: Logical pipeline in IR format.
  """
  # For CLI, while creating or updating pipeline, pipeline_args are extracted
  # and hence we avoid deploying the pipeline.
  if 'TFX_JSON_EXPORT_PIPELINE_ARGS_PATH' in os.environ:
    return

  # TODO(b/163003901): Support beam DAG runner args through IR.
  deployment_config = self._extract_deployment_config(pipeline)
  connection_config = deployment_config.metadata_connection_config
  mlmd_connection = metadata.Metadata(connection_config=connection_config)

  with telemetry_utils.scoped_labels(
      {telemetry_utils.LABEL_TFX_RUNNER: 'beam'}):
    with beam.Pipeline() as p:
      # Uses for triggering the component DoFns.
      root = p | 'CreateRoot' >> beam.Create([None])

      # Stores mapping of component to its signal.
      signal_map = {}

      # pipeline.components are in topological order.
      for node in pipeline.nodes:
        # TODO(b/160882349): Support subpipeline
        pipeline_node = node.pipeline_node
        component_id = pipeline_node.node_info.id
        executor_spec = self._extract_executor_spec(
            deployment_config, component_id)
        custom_driver_spec = self._extract_custom_driver_spec(
            deployment_config, component_id)

        # Signals from upstream components.
        signals_to_wait = []
        for upstream_node in pipeline_node.upstream_nodes:
          assert upstream_node in signal_map, (
              'Component is not in topological order')
          signals_to_wait.append(signal_map[upstream_node])
        logging.info('Component %s depends on %s.', component_id,
                     [s.producer.full_label for s in signals_to_wait])

        # Each signal is an empty PCollection. AsIter ensures component will
        # be triggered after upstream components are finished.
        # LINT.IfChange
        signal_map[component_id] = (
            root
            | 'Run[%s]' % component_id >> beam.ParDo(
                _PipelineNodeAsDoFn(
                    pipeline_node=pipeline_node,
                    mlmd_connection=mlmd_connection,
                    pipeline_info=pipeline.pipeline_info,
                    pipeline_runtime_spec=pipeline.runtime_spec,
                    executor_spec=executor_spec,
                    custom_driver_spec=custom_driver_spec),
                *[beam.pvalue.AsIter(s) for s in signals_to_wait]))
        # LINT.ThenChange(../beam/beam_dag_runner.py)
        logging.info('Component %s is scheduled.', component_id)
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# beam-playground:
#   name: CompositeTransform
#   description: Task from katas to implement a composite transform
#     "ExtractAndMultiplyNumbers" that extracts numbers from comma separated
#     line and then multiplies each number by 10.
#   multifile: false
#   pipeline_options:
#   categories:
#     - Flatten

import apache_beam as beam

from log_elements import LogElements


class ExtractAndMultiplyNumbers(beam.PTransform):

  def expand(self, pcoll):
    return (pcoll
            | beam.FlatMap(lambda line: map(int, line.split(',')))
            | beam.Map(lambda num: num * 10))


with beam.Pipeline() as p:

  (p | beam.Create(['1,2,3,4,5', '6,7,8,9,10'])
     | ExtractAndMultiplyNumbers()
     | LogElements())
def testMetricsWithoutWeights(self, metric_name, expected_value):
  # TODO(b/151636380): remove when CL/299961405 is propagated through Kokoro.
  if metric_name == 'specificity_at_sensitivity':
    fix_present = hasattr(tf.keras.metrics.SpecificityAtSensitivity,
                          '_find_max_under_constraint')
    if not fix_present:
      expected_value = 0.5
  computations = tf_metric_wrapper.tf_metric_computations(
      [self._tf_metric_by_name(metric_name)])
  histogram = computations[0]
  matrix = computations[1]
  metric = computations[2]

  example1 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.0]),
      'example_weights': np.array([1.0]),
  }
  example2 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.5]),
      'example_weights': np.array([1.0]),
  }
  example3 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.3]),
      'example_weights': np.array([1.0]),
  }
  example4 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.9]),
      'example_weights': np.array([1.0]),
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example1, example2, example3, example4])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
        | 'ComputeConfusionMatrix' >> beam.Map(
            lambda x: (x[0], matrix.result(x[1])))  # pyformat: disable
        | 'ComputeMetric' >> beam.Map(
            lambda x: (x[0], metric.result(x[1]))))  # pyformat: disable
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        key = metric_types.MetricKey(name=metric_name)
        self.assertDictElementsAlmostEqual(
            got_metrics, {key: expected_value}, places=5)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
def test_single_phase_mixed_analyzer_run_once(self):
  span_0_key = 'span-0'
  span_1_key = 'span-1'

  def preprocessing_fn(inputs):
    integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])

    _ = tft.bucketize(inputs['x'], 2, name='bucketize')

    return {
        'integerized_s': integerized_s,
        'x_min': tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
        'x_mean': tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
        'y_min': tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
        'y_mean': tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
    }

  # Run AnalyzeAndTransform on some input data and compare with expected
  # output.
  input_data = [{'x': 12, 'y': 1, 's': 'c'}, {'x': 10, 'y': 1, 's': 'c'}]
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'x': tf.FixedLenFeature([], tf.float32),
          'y': tf.FixedLenFeature([], tf.float32),
          's': tf.FixedLenFeature([], tf.string),
      }))
  input_data_dict = {
      span_0_key: [{
          'x': -2,
          'y': 1,
          's': 'b',
      }, {
          'x': 4,
          'y': -4,
          's': 'b',
      }],
      span_1_key: input_data,
  }

  with beam_impl.Context(temp_dir=self.get_temp_dir()):
    with beam.Pipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))

      # TODO(b/37788560): Get these names programmatically.
      cache_dict = {
          span_0_key: {
              '__v0__CacheableCombineAccumulate--x_1-mean_and_var--':
                  p | 'CreateA' >> beam.Create([b'[2.0, 1.0, 9.0]']),
              '__v0__CacheableCombineAccumulate--x-x--':
                  p | 'CreateB' >> beam.Create([b'[2.0, 4.0]']),
              '__v0__CacheableCombineAccumulate--y_1-mean_and_var--':
                  p | 'CreateC' >> beam.Create([b'[2.0, -1.5, 6.25]']),
              '__v0__CacheableCombineAccumulate--y-y--':
                  p | 'CreateD' >> beam.Create([b'[4.0, 1.0]']),
          },
          span_1_key: {},
      }

      transform_fn, cache_output = (
          (flat_data, input_data_dict, cache_dict, input_metadata)
          | 'Analyze' >> (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
      _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
          self._cache_dir)

      transformed_dataset = (((input_data_dict[span_1_key], input_metadata),
                              transform_fn)
                             | 'Transform' >> beam_impl.TransformDataset())

      dot_string = nodes.get_dot_graph(
          [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
      self.WriteRenderedDotFile(dot_string)

      transformed_data, unused_transformed_metadata = transformed_dataset

      expected_transformed = [
          {
              'x_mean': 6.0,
              'x_min': -2.0,
              'y_mean': -0.25,
              'y_min': -4.0,
              'integerized_s': 0,
          },
          {
              'x_mean': 6.0,
              'x_min': -2.0,
              'y_mean': -0.25,
              'y_min': -4.0,
              'integerized_s': 0,
          },
      ]
      beam_test_util.assert_that(
          transformed_data, beam_test_util.equal_to(expected_transformed))

      transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
      _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)
def testMultiClassMetricsUsingConfusionMatrix(self, metric_name, top_k,
                                              expected_value):
  computations = tf_metric_wrapper.tf_metric_computations(
      [self._tf_metric_by_name(metric_name)],
      sub_key=metric_types.SubKey(top_k=top_k))
  histogram = computations[0]
  matrix = computations[1]
  metric = computations[2]

  # top_k = 2
  # TP = 0.5*0 + 0.7*1 + 0.9*1 + 0.3*0 = 1.6
  # FP = 0.5*2 + 0.7*1 + 0.9*1 + 0.3*2 = 3.2
  # FN = 0.5*1 + 0.7*0 + 0.9*0 + 0.3*1 = 0.8
  #
  # top_k = 3
  # TP = 0.5*0 + 0.7*1 + 0.9*1 + 0.3*1 = 1.9
  # FP = 0.5*3 + 0.7*2 + 0.9*2 + 0.3*2 = 5.3
  # FN = 0.5*1 + 0.7*0 + 0.9*0 + 0.3*0 = 0.5
  example1 = {
      'labels': np.array([2]),
      'predictions': np.array([0.1, 0.2, 0.1, 0.25, 0.35]),
      'example_weights': np.array([0.5]),
  }
  example2 = {
      'labels': np.array([1]),
      'predictions': np.array([0.2, 0.3, 0.05, 0.15, 0.3]),
      'example_weights': np.array([0.7]),
  }
  example3 = {
      'labels': np.array([3]),
      'predictions': np.array([0.01, 0.2, 0.09, 0.5, 0.2]),
      'example_weights': np.array([0.9]),
  }
  example4 = {
      'labels': np.array([1]),
      'predictions': np.array([0.3, 0.2, 0.05, 0.4, 0.05]),
      'example_weights': np.array([0.3]),
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example1, example2, example3, example4])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
        | 'ComputeConfusionMatrix' >> beam.Map(
            lambda x: (x[0], matrix.result(x[1])))  # pyformat: disable
        | 'ComputeMetric' >> beam.Map(
            lambda x: (x[0], metric.result(x[1]))))  # pyformat: disable
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        key = metric_types.MetricKey(
            name=metric_name, sub_key=metric_types.SubKey(top_k=top_k))
        self.assertDictElementsAlmostEqual(
            got_metrics, {key: expected_value}, places=5)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
def test_caching_vocab_for_integer_categorical(self):
  span_0_key = 'span-0'
  span_1_key = 'span-1'

  def preprocessing_fn(inputs):
    return {
        'x_vocab': tft.compute_and_apply_vocabulary(
            inputs['x'], frequency_threshold=2)
    }

  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'x': tf.FixedLenFeature([], tf.int64),
      }))
  input_data_dict = {
      span_0_key: [{
          'x': -2,
      }, {
          'x': -4,
      }, {
          'x': -1,
      }, {
          'x': 4,
      }],
      span_1_key: [{
          'x': -2,
      }, {
          'x': -1,
      }, {
          'x': 6,
      }, {
          'x': 7,
      }],
  }
  expected_transformed_data = [{
      'x_vocab': 0,
  }, {
      'x_vocab': 1,
  }, {
      'x_vocab': -1,
  }, {
      'x_vocab': -1,
  }]

  with beam_impl.Context(temp_dir=self.get_temp_dir()):
    with beam.Pipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))

      # TODO(b/37788560): Get these names programmatically.
      cache_dict = {
          span_0_key: {
              '__v0__VocabularyAccumulate--compute_and_apply_vocabulary-vocabulary--':
                  p | 'CreateB' >> beam.Create(
                      [b'[-2, 2]', b'[-4, 1]', b'[-1, 1]', b'[4, 1]']),
          },
          span_1_key: {},
      }

      transform_fn, cache_output = (
          (flat_data, input_data_dict, cache_dict, input_metadata)
          | 'Analyze' >> (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
      _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
          self._cache_dir)

      transformed_dataset = (((input_data_dict[span_1_key], input_metadata),
                              transform_fn)
                             | 'Transform' >> beam_impl.TransformDataset())

      transformed_data, _ = transformed_dataset

      beam_test_util.assert_that(
          transformed_data,
          beam_test_util.equal_to(expected_transformed_data),
          label='first')
def testCustomTFMetricWithPadding(self):
  computation = tf_metric_wrapper.tf_metric_computations([
      _CustomMetric(name='custom_label', update_y_pred=False),
      _CustomMetric(name='custom_pred', update_y_pred=True)
  ])[0]

  # label_sum = (1 - 1 - 1 - 1) * 1.0 +
  #             (1 + 2 - 1.0 - 1) * 1.0 +
  #             (1 + 2 + 3 - 1) * 2.0
  #           = 9.0
  #
  # pred_sum = (0.1 + 0.2 + 0.3 + 0.0) * 1.0 +
  #            (0.1 + 0.2 + 0.0 - 1.0) * 1.0 +
  #            (0.1 + 0.2 + 0.3 - 1.0) * 2.0
  #          = -0.9
  #
  # weights_total = (1.0 * 4 + 1.0 * 4 + 2.0 * 4) = 16.0
  example1 = {
      'labels': np.array([1], dtype=np.int64),
      'predictions': np.array([0.1, 0.2, 0.3, 0.0]),
      'example_weights': np.array([1.0])
  }
  example2 = {
      'labels': np.array([1, 2], dtype=np.int64),
      'predictions': np.array([0.1, 0.2, 0.0]),
      'example_weights': np.array([1.0])
  }
  example3 = {
      'labels': np.array([1, 2, 3], dtype=np.int64),
      'predictions': np.array([0.1, 0.2, 0.3]),
      'example_weights': np.array([2.0])
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example1, example2, example3])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'Combine' >> beam.CombinePerKey(computation.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        custom_label_key = metric_types.MetricKey(name='custom_label')
        custom_pred_key = metric_types.MetricKey(name='custom_pred')
        self.assertDictElementsAlmostEqual(got_metrics, {
            custom_label_key: 9.0 / 16.0,
            custom_pred_key: -0.9 / 16.0
        })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
def TEST_Subgraph():
  op_to_filename = {
      'main': './graphdefs/main_graph.pb',
      'remote_op_a': './graphdefs/graph_a.pb',
      'remote_op_b': './graphdefs/graph_b.pb',
  }
  op_to_outputs = {
      'main': ['Add'],
      'remote_op_b': ['Add_1'],
      'remote_op_a': ['embedding_lookup/Identity'],
  }
  partition = GraphPartition(op_to_filename, op_to_outputs)
  partition.partition()

  feed_dicts_graph_b = [{
      'remote_op_b': {
          'import/ids_b1:0': 3,
          'import/ids_b2:0': 3
      }
  }, {
      'remote_op_b': {
          'import/ids_b1:0': 10,
          'import/ids_b2:0': 10
      }
  }]

  graph_name = 'remote_op_b'
  output_names = ['import/FloorMod:0']
  bundle = partition.op_to_execution_bundles[graph_name][0]

  options = beam.options.pipeline_options.PipelineOptions()
  # The `with` block runs the pipeline and waits for completion on exit, so
  # an explicit run()/wait_until_finish() inside it would run it twice.
  with beam.Pipeline(options=options) as p:
    inputs = p | 'read' >> beam.Create(feed_dicts_graph_b)
    outputs = inputs | 'Graph' >> beam.ParDo(ExecuteOneSubgraph(), bundle,
                                             graph_name)

    class GetOutputs(beam.DoFn):

      def process(self, element, graph_name, output_names):
        result = [
            element[graph_name][output_name] for output_name in output_names
        ]
        yield result

    outputs = outputs | beam.ParDo(GetOutputs(), graph_name, output_names)
    outputs | 'output' >> beam.io.WriteToText('./beam_experiment')

  result_original_model = []
  for feed_dict in feed_dicts_graph_b:
    graph = partition.op_to_graph[graph_name]
    with tf.compat.v1.Session(graph=graph) as sess:
      result_original_model.append(
          sess.run(output_names, feed_dict[graph_name]))

  import subprocess
  result_beam_pipeline = subprocess.check_output(
      ['cat', './beam_experiment-00000-of-00001'])

  print('Results from the original model:', result_original_model)
  print('\nResults from the beam pipeline:', result_beam_pipeline)
def testWithMixedMetrics(self):
  computations = tf_metric_wrapper.tf_metric_computations([
      tf.keras.metrics.AUC(name='auc'),
      tf.keras.losses.BinaryCrossentropy(name='binary_crossentropy'),
      tf.keras.metrics.MeanSquaredError(name='mse')
  ])

  confusion_histogram = computations[0]
  confusion_matrix = computations[1].result
  confusion_metrics = computations[2].result
  non_confusion_metrics = computations[3]

  example1 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.0]),
      'example_weights': np.array([1.0]),
  }
  example2 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.5]),
      'example_weights': np.array([1.0]),
  }
  example3 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.3]),
      'example_weights': np.array([1.0]),
  }
  example4 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.9]),
      'example_weights': np.array([1.0]),
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    sliced_examples = (
        pipeline
        | 'Create' >> beam.Create([example1, example2, example3, example4])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x)))

    confusion_result = (
        sliced_examples
        | 'ComputeHistogram' >> beam.CombinePerKey(
            confusion_histogram.combiner)
        | 'ComputeConfusionMatrix' >> beam.Map(
            lambda x: (x[0], confusion_matrix(x[1])))  # pyformat: disable
        | 'ComputeMetric' >> beam.Map(
            lambda x: (x[0], confusion_metrics(x[1]))))  # pyformat: disable

    non_confusion_result = (
        sliced_examples
        | 'Combine' >> beam.CombinePerKey(non_confusion_metrics.combiner))
    # pylint: enable=no-value-for-parameter

    def check_confusion_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        auc_key = metric_types.MetricKey(name='auc')
        self.assertDictElementsAlmostEqual(
            got_metrics, {auc_key: 0.75}, places=5)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    def check_non_confusion_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        mse_key = metric_types.MetricKey(name='mse')
        binary_crossentropy_key = metric_types.MetricKey(
            name='binary_crossentropy')
        self.assertDictElementsAlmostEqual(
            got_metrics, {
                mse_key: 0.1875,
                binary_crossentropy_key: 0.0
            },
            places=5)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(
        confusion_result, check_confusion_result, label='confusion')
    util.assert_that(
        non_confusion_result,
        check_non_confusion_result,
        label='non_confusion')
def test_assert_that(self):
  # TODO: figure out a way for runner to parse and raise the
  # underlying exception.
  with self.assertRaises(Exception):
    with self.create_pipeline() as p:
      assert_that(p | beam.Create(['a', 'b']), equal_to(['a']))
def test_row_coder_in_pipeline(self):
  with TestPipeline() as p:
    res = (
        p
        | beam.Create(self.PEOPLE)
        | beam.Filter(lambda person: person.name == "Jon Snow"))
    assert_that(res, equal_to([self.JON_SNOW]))
def expand(self, tensor_pcoll_mapping):
  """Converts a dict of statistics to a transform function.

  Args:
    tensor_pcoll_mapping: A dictionary mapping `Tensor`s to a singleton
        PCollection containing a _TensorValue.

  Returns:
    A single-element PCollection containing the directory name with the
        SavedModel.
  """
  transform_fn = (
      self.pipeline
      | 'CreateTransformFn' >> beam.Create([self._saved_model_dir]))

  if not tensor_pcoll_mapping:
    return transform_fn

  # Convert tensor_value_mapping into a DictPCollectionView so it can be
  # passed as a side input to the beam Map below.
  tensor_value_pairs = []
  for name, pcoll in six.iteritems(tensor_pcoll_mapping):
    tensor_value_pairs.append(
        pcoll
        | 'AddName[%s]' % name >> beam.Map(lambda x, name=name: (name, x)))
  tensor_value_mapping = beam.pvalue.AsDict(
      tensor_value_pairs
      | 'MergeTensorValuePairs' >> beam.Flatten())

  def replace_tensors_with_constant_values(saved_model_dir,
                                           tensor_value_mapping):
    """Replaces specified `Tensor`s with constant values.

    Constants are accepted as Python values; these are automatically
    wrapped in `tf.constant()`.

    This method creates its own temp dir, and is therefore idempotent
    since any retry will use a different temp dir.

    Args:
      saved_model_dir: A SavedModel directory providing a transform graph.
          The MetaGraphDef and signature are selected from the SavedModel
          using keys defined in `../constants.py` ('transform' and
          'transform_signature', respectively).
      tensor_value_mapping: a dict of tensor names to values to use in
          place of those tensors.

    Returns:
      The directory name containing the updated SavedModel.

    Raises:
      RuntimeError: if there is no default graph available to which to
          apply the transform.
    """
    graph = tf.Graph()
    with graph.as_default():
      tensor_replacement_map = {}
      for orig_tensor_name, (value,
                             is_asset) in six.iteritems(tensor_value_mapping):
        new_tensor = tf.constant(value)
        if is_asset:
          # Any newly frozen constant tensors containing filenames must be
          # added to the ASSET_FILENAMES collection.
          graph.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, new_tensor)
        tensor_replacement_map[orig_tensor_name] = new_tensor

      with tf.Session(graph=graph) as session:
        temp_dir = _make_unique_temp_dir(self._base_temp_dir)
        input_tensors, output_tensors = (
            saved_transform_io.partially_apply_saved_transform(
                saved_model_dir, {}, tensor_replacement_map))
        saved_transform_io.write_saved_transform_from_session(
            session, input_tensors, output_tensors, temp_dir)
    return temp_dir

  return (transform_fn
          | 'ReplaceTensorsWithConstantValues' >> beam.Map(
              replace_tensors_with_constant_values,
              tensor_value_mapping=tensor_value_mapping))
def testPredictExtractorWithRegressionModel(self):
  temp_export_dir = self._getExportDir()
  export_dir, _ = (
      fixed_prediction_estimator_extra_fields
      .simple_fixed_prediction_estimator_extra_fields(temp_export_dir, None))

  eval_config = config.EvalConfig(model_specs=[config.ModelSpec()])
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
  schema = text_format.Parse(
      """
      feature {
        name: "prediction"
        type: FLOAT
      }
      feature {
        name: "label"
        type: FLOAT
      }
      feature {
        name: "fixed_int"
        type: INT
      }
      feature {
        name: "fixed_float"
        type: FLOAT
      }
      feature {
        name: "fixed_string"
        type: BYTES
      }
      """, schema_pb2.Schema())
  tfx_io = test_util.InMemoryTFExampleRecord(
      schema=schema, raw_record_column_name=constants.BATCHED_INPUT_KEY)
  tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
      arrow_schema=tfx_io.ArrowSchema(),
      tensor_representations=tfx_io.TensorRepresentations())
  input_extractor = batched_input_extractor.BatchedInputExtractor(eval_config)
  predict_extractor = batched_predict_extractor_v2.BatchedPredictExtractor(
      eval_config=eval_config,
      eval_shared_model=eval_shared_model,
      tensor_adapter_config=tensor_adapter_config)

  examples = [
      self._makeExample(
          prediction=0.2,
          label=1.0,
          fixed_int=1,
          fixed_float=1.0,
          fixed_string='fixed_string1'),
      self._makeExample(
          prediction=0.8,
          label=0.0,
          fixed_int=1,
          fixed_float=1.0,
          fixed_string='fixed_string2'),
      self._makeExample(
          prediction=0.5,
          label=0.0,
          fixed_int=2,
          fixed_float=1.0,
          fixed_string='fixed_string3')
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create(
            [e.SerializeToString() for e in examples], reshuffle=False)
        | 'BatchExamples' >> tfx_io.BeamSource(batch_size=3)
        | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
        | input_extractor.stage_name >> input_extractor.ptransform
        | predict_extractor.stage_name >> predict_extractor.ptransform)
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        self.assertIn(constants.BATCHED_PREDICTIONS_KEY, got[0])
        expected_preds = [0.2, 0.8, 0.5]
        self.assertAlmostEqual(
            got[0][constants.BATCHED_PREDICTIONS_KEY], expected_preds)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')