def test_streaming_wordcount(self):
  """Runs a windowed streaming word count through the InteractiveRunner.

  A TestStream emits two batches of words at watermarks 0 and 20 into
  10-second fixed windows. The test then collects both the raw windowed
  elements and the per-window word counts and compares them against
  hand-built DataFrames.
  """
  class WordExtractingDoFn(beam.DoFn):
    def process(self, element):
      # One output element per whitespace-separated word.
      return element.strip().split()

  # Add the TestStream so that it can be cached.
  ib.options.capturable_sources.add(TestStream)

  p = beam.Pipeline(
      runner=interactive_runner.InteractiveRunner(),
      options=StandardOptions(streaming=True))

  data = (
      p
      | TestStream()
          .advance_watermark_to(0)
          .advance_processing_time(1)
          .add_elements(['to', 'be', 'or', 'not', 'to', 'be'])
          .advance_watermark_to(20)
          .advance_processing_time(1)
          .add_elements(['that', 'is', 'the', 'question'])
      | beam.WindowInto(beam.window.FixedWindows(10)))  # yapf: disable

  counts = (
      data
      | 'split' >> beam.ParDo(WordExtractingDoFn())
      | 'pair_with_one' >> beam.Map(lambda word: (word, 1))
      | 'group' >> beam.GroupByKey()
      | 'count' >> beam.Map(lambda kv: (kv[0], sum(kv[1]))))

  # Watch the local scope for Interactive Beam so that referenced PCollections
  # will be cached. No new locals are introduced above this call on purpose.
  ib.watch(locals())

  # This is normally done in the interactive_utils when a transform is
  # applied but needs an IPython environment. So we manually run this here.
  ie.current_env().track_user_pipelines()

  # Create a fake limiter that cancels the BCJ once the main job receives the
  # expected amount of results.
  class FakeLimiter:
    def __init__(self, p, pcoll):
      self.p = p
      self.pcoll = pcoll

    def is_triggered(self):
      job = ie.current_env().pipeline_result(self.p)
      if not job:
        return False
      try:
        received = job.get(self.pcoll)
      except ValueError:
        return False
      return len(received) >= 10

  # This sets the limiters to stop reading when the test receives 10 elements.
  ie.current_env().options.capture_control.set_limiters_for_test(
      [FakeLimiter(p, data)])

  # This tests that the data was correctly cached.
  raw_pane = PaneInfo(True, True, PaneInfoTiming.UNKNOWN, 0, 0)
  first_batch = ['to', 'be', 'or', 'not', 'to', 'be']
  second_batch = ['that', 'is', 'the', 'question']
  expected_data_rows = (
      [(word, 0, [IntervalWindow(0, 10)], raw_pane)
       for word in first_batch] +
      [(word, 20000000, [IntervalWindow(20, 30)], raw_pane)
       for word in second_batch])
  expected_data_df = pd.DataFrame(
      expected_data_rows,
      columns=[0, 'event_time', 'windows', 'pane_info'])

  data_df = ib.collect(data, include_window_info=True)
  pd.testing.assert_frame_equal(expected_data_df, data_df)

  # This tests that the windowing was passed correctly so that all the data
  # is aggregated also correctly.
  grouped_pane = PaneInfo(True, False, PaneInfoTiming.ON_TIME, 0, 0)
  first_window_counts = [('be', 2), ('not', 1), ('or', 1), ('to', 2)]
  second_window_counts = [
      ('is', 1), ('question', 1), ('that', 1), ('the', 1)
  ]
  expected_count_rows = (
      [(word, total, 9999999, [IntervalWindow(0, 10)], grouped_pane)
       for word, total in first_window_counts] +
      [(word, total, 29999999, [IntervalWindow(20, 30)], grouped_pane)
       for word, total in second_window_counts])
  expected_counts_df = pd.DataFrame(
      expected_count_rows,
      columns=[0, 1, 'event_time', 'windows', 'pane_info'])

  counts_df = ib.collect(counts, include_window_info=True)

  # The group by key has no guarantee of order. So we post-process the DF by
  # sorting so we can test equality.
  ordered = counts_df.sort_values(['event_time', 0], ascending=True)
  sorted_counts_df = ordered.reset_index(drop=True)
  pd.testing.assert_frame_equal(expected_counts_df, sorted_counts_df)
def assertAnalyzeAndTransformResults(self,
                                     input_data,
                                     input_metadata,
                                     preprocessing_fn,
                                     expected_data=None,
                                     expected_metadata=None,
                                     expected_vocab_file_contents=None,
                                     expected_asset_file_contents=None,
                                     test_data=None,
                                     desired_batch_size=None,
                                     beam_pipeline=None,
                                     temp_dir=None):
  """Assert that input data and metadata is transformed as expected.

  This method asserts transformed data and transformed metadata match
  with expected_data and expected_metadata.

  Args:
    input_data: A sequence of dicts whose values are
        either strings, lists of strings, numeric types or a pair of those.
    input_metadata: DatasetMetadata describing input_data.
    preprocessing_fn: A function taking a dict of tensors and returning
        a dict of tensors.
    expected_data: (optional) A dataset with the same type constraints as
        input_data, but representing the output after transformation.
        If supplied, transformed data is asserted to be equal.
    expected_metadata: (optional) DatasetMetadata describing the transformed
        data. If supplied, transformed metadata is asserted to be equal.
    expected_vocab_file_contents: (optional) A dictionary from vocab
        filenames to their expected content as a list of text lines or a
        list of tuples of frequency and text. Values should be the expected
        result of calling f.readlines() on the given asset files.
    expected_asset_file_contents: deprecated. Use
        expected_vocab_file_contents.
    test_data: (optional) If this is provided then instead of calling
        AnalyzeAndTransformDataset with input_data, this function will call
        AnalyzeDataset with input_data and TransformDataset with test_data.
        Note that this is the case even if input_data and test_data are
        equal. test_data should also conform to input_metadata.
    desired_batch_size: (optional) A batch size to batch elements by. If not
        provided, a batch size will be computed automatically.
    beam_pipeline: (optional) A Beam Pipeline to use in this test.
    temp_dir: If set, it is used as output directory, else a new unique
        directory is created.

  Raises:
    AssertionError: if the expected data does not match the results of
        transforming input_data according to preprocessing_fn, or
        (if provided) if the expected metadata does not match.
    ValueError: if expected_vocab_file_contents and
        expected_asset_file_contents are both set.
  """
  if (expected_vocab_file_contents is not None and
      expected_asset_file_contents is not None):
    # The message must name BOTH mutually exclusive arguments; it previously
    # repeated expected_asset_file_contents twice.
    raise ValueError('only one of expected_vocab_file_contents and '
                     'expected_asset_file_contents should be set')
  if expected_asset_file_contents is not None:
    tf.compat.v1.logging.warn(
        'expected_asset_file_contents is deprecated, use '
        'expected_vocab_file_contents')

  # Fold the deprecated argument into the supported one; default to no
  # vocabulary checks when neither was supplied.
  expected_vocab_file_contents = (
      expected_vocab_file_contents or expected_asset_file_contents or {})
  del expected_asset_file_contents

  # Note: we don't separately test AnalyzeDataset and TransformDataset as
  # AnalyzeAndTransformDataset currently simply composes these two
  # transforms.  If in future versions of the code, the implementation
  # differs, we should also run AnalyzeDataset and TransformDataset composed.
  temp_dir = temp_dir or tempfile.mkdtemp(
      prefix=self._testMethodName, dir=self.get_temp_dir())
  with beam_pipeline or self._makeTestPipeline() as pipeline:
    with beam_impl.Context(
        temp_dir=temp_dir, desired_batch_size=desired_batch_size):
      input_data = pipeline | 'CreateInput' >> beam.Create(input_data)
      if test_data is None:
        (transformed_data, transformed_metadata), transform_fn = (
            (input_data, input_metadata)
            | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      else:
        transform_fn = ((input_data, input_metadata)
                        | beam_impl.AnalyzeDataset(preprocessing_fn))
        test_data = pipeline | 'CreateTest' >> beam.Create(test_data)
        transformed_data, transformed_metadata = (
            ((test_data, input_metadata), transform_fn)
            | beam_impl.TransformDataset())

      # Write transform_fn so we can test its assets
      _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

      if expected_data is not None:
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)

        transformed_data_path = os.path.join(temp_dir, 'transformed_data')
        _ = (
            transformed_data
            | beam.Map(transformed_data_coder.encode)
            | beam.io.tfrecordio.WriteToTFRecord(
                transformed_data_path, shard_name_template=''))

  # TODO(ebreck) Log transformed_data somewhere.
  # The records are read back only after the pipeline context has exited.
  if expected_data is not None:
    examples = tf.compat.v1.python_io.tf_record_iterator(
        path=transformed_data_path)
    transformed_data = [transformed_data_coder.decode(x) for x in examples]
    self.assertDataCloseOrEqual(expected_data, transformed_data)

  tf_transform_output = tft.TFTransformOutput(temp_dir)
  if expected_metadata:
    # Make a copy with no annotations.
    transformed_schema = schema_pb2.Schema()
    transformed_schema.CopyFrom(
        tf_transform_output.transformed_metadata.schema)
    for feature in transformed_schema.feature:
      feature.ClearField('annotation')
    self.assertEqual(expected_metadata.schema, transformed_schema)

  for filename, file_contents in six.iteritems(expected_vocab_file_contents):
    full_filename = tf_transform_output.vocabulary_file_by_name(filename)
    self.AssertVocabularyContents(full_filename, file_contents)
def InputsToExtracts(  # pylint: disable=invalid-name
    inputs: beam.pvalue.PCollection):
  """Wraps each serialized input (e.g. an example) in an Extracts dict.

  Every element x of `inputs` becomes `{constants.INPUT_KEY: x}`.
  """
  add_input_key = beam.Map(
      lambda serialized: {constants.INPUT_KEY: serialized})
  return inputs | 'AddInputKey' >> add_input_key
file_name_suffix='.tfrecord.gz')) _ = train_data | 'TransformAndWriteTraining' >> TransformAndWrite( # pylint: disable=no-value-for-parameter 'features_train') _ = eval_data | 'TransformAndWriteEval' >> TransformAndWrite( # pylint: disable=no-value-for-parameter 'features_eval') # TODO(b/35300113) Remember to eventually also save the statistics. # Save files for online and batch prediction. prediction_schema = movielens.make_prediction_schema() prediction_coder = tft_coders.ExampleProtoCoder(prediction_schema) prediction_data = (eval_data | 'EncodePrediction' >> beam.Map(prediction_coder.encode)) _ = (prediction_data | 'EncodePredictionAsB64Json' >> beam.Map(_encode_as_b64_json) | 'WritePredictDataAsText' >> beam.io.WriteToText( os.path.join(args.output_dir, 'features_predict'), file_name_suffix='.txt')) _ = (prediction_data | 'WritePredictDataAsTfRecord' >> beam.io.WriteToTFRecord( os.path.join(args.output_dir, 'features_predict'), file_name_suffix='.tfrecord.gz')) def _encode_as_b64_json(serialized_example): import base64 # pylint: disable=g-import-not-at-top import json # pylint: disable=g-import-not-at-top return json.dumps({'b64': base64.b64encode(serialized_example)})
def test_metrics(self):
  """Checks counter and state-cache metrics written to the metrics file.

  Runs a stateful DoFn that increments a counter over 110 elements spread
  across 10 keys, then scans the file at self.test_metrics_path and
  asserts that every expected metric line appears in it.

  NOTE(review): `streaming` is not defined in this block; it is presumably
  supplied by the enclosing scope -- confirm.
  """
  counter_name = 'elem_counter'
  state_spec = userstate.BagStateSpec('state', VarIntCoder())

  # The class name is kept as-is: it is passed to Metrics.counter below.
  class DoFn(beam.DoFn):
    def __init__(self):
      self.counter = Metrics.counter(self.__class__, counter_name)
      logging.info('counter: %s' % self.counter.metric_name)

    def process(self, kv, state=beam.DoFn.StateParam(state_spec)):
      # Trigger materialization
      list(state.read())
      state.add(1)
      self.counter.inc()

  options = self.create_options()
  option_overrides = options._all_options
  # Test only supports parallelism of 1
  option_overrides['parallelism'] = 1
  # Create multiple bundles to test cache metrics
  option_overrides['max_bundle_size'] = 10
  option_overrides['max_bundle_time_millis'] = 95130590130

  experiments = options.view_as(DebugOptions).experiments or []
  experiments.append('state_cache_size=123')
  options.view_as(DebugOptions).experiments = experiments

  with Pipeline(self.get_runner(), options) as p:
    # pylint: disable=expression-not-assigned
    (
        p
        | "create" >> beam.Create(list(range(0, 110)))
        | "mapper" >> beam.Map(lambda x: (x % 10, 'val'))
        | "stateful" >> beam.ParDo(DoFn()))

  if streaming:
    cache_lines = [
        # Gauges for the last finished bundle
        'stateful.beam.metric:statecache:capacity: 123',
        # These are off by 10 because the first bundle contains all the keys
        # once. Caching is only initialized after the first bundle. Caching
        # depends on the cache token which is lazily initialized by the
        # Runner's StateRequestHandlers.
        'stateful.beam.metric:statecache:size: 10',
        'stateful.beam.metric:statecache:get: 10',
        'stateful.beam.metric:statecache:miss: 0',
        'stateful.beam.metric:statecache:hit: 10',
        'stateful.beam.metric:statecache:put: 0',
        'stateful.beam.metric:statecache:extend: 10',
        'stateful.beam.metric:statecache:evict: 0',
        # Counters
        # (total of get/hit will be off by 10 due to the caching
        # only getting initialized after the first bundle.
        # Caching depends on the cache token which is lazily
        # initialized by the Runner's StateRequestHandlers).
        'stateful.beam.metric:statecache:get_total: 100',
        'stateful.beam.metric:statecache:miss_total: 10',
        'stateful.beam.metric:statecache:hit_total: 90',
        'stateful.beam.metric:statecache:put_total: 10',
        'stateful.beam.metric:statecache:extend_total: 100',
        'stateful.beam.metric:statecache:evict_total: 0',
    ]
  else:
    # Batch has a different processing model. All values for
    # a key are processed at once.
    cache_lines = [
        # Gauges
        'stateful).beam.metric:statecache:capacity: 123',
        # For the first key, the cache token will not be set yet.
        # It's lazily initialized after first access in StateRequestHandlers
        'stateful).beam.metric:statecache:size: 9',
        # We have 11 here because there are 110 / 10 elements per key
        'stateful).beam.metric:statecache:get: 11',
        'stateful).beam.metric:statecache:miss: 1',
        'stateful).beam.metric:statecache:hit: 10',
        # State is flushed back once per key
        'stateful).beam.metric:statecache:put: 1',
        'stateful).beam.metric:statecache:extend: 1',
        'stateful).beam.metric:statecache:evict: 0',
        # Counters
        'stateful).beam.metric:statecache:get_total: 99',
        'stateful).beam.metric:statecache:miss_total: 9',
        'stateful).beam.metric:statecache:hit_total: 90',
        'stateful).beam.metric:statecache:put_total: 9',
        'stateful).beam.metric:statecache:extend_total: 9',
        'stateful).beam.metric:statecache:evict_total: 0',
    ]
  lines_expected = {'counter: 110'}
  lines_expected.update(cache_lines)

  # Collect every expected metric string found as a substring of some line.
  lines_actual = set()
  with open(self.test_metrics_path, 'r') as f:
    for line in f:
      lines_actual.update(
          metric_str for metric_str in lines_expected if metric_str in line)
  self.assertSetEqual(lines_actual, lines_expected)
def testMultiClassMetrics(self, metric_name, expected_value):
  """Checks a top-k multi-class TF metric computed over four examples.

  Args:
    metric_name: Metric name of the form '<name>@<k>'; the integer after
      '@' is used as the top_k of the expected metric key.
    expected_value: Expected metric value, compared to 5 decimal places.
  """
  computations = tf_metric_wrapper.tf_metric_computations(
      [self._tf_metric_by_name(metric_name)], config.EvalConfig())
  histogram = computations[0]
  matrix = computations[1]
  metric = computations[2]

  def make_example(label, predictions, weight):
    return {
        'labels': np.array([label]),
        'predictions': np.array(predictions),
        'example_weights': np.array([weight]),
    }

  examples = [
      make_example(2, [0.1, 0.2, 0.1, 0.25, 0.35], 0.5),
      make_example(1, [0.2, 0.3, 0.05, 0.15, 0.3], 0.7),
      make_example(3, [0.01, 0.2, 0.09, 0.5, 0.2], 0.9),
      make_example(4, [0.3, 0.2, 0.05, 0.4, 0.05], 0.3),
  ]

  def check_result(got):
    try:
      self.assertLen(got, 1)
      got_slice_key, got_metrics = got[0]
      self.assertEqual(got_slice_key, ())
      top_k = int(metric_name.split('@')[1])
      expected_key = metric_types.MetricKey(
          name=metric_name, sub_key=metric_types.SubKey(top_k=top_k))
      self.assertDictElementsAlmostEqual(
          got_metrics, {expected_key: expected_value}, places=5)

    except AssertionError as err:
      raise util.BeamAssertException(err)

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create(examples)
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
        | 'ComputeConfusionMatrix' >> beam.Map(
            lambda kv: (kv[0], matrix.result(kv[1])))  # pyformat: disable
        | 'ComputeMetric' >> beam.Map(
            lambda kv: (kv[0], metric.result(kv[1]))))  # pyformat: disable
    # pylint: enable=no-value-for-parameter

    util.assert_that(result, check_result, label='result')
def testWithDefaultMetricsProvidedByModel(self):
  """Checks that metrics compiled into a saved Keras model are computed.

  A pass-through Keras model compiled with a binary cross-entropy loss and
  an 'mse' metric is saved and loaded through a ModelLoader. The test then
  asserts the explicitly requested AUC as well as the 'mse' and
  'binary_crossentropy' values over four examples.
  """
  export_dir = os.path.join(self._getTempDir(), 'export_dir')
  dummy_layer = tf.keras.layers.Input(shape=(1,))
  model = tf.keras.models.Model([dummy_layer], [dummy_layer])
  model.compile(
      loss=tf.keras.losses.BinaryCrossentropy(),
      metrics=[tf.keras.metrics.MeanSquaredError(name='mse')])
  model.save(export_dir, save_format='tf')
  model_loader = types.ModelLoader(
      tags=[tf.saved_model.SERVING],
      construct_fn=model_util.model_construct_fn(
          eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING]))

  computations = tf_metric_wrapper.tf_metric_computations(
      [tf.keras.metrics.AUC(name='auc')],
      config.EvalConfig(),
      model_loader=model_loader)

  confusion_histogram = computations[0]
  confusion_matrix = computations[1].result
  confusion_metrics = computations[2].result
  non_confusion_metrics = computations[3]

  def make_example(label, prediction):
    return {
        'labels': np.array([label]),
        'predictions': np.array([prediction]),
        'example_weights': np.array([1.0]),
    }

  examples = [
      make_example(0.0, 0.0),
      make_example(0.0, 0.5),
      make_example(1.0, 0.3),
      make_example(1.0, 0.9),
  ]

  def check_confusion_result(got):
    try:
      self.assertLen(got, 1)
      got_slice_key, got_metrics = got[0]
      self.assertEqual(got_slice_key, ())
      auc_key = metric_types.MetricKey(name='auc')
      self.assertDictElementsAlmostEqual(
          got_metrics, {auc_key: 0.75}, places=5)

    except AssertionError as err:
      raise util.BeamAssertException(err)

  def check_non_confusion_result(got):
    try:
      self.assertLen(got, 1)
      got_slice_key, got_metrics = got[0]
      self.assertEqual(got_slice_key, ())
      mse_key = metric_types.MetricKey(name='mse')
      binary_crossentropy_key = metric_types.MetricKey(
          name='binary_crossentropy')
      expected_metrics = {
          mse_key: 0.1875,
          binary_crossentropy_key: 0.0
      }
      self.assertDictElementsAlmostEqual(
          got_metrics, expected_metrics, places=5)

    except AssertionError as err:
      raise util.BeamAssertException(err)

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    sliced_examples = (
        pipeline
        | 'Create' >> beam.Create(examples)
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x)))

    confusion_result = (
        sliced_examples
        | 'ComputeHistogram' >> beam.CombinePerKey(
            confusion_histogram.combiner)
        | 'ComputeConfusionMatrix' >> beam.Map(
            lambda kv: (kv[0], confusion_matrix(kv[1])))  # pyformat: disable
        | 'ComputeMetric' >> beam.Map(
            lambda kv: (kv[0], confusion_metrics(kv[1]))))  # pyformat: disable

    non_confusion_result = (
        sliced_examples
        | 'Combine' >> beam.CombinePerKey(non_confusion_metrics.combiner))
    # pylint: enable=no-value-for-parameter

    util.assert_that(
        confusion_result, check_confusion_result, label='confusion')
    util.assert_that(
        non_confusion_result,
        check_non_confusion_result,
        label='non_confusion')
def shuffle(pcoll):  # pylint: disable=invalid-name
  """Reorders the elements of pcoll by grouping them under random keys.

  Each element is paired with a `random.random()` key, the pairs are
  grouped by that key, and the keys are dropped again so only the original
  elements are emitted.

  Args:
    pcoll: The PCollection whose elements should be shuffled.

  Returns:
    A PCollection holding the same elements as pcoll.
  """
  return (pcoll
          | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
          | 'GroupByRandom' >> beam.GroupByKey()
          # Index the (key, values) pair instead of unpacking it in the
          # lambda signature: tuple parameters are a SyntaxError on Python 3.
          | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
def preprocess(pipeline, args):
  """Builds the Beam graph that transforms raw data into TFRecord examples.

  Reads the raw metadata, schema and features files from
  args.analyze_output_dir, reads rows from CSV files or a BigQuery table,
  prepares image columns, optionally shuffles, applies the saved transform
  function and writes gzipped TFRecord files under args.output_dir.

  Args:
    pipeline: The Beam pipeline to attach the transforms to.
    args: Parsed arguments; this function reads analyze_output_dir, target,
      csv_file_pattern, bigquery_table, shuffle, output_dir and
      output_filename_prefix.
  """
  analysis_dir = args.analyze_output_dir

  def load_json(file_name):
    raw = file_io.read_file_to_string(os.path.join(analysis_dir, file_name))
    return json.loads(raw.decode())

  input_metadata = metadata_io.read_metadata(
      os.path.join(analysis_dir, RAW_METADATA_DIR))
  schema = load_json(SCHEMA_FILE)
  features = load_json(FEATURES_FILE)

  column_names = [col['name'] for col in schema]

  exclude_outputs = None
  if not args.target:
    # Without a target column in the data, drop the first feature that uses
    # the target transform from the columns, the outputs and the metadata.
    target_name = next(
        (name for name, transform in six.iteritems(features)
         if transform['transform'] == TARGET_TRANSFORM),
        None)
    if target_name is not None:
      column_names.remove(target_name)
      exclude_outputs = [target_name]
      del input_metadata.schema.column_schemas[target_name]

  if args.csv_file_pattern:
    coder = coders.CsvCoder(column_names, input_metadata.schema,
                            delimiter=',')
    raw_data = (
        pipeline
        | 'ReadCsvData' >> beam.io.ReadFromText(args.csv_file_pattern)
        | 'ParseCsvData' >> beam.Map(coder.decode))
  else:
    query = 'SELECT {columns} FROM `{table}`'.format(
        columns=', '.join(column_names), table=args.bigquery_table)
    bigquery_source = beam.io.BigQuerySource(
        query=query, use_standard_sql=True)
    raw_data = (
        pipeline
        | 'ReadBiqQueryData' >> beam.io.Read(bigquery_source))

  # Note that prepare_image_transforms does not make embeddings, it just reads
  # the image files and converts them to byte strings. tft.TransformDataset()
  # will apply the saved model that makes the image embeddings.
  image_columns = image_transform_columns(features)
  raw_data = (
      raw_data
      | 'PreprocessTransferredLearningTransformations'
      >> beam.Map(prepare_image_transforms, image_columns))

  if args.shuffle:
    raw_data = raw_data | 'ShuffleData' >> shuffle()

  transform_fn = (
      pipeline
      | 'ReadTransformFn' >> tft_beam_io.ReadTransformFn(analysis_dir))

  (transformed_data, transform_metadata) = (
      ((raw_data, input_metadata), transform_fn)
      | 'ApplyTensorflowPreprocessingGraph'
      >> tft.TransformDataset(exclude_outputs))

  tfexample_coder = coders.ExampleProtoCoder(transform_metadata.schema)
  output_prefix = os.path.join(args.output_dir, args.output_filename_prefix)
  _ = (transformed_data
       | 'SerializeExamples' >> beam.Map(tfexample_coder.encode)
       | 'WriteExamples' >> beam.io.WriteToTFRecord(
           output_prefix, file_name_suffix='.tfrecord.gz'))
def expand(self, dataset):
  """Detects anomalies per example, then emits anomaly-reason slice keys.

  Applies _detect_anomalies_in_example to every element (passing
  self.options) and feeds the results to _GenerateAnomalyReasonSliceKeys.
  """
  detect_anomalies = beam.Map(
      _detect_anomalies_in_example, options=self.options)
  generate_keys = beam.ParDo(_GenerateAnomalyReasonSliceKeys())
  anomalies = dataset | 'DetectAnomaliesInExamples' >> detect_anomalies
  return anomalies | 'GenerateAnomalyReasonKeys' >> generate_keys
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Partitions the input examples into the splits defined by a split step.

  The split step class is loaded from exec_properties and instantiated
  with the parsed schema and statistics. Each input split is read as
  TFRecords, partitioned with the step's partition function, and every
  resulting split except SKIP is serialized and written with WriteSplit.

  Args:
    input_dict: Input artifacts; reads constants.STATISTICS and
      constants.INPUT_EXAMPLES (the schema is parsed via parse_schema).
    output_dict: Output artifacts; the single constants.OUTPUT_EXAMPLES
      artifact receives the encoded split names and the written data.
    exec_properties: Execution properties; reads StepKeys.SOURCE and
      StepKeys.ARGS. The ARGS dict is updated in place with the schema and
      statistics.
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  schema = parse_schema(input_dict=input_dict)
  statistics = parse_statistics(
      split_name=DATA_SPLIT_NAME,
      statistics=input_dict[constants.STATISTICS])

  source = exec_properties[StepKeys.SOURCE]
  step_kwargs = exec_properties[StepKeys.ARGS]

  # pass the schema and stats straight to the Step
  step_kwargs[constants.SCHEMA] = schema
  step_kwargs[constants.STATISTICS] = statistics

  step_class = source_utils.load_source_path_class(source)
  split_step: BaseSplit = step_class(**step_kwargs)

  # infer the names of the splits from the config
  split_names = split_step.get_split_names()

  # Get output split path
  examples_artifact = artifact_utils.get_single_instance(
      output_dict[constants.OUTPUT_EXAMPLES])
  # SKIP is a placeholder, not a real output split, so it is never recorded.
  if SKIP in split_names:
    recorded_names = [name for name in split_names if name != SKIP]
  else:
    recorded_names = split_names
  examples_artifact.split_names = artifact_utils.encode_split_names(
      recorded_names)

  split_uris = [
      (split, os.path.join(artifact.uri, split))
      for artifact in input_dict[constants.INPUT_EXAMPLES]
      for split in artifact_utils.decode_split_names(artifact.split_names)
  ]

  with self._make_beam_pipeline() as p:
    # The outer loop will for now only run once
    for split, uri in split_uris:
      input_uri = io_utils.all_files_pattern(uri)

      new_splits = (
          p
          | 'ReadData.' + split >> beam.io.ReadFromTFRecord(
              file_pattern=input_uri)
          | beam.Map(tf.train.Example.FromString)
          | 'Split' >> beam.Partition(
              split_step.partition_fn()[0],
              split_step.get_num_splits(),
              **split_step.partition_fn()[1]))

      for split_name, new_split in zip(split_names, list(new_splits)):
        if split_name == SKIP:
          continue
        output_uri = get_split_uri(
            output_dict[constants.OUTPUT_EXAMPLES], split_name)
        # WriteSplit function writes to TFRecord again
        (new_split
         | 'Serialize.' + split_name >> beam.Map(
             lambda x: x.SerializeToString())
         | 'WriteSplit_' + split_name >> WriteSplit(output_uri))
def expand(self, pcoll):
  """Formats every element x of pcoll as the string 'Simple(x)'."""
  wrap_in_simple = beam.Map(lambda x: 'Simple(%s)' % x)
  return pcoll | 'TestLabel' >> wrap_in_simple
def expand(self, pcoll):
  """Adds self._payload to every element of pcoll using `+`."""
  # The payload is handed to the lambda as its second positional argument.
  add_payload = beam.Map(lambda x, s: x + s, self._payload)
  return pcoll | add_payload
def test_wordcount(self):
  """Runs a batch word count through the InteractiveRunner.

  Checks the counts returned by the pipeline result, the DataFrame
  returned by ib.collect with window info, and the reified WindowedValues
  returned by result.get with window info.
  """
  class WordExtractingDoFn(beam.DoFn):
    def process(self, element):
      # One output element per whitespace-separated word.
      return element.strip().split()

  p = beam.Pipeline(
      runner=interactive_runner.InteractiveRunner(
          direct_runner.DirectRunner()))

  # Count the occurrences of each word.
  counts = (
      p
      | beam.Create(['to be or not to be that is the question'])
      | 'split' >> beam.ParDo(WordExtractingDoFn())
      | 'pair_with_one' >> beam.Map(lambda word: (word, 1))
      | 'group' >> beam.GroupByKey()
      | 'count' >> beam.Map(lambda kv: (kv[0], sum(kv[1]))))

  # Watch the local scope for Interactive Beam so that counts will be cached.
  ib.watch(locals())

  result = p.run()
  result.wait_until_finish()

  actual = list(result.get(counts))
  expected_counts = {
      ('or', 1),
      ('that', 1),
      ('be', 2),
      ('is', 1),
      ('question', 1),
      ('to', 2),
      ('the', 1),
      ('not', 1),
  }
  self.assertSetEqual(set(actual), expected_counts)

  # Truncate the precision to millis because the window coder uses millis
  # as units then gets upcast to micros.
  end_of_window = (GlobalWindow().max_timestamp().micros // 1000) * 1000
  df_counts = ib.collect(counts, include_window_info=True)
  expected_columns = {
      0: [word for word, _ in actual],
      1: [count for _, count in actual],
      'event_time': [end_of_window] * len(actual),
      'windows': [[GlobalWindow()] for _ in actual],
      'pane_info': [
          PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0) for _ in actual
      ],
  }
  df_expected = pd.DataFrame(
      expected_columns,
      columns=[0, 1, 'event_time', 'windows', 'pane_info'])

  pd.testing.assert_frame_equal(df_expected, df_counts)

  actual_reified = result.get(counts, include_window_info=True)
  expected_reified = []
  for element in actual:
    expected_reified.append(
        WindowedValue(
            element,
            Timestamp(micros=end_of_window), [GlobalWindow()],
            PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)))
  self.assertEqual(actual_reified, expected_reified)
def test_instrument_example_unbounded_pipeline_to_read_cache_not_cached(self):
  """Tests that the instrumenter works when the PCollection is not cached.

  Builds a streaming pipeline that reads from Pub/Sub, watches its local
  PCollections without writing any cache for them, and instruments a copy
  of the pipeline. Two things are then asserted: the TestStream found in
  the instrumented pipeline outputs exactly the cache key of `source_1`,
  and the instrumented proto equals a hand-built expected pipeline.
  """
  # Create a new interactive environment to make the test idempotent.
  ie.new_env(cache_manager=streaming_cache.StreamingCache(cache_dir=None))

  # Create the pipeline that will be instrumented.
  from apache_beam.options.pipeline_options import StandardOptions
  options = StandardOptions(streaming=True)
  p_original = beam.Pipeline(interactive_runner.InteractiveRunner(), options)
  source_1 = p_original | 'source1' >> beam.io.ReadFromPubSub(
      subscription='projects/fake-project/subscriptions/fake_sub')
  # pylint: disable=possibly-unused-variable
  pcoll_1 = source_1 | 'square1' >> beam.Map(lambda x: x * x)

  # Watch but do not cache the PCollections.
  # NOTE(review): the 'source_1' name passed to cache_key_of below matches
  # the local variable name captured here; renaming the local presumably
  # requires updating that string too -- confirm.
  ib.watch(locals())

  # Instrument the original pipeline to create the pipeline the user will see.
  # A copy is made through the runner API proto and the copy is instrumented.
  p_copy = beam.Pipeline.from_runner_api(
      p_original.to_runner_api(),
      runner=interactive_runner.InteractiveRunner(),
      options=options)
  instrumenter = instr.build_pipeline_instrument(p_copy)
  actual_pipeline = beam.Pipeline.from_runner_api(
      proto=instrumenter.instrumented_pipeline_proto(),
      runner=interactive_runner.InteractiveRunner(),
      options=options)

  # Now, build the expected pipeline which replaces the unbounded source with
  # a TestStream.
  source_1_cache_key = self.cache_key_of('source_1', source_1)
  p_expected = beam.Pipeline()
  test_stream = (
      p_expected
      | TestStream(output_tags=[self.cache_key_of('source_1', source_1)]))
  # The expected pipeline ends in an identity 'reify' step followed by a
  # cache write.
  # pylint: disable=expression-not-assigned
  (
      test_stream
      | 'square1' >> beam.Map(lambda x: x * x)
      | 'reify' >> beam.Map(lambda _: _)
      | cache.WriteCache(ie.current_env().cache_manager(), 'unused'))

  # Test that the TestStream is outputting to the correct PCollection.
  class TestStreamVisitor(PipelineVisitor):
    """Records the output tags of the TestStream transform it visits."""
    def __init__(self):
      self.output_tags = set()

    def enter_composite_transform(self, transform_node):
      # Composite nodes are inspected the same way as leaf transforms.
      self.visit_transform(transform_node)

    def visit_transform(self, transform_node):
      transform = transform_node.transform
      if isinstance(transform, TestStream):
        self.output_tags = transform.output_tags

  v = TestStreamVisitor()
  actual_pipeline.visit(v)
  expected_output_tags = set([source_1_cache_key])
  actual_output_tags = v.output_tags
  self.assertSetEqual(expected_output_tags, actual_output_tags)

  # Test that the pipeline is as expected.
  assert_pipeline_proto_equal(
      self,
      p_expected.to_runner_api(use_fake_coders=True),
      instrumenter.instrumented_pipeline_proto())
def run_pipeline(mae_input_pattern, mae_golden_dir, results_dir,
                 mae_input_query, mae_golden_table,
                 write_per_note_stats_to_gcs, results_table,
                 per_note_results_table, debug_output_table, types_to_ignore,
                 pipeline_args):
  """Evaluate the input files against the goldens.

  Findings come either from files matching mae_input_pattern (compared
  with mae_golden_dir) or from mae_input_query joined with
  mae_golden_table. Per-note and aggregate results are written to whichever
  of the BigQuery tables and results_dir are set.

  Returns:
    A list with one error message when the arguments are inconsistent,
    otherwise an empty list once the pipeline has finished.
  """
  pattern_unset = mae_input_pattern is None
  golden_dir_unset = mae_golden_dir is None
  query_unset = mae_input_query is None
  golden_table_unset = mae_golden_table is None
  # Valid only when exactly one (input, golden) pair is fully specified.
  arguments_consistent = (
      pattern_unset != query_unset and
      golden_dir_unset != golden_table_unset and
      query_unset == golden_table_unset and
      pattern_unset == golden_dir_unset)
  if not arguments_consistent:
    return [
        'Must set exactly one of: '
        '(--mae_input_pattern AND --mae_golden_dir) '
        'OR (--mae_input_query AND --mae_golden_table).'
    ]
  if write_per_note_stats_to_gcs and not results_dir:
    return [
        'Must set --results_dir when --write_per_note_stats_to_gcs is set.'
    ]

  def append_to_table(table, schema):
    """Returns a write transform that appends rows to `table`."""
    return beam.io.Write(
        beam.io.BigQuerySink(
            table,
            schema=schema,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

  logging.info('Starting evaluation.')
  p = beam.Pipeline(options=PipelineOptions(pipeline_args))

  if mae_input_pattern:
    storage_client = storage.Client()
    pattern_dir = posixpath.dirname(mae_input_pattern)
    # Ignore subdirectories: keep only files directly in the pattern's dir.
    filenames = [
        f for f in gcsutil.find_files(mae_input_pattern, storage_client)
        if posixpath.dirname(f.string()) == pattern_dir
    ]

  per_note_results = None
  if mae_input_query and mae_golden_table:
    query_template = (
        'SELECT findings.record_id, findings.xml, golden.xml '
        'FROM ({}) AS findings '
        'LEFT JOIN [{}] AS golden '
        'ON findings.record_id=golden.record_id')
    query = query_template.format(mae_input_query, mae_golden_table)
    per_note_results = (
        p
        | beam.io.Read(beam.io.BigQuerySource(query=query))
        | beam.Map(compare_bq_row, types_to_ignore))
  else:
    per_note_results = (
        p
        | beam.Create(filenames)
        | beam.Map(compare, mae_golden_dir, types_to_ignore))

  # One timestamp is shared by every row written during this run.
  now = str(_get_utcnow())
  if debug_output_table:
    debug_schema = (
        'record_id:STRING,classification:STRING,info_type:STRING,'
        'text:STRING,context:STRING,start:INTEGER,end:INTEGER,'
        'timestamp:TIMESTAMP')
    _ = (
        per_note_results
        | beam.FlatMap(format_debug_info, now)
        | 'write_debug_info' >> append_to_table(
            debug_output_table, debug_schema))
  if per_note_results_table:
    _ = (
        per_note_results
        | beam.Map(format_individual_result_for_bq, now)
        | 'write_per_note' >> append_to_table(
            per_note_results_table, 'record_id:STRING,' + BASE_SCHEMA))

  aggregate_results = (
      per_note_results
      | beam.CombineGlobally(CombineResultsFn()))
  if results_dir:
    _ = (
        aggregate_results
        | beam.Map(write_aggregate_results_to_gcs, results_dir))
  if results_table:
    _ = (
        aggregate_results
        | beam.FlatMap(format_aggregate_results_for_bq, now)
        | 'write_aggregate' >> append_to_table(
            results_table, 'info_type:STRING,' + BASE_SCHEMA))

  if write_per_note_stats_to_gcs:
    _ = (
        per_note_results
        | beam.Map(get_binary_token_result)
        | beam.io.WriteToText(
            posixpath.join(results_dir, 'per-note-results')))

  result = p.run().wait_until_finish()

  logging.info('Eval result: %s', result)
  return []
def test_instrument_example_unbounded_pipeline_to_multiple_read_cache(self):
    """Tests that the instrumenter works for multiple unbounded sources."""
    # Start from a fresh interactive environment so the test is idempotent.
    ie.new_env(cache_manager=streaming_cache.StreamingCache(cache_dir=None))

    # Build the pipeline that will be instrumented. The local variable names
    # are significant: they are captured by ib.watch(locals()) and are used
    # below to derive the cache keys.
    p_original = beam.Pipeline(interactive_runner.InteractiveRunner())
    source_1 = p_original | 'source1' >> beam.io.ReadFromPubSub(
        subscription='projects/fake-project/subscriptions/fake_sub')
    source_2 = p_original | 'source2' >> beam.io.ReadFromPubSub(
        subscription='projects/fake-project/subscriptions/fake_sub')
    # pylint: disable=possibly-unused-variable
    pcoll_1 = source_1 | 'square1' >> beam.Map(lambda x: x * x)
    # pylint: disable=possibly-unused-variable
    pcoll_2 = source_2 | 'square2' >> beam.Map(lambda x: x * x)

    # Pretend that every watched PCollection has already been cached.
    ib.watch(locals())
    for name, pcoll in locals().items():
        if isinstance(pcoll, beam.pvalue.PCollection):
            self._mock_write_cache([b''], self.cache_key_of(name, pcoll))

    # Instrument the original pipeline; this is the pipeline the user sees.
    instrumenter = instr.build_pipeline_instrument(p_original)
    actual_pipeline = beam.Pipeline.from_runner_api(
        proto=instrumenter.instrumented_pipeline_proto(),
        runner=interactive_runner.InteractiveRunner(),
        options=None)

    # Build the expected pipeline, in which a single TestStream stands in for
    # both unbounded sources.
    source_1_cache_key = self.cache_key_of('source_1', source_1)
    source_2_cache_key = self.cache_key_of('source_2', source_2)
    p_expected = beam.Pipeline()
    test_stream = p_expected | TestStream(
        output_tags=[source_1_cache_key, source_2_cache_key])
    # pylint: disable=expression-not-assigned
    test_stream[source_1_cache_key] | 'square1' >> beam.Map(lambda x: x * x)
    # pylint: disable=expression-not-assigned
    test_stream[source_2_cache_key] | 'square2' >> beam.Map(lambda x: x * x)

    class TestStreamVisitor(PipelineVisitor):
        """Records the output tags of the TestStream found in a pipeline."""

        def __init__(self):
            self.output_tags = set()

        def enter_composite_transform(self, transform_node):
            self.visit_transform(transform_node)

        def visit_transform(self, transform_node):
            candidate = transform_node.transform
            if isinstance(candidate, TestStream):
                self.output_tags = candidate.output_tags

    # The TestStream must output to exactly the two source PCollections.
    visitor = TestStreamVisitor()
    actual_pipeline.visit(visitor)
    self.assertSetEqual(
        {source_1_cache_key, source_2_cache_key}, visitor.output_tags)

    # The instrumented pipeline must match the expected one.
    assert_pipeline_proto_equal(
        self,
        p_expected.to_runner_api(),
        instrumenter.instrumented_pipeline_proto())
def test_bad_main_input(self):
    """A main input that violates the declared input type hint is rejected."""

    @typehints.with_input_types(str, int)
    def repeat(s, times):
        return s * times

    # The elements are ints, but `repeat` declares its main input as str.
    main_input = [1, 2, 3]
    with self.assertRaises(typehints.TypeCheckError):
        _ = main_input | beam.Map(repeat, 3)
def testBatching(self):
    """Combines five examples with batch_size=2 and checks both metrics."""
    computation = tf_metric_wrapper.tf_metric_computations(
        [_CustomMetric(), tf.keras.metrics.MeanSquaredError(name='mse')],
        config.EvalConfig(),
        batch_size=2)[0]

    def make_example(label, prediction, weight):
        # Every feature is wrapped in a single-element list.
        return {
            'labels': [label],
            'predictions': [prediction],
            'example_weights': [weight],
        }

    examples = [
        make_example(0.0, 0.0, 1.0),
        make_example(0.0, 0.5, 1.0),
        make_example(1.0, 0.3, 1.0),
        make_example(1.0, 0.9, 1.0),
        make_example(1.0, 0.5, 0.0),
    ]

    with beam.Pipeline() as pipeline:
        # pylint: disable=no-value-for-parameter
        result = (
            pipeline
            | 'Create' >> beam.Create(examples)
            | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
            | 'AddSlice' >> beam.Map(lambda x: ((), x))
            | 'Combine' >> beam.CombinePerKey(computation.combiner))

        # pylint: enable=no-value-for-parameter

        def check_result(got):
            try:
                self.assertEqual(1, len(got), 'got: %s' % got)
                got_slice_key, got_metrics = got[0]
                self.assertEqual(got_slice_key, ())

                expected_metrics = {
                    metric_types.MetricKey(name='custom'):
                        (0.0 + 0.5 + 0.3 + 0.9 + 0.0) /
                        (1.0 + 1.0 + 1.0 + 1.0 + 0.0),
                    metric_types.MetricKey(name='mse'): 0.1875,
                }
                self.assertDictElementsAlmostEqual(
                    got_metrics, expected_metrics)
            except AssertionError as err:
                raise util.BeamAssertException(err)

        util.assert_that(result, check_result, label='result')
def test_multiple_outputs_with_watermark_advancement(self):
    """Tests that the TestStream can independently control output watermarks."""
    # The watermark of 'numbers' is deliberately moved to 20 before the
    # watermark of 'letters' is moved to 5, to show that watermarks advance per
    # PCollection.
    #
    # Two PCollections are produced, (a, b, c) and (1, 2, 3), emitted at
    # different times so that they land in different windows. The windows are
    # what the test inspects: if the watermark does not advance at all the
    # windows will be [-inf, -inf); if the watermarks do not advance
    # independently both PCollections end up in [15, 30).
    letters_elements = [
        TimestampedValue(value, timestamp)
        for value, timestamp in (('a', 6), ('b', 7), ('c', 8))
    ]
    numbers_elements = [
        TimestampedValue(value, timestamp)
        for value, timestamp in (('1', 21), ('2', 22), ('3', 23))
    ]

    test_stream = (
        TestStream()
        .advance_watermark_to(0, tag='letters')
        .advance_watermark_to(0, tag='numbers')
        .advance_watermark_to(20, tag='numbers')
        .advance_watermark_to(5, tag='letters')
        .add_elements(letters_elements, tag='letters')
        .advance_watermark_to(10, tag='letters')
        .add_elements(numbers_elements, tag='numbers')
        .advance_watermark_to(30, tag='numbers'))

    options = StandardOptions(streaming=True)
    p = TestPipeline(is_integration_test=True, options=options)

    main = p | test_stream

    def group_in_fixed_windows(pcoll, noun):
        # AfterWatermark with an early firing shows both that the watermark
        # advances properly and that elements are emitted in the right window.
        return (
            pcoll
            | f'{noun} windows' >> beam.WindowInto(
                FixedWindows(15),
                trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | f'{noun} with key' >> beam.Map(lambda x: ('k', x))
            | f'{noun} gbk' >> beam.GroupByKey())

    letters = group_in_fixed_windows(main['letters'], 'letter')
    numbers = group_in_fixed_windows(main['numbers'], 'number')

    # The letters were emitted while the watermark was at 5, so they belong to
    # the [0, 15) window. The early trigger makes the TestStream also emit the
    # ON_TIME pane, which is empty: the early firing already output the
    # elements before the end of the window and the DISCARDING accumulation
    # mode drops them once the trigger has fired.
    expected_letters = {
        window.IntervalWindow(0, 15): [
            ('k', ['a', 'b', 'c']),
            ('k', []),
        ],
    }

    # Likewise for the numbers, which were emitted at watermark = 20 and so
    # belong to the [15, 30) window.
    expected_numbers = {
        window.IntervalWindow(15, 30): [
            ('k', ['1', '2', '3']),
            ('k', []),
        ],
    }

    assert_that(
        letters,
        equal_to_per_window(expected_letters),
        label='letters assert per window')
    assert_that(
        numbers,
        equal_to_per_window(expected_numbers),
        label='numbers assert per window')

    p.run()
def expand(self, pcoll):
    """Keys each element by ``self.field`` and combines its 'score' per key.

    Each input element is indexed with ``self.field`` and ``'score'``; the
    per-key scores are combined with ``sum_ints``.
    """
    keyed_scores = pcoll | beam.Map(
        lambda record: (record[self.field], record['score']))
    return keyed_scores | beam.CombinePerKey(sum_ints)
def expand(self, pcoll):
    """Writes ``pcoll`` to files and loads those files into BigQuery.

    Returns:
      A dict with the load job id pairs, the destination/file pairs and the
      copy job id pairs, keyed by ``DESTINATION_JOBID_PAIRS``,
      ``DESTINATION_FILE_PAIRS`` and ``DESTINATION_COPY_JOBID_PAIRS``.
    """
    p = pcoll.pipeline
    try:
        step_name = self.label
    except AttributeError:
        # No label available: fall back to a name numbered by a class counter.
        step_name = 'BigQueryBatchFileLoads_%d' % BigQueryBatchFileLoads.COUNT
        BigQueryBatchFileLoads.COUNT += 1

    gcp_options = p.options.view_as(GoogleCloudOptions)
    temp_location = gcp_options.temp_location
    job_name = gcp_options.job_name or 'AUTOMATIC_JOB_NAME'

    empty_pc = p | "ImpulseEmptyPC" >> beam.Create([])
    singleton_pc = p | "ImpulseSingleElementPC" >> beam.Create([None])

    def job_name_prefix(label, job_type, step):
        # Singleton side input holding one generated job name for `step`.
        return pvalue.AsSingleton(
            singleton_pc
            | label >> beam.Map(
                lambda _: _generate_job_name(job_name, job_type, step)))

    load_job_name_pcv = job_name_prefix(
        "LoadJobNamePrefix",
        bigquery_tools.BigQueryJobTypes.LOAD,
        'LOAD_STEP')
    schema_mod_job_name_pcv = job_name_prefix(
        "SchemaModJobNamePrefix",
        bigquery_tools.BigQueryJobTypes.LOAD,
        'SCHEMA_MOD_STEP')
    copy_job_name_pcv = job_name_prefix(
        "CopyJobNamePrefix",
        bigquery_tools.BigQueryJobTypes.COPY,
        'COPY_STEP')

    file_prefix_pcv = pvalue.AsSingleton(
        singleton_pc
        | "GenerateFilePrefix" >> beam.Map(
            file_prefix_generator(
                self._validate, self._custom_gcs_temp_location,
                temp_location)))

    destination_data_kv_pc = (
        pcoll
        | "RewindowIntoGlobal" >> self._window_fn()
        | "AppendDestination" >> beam.ParDo(
            bigquery_tools.AppendDestinationsFn(self.destination),
            *self.table_side_inputs))

    if self.with_auto_sharding:
        write_files = self._write_files_with_auto_sharding
    else:
        write_files = self._write_files
    all_destination_file_pairs_pc = write_files(
        destination_data_kv_pc, file_prefix_pcv)

    grouped_files_pc = (
        all_destination_file_pairs_pc
        | "GroupFilesByTableDestinations" >> beam.GroupByKey())

    partitions = grouped_files_pc | beam.ParDo(
        PartitionFiles(
            self.max_partition_size,
            self.max_files_per_partition)).with_outputs(
                PartitionFiles.MULTIPLE_PARTITIONS_TAG,
                PartitionFiles.SINGLE_PARTITION_TAG)
    multiple_partitions_per_destination_pc = partitions[
        PartitionFiles.MULTIPLE_PARTITIONS_TAG]
    single_partition_per_destination_pc = partitions[
        PartitionFiles.SINGLE_PARTITION_TAG]

    if self.dynamic_destinations:
        # With dynamic destinations, single- and multi-partition elements are
        # both loaded through temporary tables to ensure atomicity.
        via_temp_tables_pc = (
            (multiple_partitions_per_destination_pc,
             single_partition_per_destination_pc)
            | "FlattenPartitions" >> beam.Flatten())
        direct_pc = empty_pc
    else:
        via_temp_tables_pc = multiple_partitions_per_destination_pc
        direct_pc = single_partition_per_destination_pc

    destination_load_job_ids_pc, destination_copy_job_ids_pc = (
        self._load_data(
            via_temp_tables_pc,
            direct_pc,
            load_job_name_pcv,
            schema_mod_job_name_pcv,
            copy_job_name_pcv,
            p,
            step_name))

    return {
        self.DESTINATION_JOBID_PAIRS: destination_load_job_ids_pc,
        self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
        self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
    }
def generate_statistics_from_bq(
    query: Text,
    output_path: Text,
    schema: schema_pb2.Schema,
    stats_options: stats_options.StatsOptions = stats_options.StatsOptions(),
    pipeline_options: Optional[PipelineOptions] = None,
) -> statistics_pb2.DatasetFeatureStatisticsList:
    """Computes data statistics from a BigQuery query result.

    The query result is turned into statistics, which are written as a single
    TFRecord file named ``_STATS_FILENAME`` under ``output_path``. The
    statistics are also validated against ``schema`` and the validation output
    is written as text to ``_ANOMALIES_FILENAME`` under ``output_path``.

    Args:
      query: The BigQuery query (standard SQL).
      output_path: Directory-like prefix the statistics and anomalies files
        are written under. The statistics file contains a single data
        statistics proto and can be read with the 'load_statistics' API. If
        you run this function on Google Cloud, you must specify an
        output_path. Specifying None may cause an error.
      schema: A Schema protobuf to use for data validation.
      stats_options: `tfdv.StatsOptions` for generating data statistics.
      pipeline_options: Optional beam pipeline options. This allows users to
        specify various beam pipeline execution parameters like pipeline
        runner (DirectRunner or DataflowRunner), cloud dataflow service
        project id, etc. See
        https://cloud.google.com/dataflow/pipelines/specifying-exec-params
        for more details.

    Returns:
      A DatasetFeatureStatisticsList proto, loaded from the statistics file
      the pipeline wrote.

    Raises:
      ValueError: If the query result contains unsupported BigQuery types.
    """
    # Look the column specs up once and reuse them for validation.
    column_specs = _get_column_specs(query)
    if not validate_bq_types(column_specs.values()):
        raise ValueError("Unsupported BigQuery data types.")

    stats_output_path = os.path.join(output_path, _STATS_FILENAME)
    anomalies_output_path = os.path.join(output_path, _ANOMALIES_FILENAME)

    # NOTE(review): stats_options.desired_batch_size is not applied to the
    # batching step below; confirm whether BatchExamplesToArrowTables should
    # receive it.

    # PyLint doesn't understand Beam PTransforms.
    # pylint: disable=no-value-for-parameter
    with beam.Pipeline(options=pipeline_options) as p:
        stats = (
            p
            | 'GetData' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True))
            | 'DecodeExamples' >> batch_util.BatchExamplesToArrowTables()
            # Honor the caller's options instead of the transform's defaults.
            | 'GenerateStatistics' >> tfdv.GenerateStatistics(stats_options))

        _ = (
            stats
            | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                file_path_prefix=stats_output_path,
                shard_name_template='',
                coder=beam.coders.ProtoCoder(
                    statistics_pb2.DatasetFeatureStatisticsList)))

        _ = (
            stats
            | 'ValidateStatistics' >> beam.Map(
                tfdv.validate_statistics, schema=schema)
            | 'WriteAnomaliesOutput' >> beam.io.textio.WriteToText(
                file_path_prefix=anomalies_output_path,
                shard_name_template='',
                append_trailing_newlines=False))

    # The `with` block has finished the pipeline, so the statistics file
    # exists; return it as the signature and docstring promise.
    return tfdv.load_statistics(stats_output_path)
def _load_data(
    self,
    partitions_using_temp_tables,
    partitions_direct_to_destination,
    load_job_name_pcv,
    schema_mod_job_name_pcv,
    copy_job_name_pcv,
    p,
    step_name):
    """Load data to BigQuery

    Data reaches BigQuery in one of two ways:

    1. Single partition:
       A single partition of files destined to a single destination is
       loaded with a single load job.
    2. Multiple partitions and/or Dynamic Destinations:
       When several partitions of files target one destination, or when
       Dynamic Destinations are used, one load job per partition/destination
       is needed. Those load jobs write to temporary tables, which are later
       copied into the actual destination table. This ensures atomicity when
       only some of the load jobs fail: if any of them fails, the copy jobs
       are not triggered.

    Returns:
      A tuple of (load job ids from both paths flattened together, copy job
      ids).
    """

    def trigger_load_jobs(temporary_tables):
        # Both load paths share every setting except `temporary_tables`.
        return beam.ParDo(
            TriggerLoadJobs(
                schema=self.schema,
                write_disposition=self.write_disposition,
                create_disposition=self.create_disposition,
                test_client=self.test_client,
                temporary_tables=temporary_tables,
                additional_bq_parameters=self.additional_bq_parameters,
                source_format=self._temp_file_format,
                step_name=step_name,
                load_job_project_id=self.load_job_project_id),
            load_job_name_pcv,
            *self.schema_side_inputs)

    def wait_for_jobs(impulse_label, wait_label, job_ids_pc):
        # A single impulse element that receives all job ids as a side input.
        return (
            p
            | impulse_label >> beam.Create([None])
            | wait_label >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                pvalue.AsList(job_ids_pc)))

    # --- Path 1: load through temporary tables. ---
    trigger_loads_outputs = (
        partitions_using_temp_tables
        | "TriggerLoadJobsWithTempTables" >> trigger_load_jobs(
            True).with_outputs(TriggerLoadJobs.TEMP_TABLES, main='main'))
    temp_tables_load_job_ids_pc = trigger_loads_outputs['main']
    temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

    finished_temp_tables_load_jobs_pc = wait_for_jobs(
        "ImpulseMonitorLoadJobs",
        "WaitForTempTableLoadJobs",
        temp_tables_load_job_ids_pc)

    schema_mod_job_ids_pc = (
        finished_temp_tables_load_jobs_pc
        | beam.ParDo(
            UpdateDestinationSchema(
                write_disposition=self.write_disposition,
                test_client=self.test_client,
                additional_bq_parameters=self.additional_bq_parameters,
                step_name=step_name,
                load_job_project_id=self.load_job_project_id),
            schema_mod_job_name_pcv))

    finished_schema_mod_jobs_pc = wait_for_jobs(
        "ImpulseMonitorSchemaModJobs",
        "WaitForSchemaModJobs",
        schema_mod_job_ids_pc)

    destination_copy_job_ids_pc = (
        finished_temp_tables_load_jobs_pc
        | beam.ParDo(
            TriggerCopyJobs(
                create_disposition=self.create_disposition,
                write_disposition=self.write_disposition,
                test_client=self.test_client,
                step_name=step_name,
                load_job_project_id=self.load_job_project_id),
            copy_job_name_pcv,
            pvalue.AsIter(finished_schema_mod_jobs_pc)))

    finished_copy_jobs_pc = wait_for_jobs(
        "ImpulseMonitorCopyJobs",
        "WaitForCopyJobs",
        destination_copy_job_ids_pc)

    # Delete the temporary tables. The finished copy jobs are taken as a side
    # input even though the FlatMap does not use their values.
    _ = (
        p
        | "RemoveTempTables/Impulse" >> beam.Create([None])
        | "RemoveTempTables/PassTables" >> beam.FlatMap(
            lambda _, unused_copy_jobs, deleting_tables: deleting_tables,
            pvalue.AsIter(finished_copy_jobs_pc),
            pvalue.AsIter(temp_tables_pc))
        | "RemoveTempTables/AddUselessValue" >> beam.Map(lambda x: (x, None))
        | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
        | "RemoveTempTables/GetTableNames" >> beam.Keys()
        | "RemoveTempTables/Delete" >> beam.ParDo(
            DeleteTablesFn(self.test_client)))

    # --- Path 2: load straight into the destination table. ---
    direct_load_job_ids_pc = (
        partitions_direct_to_destination
        | "TriggerLoadJobsWithoutTempTables" >> trigger_load_jobs(False))

    _ = wait_for_jobs(
        "ImpulseMonitorDestinationLoadJobs",
        "WaitForDestinationLoadJobs",
        direct_load_job_ids_pc)

    all_load_job_ids_pc = (
        (temp_tables_load_job_ids_pc, direct_load_job_ids_pc)
        | beam.Flatten())

    return all_load_job_ids_pc, destination_copy_job_ids_pc
###### Beam pipeline (transforms) ############
print(0000000000000000)

# Build the Beam pipeline.
p1 = beam.Pipeline(options=pipeline_options)

# No timestamp_attribute is passed to ReadFromPubSub; per the note that
# accompanied this code, the message publishing time is then used as the
# element timestamp.
attendance_count = (
    p1
    | 'read pub_sub' >> beam.io.ReadFromPubSub(
        subscription=input_subscription)
    | 'to python dict' >> beam.Map(to_python_dict)
    # Keep only events whose venue mode is 'offline'.
    | 'Filter offline events' >> beam.Filter(
        lambda event: event['venue']['mode'] == 'offline')
    | 'get venue' >> beam.Map(get_venue)
    | 'build_tuple' >> beam.Map(build_tuple)
    | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
        "totemic-polygon-279515:dataset.meetup",
        schema="geohash:string, mode:string, lat:Float, lon:float")
)

print(111111111111111)

# Run the pipeline.
result = p1.run()
def _get_page_content(self, pipeline, file_paths, dl_manager):
    """Build PCollection of un-split page content."""
    wet_file_paths = (
        pipeline
        | "create_wet_files" >> beam.Create(file_paths["wet_files"]))

    if "wet_urls" in file_paths:

        def download_url(url, downloader, pipeline):
            local_path = downloader.download(url)
            if pipeline.is_local():
                return local_path
            return downloader.ship_files_with_pipeline(local_path, pipeline)

        dl_wet_file_paths = (
            pipeline
            | "create_wet_urls" >> beam.Create(file_paths["wet_urls"])
            | beam.Map(
                download_url, downloader=dl_manager, pipeline=pipeline))
        wet_file_paths = (wet_file_paths, dl_wet_file_paths) | beam.Flatten()

    # Parse the WET files and drop pages that fail the length check.
    # Output: url, text
    page_content = (
        wet_file_paths
        | beam.FlatMap(split_wet_file)
        | beam.Filter(is_valid_length))

    # Optionally keep only RealNews domains.
    # Output: url, text
    if self.config.realnewslike:
        with open(file_paths["realnews_domains"], "r") as domains_file:
            realnews_domains = json.load(domains_file)
        page_content = page_content | beam.Filter(
            is_realnews_domain, realnews_domains)

    # Normalize the URLs and deduplicate by URL.
    # Output: url, text
    page_content = (
        page_content
        | "normalize_url" >> beam.Map(normalize_url)
        | "group_url" >> beam.GroupByKey()
        | beam.Map(dedupe_urls))

    # Optionally keep only WebText-like URLs.
    # Output: url, text
    if self.config.webtextlike:
        webtextlike_urls = (
            pipeline
            | "read_webtextlike_urls" >> beam.io.ReadFromText(
                os.path.join(
                    file_paths["openwebtext_urls_zip"],
                    _OPENWEBTEXT_URLS_FILE_PATTERN))
            | "add_dummy_page" >> beam.Map(lambda x: (x, ""))
            | "normal_webtext_url" >> beam.Map(normalize_url))
        page_content = (
            {
                "text": page_content,
                "webtextlike_urls": webtextlike_urls,
            }
            | "group_webtextlike_urls" >> beam.CoGroupByKey()
            | beam.FlatMap(filter_by_webtextlike))

    # Optionally clean pages of badwords, boilerplate text, and duplicate
    # spans of sentences.
    # Output: url, text
    if self.config.clean:
        with open(file_paths["badwords"], "r") as badwords_file:
            badwords = [line.strip() for line in badwords_file]
        page_content = page_content | "clean_pages" >> beam.FlatMap(
            get_clean_page_fn(badwords))
        page_content = remove_duplicate_text(page_content)

    # Optionally filter out non-`language` pages. This happens after cleaning
    # because cleaning may change the predominant language.
    if self.config.lang != "all":
        page_content = page_content | beam.Filter(
            is_language, language=self.config.lang)

    return page_content
def expand(self, pcoll):
    """Counts all elements of ``pcoll`` and hands the count to log_count_info.

    The input is first re-windowed into the global window so that one global
    count is computed.
    """
    globally_windowed = (
        pcoll | 'window' >> beam.WindowInto(window.GlobalWindows()))
    element_count = (
        globally_windowed | "Count" >> beam.combiners.Count.Globally())
    return element_count | "Log" >> beam.Map(log_count_info)
def test_able_to_cache_intermediate_unbounded_source_pcollection(self):
    """Tests being able to cache an intermediate source PCollection.

    In the pipeline below the source itself is never bound to a variable, so
    the watch() command does not pick it up automatically. This test checks
    that such a source is handled anyway.
    """
    # Start from a fresh interactive environment so the test is idempotent.
    ie.new_env(cache_manager=streaming_cache.StreamingCache(cache_dir=None))

    # Build the pipeline that will be instrumented.
    from apache_beam.options.pipeline_options import StandardOptions
    options = StandardOptions(streaming=True)
    p_original = beam.Pipeline(interactive_runner.InteractiveRunner(), options)

    # pylint: disable=possibly-unused-variable
    source_1 = (
        p_original
        | 'source1' >> beam.io.ReadFromPubSub(
            subscription='projects/fake-project/subscriptions/fake_sub')
        | beam.Map(lambda e: e))

    # Watch but do not cache the PCollections.
    ib.watch(locals())

    # Sources without a user reference must still be watched.
    instr.watch_sources(p_original)

    # Pick up the watched entry whose variable name contains 'synthetic'.
    intermediate_source_pcoll = None
    for watching in ie.current_env().watching():
        for var, watchable in list(watching):
            if 'synthetic' in var:
                intermediate_source_pcoll = watchable
                break

    # Instrument a copy of the original pipeline; the result is the pipeline
    # the user will see.
    p_copy = beam.Pipeline.from_runner_api(
        p_original.to_runner_api(),
        runner=interactive_runner.InteractiveRunner(),
        options=options)
    instrumenter = instr.build_pipeline_instrument(p_copy)
    actual_pipeline = beam.Pipeline.from_runner_api(
        proto=instrumenter.instrumented_pipeline_proto(),
        runner=interactive_runner.InteractiveRunner(),
        options=options)

    # Build the expected pipeline, in which a TestStream replaces the
    # unbounded source.
    synthetic_var_name = 'synthetic_var_' + str(id(intermediate_source_pcoll))
    intermediate_source_pcoll_cache_key = self.cache_key_of(
        synthetic_var_name, intermediate_source_pcoll)
    p_expected = beam.Pipeline()
    test_stream = p_expected | TestStream(
        output_tags=[intermediate_source_pcoll_cache_key])
    # pylint: disable=expression-not-assigned
    (
        test_stream
        | 'square1' >> beam.Map(lambda e: e)
        | 'reify' >> beam.Map(lambda _: _)
        | cache.WriteCache(ie.current_env().cache_manager(), 'unused'))

    class TestStreamVisitor(PipelineVisitor):
        """Records the output tags of the TestStream found in a pipeline."""

        def __init__(self):
            self.output_tags = set()

        def enter_composite_transform(self, transform_node):
            self.visit_transform(transform_node)

        def visit_transform(self, transform_node):
            candidate = transform_node.transform
            if isinstance(candidate, TestStream):
                self.output_tags = candidate.output_tags

    # The TestStream must output to the intermediate source PCollection only.
    visitor = TestStreamVisitor()
    actual_pipeline.visit(visitor)
    self.assertSetEqual(
        {intermediate_source_pcoll_cache_key}, visitor.output_tags)

    # The instrumented pipeline must match the expected one.
    assert_pipeline_proto_equal(
        self,
        p_expected.to_runner_api(use_fake_coders=True),
        instrumenter.instrumented_pipeline_proto())
def BatchedInputsToExtracts(  # pylint: disable=invalid-name
    batched_inputs: beam.pvalue.PCollection):
    """Converts Arrow RecordBatch inputs to Extracts.

    Each input element becomes a one-entry dict keyed by
    ``constants.ARROW_RECORD_BATCH_KEY``.
    """

    def to_extracts(record_batch):
        return {constants.ARROW_RECORD_BATCH_KEY: record_batch}

    return batched_inputs | 'AddArrowRecordBatchKey' >> beam.Map(to_extracts)
def run():
    """Parses the command line, configures Dataflow options and runs the job.

    The pipeline reads the public libraries.io dependencies table, counts
    entries per platform and writes the counts to
    ``<project>:dataflow_demos.shuffle_demo``.
    """
    # Command line arguments: all four flags are required.
    parser = argparse.ArgumentParser(
        description='Load from Json into BigQuery')
    required_flags = (
        ('--project', 'Specify Google Cloud project'),
        ('--region', 'Specify Google Cloud region'),
        ('--staging_location', 'Specify Cloud Storage bucket for staging'),
        ('--temp_location', 'Specify Cloud Storage bucket for temp'),
    )
    for flag, help_text in required_flags:
        parser.add_argument(flag, required=True, help=help_text)

    opts, pipeline_args = parser.parse_known_args()

    options = PipelineOptions(pipeline_args, save_main_session=True)
    gcp_options = options.view_as(GoogleCloudOptions)

    # The job name prefix depends on whether extra pipeline args were given.
    if pipeline_args:
        job_prefix = 'no-shuffle-pipeline-'
    else:
        job_prefix = 'shuffle-pipeline-'
    gcp_options.job_name = '{0}{1}'.format(job_prefix, time.time_ns())

    gcp_options.project = opts.project
    gcp_options.region = opts.region
    gcp_options.staging_location = opts.staging_location
    gcp_options.temp_location = opts.temp_location
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    table_schema = {
        "fields": [
            {"name": "platform", "type": "STRING"},
            {"name": "dep_count", "type": "INTEGER"},
        ]
    }

    input_table = 'bigquery-public-data:libraries_io.dependencies'
    output_table = f"{opts.project}:dataflow_demos.shuffle_demo"

    p = beam.Pipeline(options=options)

    bigquery_sink = beam.io.WriteToBigQuery(
        output_table,
        schema=table_schema,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    (
        p
        | 'ReadFromBQ' >> beam.io.ReadFromBigQuery(table=input_table)
        | 'ExtractPlatform' >> beam.FlatMap(extract_platform)
        | 'CountPerPlatform' >> CountPerPlatform()
        | 'ToDict' >> beam.Map(to_dict)
        | 'WriteToBQ' >> bigquery_sink)

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    p.run()