def test_basic(self):
    """Check per-model inference on both unkeyed and keyed example inputs.

    Two predict models are built and every example is run through both; the
    asserted values are the 'y' output of the second model's prediction log.
    """
    examples = [
        text_format.Parse(
            """ features { feature { key: "x" value { float_list { value: 0 }}} } """,
            tf.train.Example()),
        text_format.Parse(
            """ features { feature { key: "x" value { float_list { value: 1 }}} } """,
            tf.train.Example()),
    ]

    # Resolve every output directory first, then build the models.
    model_paths = list(map(self._get_output_data_dir, ('model1', 'model2')))
    for saved_model_dir in model_paths:
        self._build_predict_model(saved_model_dir)

    specs = []
    for saved_model_dir in model_paths:
        saved_model_spec = model_spec_pb2.SavedModelSpec(
            model_path=saved_model_dir)
        specs.append(
            model_spec_pb2.InferenceSpecType(saved_model_spec=saved_model_spec))

    def first_y(prediction_log):
        return prediction_log.predict_log.response.outputs['y'].float_val[0]

    def second_model_y(_, second_log):
        return first_y(second_log)

    def keyed_second_model_y(key, logs):
        return (key, first_y(logs[1]))

    with self._make_beam_pipeline() as pipeline:
        predictions = (
            pipeline
            | beam.Create(examples)
            | run_inference.RunInferencePerModelImpl(specs)
            | beam.MapTuple(second_model_y))
        assert_that(predictions, equal_to([0.0, 2.0]))

        keyed_examples = list(enumerate(examples))
        predictions_table = (
            pipeline
            | 'CreateTable' >> beam.Create(keyed_examples)
            | 'RunInferencePerModelTable'
            >> run_inference.RunInferencePerModelImpl(specs)
            | beam.MapTuple(keyed_second_model_y))
        assert_that(
            predictions_table,
            equal_to([(0, 0.0), (1, 2.0)]),
            label='AssertTable')
def run(
    scenes: List[str],
    output_path_prefix: str,
    vis_params: Dict[str, Any],
    beam_args: Optional[List[str]] = None,
) -> None:
    """Load multiple Landsat scenes, render them as RGB images and save them.

    Args:
        scenes: List of Landsat 8 scene IDs.
        output_path_prefix: Path prefix to save the output files.
        vis_params: Visualization parameters. The keys read here are
            "rgb_band_names", "min", "max" and "gamma".
        beam_args: Optional list of arguments for Beam pipeline options.
    """
    rgb_band_names = vis_params["rgb_band_names"]
    min_value = vis_params["min"]
    max_value = vis_params["max"]
    gamma = vis_params["gamma"]

    def pass_through(x, unused_side_input):
        # The side input is consumed only so that check_gpus is part of the
        # graph this step depends on; its value is ignored.
        return x

    def to_image(scene, rgb_pixels):
        return (
            scene,
            Image.fromarray(rgb_pixels.numpy(), mode="RGB"),
        )

    beam_options = PipelineOptions(beam_args, save_main_session=True)
    pipeline = beam.Pipeline(options=beam_options)
    (
        pipeline
        | "Create scene IDs" >> beam.Create(scenes)
        | "Check GPU availability"
        >> beam.Map(
            pass_through,
            unused_side_input=beam.pvalue.AsSingleton(
                pipeline | beam.Create([None]) | beam.Map(check_gpus)
            ),
        )
        | "Get RGB band paths" >> beam.Map(get_band_paths, rgb_band_names)
        | "Load RGB band values" >> beam.MapTuple(load_values)
        | "Preprocess pixels"
        >> beam.MapTuple(preprocess_pixels, min_value, max_value, gamma)
        | "Convert to image" >> beam.MapTuple(to_image)
        | "Save to Cloud Storage"
        >> beam.MapTuple(save_to_gcs, output_path_prefix)
    )
    pipeline.run()
def run_ex(self, pipeline, context, words_to_count, words_to_filter):
    """Record the pipeline/context and build the filtered word-count chain.

    Only words contained in ``words_to_filter`` are kept; each kept word is
    paired with 1, grouped by word and handed to ``self.count``.
    """
    self.pipeline = pipeline
    self.context = context

    def is_wanted(word):
        return word in words_to_filter

    def pair_with_one(word):
        return (word, 1)

    words = pipeline | 'LoadingWordsInput' >> beam.Create(words_to_count)
    kept = words | 'FilterWords' >> beam.Filter(is_wanted)
    ones = kept | 'MapToCount' >> beam.Map(pair_with_one)
    grouped = ones | 'GroupWords' >> beam.GroupByKey()
    return grouped | 'Count' >> beam.MapTuple(self.count)
def expand(self, pcoll):
    """Stamp elements with the current wall-clock time and batch per window.

    Elements are windowed into fixed windows of ``self.window_size``, grouped
    by key, and then only the grouped values are emitted.
    """
    def stamp_with_now(element):
        return beam.window.TimestampedValue(element, time.time())

    def values_only(_, val):
        return val

    stamped = pcoll | 'Add Timestamps' >> beam.Map(stamp_with_now)
    windowed = stamped | "Window into Fixed Intervals" >> beam.WindowInto(
        window.FixedWindows(self.window_size))
    grouped = windowed | "Groupby" >> beam.GroupByKey()
    return grouped | "Abandon Dummy Key" >> beam.MapTuple(values_only)
def test_map_tuple(self):
    """MapTuple derives its type hints from the wrapped fn's annotations."""
    # TODO(https://github.com/apache/beam/issues/19961): Also test with a fn
    # that accepts default arguments.
    def concat3(a: str, b: str, c: str) -> str:
        return a + b + c

    hints = beam.MapTuple(concat3).get_type_hints()

    expected_inputs = ((str, str, str), {})
    expected_outputs = ((str, ), {})
    self.assertEqual(hints.input_types, expected_inputs)
    self.assertEqual(hints.output_types, expected_outputs)
def examples_wordcount_minimal(renames):
    """MinimalWordCount example snippets.

    The code between the ``[START ...]`` / ``[END ...]`` markers is kept in
    this exact shape because each region is a delimited snippet; do not move
    statements across the markers.

    Args:
        renames: Passed to ``SnippetUtils.RenameFiles`` and applied to the
            pipeline via ``p.visit`` before it runs (presumably to redirect
            the sample's file paths in tests -- confirm against SnippetUtils).
    """
    import re

    import apache_beam as beam

    from apache_beam.options.pipeline_options import GoogleCloudOptions
    from apache_beam.options.pipeline_options import StandardOptions
    from apache_beam.options.pipeline_options import PipelineOptions

    # The options built in this region exist for the snippet only: `options`
    # is reassigned right after the region, before the pipeline is created.
    # [START examples_wordcount_minimal_options]
    options = PipelineOptions()
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'my-project-id'
    google_cloud_options.job_name = 'myjob'
    google_cloud_options.staging_location = 'gs://your-bucket-name-here/staging'
    google_cloud_options.temp_location = 'gs://your-bucket-name-here/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'
    # [END examples_wordcount_minimal_options]

    # Run it locally for testing.
    options = PipelineOptions()

    # [START examples_wordcount_minimal_create]
    p = beam.Pipeline(options=options)
    # [END examples_wordcount_minimal_create]

    (
        # [START examples_wordcount_minimal_read]
        p | beam.io.ReadFromText('gs://dataflow-samples/shakespeare/kinglear.txt')
        # [END examples_wordcount_minimal_read]

        # [START examples_wordcount_minimal_pardo]
        | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
        # [END examples_wordcount_minimal_pardo]

        # [START examples_wordcount_minimal_count]
        | beam.combiners.Count.PerElement()
        # [END examples_wordcount_minimal_count]

        # [START examples_wordcount_minimal_map]
        | beam.MapTuple(lambda word, count: '%s: %s' % (word, count))
        # [END examples_wordcount_minimal_map]

        # [START examples_wordcount_minimal_write]
        | beam.io.WriteToText('gs://my-bucket/counts.txt')
        # [END examples_wordcount_minimal_write]
    )

    p.visit(SnippetUtils.RenameFiles(renames))

    # [START examples_wordcount_minimal_run]
    result = p.run()
    # [END examples_wordcount_minimal_run]

    # Block until the pipeline has finished.
    result.wait_until_finish()
def expand(self, pcoll):
    """Collect all elements of each fixed window into a single iterable."""
    def with_dummy_key(elem):
        return (None, elem)

    def drop_dummy_key(_, val):
        return val

    # Assigns window info to each Pub/Sub message based on its publish
    # timestamp.
    windowed = pcoll | "Window into Fixed Intervals" >> beam.WindowInto(
        window.FixedWindows(self.window_size))
    # If the windowed elements do not fit into memory please consider using
    # `beam.util.BatchElements`.
    keyed = windowed | "Add Dummy Key" >> beam.Map(with_dummy_key)
    grouped = keyed | "Groupby" >> beam.GroupByKey()
    return grouped | "Abandon Dummy Key" >> beam.MapTuple(drop_dummy_key)
def expand(self, pcoll):
    """Sum values per key, with a cheaper path when nothing is weighted.

    ``has_any_weight`` is read from the enclosing scope.
    """
    if not has_any_weight:
        # For non-weighted case, use sum combine fn over integers to allow
        # Beam to use Cython combiner.
        first_components = pcoll | 'RemoveWeights' >> beam.MapTuple(
            lambda key, pair: (key, pair[0]))
        return first_components | beam.CombinePerKey(sum)
    return pcoll | beam.CombinePerKey(_sum_pairwise)
def expand(self, inputs):
    """Join each key's values into one comma-separated string, then swap.

    ``inputs`` must hold exactly one PCollection of (key, values) pairs; the
    result holds (joined_values, key) pairs.
    """
    (pcoll,) = inputs

    def _join_values(key, values):
        joined = ','.join(tf.compat.as_str_any(value) for value in values)
        return (key, tf.compat.as_str_any(joined))

    encoded = pcoll | 'EncodeValues' >> beam.MapTuple(_join_values)
    return encoded | 'SwapKeysAndValues' >> beam.KvSwap()
def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    """Computes exploration recommendations, stores them and reports a count.

    Every ExpSummaryModel is compared against the full set of summaries
    (passed as an iterable side input); the similarities are grouped per
    exploration ID, sorted and sliced, turned into recommendation models and
    put into the datastore.

    Returns:
        PCollection. Contains one 'SUCCESS <count>' JobRunResult when at
        least one recommendation model was created, and nothing otherwise.
    """
    exp_summary_models = (
        self.pipeline
        | 'Get all non-deleted models' >> (
            ndb_io.GetModels(exp_models.ExpSummaryModel.get_all()))
    )
    exp_summary_iter = beam.pvalue.AsIter(exp_summary_models)

    def sort_and_slice(exp_id, similarities):
        return (exp_id, self._sort_and_slice_similarities(similarities))

    def to_success_result(model_count):
        return job_run_result.JobRunResult(stdout='SUCCESS %s' % model_count)

    similarities_per_exp = (
        exp_summary_models
        | 'Compute similarity' >> beam.ParDo(
            ComputeSimilarity(), exp_summary_iter)
        | 'Group similarities per exploration ID' >> beam.GroupByKey()
    )
    exp_recommendations_models = (
        similarities_per_exp
        | 'Sort and slice similarities' >> beam.MapTuple(sort_and_slice)
        | 'Create recommendation models' >> beam.MapTuple(
            self._create_recommendation)
    )

    unused_put_result = (
        exp_recommendations_models
        | 'Put models into the datastore' >> ndb_io.PutModels()
    )

    model_count = (
        exp_recommendations_models
        | 'Count all new models' >> beam.combiners.Count.Globally()
    )
    return (
        model_count
        | 'Only create result for new models when > 0' >> (
            beam.Filter(lambda count: count > 0))
        | 'Create result for new models' >> beam.Map(to_success_result)
    )
def expand(
    self, sliced_record_batchs_and_ys: Tuple[
        beam.PCollection[types.SlicedRecordBatch],
        beam.PCollection[_SlicedYKey]]
) -> beam.PCollection[Tuple[_SlicedYKey, _ConditionalYRate]]:
    """Build conditional y rates from sliced record batches and y keys.

    Args:
        sliced_record_batchs_and_ys: A pair of PCollections: the sliced
            record batches, and the y keys used to generate placeholder
            x-y pairs.

    Returns:
        The output of `_make_conditional_y_rates` applied to the copresence
        counts after they are joined with the per-x counts.
    """
    sliced_record_batchs, y_keys = sliced_record_batchs_and_ys

    # _SlicedXYKey(slice, x_path, x, y), xy_count
    partial_copresence_counts = (
        sliced_record_batchs
        | 'ToPartialCopresenceCounts' >> beam.FlatMap(
            _to_partial_copresence_counts, self._y_path, self._x_paths,
            self._y_boundaries, self._example_weight_map,
            self._num_xy_pairs_batch_copresent))

    # Compute placeholder copresence counts.
    # partial_copresence_counts will only include x-y pairs that are present,
    # but we would also like to keep track of x-y pairs that never appear, as
    # long as x and y independently occur in the slice.

    # _SlicedXKey(slice, x_path, x), x_count
    x_counts = (
        sliced_record_batchs
        | 'ToPartialXCounts' >> beam.FlatMap(
            _to_partial_x_counts, self._x_paths, self._example_weight_map)
        | 'SumXCounts' >> beam.CombinePerKey(sum))
    # The filter is only applied when a (truthy) minimum count is configured;
    # note the comparison is strict.
    if self._min_x_count:
        x_counts = x_counts | 'FilterXCounts' >> beam.Filter(
            lambda kv: kv[1] > self._min_x_count)

    # _SlicedXYKey(slice, x_path, x, y), 0
    placeholder_copresence_counts = (
        (x_counts, y_keys)
        | 'GetPlaceholderCopresenceCounts' >> _GetPlaceholderCopresenceCounts(
            self._x_paths, self._min_x_count))

    def move_y_to_value(key, xy_count):
        # Re-key by (slice, x_path, x) so the counts can be joined with
        # x_counts, carrying y along in the value.
        return _SlicedXKey(key.slice_key, key.x_path, key.x), (key.y, xy_count)

    # _SlicedXKey(slice, x_path, x), (y, xy_count)
    copresence_counts = (
        (placeholder_copresence_counts, partial_copresence_counts)
        | 'FlattenCopresenceCounts' >> beam.Flatten()
        | 'SumCopresencePairs' >> beam.CombinePerKey(sum)
        | 'MoveYToValue' >> beam.MapTuple(move_y_to_value))

    # _SlicedYKey(slice, y), _ConditionalYRate(x_path, x, xy_count, x_count)
    # x_counts is supplied to the join as an iterable side input.
    return (copresence_counts
            | 'JoinXCounts' >> beam.ParDo(
                _LookupInnerJoinDoFn(),
                right_iterable=beam.pvalue.AsIter(x_counts))
            | 'MakeConditionalYRates' >> beam.Map(
                _make_conditional_y_rates,
                num_xy_pairs_distinct=self._num_xy_pairs_distinct))
def test_map_tuple(self):
    """ThreadMapTuple must produce the same output as beam.MapTuple."""
    def triple(a, b, y=None):
        return a, b, y

    pairs = ((1, 2), (3, 4))
    expected = list(pairs) | beam.MapTuple(triple, y=5)

    # Compare once with the default thread count and once with
    # num_threads=None.
    for extra_kwargs in ({}, {'num_threads': None}):
        actual = list(pairs) | threadmap.ThreadMapTuple(
            triple, y=5, **extra_kwargs)
        self.assertEqual(expected, actual)
def run(beam_options):
    """Download, coarsen, combine and write the subtile for every suffix.

    Args:
        beam_options: Options used to construct the Beam pipeline.
    """
    suffixes = get_suffixes()
    print(f"Processing {len(suffixes)} files")

    with beam.Pipeline(options=beam_options) as p:
        downloaded = (
            p
            | beam.Create(suffixes)
            | "Download Data" >> beam.ParDo(download_subtile))
        coarsened = downloaded | "Coarsen" >> beam.MapTuple(coarsen)
        combined = coarsened | common.CombineSubtilesByKey()
        combined | common.WriteToNetCDFs(_name)
def expand(self, pcoll):
    """Run one copy stage: plan tasks, copy chunks, de-duplicate outputs."""
    tasks = (
        pcoll
        | "Start" >> beam.FlatMap(_start_stage, self.specs_by_target)
        | "CreateTasks" >> beam.FlatMapTuple(_copy_tasks))
    # prevent undesirable fusion
    # https://stackoverflow.com/a/54131856/809705
    reshuffled = tasks | "Reshuffle" >> beam.Reshuffle()
    copied = reshuffled | "CopyChunks" >> beam.MapTuple(_copy_chunk)
    # prepare inputs for the next stage (if any)
    return copied | "Finish" >> beam.Distinct()
def test_timestamped_value(self):
    """Values timestamped 0..9 land in two fixed windows of five each."""
    def stamp(value_and_time):
        return TimestampedValue(value_and_time[0], value_and_time[1])

    def under_single_key(value):
        return ('key', value)

    def sort_group(key, values):
        return (key, sorted(values))

    expected = [('key', [0, 1, 2, 3, 4]), ('key', [5, 6, 7, 8, 9])]

    with TestPipeline() as p:
        stamped = (
            p
            | 'start' >> Create([(k, k) for k in range(10)])
            | Map(stamp))
        result = (
            stamped
            | 'w' >> WindowInto(FixedWindows(5))
            | Map(under_single_key)
            | GroupByKey()
            | beam.MapTuple(sort_group))
        assert_that(result, equal_to(expected))
def expand(self, sliced_record_batchs_and_ys: Tuple[types.SlicedRecordBatch, _SlicedYKey]): sliced_record_batchs, y_keys = sliced_record_batchs_and_ys # _SlicedXYKey(slice, x_path, x, y), xy_count partial_copresence_counts = ( sliced_record_batchs | 'ToPartialCopresenceCounts' >> beam.FlatMap( _to_partial_copresence_counts, self._y_path, self._x_paths, self._y_boundaries, self._weight_column_name, self._num_xy_pairs_batch_copresent)) # Compute placeholder copresence counts. # partial_copresence_counts will only include x-y pairs that are present, # but we would also like to keep track of x-y pairs that never appear, as # long as x and y independently occur in the slice. # _SlicedXKey(slice, x_path, x), x_count x_counts = ( sliced_record_batchs | 'ToPartialXCounts' >> beam.FlatMap( _to_partial_x_counts, self._x_paths, self._weight_column_name) | 'SumXCounts' >> beam.CombinePerKey(sum)) if self._min_x_count: x_counts = x_counts | 'FilterXCounts' >> beam.Filter( lambda kv: kv[1] > self._min_x_count) # _SlicedXYKey(slice, x_path, x, y), 0 placeholder_copresence_counts = ( (x_counts, y_keys) | 'GetPlaceholderCopresenceCounts' >> _GetPlaceholderCopresenceCounts(self._x_paths, self._min_x_count)) def move_y_to_value(key, xy_count): return _SlicedXKey(key.slice_key, key.x_path, key.x), (key.y, xy_count) # _SlicedXKey(slice, x_path, x), (y, xy_count) copresence_counts = ( (placeholder_copresence_counts, partial_copresence_counts) | 'FlattenCopresenceCounts' >> beam.Flatten() | 'SumCopresencePairs' >> beam.CombinePerKey(sum) | 'MoveYToValue' >> beam.MapTuple(move_y_to_value)) # _SlicedYKey(slice, y), _ConditionalYRate(x_path, x, xy_count, x_count) return ({ 'x_count': x_counts, 'xy_counts': copresence_counts } | 'CoGroupByForConditionalYRates' >> beam.CoGroupByKey() | 'JoinXCounts' >> beam.FlatMap(_join_x_counts, self._num_xy_pairs_distinct, self._num_x_values_distinct))
def run(
    scenes: List[str],
    output_path_prefix: str,
    vis_params: Dict[str, Any],
    gpus_optional: bool,
    beam_args: Optional[List[str]] = None,
) -> None:
    """Load multiple Landsat scenes, render them as RGB images and save them.

    Args:
        scenes: List of Landsat 8 scene IDs.
        output_path_prefix: Path prefix to save the output files.
        vis_params: Visualization parameters. The keys read here are
            "rgb_band_names", "min", "max" and "gamma".
        gpus_optional: If True, the pipeline won't crash if GPUs are not
            found.
        beam_args: Optional list of arguments for Beam pipeline options.
    """
    rgb_band_names = vis_params["rgb_band_names"]
    min_value = vis_params["min"]
    max_value = vis_params["max"]
    gamma = vis_params["gamma"]

    def to_image(scene, rgb_pixels):
        return (
            scene,
            Image.fromarray(rgb_pixels.numpy(), mode="RGB"),
        )

    options = PipelineOptions(beam_args, save_main_session=True)
    with beam.Pipeline(options=options) as pipeline:
        # Optionally, validate that the workers are using GPUs.
        gpu_probe = pipeline | beam.Create([None])
        gpu_probe | "Check GPU availability" >> beam.Map(
            check_gpus, gpus_optional)

        # Convert Landsat 8 scenes into images.
        band_values = (
            pipeline
            | "Create scene IDs" >> beam.Create(scenes)
            | "Get RGB band paths" >> beam.Map(get_band_paths, rgb_band_names)
            | "Load RGB band values" >> beam.MapTuple(load_values))
        images = (
            band_values
            | "Preprocess pixels" >> beam.MapTuple(
                preprocess_pixels, min_value, max_value, gamma)
            | "Convert to image" >> beam.MapTuple(to_image))
        images | "Save to Cloud Storage" >> beam.MapTuple(
            save_to_gcs, output_path_prefix)
def expand(self, input_or_inputs):
    """Window the messages, batch each window and aggregate the batch."""
    def with_dummy_key(elem):
        return (None, elem)

    def drop_dummy_key(_, val):
        return val

    windowed = input_or_inputs | "Window into fixed timespan" >> beam.WindowInto(
        window.FixedWindows(self.window_size))
    # add timestamps, in this case, read it directly from the message
    stamped = windowed | "Add timestamps to messages" >> beam.ParDo(
        ExposeMsgTimestamp())
    # dummy key is used for aggregation purpose, i.e. every self.window_size
    # we can assign a unique key
    batched = (
        stamped
        | "Add Dummy Key" >> beam.Map(with_dummy_key)
        | "Groupby" >> beam.GroupByKey()
        | "Abandon Dummy Key" >> beam.MapTuple(drop_dummy_key))
    return batched | "Aggregate HLOC" >> beam.ParDo(AggregateData())
def expand(self, pcoll):
    """Window the messages, add timestamps and batch them per window."""
    def with_dummy_key(elem):
        return (None, elem)

    def drop_dummy_key(_, val):
        return val

    # Assigns window info to each Pub/Sub message based on its
    # publish timestamp.
    # NOTE: the step label still says "Sessions", but the transform applied
    # is SlidingWindows(10, 5); the earlier
    # window.Sessions(self.gap_size) variant is disabled.
    windowed = pcoll | "Window into Sessions" >> beam.WindowInto(
        window.SlidingWindows(10, 5))
    stamped = windowed | "Add timestamps to messages" >> beam.ParDo(
        AddTimestamps())
    keyed = stamped | "Add Dummy Key" >> beam.Map(with_dummy_key)
    grouped = keyed | "Groupby" >> beam.GroupByKey()
    return grouped | "Abandon Dummy Key" >> beam.MapTuple(drop_dummy_key)
def define(self, pipeline: beam.Pipeline) -> None:
    """Attach the word-count transforms to ``pipeline``.

    Words are read from ``self.words_source``, counted per word, formatted
    with ``self.output_format`` and written to ``self.word_count_sink``.
    """
    def format_result(word, count):
        return self.output_format % (word, count)

    def pair_with_one(word):
        return (word, 1)

    words = (
        pipeline
        | 'Read' >> self.words_source
        | 'Split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str)))
    counts = (
        words
        | 'PairWithOne' >> beam.Map(pair_with_one)
        | 'GroupAndSum' >> beam.CombinePerKey(sum))
    _ = (
        counts
        | 'Format' >> beam.MapTuple(format_result)
        | 'Write' >> self.word_count_sink)
def WritePandasToCSV(
    pcoll: PCollection[pd.DataFrame],
    file_path_prefix: str,
    **kwargs,
):
    """Write every row of every DataFrame as one comma-joined text line.

    Args:
        pcoll: PCollection of DataFrames.
        file_path_prefix: Forwarded to ``beam_io.WriteToText``.
        **kwargs: Extra keyword arguments forwarded to
            ``beam_io.WriteToText``.

    Returns:
        The result of applying ``beam_io.WriteToText``.
    """
    def rows_of(df):
        return map(list, df.values)

    def to_csv_line(*fields):
        # Fields are stringified and joined as-is; no quoting is applied.
        return ",".join(map(str, fields))

    rows = pcoll | "Convert DFs to tuples" >> beam.FlatMap(rows_of)
    lines = rows | "Convert to csv lines" >> beam.MapTuple(to_csv_line)
    return lines | "Write results to csv" >> beam_io.WriteToText(
        file_path_prefix=file_path_prefix, **kwargs)
def test_sliding_windows(self):
    """Values at 1, 2, 3 are grouped into the sliding windows they overlap."""
    def sort_group(key, values):
        return (key, sorted(values))

    expected = [
        ('key @ [-2.0, 2.0)', [1]),
        ('key @ [0.0, 4.0)', [1, 2, 3]),
        ('key @ [2.0, 6.0)', [2, 3]),
    ]

    with TestPipeline() as p:
        pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3)
        grouped = (
            pcoll
            | 'w' >> WindowInto(SlidingWindows(period=2, size=4))
            | GroupByKey())
        result = grouped | beam.MapTuple(sort_group) | reify_windows
        assert_that(result, equal_to(expected))
def run(argv=None):
    """Build and run the streaming pipeline.

    Records are read from Pub/Sub, parsed from JSON and timestamped, then
    fanned out to three sinks: the raw BigQuery table, a BigQuery table fed by
    60-second windows grouped per key, and 60-second batches handed to
    WriteBatchesToGCS.

    Args:
        argv: Command-line arguments. Flags not declared here are forwarded
            to Beam as pipeline options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--topic", type=str, help='Pub/Sub topic to read from')
    parser.add_argument("--output_bucket", help=('Output local filemane'))
    parser.add_argument('--output_bigquery', default='IoTData.engine',
                        help=('Output BigQuery table: '
                              'PROJECT:DATASET.TABLE '
                              'or DATASET.TABLE.'))
    parser.add_argument('--output_bigquery_avg', default='DeviceData.engine_avr',
                        help=('Output BigQuery table for averages: '
                              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    args, pipeline_args = parser.parse_known_args(argv)

    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    p = beam.Pipeline(options=options)

    pubsub_stream = (
        p | 'Read from PubSub' >> beam.io.ReadFromPubSub(topic=args.topic))

    records = (pubsub_stream
               | 'Parse JSON to Dict' >> beam.Map(lambda e: json.loads(e))
               | 'Add timestamp' >> beam.ParDo(AddTimestampToDict()))

    # stream to BigQuery
    (records
     | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
         args.output_bigquery,
         schema=Schema.get_bigquery_schema(),
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    # averages
    # The dispositions are referenced through beam.io here as well, matching
    # the write above, so this step does not depend on a separate
    # BigQueryDisposition import.
    (records
     | "Window for avg" >> beam.WindowInto(window.FixedWindows(60))
     | 'Add deviceId Key' >> beam.ParDo(AddKeyToDict())
     | 'Group by Key' >> beam.GroupByKey()
     | 'Count average' >> beam.ParDo(CountAverages())
     | 'Write Avg to BigQuery' >> beam.io.WriteToBigQuery(
         args.output_bigquery_avg,
         schema=Schema.get_bigquery_avg_schema(),
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    # Batch every 60-second window into one iterable via a dummy key.
    (records
     | "Window for bucket" >> beam.WindowInto(window.FixedWindows(60))
     | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
     | "Group by Dummy Key" >> beam.GroupByKey()
     | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val)
     | "Write to GCS" >> beam.ParDo(WriteBatchesToGCS(args.output_bucket)))

    result = p.run()
    result.wait_until_finish()
def run(argv=None):
    """Fetch Wikidata/Commons/Wikipedia data, combine it and write changes.

    Args:
        argv: Command-line arguments used to build the pipeline options.
    """
    pipeline_options = PipelineOptions(argv)
    options = pipeline_options.view_as(ParkdataPipelineOptions)
    # Save the main session that defines global import, functions and
    # variables. Otherwise they are not saved during the serialization.
    # Details see
    # https://cloud.google.com/dataflow/docs/resources/faq#how_do_i_handle_nameerrors
    setup_options = pipeline_options.view_as(SetupOptions)
    setup_options.save_main_session = options.save_session

    def in_base_path(file_name):
        return FileSystems.join(options.base_path, file_name)

    with beam.Pipeline(options=pipeline_options) as p:
        grouped_query_results = (
            p
            | "wikidata_query/create" >> beam.Create(wd_queries())
            | "wikidata/query" >> wikidata.Query(
                in_base_path("wikidata_query_cache.sqlite"),
                user_agent=options.user_agent,
            )
            | "wikidata/group" >> beam.GroupByKey())
        wikidata_data, commons_ids = (
            grouped_query_results
            | "wikidata/fetch" >> wikidata.Transform(
                options.supported_languages(),
                cache_file=in_base_path("wikidata_cache.sqlite"),
                user_agent=options.user_agent,
            ))

        commons_data = commons_ids | "commons" >> commons.Transform(
            in_base_path("commons_cache.sqlite"),
            user_agent=options.user_agent)

        # The cache file name is kept exactly as it was so the same file
        # keeps being used.
        wikipedia_data = wikidata_data | "wikipedia" >> wikipedia.Transform(
            in_base_path("wikipedia_qache.sqlite"),
            user_agent=options.user_agent)

        tagged_sources = {
            Combine.TAG_COMMONS: commons_data,
            Combine.TAG_WIKIDATA: wikidata_data,
            Combine.TAG_WIKIPEDIA: wikipedia_data,
        }
        changed_places = (
            tagged_sources
            | "combine/group_by_key" >> beam.CoGroupByKey()
            | "combine/combine" >> beam.ParDo(Combine())
            | "combine/changed" >> beam.ParDo(
                OutputNewOrChangedEntires(in_base_path("output.sqlite"))))

        firestore_ready = (
            changed_places
            | "firestore_output/convert_types" >> beam.MapTuple(
                use_firestore_types))
        firestore_ready | "firestore_output/write" >> beam.ParDo(
            FirestoreWrite(
                project=options.project_id,
                collection="places_v4",
                credentials="gcp-service-account.json"))
def run(options):
    """Count the words in the input text and write the formatted counts.

    Args:
        options: Pipeline options; ``input`` and ``output`` are read from the
            pipeline's options.
    """
    def pair_with_one(word):
        return (word, 1)

    with beam.Pipeline(options=options) as p:
        lines = p | 'Read test xml file' >> beam.io.ReadFromText(
            p.options.input, validate=False)

        words = lines | 'Split' >> (
            beam.ParDo(WordExtractingDoFn()).with_output_types(str))
        counts = (
            words
            | 'PairWIthOne' >> beam.Map(pair_with_one)
            | 'GroupAndSum' >> beam.CombinePerKey(sum))

        output = counts | 'Format' >> beam.MapTuple(format_result)
        output | 'Write' >> beam.io.WriteToText(p.options.output)
def expand(self, pcolls):
    """Evaluate this stage's output expressions over partitioned inputs.

    Args:
        pcolls: Mapping from expression id to the PCollection holding that
            expression's value; it is indexed by ``expr._id`` for each stage
            input.

    Returns:
        The result of ``FlatMap(...).with_outputs()``, in which every stage
        output is emitted under a tag equal to its expression id.
    """
    scalar_inputs = [expr for expr in self.stage.inputs if is_scalar(expr)]
    tabular_inputs = [
        expr for expr in self.stage.inputs if not is_scalar(expr)
    ]

    if len(tabular_inputs) == 0:
        # No tabular input: evaluate once over a single empty partition.
        # dict.values() is a view, not an iterator, so it must be wrapped in
        # iter() before next() can pull a PCollection out of it to reach the
        # pipeline.
        partitioned_pcoll = next(iter(
            pcolls.values())).pipeline | beam.Create([{}])

    elif self.stage.partitioning != partitionings.Nothing():
        # Arrange such that partitioned_pcoll is properly partitioned.
        main_pcolls = {
            expr._id: pcolls[expr._id] | 'Flat%s' % expr._id >> beam.FlatMap(
                self.stage.partitioning.partition_fn)
            for expr in tabular_inputs
        } | beam.CoGroupByKey()
        # Drop the partition key and concatenate the pieces of each input.
        partitioned_pcoll = main_pcolls | beam.MapTuple(
            lambda _, inputs: {tag: pd.concat(vs)
                               for tag, vs in inputs.items()})

    else:
        # Already partitioned, or no partitioning needed.
        assert len(tabular_inputs) == 1
        tag = tabular_inputs[0]._id
        partitioned_pcoll = pcolls[tag] | beam.Map(lambda df: {tag: df})

    # Scalar inputs are made available to every partition as side inputs.
    side_pcolls = {
        expr._id: beam.pvalue.AsSingleton(pcolls[expr._id])
        for expr in scalar_inputs
    }

    # Actually evaluate the expressions.
    def evaluate(partition, stage=self.stage, **side_inputs):
        session = expressions.Session(
            dict([(expr, partition[expr._id]) for expr in tabular_inputs] +
                 [(expr, side_inputs[expr._id]) for expr in scalar_inputs]))
        for expr in stage.outputs:
            yield beam.pvalue.TaggedOutput(expr._id, expr.evaluate_at(session))

    return partitioned_pcoll | beam.FlatMap(
        evaluate, **side_pcolls).with_outputs()
def test_reshuffle_after_gbk_contents_unchanged(self):
    """Reshuffling grouped output must leave its contents untouched."""
    data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 3)]
    expected_result = [(1, [1, 2, 3]), (2, [1, 2]), (3, [1])]

    def sort_group(key, values):
        return (key, sorted(values))

    with TestPipeline() as pipeline:
        grouped = pipeline | beam.Create(data) | beam.GroupByKey()
        after_gbk = grouped | beam.MapTuple(sort_group)
        assert_that(after_gbk, equal_to(expected_result), label='after_gbk')

        after_reshuffle = after_gbk | beam.Reshuffle()
        assert_that(
            after_reshuffle,
            equal_to(expected_result),
            label='after_reshuffle')
def test_rewindow(self):
    """Rewindowing keeps the duplicates created by sliding windows."""
    def stamp(value_and_time):
        return TimestampedValue(value_and_time[0], value_and_time[1])

    def under_single_key(value):
        return ('key', value)

    def sort_group(key, values):
        return (key, sorted(values))

    expected = [
        ('key', sorted([0, 1, 2, 3, 4] * 3)),
        ('key', sorted([5, 6, 7, 8, 9] * 3)),
    ]

    with TestPipeline() as p:
        stamped = p | Create([(k, k) for k in range(10)]) | Map(stamp)
        # Per the model, each element is now duplicated across
        # three windows. Rewindowing must preserve this duplication.
        rewindowed = (
            stamped
            | 'window' >> WindowInto(SlidingWindows(period=2, size=6))
            | 'rewindow' >> WindowInto(FixedWindows(5))
            | 'rewindow2' >> WindowInto(FixedWindows(5)))
        result = (
            rewindowed
            | Map(under_single_key)
            | GroupByKey()
            | beam.MapTuple(sort_group))
        assert_that(result, equal_to(expected))
def CheckAggregation(inputs_and_expected, aggregation):
    """Apply `aggregation` to the inputs and compare against expectations.

    `window_fn`, `trigger_fn`, `accumulation_mode`, `timestamp_combiner` and
    `self` are not parameters; they are read from the enclosing scope.

    Args:
        inputs_and_expected: PCollection of (tag, value) pairs, where the tag
            routes the value to either the 'input' or the 'expect' output.
        aggregation: Transform applied to the windowed (key, value) inputs.
    """
    # Split the test stream into a branch of to-be-processed elements, and
    # a branch of expected results.
    # Every value is emitted twice, once under 'key1' and once under 'key2'.
    inputs, expected = (
        inputs_and_expected
        | beam.FlatMapTuple(
            lambda tag, value: [
                beam.pvalue.TaggedOutput(tag, ('key1', value)),
                beam.pvalue.TaggedOutput(tag, ('key2', value)),
            ]).with_outputs('input', 'expect'))

    # Process the inputs with the given windowing to produce actual outputs.
    # The value itself is used as the element's timestamp.
    outputs = (
        inputs
        | beam.MapTuple(
            lambda key, value: TimestampedValue((key, value), value))
        | beam.WindowInto(
            window_fn,
            trigger=trigger_fn,
            accumulation_mode=accumulation_mode,
            timestamp_combiner=timestamp_combiner)
        | aggregation
        | beam.MapTuple(_windowed_value_info_map_fn)
        # Place outputs back into the global window to allow flattening
        # and share a single state in Check.
        | 'Global' >> beam.WindowInto(
            beam.transforms.window.GlobalWindows()))
    # Feed both the expected and actual outputs to Check() for comparison.
    tagged_expected = (
        expected
        | beam.MapTuple(lambda key, value: (key, ('expect', value))))
    tagged_outputs = (
        outputs
        | beam.MapTuple(lambda key, value: (key, ('actual', value))))
    # pylint: disable=expression-not-assigned
    ([tagged_expected, tagged_outputs]
     | beam.Flatten()
     | beam.ParDo(Check(self.allow_out_of_order)))
def test_reshuffle_global_window(self):
    """Reshuffle keeps grouped contents intact in the global window."""
    data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
    expected_data = [(1, [1, 2, 4]), (2, [1, 2]), (3, [1])]

    def sort_group(key, values):
        return (key, sorted(values))

    with TestPipeline() as pipeline:
        grouped = (
            pipeline
            | beam.Create(data)
            | beam.WindowInto(GlobalWindows())
            | beam.GroupByKey())
        before_reshuffle = grouped | beam.MapTuple(sort_group)
        assert_that(
            before_reshuffle,
            equal_to(expected_data),
            label='before_reshuffle')

        after_reshuffle = before_reshuffle | beam.Reshuffle()
        assert_that(
            after_reshuffle,
            equal_to(expected_data),
            label='after reshuffle')