def expand(self, pcoll):
  return (
      pcoll
      | WindowInto(window.GlobalWindows())
      | "ToVoidKey" >> Map(lambda v: (None, v))
      | "Group" >> GroupByKey()
      | "UnKey" >> Map(lambda kv: kv[1])
      | "Match" >> Map(matcher))
def run_combine(pipeline, input_elements=5, lift_combiners=True):
  # Calculate the expected result, which is the sum of an arithmetic sequence.
  # By default, this is equal to: 0 + 1 + 2 + 3 + 4 = 10
  expected_result = input_elements * (input_elements - 1) // 2

  # Enable runtime type checking in order to cover TypeCheckCombineFn by
  # the test.
  pipeline.get_pipeline_options().view_as(
      TypeOptions).runtime_type_check = True
  pipeline.get_pipeline_options().view_as(
      TypeOptions).allow_unsafe_triggers = True

  with pipeline as p:
    pcoll = p | 'Start' >> beam.Create(range(input_elements))

    # Certain triggers, such as AfterCount, are incompatible with combiner
    # lifting. We can use that fact to prevent combiners from being lifted.
    if not lift_combiners:
      pcoll |= beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.AfterCount(input_elements),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)

    # Pass an additional 'None' in order to cover _CurriedFn by the test.
    pcoll |= 'Do' >> beam.CombineGlobally(
        combiners.SingleInputTupleCombineFn(
            CallSequenceEnforcingCombineFn(),
            CallSequenceEnforcingCombineFn()),
        None).with_fanout(fanout=1)

    assert_that(pcoll, equal_to([(expected_result, expected_result)]))
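# A hedged usage sketch for run_combine above. Assumption: it is driven with
# Beam's TestPipeline, which provides the get_pipeline_options() method the
# helper relies on.
from apache_beam.testing.test_pipeline import TestPipeline

# Exercise both code paths: with and without combiner lifting.
run_combine(TestPipeline(), input_elements=5, lift_combiners=True)
run_combine(TestPipeline(), input_elements=5, lift_combiners=False)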
def load(events, metadata=None, pipeline_options=None):
  num_events_in_pane = 30
  windowed_events = (
      events
      | beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.Repeatedly(trigger.AfterCount(num_events_in_pane)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING))
  auction_by_seller_id = (
      windowed_events
      | nexmark_query_util.JustAuctions()
      | 'query3_filter_category' >> beam.Filter(lambda auc: auc.category == 10)
      | 'query3_key_by_seller' >> beam.ParDo(
          nexmark_query_util.AuctionBySellerFn()))
  person_by_id = (
      windowed_events
      | nexmark_query_util.JustPerson()
      | 'query3_filter_region' >>
      beam.Filter(lambda person: person.state in ['OR', 'ID', 'CA'])
      | 'query3_key_by_person_id' >> beam.ParDo(
          nexmark_query_util.PersonByIdFn()))
  return ({
      nexmark_query_util.AUCTION_TAG: auction_by_seller_id,
      nexmark_query_util.PERSON_TAG: person_by_id,
  }
          | beam.CoGroupByKey()
          | 'query3_join' >> beam.ParDo(
              JoinFn(metadata.get('max_auction_waiting_time')))
          | 'query3_output' >> beam.Map(
              lambda t: {
                  ResultNames.NAME: t[1].name,
                  ResultNames.CITY: t[1].city,
                  ResultNames.STATE: t[1].state,
                  ResultNames.AUCTION_ID: t[0].id
              }))
def test_fixed_global_window(self):
  self.run_windowed_side_inputs([1, 2, 11],
                                window.FixedWindows(10),
                                window.GlobalWindows(),
                                expected=[(1, [1, 2, 11]), (2, [1, 2, 11]),
                                          (11, [1, 2, 11])])
def default_window_mapping_fn(target_window_fn):
  if target_window_fn == window.GlobalWindows():
    return _global_window_mapping_fn

  def map_via_end(source_window):
    return list(
        target_window_fn.assign(
            window.WindowFn.AssignContext(
                source_window.max_timestamp())))[-1]

  return map_via_end
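# A minimal sketch of the mapping function returned above, assuming Beam's
# standard FixedWindows and IntervalWindow types: the chosen side-input
# window is the target window containing the source window's end.
from apache_beam.transforms import window

mapping_fn = default_window_mapping_fn(window.FixedWindows(10))
# [15, 25) ends just before timestamp 25, which FixedWindows(10)
# assigns to [20, 30).
mapped = mapping_fn(window.IntervalWindow(15, 25))
assert mapped == window.IntervalWindow(20, 30)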
def _run_pardo_state_timers(self, windowed):
  state_spec = userstate.BagStateSpec('state', beam.coders.StrUtf8Coder())
  timer_spec = userstate.TimerSpec('timer', userstate.TimeDomain.WATERMARK)
  elements = list('abcdefgh')
  buffer_size = 3

  class BufferDoFn(beam.DoFn):
    def process(self,
                kv,
                ts=beam.DoFn.TimestampParam,
                timer=beam.DoFn.TimerParam(timer_spec),
                state=beam.DoFn.StateParam(state_spec)):
      _, element = kv
      state.add(element)
      buffer = state.read()
      # For real use, we'd keep track of this size separately.
      if len(list(buffer)) >= buffer_size:
        state.clear()
        yield buffer
      else:
        timer.set(ts + 1)

    @userstate.on_timer(timer_spec)
    def process_timer(self, state=beam.DoFn.StateParam(state_spec)):
      buffer = state.read()
      state.clear()
      yield buffer

  def is_buffered_correctly(actual):
    # Pickling self in the closure for asserts gives errors (only on jenkins).
    self = FnApiRunnerTest('__init__')
    # Actual should be a grouping of the inputs into batches of size
    # at most buffer_size, but the actual batching is nondeterministic
    # based on ordering and trigger firing timing.
    self.assertEqual(sorted(sum((list(b) for b in actual), [])), elements)
    self.assertEqual(max(len(list(buffer)) for buffer in actual), buffer_size)
    if windowed:
      # Elements were assigned to windows based on their parity.
      # Assert that each grouping consists of elements belonging to the
      # same window to ensure states and timers were properly partitioned.
      for b in actual:
        parity = set(ord(e) % 2 for e in b)
        self.assertEqual(1, len(parity), b)

  with self.create_pipeline() as p:
    actual = (
        p
        | beam.Create(elements)
        # Send even and odd elements to different windows.
        | beam.Map(lambda e: window.TimestampedValue(e, ord(e) % 2))
        | beam.WindowInto(
            window.FixedWindows(1) if windowed else window.GlobalWindows())
        | beam.Map(lambda x: ('key', x))
        | beam.ParDo(BufferDoFn()))

    assert_that(actual, is_buffered_correctly)
def expand(self, pcoll):
  windowing_saved = pcoll.windowing
  if windowing_saved.is_default():
    # In this (common) case we can use a trivial trigger driver
    # and avoid the (expensive) window param.
    globally_windowed = window.GlobalWindows.windowed_value(None)
    MIN_TIMESTAMP = window.MIN_TIMESTAMP

    def reify_timestamps(element, timestamp=DoFn.TimestampParam):
      key, value = element
      if timestamp == MIN_TIMESTAMP:
        timestamp = None
      return key, (value, timestamp)

    def restore_timestamps(element):
      key, values = element
      return [
          globally_windowed.with_value((key, value)) if timestamp is None else
          window.GlobalWindows.windowed_value((key, value), timestamp)
          for (value, timestamp) in values
      ]
  else:

    # typing: All conditional function variants must have identical signatures
    def reify_timestamps(  # type: ignore[misc]
        element, timestamp=DoFn.TimestampParam, window=DoFn.WindowParam):
      key, value = element
      # Transport the window as part of the value and restore it later.
      return key, windowed_value.WindowedValue(value, timestamp, [window])

    def restore_timestamps(element):
      key, windowed_values = element
      return [wv.with_value((key, wv.value)) for wv in windowed_values]

  ungrouped = pcoll | Map(reify_timestamps).with_output_types(Any)

  # TODO(BEAM-8104) Using global window as one of the standard window.
  # This is to mitigate the Dataflow Java Runner Harness limitation to
  # accept only standard coders.
  ungrouped._windowing = Windowing(
      window.GlobalWindows(),
      triggerfn=Always(),
      accumulation_mode=AccumulationMode.DISCARDING,
      timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
  result = (
      ungrouped
      | GroupByKey()
      | FlatMap(restore_timestamps).with_output_types(Any))
  result._windowing = windowing_saved
  return result
def expand(self, pcoll):
  windowing_saved = pcoll.windowing
  if windowing_saved.is_default():
    # In this (common) case we can use a trivial trigger driver
    # and avoid the (expensive) window param.
    globally_windowed = window.GlobalWindows.windowed_value(None)
    window_fn = window.GlobalWindows()
    MIN_TIMESTAMP = window.MIN_TIMESTAMP

    def reify_timestamps(element, timestamp=DoFn.TimestampParam):
      key, value = element
      if timestamp == MIN_TIMESTAMP:
        timestamp = None
      return key, (value, timestamp)

    def restore_timestamps(element):
      key, values = element
      return [
          globally_windowed.with_value((key, value)) if timestamp is None else
          window.GlobalWindows.windowed_value((key, value), timestamp)
          for (value, timestamp) in values
      ]
  else:
    # The linter is confused.
    # hash(1) is used to force "runtime" selection of _IdentityWindowFn
    # pylint: disable=abstract-class-instantiated
    cls = hash(1) and _IdentityWindowFn
    window_fn = cls(windowing_saved.windowfn.get_window_coder())

    def reify_timestamps(element, timestamp=DoFn.TimestampParam):
      key, value = element
      return key, TimestampedValue(value, timestamp)

    def restore_timestamps(element, window=DoFn.WindowParam):
      # Pass the current window since _IdentityWindowFn wouldn't know how
      # to generate it.
      key, values = element
      return [
          windowed_value.WindowedValue((key, value.value), value.timestamp,
                                       [window]) for value in values
      ]

  ungrouped = pcoll | Map(reify_timestamps)
  ungrouped._windowing = Windowing(
      window_fn,
      triggerfn=AfterCount(1),
      accumulation_mode=AccumulationMode.DISCARDING,
      timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
  result = (ungrouped | GroupByKey() | FlatMap(restore_timestamps))
  result._windowing = windowing_saved
  return result
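# Both expand() variants above back Beam's reshuffle pattern: redistribute
# elements while preserving timestamps and windows. A minimal, hedged usage
# sketch via the public Reshuffle transform, which wraps this logic:
import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(range(10))
      # Reshuffle breaks fusion and materializes the data; timestamps and
      # windowing are restored after the internal GroupByKey.
      | beam.Reshuffle()
      | beam.Map(print))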
def expand(self, pcoll):
  # We must have at least a single element to ensure the matcher
  # code gets run even if the input pcollection is empty.
  keyed_singleton = pcoll.pipeline | Create([(None, None)])
  keyed_actual = (
      pcoll
      | WindowInto(window.GlobalWindows())
      | "ToVoidKey" >> Map(lambda v: (None, v)))
  _ = ((keyed_singleton, keyed_actual)
       | "Group" >> CoGroupByKey()
       | "Unkey" >> Map(lambda kv: kv[1][1])
       | "Match" >> Map(matcher))
def expand(self, pcoll):
  do_once = pcoll.pipeline | 'DoOnce' >> core.Create([None])
  init_result_coll = do_once | 'InitializeWrite' >> core.Map(
      lambda _, sink: sink.initialize_write(), self.sink)
  if getattr(self.sink, 'num_shards', 0):
    min_shards = self.sink.num_shards
    if min_shards == 1:
      keyed_pcoll = pcoll | core.Map(lambda x: (None, x))
    else:
      keyed_pcoll = pcoll | core.ParDo(_RoundRobinKeyFn(min_shards))
    write_result_coll = (
        keyed_pcoll
        | core.WindowInto(window.GlobalWindows())
        | core.GroupByKey()
        | 'WriteBundles' >> core.ParDo(
            _WriteKeyedBundleDoFn(self.sink), AsSingleton(init_result_coll)))
  else:
    min_shards = 1
    write_result_coll = (
        pcoll
        | 'WriteBundles' >> core.ParDo(
            _WriteBundleDoFn(self.sink), AsSingleton(init_result_coll))
        | 'Pair' >> core.Map(lambda x: (None, x))
        | core.WindowInto(window.GlobalWindows())
        | core.GroupByKey()
        | 'Extract' >> core.FlatMap(lambda x: x[1]))
  # PreFinalize should run before FinalizeWrite, and the two should not be
  # fused.
  pre_finalize_coll = do_once | 'PreFinalize' >> core.FlatMap(
      _pre_finalize,
      self.sink,
      AsSingleton(init_result_coll),
      AsIter(write_result_coll))
  return do_once | 'FinalizeWrite' >> core.FlatMap(
      _finalize_write,
      self.sink,
      AsSingleton(init_result_coll),
      AsIter(write_result_coll),
      min_shards,
      AsSingleton(pre_finalize_coll))
def run(argv=None):
  # Use Python argparse module to parse custom arguments
  parser = argparse.ArgumentParser()
  parser.add_argument('--network')
  parser.add_argument('--input', dest='input', help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      help='Output file to write results to.')
  parser.add_argument('--output_topic',
                      dest='out_topic',
                      help=('Output PubSub topic of the form '
                            '"projects/<PROJECT>/topic/<TOPIC>".'))
  parser.add_argument('--input_topic',
                      dest='in_topic',
                      help=('Input PubSub topic of the form '
                            '"projects/<PROJECT>/topic/<TOPIC>".'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  p_options = PipelineOptions(pipeline_args)
  google_cloud_options = p_options.view_as(GoogleCloudOptions)
  google_cloud_options.region = 'europe-west1'
  google_cloud_options.project = 'smartlive'
  # google_cloud_options.job_name = 'dataflow-job-{}'.format(
  #     datetime.datetime.now().strftime("%Y-%m-%d%H%M%S"))
  google_cloud_options.staging_location = 'gs://rim-bucket/binaries'
  google_cloud_options.temp_location = 'gs://rim-bucket/temp'

  p_options.view_as(StandardOptions).runner = 'DirectRunner'
  p_options.view_as(SetupOptions).save_main_session = True
  p_options.view_as(StandardOptions).streaming = True
  p_options.view_as(WorkerOptions).subnetwork = (
      'regions/europe-west1/subnetworks/test')

  p = beam.Pipeline(options=p_options)

  lines = (p
           | 'receive_data' >> beam.io.ReadFromPubSub(
               subscription=known_args.in_topic).with_input_types(str)
           | 'decode' >> beam.Map(lambda x: x.decode('utf-8'))
           | 'jsonload' >> beam.Map(lambda x: json.loads(x)))

  # ------------------------------ global window --------------------------- #
  (lines
   | 'window' >> beam.WindowInto(
       window.GlobalWindows(),
       trigger=trigger.AfterProcessingTime(10),
       accumulation_mode=trigger.AccumulationMode.DISCARDING)
   | 'CountGlobally' >> beam.CombineGlobally(
       beam.combiners.CountCombineFn()).without_defaults()
   | 'print' >> beam.ParDo(PrintFn()))

  (lines
   | 'jsondumps' >> beam.Map(lambda x: json.dumps(x))
   | 'encode' >> beam.Map(lambda x: x.encode('utf-8'))
   | 'send_to_Pub/Sub' >> beam.io.WriteToPubSub(known_args.out_topic))

  p.run().wait_until_finish()
def expand(self, pcoll):
  pcoll = (
      pcoll
      | core.WindowInto(window.GlobalWindows())
      | beam.ParDo(self._sharder)
      | beam.GroupByKey()  # group by id and shard
  )
  with warnings.catch_warnings():
    # Suppress a spurious warning generated within beam.io.Write.
    # This warning is annoying but harmless.
    warnings.filterwarnings(
        action="ignore",
        message="Using fallback coder for typehint: <type 'NoneType'>")
    return pcoll | beam.io.Write(self._sink).with_output_types(str)
def expand(self, pcoll):
  if reify_windows:
    pcoll = pcoll | ParDo(ReifyTimestampWindow())
  # We must have at least a single element to ensure the matcher
  # code gets run even if the input pcollection is empty.
  keyed_singleton = pcoll.pipeline | Create([(None, None)])
  keyed_actual = (
      pcoll
      | WindowInto(window.GlobalWindows())
      | "ToVoidKey" >> Map(lambda v: (None, v)))
  _ = ((keyed_singleton, keyed_actual)
       | "Group" >> CoGroupByKey()
       | "Unkey" >> Map(lambda k___actual_values: k___actual_values[1][1])
       | "Match" >> Map(matcher))
def default_window_mapping_fn(target_window_fn):
  # type: (window.WindowFn) -> WindowMappingFn
  if target_window_fn == window.GlobalWindows():
    return _global_window_mapping_fn

  if isinstance(target_window_fn, window.Sessions):
    raise RuntimeError("Sessions is not allowed in side inputs")

  def map_via_end(source_window):
    # type: (window.BoundedWindow) -> window.BoundedWindow
    return list(
        target_window_fn.assign(
            window.WindowFn.AssignContext(
                source_window.max_timestamp())))[-1]

  return map_via_end
def test_setting_global_window(self):
  with TestPipeline() as p:
    unkeyed_items = p | beam.Create([2, 11, 16, 27])
    items = (
        unkeyed_items
        | 'key' >> beam.Map(
            lambda x: beam.window.TimestampedValue(('k', x), x)))
    # [START setting_global_window]
    from apache_beam import window
    session_windowed_items = (
        items | 'window' >> beam.WindowInto(window.GlobalWindows()))
    # [END setting_global_window]
    summed = (
        session_windowed_items
        | 'group' >> beam.GroupByKey()
        | 'combine' >> beam.CombineValues(sum))
    unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
    assert_that(unkeyed, equal_to([56]))
def load(events, metadata=None):
  return (
      events
      | nexmark_query_util.JustBids()
      | 'query12_extract_bidder' >> beam.Map(lambda bid: bid.bidder)
      # windowing with processing time trigger, currently not supported in
      # batch
      | beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.Repeatedly(
              trigger.AfterProcessingTime(metadata.get('window_size_sec'))),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          allowed_lateness=0)
      | 'query12_bid_count' >> beam.combiners.Count.PerElement()
      | 'query12_output' >> beam.Map(
          lambda t: {
              ResultNames.BIDDER_ID: t[0], ResultNames.BID_COUNT: t[1]
          }))
def expand(self, pcoll):
  if reify_windows:
    pcoll = pcoll | ParDo(ReifyTimestampWindow())
  keyed_singleton = pcoll.pipeline | Create([(None, None)])
  keyed_actual = (
      pcoll
      | WindowInto(custom_windowing or window.GlobalWindows())
      | "ToVoidKey" >> Map(lambda v: (None, v)))
  plain_actual = ((keyed_singleton, keyed_actual)
                  | "Group" >> CoGroupByKey()
                  | "Unkey" >> Map(lambda k_values: k_values[1][1]))
  if custom_windowing:
    plain_actual = plain_actual | "AddWindow" >> ParDo(AddWindow())
  plain_actual = plain_actual | "Match" >> Map(matcher)
def run(argv=None):
  # Use Python argparse module to parse custom arguments
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://rim-bucket/market.txt',
                      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      # CHANGE 1/5: The Google Cloud Storage path is required
      # for outputting the results.
      default='gs://rim-bucket/output/',
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p_options = PipelineOptions(pipeline_args)
  google_cloud_options = p_options.view_as(GoogleCloudOptions)
  google_cloud_options.region = 'europe-west1'
  google_cloud_options.project = 'smartlive'
  # google_cloud_options.job_name = 'dataflow-job-{}'.format(
  #     datetime.datetime.now().strftime("%Y-%m-%d%H%M%S"))
  google_cloud_options.staging_location = 'gs://rim-bucket/binaries'
  google_cloud_options.temp_location = 'gs://rim-bucket/temp'

  p_options.view_as(StandardOptions).runner = 'DirectRunner'
  p_options.view_as(SetupOptions).save_main_session = True
  p_options.view_as(StandardOptions).streaming = True
  p_options.view_as(WorkerOptions).subnetwork = (
      'regions/europe-west1/subnetworks/test')

  p = beam.Pipeline(options=p_options)

  lines = (p
           | 'receive_data' >> beam.io.ReadFromText(known_args.input)
           | 'window' >> beam.WindowInto(window.GlobalWindows())
           | 'jsonload' >> beam.Map(lambda x: json.loads(x))
           | 'count' >> beam.Map(lambda x: len(x))
           | 'printnbrarticles' >> beam.ParDo(PrintFn()))

  # ----- Fixed window + AfterWatermark trigger + accumulating mode ----- #
  (lines
   | 'CountGlobally' >> beam.CombineGlobally(
       beam.combiners.CountCombineFn()).without_defaults())

  p.run().wait_until_finish()
def load(events, metadata=None):
  # Find winning bids for each closed auction.
  return (
      events
      # Find winning bids.
      | beam.Filter(nexmark_query_util.auction_or_bid)
      | winning_bids.WinningBids()
      # Map auction_bids to (auction.seller, bid).
      | beam.Map(lambda auc_bid: (auc_bid.auction.seller, auc_bid.bid))
      # Calculate and output the mean as data arrives.
      | beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.Repeatedly(trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
          allowed_lateness=0)
      | beam.CombinePerKey(MovingMeanSellingPriceFn(10))
      | beam.Map(lambda t: {
          ResultNames.SELLER: t[0], ResultNames.PRICE: t[1]
      }))
def run(argv=None): """Build and run the pipeline.""" args = ["--runner=PortableRunner", "--job_endpoint=localhost:8099", "--streaming"] if argv: args.extend(argv) parser = argparse.ArgumentParser() parser.add_argument('--count', dest='count', default=0, help='Number of triggers to generate ' '(0 means emit forever).') parser.add_argument('--interval_ms', dest='interval_ms', default=500, help='Interval between records per parallel ' 'Flink subtask.') known_args, pipeline_args = parser.parse_known_args(args) pipeline_options = PipelineOptions(pipeline_args) p = beam.Pipeline(options=pipeline_options) messages = (p | FlinkStreamingImpulseSource() .set_message_count(known_args.count) .set_interval_ms(known_args.interval_ms)) _ = (messages | 'decode' >> beam.Map(lambda x: ('', 1)) | 'window' >> beam.WindowInto(window.GlobalWindows(), trigger=Repeatedly( AfterProcessingTime(5 * 1000)), accumulation_mode= AccumulationMode.DISCARDING) | 'group' >> beam.GroupByKey() | 'count' >> beam.Map(count) | 'log' >> beam.Map(lambda x: logging.info("%d" % x[1]))) result = p.run() result.wait_until_finish()
def run(argv=None):
  parser = argparse.ArgumentParser()
  known_args, pipeline_args = parser.parse_known_args(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  data = [{'message': 'Hi', 'timestamp': time.time()}]

  events = (
      p
      | 'Create Events' >> beam.Create(data)
      | 'Add Timestamps' >> beam.Map(
          lambda x: beam.window.TimestampedValue(x, x['timestamp']))
      | 'Sliding Windows' >> beam.WindowInto(
          beam.window.SlidingWindows(60, 60))
      | 'First window' >> beam.ParDo(DebugPrinterFn())
      | 'global Window' >> beam.WindowInto(window.GlobalWindows())
      | 'Second window' >> beam.ParDo(DebugPrinterFn()))

  result = p.run()
  result.wait_until_finish()
def expand(self, pcoll):
  if reify_windows:
    pcoll = pcoll | ParDo(ReifyTimestampWindow())
  keyed_singleton = pcoll.pipeline | Create([(None, None)])
  if use_global_window:
    pcoll = pcoll | WindowInto(window.GlobalWindows())
  keyed_actual = pcoll | "ToVoidKey" >> Map(lambda v: (None, v))
  # This is a CoGroupByKey so that the matcher always runs, even if the
  # PCollection is empty.
  plain_actual = ((keyed_singleton, keyed_actual)
                  | "Group" >> CoGroupByKey()
                  | "Unkey" >> Map(lambda k_values: k_values[1][1]))
  if not use_global_window:
    plain_actual = plain_actual | "AddWindow" >> ParDo(AddWindow())
  plain_actual = plain_actual | "Match" >> Map(matcher)
def test(self):
  _ = (
      self.pipeline
      | 'Read from pubsub' >> ReadFromPubSub(
          subscription=self.read_sub_name,
          with_attributes=True,
          id_label='id',
      )
      | beam.Map(lambda x: bytes(1)).with_output_types(bytes)
      | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
      | 'Window' >> beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.Repeatedly(
              trigger.AfterCount(self.num_of_messages)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | 'Count messages' >> beam.CombineGlobally(
          beam.combiners.CountCombineFn()).without_defaults()
        .with_output_types(int)
      | 'Convert to bytes' >>
      beam.Map(lambda count: str(count).encode('utf-8'))
      | 'Write to Pubsub' >> beam.io.WriteToPubSub(self.matcher_topic_name))
def run():
  options = PipelineOptions([
      "--runner=PortableRunner",
      "--job_endpoint=localhost:8099",
      "--environment_type=LOOPBACK"
  ])
  # options = PipelineOptions([
  #     "--runner=FlinkRunner",
  #     "--flink_master=localhost:8081",
  # ])
  with beam.Pipeline(options=options) as p:
    (p
     | 'ReadFromKafka' >> ReadFromKafka(
         consumer_config={"bootstrap.servers": "localhost:9092"},
         topics=["beam-input"])
     | 'ExtractWords' >> beam.FlatMap(
         lambda kv: re.findall(r'[A-Za-z\']+', kv[1]))
     | 'Window' >> beam.WindowInto(
         window.GlobalWindows(),
         trigger=trigger.Repeatedly(trigger.AfterCount(1)),
         accumulation_mode=AccumulationMode.ACCUMULATING)
     | 'Count' >> beam.combiners.Count.PerElement()
     | 'Format' >> beam.Map(
         lambda word_count: '%s: %s' % (word_count[0], word_count[1]))
     | 'Log' >> beam.ParDo(LoggingDoFn()))
def run(argv=None):
  parser = argparse.ArgumentParser()
  known_args, pipeline_args = parser.parse_known_args(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  (p
   | 'ReadFromKafka' >> ReadFromKafka(
       consumer_config={"bootstrap.servers": "localhost:9092"},
       topics=["beam-input"])
   | 'ExtractWords' >> beam.FlatMap(
       lambda kv: re.findall(r'[A-Za-z\']+', kv[1]))
   | 'Window' >> beam.WindowInto(
       window.GlobalWindows(),
       trigger=trigger.Repeatedly(trigger.AfterCount(1)),
       accumulation_mode=AccumulationMode.ACCUMULATING)
   | 'Count' >> beam.combiners.Count.PerElement()
   | 'Format' >> beam.Map(
       lambda word_count: '%s: %s' % (word_count[0], word_count[1]))
   | 'Log' >> beam.ParDo(LoggingDoFn()))

  result = p.run()
  result.wait_until_finish()
def get_windowing(self, unused_inputs):
  return core.Windowing(window.GlobalWindows())
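# For context: get_windowing is the PTransform hook that declares the
# windowing of a transform's output. A sketch using a hypothetical
# Create-backed transform (illustrative only, not from the snippets above):
import apache_beam as beam
from apache_beam.transforms import core, window

class _GlobalImpulse(beam.PTransform):
  """Hypothetical transform whose output is globally windowed."""
  def expand(self, pbegin):
    return pbegin | beam.Create([None])

  def get_windowing(self, unused_inputs):
    return core.Windowing(window.GlobalWindows())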
def test_global_global_windows(self):
  self.run_windowed_side_inputs([1, 2, 3],
                                window.GlobalWindows(),
                                expected=[(1, [1, 2, 3]), (2, [1, 2, 3]),
                                          (3, [1, 2, 3])])
def get_windowing(self, _):
  return core.Windowing(window.GlobalWindows())
def expand(self, pcoll):
  return (
      pcoll
      | 'window' >> beam.WindowInto(window.GlobalWindows())
      | "Count" >> beam.combiners.Count.Globally()
      | "Log" >> beam.Map(log_count_info))
def make_process_bundle_descriptor(self, data_api_service_descriptor,
                                   state_api_service_descriptor):
  # type: (Optional[endpoints_pb2.ApiServiceDescriptor], Optional[endpoints_pb2.ApiServiceDescriptor]) -> beam_fn_api_pb2.ProcessBundleDescriptor

  """Creates a ProcessBundleDescriptor for invoking the WindowFn's
  merge operation.
  """
  def make_channel_payload(coder_id):
    # type: (str) -> bytes
    data_spec = beam_fn_api_pb2.RemoteGrpcPort(coder_id=coder_id)
    if data_api_service_descriptor:
      data_spec.api_service_descriptor.url = (data_api_service_descriptor.url)
    return data_spec.SerializeToString()

  pipeline_context = self._execution_context_ref().pipeline_context
  global_windowing_strategy_id = self.uid('global_windowing_strategy')
  global_windowing_strategy_proto = core.Windowing(
      window.GlobalWindows()).to_runner_api(pipeline_context)
  coders = dict(pipeline_context.coders.get_id_to_proto_map())

  def make_coder(urn, *components):
    # type: (str, str) -> str
    coder_proto = beam_runner_api_pb2.Coder(
        spec=beam_runner_api_pb2.FunctionSpec(urn=urn),
        component_coder_ids=components)
    coder_id = self.uid('coder')
    coders[coder_id] = coder_proto
    pipeline_context.coders.put_proto(coder_id, coder_proto)
    return coder_id

  bytes_coder_id = make_coder(common_urns.coders.BYTES.urn)
  window_coder_id = self._windowing_strategy_proto.window_coder_id
  global_window_coder_id = make_coder(common_urns.coders.GLOBAL_WINDOW.urn)
  iter_window_coder_id = make_coder(
      common_urns.coders.ITERABLE.urn, window_coder_id)
  input_coder_id = make_coder(
      common_urns.coders.KV.urn, bytes_coder_id, iter_window_coder_id)
  output_coder_id = make_coder(
      common_urns.coders.KV.urn,
      bytes_coder_id,
      make_coder(
          common_urns.coders.KV.urn,
          iter_window_coder_id,
          make_coder(
              common_urns.coders.ITERABLE.urn,
              make_coder(
                  common_urns.coders.KV.urn,
                  window_coder_id,
                  iter_window_coder_id))))
  windowed_input_coder_id = make_coder(
      common_urns.coders.WINDOWED_VALUE.urn,
      input_coder_id,
      global_window_coder_id)
  windowed_output_coder_id = make_coder(
      common_urns.coders.WINDOWED_VALUE.urn,
      output_coder_id,
      global_window_coder_id)

  self.windowed_input_coder_impl = pipeline_context.coders[
      windowed_input_coder_id].get_impl()
  self.windowed_output_coder_impl = pipeline_context.coders[
      windowed_output_coder_id].get_impl()

  self._bundle_processor_id = self.uid('merge_windows')
  return beam_fn_api_pb2.ProcessBundleDescriptor(
      id=self._bundle_processor_id,
      transforms={
          self.TO_SDK_TRANSFORM: beam_runner_api_pb2.PTransform(
              unique_name='MergeWindows/Read',
              spec=beam_runner_api_pb2.FunctionSpec(
                  urn=bundle_processor.DATA_INPUT_URN,
                  payload=make_channel_payload(windowed_input_coder_id)),
              outputs={'input': 'input'}),
          'Merge': beam_runner_api_pb2.PTransform(
              unique_name='MergeWindows/Merge',
              spec=beam_runner_api_pb2.FunctionSpec(
                  urn=common_urns.primitives.MERGE_WINDOWS.urn,
                  payload=self._windowing_strategy_proto.window_fn.
                  SerializeToString()),
              inputs={'input': 'input'},
              outputs={'output': 'output'}),
          self.FROM_SDK_TRANSFORM: beam_runner_api_pb2.PTransform(
              unique_name='MergeWindows/Write',
              spec=beam_runner_api_pb2.FunctionSpec(
                  urn=bundle_processor.DATA_OUTPUT_URN,
                  payload=make_channel_payload(windowed_output_coder_id)),
              inputs={'output': 'output'}),
      },
      pcollections={
          'input': beam_runner_api_pb2.PCollection(
              unique_name='input',
              windowing_strategy_id=global_windowing_strategy_id,
              coder_id=input_coder_id),
          'output': beam_runner_api_pb2.PCollection(
              unique_name='output',
              windowing_strategy_id=global_windowing_strategy_id,
              coder_id=output_coder_id),
      },
      coders=coders,
      windowing_strategies={
          global_windowing_strategy_id: global_windowing_strategy_proto,
      },
      environments=dict(
          self._execution_context_ref().pipeline_components.environments.
          items()),
      state_api_service_descriptor=state_api_service_descriptor,
      timer_api_service_descriptor=data_api_service_descriptor)