Example #1
 def expand(self, pcoll):
     return (pcoll
             | WindowInto(window.GlobalWindows())
             | "ToVoidKey" >> Map(lambda v: (None, v))
             | "Group" >> GroupByKey()
             | "UnKey" >> Map(lambda (k, v): v)
             | "Match" >> Map(matcher))
Example #2
def run_combine(pipeline, input_elements=5, lift_combiners=True):
    # Calculate the expected result, which is the sum of an arithmetic sequence.
    # By default, this is equal to: 0 + 1 + 2 + 3 + 4 = 10
    expected_result = input_elements * (input_elements - 1) / 2

    # Enable runtime type checking in order to cover TypeCheckCombineFn by
    # the test.
    pipeline.get_pipeline_options().view_as(
        TypeOptions).runtime_type_check = True
    pipeline.get_pipeline_options().view_as(
        TypeOptions).allow_unsafe_triggers = True

    with pipeline as p:
        pcoll = p | 'Start' >> beam.Create(range(input_elements))

        # Certain triggers, such as AfterCount, are incompatible with combiner
        # lifting. We can use that fact to prevent combiners from being lifted.
        if not lift_combiners:
            pcoll |= beam.WindowInto(
                window.GlobalWindows(),
                trigger=trigger.AfterCount(input_elements),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)

        # Pass an additional 'None' in order to cover _CurriedFn by the test.
        pcoll |= 'Do' >> beam.CombineGlobally(
            combiners.SingleInputTupleCombineFn(
                CallSequenceEnforcingCombineFn(),
                CallSequenceEnforcingCombineFn()), None).with_fanout(fanout=1)
        assert_that(pcoll, equal_to([(expected_result, expected_result)]))
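For context, a hedged sketch of a minimal CombineFn used with CombineGlobally (SumFn is my own stand-in, not the CallSequenceEnforcingCombineFn from the test); it follows the same create_accumulator / add_input / merge_accumulators / extract_output sequence the test enforces:

import apache_beam as beam

class SumFn(beam.CombineFn):
    def create_accumulator(self):
        return 0

    def add_input(self, accumulator, value):
        return accumulator + value

    def merge_accumulators(self, accumulators):
        return sum(accumulators)

    def extract_output(self, accumulator):
        return accumulator

with beam.Pipeline() as p:
    _ = (p
         | beam.Create(range(5))
         | beam.CombineGlobally(SumFn()).with_fanout(2)  # hot-key fanout, as in the test
         | beam.Map(print))                              # prints 10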
Example #3
def load(events, metadata=None, pipeline_options=None):
  num_events_in_pane = 30
  windowed_events = (
      events
      | beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.Repeatedly(trigger.AfterCount(num_events_in_pane)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING))
  auction_by_seller_id = (
      windowed_events
      | nexmark_query_util.JustAuctions()
      | 'query3_filter_category' >> beam.Filter(lambda auc: auc.category == 10)
      | 'query3_key_by_seller' >> beam.ParDo(
          nexmark_query_util.AuctionBySellerFn()))
  person_by_id = (
      windowed_events
      | nexmark_query_util.JustPerson()
      | 'query3_filter_region' >>
      beam.Filter(lambda person: person.state in ['OR', 'ID', 'CA'])
      | 'query3_key_by_person_id' >> beam.ParDo(
          nexmark_query_util.PersonByIdFn()))
  return ({
      nexmark_query_util.AUCTION_TAG: auction_by_seller_id,
      nexmark_query_util.PERSON_TAG: person_by_id,
  }
          | beam.CoGroupByKey()
          | 'query3_join' >> beam.ParDo(
              JoinFn(metadata.get('max_auction_waiting_time')))
          | 'query3_output' >> beam.Map(
              lambda t: {
                  ResultNames.NAME: t[1].name,
                  ResultNames.CITY: t[1].city,
                  ResultNames.STATE: t[1].state,
                  ResultNames.AUCTION_ID: t[0].id
              }))
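The join above hinges on CoGroupByKey over a dict of tagged PCollections. A small hedged sketch with toy data in place of the NEXmark auction/person streams:

import apache_beam as beam

with beam.Pipeline() as p:
    auctions = p | 'auctions' >> beam.Create([('seller1', 'auction-a'),
                                              ('seller2', 'auction-b')])
    persons = p | 'persons' >> beam.Create([('seller1', 'Alice')])
    _ = ({'auctions': auctions, 'persons': persons}
         | beam.CoGroupByKey()
         # e.g. ('seller1', {'auctions': ['auction-a'], 'persons': ['Alice']})
         | beam.Map(print))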
Example #4
 def test_fixed_global_window(self):
     self.run_windowed_side_inputs([1, 2, 11],
                                   window.FixedWindows(10),
                                   window.GlobalWindows(),
                                   expected=[(1, [1, 2, 11]),
                                             (2, [1, 2, 11]),
                                             (11, [1, 2, 11])])
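This test exercises a fixed-windowed main input reading a globally windowed side input, where every fixed window sees the full side-input list. A hedged standalone sketch of the same setup:

import apache_beam as beam
from apache_beam.transforms import window

with beam.Pipeline() as p:
    side = (p | 'side' >> beam.Create([1, 2, 11])
              | 'side_global' >> beam.WindowInto(window.GlobalWindows()))
    main = (p | 'main' >> beam.Create([1, 2, 11])
              | 'stamp' >> beam.Map(lambda x: window.TimestampedValue(x, x))
              | 'fixed' >> beam.WindowInto(window.FixedWindows(10)))
    # Each element, regardless of its fixed window, sees the whole side input.
    _ = (main
         | beam.Map(lambda x, s: (x, sorted(s)), beam.pvalue.AsList(side))
         | beam.Map(print))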
Example #5
def default_window_mapping_fn(target_window_fn):
  if target_window_fn == window.GlobalWindows():
    return _global_window_mapping_fn

  def map_via_end(source_window):
    return list(target_window_fn.assign(
        window.WindowFn.AssignContext(source_window.max_timestamp())))[-1]

  return map_via_end
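The mapping logic can be exercised directly, outside a pipeline. A hedged sketch of what map_via_end computes: take the end of the source window and ask the target WindowFn which of its windows that timestamp falls into.

from apache_beam.transforms import window

source = window.IntervalWindow(0, 10)        # a source window [0, 10)
target_fn = window.FixedWindows(60)          # the side input's windowing
ctx = window.WindowFn.AssignContext(source.max_timestamp())
print(list(target_fn.assign(ctx))[-1])       # -> the [0, 60) fixed window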
Example #6
  def _run_pardo_state_timers(self, windowed):
    state_spec = userstate.BagStateSpec('state', beam.coders.StrUtf8Coder())
    timer_spec = userstate.TimerSpec('timer', userstate.TimeDomain.WATERMARK)
    elements = list('abcdefgh')
    buffer_size = 3

    class BufferDoFn(beam.DoFn):
      def process(self,
                  kv,
                  ts=beam.DoFn.TimestampParam,
                  timer=beam.DoFn.TimerParam(timer_spec),
                  state=beam.DoFn.StateParam(state_spec)):
        _, element = kv
        state.add(element)
        buffer = state.read()
        # For real use, we'd keep track of this size separately.
        if len(list(buffer)) >= 3:
          state.clear()
          yield buffer
        else:
          timer.set(ts + 1)

      @userstate.on_timer(timer_spec)
      def process_timer(self, state=beam.DoFn.StateParam(state_spec)):
        buffer = state.read()
        state.clear()
        yield buffer

    def is_buffered_correctly(actual):
      # Pickling self in the closure for asserts gives errors (only on jenkins).
      self = FnApiRunnerTest('__init__')
      # Actual should be a grouping of the inputs into batches of size
      # at most buffer_size, but the actual batching is nondeterministic
      # based on ordering and trigger firing timing.
      self.assertEqual(sorted(sum((list(b) for b in actual), [])), elements)
      self.assertEqual(max(len(list(buffer)) for buffer in actual), buffer_size)
      if windowed:
        # Elements were assigned to windows based on their parity.
        # Assert that each grouping consists of elements belonging to the
        # same window to ensure states and timers were properly partitioned.
        for b in actual:
          parity = set(ord(e) % 2 for e in b)
          self.assertEqual(1, len(parity), b)

    with self.create_pipeline() as p:
      actual = (
          p
          | beam.Create(elements)
          # Send even and odd elements to different windows.
          | beam.Map(lambda e: window.TimestampedValue(e, ord(e) % 2))
          | beam.WindowInto(window.FixedWindows(1) if windowed
                            else window.GlobalWindows())
          | beam.Map(lambda x: ('key', x))
          | beam.ParDo(BufferDoFn()))

      assert_that(actual, is_buffered_correctly)
Example #7
    def expand(self, pcoll):
        windowing_saved = pcoll.windowing
        if windowing_saved.is_default():
            # In this (common) case we can use a trivial trigger driver
            # and avoid the (expensive) window param.
            globally_windowed = window.GlobalWindows.windowed_value(None)
            MIN_TIMESTAMP = window.MIN_TIMESTAMP

            def reify_timestamps(element, timestamp=DoFn.TimestampParam):
                key, value = element
                if timestamp == MIN_TIMESTAMP:
                    timestamp = None
                return key, (value, timestamp)

            def restore_timestamps(element):
                key, values = element
                return [
                    globally_windowed.with_value((key, value)) if
                    timestamp is None else window.GlobalWindows.windowed_value(
                        (key, value), timestamp)
                    for (value, timestamp) in values
                ]
        else:

            # typing: All conditional function variants must have identical signatures
            def reify_timestamps(  # type: ignore[misc]
                    element,
                    timestamp=DoFn.TimestampParam,
                    window=DoFn.WindowParam):
                key, value = element
                # Transport the window as part of the value and restore it later.
                return key, windowed_value.WindowedValue(
                    value, timestamp, [window])

            def restore_timestamps(element):
                key, windowed_values = element
                return [
                    wv.with_value((key, wv.value)) for wv in windowed_values
                ]

        ungrouped = pcoll | Map(reify_timestamps).with_output_types(Any)

        # TODO(BEAM-8104): Use the global window, one of the standard windows,
        # to work around the Dataflow Java Runner Harness limitation of
        # accepting only standard coders.
        ungrouped._windowing = Windowing(
            window.GlobalWindows(),
            triggerfn=Always(),
            accumulation_mode=AccumulationMode.DISCARDING,
            timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
        result = (ungrouped
                  | GroupByKey()
                  | FlatMap(restore_timestamps).with_output_types(Any))
        result._windowing = windowing_saved
        return result
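This expand() appears to be the internals of Beam's reshuffle transform (reify timestamps, force a GroupByKey under a global windowing with an always-firing trigger, then restore the original windowing). From user code, the usual entry point is simply beam.Reshuffle:

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (p
         | beam.Create(range(10))
         | beam.Reshuffle()   # redistribute elements; windows and timestamps are preserved
         | beam.Map(print))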
Example #8
  def expand(self, pcoll):
    windowing_saved = pcoll.windowing
    if windowing_saved.is_default():
      # In this (common) case we can use a trivial trigger driver
      # and avoid the (expensive) window param.
      globally_windowed = window.GlobalWindows.windowed_value(None)
      window_fn = window.GlobalWindows()
      MIN_TIMESTAMP = window.MIN_TIMESTAMP

      def reify_timestamps(element, timestamp=DoFn.TimestampParam):
        key, value = element
        if timestamp == MIN_TIMESTAMP:
          timestamp = None
        return key, (value, timestamp)

      def restore_timestamps(element):
        key, values = element
        return [
            globally_windowed.with_value((key, value))
            if timestamp is None
            else window.GlobalWindows.windowed_value((key, value), timestamp)
            for (value, timestamp) in values]

    else:
      # The linter is confused.
      # hash(1) is used to force "runtime" selection of _IdentityWindowFn
      # pylint: disable=abstract-class-instantiated
      cls = hash(1) and _IdentityWindowFn
      window_fn = cls(
          windowing_saved.windowfn.get_window_coder())

      def reify_timestamps(element, timestamp=DoFn.TimestampParam):
        key, value = element
        return key, TimestampedValue(value, timestamp)

      def restore_timestamps(element, window=DoFn.WindowParam):
        # Pass the current window since _IdentityWindowFn wouldn't know how
        # to generate it.
        key, values = element
        return [
            windowed_value.WindowedValue(
                (key, value.value), value.timestamp, [window])
            for value in values]

    ungrouped = pcoll | Map(reify_timestamps)
    ungrouped._windowing = Windowing(
        window_fn,
        triggerfn=AfterCount(1),
        accumulation_mode=AccumulationMode.DISCARDING,
        timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
    result = (ungrouped
              | GroupByKey()
              | FlatMap(restore_timestamps))
    result._windowing = windowing_saved
    return result
Example #9
 def expand(self, pcoll):
     # We must have at least a single element to ensure the matcher
     # code gets run even if the input pcollection is empty.
     keyed_singleton = pcoll.pipeline | Create([(None, None)])
     keyed_actual = (pcoll
                     | WindowInto(window.GlobalWindows())
                     | "ToVoidKey" >> Map(lambda v: (None, v)))
     _ = ((keyed_singleton, keyed_actual)
          | "Group" >> CoGroupByKey()
           | "Unkey" >> Map(lambda k_values: k_values[1][1])
          | "Match" >> Map(matcher))
Example #10
 def expand(self, pcoll):
   do_once = pcoll.pipeline | 'DoOnce' >> core.Create([None])
   init_result_coll = do_once | 'InitializeWrite' >> core.Map(
       lambda _, sink: sink.initialize_write(), self.sink)
   if getattr(self.sink, 'num_shards', 0):
     min_shards = self.sink.num_shards
     if min_shards == 1:
       keyed_pcoll = pcoll | core.Map(lambda x: (None, x))
     else:
       keyed_pcoll = pcoll | core.ParDo(_RoundRobinKeyFn(min_shards))
     write_result_coll = (keyed_pcoll
                          | core.WindowInto(window.GlobalWindows())
                          | core.GroupByKey()
                          | 'WriteBundles' >> core.ParDo(
                              _WriteKeyedBundleDoFn(self.sink),
                              AsSingleton(init_result_coll)))
   else:
     min_shards = 1
     write_result_coll = (pcoll
                          | 'WriteBundles' >>
                          core.ParDo(_WriteBundleDoFn(self.sink),
                                     AsSingleton(init_result_coll))
                          | 'Pair' >> core.Map(lambda x: (None, x))
                          | core.WindowInto(window.GlobalWindows())
                          | core.GroupByKey()
                          | 'Extract' >> core.FlatMap(lambda x: x[1]))
   # PreFinalize should run before FinalizeWrite, and the two should not be
   # fused.
   pre_finalize_coll = do_once | 'PreFinalize' >> core.FlatMap(
       _pre_finalize,
       self.sink,
       AsSingleton(init_result_coll),
       AsIter(write_result_coll))
   return do_once | 'FinalizeWrite' >> core.FlatMap(
       _finalize_write,
       self.sink,
       AsSingleton(init_result_coll),
       AsIter(write_result_coll),
       min_shards,
       AsSingleton(pre_finalize_coll))
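_RoundRobinKeyFn is not shown above; a hedged stand-in (my own, not Beam's implementation) that assigns keys 0..n-1 in round-robin fashion so the following GroupByKey produces roughly n shards could look like this:

import itertools
import apache_beam as beam

class RoundRobinKeyFn(beam.DoFn):
    def __init__(self, count):
        self._count = count

    def start_bundle(self):
        self._keys = itertools.cycle(range(self._count))

    def process(self, element):
        yield next(self._keys), element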
Example #11
def run(argv=None):
    # Use Python argparse module to parse custom arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--network')
    parser.add_argument('--input', dest='input', help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        help='Output file to write results to.')
    parser.add_argument('--output_topic',
                        dest='out_topic',
                        help=('Output PubSub topic of the form '
                              '"projects/<PROJECT>/topic/<TOPIC>".'))
    parser.add_argument('--input_topic',
                        dest='in_topic',
                        help=('Input PubSub topic of the form '
                              '"projects/<PROJECT>/topic/<TOPIC>".'))
    known_args, pipeline_args = parser.parse_known_args(argv)
    p_options = PipelineOptions(pipeline_args)
    google_cloud_options = p_options.view_as(GoogleCloudOptions)
    google_cloud_options.region = 'europe-west1'
    google_cloud_options.project = 'smartlive'
    # google_cloud_options.job_name = 'dataflow-job-{}'.format(
    #     datetime.datetime.now().strftime("%Y-%m-%d%H%M%S"))
    google_cloud_options.staging_location = 'gs://rim-bucket/binaries'
    google_cloud_options.temp_location = 'gs://rim-bucket/temp'
    p_options.view_as(StandardOptions).runner = 'DirectRunner'
    p_options.view_as(SetupOptions).save_main_session = True
    p_options.view_as(StandardOptions).streaming = True
    p_options.view_as(WorkerOptions).subnetwork = (
        'regions/europe-west1/subnetworks/test')
    p = beam.Pipeline(options=p_options)

    lines = p | 'receive_data' >> beam.io.ReadFromPubSub(
        subscription=known_args.in_topic).with_input_types(str) \
        | 'decode' >> beam.Map(lambda x: x.decode('utf-8')) \
        | 'jsonload' >> beam.Map(lambda x: json.loads(x))

    # ------------------------------ global window  ----------------------------- #

    lines | 'window' >> beam.WindowInto(window.GlobalWindows(),
            trigger=trigger.AfterProcessingTime(10),
            accumulation_mode=trigger.AccumulationMode.DISCARDING) \
        | 'CountGlobally' >> beam.CombineGlobally(beam.combiners.CountCombineFn()).without_defaults() \
        | 'print' >> beam.ParDo(PrintFn())

    lines | 'jsondumps' >> beam.Map(lambda x: json.dumps(x)) \
        | 'encode' >> beam.Map(lambda x: x.encode('utf-8')) \
        | 'send_to_Pub/Sub' >> beam.io.WriteToPubSub(known_args.out_topic)

    p.run().wait_until_finish()
Example #12
    def expand(self, pcoll):
        pcoll = (
            pcoll
            | core.WindowInto(window.GlobalWindows())
            | beam.ParDo(self._sharder)
            | beam.GroupByKey()  # group by id and shard
        )
        with warnings.catch_warnings():
            # suppress a spurious warning generated within beam.io.Write.  This warning is annoying but harmless
            warnings.filterwarnings(
                action="ignore",
                message="Using fallback coder for typehint: <type 'NoneType'>")

            return pcoll | beam.io.Write(self._sink).with_output_types(str)
Example #13
    def expand(self, pcoll):
      if reify_windows:
        pcoll = pcoll | ParDo(ReifyTimestampWindow())

      # We must have at least a single element to ensure the matcher
      # code gets run even if the input pcollection is empty.
      keyed_singleton = pcoll.pipeline | Create([(None, None)])
      keyed_actual = (
          pcoll
          | WindowInto(window.GlobalWindows())
          | "ToVoidKey" >> Map(lambda v: (None, v)))
      _ = ((keyed_singleton, keyed_actual)
           | "Group" >> CoGroupByKey()
           | "Unkey" >> Map(lambda k___actual_values: k___actual_values[1][1])
           | "Match" >> Map(matcher))
Example #14
def default_window_mapping_fn(target_window_fn):
    # type: (window.WindowFn) -> WindowMappingFn
    if target_window_fn == window.GlobalWindows():
        return _global_window_mapping_fn

    if isinstance(target_window_fn, window.Sessions):
        raise RuntimeError("Sessions is not allowed in side inputs")

    def map_via_end(source_window):
        # type: (window.BoundedWindow) -> window.BoundedWindow
        return list(
            target_window_fn.assign(
                window.WindowFn.AssignContext(
                    source_window.max_timestamp())))[-1]

    return map_via_end
Example #15
 def test_setting_global_window(self):
   with TestPipeline() as p:
     unkeyed_items = p | beam.Create([2, 11, 16, 27])
     items = (unkeyed_items
              | 'key' >> beam.Map(
                  lambda x: beam.window.TimestampedValue(('k', x), x)))
     # [START setting_global_window]
     from apache_beam import window
     session_windowed_items = (
         items | 'window' >> beam.WindowInto(window.GlobalWindows()))
     # [END setting_global_window]
     summed = (session_windowed_items
               | 'group' >> beam.GroupByKey()
               | 'combine' >> beam.CombineValues(sum))
     unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
     assert_that(unkeyed, equal_to([56]))
Example #16
def load(events, metadata=None):
  return (
      events
      | nexmark_query_util.JustBids()
      | 'query12_extract_bidder' >> beam.Map(lambda bid: bid.bidder)
      # windowing with processing time trigger, currently not supported in batch
      | beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.Repeatedly(
              trigger.AfterProcessingTime(metadata.get('window_size_sec'))),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          allowed_lateness=0)
      | 'query12_bid_count' >> beam.combiners.Count.PerElement()
      | 'query12_output' >> beam.Map(
          lambda t: {
              ResultNames.BIDDER_ID: t[0], ResultNames.BID_COUNT: t[1]
          }))
Example #17
        def expand(self, pcoll):
            if reify_windows:
                pcoll = pcoll | ParDo(ReifyTimestampWindow())

            keyed_singleton = pcoll.pipeline | Create([(None, None)])
            keyed_actual = (pcoll
                            | WindowInto(custom_windowing
                                         or window.GlobalWindows())
                            | "ToVoidKey" >> Map(lambda v: (None, v)))
            plain_actual = ((keyed_singleton, keyed_actual)
                            | "Group" >> CoGroupByKey()
                            | "Unkey" >> Map(lambda k_values: k_values[1][1]))

            if custom_windowing:
                plain_actual = plain_actual | "AddWindow" >> ParDo(AddWindow())

            plain_actual = plain_actual | "Match" >> Map(matcher)
Example #18
def run(argv=None):
    # Use Python argparse module to parse custom arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        default='gs://rim-bucket/market.txt',
                        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        # CHANGE 1/5: The Google Cloud Storage path is required
        # for outputting the results.
        default='gs://rim-bucket/output/',
        help='Output file to write results to.')

    known_args, pipeline_args = parser.parse_known_args(argv)
    p_options = PipelineOptions(pipeline_args)
    google_cloud_options = p_options.view_as(GoogleCloudOptions)
    google_cloud_options.region = 'europe-west1'
    google_cloud_options.project = 'smartlive'
    # google_cloud_options.job_name = 'dataflow-job-{}'.format(
    #     datetime.datetime.now().strftime("%Y-%m-%d%H%M%S"))
    google_cloud_options.staging_location = 'gs://rim-bucket/binaries'
    google_cloud_options.temp_location = 'gs://rim-bucket/temp'

    p_options.view_as(StandardOptions).runner = 'DirectRunner'
    p_options.view_as(SetupOptions).save_main_session = True
    p_options.view_as(StandardOptions).streaming = True
    p_options.view_as(WorkerOptions).subnetwork = (
        'regions/europe-west1/subnetworks/test')
    p = beam.Pipeline(options=p_options)

    lines = p | 'receive_data' >> beam.io.ReadFromText(
        known_args.input)\
        | 'window' >> beam.WindowInto(window.GlobalWindows()) \
        | 'jsonload' >> beam.Map(lambda x: json.loads(x))\
        | 'count' >> beam.Map(lambda x: len(x))\
        | 'printnbrarticles' >> beam.ParDo(PrintFn())

    # ----- window fixe + Trigger AfterWatermark + Accumulating mode  ------ #
    (lines | 'CountGlobally' >> beam.CombineGlobally(
        beam.combiners.CountCombineFn()).without_defaults())

    p.run().wait_until_finish()
Example #19
def load(events, metadata=None):
    # find winning bids for each closed auction
    return (events
            # find winning bids
            | beam.Filter(nexmark_query_util.auction_or_bid)
            | winning_bids.WinningBids()
            # auction_bids -> (auction.seller, bid)
            | beam.Map(lambda auc_bid: (auc_bid.auction.seller, auc_bid.bid))
            # calculate and output mean as data arrives
            | beam.WindowInto(
                window.GlobalWindows(),
                trigger=trigger.Repeatedly(trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
                allowed_lateness=0)
            | beam.CombinePerKey(MovingMeanSellingPriceFn(10))
            | beam.Map(lambda t: {
                ResultNames.SELLER: t[0],
                ResultNames.PRICE: t[1]
            }))
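A hedged sketch of the per-key combine step in isolation, with the built-in mean combiner standing in for MovingMeanSellingPriceFn:

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (p
         | beam.Create([('seller1', 100), ('seller1', 200), ('seller2', 50)])
         | beam.CombinePerKey(beam.combiners.MeanCombineFn())
         | beam.Map(print))   # ('seller1', 150.0), ('seller2', 50.0)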
Example #20
def run(argv=None):
  """Build and run the pipeline."""
  args = ["--runner=PortableRunner",
          "--job_endpoint=localhost:8099",
          "--streaming"]
  if argv:
    args.extend(argv)

  parser = argparse.ArgumentParser()
  parser.add_argument('--count',
                      dest='count',
                      default=0,
                      help='Number of triggers to generate '
                           '(0 means emit forever).')
  parser.add_argument('--interval_ms',
                      dest='interval_ms',
                      default=500,
                      help='Interval between records per parallel '
                           'Flink subtask.')

  known_args, pipeline_args = parser.parse_known_args(args)

  pipeline_options = PipelineOptions(pipeline_args)

  p = beam.Pipeline(options=pipeline_options)

  messages = (p | FlinkStreamingImpulseSource()
              .set_message_count(known_args.count)
              .set_interval_ms(known_args.interval_ms))

  _ = (messages | 'decode' >> beam.Map(lambda x: ('', 1))
       | 'window' >> beam.WindowInto(window.GlobalWindows(),
                                     trigger=Repeatedly(
                                         AfterProcessingTime(5 * 1000)),
                                     accumulation_mode=
                                     AccumulationMode.DISCARDING)
       | 'group' >> beam.GroupByKey()
       | 'count' >> beam.Map(count)
       | 'log' >> beam.Map(lambda x: logging.info("%d" % x[1])))

  result = p.run()
  result.wait_until_finish()
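The count helper is not shown in this example. A minimal stand-in consistent with the logging step (which formats x[1] as an integer) is sketched below; the real helper may differ.

def count(kv):
    # kv is (key, iterable_of_ones) coming out of the GroupByKey above.
    key, ones = kv
    return key, len(list(ones))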
Example #21
def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    data = [{'message': 'Hi', 'timestamp': time.time()}]

    events = (p
      | 'Create Events' >> beam.Create(data) \
      | 'Add Timestamps' >> beam.Map(lambda x: beam.window.TimestampedValue(x, x['timestamp'])) \
      | 'Sliding Windows'   >> beam.WindowInto(beam.window.SlidingWindows(60, 60)) \
      | 'First window' >> beam.ParDo(DebugPrinterFn()) \
      | 'global Window'   >> beam.WindowInto(window.GlobalWindows()) \
      | 'Second window'   >> beam.ParDo(DebugPrinterFn()))

    result = p.run()
    result.wait_until_finish()
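DebugPrinterFn is not shown here; a hedged sketch of a DoFn that reports the window and timestamp each element carries (useful for seeing the effect of the two WindowInto steps) could be:

import apache_beam as beam

class WindowPrinterFn(beam.DoFn):
    def process(self, element,
                timestamp=beam.DoFn.TimestampParam,
                window=beam.DoFn.WindowParam):
        print('%r @ %s in window %s' % (element, timestamp, window))
        yield element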
Example #22
        def expand(self, pcoll):
            if reify_windows:
                pcoll = pcoll | ParDo(ReifyTimestampWindow())

            keyed_singleton = pcoll.pipeline | Create([(None, None)])

            if use_global_window:
                pcoll = pcoll | WindowInto(window.GlobalWindows())

            keyed_actual = pcoll | "ToVoidKey" >> Map(lambda v: (None, v))

            # This is a CoGroupByKey so that the matcher always runs, even if the
            # PCollection is empty.
            plain_actual = ((keyed_singleton, keyed_actual)
                            | "Group" >> CoGroupByKey()
                            | "Unkey" >> Map(lambda k_values: k_values[1][1]))

            if not use_global_window:
                plain_actual = plain_actual | "AddWindow" >> ParDo(AddWindow())

            plain_actual = plain_actual | "Match" >> Map(matcher)
Example #23
 def test(self):
   _ = (
       self.pipeline
       | 'Read from pubsub' >> ReadFromPubSub(
           subscription=self.read_sub_name,
           with_attributes=True,
           id_label='id',
       )
       | beam.Map(lambda x: bytes(1)).with_output_types(bytes)
       | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
       | 'Window' >> beam.WindowInto(
           window.GlobalWindows(),
           trigger=trigger.Repeatedly(
               trigger.AfterCount(self.num_of_messages)),
           accumulation_mode=trigger.AccumulationMode.DISCARDING)
       | 'Count messages' >> beam.CombineGlobally(
           beam.combiners.CountCombineFn()).without_defaults().
       with_output_types(int)
       | 'Convert to bytes' >>
       beam.Map(lambda count: str(count).encode('utf-8'))
       | 'Write to Pubsub' >> beam.io.WriteToPubSub(self.matcher_topic_name))
Example #24
def run():
    options = PipelineOptions([
        "--runner=PortableRunner", "--job_endpoint=localhost:8099",
        "--environment_type=LOOPBACK"
    ])
    # options = PipelineOptions([
    #     "--runner=FlinkRunner",
    #     "--flink_master=localhost:8081",
    # ])
    with beam.Pipeline(options=options) as p:
        (p | 'ReadFromKafka' >> ReadFromKafka(
            consumer_config={"bootstrap.servers": "localhost:9092"},
            topics=["beam-input"])
         | 'ExtractWords' >>
         beam.FlatMap(lambda kv: re.findall(r'[A-Za-z\']+', kv[1]))
         | 'Window' >> beam.WindowInto(
             window.GlobalWindows(),
             trigger=trigger.Repeatedly(trigger.AfterCount(1)),
             accumulation_mode=AccumulationMode.ACCUMULATING)
         | 'Count' >> beam.combiners.Count.PerElement()
         | 'Format' >> beam.Map(lambda word_count: '%s: %s' %
                                (word_count[0], word_count[1]))
         | 'Log' >> beam.ParDo(LoggingDoFn()))
Example #25
def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    (p | 'ReadFromKafka' >> ReadFromKafka(
        consumer_config={"bootstrap.servers": "localhost:9092"},
        topics=["beam-input"])
     | 'ExtractWords' >>
     beam.FlatMap(lambda kv: re.findall(r'[A-Za-z\']+', kv[1]))
     | 'Window' >> beam.WindowInto(
         window.GlobalWindows(),
         trigger=trigger.Repeatedly(trigger.AfterCount(1)),
         accumulation_mode=AccumulationMode.ACCUMULATING)
     | 'Count' >> beam.combiners.Count.PerElement()
     | 'Format' >> beam.Map(lambda word_count: '%s: %s' %
                            (word_count[0], word_count[1]))
     | 'Log' >> beam.ParDo(LoggingDoFn()))

    result = p.run()
    result.wait_until_finish()
Example #26
 def get_windowing(self, unused_inputs):
     return core.Windowing(window.GlobalWindows())
Example #27
 def test_global_global_windows(self):
     self.run_windowed_side_inputs([1, 2, 3],
                                   window.GlobalWindows(),
                                   expected=[(1, [1, 2, 3]), (2, [1, 2, 3]),
                                             (3, [1, 2, 3])])
Example #28
 def get_windowing(self, _):
   return core.Windowing(window.GlobalWindows())
Example #29
 def expand(self, pcoll):
     return (pcoll
             | 'window' >> beam.WindowInto(window.GlobalWindows())
             | "Count" >> beam.combiners.Count.Globally()
             | "Log" >> beam.Map(log_count_info))
Example #30
    def make_process_bundle_descriptor(self, data_api_service_descriptor,
                                       state_api_service_descriptor):
        # type: (Optional[endpoints_pb2.ApiServiceDescriptor], Optional[endpoints_pb2.ApiServiceDescriptor]) -> beam_fn_api_pb2.ProcessBundleDescriptor
        """Creates a ProcessBundleDescriptor for invoking the WindowFn's
    merge operation.
    """
        def make_channel_payload(coder_id):
            # type: (str) -> bytes
            data_spec = beam_fn_api_pb2.RemoteGrpcPort(coder_id=coder_id)
            if data_api_service_descriptor:
                data_spec.api_service_descriptor.url = (
                    data_api_service_descriptor.url)
            return data_spec.SerializeToString()

        pipeline_context = self._execution_context_ref().pipeline_context
        global_windowing_strategy_id = self.uid('global_windowing_strategy')
        global_windowing_strategy_proto = core.Windowing(
            window.GlobalWindows()).to_runner_api(pipeline_context)
        coders = dict(pipeline_context.coders.get_id_to_proto_map())

        def make_coder(urn, *components):
            # type: (str, str) -> str
            coder_proto = beam_runner_api_pb2.Coder(
                spec=beam_runner_api_pb2.FunctionSpec(urn=urn),
                component_coder_ids=components)
            coder_id = self.uid('coder')
            coders[coder_id] = coder_proto
            pipeline_context.coders.put_proto(coder_id, coder_proto)
            return coder_id

        bytes_coder_id = make_coder(common_urns.coders.BYTES.urn)
        window_coder_id = self._windowing_strategy_proto.window_coder_id
        global_window_coder_id = make_coder(
            common_urns.coders.GLOBAL_WINDOW.urn)
        iter_window_coder_id = make_coder(common_urns.coders.ITERABLE.urn,
                                          window_coder_id)
        input_coder_id = make_coder(common_urns.coders.KV.urn, bytes_coder_id,
                                    iter_window_coder_id)
        output_coder_id = make_coder(
            common_urns.coders.KV.urn, bytes_coder_id,
            make_coder(
                common_urns.coders.KV.urn, iter_window_coder_id,
                make_coder(
                    common_urns.coders.ITERABLE.urn,
                    make_coder(common_urns.coders.KV.urn, window_coder_id,
                               iter_window_coder_id))))
        windowed_input_coder_id = make_coder(
            common_urns.coders.WINDOWED_VALUE.urn, input_coder_id,
            global_window_coder_id)
        windowed_output_coder_id = make_coder(
            common_urns.coders.WINDOWED_VALUE.urn, output_coder_id,
            global_window_coder_id)

        self.windowed_input_coder_impl = pipeline_context.coders[
            windowed_input_coder_id].get_impl()
        self.windowed_output_coder_impl = pipeline_context.coders[
            windowed_output_coder_id].get_impl()

        self._bundle_processor_id = self.uid('merge_windows')
        return beam_fn_api_pb2.ProcessBundleDescriptor(
            id=self._bundle_processor_id,
            transforms={
                self.TO_SDK_TRANSFORM:
                beam_runner_api_pb2.PTransform(
                    unique_name='MergeWindows/Read',
                    spec=beam_runner_api_pb2.FunctionSpec(
                        urn=bundle_processor.DATA_INPUT_URN,
                        payload=make_channel_payload(windowed_input_coder_id)),
                    outputs={'input': 'input'}),
                'Merge':
                beam_runner_api_pb2.PTransform(
                    unique_name='MergeWindows/Merge',
                    spec=beam_runner_api_pb2.FunctionSpec(
                        urn=common_urns.primitives.MERGE_WINDOWS.urn,
                        payload=self._windowing_strategy_proto.window_fn.
                        SerializeToString()),
                    inputs={'input': 'input'},
                    outputs={'output': 'output'}),
                self.FROM_SDK_TRANSFORM:
                beam_runner_api_pb2.PTransform(
                    unique_name='MergeWindows/Write',
                    spec=beam_runner_api_pb2.FunctionSpec(
                        urn=bundle_processor.DATA_OUTPUT_URN,
                        payload=make_channel_payload(
                            windowed_output_coder_id)),
                    inputs={'output': 'output'}),
            },
            pcollections={
                'input':
                beam_runner_api_pb2.PCollection(
                    unique_name='input',
                    windowing_strategy_id=global_windowing_strategy_id,
                    coder_id=input_coder_id),
                'output':
                beam_runner_api_pb2.PCollection(
                    unique_name='output',
                    windowing_strategy_id=global_windowing_strategy_id,
                    coder_id=output_coder_id),
            },
            coders=coders,
            windowing_strategies={
                global_windowing_strategy_id: global_windowing_strategy_proto,
            },
            environments=dict(self._execution_context_ref().
                              pipeline_components.environments.items()),
            state_api_service_descriptor=state_api_service_descriptor,
            timer_api_service_descriptor=data_api_service_descriptor)