Example No. 1
 def test_window(self):
     l = [
         GlobalWindows.windowed_value('a', 100),
         GlobalWindows.windowed_value('b', 200),
         GlobalWindows.windowed_value('c', 300)
     ]
     expected = [
         TestWindowedValue(('a', 100, GlobalWindow()), 100,
                           [GlobalWindow()]),
         TestWindowedValue(('b', 200, GlobalWindow()), 200,
                           [GlobalWindow()]),
         TestWindowedValue(('c', 300, GlobalWindow()), 300,
                           [GlobalWindow()])
     ]
     with TestPipeline() as p:
         pc = p | beam.Create(l)
         # A Map(lambda x: x) PTransform is added after Create here because,
         # when a PCollection of WindowedValues is created with the Create
         # PTransform, the windows are not assigned to it. Adding a Map forces
         # the PCollection through a DoFn, so it consists of elements with
         # timestamps assigned to them rather than of
         # WindowedValue(element, timestamp, window) objects.
         pc = pc | beam.Map(lambda x: x)
         reified_pc = pc | util.Reify.Window()
         assert_that(reified_pc, equal_to(expected), reify_windows=True)
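For context on these tests: GlobalWindows.windowed_value(value, timestamp) wraps a plain element into a WindowedValue assigned to the single global window. A minimal sketch of what the helper returns, assuming a standard Apache Beam installation:
 from apache_beam.transforms.window import GlobalWindows

 wv = GlobalWindows.windowed_value('a', 100)
 wv.value      # 'a'
 wv.timestamp  # Timestamp(100)
 wv.windows    # (GlobalWindow(),)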
Example No. 2
 def test_window(self):
   l = [GlobalWindows.windowed_value('a', 100),
        GlobalWindows.windowed_value('b', 200),
        GlobalWindows.windowed_value('c', 300)]
   expected = [TestWindowedValue(('a', 100, GlobalWindow()), 100,
                                 [GlobalWindow()]),
               TestWindowedValue(('b', 200, GlobalWindow()), 200,
                                 [GlobalWindow()]),
               TestWindowedValue(('c', 300, GlobalWindow()), 300,
                                 [GlobalWindow()])]
   with TestPipeline() as p:
     pc = p | beam.Create(l)
     reified_pc = pc | util.Reify.Window()
     assert_that(reified_pc, equal_to(expected), reify_windows=True)
Example No. 3
 def __iter__(self):
   output_stream = create_OutputStream()
   for encoded_key, values in self._table.items():
     key = self._key_coder.decode(encoded_key)
     self._post_grouped_coder.get_impl().encode_to_stream(
         GlobalWindows.windowed_value((key, values)), output_stream, True)
   return iter([output_stream.get()])
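Example No. 3 serializes each regrouped (key, values) pair, wrapped in the global window, into an output byte stream. A hedged round-trip sketch of the same encode_to_stream mechanism, using explicit coders instead of the grouping buffer's internal ones:
 from apache_beam.coders import coders
 from apache_beam.coders.coder_impl import create_InputStream, create_OutputStream
 from apache_beam.transforms.window import GlobalWindows

 # An explicit window coder so GlobalWindow round-trips regardless of defaults.
 coder = coders.WindowedValueCoder(
     coders.StrUtf8Coder(), window_coder=coders.GlobalWindowCoder())
 impl = coder.get_impl()

 out = create_OutputStream()
 impl.encode_to_stream(GlobalWindows.windowed_value(u'hello'), out, True)
 decoded = impl.decode_from_stream(create_InputStream(out.get()), True)
 assert decoded.value == u'hello'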
Example No. 4
 def _read_values_to_bundles(reader):
   read_result = [GlobalWindows.windowed_value(e) for e in reader]
   return self._split_list_into_bundles(
       output_pcollection,
       read_result,
       _BoundedReadEvaluator.MAX_ELEMENT_PER_BUNDLE,
       lambda _: 1)
Example No. 5
 def test_update_multiple(self):
     coder = coders.PickleCoder()
     total_size = 0
     opcounts = OperationCounters(CounterFactory(), 'some-name', coder, 0)
     self.verify_counters(opcounts, 0, float('nan'))
     value = GlobalWindows.windowed_value('abcde')
     opcounts.update_from(value)
     total_size += coder.estimate_size(value)
     value = GlobalWindows.windowed_value('defghij')
     opcounts.update_from(value)
     total_size += coder.estimate_size(value)
     self.verify_counters(opcounts, 2, float(total_size) / 2)
     value = GlobalWindows.windowed_value('klmnop')
     opcounts.update_from(value)
     total_size += coder.estimate_size(value)
     self.verify_counters(opcounts, 3, float(total_size) / 3)
Example No. 6
    def finish_bundle(self):
        data = self._read_from_pubsub(self.source.timestamp_attribute)
        if data:
            output_pcollection = list(self._outputs)[0]
            bundle = self._evaluation_context.create_bundle(output_pcollection)
            # TODO(ccy): Respect the PubSub source's id_label field.
            for timestamp, message in data:
                if self.source.with_attributes:
                    element = message
                else:
                    element = message.payload
                bundle.output(
                    GlobalWindows.windowed_value(element, timestamp=timestamp))
            bundles = [bundle]
        else:
            bundles = []
        if self._applied_ptransform.inputs:
            input_pvalue = self._applied_ptransform.inputs[0]
        else:
            input_pvalue = pvalue.PBegin(
                self._applied_ptransform.transform.pipeline)
        unprocessed_bundle = self._evaluation_context.create_bundle(
            input_pvalue)

        # TODO(udim): Correct value for watermark hold.
        return TransformResult(self, bundles, [unprocessed_bundle], None,
                               {None: Timestamp.of(time.time())})
Example No. 7
 def test_update_int(self):
     opcounts = OperationCounters(CounterFactory(), 'some-name',
                                  coders.PickleCoder(), 0)
     self.verify_counters(opcounts, 0)
     opcounts.update_from(GlobalWindows.windowed_value(1))
     opcounts.update_collect()
     self.verify_counters(opcounts, 1)
Example No. 8
 def test_window_in_value(self):
   l = [GlobalWindows.windowed_value(('a', 1), 100),
        GlobalWindows.windowed_value(('b', 2), 200),
        GlobalWindows.windowed_value(('c', 3), 300)]
   expected = [TestWindowedValue(('a', (1, 100, GlobalWindow())), 100,
                                 [GlobalWindow()]),
               TestWindowedValue(('b', (2, 200, GlobalWindow())), 200,
                                 [GlobalWindow()]),
               TestWindowedValue(('c', (3, 300, GlobalWindow())), 300,
                                 [GlobalWindow()])]
   with TestPipeline() as p:
     # Map(lambda x: x) hack is used for the same reason here.
     # Also, this makes the typehint on Reify.WindowInValue work.
     pc = p | beam.Create(l) | beam.Map(lambda x: x)
     reified_pc = pc | util.Reify.WindowInValue()
     assert_that(reified_pc, equal_to(expected), reify_windows=True)
Example No. 9
  def finish_bundle(self):
    data = self._read_from_pubsub(self.source.timestamp_attribute)
    if data:
      output_pcollection = list(self._outputs)[0]
      bundle = self._evaluation_context.create_bundle(output_pcollection)
      # TODO(ccy): Respect the PubSub source's id_label field.
      for timestamp, message in data:
        if self.source.with_attributes:
          element = message
        else:
          element = message.data
        bundle.output(
            GlobalWindows.windowed_value(element, timestamp=timestamp))
      bundles = [bundle]
    else:
      bundles = []
    if self._applied_ptransform.inputs:
      input_pvalue = self._applied_ptransform.inputs[0]
    else:
      input_pvalue = pvalue.PBegin(self._applied_ptransform.transform.pipeline)
    unprocessed_bundle = self._evaluation_context.create_bundle(
        input_pvalue)

    # TODO(udim): Correct value for watermark hold.
    return TransformResult(self, bundles, [unprocessed_bundle], None,
                           {None: Timestamp.of(time.time())})
Example No. 10
 def test_update_int(self):
   opcounts = OperationCounters(CounterFactory(), 'some-name',
                                coders.PickleCoder(), 0)
   self.verify_counters(opcounts, 0)
   opcounts.update_from(GlobalWindows.windowed_value(1))
   opcounts.update_collect()
   self.verify_counters(opcounts, 1)
Example No. 11
  def process_element(self, element):
    # The index into the TestStream list of events.
    self.current_index = element.value

    # The watermark of the _TestStream transform itself.
    self.watermark = element.timestamp

    # We can either have the _TestStream or the _WatermarkController to emit
    # the elements. We chose to emit in the _WatermarkController so that the
    # element is emitted at the correct watermark value.
    for event in self.test_stream.events(self.current_index):
      if isinstance(event, (ElementEvent, WatermarkEvent)):
        # The WATERMARK_CONTROL_TAG is used to hold the _TestStream's
        # watermark to -inf, then +inf-1, then +inf. This watermark progression
        # is ultimately used to set up the proper holds to allow the
        # _WatermarkControllers to control their own output watermarks.
        if event.tag == _TestStream.WATERMARK_CONTROL_TAG:
          self.watermark = event.new_watermark
        else:
          main_output = list(self._outputs)[0]
          bundle = self._evaluation_context.create_bundle(main_output)
          bundle.output(GlobalWindows.windowed_value(event))
          self.bundles.append(bundle)
      elif isinstance(event, ProcessingTimeEvent):
        self._evaluation_context._watermark_manager._clock.advance_time(
            event.advance_by)
      else:
        raise ValueError('Invalid TestStream event: %s.' % event)
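The three event types dispatched above come from the TestStream module. A hedged, standalone sketch of the same isinstance dispatch (describe is an illustrative helper; the import path is assumed to be apache_beam.testing.test_stream):
 from apache_beam.testing.test_stream import (
     ElementEvent, ProcessingTimeEvent, WatermarkEvent)

 def describe(event):
   # Mirrors the dispatch in process_element above.
   if isinstance(event, ElementEvent):
     return 'emit %d element(s)' % len(event.timestamped_values)
   if isinstance(event, WatermarkEvent):
     return 'advance watermark to %s' % event.new_watermark
   if isinstance(event, ProcessingTimeEvent):
     return 'advance processing time by %s' % event.advance_by
   raise ValueError('Invalid TestStream event: %s.' % event)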
Example No. 12
 def test_update_multiple(self):
   coder = coders.PickleCoder()
   total_size = 0
   opcounts = OperationCounters(CounterFactory(), 'some-name',
                                coder, 0)
   self.verify_counters(opcounts, 0, float('nan'))
   value = GlobalWindows.windowed_value('abcde')
   opcounts.update_from(value)
   total_size += coder.estimate_size(value)
   value = GlobalWindows.windowed_value('defghij')
   opcounts.update_from(value)
   total_size += coder.estimate_size(value)
   self.verify_counters(opcounts, 2, (float(total_size) / 2))
   value = GlobalWindows.windowed_value('klmnop')
   opcounts.update_from(value)
   total_size += coder.estimate_size(value)
   self.verify_counters(opcounts, 3, (float(total_size) / 3))
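These counter tests feed windowed values to OperationCounters and assert the element count together with the mean of coder.estimate_size over the observed elements. A minimal sketch of just that arithmetic:
 from apache_beam.coders import coders
 from apache_beam.transforms.window import GlobalWindows

 coder = coders.PickleCoder()
 values = [GlobalWindows.windowed_value(s)
           for s in ('abcde', 'defghij', 'klmnop')]
 sizes = [coder.estimate_size(v) for v in values]
 mean = float(sum(sizes)) / len(sizes)  # the mean verify_counters checks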
Example No. 13
 def get_root_bundles(self):
   test_stream = self._applied_ptransform.transform
   bundle = self._evaluation_context.create_bundle(
       pvalue.PBegin(self._applied_ptransform.transform.pipeline))
   bundle.add(GlobalWindows.windowed_value(test_stream.begin(),
                                           timestamp=MIN_TIMESTAMP))
   bundle.commit(None)
   return [bundle]
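The root bundle pins its single element to MIN_TIMESTAMP so that downstream watermarks stay held until the element is processed. A small sketch of that tagging (MIN_TIMESTAMP lives in apache_beam.utils.timestamp):
 from apache_beam.transforms.window import GlobalWindows
 from apache_beam.utils.timestamp import MIN_TIMESTAMP

 wv = GlobalWindows.windowed_value('begin', timestamp=MIN_TIMESTAMP)
 assert wv.timestamp == MIN_TIMESTAMP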
Example No. 14
 def test_update_str(self):
     coder = coders.PickleCoder()
     opcounts = OperationCounters(CounterFactory(), 'some-name', coder, 0)
     self.verify_counters(opcounts, 0, float('nan'))
     value = GlobalWindows.windowed_value('abcde')
     opcounts.update_from(value)
     estimated_size = coder.estimate_size(value)
     self.verify_counters(opcounts, 1, estimated_size)
Example No. 15
 def test_update_str(self):
   coder = coders.PickleCoder()
   opcounts = OperationCounters(CounterFactory(), 'some-name',
                                coder, 0)
   self.verify_counters(opcounts, 0, float('nan'))
   value = GlobalWindows.windowed_value('abcde')
   opcounts.update_from(value)
   estimated_size = coder.estimate_size(value)
   self.verify_counters(opcounts, 1, estimated_size)
Example No. 16
 def test_update_old_object(self):
     coder = coders.PickleCoder()
     opcounts = OperationCounters(CounterFactory(), 'some-name', coder, 0)
     self.verify_counters(opcounts, 0, float('nan'))
     obj = OldClassThatDoesNotImplementLen()
     value = GlobalWindows.windowed_value(obj)
     opcounts.update_from(value)
     estimated_size = coder.estimate_size(value)
     self.verify_counters(opcounts, 1, estimated_size)
Example No. 17
 def test_update_old_object(self):
   coder = coders.PickleCoder()
   opcounts = OperationCounters(CounterFactory(), 'some-name',
                                coder, 0)
   self.verify_counters(opcounts, 0, float('nan'))
   obj = OldClassThatDoesNotImplementLen()
   value = GlobalWindows.windowed_value(obj)
   opcounts.update_from(value)
   estimated_size = coder.estimate_size(value)
   self.verify_counters(opcounts, 1, estimated_size)
Example No. 18
 def finish_bundle(self):
   for destination, file_path_writer in \
     iteritems(self._destination_to_file_writer):
     (file_path, writer) = file_path_writer
     file_size = writer.tell()
     writer.close()
     yield pvalue.TaggedOutput(WriteRecordsToFile.WRITTEN_FILE_TAG,
                               GlobalWindows.windowed_value(
                                   (destination, (file_path, file_size))))
   self._destination_to_file_writer = {}
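pvalue.TaggedOutput routes an element to a named output of a multi-output DoFn; above, each (destination, (file_path, file_size)) record is emitted under WRITTEN_FILE_TAG. A generic, hedged sketch of the pattern (EmitEvenOdd and the 'even' tag are illustrative, not from the source):
 import apache_beam as beam
 from apache_beam import pvalue

 class EmitEvenOdd(beam.DoFn):
   def process(self, element):
     if element % 2 == 0:
       yield pvalue.TaggedOutput('even', element)  # named output
     else:
       yield element  # main output

 # Downstream, the outputs are split with:
 #   results = pcoll | beam.ParDo(EmitEvenOdd()).with_outputs('even', main='odd')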
Example No. 19
  def finish_bundle(self):
    bundles = []
    bundle = None
    for encoded_k, vs in iteritems(self.gbk_items):
      if not bundle:
        bundle = self._evaluation_context.create_bundle(self.output_pcollection)
        bundles.append(bundle)
      kwi = KeyedWorkItem(encoded_k, elements=vs)
      bundle.add(GlobalWindows.windowed_value(kwi))

    return TransformResult(self, bundles, [], None, None)
Example No. 20
  def finish_bundle(self):
    bundles = []
    transform = self._applied_ptransform.transform

    assert transform.value is not None
    create_result = [GlobalWindows.windowed_value(v) for v in transform.value]
    for result in create_result:
      self.bundle.output(result)
    bundles.append(self.bundle)

    return TransformResult(
        self._applied_ptransform, bundles, None, None, None, None)
Example No. 21
 def finish_bundle(self):
   unprocessed_bundles = []
   hold = None
   if self.current_index < len(self.test_stream.events) - 1:
     unprocessed_bundle = self._evaluation_context.create_bundle(
         pvalue.PBegin(self._applied_ptransform.transform.pipeline))
     unprocessed_bundle.add(GlobalWindows.windowed_value(
         self.current_index + 1, timestamp=self.watermark))
     unprocessed_bundles.append(unprocessed_bundle)
     hold = self.watermark
   return TransformResult(
       self._applied_ptransform, self.bundles, unprocessed_bundles, None, hold)
Example No. 22
 def get_root_bundles(self):
   test_stream = self._applied_ptransform.transform
   bundles = []
   if len(test_stream.events) > 0:
     bundle = self._evaluation_context.create_bundle(
         pvalue.PBegin(self._applied_ptransform.transform.pipeline))
     # Explicitly set timestamp to MIN_TIMESTAMP to ensure that we hold the
     # watermark.
     bundle.add(GlobalWindows.windowed_value(0, timestamp=MIN_TIMESTAMP))
     bundle.commit(None)
     bundles.append(bundle)
   return bundles
Example No. 23
  def finish_bundle(self):
    bundles = []
    bundle = None
    for encoded_k, vs in iteritems(self.gbk_items):
      if not bundle:
        bundle = self._evaluation_context.create_bundle(
            self.output_pcollection)
        bundles.append(bundle)
      kwi = KeyedWorkItem(encoded_k, elements=vs)
      bundle.add(GlobalWindows.windowed_value(kwi))

    return TransformResult(self, bundles, [], None, None)
Example No. 24
  def finish_bundle(self):
    unprocessed_bundles = []
    next_index = self.test_stream.next(self.current_index)
    if not self.test_stream.end(next_index):
      unprocessed_bundle = self._evaluation_context.create_bundle(
          pvalue.PBegin(self._applied_ptransform.transform.pipeline))
      unprocessed_bundle.add(GlobalWindows.windowed_value(
          next_index, timestamp=self.watermark))
      unprocessed_bundles.append(unprocessed_bundle)

    # Returning the watermark in the dict here is used as a watermark hold.
    return TransformResult(
        self, self.bundles, unprocessed_bundles, None, {None: self.watermark})
Example No. 25
 def finish_bundle(self):
   unprocessed_bundles = []
   hold = None
   if self.current_index < len(self.test_stream.events) - 1:
     unprocessed_bundle = self._evaluation_context.create_bundle(
         pvalue.PBegin(self._applied_ptransform.transform.pipeline))
     unprocessed_bundle.add(GlobalWindows.windowed_value(
         self.current_index + 1, timestamp=self.watermark))
     unprocessed_bundles.append(unprocessed_bundle)
     hold = self.watermark
   return TransformResult(
       self._applied_ptransform, self.bundles, unprocessed_bundles, None,
       {None: hold})
Example No. 26
 def process_element(self, element):
   # In order to keep the order of the elements between the script and what
   # flows through the pipeline the same, emit the elements here.
   event = element.value
   if isinstance(event, WatermarkEvent):
     self._watermark = event.new_watermark
   elif isinstance(event, ElementEvent):
     main_output = list(self._outputs)[0]
     bundle = self._evaluation_context.create_bundle(main_output)
     for tv in event.timestamped_values:
       bundle.output(
           GlobalWindows.windowed_value(tv.value, timestamp=tv.timestamp))
     self.bundles.append(bundle)
Example No. 27
    def finish_bundle(self):
        bundles = []
        transform = self._applied_ptransform.transform

        assert transform.value is not None
        create_result = [
            GlobalWindows.windowed_value(v) for v in transform.value
        ]
        for result in create_result:
            self.bundle.output(result)
        bundles.append(self.bundle)

        return TransformResult(self._applied_ptransform, bundles, None, None,
                               None, None)
Example No. 28
 def __iter__(self):
   output_stream = create_OutputStream()
   if self._windowing.is_default():
     globally_window = GlobalWindows.windowed_value(None).with_value
     windowed_key_values = lambda key, values: [globally_window((key, values))]
   else:
     trigger_driver = trigger.create_trigger_driver(self._windowing, True)
     windowed_key_values = trigger_driver.process_entire_key
   coder_impl = self._post_grouped_coder.get_impl()
   key_coder_impl = self._key_coder.get_impl()
   for encoded_key, windowed_values in self._table.items():
     key = key_coder_impl.decode(encoded_key)
     for wkvs in windowed_key_values(key, windowed_values):
       coder_impl.encode_to_stream(wkvs, output_stream, True)
   return iter([output_stream.get()])
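The globally_window trick above builds one template WindowedValue and rebinds its payload per key via with_value, which preserves the template's timestamp, windows, and pane info. A brief sketch:
 from apache_beam.transforms.window import GlobalWindows

 template = GlobalWindows.windowed_value(None)
 wv = template.with_value(('key', [1, 2, 3]))
 assert wv.value == ('key', [1, 2, 3])
 assert wv.windows == template.windows  # same window and timestamp as template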
Example No. 30
    def process_element(self, element):
        # The watermark of the _TestStream transform itself.
        self.watermark = element.timestamp

        # Set up the correct watermark holds in the Watermark controllers and the
        # TestStream so that the watermarks will not automatically advance to +inf
        # when elements start streaming. This can happen multiple times in the first
        # bundle, but the operations are idempotent and adding state to keep track
        # of this would add unnecessary code complexity.
        events = []
        if self.watermark == MIN_TIMESTAMP:
            for event in self.test_stream._set_up(
                    self.test_stream.output_tags):
                events.append(event)

        # Retrieve the TestStream's event stream and read from it.
        try:
            events.append(next(self.event_stream))
        except StopIteration:
            # Advance the watermarks to +inf to cleanly stop the pipeline.
            self.is_done = True
            events += ([
                e for e in self.test_stream._tear_down(
                    self.test_stream.output_tags)
            ])

        for event in events:
            # We can either have the _TestStream or the _WatermarkController to emit
            # the elements. We chose to emit in the _WatermarkController so that the
            # element is emitted at the correct watermark value.
            if isinstance(event, (ElementEvent, WatermarkEvent)):
                # The WATERMARK_CONTROL_TAG is used to hold the _TestStream's
                # watermark to -inf, then +inf-1, then +inf. This watermark progression
                # is ultimately used to set up the proper holds to allow the
                # _WatermarkControllers to control their own output watermarks.
                if event.tag == _TestStream.WATERMARK_CONTROL_TAG:
                    self.watermark = event.new_watermark
                else:
                    main_output = list(self._outputs)[0]
                    bundle = self._evaluation_context.create_bundle(
                        main_output)
                    bundle.output(GlobalWindows.windowed_value(event))
                    self.bundles.append(bundle)
            elif isinstance(event, ProcessingTimeEvent):
                self._evaluation_context._watermark_manager._clock.advance_time(
                    event.advance_by)
            else:
                raise ValueError('Invalid TestStream event: %s.' % event)
Example No. 31
    def finish_bundle(self):
        unprocessed_bundles = []

        # Continue to send its own state to itself via an unprocessed bundle. This
        # acts as a heartbeat, where each element will read the next event from the
        # event stream.
        if not self.is_done:
            unprocessed_bundle = self._evaluation_context.create_bundle(
                pvalue.PBegin(self._applied_ptransform.transform.pipeline))
            unprocessed_bundle.add(
                GlobalWindows.windowed_value(b'', timestamp=self.watermark))
            unprocessed_bundles.append(unprocessed_bundle)

        # Returning the watermark in the dict here is used as a watermark hold.
        return TransformResult(self, self.bundles, unprocessed_bundles, None,
                               {None: self.watermark})
Example No. 32
    def _flush_batch(self, destination):

        # Flush the current batch of rows to BigQuery.
        rows = self._rows_buffer[destination]
        table_reference = bigquery_tools.parse_table_reference(destination)

        if table_reference.projectId is None:
            table_reference.projectId = vp.RuntimeValueProvider.get_value(
                'project', str, '')

        logging.debug('Flushing data to %s. Total %s rows.', destination,
                      len(rows))

        while True:
            # TODO: Figure out an insertId to make calls idempotent.
            passed, errors = self.bigquery_wrapper.insert_rows(
                project_id=table_reference.projectId,
                dataset_id=table_reference.datasetId,
                table_id=table_reference.tableId,
                rows=rows,
                skip_invalid_rows=True)

            logging.debug("Passed: %s. Errors are %s", passed, errors)
            failed_rows = [rows[entry.index] for entry in errors]
            should_retry = any(
                bigquery_tools.RetryStrategy.should_retry(
                    self._retry_strategy, entry.errors[0].reason)
                for entry in errors)
            rows = failed_rows

            if not should_retry:
                break
            else:
                retry_backoff = next(self._backoff_calculator)
                logging.info('Sleeping %s seconds before retrying insertion.',
                             retry_backoff)
                time.sleep(retry_backoff)

        self._total_buffered_rows -= len(self._rows_buffer[destination])
        del self._rows_buffer[destination]

        return [
            pvalue.TaggedOutput(
                BigQueryWriteFn.FAILED_ROWS,
                GlobalWindows.windowed_value((destination, row)))
            for row in failed_rows
        ]
Example No. 33
  def get_root_bundles(self):
    test_stream = self._applied_ptransform.transform

    # If there was an endpoint defined then get the events from the
    # TestStreamService.
    if test_stream.endpoint:
      _TestStreamEvaluator.event_stream = _TestStream.events_from_rpc(
          test_stream.endpoint, test_stream.output_tags, test_stream.coder)
    else:
      _TestStreamEvaluator.event_stream = (
          _TestStream.events_from_script(test_stream._events))

    bundle = self._evaluation_context.create_bundle(
        pvalue.PBegin(self._applied_ptransform.transform.pipeline))
    bundle.add(GlobalWindows.windowed_value(b'', timestamp=MIN_TIMESTAMP))
    bundle.commit(None)
    return [bundle]
Example No. 34
  def _flush_batch(self, destination):

    # Flush the current batch of rows to BigQuery.
    rows = self._rows_buffer[destination]
    table_reference = bigquery_tools.parse_table_reference(destination)

    if table_reference.projectId is None:
      table_reference.projectId = vp.RuntimeValueProvider.get_value(
          'project', str, '')

    logging.debug('Flushing data to %s. Total %s rows.',
                  destination, len(rows))

    while True:
      # TODO: Figure out an insertId to make calls idempotent.
      passed, errors = self.bigquery_wrapper.insert_rows(
          project_id=table_reference.projectId,
          dataset_id=table_reference.datasetId,
          table_id=table_reference.tableId,
          rows=rows,
          skip_invalid_rows=True)

      logging.debug("Passed: %s. Errors are %s", passed, errors)
      failed_rows = [rows[entry.index] for entry in errors]
      should_retry = any(
          bigquery_tools.RetryStrategy.should_retry(
              self._retry_strategy, entry.errors[0].reason)
          for entry in errors)
      rows = failed_rows

      if not should_retry:
        break
      else:
        retry_backoff = next(self._backoff_calculator)
        logging.info('Sleeping %s seconds before retrying insertion.',
                     retry_backoff)
        time.sleep(retry_backoff)

    self._total_buffered_rows -= len(self._rows_buffer[destination])
    del self._rows_buffer[destination]

    return [pvalue.TaggedOutput(BigQueryWriteFn.FAILED_ROWS,
                                GlobalWindows.windowed_value(
                                    (destination, row))) for row in failed_rows]
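Both _flush_batch variants loop until no failed row carries a retryable error reason, sleeping between attempts according to a backoff generator, and finally emit the permanently failed rows as FAILED_ROWS. A simplified, hedged sketch of that control flow (flush_with_retries, insert_rows, and should_retry_reason are illustrative stand-ins; each stand-in error flattens the nested entry.errors[0].reason lookup into .reason):
 import time

 def flush_with_retries(rows, insert_rows, should_retry_reason, backoffs):
   # insert_rows returns (passed, errors); each error has .index and .reason.
   while True:
     passed, errors = insert_rows(rows)
     rows = [rows[e.index] for e in errors]
     if not errors or not any(should_retry_reason(e.reason) for e in errors):
       return rows  # rows that failed permanently
     time.sleep(next(backoffs))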
Example No. 35
 def partition(self, n):
     # type: (int) -> List[List[bytes]]
     """Partitions the _GroupingBuffer into N parts.

     Once partitioned, it is not re-partitioned with a different N;
     re-partitioning is not supported.
     """
     if not self._grouped_output:
         if self._windowing.is_default():
             globally_window = GlobalWindows.windowed_value(
                 None,
                 timestamp=GlobalWindow().max_timestamp(),
                 pane_info=windowed_value.PaneInfo(
                     is_first=True,
                     is_last=True,
                     timing=windowed_value.PaneInfoTiming.ON_TIME,
                     index=0,
                     nonspeculative_index=0)).with_value
             windowed_key_values = lambda key, values: [
                 globally_window((key, values))
             ]
         else:
             # TODO(pabloem, BEAM-7514): Trigger driver needs access to the clock
             #   note that this only comes through if windowing is default - but what
             #   about having multiple firings on the global window.
             #   May need to revise.
             trigger_driver = trigger.create_trigger_driver(
                 self._windowing, True)
             windowed_key_values = trigger_driver.process_entire_key
         coder_impl = self._post_grouped_coder.get_impl()
         key_coder_impl = self._key_coder.get_impl()
         self._grouped_output = [[] for _ in range(n)]
         output_stream_list = [create_OutputStream() for _ in range(n)]
         for idx, (encoded_key,
                   windowed_values) in enumerate(self._table.items()):
             key = key_coder_impl.decode(encoded_key)
             for wkvs in windowed_key_values(key, windowed_values):
                 coder_impl.encode_to_stream(wkvs,
                                             output_stream_list[idx % n],
                                             True)
         for ix, output_stream in enumerate(output_stream_list):
             self._grouped_output[ix] = [output_stream.get()]
         self._table.clear()
     return self._grouped_output
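Example No. 35 stamps its grouped output with an explicit final, on-time pane at the global window's maximum timestamp. A hedged sketch of constructing that pane, assuming a Beam version where windowed_value accepts pane_info (as the excerpt itself does):
 from apache_beam.transforms.window import GlobalWindow, GlobalWindows
 from apache_beam.utils import windowed_value

 pane = windowed_value.PaneInfo(
     is_first=True, is_last=True,
     timing=windowed_value.PaneInfoTiming.ON_TIME,
     index=0, nonspeculative_index=0)
 wv = GlobalWindows.windowed_value(
     None, timestamp=GlobalWindow().max_timestamp(), pane_info=pane)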
Example No. 36
    def finish_bundle(self):
        if self._is_final_bundle():
            if self.global_state.get_state(
                    None, _GroupByKeyOnlyEvaluator.COMPLETION_TAG):
                # Ignore empty bundles after emitting output. (This may happen because
                # empty bundles do not affect input watermarks.)
                bundles = []
            else:
                gbk_result = []
                # TODO(ccy): perhaps we can clean this up to not use this
                # internal attribute of the DirectStepContext.
                for encoded_k in self.step_context.keyed_existing_state:
                    # Ignore global state.
                    if encoded_k is None:
                        continue
                    k = self.key_coder.decode(encoded_k)
                    state = self.step_context.get_keyed_state(encoded_k)
                    vs = state.get_state(None,
                                         _GroupByKeyOnlyEvaluator.ELEMENTS_TAG)
                    gbk_result.append(GlobalWindows.windowed_value((k, vs)))

                def len_element_fn(element):
                    _, v = element.value
                    return len(v)

                bundles = self._split_list_into_bundles(
                    self.output_pcollection, gbk_result,
                    _GroupByKeyOnlyEvaluator.MAX_ELEMENT_PER_BUNDLE,
                    len_element_fn)

            self.global_state.add_state(
                None, _GroupByKeyOnlyEvaluator.COMPLETION_TAG, True)
            hold = WatermarkManager.WATERMARK_POS_INF
        else:
            bundles = []
            hold = WatermarkManager.WATERMARK_NEG_INF
            self.global_state.set_timer(None, '', TimeDomain.WATERMARK,
                                        WatermarkManager.WATERMARK_POS_INF)

        return TransformResult(self._applied_ptransform, bundles, [], None,
                               {None: hold})
Example No. 37
  def finish_bundle(self):
    if self._is_final_bundle():
      if self.global_state.get_state(
          None, _GroupByKeyOnlyEvaluator.COMPLETION_TAG):
        # Ignore empty bundles after emitting output. (This may happen because
        # empty bundles do not affect input watermarks.)
        bundles = []
      else:
        gbk_result = []
        # TODO(ccy): perhaps we can clean this up to not use this
        # internal attribute of the DirectStepContext.
        for encoded_k in self.step_context.keyed_existing_state:
          # Ignore global state.
          if encoded_k is None:
            continue
          k = self.key_coder.decode(encoded_k)
          state = self.step_context.get_keyed_state(encoded_k)
          vs = state.get_state(None, _GroupByKeyOnlyEvaluator.ELEMENTS_TAG)
          gbk_result.append(GlobalWindows.windowed_value((k, vs)))

        def len_element_fn(element):
          _, v = element.value
          return len(v)

        bundles = self._split_list_into_bundles(
            self.output_pcollection, gbk_result,
            _GroupByKeyOnlyEvaluator.MAX_ELEMENT_PER_BUNDLE, len_element_fn)

      self.global_state.add_state(
          None, _GroupByKeyOnlyEvaluator.COMPLETION_TAG, True)
      hold = WatermarkManager.WATERMARK_POS_INF
    else:
      bundles = []
      hold = WatermarkManager.WATERMARK_NEG_INF
      self.global_state.set_timer(
          None, '', TimeDomain.WATERMARK, WatermarkManager.WATERMARK_POS_INF)

    return TransformResult(
        self._applied_ptransform, bundles, [], None, {None: hold})
Example No. 38
 def finish_bundle(self):
   data = self._read_from_pubsub()
   if data:
     output_pcollection = list(self._outputs)[0]
     bundle = self._evaluation_context.create_bundle(output_pcollection)
     # TODO(ccy): we currently do not use the PubSub message timestamp or
     # respect the PubSub source's id_label field.
     now = Timestamp.of(time.time())
     for message_data in data:
       bundle.output(GlobalWindows.windowed_value(message_data, timestamp=now))
     bundles = [bundle]
   else:
     bundles = []
   if self._applied_ptransform.inputs:
     input_pvalue = self._applied_ptransform.inputs[0]
   else:
     input_pvalue = pvalue.PBegin(self._applied_ptransform.transform.pipeline)
   unprocessed_bundle = self._evaluation_context.create_bundle(
       input_pvalue)
   return TransformResult(
       self._applied_ptransform, bundles,
       [unprocessed_bundle], None, {None: Timestamp.of(time.time())})
Example No. 39
 def process_element(self, element):
   index = element.value
   self.watermark = element.timestamp
   assert isinstance(index, int)
   assert 0 <= index <= len(self.test_stream.events)
   self.current_index = index
   event = self.test_stream.events[self.current_index]
   if isinstance(event, ElementEvent):
     assert len(self._outputs) == 1
     output_pcollection = list(self._outputs)[0]
     bundle = self._evaluation_context.create_bundle(output_pcollection)
     for tv in event.timestamped_values:
       bundle.output(
           GlobalWindows.windowed_value(tv.value, timestamp=tv.timestamp))
     self.bundles.append(bundle)
   elif isinstance(event, WatermarkEvent):
     assert event.new_watermark >= self.watermark
     self.watermark = event.new_watermark
   elif isinstance(event, ProcessingTimeEvent):
     # TODO(ccy): advance processing time in the context's mock clock.
     pass
   else:
     raise ValueError('Invalid TestStream event: %s.' % event)
Example No. 40
  def _map_task_registration(self, map_task, state_handler,
                             data_operation_spec):
    input_data = {}
    runner_sinks = {}
    transforms = []
    transform_index_to_id = {}

    # Maps coders to new coder objects and references.
    coders = {}

    def coder_id(coder):
      if coder not in coders:
        coders[coder] = beam_fn_api_pb2.Coder(
            function_spec=sdk_worker.pack_function_spec_data(
                json.dumps(coder.as_cloud_object()),
                sdk_worker.PYTHON_CODER_URN, id=self._next_uid()))

      return coders[coder].function_spec.id

    def output_tags(op):
      return getattr(op, 'output_tags', ['out'])

    def as_target(op_input):
      input_op_index, input_output_index = op_input
      input_op = map_task[input_op_index][1]
      return {
          'ignored_input_tag':
              beam_fn_api_pb2.Target.List(target=[
                  beam_fn_api_pb2.Target(
                      primitive_transform_reference=transform_index_to_id[
                          input_op_index],
                      name=output_tags(input_op)[input_output_index])
              ])
      }

    def outputs(op):
      return {
          tag: beam_fn_api_pb2.PCollection(coder_reference=coder_id(coder))
          for tag, coder in zip(output_tags(op), op.output_coders)
      }

    for op_ix, (stage_name, operation) in enumerate(map_task):
      transform_id = transform_index_to_id[op_ix] = self._next_uid()
      if isinstance(operation, operation_specs.WorkerInMemoryWrite):
        # Write this data back to the runner.
        fn = beam_fn_api_pb2.FunctionSpec(urn=sdk_worker.DATA_OUTPUT_URN,
                                          id=self._next_uid())
        if data_operation_spec:
          fn.data.Pack(data_operation_spec)
        inputs = as_target(operation.input)
        side_inputs = {}
        runner_sinks[(transform_id, 'out')] = operation

      elif isinstance(operation, operation_specs.WorkerRead):
        # A Read is either translated to a direct injection of windowed values
        # into the sdk worker, or an injection of the source object into the
        # sdk worker as data followed by an SDF that reads that source.
        if (isinstance(operation.source.source,
                       maptask_executor_runner.InMemorySource)
            and isinstance(operation.source.source.default_output_coder(),
                           WindowedValueCoder)):
          output_stream = create_OutputStream()
          element_coder = (
              operation.source.source.default_output_coder().get_impl())
          # Re-encode the elements in the nested context and
          # concatenate them together
          for element in operation.source.source.read(None):
            element_coder.encode_to_stream(element, output_stream, True)
          target_name = self._next_uid()
          input_data[(transform_id, target_name)] = output_stream.get()
          fn = beam_fn_api_pb2.FunctionSpec(urn=sdk_worker.DATA_INPUT_URN,
                                            id=self._next_uid())
          if data_operation_spec:
            fn.data.Pack(data_operation_spec)
          inputs = {target_name: beam_fn_api_pb2.Target.List()}
          side_inputs = {}
        else:
          # Read the source object from the runner.
          source_coder = beam.coders.DillCoder()
          input_transform_id = self._next_uid()
          output_stream = create_OutputStream()
          source_coder.get_impl().encode_to_stream(
              GlobalWindows.windowed_value(operation.source),
              output_stream,
              True)
          target_name = self._next_uid()
          input_data[(input_transform_id, target_name)] = output_stream.get()
          input_ptransform = beam_fn_api_pb2.PrimitiveTransform(
              id=input_transform_id,
              function_spec=beam_fn_api_pb2.FunctionSpec(
                  urn=sdk_worker.DATA_INPUT_URN,
                  id=self._next_uid()),
              # TODO(robertwb): Possible name collision.
              step_name=stage_name + '/inject_source',
              inputs={target_name: beam_fn_api_pb2.Target.List()},
              outputs={
                  'out':
                      beam_fn_api_pb2.PCollection(
                          coder_reference=coder_id(source_coder))
              })
          if data_operation_spec:
            input_ptransform.function_spec.data.Pack(data_operation_spec)
          transforms.append(input_ptransform)

          # Read the elements out of the source.
          fn = sdk_worker.pack_function_spec_data(
              OLDE_SOURCE_SPLITTABLE_DOFN_DATA,
              sdk_worker.PYTHON_DOFN_URN,
              id=self._next_uid())
          inputs = {
              'ignored_input_tag':
                  beam_fn_api_pb2.Target.List(target=[
                      beam_fn_api_pb2.Target(
                          primitive_transform_reference=input_transform_id,
                          name='out')
                  ])
          }
          side_inputs = {}

      elif isinstance(operation, operation_specs.WorkerDoFn):
        fn = sdk_worker.pack_function_spec_data(
            operation.serialized_fn,
            sdk_worker.PYTHON_DOFN_URN,
            id=self._next_uid())
        inputs = as_target(operation.input)
        # Store the contents of each side input for state access.
        for si in operation.side_inputs:
          assert isinstance(si.source, iobase.BoundedSource)
          element_coder = si.source.default_output_coder()
          view_id = self._next_uid()
          # TODO(robertwb): Actually flesh out the ViewFn API.
          side_inputs[si.tag] = beam_fn_api_pb2.SideInput(
              view_fn=sdk_worker.serialize_and_pack_py_fn(
                  element_coder, urn=sdk_worker.PYTHON_ITERABLE_VIEWFN_URN,
                  id=view_id))
          # Re-encode the elements in the nested context and
          # concatenate them together
          output_stream = create_OutputStream()
          for element in si.source.read(
              si.source.get_range_tracker(None, None)):
            element_coder.get_impl().encode_to_stream(
                element, output_stream, True)
          elements_data = output_stream.get()
          state_key = beam_fn_api_pb2.StateKey.MultimapSideInput(key=view_id)
          state_handler.Clear(state_key)
          state_handler.Append(state_key, elements_data)

      elif isinstance(operation, operation_specs.WorkerFlatten):
        fn = sdk_worker.pack_function_spec_data(
            operation.serialized_fn,
            sdk_worker.IDENTITY_DOFN_URN,
            id=self._next_uid())
        inputs = {
            'ignored_input_tag':
                beam_fn_api_pb2.Target.List(target=[
                    beam_fn_api_pb2.Target(
                        primitive_transform_reference=transform_index_to_id[
                            input_op_index],
                        name=output_tags(map_task[input_op_index][1])[
                            input_output_index])
                    for input_op_index, input_output_index in operation.inputs
                ])
        }
        side_inputs = {}

      else:
        raise TypeError(operation)

      ptransform = beam_fn_api_pb2.PrimitiveTransform(
          id=transform_id,
          function_spec=fn,
          step_name=stage_name,
          inputs=inputs,
          side_inputs=side_inputs,
          outputs=outputs(operation))
      transforms.append(ptransform)

    process_bundle_descriptor = beam_fn_api_pb2.ProcessBundleDescriptor(
        id=self._next_uid(), coders=coders.values(),
        primitive_transform=transforms)
    return beam_fn_api_pb2.InstructionRequest(
        instruction_id=self._next_uid(),
        register=beam_fn_api_pb2.RegisterRequest(
            process_bundle_descriptor=[process_bundle_descriptor
                                      ])), runner_sinks, input_data
Example No. 41
 def _read_values_to_bundles(reader):
   read_result = [GlobalWindows.windowed_value(e) for e in reader]
   return self._split_list_into_bundles(
       output_pcollection, read_result,
       _BoundedReadEvaluator.MAX_ELEMENT_PER_BUNDLE, lambda _: 1)
Example No. 42
 def process_timer(self, timer_firing):
   """Default process_timer() impl. generating KeyedWorkItem element."""
   self.process_element(
       GlobalWindows.windowed_value(
           KeyedWorkItem(timer_firing.encoded_key,
                         timer_firings=[timer_firing])))
Example No. 43
 def set(self, ts):
   from apache_beam.transforms.window import GlobalWindows
   self._receiver.receive(
       GlobalWindows.windowed_value(
           (self._key,
            dict(timestamp=timestamp.Timestamp.of(ts)))))
Example No. 44
from apache_beam.transforms.combiners import curry_combine_fn
from apache_beam.transforms.window import GlobalWindows
from apache_beam.utils.windowed_value import WindowedValue

# Allow some "pure mode" declarations.
try:
  import cython
except ImportError:
  class FakeCython(object):
    @staticmethod
    def cast(type, value):
      return value
  globals()['cython'] = FakeCython()


_globally_windowed_value = GlobalWindows.windowed_value(None)
_global_window_type = type(_globally_windowed_value.windows[0])


class ConsumerSet(Receiver):
  """A ConsumerSet represents a graph edge between two Operation nodes.

  The ConsumerSet object collects information from the output of the
  Operation at one end of its edge and the input of the Operation at
  the other edge.
  ConsumerSet are attached to the outputting Operation.
  """

  def __init__(
      self, counter_factory, step_name, output_index, consumers, coder):
    self.consumers = consumers
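Caching _globally_windowed_value at import time gives operations a cheap way to detect the common globally-windowed case by comparing window types rather than constructing windows per element. A hedged sketch of that fast path (is_globally_windowed is an illustrative helper, not part of the module):
 from apache_beam.transforms.window import GlobalWindows

 _globally_windowed_value = GlobalWindows.windowed_value(None)
 _global_window_type = type(_globally_windowed_value.windows[0])

 def is_globally_windowed(wv):
   return len(wv.windows) == 1 and type(wv.windows[0]) is _global_window_type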
Example No. 45
 def finish_bundle(self):
     assert len(self._outputs) == 1
     output_pcollection = list(self._outputs)[0]
     bundle = self._evaluation_context.create_bundle(output_pcollection)
     bundle.output(GlobalWindows.windowed_value(b''))
     return TransformResult(self, [bundle], [], None, None)