예제 #1
0
    def test_parse_windowedvalue_with_dicts(self):
        """Checks that dict-valued WindowedValues convert to a DataFrame.

        Keys missing from one element are expected to appear as NaN, and the
        window metadata columns are expected after the value columns.
        """
        from apache_beam.transforms.window import GlobalWindow

        sparse = WindowedValue({'b': 2, 'd': 4}, 1, [GlobalWindow()])
        dense = WindowedValue({'a': 1, 'b': 2, 'c': 3}, 1, [GlobalWindow()])

        actual_df = utils.elements_to_df([sparse, dense],
                                         include_window_info=True)

        # Both elements carry timestamp 1; the expected event_time is int(1e6).
        event_time = int(1e6)
        expected_rows = [
            [np.nan, 2, np.nan, 4,
             event_time, sparse.windows, sparse.pane_info],
            [1, 2, 3, np.nan,
             event_time, dense.windows, dense.pane_info],
        ]
        expected_columns = [
            'a', 'b', 'c', 'd', 'event_time', 'windows', 'pane_info'
        ]
        expected_df = pd.DataFrame(expected_rows, columns=expected_columns)
        pd.testing.assert_frame_equal(actual_df, expected_df)
예제 #2
0
 def create_split_across_windows(self, primary_windows, residual_windows):
     """Builds the expected (primary, residual) pair for a window-boundary split.

     Each half wraps the whole OffsetRange(0, 100) restriction in the given
     windows, and is None when its window collection is falsy.
     """
     if primary_windows:
         primary_element = (
             ('a', (OffsetRange(0, 100), self.watermark_estimator_state)),
             100)
         primary = SplitResultPrimary(
             primary_value=WindowedValue(primary_element, 57, primary_windows))
     else:
         primary = None

     if residual_windows:
         residual_element = (
             ('a', (OffsetRange(0, 100), self.watermark_estimator_state)),
             100)
         residual = SplitResultResidual(
             residual_value=WindowedValue(
                 residual_element, 57, residual_windows),
             current_watermark=None,
             deferred_timestamp=None)
     else:
         residual = None

     return primary, residual
예제 #3
0
  def _output_as_events(self):
    """Yields the buffered timing events and elements as WindowedValues.

    Timing events, when present, are emitted first as a single value. The
    buffered elements follow, wrapped in one ElementEvent. Both values use
    timestamp 0 and a single GlobalWindow.
    """
    pending_timing = self.timing_events
    if pending_timing:
      yield WindowedValue(
          pending_timing,
          timestamp=0,
          windows=[beam.window.GlobalWindow()])

    pending_elements = self.elements
    if pending_elements:
      element_events = [ElementEvent(pending_elements)]
      yield WindowedValue(
          element_events,
          timestamp=0,
          windows=[beam.window.GlobalWindow()])
예제 #4
0
  def test_count_limiter_with_dataframes(self):
    """Checks how a CountLimiter created with 5 reacts to DataFrame elements.

    Ten empty frames must leave it untriggered; one ten-row frame must
    trigger it.
    """
    limiter = CountLimiter(5)

    # Empty dataframes are not expected to count towards the limit.
    for _ in range(10):
      limiter.update(WindowedValue(pd.DataFrame(), 0, []))
    self.assertFalse(limiter.is_triggered())

    ten_rows = pd.DataFrame({'col': list(range(10))})
    limiter.update(WindowedValue(ten_rows, 0, []))
    self.assertTrue(limiter.is_triggered())
예제 #5
0
 def create_split_in_window(self, offset_index, windows):
     """Builds the expected (primary, residual) pair for a split inside a window.

     The primary covers OffsetRange(0, offset_index) and the residual covers
     OffsetRange(offset_index, 100); both are wrapped in ``windows``. The
     residual takes its estimator state and watermark from
     ``self.watermark_estimator``.
     """
     primary_restriction = (
         OffsetRange(0, offset_index), self.watermark_estimator_state)
     primary_element = (('a', primary_restriction), offset_index)
     primary = SplitResultPrimary(
         primary_value=WindowedValue(primary_element, 57, windows))

     residual_restriction = (
         OffsetRange(offset_index, 100),
         self.watermark_estimator.get_estimator_state())
     residual_element = (('a', residual_restriction), 100 - offset_index)
     residual = SplitResultResidual(
         residual_value=WindowedValue(residual_element, 57, windows),
         current_watermark=self.watermark_estimator.current_watermark(),
         deferred_timestamp=None)

     return (primary, residual)
예제 #6
0
    def test_parse_windowedvalue(self):
        """Checks that WindowedValues convert without window columns.

        With include_window_info=False only the element values are expected
        as columns of the resulting DataFrame.
        """
        from apache_beam.transforms.window import GlobalWindow

        first = WindowedValue(('a', 2), 1, [GlobalWindow()])
        second = WindowedValue(('b', 3), 1, [GlobalWindow()])

        actual_df = utils.elements_to_df([first, second],
                                         include_window_info=False)

        expected_rows = [['a', 2], ['b', 3]]
        expected_df = pd.DataFrame(expected_rows, columns=[0, 1])
        pd.testing.assert_frame_equal(actual_df, expected_df)
예제 #7
0
    def _process_outputs(self, windowed_input_element, results):
        """Dispatch the result of computation to the appropriate receivers.

        A value wrapped in a SideOutputValue object will be unwrapped and
        then dispatched to the appropriate indexed output.

        Args:
          windowed_input_element: the input element the results were produced
            for, or None when there is no such element.
          results: iterable of outputs; None means there is nothing to
            dispatch.

        Raises:
          TypeError: if a SideOutputValue carries a tag that is not a string.
        """
        if results is None:
            return
        for result in results:
            tag = None
            if isinstance(result, SideOutputValue):
                tag = result.tag
                if not isinstance(tag, basestring):
                    raise TypeError('In %s, tag %s is not a string' %
                                    (self, tag))
                result = result.value
            if isinstance(result, WindowedValue):
                # Already windowed: forwarded as is, except that its window
                # list is repeated in place (once per input window) when the
                # input element is not in exactly one window.
                windowed_value = result
                if (windowed_input_element is not None
                        and len(windowed_input_element.windows) != 1):
                    windowed_value.windows *= len(
                        windowed_input_element.windows)
            elif windowed_input_element is None:
                # Start and finish have no element from which to grab context,
                # but may emit elements.
                if isinstance(result, TimestampedValue):
                    value = result.value
                    timestamp = result.timestamp
                    assign_context = NoContext(value, timestamp)
                else:
                    # Plain value without an input element: timestamp -1.
                    value = result
                    timestamp = -1
                    assign_context = NoContext(value)
                windowed_value = WindowedValue(
                    value, timestamp, self.window_fn.assign(assign_context))
            elif isinstance(result, TimestampedValue):
                # Timestamped output with an input element: windows come from
                # the window function, then are repeated like above.
                assign_context = WindowFn.AssignContext(
                    result.timestamp, result.value)
                windowed_value = WindowedValue(
                    result.value, result.timestamp,
                    self.window_fn.assign(assign_context))
                if len(windowed_input_element.windows) != 1:
                    windowed_value.windows *= len(
                        windowed_input_element.windows)
            else:
                # Plain value: derived from the input element via with_value.
                windowed_value = windowed_input_element.with_value(result)
            # Untagged outputs go to the main receivers; tagged outputs go to
            # the receiver registered under that tag.
            if tag is None:
                self.main_receivers.receive(windowed_value)
            else:
                self.tagged_receivers[tag].output(windowed_value)
예제 #8
0
      def finish_bundle(self):
        """Reads every file matching ``self.file_to_read`` and yields its records.

        Each line has trailing newlines stripped, is decoded with
        ``self.coder`` and is yielded with timestamp -1 in a GlobalWindow.
        Files are opened with ``gzip.open`` when a compression type is set
        and with ``open`` otherwise.
        """
        from apache_beam.transforms import window

        assert self.file_to_read
        for path in glob.glob(self.file_to_read):
          if self.compression_type is None:
            handle = open(path)
          else:
            handle = gzip.open(path, 'r')
          with handle:
            for line in handle:
              decoded = self.coder.decode(line.rstrip('\n'))
              yield WindowedValue(decoded, -1, [window.GlobalWindow()])
예제 #9
0
    def invoke_process(self,
                       windowed_value,
                       restriction_tracker=None,
                       output_processor=None):
        """Invokes the DoFn's process method for ``windowed_value``.

        Args:
          windowed_value: the element to process.
          restriction_tracker: optional tracker; when given it is passed to
            the process method under the parameter name found by
            ``_find_param_with_default``.
          output_processor: receiver for the outputs; falls back to
            ``self.output_processor`` when falsy.

        Raises:
          ValueError: if a tracker is supplied but the process method declares
            no matching parameter.
        """
        output_processor = output_processor or self.output_processor
        self.context.set_element(windowed_value)

        additional_kwargs = {}
        if restriction_tracker:
            restriction_tracker_param = _find_param_with_default(
                self.signature.process_method,
                default_as_type=core.RestrictionProvider)[0]
            if not restriction_tracker_param:
                # Interpolate with '%' so the %r placeholder is actually
                # filled; the 1-tuple keeps this safe for tuple-like trackers.
                raise ValueError(
                    'A RestrictionTracker %r was provided but DoFn does not have a '
                    'RestrictionTrackerParam defined' % (restriction_tracker, ))
            additional_kwargs[restriction_tracker_param] = restriction_tracker

        # Call for the process function for each window if has windowed side inputs
        # or if the process accesses the window parameter. We can just call it once
        # otherwise as none of the arguments are changing
        if self.has_windowed_inputs and len(windowed_value.windows) != 1:
            for w in windowed_value.windows:
                self._invoke_per_window(
                    WindowedValue(windowed_value.value,
                                  windowed_value.timestamp, (w, )),
                    additional_kwargs, output_processor)
        else:
            self._invoke_per_window(windowed_value, additional_kwargs,
                                    output_processor)
    def test_basic_wordcount(self):
        """Smoke test: records a small PCollection and reads it back.

        Despite the name, no words are counted here; the pipeline only emits
        0, 1, 2 and the test checks that the recording returns them as
        WindowedValues with MIN_TIMESTAMP in a GlobalWindow.
        """

        # Create the pipeline that will emit 0, 1, 2.
        p = beam.Pipeline(InteractiveRunner())
        elems = p | beam.Create([0, 1, 2])

        # Watch the pipeline and PCollections. This is normally done in a notebook
        # environment automatically, but we have to do it manually here.
        # NOTE: watch() receives locals(), so the local variable names above
        # are part of what is passed in; rename them with care.
        ib.watch(locals())
        ie.current_env().track_user_pipelines()

        # Create the recording objects. By calling `record` a new PipelineFragment
        # is started to compute the given PCollections and cache to disk.
        rm = RecordingManager(p)
        recording = rm.record([elems], max_n=3, max_duration_secs=500)
        stream = recording.stream(elems)
        recording.wait_until_finish()

        # Once the pipeline fragment completes, we can read from the stream and know
        # that all elements were written to cache.
        # From here on `elems` holds the materialized elements, not the
        # PCollection.
        elems = list(stream.read())
        expected_elems = [
            WindowedValue(i, MIN_TIMESTAMP, [GlobalWindow()]) for i in range(3)
        ]
        self.assertListEqual(elems, expected_elems)
예제 #11
0
 def test_basic_test_stream(self):
     """Checks the events recorded by a TestStream built from mixed elements.

     Plain values, WindowedValues and TimestampedValues added in one call are
     all expected to be recorded as TimestampedValues in a single
     ElementEvent, with plain values taking the current watermark as
     timestamp.
     """
     stream = TestStream()
     stream = stream.advance_watermark_to(0)
     stream = stream.add_elements(
         ['a', WindowedValue('b', 3, []), TimestampedValue('c', 6)])
     stream = stream.advance_processing_time(10)
     stream = stream.advance_watermark_to(8)
     stream = stream.add_elements(['d'])
     stream = stream.advance_watermark_to_infinity()

     first_batch = ElementEvent([
         TimestampedValue('a', 0),
         TimestampedValue('b', 3),
         TimestampedValue('c', 6),
     ])
     second_batch = ElementEvent([
         TimestampedValue('d', 8),
     ])
     expected_events = [
         WatermarkEvent(0),
         first_batch,
         ProcessingTimeEvent(10),
         WatermarkEvent(8),
         second_batch,
         WatermarkEvent(timestamp.MAX_TIMESTAMP),
     ]
     self.assertEqual(stream._events, expected_events)
예제 #12
0
    def test_windowed_values_interpreted_correctly(self):
        """Checks that a WindowedValueHolder keeps its timestamp and window.

        The held element is added after the watermark has been advanced to
        10, yet it is expected to come out of the stream with Timestamp(5)
        and IntervalWindow(5, 10), the values it was created with.
        """
        pane = PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)
        held_value = WindowedValue(
            'a', Timestamp(5), [beam.window.IntervalWindow(5, 10)], pane)
        windowed_value = WindowedValueHolder(held_value)

        test_stream = TestStream()
        test_stream = test_stream.advance_processing_time(10)
        test_stream = test_stream.advance_watermark_to(10)
        test_stream = test_stream.add_elements([windowed_value])
        test_stream = test_stream.advance_watermark_to_infinity()

        class RecordFn(beam.DoFn):
            """Emits each element together with its timestamp and window."""
            def process(self,
                        element=beam.DoFn.ElementParam,
                        timestamp=beam.DoFn.TimestampParam,
                        window=beam.DoFn.WindowParam):
                yield (element, timestamp, window)

        options = PipelineOptions()
        options.view_as(StandardOptions).streaming = True
        with TestPipeline(options=options) as p:
            my_record_fn = RecordFn()
            records = p | test_stream | beam.ParDo(my_record_fn)

            expected_records = [
                ('a', timestamp.Timestamp(5),
                 beam.window.IntervalWindow(5, 10)),
            ]
            assert_that(records, equal_to(expected_records))
예제 #13
0
파일: common.py 프로젝트: junaidsaiyed/beam
  def process_outputs(self, windowed_input_element, results):
    """Routes each process() output to the main or a tagged receiver.

    TaggedOutput wrappers are unwrapped first and select the receiver by tag.
    Outputs that are not already WindowedValues are windowed here:
    TimestampedValues through ``self.window_fn``, anything else through
    ``windowed_input_element.with_value``.

    Raises:
      TypeError: if a TaggedOutput carries a tag that is not a string.
    """
    if results is None:
      return

    for output in results:
      tag = None
      if isinstance(output, TaggedOutput):
        tag = output.tag
        if not isinstance(tag, basestring):
          raise TypeError('In %s, tag %s is not a string' % (self, tag))
        output = output.value

      if isinstance(output, WindowedValue):
        windowed_value = output
        repeat_windows = (
            windowed_input_element is not None
            and len(windowed_input_element.windows) != 1)
      elif isinstance(output, TimestampedValue):
        assign_context = WindowFn.AssignContext(output.timestamp, output.value)
        windowed_value = WindowedValue(
            output.value, output.timestamp,
            self.window_fn.assign(assign_context))
        repeat_windows = len(windowed_input_element.windows) != 1
      else:
        windowed_value = windowed_input_element.with_value(output)
        repeat_windows = False

      # Repeat the output's window list once per input window when the input
      # is not in exactly one window.
      if repeat_windows:
        windowed_value.windows *= len(windowed_input_element.windows)

      if tag is None:
        receiver = self.main_receivers
      else:
        receiver = self.tagged_receivers[tag]
      receiver.receive(windowed_value)
예제 #14
0
 def windowed_values(self):
     """Yields the initial windowed value, then one WindowedValue per appended value.

     Appended values are wrapped lazily, as the generator is consumed, using
     the timestamp and windows of the initial windowed value.
     """
     yield self._initial_windowed_value
     for appended in self._appended_values:
         template = self._initial_windowed_value
         yield WindowedValue(appended, template.timestamp, template.windows)
예제 #15
0
    def test_parse_windowedvalue_with_window_info(self):
        """Checks that window metadata gets its own DataFrame columns.

        With include_window_info=True the value columns are expected to be
        followed by 'event_time', 'windows' and 'pane_info'.
        """
        from apache_beam.transforms.window import GlobalWindow

        first = WindowedValue(('a', 2), 1, [GlobalWindow()])
        second = WindowedValue(('b', 3), 1, [GlobalWindow()])

        actual_df = utils.elements_to_df([first, second],
                                         include_window_info=True)

        # Both elements carry timestamp 1; the expected event_time is int(1e6).
        event_time = int(1e6)
        expected_rows = [
            ['a', 2, event_time, first.windows, first.pane_info],
            ['b', 3, event_time, second.windows, second.pane_info],
        ]
        expected_df = pd.DataFrame(
            expected_rows,
            columns=[0, 1, 'event_time', 'windows', 'pane_info'])
        pd.testing.assert_frame_equal(actual_df, expected_df)
예제 #16
0
  def _output_as_records(self):
    """Yields the header, timing events and elements as WindowedValues.

    The header (if set) comes first, then one value per record produced by
    ``_timing_events_to_records``, then a single value holding the result of
    ``_elements_to_record``. Every value uses timestamp 0 and a single
    GlobalWindow.
    """
    def in_global_window(payload):
      # Shared wrapping for every record emitted by this method.
      return WindowedValue(
          payload, timestamp=0, windows=[beam.window.GlobalWindow()])

    if self.header:
      yield in_global_window(self.header)

    if self.timing_events:
      for record in self._timing_events_to_records(self.timing_events):
        yield in_global_window(record)

    if self.elements:
      yield in_global_window(self._elements_to_record(self.elements))
예제 #17
0
 def windowed_value(
     cls,
     value,  # type: Any
     timestamp=MIN_TIMESTAMP,  # type: Timestamp
     pane_info=windowed_value.PANE_INFO_UNKNOWN  # type: windowed_value.PaneInfo
 ):
   # type: (...) -> WindowedValue
   """Wraps ``value`` in a WindowedValue assigned to a single GlobalWindow.

   The timestamp defaults to MIN_TIMESTAMP and the pane info to
   PANE_INFO_UNKNOWN.
   """
   global_windows = (GlobalWindow(), )
   return WindowedValue(value, timestamp, global_windows, pane_info)
예제 #18
0
파일: common.py 프로젝트: mf2199/beam-MF
    def invoke_process(self,
                       windowed_value,
                       restriction_tracker=None,
                       output_processor=None,
                       additional_args=None,
                       additional_kwargs=None):
        """Invokes the DoFn's process method for ``windowed_value``.

        Args:
          windowed_value: the element to process.
          restriction_tracker: tracker to hand to the process method. For a
            splittable DoFn called without one, a tracker is created from the
            element's initial restriction.
          output_processor: receiver for the outputs; falls back to
            ``self.output_processor`` when falsy.
          additional_args: extra positional arguments for the process method.
          additional_kwargs: extra keyword arguments for the process method.
            When a tracker is used it is added to this dict.

        Returns:
          The result of the per-window invocation when a restriction tracker
          is in use; None otherwise.

        Raises:
          NotImplementedError: if a tracker is used with a multi-window
            element while the DoFn has windowed inputs.
          ValueError: if a tracker is used but the process method declares no
            matching parameter.
        """
        if not additional_args:
            additional_args = []
        if not additional_kwargs:
            additional_kwargs = {}

        if not output_processor:
            output_processor = self.output_processor
        self.context.set_element(windowed_value)

        if self.is_splittable and not restriction_tracker:
            restriction = self.invoke_initial_restriction(windowed_value.value)
            restriction_tracker = self.invoke_create_tracker(restriction)

        if restriction_tracker:
            if len(windowed_value.windows) > 1 and self.has_windowed_inputs:
                # Should never get here due to window explosion in
                # the upstream pair-with-restriction.
                raise NotImplementedError(
                    'SDFs in multiply-windowed values with windowed arguments.'
                )
            restriction_tracker_param = _find_param_with_default(
                self.signature.process_method,
                default_as_type=core.RestrictionProvider)[0]
            if not restriction_tracker_param:
                # The 1-tuple keeps '%' formatting correct even if the
                # tracker is itself tuple-like.
                raise ValueError(
                    'A RestrictionTracker %r was provided but DoFn does not have a '
                    'RestrictionTrackerParam defined' % (restriction_tracker, ))
            additional_kwargs[restriction_tracker_param] = restriction_tracker
            try:
                # Expose the in-flight element and tracker for the duration of
                # the call only.
                self.current_windowed_value = windowed_value
                self.restriction_tracker = restriction_tracker
                return self._invoke_process_per_window(windowed_value,
                                                       additional_args,
                                                       additional_kwargs,
                                                       output_processor)
            finally:
                # Clear both pieces of in-flight state together, so that no
                # stale element stays attached once the tracker is gone.
                self.restriction_tracker = None
                self.current_windowed_value = None

        # Call for the process function for each window if has windowed side inputs
        # or if the process accesses the window parameter. We can just call it once
        # otherwise as none of the arguments are changing
        elif self.has_windowed_inputs and len(windowed_value.windows) != 1:
            for w in windowed_value.windows:
                self._invoke_process_per_window(
                    WindowedValue(windowed_value.value,
                                  windowed_value.timestamp, (w, )),
                    additional_args, additional_kwargs, output_processor)
        else:
            self._invoke_process_per_window(windowed_value, additional_args,
                                            additional_kwargs,
                                            output_processor)
예제 #19
0
파일: common.py 프로젝트: sachingpsl/beam
 def old_dofn_process(self, element):
     """Runs the old-style DoFn on ``element`` and dispatches its outputs.

     When the DoFn has windowed side inputs and the element is in more than
     one window, the DoFn runs once per window with the context set to a
     single-window copy of the element. Outputs are dispatched against the
     original ``element`` in every case.
     """
     per_window = self.has_windowed_side_inputs and len(element.windows) > 1
     if not per_window:
         self.context.set_element(element)
         self._process_outputs(element, self.dofn_process(self.context))
         return

     for window in element.windows:
         single_window_element = WindowedValue(
             element.value, element.timestamp, (window, ))
         self.context.set_element(single_window_element)
         self._process_outputs(element, self.dofn_process(self.context))
예제 #20
0
 def output_key(self, wkey, accumulator):
   """Outputs the combined value for one (windows, key) entry.

   Args:
     wkey: a (windows, key) pair. ``windows`` equal to 0 selects the
       globally-windowed output path; otherwise it is used as the output's
       windows and the end of its first window as the timestamp.
     accumulator: the accumulator to output; it is passed through
       ``self.combine_fn_compact`` first when that is set.
   """
   windows, key = wkey
   if self.combine_fn_compact is None:
     value = accumulator
   else:
     value = self.combine_fn_compact(accumulator)
   # Compare by value: `is 0` depends on CPython's small-int caching and is
   # reported as a SyntaxWarning on Python 3.8+.
   if windows == 0:
     self.output(_globally_windowed_value.with_value((key, value)))
   else:
     self.output(WindowedValue((key, value), windows[0].end, windows))
예제 #21
0
 def finish_bundle(self):
     """Runs predictions on every pending batch and yields the results.

     Each prediction is yielded in a GlobalWindow, stamped with the current
     wall-clock time in whole seconds. ``self.batches`` is replaced with an
     empty dict once all batches have been consumed.
     """
     logging.info("Run predictions on all intermediate elements.")
     for pending in self.batches.values():
         for prediction in self.make_predictions(pending):
             yield WindowedValue(
                 value=prediction,
                 timestamp=int(time.time()),
                 windows=(window.GlobalWindow(),))
     self.batches = {}
예제 #22
0
 def flush(self, target):
     """Evicts entries from ``self.table`` and outputs them.

     At most ``self.size - target`` entries are removed, in the table's
     iteration order. Each removed entry is output as one WindowedValue
     holding (key, [second component of each buffered value]) with the
     timestamp of its first buffered value. ``self.size`` is not updated
     here.

     Args:
       target: the size to flush down to.
     """
     limit = self.size - target
     # Iterate over a snapshot: deleting from the dict while iterating its
     # live items() view raises RuntimeError on Python 3.
     for ix, (kw, vs) in enumerate(list(self.table.items())):
         if ix >= limit:
             break
         del self.table[kw]
         key, windows = kw
         output_value = [v.value[1] for v in vs]
         windowed_value = WindowedValue((key, output_value),
                                        vs[0].timestamp, windows)
         self.output(windowed_value)
예제 #23
0
파일: common.py 프로젝트: wileeam/beam
 def invoke_process(self, windowed_value):
   """Invokes the per-window process call for ``windowed_value``.

   The call is made once per window, on single-window copies of the element,
   when the DoFn has windowed inputs and the element is not in exactly one
   window. Otherwise it is made once with the element as is, since none of
   the arguments change between windows.
   """
   self.context.set_element(windowed_value)

   explode = self.has_windowed_inputs and len(windowed_value.windows) != 1
   if not explode:
     self._invoke_per_window(windowed_value)
     return

   for window in windowed_value.windows:
     single_window_value = WindowedValue(
         windowed_value.value, windowed_value.timestamp, (window,))
     self._invoke_per_window(single_window_value)
예제 #24
0
 def setUp(self):
     """Creates the windows, element, restriction and estimator used by the tests.

     The element 'a' (timestamp 57) is placed in three consecutive
     ten-unit IntervalWindows starting at 0.
     """
     windows = tuple(
         IntervalWindow(start, start + 10) for start in (0, 10, 20))
     self.window1, self.window2, self.window3 = windows
     self.windowed_value = WindowedValue('a', 57, windows)

     self.restriction = OffsetRange(0, 100)
     self.watermark_estimator_state = Timestamp(21)
     self.restriction_provider = TestOffsetRestrictionProvider()
     self.watermark_estimator = ManualWatermarkEstimator(Timestamp(42))

     # Show full diffs when an assertion fails.
     self.maxDiff = None
예제 #25
0
 def _dofn_invoker(self, element):
     """Invokes the per-window DoFn call for ``element``.

     The call is made once per window, on single-window copies of the
     element, when the DoFn has windowed inputs and the element is not in
     exactly one window. Otherwise it is made once with the element as is,
     since none of the arguments change between windows.
     """
     self.context.set_element(element)

     explode = self.has_windowed_inputs and len(element.windows) != 1
     if not explode:
         self._dofn_per_window_invoker(element)
         return

     for window in element.windows:
         single_window_element = WindowedValue(
             element.value, element.timestamp, (window, ))
         self._dofn_per_window_invoker(single_window_element)
예제 #26
0
  def process_outputs(
      self, windowed_input_element, results, watermark_estimator=None):
    # type: (WindowedValue, Iterable[Any]) -> None

    """Routes each process() output to the main or a tagged receiver.

    TaggedOutput wrappers are unwrapped first and select the receiver by tag.
    Outputs that are not already WindowedValues are windowed here:
    TimestampedValues through ``self.window_fn``, anything else through
    ``windowed_input_element.with_value``.

    When given, ``watermark_estimator`` observes the timestamp of every
    output before the output is handed to its receiver. The number of
    outputs is reported to ``self.per_element_output_counter`` when that
    counter is set and cythonized.

    Raises:
      TypeError: if a TaggedOutput carries a tag that is not a string.
    """
    def report_output_count(count):
      # TODO(BEAM-3937): Remove if block after output counter released.
      # Only enable per_element_output_counter when counter cythonized.
      counter = self.per_element_output_counter
      if counter is not None and counter.is_cythonized:
        counter.add_input(count)

    if results is None:
      report_output_count(0)
      return

    # results may be a generator, so count while iterating instead of len().
    output_element_count = 0
    for output_element_count, output in enumerate(results, 1):
      tag = None
      if isinstance(output, TaggedOutput):
        tag = output.tag
        if not isinstance(tag, (str, unicode)):
          raise TypeError('In %s, tag %s is not a string' % (self, tag))
        output = output.value

      if isinstance(output, WindowedValue):
        windowed_value = output
        repeat_windows = (
            windowed_input_element is not None and
            len(windowed_input_element.windows) != 1)
      elif isinstance(output, TimestampedValue):
        assign_context = WindowFn.AssignContext(output.timestamp, output.value)
        windowed_value = WindowedValue(
            output.value,
            output.timestamp,
            self.window_fn.assign(assign_context))
        repeat_windows = len(windowed_input_element.windows) != 1
      else:
        windowed_value = windowed_input_element.with_value(output)
        repeat_windows = False

      # Repeat the output's window list once per input window when the input
      # is not in exactly one window.
      if repeat_windows:
        windowed_value.windows *= len(windowed_input_element.windows)

      if watermark_estimator is not None:
        watermark_estimator.observe_timestamp(windowed_value.timestamp)

      if tag is None:
        self.main_receivers.receive(windowed_value)
      else:
        self.tagged_receivers[tag].receive(windowed_value)

    report_output_count(output_element_count)
예제 #27
0
파일: utils_test.py 프로젝트: mszb/beam
        def reader():
            """Yields a single TestStream event that adds the elements 0..9.

            Each element is wrapped in a WindowedValueHolder (timestamp 0, no
            windows), encoded with ``coder`` (not defined in this function;
            taken from the surrounding scope) and paired with the micros of
            Timestamp 0.
            """
            element_payload = []
            for e in range(10):
                holder = WindowedValueHolder(WindowedValue(e, 0, []))
                element_payload.append(
                    TestStreamPayload.TimestampedElement(
                        encoded_element=coder.encode(holder),
                        timestamp=Timestamp.of(0).micros))

            add_elements = TestStreamPayload.Event.AddElements(
                elements=element_payload)
            yield TestStreamPayload.Event(element_event=add_elements)
예제 #28
0
    def output_key(self, wkey, accumulator, timestamp):
        """Outputs the combined value for ``wkey``.

        The accumulator is passed through ``self.combine_fn_compact`` first
        when that is set. Under default windowing ``wkey`` is used as the key
        of a globally-windowed output. Otherwise ``wkey`` is a
        (windows, key) pair, and the max timestamp of the first window
        replaces ``timestamp`` when no timestamp combiner is set.
        """
        compact = self.combine_fn_compact
        value = accumulator if compact is None else compact(accumulator)

        if self.is_default_windowing:
            self.output(_globally_windowed_value.with_value((wkey, value)))
            return

        windows, key = wkey
        if self.timestamp_combiner is None:
            timestamp = windows[0].max_timestamp()
        self.output(WindowedValue((key, value), timestamp, windows))
예제 #29
0
    def process(self,
                element: prediction_log_pb2.PredictionLog,
                window=beam.DoFn.WindowParam,
                timestamp=beam.DoFn.TimestampParam):
        """Buffers ``element`` and flushes the buffer once it outgrows the batch size.

        When the buffer already holds more than ``self.batching_size``
        elements, the results of ``self.process_result`` on it are yielded
        and the buffer is cleared. The current element is then buffered
        together with its timestamp and window.

        Args:
          element: prediction log whose request must carry at most one
            'examples' string.
          window: the element's window.
          timestamp: the element's timestamp.

        Raises:
          Exception: if the request carries more than one example string.
        """
        if len(element.predict_log.request.inputs['examples'].string_val) > 1:
            raise Exception("Only support single input string.")

        if len(self.batch) > self.batching_size:
            yield from self.process_result(self.batch)
            self.batch.clear()
        # Always buffer the current element: skipping it on the flush path
        # would silently drop the element that triggered the flush.
        self.batch.append(WindowedValue(element, timestamp, [window]))
예제 #30
0
파일: common.py 프로젝트: tedyu/beam
    def finish_bundle_outputs(self, results):
        """Routes each finish_bundle output to the main or a tagged receiver.

        OutputValue wrappers are unwrapped first and select the receiver by
        tag. Outputs that are not already WindowedValues are windowed through
        ``self.window_fn`` with a NoContext; plain values get timestamp -1.

        Raises:
          TypeError: if an OutputValue carries a tag that is not a string.
        """
        if results is None:
            return

        for output in results:
            tag = None
            if isinstance(output, OutputValue):
                tag = output.tag
                if not isinstance(tag, basestring):
                    raise TypeError('In %s, tag %s is not a string' %
                                    (self, tag))
                output = output.value

            if isinstance(output, WindowedValue):
                windowed_value = output
            else:
                if isinstance(output, TimestampedValue):
                    value, timestamp = output.value, output.timestamp
                    assign_context = NoContext(value, timestamp)
                else:
                    value, timestamp = output, -1
                    assign_context = NoContext(value)
                windowed_value = WindowedValue(
                    value, timestamp, self.window_fn.assign(assign_context))

            # Main receivers expose receive(); tagged receivers are driven
            # through output().
            if tag is None:
                self.main_receivers.receive(windowed_value)
            else:
                self.tagged_receivers[tag].output(windowed_value)