Esempio n. 1
0
    def run_trigger_simple(self, window_fn, trigger_fn, accumulation_mode,
                           timestamped_data, expected_panes, *groupings,
                           **kwargs):
        """Call ``self.run_trigger`` once for every bundle size in groupings.

        Args:
          window_fn: object whose ``assign`` method maps a
            ``WindowFn.AssignContext`` to the windows of an element.
          trigger_fn: forwarded unchanged to ``self.run_trigger``.
          accumulation_mode: forwarded unchanged to ``self.run_trigger``.
          timestamped_data: iterable of ``(timestamp, element)`` pairs.
          expected_panes: forwarded unchanged to ``self.run_trigger``.
          *groupings: bundle sizes to try; when omitted a single run with
            bundles of size 1 is performed.
          **kwargs: only ``late_data`` (an iterable of
            ``(timestamp, element)`` pairs, empty by default) is accepted.
        """
        late_data = kwargs.pop('late_data', [])
        assert not kwargs

        def bundle_data(data, size):
            """Lazily yield lists of at most ``size`` windowed values."""
            bundle = []
            for timestamp, elem in data:
                windows = window_fn.assign(
                    WindowFn.AssignContext(timestamp, elem))
                bundle.append(WindowedValue(elem, timestamp, windows))
                if len(bundle) == size:
                    yield bundle
                    bundle = []
            # Emit the trailing, partially filled bundle (if any).
            if bundle:
                yield bundle

        if not groupings:
            groupings = [1]
        for group_by in groupings:
            # The bundles are produced exclusively by bundle_data(); no
            # separate eager pass over timestamped_data is made, so the
            # input is not consumed before the generators read it.
            self.run_trigger(window_fn, trigger_fn, accumulation_mode,
                             bundle_data(timestamped_data, group_by),
                             bundle_data(late_data, group_by), expected_panes)
Esempio n. 2
0
  def process_outputs(self, windowed_input_element, results):
    """Dispatch the result of process computation to the appropriate receivers.

    A value wrapped in a TaggedOutput object will be unwrapped and
    then dispatched to the appropriate indexed output.

    Args:
      windowed_input_element: the windowed element the results were computed
        from, or None. Results that are neither WindowedValue nor
        TimestampedValue are re-wrapped via its ``with_value`` method and
        therefore need it to be non-None.
      results: iterable of results, or None (in which case nothing happens).

    Raises:
      TypeError: if a TaggedOutput carries a tag that is not a string.
    """
    if results is None:
      return

    for result in results:
      tag = None
      if isinstance(result, TaggedOutput):
        tag = result.tag
        if not isinstance(tag, basestring):
          raise TypeError('In %s, tag %s is not a string' % (self, tag))
        result = result.value
      if isinstance(result, WindowedValue):
        windowed_value = result
        if (windowed_input_element is not None
            and len(windowed_input_element.windows) != 1):
          windowed_value.windows *= len(windowed_input_element.windows)
      elif isinstance(result, TimestampedValue):
        assign_context = WindowFn.AssignContext(result.timestamp, result.value)
        windowed_value = WindowedValue(
            result.value, result.timestamp,
            self.window_fn.assign(assign_context))
        # Same guard as the WindowedValue branch above: without an input
        # element there is no window count to replicate by.
        if (windowed_input_element is not None
            and len(windowed_input_element.windows) != 1):
          windowed_value.windows *= len(windowed_input_element.windows)
      else:
        windowed_value = windowed_input_element.with_value(result)
      if tag is None:
        self.main_receivers.receive(windowed_value)
      else:
        self.tagged_receivers[tag].receive(windowed_value)
Esempio n. 3
0
  def _execute(
      self, window_fn, trigger_fn, accumulation_mode, timestamp_combiner,
      transcript, unused_spec):
    """Replay ``transcript`` through a GeneralTriggerDriver and check outputs.

    ``transcript`` is an iterable of ``(action, params)`` pairs:

      * 'input': every value in params is wrapped in a WindowedValue that uses
        the value for both of its first two arguments, and the resulting
        bundle is processed by the driver.
      * 'watermark': params becomes the current watermark and timers are
        fired.
      * 'expect': every entry of params has to match one pending output,
        which is then removed from the pending list.
      * 'state': currently ignored.

    Any other action fails the test. The test also fails when output is still
    pending at the start of a non-'expect' action or at the end of the
    transcript.
    """
    windowing = Windowing(
        window_fn, trigger_fn, accumulation_mode, timestamp_combiner)
    driver = GeneralTriggerDriver(windowing, TestClock())
    state = InMemoryUnmergedState()
    output = []
    watermark = MIN_TIMESTAMP
    no_match = object()

    def fire_timers():
      # Keep querying until no timers are returned for the current watermark.
      while True:
        due = state.get_and_clear_timers(watermark)
        if not due:
          return
        for timer_window, (name, time_domain, t_timestamp) in due:
          fired = driver.process_timer(
              timer_window, name, time_domain, t_timestamp, state)
          for wvalue in fired:
            output.append(_windowed_value_info(wvalue))

    def agrees(candidate, expected_output):
      # Only the keys present in both mappings are compared.
      return all(candidate[k] == expected_output[k]
                 for k in candidate if k in expected_output)

    for action, params in transcript:

      if action != 'expect':
        # Fail if we have output that was not expected in the transcript.
        self.assertEqual(
            [], output, msg='Unexpected output: %s before %s: %s' % (
                output, action, params))

      if action == 'input':
        bundle = []
        for t in params:
          windows = window_fn.assign(WindowFn.AssignContext(t, t))
          bundle.append(WindowedValue(t, t, windows))
        processed = driver.process_elements(state, bundle, watermark)
        output = [_windowed_value_info(wv) for wv in processed]
        fire_timers()

      elif action == 'watermark':
        watermark = params
        fire_timers()

      elif action == 'expect':
        for expected_output in params:
          found = next(
              (c for c in output if agrees(c, expected_output)), no_match)
          if found is no_match:
            self.fail('Unmatched output %s in %s' % (expected_output, output))
          output.remove(found)

      elif action == 'state':
        # TODO(robertwb): Implement once we support allowed lateness.
        pass

      else:
        self.fail('Unknown action: ' + action)

    # Fail if we have output that was not expected in the transcript.
    self.assertEqual([], output, msg='Unexpected output: %s' % output)
Esempio n. 4
0
 def test_windowfn_encoding(self):
     """Each WindowFn must equal its runner API round-trip decoding."""
     window_fns = [
         GlobalWindows(),
         FixedWindows(37),
         SlidingWindows(2, 389),
         Sessions(5077),
     ]
     for original in window_fns:
         # A fresh pipeline context is used for every window fn.
         context = pipeline_context.PipelineContext()
         decoded = WindowFn.from_runner_api(
             original.to_runner_api(context), context)
         self.assertEqual(original, decoded)
Esempio n. 5
0
 def from_runner_api(proto, context):
   """Build a Windowing from the fields of its runner API ``proto``.

   The window fn and trigger fn are decoded with ``context``; the
   accumulation mode and output time are taken from the proto as they are.
   """
   # Imported here rather than at module level -- presumably to avoid an
   # import cycle; TODO confirm.
   # pylint: disable=wrong-import-order, wrong-import-position
   from apache_beam.transforms.trigger import TriggerFn
   decoded_window_fn = WindowFn.from_runner_api(proto.window_fn, context)
   decoded_trigger_fn = TriggerFn.from_runner_api(proto.trigger, context)
   return Windowing(
       windowfn=decoded_window_fn,
       triggerfn=decoded_trigger_fn,
       accumulation_mode=proto.accumulation_mode,
       output_time_fn=proto.output_time)
Esempio n. 6
0
 def test_windowfn_encoding(self):
   """Verify that encoding then decoding a WindowFn yields an equal one."""
   candidates = (
       GlobalWindows(), FixedWindows(37), SlidingWindows(2, 389),
       Sessions(5077))
   for window_fn in candidates:
     # Encode and decode against the same, freshly created context.
     context = pipeline_context.PipelineContext()
     encoded = window_fn.to_runner_api(context)
     restored = WindowFn.from_runner_api(encoded, context)
     self.assertEqual(window_fn, restored)
Esempio n. 7
0
 def bundle_data(data, size):
   """Yield lists of windowed values built from (timestamp, element) pairs.

   A list is yielded as soon as it holds ``size`` entries; whatever is left
   over at the end of ``data`` is yielded as a final, shorter list.
   """
   pending = []
   for timestamp, elem in data:
     windows = window_fn.assign(WindowFn.AssignContext(timestamp, elem))
     pending.append(WindowedValue(elem, timestamp, windows))
     if len(pending) != size:
       continue
     yield pending
     # Start a new list so that the one just yielded is not mutated.
     pending = []
   if pending:
     yield pending
Esempio n. 8
0
  def process_outputs(
      self, windowed_input_element, results, watermark_estimator=None):
    # type: (WindowedValue, Iterable[Any], Any) -> None

    """Dispatch the result of process computation to the appropriate receivers.

    A value wrapped in a TaggedOutput object will be unwrapped and
    then dispatched to the appropriate indexed output.

    Args:
      windowed_input_element: the windowed element the results were computed
        from, or None. Results that are neither WindowedValue nor
        TimestampedValue are re-wrapped via its ``with_value`` method and
        therefore need it to be non-None.
      results: iterable (possibly a generator) of results, or None.
      watermark_estimator: optional object whose ``observe_timestamp`` is
        called with the timestamp of every dispatched value.

    Raises:
      TypeError: if a TaggedOutput carries a tag that is not a string.
    """
    if results is None:
      # TODO(BEAM-3937): Remove if block after output counter released.
      # Only enable per_element_output_counter when counter cythonized.
      if (self.per_element_output_counter is not None and
          self.per_element_output_counter.is_cythonized):
        self.per_element_output_counter.add_input(0)
      return

    output_element_count = 0
    for result in results:
      # results here may be a generator, which cannot call len on it.
      output_element_count += 1
      tag = None
      if isinstance(result, TaggedOutput):
        tag = result.tag
        if not isinstance(tag, (str, unicode)):
          raise TypeError('In %s, tag %s is not a string' % (self, tag))
        result = result.value
      if isinstance(result, WindowedValue):
        windowed_value = result
        if (windowed_input_element is not None and
            len(windowed_input_element.windows) != 1):
          windowed_value.windows *= len(windowed_input_element.windows)
      elif isinstance(result, TimestampedValue):
        assign_context = WindowFn.AssignContext(result.timestamp, result.value)
        windowed_value = WindowedValue(
            result.value,
            result.timestamp,
            self.window_fn.assign(assign_context))
        # Same guard as the WindowedValue branch above: without an input
        # element there is no window count to replicate by.
        if (windowed_input_element is not None and
            len(windowed_input_element.windows) != 1):
          windowed_value.windows *= len(windowed_input_element.windows)
      else:
        windowed_value = windowed_input_element.with_value(result)
      if watermark_estimator is not None:
        watermark_estimator.observe_timestamp(windowed_value.timestamp)
      if tag is None:
        self.main_receivers.receive(windowed_value)
      else:
        self.tagged_receivers[tag].receive(windowed_value)
    # TODO(BEAM-3937): Remove if block after output counter released.
    # Only enable per_element_output_counter when counter cythonized
    if (self.per_element_output_counter is not None and
        self.per_element_output_counter.is_cythonized):
      self.per_element_output_counter.add_input(output_element_count)
Esempio n. 9
0
    def _process_outputs(self, windowed_input_element, results):
        """Dispatch the result of computation to the appropriate receivers.

        A value wrapped in a SideOutputValue object will be unwrapped and
        then dispatched to the appropriate indexed output.
        """
        if results is None:
            return
        has_input = windowed_input_element is not None
        for result in results:
            tag = None
            if isinstance(result, SideOutputValue):
                tag = result.tag
                if not isinstance(tag, basestring):
                    raise TypeError(
                        'In %s, tag %s is not a string' % (self, tag))
                result = result.value

            if isinstance(result, WindowedValue):
                windowed_value = result
                if has_input:
                    window_count = len(windowed_input_element.windows)
                    if window_count != 1:
                        windowed_value.windows *= window_count
            elif not has_input:
                # Start and finish have no element from which to grab context,
                # but may emit elements.
                if isinstance(result, TimestampedValue):
                    value, timestamp = result.value, result.timestamp
                    assign_context = NoContext(value, timestamp)
                else:
                    # Plain values are given the timestamp -1.
                    value, timestamp = result, -1
                    assign_context = NoContext(value)
                windows = self.window_fn.assign(assign_context)
                windowed_value = WindowedValue(value, timestamp, windows)
            elif isinstance(result, TimestampedValue):
                assign_context = WindowFn.AssignContext(
                    result.timestamp, result.value)
                windowed_value = WindowedValue(
                    result.value, result.timestamp,
                    self.window_fn.assign(assign_context))
                window_count = len(windowed_input_element.windows)
                if window_count != 1:
                    windowed_value.windows *= window_count
            else:
                windowed_value = windowed_input_element.with_value(result)

            # Untagged values go to the main receivers, tagged ones to the
            # receiver registered under their tag.
            if tag is not None:
                self.tagged_receivers[tag].output(windowed_value)
            else:
                self.main_receivers.receive(windowed_value)
Esempio n. 10
0
    def _run_log(self, spec):
        """Replay the trigger transcript described by ``spec`` and check it.

        ``spec`` is a mapping with these keys:

          * 'window_fn' (default 'GlobalWindows') and 'trigger_fn' (default
            'Default'): expressions such as ``Foo(1, kw=Bar(2))`` that are
            resolved against the names of the window module, respectively
            the trigger module.
          * 'accumulation_mode' (default 'ACCUMULATING') and
            'timestamp_combiner' (default 'OUTPUT_AT_EOW'): attribute names,
            upper-cased before lookup on AccumulationMode and
            TimestampCombiner.
          * 'transcript': a sequence of single-entry mappings
            ``{action: params}`` where action is one of 'input',
            'watermark', 'expect' or 'state' ('state' is currently ignored).

        The test fails on an unknown action, on an expectation that matches
        no pending output, and on output that is still pending when a
        non-'expect' action starts or when the transcript ends.
        """
        def parse_int_list(s):
            """Parses strings like '[1, 2, 3]'."""
            s = s.strip()
            assert s[0] == '[' and s[-1] == ']', s
            if not s[1:-1].strip():
                return []
            return [int(x) for x in s[1:-1].split(',')]

        def split_args(s):
            """Splits 'a, b, [c, d]' into ['a', 'b', '[c, d]'].

            A blank string yields no arguments, so that 'Foo()' is parsed as
            a call without arguments.
            """
            if not s.strip():
                return []
            args = []
            start = 0
            depth = 0
            for ix, c in enumerate(s):
                if c in '({[':
                    depth += 1
                elif c in ')}]':
                    depth -= 1
                elif c == ',' and depth == 0:
                    # Only commas outside any bracket separate arguments.
                    args.append(s[start:ix].strip())
                    start = ix + 1
            assert depth == 0, s
            args.append(s[start:].strip())
            return args

        def parse(s, names):
            """Parse (recursive) 'Foo(arg, kw=arg)' for Foo in the names dict."""
            s = s.strip()
            if s in names:
                return names[s]
            elif s[0] == '[':
                return parse_int_list(s)
            elif '(' in s:
                assert s[-1] == ')', s
                callee = parse(s[:s.index('(')], names)
                posargs = []
                kwargs = {}
                for arg in split_args(s[s.index('(') + 1:-1]):
                    if '=' in arg:
                        kw, value = arg.split('=', 1)
                        # Tolerate spaces around '=' in 'kw = value'.
                        kwargs[kw.strip()] = parse(value, names)
                    else:
                        posargs.append(parse(arg, names))
                return callee(*posargs, **kwargs)
            else:
                try:
                    return int(s)
                except ValueError:
                    raise ValueError('Unknown function: %s' % s)

        def parse_fn(s, names):
            """Like parse(), but implicitly calls no-arg constructors."""
            fn = parse(s, names)
            if isinstance(fn, type):
                return fn()
            return fn

        def pane_info(wvalue, window):
            """Describe one output pane as a plain, comparable dict."""
            return {
                'window': [window.start, window.end - 1],
                'values': sorted(wvalue.value),
                'timestamp': wvalue.timestamp
            }

        # pylint: disable=wrong-import-order, wrong-import-position
        from apache_beam.transforms import window as window_module
        # pylint: enable=wrong-import-order, wrong-import-position
        window_fn_names = dict(window_module.__dict__)
        window_fn_names.update({
            'CustomTimestampingFixedWindowsWindowFn':
            CustomTimestampingFixedWindowsWindowFn
        })
        trigger_names = {'Default': DefaultTrigger}
        trigger_names.update(trigger.__dict__)

        window_fn = parse_fn(spec.get('window_fn', 'GlobalWindows'),
                             window_fn_names)
        trigger_fn = parse_fn(spec.get('trigger_fn', 'Default'), trigger_names)
        accumulation_mode = getattr(
            AccumulationMode,
            spec.get('accumulation_mode', 'ACCUMULATING').upper())
        timestamp_combiner = getattr(
            TimestampCombiner,
            spec.get('timestamp_combiner', 'OUTPUT_AT_EOW').upper())

        driver = GeneralTriggerDriver(
            Windowing(window_fn, trigger_fn, accumulation_mode,
                      timestamp_combiner), TestClock())
        state = InMemoryUnmergedState()
        output = []
        watermark = MIN_TIMESTAMP

        def fire_timers():
            # Keep querying until no timers are returned for the watermark.
            to_fire = state.get_and_clear_timers(watermark)
            while to_fire:
                for timer_window, (name, time_domain, t_timestamp) in to_fire:
                    for wvalue in driver.process_timer(timer_window, name,
                                                       time_domain,
                                                       t_timestamp, state):
                        # Timer-fired panes must carry exactly one window.
                        window, = wvalue.windows
                        output.append(pane_info(wvalue, window))
                to_fire = state.get_and_clear_timers(watermark)

        for line in spec['transcript']:

            # Dict views cannot be indexed on Python 3, so take the first
            # entry through an iterator instead.
            action, params = next(iter(line.items()))

            if action != 'expect':
                # Fail if we have output that was not expected in the transcript.
                self.assertEqual([],
                                 output,
                                 msg='Unexpected output: %s before %s' %
                                 (output, line))

            if action == 'input':
                bundle = [
                    WindowedValue(
                        t, t, window_fn.assign(WindowFn.AssignContext(t, t)))
                    for t in params
                ]
                output = [
                    pane_info(wvalue, wvalue.windows[0])
                    for wvalue in driver.process_elements(
                        state, bundle, watermark)
                ]
                fire_timers()

            elif action == 'watermark':
                watermark = params
                fire_timers()

            elif action == 'expect':
                for expected_output in params:
                    for candidate in output:
                        # Only keys present in both dicts are compared.
                        if all(candidate[k] == expected_output[k]
                               for k in candidate if k in expected_output):
                            output.remove(candidate)
                            break
                    else:
                        self.fail('Unmatched output %s in %s' %
                                  (expected_output, output))

            elif action == 'state':
                # TODO(robertwb): Implement once we support allowed lateness.
                pass

            else:
                self.fail('Unknown action: ' + action)

        # Fail if we have output that was not expected in the transcript.
        self.assertEqual([], output, msg='Unexpected output: %s' % output)
Esempio n. 11
0
def context(element, timestamp):
  """Return a WindowFn.AssignContext for ``element`` at ``timestamp``.

  Note that AssignContext takes the timestamp first and the element second,
  i.e. in the opposite order of this function's parameters.
  """
  assign_context = WindowFn.AssignContext(timestamp, element)
  return assign_context