Esempio n. 1
0
    def run_trigger_simple(self, window_fn, trigger_fn, accumulation_mode,
                           timestamped_data, expected_panes, *groupings,
                           **kwargs):
        """Run the trigger once per bundle size and check the panes.

        Args:
          window_fn: window function used to assign windows to each element.
          trigger_fn: trigger under test, forwarded to ``run_trigger``.
          accumulation_mode: accumulation mode, forwarded to ``run_trigger``.
          timestamped_data: iterable of ``(timestamp, element)`` pairs.
          expected_panes: expected result, forwarded to ``run_trigger``.
          *groupings: bundle sizes to try; defaults to a single run with
            bundles of size 1.
          **kwargs: only ``late_data`` (iterable of ``(timestamp, element)``
            pairs, default empty) is accepted.
        """
        late_data = kwargs.pop('late_data', [])
        assert not kwargs, 'Unexpected keyword arguments: %r' % (kwargs,)

        def bundle_data(data, size):
            """Lazily yield lists of at most ``size`` windowed values."""
            bundle = []
            for timestamp, elem in data:
                windows = window_fn.assign(
                    WindowFn.AssignContext(timestamp, elem))
                bundle.append(WindowedValue(elem, timestamp, windows))
                if len(bundle) == size:
                    yield bundle
                    bundle = []
            if bundle:
                yield bundle

        if not groupings:
            groupings = [1]
        # The bundling is done solely by bundle_data(); a previous eager copy
        # of that loop built lists that were never used and consumed
        # timestamped_data before the generators below could read it.
        for group_by in groupings:
            self.run_trigger(window_fn, trigger_fn, accumulation_mode,
                             bundle_data(timestamped_data, group_by),
                             bundle_data(late_data, group_by), expected_panes)
Esempio n. 2
0
    def _process_outputs(self, element, results):
        """Route every computed result to its receiver.

        A result wrapped in a SideOutputValue is unwrapped first and sent to
        the tagged receiver registered under its tag; everything else goes to
        the main receivers. Results that are not already WindowedValue
        instances are wrapped in one before being emitted.

        Args:
          element: the windowed input element the results were computed from,
            or None when there is no input element to take context from.
          results: iterable of results, or None for "no output".

        Raises:
          TypeError: if a SideOutputValue carries a non-string tag.
        """
        if results is None:
            return
        for item in results:
            side_tag = None
            if isinstance(item, SideOutputValue):
                side_tag = item.tag
                if not isinstance(side_tag, basestring):
                    raise TypeError('In %s, tag %s is not a string' %
                                    (self, side_tag))
                item = item.value

            if isinstance(item, WindowedValue):
                # Already carries its own timestamp and windows.
                emitted = item
            elif element is not None:
                if isinstance(item, TimestampedValue):
                    # Re-window using the new timestamp, with the input
                    # element's windows as the existing windows.
                    context = WindowFn.AssignContext(
                        item.timestamp, item.value, element.windows)
                    emitted = WindowedValue(
                        item.value, item.timestamp,
                        self.window_fn.assign(context))
                else:
                    emitted = element.with_value(item)
            else:
                # Start and finish have no element from which to grab context,
                # but may emit elements.
                if isinstance(item, TimestampedValue):
                    payload = item.value
                    when = item.timestamp
                    context = NoContext(payload, when)
                else:
                    payload = item
                    when = -1
                    context = NoContext(payload)
                emitted = WindowedValue(
                    payload, when, self.window_fn.assign(context))

            if side_tag is not None:
                self.tagged_receivers[side_tag].output(emitted)
            else:
                self.main_receivers.output(emitted)
Esempio n. 3
0
 def flush(self, target):
     """Emit and remove up to ``self.size - target`` entries of the table.

     Each table key is a ``(key, windows)`` pair and each table value is a
     sequence of buffered items. For every flushed entry a single
     WindowedValue ``(key, [item.value[1], ...])`` is output, stamped with
     the timestamp of the first buffered item.

     Args:
       target: subtracted from ``self.size`` to get the number of entries to
         flush; nothing is flushed when the difference is not positive.
     """
     # NOTE(review): self.size is not updated here -- presumably the caller
     # maintains it; confirm.
     limit = self.size - target
     # Iterate over a snapshot: entries are deleted inside the loop, and
     # mutating a dict while iterating a live items() view raises
     # RuntimeError on Python 3.
     for ix, (kw, vs) in enumerate(list(self.table.items())):
         if ix >= limit:
             break
         del self.table[kw]
         key, windows = kw
         output_value = [v.value[1] for v in vs]
         windowed_value = WindowedValue((key, output_value),
                                        vs[0].timestamp, windows)
         self.output(windowed_value)
Esempio n. 4
0
 def bundle_data(data, size):
     """Lazily group windowed elements into lists of ``size`` items.

     Args:
       data: iterable of ``(timestamp, element)`` pairs.
       size: number of items after which a bundle is emitted.

     Yields:
       Lists of WindowedValue objects; the final list may be shorter than
       ``size`` and is omitted entirely when it would be empty.
     """
     pending = []
     for when, item in data:
         assigned = window_fn.assign(WindowFn.AssignContext(when, item))
         pending.append(WindowedValue(item, when, assigned))
         if len(pending) != size:
             continue
         yield pending
         pending = []
     # Hand over whatever is left from an incomplete final bundle.
     if pending:
         yield pending
Esempio n. 5
0
    def _output(self, window, finished, state):
        """Build the pane for ``window`` and tidy up its state.

        Args:
          window: the window whose buffered elements are emitted.
          finished: when true, the elements are cleared and a tombstone is
            recorded for the window.
          state: state object holding the per-window elements, tombstone and
            watermark hold.

        Returns:
          A WindowedValue holding the buffered elements in ``window``, stamped
          with the watermark hold if one was set, else with the window's end.
        """
        buffered = state.get_state(window, self.ELEMENTS)

        if finished:
            # TODO(robertwb): allowed lateness
            state.clear_state(window, self.ELEMENTS)
            state.add_state(window, self.TOMBSTONE, 1)
        else:
            if self.accumulation_mode == AccumulationMode.DISCARDING:
                state.clear_state(window, self.ELEMENTS)

        hold = state.get_state(window, self.WATERMARK_HOLD)
        if hold is not None:
            state.clear_state(window, self.WATERMARK_HOLD)
            output_time = hold
        else:
            # If no watermark hold was set, output at end of window.
            output_time = window.end

        return WindowedValue(buffered, output_time, (window, ))
Esempio n. 6
0
    def process_elements(self, state, windowed_values,
                         unused_output_watermark):
        """Yield one global-window pane holding all the unwindowed values.

        Args:
          state: not used by this implementation.
          windowed_values: a list, or any other iterable, of objects exposing
            a ``value`` attribute.
          unused_output_watermark: not used by this implementation.

        Yields:
          A single WindowedValue at MIN_TIMESTAMP in the global window. Its
          value is a plain list when ``windowed_values`` is a list; otherwise
          it is a lazy iterable that notifies its observers of each value as
          it is produced.
        """
        if isinstance(windowed_values, list):
            unwindowed = [wv.value for wv in windowed_values]
        else:

            class UnwindowedValues(observable.ObservableMixin):
                def __iter__(self):
                    for wv in windowed_values:
                        unwindowed_value = wv.value
                        self.notify_observers(unwindowed_value)
                        yield unwindowed_value

                def __repr__(self):
                    # Wrap in a 1-tuple: a tuple input would otherwise be
                    # unpacked as the format arguments and raise TypeError.
                    return '<UnwindowedValues of %s>' % (windowed_values,)

            unwindowed = UnwindowedValues()
        yield WindowedValue(unwindowed, MIN_TIMESTAMP,
                            self.GLOBAL_WINDOW_TUPLE)
Esempio n. 7
0
  def Write(self, windowed_kv):
    """Buffer one windowed key/value pair as a message for its key.

    The value is re-wrapped (without the key) in a WindowedValue carrying the
    original timestamp and windows, encoded, and appended to the message
    bundle kept for the encoded key in ``self.keyed_output``.

    Args:
      windowed_kv: windowed value whose ``value`` is a ``(key, value)`` pair.
    """
    # WindmillWriter takes windowed values, reifies the windows and writes the
    # resulting windowed value to Windmill.  Note that in this streaming case,
    # the service does not add a ReifyWindows step, so we do that here.
    key, payload = windowed_kv.value
    event_time = windowed_kv.timestamp
    windmill_time = harness_to_windmill_timestamp(event_time)
    reified = WindowedValue(payload, event_time, windowed_kv.windows)

    key_bytes = self.key_coder.encode(key)
    payload_bytes = self.wv_coder.encode(reified)

    # Create the per-key bundle lazily, the first time the key is seen.
    if key_bytes not in self.keyed_output:
      new_bundle = self.windmill_pb2.KeyedMessageBundle(key=key_bytes)
      self.keyed_output[key_bytes] = new_bundle

    # TODO(ccy): In the future, we will populate metadata with PaneInfo
    # details.
    self.keyed_output[key_bytes].messages.add(
        timestamp=windmill_time, data=payload_bytes, metadata='')
Esempio n. 8
0
 def output_key(self, wkey, value):
     """Output ``(key, value)`` in the windows carried by ``wkey``.

     Args:
       wkey: a ``(windows, key)`` pair.
       value: the value to pair with the key.

     The emitted WindowedValue is stamped with the end of the first window.
     """
     windows, key = wkey
     end_of_first_window = windows[0].end
     keyed = (key, value)
     self.output(WindowedValue(keyed, end_of_first_window, windows))
 def decode_from_stream(self, in_stream, nested):
     """Read a WindowedValue from ``in_stream``.

     The value, timestamp and windows are decoded in that order, each with
     its own component coder and always in nested mode; the ``nested``
     argument itself is not consulted.
     """
     # The three reads consume the stream sequentially, so order matters.
     value = self._value_coder.decode_from_stream(in_stream, True)
     timestamp = self._timestamp_coder.decode_from_stream(in_stream, True)
     windows = self._windows_coder.decode_from_stream(in_stream, True)
     return WindowedValue(value, timestamp, windows)
Esempio n. 10
0
    def _run_log(self, spec):
        """Replay a transcript spec against a GeneralTriggerDriver.

        Args:
          spec: mapping with a required 'transcript' entry (a sequence of
            single-entry mappings ``{action: params}``) and optional
            'window_fn', 'trigger_fn', 'accumulation_mode', 'output_time_fn'
            and 'allowed_lateness' entries given as strings.

        Supported actions are 'input' (timestamps to feed as elements),
        'watermark' (advance the watermark and fire timers), 'expect'
        (outputs that must have been produced) and 'state' (ignored for now).
        The test fails on unmatched expectations, on output that no 'expect'
        consumed, and on unknown actions.
        """
        def parse_int_list(s):
            """Parses strings like '[1, 2, 3]'."""
            s = s.strip()
            assert s[0] == '[' and s[-1] == ']', s
            inner = s[1:-1].strip()
            if not inner:
                return []
            return [int(x) for x in inner.split(',')]

        def split_args(s):
            """Splits 'a, b, [c, d]' into ['a', 'b', '[c, d]']."""
            if not s.strip():
                # An empty argument list, as in 'Foo()', has no arguments;
                # returning [''] would make parse() fail on the empty string.
                return []
            args = []
            start = 0
            depth = 0
            for ix, c in enumerate(s):
                if c in '({[':
                    depth += 1
                elif c in ')}]':
                    depth -= 1
                elif c == ',' and depth == 0:
                    args.append(s[start:ix].strip())
                    start = ix + 1
            assert depth == 0, s
            args.append(s[start:].strip())
            return args

        def parse(s, names):
            """Parse (recursive) 'Foo(arg, kw=arg)' for Foo in the names dict."""
            s = s.strip()
            if s in names:
                return names[s]
            elif s.startswith('['):
                return parse_int_list(s)
            elif '(' in s:
                assert s[-1] == ')', s
                open_paren = s.index('(')
                callee = parse(s[:open_paren], names)
                posargs = []
                kwargs = {}
                for arg in split_args(s[open_paren + 1:-1]):
                    if '=' in arg:
                        kw, value = arg.split('=', 1)
                        # Tolerate spaces around '=' in 'kw = value'.
                        kwargs[kw.strip()] = parse(value, names)
                    else:
                        posargs.append(parse(arg, names))
                return callee(*posargs, **kwargs)
            else:
                try:
                    return int(s)
                except ValueError:
                    raise ValueError('Unknown function: %s' % s)

        def parse_fn(s, names):
            """Like parse(), but implicitly calls no-arg constructors."""
            fn = parse(s, names)
            if isinstance(fn, type):
                return fn()
            else:
                return fn

        # pylint: disable=g-import-not-at-top
        from google.cloud.dataflow.transforms import window as window_module
        from google.cloud.dataflow.transforms import trigger as trigger_module
        # pylint: enable=g-import-not-at-top
        window_fn_names = dict(window_module.__dict__)
        window_fn_names.update({
            'CustomTimestampingFixedWindowsWindowFn':
            CustomTimestampingFixedWindowsWindowFn
        })
        trigger_names = {'Default': DefaultTrigger}
        trigger_names.update(trigger_module.__dict__)

        window_fn = parse_fn(spec.get('window_fn', 'GlobalWindows'),
                             window_fn_names)
        trigger_fn = parse_fn(spec.get('trigger_fn', 'Default'), trigger_names)
        accumulation_mode = getattr(
            AccumulationMode,
            spec.get('accumulation_mode', 'ACCUMULATING').upper())
        output_time_fn = getattr(
            OutputTimeFn,
            spec.get('output_time_fn', 'OUTPUT_AT_EOW').upper())
        # Parsed (and thereby validated) but not otherwise used in this
        # method.
        allowed_lateness = float(spec.get('allowed_lateness', '-inf'))

        driver = GeneralTriggerDriver(
            Windowing(window_fn, trigger_fn, accumulation_mode,
                      output_time_fn))
        state = InMemoryUnmergedState()
        output = []
        watermark = MIN_TIMESTAMP

        def describe_pane(wvalue, window):
            """Summarizes an emitted pane as a plain dict for matching."""
            return {
                'window': [window.start, window.end - 1],
                'values': sorted(wvalue.value),
                'timestamp': wvalue.timestamp
            }

        def fire_timers():
            """Fires timers up to the watermark until none are left."""
            to_fire = state.get_and_clear_timers(watermark)
            while to_fire:
                for timer_window, (name, time_domain, t_timestamp) in to_fire:
                    for wvalue in driver.process_timer(timer_window, name,
                                                       time_domain,
                                                       t_timestamp, state):
                        # Timer output must be in exactly one window.
                        window, = wvalue.windows
                        output.append(describe_pane(wvalue, window))
                # Processing timers may have set new ones that are now due.
                to_fire = state.get_and_clear_timers(watermark)

        for line in spec['transcript']:

            # Each transcript line is a single-entry mapping; list() keeps
            # this working where items() returns a non-indexable view.
            action, params = list(line.items())[0]

            if action != 'expect':
                # Fail if we have output that was not expected in the transcript.
                self.assertEqual([],
                                 output,
                                 msg='Unexpected output: %s before %s' %
                                 (output, line))

            if action == 'input':
                bundle = [
                    WindowedValue(
                        t, t, window_fn.assign(WindowFn.AssignContext(t, t)))
                    for t in params
                ]
                output = [
                    describe_pane(wvalue, wvalue.windows[0])
                    for wvalue in driver.process_elements(
                        state, bundle, watermark)
                ]
                fire_timers()

            elif action == 'watermark':
                watermark = params
                fire_timers()

            elif action == 'expect':
                for expected_output in params:
                    for candidate in output:
                        # Only keys present in both dicts are compared.
                        if all(candidate[k] == expected_output[k]
                               for k in candidate if k in expected_output):
                            output.remove(candidate)
                            break
                    else:
                        self.fail('Unmatched output %s in %s' %
                                  (expected_output, output))

            elif action == 'state':
                # TODO(robertwb): Implement once we support allowed lateness.
                pass

            else:
                self.fail('Unknown action: ' + action)

        # Fail if we have output that was not expected in the transcript.
        self.assertEqual([], output, msg='Unexpected output: %s' % output)
Esempio n. 11
0
 def process(self, context):
   """Re-window the current element with ``self.windowing.windowfn``.

   An AssignContext is built from the incoming context's timestamp, element
   and windows; the yielded WindowedValue takes its element and timestamp
   from that AssignContext and its windows from the window function.
   """
   assign_context = WindowFn.AssignContext(
       context.timestamp,
       element=context.element,
       existing_windows=context.windows)
   assigned = self.windowing.windowfn.assign(assign_context)
   yield WindowedValue(
       assign_context.element, assign_context.timestamp, assigned)
Esempio n. 12
0
 def timestamped_key_values(self, pipeline, key, *timestamps):
   """Apply Create and Map steps producing ``(key, t)`` per timestamp.

   Each timestamp ``t`` is mapped to a WindowedValue whose value is
   ``(key, t)``, whose timestamp is ``t`` and whose window list is empty.
   """
   created = pipeline | Create('start', timestamps)
   return created | Map(lambda t: WindowedValue((key, t), t, []))