Beispiel #1
0
  def _invoke_per_window(
      self, windowed_value, additional_args,
      additional_kwargs, output_processor):
    """Calls the process method for one windowed value and emits its outputs.

    Args:
      windowed_value: value to process; its ``value`` and ``timestamp`` are
        read for placeholders, and on the windowed path ``windows`` must hold
        exactly one window.
      additional_args: extra values appended after the per-window side inputs
        when placeholders are filled. Only read on the windowed path.
      additional_kwargs: extra keyword arguments merged into the keyword
        arguments of the call; a falsy value is ignored.
      output_processor: its ``process_outputs`` receives ``windowed_value``
        and whatever the process method returned.
    """
    if self.has_windowed_inputs:
      # The single-target unpacking fails unless there is exactly one window.
      window, = windowed_value.windows
      fill_values = [side_input[window] for side_input in self.side_inputs]
      fill_values.extend(additional_args)
      call_args, call_kwargs = util.insert_values_in_args(
          self.args_for_process, self.kwargs_for_process, fill_values)
    else:
      if self.cache_globally_windowed_args:
        # The flag is cleared before resolving, so the side inputs are looked
        # up in the global window only once and the filled-in arguments are
        # stored back on the instance for subsequent calls.
        self.cache_globally_windowed_args = False
        global_window = GlobalWindow()
        resolved = util.insert_values_in_args(
            self.args_for_process, self.kwargs_for_process,
            [side_input[global_window] for side_input in self.side_inputs])
        self.args_for_process, self.kwargs_for_process = resolved
      call_args = self.args_for_process
      call_kwargs = self.kwargs_for_process

    # TODO(sourabhbajaj): Investigate why we can't use `is` instead of ==
    for position, param in self.placeholders:
      if param == core.DoFn.ElementParam:
        call_args[position] = windowed_value.value
      elif param == core.DoFn.WindowParam:
        call_args[position] = window
      elif param == core.DoFn.TimestampParam:
        call_args[position] = windowed_value.timestamp

    if additional_kwargs:
      if call_kwargs is None:
        call_kwargs = additional_kwargs
      else:
        for name in additional_kwargs:
          call_kwargs[name] = additional_kwargs[name]

    # Keyword expansion is skipped entirely when there is nothing to expand
    # (this also covers call_kwargs being None).
    if call_kwargs:
      outputs = self.process_method(*call_args, **call_kwargs)
    else:
      outputs = self.process_method(*call_args)
    output_processor.process_outputs(windowed_value, outputs)
Beispiel #2
0
  def _invoke_per_window(
      self, windowed_value, additional_args,
      additional_kwargs, output_processor):
    """Runs the process method on a single windowed value.

    Args:
      windowed_value: the input; ``value`` and ``timestamp`` feed the
        element/timestamp placeholders, and ``windows`` must contain exactly
        one entry when windowed inputs are in use.
      additional_args: values appended to the per-window side inputs before
        placeholders are filled (windowed path only).
      additional_kwargs: keyword arguments merged into those passed to the
        process method; ignored when falsy.
      output_processor: receives the windowed value together with the result
        of the process method via ``process_outputs``.
    """
    if self.has_windowed_inputs:
      window, = windowed_value.windows
      values_to_insert = [si[window] for si in self.side_inputs]
      values_to_insert.extend(additional_args)
      process_args, process_kwargs = util.insert_values_in_args(
          self.args_for_process, self.kwargs_for_process, values_to_insert)
    else:
      if self.cache_globally_windowed_args:
        # Resolve the side inputs against the global window a single time;
        # the flag is dropped first and the result replaces the stored
        # argument template.
        self.cache_globally_windowed_args = False
        global_window = GlobalWindow()
        filled = util.insert_values_in_args(
            self.args_for_process, self.kwargs_for_process,
            [si[global_window] for si in self.side_inputs])
        self.args_for_process, self.kwargs_for_process = filled
      process_args = self.args_for_process
      process_kwargs = self.kwargs_for_process

    # TODO(sourabhbajaj): Investigate why we can't use `is` instead of ==
    for slot, placeholder in self.placeholders:
      if placeholder == core.DoFn.ElementParam:
        process_args[slot] = windowed_value.value
      elif placeholder == core.DoFn.WindowParam:
        process_args[slot] = window
      elif placeholder == core.DoFn.TimestampParam:
        process_args[slot] = windowed_value.timestamp

    if additional_kwargs:
      if process_kwargs is None:
        process_kwargs = additional_kwargs
      else:
        for kwarg_name in additional_kwargs:
          process_kwargs[kwarg_name] = additional_kwargs[kwarg_name]

    # Only expand keyword arguments when there are some to pass.
    if process_kwargs:
      result = self.process_method(*process_args, **process_kwargs)
    else:
      result = self.process_method(*process_args)
    output_processor.process_outputs(windowed_value, result)
Beispiel #3
0
 def test_insert_values_in_args(self):
   """Checks that placeholders in args and kwargs are replaced by values."""
   values = ['a', 'b']
   args = [1, ArgumentPlaceholder()]
   kwargs = {'x': 1, 'y': ArgumentPlaceholder()}
   args, kwargs = insert_values_in_args(args, kwargs, values)
   # assertEqual is used instead of the deprecated assertEquals alias, which
   # no longer exists in unittest as of Python 3.12.
   self.assertEqual([1, 'a'], args)
   self.assertEqual({'x': 1, 'y': 'b'}, kwargs)
Beispiel #4
0
    def _invoke_per_window(self, windowed_value):
        """Calls the process method for one windowed value.

        Placeholders recorded in ``self.placeholders`` are overwritten with
        the element value, the context, the window or the timestamp before
        the call; the result is handed to ``self.output_processor``.

        Args:
          windowed_value: the input; ``windows`` must contain exactly one
            window when windowed inputs are in use.
        """
        if not self.has_windowed_inputs:
            call_args = self.args_for_process
            call_kwargs = self.kwargs_for_process
        else:
            # Fails unless the value carries exactly one window.
            window, = windowed_value.windows
            per_window_inputs = [
                side_input[window] for side_input in self.side_inputs
            ]
            call_args, call_kwargs = util.insert_values_in_args(
                self.args_for_process, self.kwargs_for_process,
                per_window_inputs)

        # TODO(sourabhbajaj): Investigate why we can't use `is` instead of ==
        for position, param in self.placeholders:
            if param == core.DoFn.ElementParam:
                call_args[position] = windowed_value.value
            elif param == core.DoFn.ContextParam:
                call_args[position] = self.context
            elif param == core.DoFn.WindowParam:
                call_args[position] = window
            elif param == core.DoFn.TimestampParam:
                call_args[position] = windowed_value.timestamp

        # Keyword expansion only happens when there are keyword arguments.
        if call_kwargs:
            outputs = self.process_method(*call_args, **call_kwargs)
        else:
            outputs = self.process_method(*call_args)
        self.output_processor.process_outputs(windowed_value, outputs)
Beispiel #5
0
 def test_insert_values_in_args_nothing_to_insert(self):
     """With no placeholders and no values, args and kwargs compare equal."""
     result_args, result_kwargs = insert_values_in_args(
         [1, 'a'], {'x': 1, 'y': 'b'}, [])
     self.assertEqual([1, 'a'], result_args)
     self.assertEqual({'x': 1, 'y': 'b'}, result_kwargs)
Beispiel #6
0
 def test_insert_values_in_args(self):
     """Placeholders in args and kwargs are filled from the values list."""
     filled_args, filled_kwargs = insert_values_in_args(
         [1, ArgumentPlaceholder()],
         {'x': 1, 'y': ArgumentPlaceholder()},
         ['a', 'b'])
     self.assertEqual([1, 'a'], filled_args)
     self.assertEqual({'x': 1, 'y': 'b'}, filled_kwargs)
Beispiel #7
0
    def _invoke_per_window(self, windowed_value, additional_args,
                           additional_kwargs, output_processor):
        """Calls the process method for one windowed value.

        Args:
          windowed_value: the input; ``value`` and ``timestamp`` feed the
            matching placeholders and ``windows`` must hold exactly one
            window when windowed inputs are in use.
          additional_args: values appended after the per-window side inputs
            when placeholders are filled (windowed path only).
          additional_kwargs: keyword arguments merged into those of the
            call; ignored when falsy.
          output_processor: its ``process_outputs`` receives the windowed
            value and the process method's result.
        """
        if not self.has_windowed_inputs:
            call_args = self.args_for_process
            call_kwargs = self.kwargs_for_process
        else:
            # Fails unless the value carries exactly one window.
            window, = windowed_value.windows
            fill_values = [
                side_input[window] for side_input in self.side_inputs
            ]
            fill_values.extend(additional_args)
            call_args, call_kwargs = util.insert_values_in_args(
                self.args_for_process, self.kwargs_for_process, fill_values)

        # TODO(sourabhbajaj): Investigate why we can't use `is` instead of ==
        for position, param in self.placeholders:
            if param == core.DoFn.ElementParam:
                call_args[position] = windowed_value.value
            elif param == core.DoFn.WindowParam:
                call_args[position] = window
            elif param == core.DoFn.TimestampParam:
                call_args[position] = windowed_value.timestamp

        if additional_kwargs:
            if call_kwargs is None:
                call_kwargs = additional_kwargs
            else:
                for name in additional_kwargs:
                    call_kwargs[name] = additional_kwargs[name]

        # Keyword expansion only happens when there are keyword arguments.
        if call_kwargs:
            outputs = self.process_method(*call_args, **call_kwargs)
        else:
            outputs = self.process_method(*call_args)
        output_processor.process_outputs(windowed_value, outputs)
Beispiel #8
0
  def _invoke_per_window(self, windowed_value):
    """Calls the process method for one windowed value.

    Element, window and timestamp placeholders listed in
    ``self.placeholders`` are overwritten before the call, and the result is
    passed to ``self.output_processor.process_outputs``.

    Args:
      windowed_value: the input; ``windows`` must hold exactly one window
        when windowed inputs are in use.
    """
    if not self.has_windowed_inputs:
      call_args = self.args_for_process
      call_kwargs = self.kwargs_for_process
    else:
      # Fails unless the value carries exactly one window.
      window, = windowed_value.windows
      per_window_inputs = [si[window] for si in self.side_inputs]
      call_args, call_kwargs = util.insert_values_in_args(
          self.args_for_process, self.kwargs_for_process, per_window_inputs)

    # TODO(sourabhbajaj): Investigate why we can't use `is` instead of ==
    for position, param in self.placeholders:
      if param == core.DoFn.ElementParam:
        call_args[position] = windowed_value.value
      elif param == core.DoFn.WindowParam:
        call_args[position] = window
      elif param == core.DoFn.TimestampParam:
        call_args[position] = windowed_value.timestamp

    # Keyword expansion only happens when there are keyword arguments.
    if call_kwargs:
      outputs = self.process_method(*call_args, **call_kwargs)
    else:
      outputs = self.process_method(*call_args)
    self.output_processor.process_outputs(windowed_value, outputs)
Beispiel #9
0
 def test_insert_values_in_args_nothing_to_insert(self):
   """Checks that args and kwargs without placeholders come back equal."""
   values = []
   args = [1, 'a']
   kwargs = {'x': 1, 'y': 'b'}
   args, kwargs = insert_values_in_args(args, kwargs, values)
   # assertEqual is used instead of the deprecated assertEquals alias, which
   # no longer exists in unittest as of Python 3.12.
   self.assertEqual([1, 'a'], args)
   self.assertEqual({'x': 1, 'y': 'b'}, kwargs)
Beispiel #10
0
 def _dofn_invoker(self, element):
     """Dispatches an element to the window invoker.

     With windowed inputs the window invoker is called once per window of the
     element, with the side inputs resolved for that window; otherwise it is
     called a single time with the stored arguments and ``None`` as window.
     """
     self.context.set_element(element)
     if not self.has_windowed_inputs:
         # Nothing depends on the window, so one call is enough.
         self._dofn_window_invoker(element, self.args, self.kwargs, None)
         return

     for current_window in element.windows:
         per_window_inputs = [
             side_input[current_window] for side_input in self.side_inputs
         ]
         window_args, window_kwargs = util.insert_values_in_args(
             self.args, self.kwargs, per_window_inputs)
         self._dofn_window_invoker(
             element, window_args, window_kwargs, current_window)
Beispiel #11
0
  def expand(self, pcoll):
    """Applies a ParDo of CombineValuesDoFn to ``pcoll``.

    The stored args/kwargs have their placeholders filled from
    ``self.side_inputs``; the key type is taken from the input element type
    when one is declared.
    """
    fill_args, fill_kwargs = util.insert_values_in_args(
        self.args, self.kwargs, self.side_inputs)

    element_type = pcoll.element_type
    if element_type is None:
      key_type = None
    else:
      # Unpacks into exactly two entries; only the first one is used.
      key_type, _ = element_type.tuple_types

    type_options = pcoll.pipeline.options.view_as(TypeOptions)
    runtime_type_check = type_options.runtime_type_check

    combine_do_fn = CombineValuesDoFn(key_type, self.fn, runtime_type_check)
    return pcoll | ParDo(combine_do_fn, *fill_args, **fill_kwargs)
Beispiel #12
0
 def _dofn_per_window_invoker(self, element):
   """Calls ``self.dofn_process`` for one element and processes the outputs.

   Placeholders recorded in ``self.placeholders`` are overwritten with the
   element value, the context, the window or the timestamp before the call.
   With windowed inputs the element must carry exactly one window.
   """
   if not self.has_windowed_inputs:
     call_args, call_kwargs = self.args, self.kwargs
   else:
     # Fails unless the element carries exactly one window.
     window, = element.windows
     per_window_inputs = [si[window] for si in self.side_inputs]
     call_args, call_kwargs = util.insert_values_in_args(
         self.args, self.kwargs, per_window_inputs)

   # TODO(sourabhbajaj): Investigate why we can't use `is` instead of ==
   for position, param in self.placeholders:
     if param == core.DoFn.ElementParam:
       call_args[position] = element.value
     elif param == core.DoFn.ContextParam:
       call_args[position] = self.context
     elif param == core.DoFn.WindowParam:
       call_args[position] = window
     elif param == core.DoFn.TimestampParam:
       call_args[position] = element.timestamp

   # Keyword expansion only happens when there are keyword arguments.
   if call_kwargs:
     outputs = self.dofn_process(*call_args, **call_kwargs)
   else:
     outputs = self.dofn_process(*call_args)
   self._process_outputs(element, outputs)
Beispiel #13
0
 def _dofn_per_window_invoker(self, element):
   """Invokes ``self.dofn_process`` on a single element.

   The element value, context, window and timestamp are written into the
   positions listed in ``self.placeholders``; the result of the call goes to
   ``self._process_outputs``. With windowed inputs the element must carry
   exactly one window.
   """
   if not self.has_windowed_inputs:
     process_args, process_kwargs = self.args, self.kwargs
   else:
     window, = element.windows
     values_to_insert = [
         side_input[window] for side_input in self.side_inputs]
     process_args, process_kwargs = util.insert_values_in_args(
         self.args, self.kwargs, values_to_insert)

   # TODO(sourabhbajaj): Investigate why we can't use `is` instead of ==
   for slot, placeholder in self.placeholders:
     if placeholder == core.DoFn.ElementParam:
       process_args[slot] = element.value
     elif placeholder == core.DoFn.ContextParam:
       process_args[slot] = self.context
     elif placeholder == core.DoFn.WindowParam:
       process_args[slot] = window
     elif placeholder == core.DoFn.TimestampParam:
       process_args[slot] = element.timestamp

   # Only expand keyword arguments when there are some to pass.
   if process_kwargs:
     result = self.dofn_process(*process_args, **process_kwargs)
   else:
     result = self.dofn_process(*process_args)
   self._process_outputs(element, result)
Beispiel #14
0
  def _invoke_per_window(
      self, windowed_value, additional_args,
      additional_kwargs, output_processor):
    """Calls the process method for one windowed value.

    Args:
      windowed_value: the input; ``value`` and ``timestamp`` feed the
        matching placeholders and ``windows`` must hold exactly one window
        when windowed inputs are in use.
      additional_args: values appended after the per-window side inputs when
        placeholders are filled (windowed path only).
      additional_kwargs: keyword arguments merged into those of the call;
        ignored when falsy.
      output_processor: its ``process_outputs`` receives the windowed value
        and the process method's result.
    """
    if not self.has_windowed_inputs:
      call_args = self.args_for_process
      call_kwargs = self.kwargs_for_process
    else:
      # Fails unless the value carries exactly one window.
      window, = windowed_value.windows
      fill_values = [side_input[window] for side_input in self.side_inputs]
      fill_values.extend(additional_args)
      call_args, call_kwargs = util.insert_values_in_args(
          self.args_for_process, self.kwargs_for_process, fill_values)

    # TODO(sourabhbajaj): Investigate why we can't use `is` instead of ==
    for position, param in self.placeholders:
      if param == core.DoFn.ElementParam:
        call_args[position] = windowed_value.value
      elif param == core.DoFn.WindowParam:
        call_args[position] = window
      elif param == core.DoFn.TimestampParam:
        call_args[position] = windowed_value.timestamp

    if additional_kwargs:
      if call_kwargs is None:
        call_kwargs = additional_kwargs
      else:
        for name in additional_kwargs:
          call_kwargs[name] = additional_kwargs[name]

    # Keyword expansion only happens when there are keyword arguments.
    if call_kwargs:
      outputs = self.process_method(*call_args, **call_kwargs)
    else:
      outputs = self.process_method(*call_args)
    output_processor.process_outputs(windowed_value, outputs)
Beispiel #15
0
 def process(context):
     """Calls ``fn.process`` with side inputs taken at the first window."""
     first_window = context.windows[0]
     windowed_side_inputs = [
         side_input[first_window] for side_input in side_inputs
     ]
     cur_args, cur_kwargs = util.insert_values_in_args(
         args, kwargs, windowed_side_inputs)
     return fn.process(context, *cur_args, **cur_kwargs)
Beispiel #16
0
  def _invoke_process_per_window(
      self, windowed_value, additional_args,
      additional_kwargs, output_processor):
    """Calls the process method for one windowed value.

    Args:
      windowed_value: the input; ``value`` and ``timestamp`` feed the
        matching placeholders and ``windows`` must hold exactly one window
        when windowed inputs are in use.
      additional_args: values appended after the per-window side inputs when
        placeholders are filled (windowed path only).
      additional_kwargs: keyword arguments merged into those of the call;
        ignored when falsy.
      output_processor: its ``process_outputs`` receives the windowed value
        and the process method's result.

    Returns:
      None, unless ``self.is_splittable`` is set and the restriction tracker
      reports a deferred status; then a tuple of the windowed value rewrapped
      as ``((element, deferred_restriction), size)`` and the deferred
      watermark.

    Raises:
      ValueError: if a user state context is present and the value cannot be
        unpacked into a two-item key/value pair.
    """
    if self.has_windowed_inputs:
      # Fails unless the value carries exactly one window.
      window, = windowed_value.windows
      fill_values = [side_input[window] for side_input in self.side_inputs]
      fill_values.extend(additional_args)
      call_args, call_kwargs = util.insert_values_in_args(
          self.args_for_process, self.kwargs_for_process, fill_values)
    else:
      if self.cache_globally_windowed_args:
        # The flag is cleared before resolving, so the side inputs are looked
        # up in the global window only once and the filled-in arguments are
        # stored back on the instance for subsequent calls.
        self.cache_globally_windowed_args = False
        global_window = GlobalWindow()
        resolved = util.insert_values_in_args(
            self.args_for_process, self.kwargs_for_process,
            [side_input[global_window] for side_input in self.side_inputs])
        self.args_for_process, self.kwargs_for_process = resolved
      call_args = self.args_for_process
      call_kwargs = self.kwargs_for_process

    # Extract key in the case of a stateful DoFn. Note that in the case of a
    # stateful DoFn, we set during __init__ self.has_windowed_inputs to be
    # True. Therefore, windows will be exploded coming into this method, and
    # we can rely on the window variable being set above.
    if self.user_state_context:
      try:
        state_key, unused_value = windowed_value.value
      except (TypeError, ValueError):
        raise ValueError(
            ('Input value to a stateful DoFn must be a KV tuple; instead, '
             'got %s.') % (windowed_value.value,))

    # TODO(sourabhbajaj): Investigate why we can't use `is` instead of ==
    for position, param in self.placeholders:
      if param == core.DoFn.ElementParam:
        call_args[position] = windowed_value.value
      elif param == core.DoFn.WindowParam:
        call_args[position] = window
      elif param == core.DoFn.TimestampParam:
        call_args[position] = windowed_value.timestamp
      elif isinstance(param, core.DoFn.StateParam):
        call_args[position] = self.user_state_context.get_state(
            param.state_spec, state_key, window)
      elif isinstance(param, core.DoFn.TimerParam):
        call_args[position] = self.user_state_context.get_timer(
            param.timer_spec, state_key, window)
      elif param == core.DoFn.BundleFinalizerParam:
        call_args[position] = self.bundle_finalizer_param

    if additional_kwargs:
      if call_kwargs is None:
        call_kwargs = additional_kwargs
      else:
        for name in additional_kwargs:
          call_kwargs[name] = additional_kwargs[name]

    # Keyword expansion only happens when there are keyword arguments.
    if call_kwargs:
      outputs = self.process_method(*call_args, **call_kwargs)
    else:
      outputs = self.process_method(*call_args)
    output_processor.process_outputs(windowed_value, outputs)

    if not self.is_splittable:
      return None
    deferred_status = self.restriction_tracker.deferred_status()
    if not deferred_status:
      return None

    deferred_restriction, deferred_watermark = deferred_status
    element = windowed_value.value
    restriction_provider = self.signature.get_restriction_provider()
    size = restriction_provider.restriction_size(element, deferred_restriction)
    residual = windowed_value.with_value(
        ((element, deferred_restriction), size))
    return (residual, deferred_watermark)
Beispiel #17
0
  def __init__(self, output_processor, signature, context,
               side_inputs, input_args, input_kwargs):
    """Builds the argument template used for each process call.

    Args:
      output_processor: forwarded to the parent initializer.
      signature: forwarded to the parent initializer; its ``process_method``
        supplies the callable, the argument names and the default values,
        and ``do_fn.is_process_bounded()`` is consulted.
      context: stored as ``self.context``.
      side_inputs: iterated more than once; each item must offer
        ``is_globally_windowed()`` and indexing by window.
      input_args: positional arguments, possibly containing placeholders; a
        falsy value is treated as an empty list.
      input_kwargs: keyword arguments; a falsy value becomes an empty dict.

    Raises:
      ValueError: if a parameter whose default is ``SideInputParam`` gets no
        positional value and is missing from the keyword arguments.
    """
    super(PerWindowInvoker, self).__init__(output_processor, signature)
    self.side_inputs = side_inputs
    self.context = context
    self.process_method = signature.process_method.method_value
    default_arg_values = signature.process_method.defaults
    # Per-window work is needed as soon as one side input is not globally
    # windowed, or when the process method asks for the window itself.
    self.has_windowed_inputs = (
        any(not si.is_globally_windowed() for si in side_inputs) or
        (core.DoFn.WindowParam in default_arg_values))

    # Try to prepare all the arguments that can just be filled in
    # without any additional work in the process function.
    # Also cache all the placeholders needed in the process function.
    global_window = GlobalWindow()

    input_args = input_args or []
    input_kwargs = input_kwargs or {}

    if not self.has_windowed_inputs:
      # Every side input is globally windowed: resolve them right away.
      global_side_inputs = [si[global_window] for si in side_inputs]
      input_args, input_kwargs = util.insert_values_in_args(
          input_args, input_kwargs, global_side_inputs)

    arguments = signature.process_method.args
    defaults = signature.process_method.defaults

    # 1 when the process method's argument list includes `self`, else 0.
    self_in_args = int(signature.do_fn.is_process_bounded())

    class ArgPlaceholder(object):
      """Marks a position that is filled in on every invocation."""
      def __init__(self, placeholder):
        self.placeholder = placeholder

    if core.DoFn.ElementParam in default_arg_values:
      args_to_pick = len(arguments) - len(defaults) - self_in_args
      args_with_placeholders = input_args[:args_to_pick]
    else:
      # The element is an implicit first argument, so reserve a slot for it.
      args_to_pick = len(arguments) - len(default_arg_values) - 1 - self_in_args
      args_with_placeholders = (
          [ArgPlaceholder(core.DoFn.ElementParam)] + input_args[:args_to_pick])

    # Walk the defaulted parameters: special DoFn params become placeholders,
    # everything else consumes the next remaining positional value.
    remaining_args_iter = iter(input_args[args_to_pick:])
    for arg_name, default in zip(arguments[-len(defaults):], defaults):
      if (default == core.DoFn.ElementParam or
          default == core.DoFn.WindowParam or
          default == core.DoFn.TimestampParam):
        args_with_placeholders.append(ArgPlaceholder(default))
      else:
        must_be_provided = default == core.DoFn.SideInputParam
        # If no more args are present then the value must be passed via kwarg
        try:
          args_with_placeholders.append(next(remaining_args_iter))
        except StopIteration:
          if must_be_provided and arg_name not in input_kwargs:
            raise ValueError("Value for sideinput %s not provided" % arg_name)
    args_with_placeholders.extend(remaining_args_iter)

    # Stash the list of placeholder positions for performance
    self.placeholders = [
        (position, entry.placeholder)
        for position, entry in enumerate(args_with_placeholders)
        if isinstance(entry, ArgPlaceholder)]

    self.args_for_process = args_with_placeholders
    self.kwargs_for_process = input_kwargs
Beispiel #18
0
 def expand(self, pcoll):
   """Groups ``pcoll`` by key and applies CombineValues labelled 'Combine'.

   The stored args/kwargs have their placeholders filled from
   ``self.side_inputs`` before being passed to CombineValues.
   """
   fill_args, fill_kwargs = util.insert_values_in_args(
       self.args, self.kwargs, self.side_inputs)
   grouped = pcoll | GroupByKey()
   combine = 'Combine' >> CombineValues(self.fn, *fill_args, **fill_kwargs)
   return grouped | combine
Beispiel #19
0
    def __init__(self, output_processor, signature, context, side_inputs,
                 input_args, input_kwargs):
        """Builds the argument template used for each process call.

        Args:
          output_processor: forwarded to the parent initializer.
          signature: forwarded to the parent initializer; its
            ``process_method`` supplies the callable, the argument names and
            the default values, and ``do_fn.is_process_bounded()`` is
            consulted.
          context: stored as ``self.context``.
          side_inputs: iterated more than once; each item must offer
            ``is_globally_windowed()`` and indexing by window.
          input_args: positional arguments, possibly containing
            placeholders; a falsy value is treated as an empty list.
          input_kwargs: keyword arguments; a falsy value becomes an empty
            dict.

        Raises:
          ValueError: if a parameter whose default is ``SideInputParam``
            gets no positional value and is missing from the keyword
            arguments.
        """
        super(PerWindowInvoker, self).__init__(output_processor, signature)
        self.side_inputs = side_inputs
        self.context = context
        self.process_method = signature.process_method.method_value
        default_arg_values = signature.process_method.defaults
        # Per-window work is needed as soon as one side input is not globally
        # windowed, or when the process method asks for the window itself.
        self.has_windowed_inputs = (not all(si.is_globally_windowed()
                                            for si in side_inputs)
                                    or (core.DoFn.WindowParam
                                        in default_arg_values))

        # Try to prepare all the arguments that can just be filled in
        # without any additional work in the process function.
        # Also cache all the placeholders needed in the process function.

        # Fill in sideInputs if they are globally windowed
        global_window = GlobalWindow()

        input_args = input_args if input_args else []
        input_kwargs = input_kwargs if input_kwargs else {}

        if not self.has_windowed_inputs:
            input_args, input_kwargs = util.insert_values_in_args(
                input_args, input_kwargs,
                [si[global_window] for si in side_inputs])

        arguments = signature.process_method.args
        defaults = signature.process_method.defaults

        # 1 when the process method's argument list includes `self`, else 0.
        self_in_args = int(signature.do_fn.is_process_bounded())

        class ArgPlaceholder(object):
            """Marks a position that is filled in on every invocation."""
            def __init__(self, placeholder):
                self.placeholder = placeholder

        # The prefix is copied into a list so that concatenation and append
        # below also work when input_args is a tuple.
        if core.DoFn.ElementParam not in default_arg_values:
            # The element is an implicit first argument: reserve its slot.
            args_to_pick = len(arguments) - len(
                default_arg_values) - 1 - self_in_args
            args_with_placeholders = (
                [ArgPlaceholder(core.DoFn.ElementParam)] +
                list(input_args[:args_to_pick]))
        else:
            args_to_pick = len(arguments) - len(defaults) - self_in_args
            args_with_placeholders = list(input_args[:args_to_pick])

        # Fill the OtherPlaceholders for context, window or timestamp
        remaining_args_iter = iter(input_args[args_to_pick:])
        for a, d in zip(arguments[-len(defaults):], defaults):
            if d == core.DoFn.ElementParam:
                args_with_placeholders.append(ArgPlaceholder(d))
            elif d == core.DoFn.ContextParam:
                args_with_placeholders.append(ArgPlaceholder(d))
            elif d == core.DoFn.WindowParam:
                args_with_placeholders.append(ArgPlaceholder(d))
            elif d == core.DoFn.TimestampParam:
                args_with_placeholders.append(ArgPlaceholder(d))
            elif d == core.DoFn.SideInputParam:
                # If no more args are present then the value must be passed
                # via kwarg. The builtin next() is used because the
                # iterator's .next() method only exists on Python 2.
                try:
                    args_with_placeholders.append(next(remaining_args_iter))
                except StopIteration:
                    if a not in input_kwargs:
                        raise ValueError(
                            "Value for sideinput %s not provided" % a)
            else:
                # If no more args are present then the value must be passed
                # via kwarg (or the parameter's own default applies).
                try:
                    args_with_placeholders.append(next(remaining_args_iter))
                except StopIteration:
                    pass
        args_with_placeholders.extend(remaining_args_iter)

        # Stash the list of placeholder positions for performance
        self.placeholders = [(i, x.placeholder)
                             for (i, x) in enumerate(args_with_placeholders)
                             if isinstance(x, ArgPlaceholder)]

        self.args_for_process = args_with_placeholders
        self.kwargs_for_process = input_kwargs
Beispiel #20
0
    def new_dofn_process(self, element):
        """Calls ``self.dofn.process`` for an element, once per window.

        The argument list is assembled from the element, the context, the
        window, the timestamp and the side-input values according to the
        default values declared on the ``process`` method, and the result of
        each call is passed to ``self._process_outputs``.

        Args:
          element: the input; its ``value``, ``timestamp`` and ``windows``
            are read.

        Raises:
          StopIteration: re-raised when a parameter defaulting to
            ``SideInputParam`` has neither a positional nor a keyword value.
        """
        self.context.set_element(element)
        arguments, _, _, defaults = self.dofn.get_function_arguments('process')
        defaults = defaults if defaults else []

        # 1 when the process method's argument list includes `self`, else 0.
        self_in_args = int(self.dofn.is_process_bounded())

        # Call for the process function for each window if has windowed side inputs
        # or if the process accesses the window parameter. We can just call it once
        # otherwise as none of the arguments are changing
        if self.has_windowed_side_inputs or core.NewDoFn.WindowParam in defaults:
            windows = element.windows
        else:
            windows = [window.GlobalWindow()]

        for w in windows:
            args, kwargs = util.insert_values_in_args(
                self.args, self.kwargs, [s[w] for s in self.side_inputs])

            # If there are more arguments than the default then the first argument
            # should be the element and the rest should be picked from the side
            # inputs as window and timestamp should always be tagged
            if len(arguments) > len(defaults) + self_in_args:
                if core.NewDoFn.ElementParam not in defaults:
                    args_to_pick = len(arguments) - len(
                        defaults) - 1 - self_in_args
                    final_args = [element.value] + args[:args_to_pick]
                else:
                    args_to_pick = len(arguments) - len(
                        defaults) - self_in_args
                    final_args = args[:args_to_pick]
            else:
                args_to_pick = 0
                final_args = []
            remaining_args = iter(args[args_to_pick:])

            for a, d in zip(arguments[-len(defaults):], defaults):
                if d == core.NewDoFn.ElementParam:
                    final_args.append(element.value)
                elif d == core.NewDoFn.ContextParam:
                    final_args.append(self.context)
                elif d == core.NewDoFn.WindowParam:
                    final_args.append(w)
                elif d == core.NewDoFn.TimestampParam:
                    final_args.append(element.timestamp)
                elif d == core.NewDoFn.SideInputParam:
                    # If no more args are present then the value must be
                    # passed via kwarg. The builtin next() is used because
                    # the iterator's .next() method only exists on Python 2.
                    try:
                        final_args.append(next(remaining_args))
                    except StopIteration:
                        if a not in kwargs:
                            raise
                else:
                    # If no more args are present then the value must be
                    # passed via kwarg; fall back to the declared default.
                    try:
                        final_args.append(next(remaining_args))
                    except StopIteration:
                        if a not in kwargs:
                            kwargs[a] = d
            final_args.extend(remaining_args)
            self._process_outputs(element,
                                  self.dofn.process(*final_args, **kwargs))
Beispiel #21
0
    def _invoke_per_window(self, windowed_value, additional_args,
                           additional_kwargs, output_processor):
        """Calls the process method for one windowed value.

        Args:
          windowed_value: the input; ``value`` and ``timestamp`` feed the
            matching placeholders and ``windows`` must hold exactly one
            window when windowed inputs are in use.
          additional_args: values appended after the per-window side inputs
            when placeholders are filled (windowed path only).
          additional_kwargs: keyword arguments merged into those of the
            call; ignored when falsy.
          output_processor: its ``process_outputs`` receives the windowed
            value and the process method's result.

        Returns:
          None, unless ``self.is_splittable`` is set and the restriction
          tracker reports a deferred status; then a tuple of the windowed
          value rewrapped as ``((element, deferred_restriction), size)`` and
          the deferred watermark.

        Raises:
          ValueError: if a user state context is present and the value
            cannot be unpacked into a two-item key/value pair.
        """
        if self.has_windowed_inputs:
            # Fails unless the value carries exactly one window.
            window, = windowed_value.windows
            fill_values = [
                side_input[window] for side_input in self.side_inputs
            ]
            fill_values.extend(additional_args)
            call_args, call_kwargs = util.insert_values_in_args(
                self.args_for_process, self.kwargs_for_process, fill_values)
        else:
            if self.cache_globally_windowed_args:
                # The flag is cleared before resolving, so the side inputs
                # are looked up in the global window only once and the
                # filled-in arguments are stored back on the instance.
                self.cache_globally_windowed_args = False
                global_window = GlobalWindow()
                resolved = util.insert_values_in_args(
                    self.args_for_process, self.kwargs_for_process,
                    [side_input[global_window]
                     for side_input in self.side_inputs])
                self.args_for_process, self.kwargs_for_process = resolved
            call_args = self.args_for_process
            call_kwargs = self.kwargs_for_process

        # Extract key in the case of a stateful DoFn. Note that in the case of a
        # stateful DoFn, we set during __init__ self.has_windowed_inputs to be
        # True. Therefore, windows will be exploded coming into this method, and
        # we can rely on the window variable being set above.
        if self.user_state_context:
            try:
                state_key, unused_value = windowed_value.value
            except (TypeError, ValueError):
                raise ValueError((
                    'Input value to a stateful DoFn must be a KV tuple; instead, '
                    'got %s.') % (windowed_value.value, ))

        # TODO(sourabhbajaj): Investigate why we can't use `is` instead of ==
        for position, param in self.placeholders:
            if param == core.DoFn.ElementParam:
                call_args[position] = windowed_value.value
            elif param == core.DoFn.WindowParam:
                call_args[position] = window
            elif param == core.DoFn.TimestampParam:
                call_args[position] = windowed_value.timestamp
            elif isinstance(param, core.DoFn.StateParam):
                call_args[position] = self.user_state_context.get_state(
                    param.state_spec, state_key, window)
            elif isinstance(param, core.DoFn.TimerParam):
                call_args[position] = self.user_state_context.get_timer(
                    param.timer_spec, state_key, window)
            elif param == core.DoFn.BundleFinalizerParam:
                call_args[position] = self.bundle_finalizer_param

        if additional_kwargs:
            if call_kwargs is None:
                call_kwargs = additional_kwargs
            else:
                for name in additional_kwargs:
                    call_kwargs[name] = additional_kwargs[name]

        # Keyword expansion only happens when there are keyword arguments.
        if call_kwargs:
            outputs = self.process_method(*call_args, **call_kwargs)
        else:
            outputs = self.process_method(*call_args)
        output_processor.process_outputs(windowed_value, outputs)

        if not self.is_splittable:
            return None
        deferred_status = self.restriction_tracker.deferred_status()
        if not deferred_status:
            return None

        deferred_restriction, deferred_watermark = deferred_status
        element = windowed_value.value
        restriction_provider = self.signature.get_restriction_provider()
        size = restriction_provider.restriction_size(
            element, deferred_restriction)
        residual = windowed_value.with_value(
            ((element, deferred_restriction), size))
        return (residual, deferred_watermark)
Beispiel #22
0
    def __init__(
            self,
            fn,
            args,
            kwargs,
            side_inputs,
            windowing,
            context=None,
            tagged_receivers=None,
            logger=None,
            step_name=None,
            # Preferred alternative to logger
            # TODO(robertwb): Remove once all runners are updated.
            logging_context=None,
            # Preferred alternative to context
            # TODO(robertwb): Remove once all runners are updated.
            state=None,
            scoped_metrics_container=None):
        """Set up a DoFnRunner for either a NewDoFn or an old-style DoFn.

        For a ``core.NewDoFn`` the arguments and side inputs are only stashed
        on the instance. For any other DoFn, ``self.dofn_process`` becomes a
        one-argument callable with ``args``/``kwargs``/side inputs already
        bound, and ``self.dofn`` becomes a ``CurriedFn`` wrapping it (or ``fn``
        itself when there is nothing to bind).

        Args:
          fn: user DoFn to invoke
          args: positional side input arguments (static and placeholder), if
            any
          kwargs: keyword side input arguments (static and placeholder), if any
          side_inputs: list of sideinput.SideInputMaps for deferred side inputs
          windowing: windowing properties of the output PCollection(s)
          context: a DoFnContext to use (deprecated); must be given exactly
            when ``state`` is falsy
          tagged_receivers: a dict of tag name to Receiver objects; the entry
            under ``None`` is used as the main receiver
          logger: a logging module (deprecated)
          step_name: the name of this step
          logging_context: a LoggingContext object
          state: handle for accessing DoFn state
          scoped_metrics_container: Context switcher for metrics container
        """
        self.step_name = step_name
        self.window_fn = windowing.windowfn
        self.tagged_receivers = tagged_receivers
        if scoped_metrics_container:
            self.scoped_metrics_container = scoped_metrics_container
        else:
            self.scoped_metrics_container = ScopedMetricsContainer()

        global_window = window.GlobalWindow()

        # Materialize the side inputs: they are traversed more than once below.
        side_inputs = list(side_inputs)

        self.logging_context = (
            logging_context if logging_context else get_logging_context(
                logger, step_name=step_name))

        # Optimize for the common case: the main (untagged) output.
        self.main_receivers = as_receiver(tagged_receivers[None])

        # TODO(sourabh): Deprecate the use of context
        if not state:
            assert context is not None
            self.context = context
        else:
            assert context is None
            self.context = DoFnContext(self.step_name, state=state)

        # TODO(Sourabhbajaj): Remove the usage of OldDoFn
        is_new_dofn = isinstance(fn, core.NewDoFn)
        self.is_new_dofn = is_new_dofn
        if is_new_dofn:
            # Stash values for use in new_dofn_process.
            self.side_inputs = side_inputs
            self.has_windowed_side_inputs = any(
                not si.is_globally_windowed() for si in side_inputs)
            self.args = args or []
            self.kwargs = kwargs or {}
            self.dofn = fn
            return

        # Old-style DoFn from here on. Flipped to True in one case below.
        self.has_windowed_side_inputs = False
        if not (args or kwargs):
            # Nothing to bind: call the user's process method directly.
            self.dofn = fn
            self.dofn_process = fn.process
            return

        bound_args, bound_kwargs = args, kwargs
        pending_side_inputs = side_inputs
        if pending_side_inputs and all(
                view.is_globally_windowed() for view in pending_side_inputs):
            # Every side input is globally windowed, so resolve them once now
            # instead of once per element.
            bound_args, bound_kwargs = util.insert_values_in_args(
                args, kwargs,
                [view[global_window] for view in pending_side_inputs])
            pending_side_inputs = []

        if pending_side_inputs:
            self.has_windowed_side_inputs = True

            def process(context):
                # Side inputs are looked up in the first window of the context.
                first_window = context.windows[0]
                call_args, call_kwargs = util.insert_values_in_args(
                    bound_args, bound_kwargs,
                    [view[first_window] for view in pending_side_inputs])
                return fn.process(context, *call_args, **call_kwargs)

            self.dofn_process = process
        elif bound_kwargs:
            self.dofn_process = (
                lambda context: fn.process(
                    context, *bound_args, **bound_kwargs))
        else:
            self.dofn_process = (
                lambda context: fn.process(context, *bound_args))

        # Wrap the curried process callable together with the user's bundle
        # hooks in a DoFn so it can be driven like any other DoFn.
        class CurriedFn(core.DoFn):

            start_bundle = staticmethod(fn.start_bundle)
            process = staticmethod(self.dofn_process)
            finish_bundle = staticmethod(fn.finish_bundle)

        self.dofn = CurriedFn()
Beispiel #23
0
    def __init__(
            self,
            fn,
            args,
            kwargs,
            side_inputs,
            windowing,
            context=None,
            tagged_receivers=None,
            logger=None,
            step_name=None,
            # Preferred alternative to logger
            # TODO(robertwb): Remove once all runners are updated.
            logging_context=None,
            # Preferred alternative to context
            # TODO(robertwb): Remove once all runners are updated.
            state=None,
            scoped_metrics_container=None):
        """Initializes a DoFnRunner.

        Besides storing its collaborators, this precomputes ``self.args``, the
        positional argument list for ``fn.process``. Slots whose value is only
        known per element hold an ``ArgPlaceholder``, and their positions are
        cached in ``self.placeholders`` as ``(index, placeholder)`` pairs.
        When there are no side inputs, args, kwargs or defaults,
        ``self.use_simple_invoker`` is set and no placeholders are computed.

        Args:
          fn: user DoFn to invoke
          args: positional side input arguments (static and placeholder), if
            any
          kwargs: keyword side input arguments (static and placeholder), if any
          side_inputs: list of sideinput.SideInputMaps for deferred side inputs
          windowing: windowing properties of the output PCollection(s)
          context: a DoFnContext to use (deprecated); must be given exactly
            when ``state`` is falsy
          tagged_receivers: a dict of tag name to Receiver objects; the entry
            under ``None`` is used as the main receiver
          logger: a logging module (deprecated)
          step_name: the name of this step
          logging_context: a LoggingContext object
          state: handle for accessing DoFn state
          scoped_metrics_container: Context switcher for metrics container

        Raises:
          ValueError: if a ``process`` parameter whose default is
            ``core.DoFn.SideInputParam`` receives a value neither positionally
            nor through ``kwargs``.
        """
        self.step_name = step_name
        self.window_fn = windowing.windowfn
        self.tagged_receivers = tagged_receivers
        self.scoped_metrics_container = (scoped_metrics_container
                                         or ScopedMetricsContainer())

        global_window = GlobalWindow()

        # Need to support multiple iterations.
        side_inputs = list(side_inputs)

        if logging_context:
            self.logging_context = logging_context
        else:
            self.logging_context = get_logging_context(logger,
                                                       step_name=step_name)

        # Optimize for the common case.
        self.main_receivers = as_receiver(tagged_receivers[None])

        # TODO(sourabh): Deprecate the use of context
        if state:
            assert context is None
            self.context = DoFnContext(self.step_name, state=state)
        else:
            assert context is not None
            self.context = context

        class ArgPlaceholder(object):
            # Marks a positional slot that is filled in per element.
            def __init__(self, placeholder):
                self.placeholder = placeholder

        # Stash values for use in dofn_process.
        self.side_inputs = side_inputs
        self.has_windowed_inputs = not all(si.is_globally_windowed()
                                           for si in self.side_inputs)

        self.args = args if args else []
        self.kwargs = kwargs if kwargs else {}
        self.dofn = fn
        self.dofn_process = fn.process

        arguments, _, _, defaults = self.dofn.get_function_arguments('process')
        defaults = defaults if defaults else []
        # 1 when `self` is part of `arguments` and must be skipped, else 0.
        self_in_args = int(self.dofn.is_process_bounded())

        self.use_simple_invoker = (not side_inputs and not args and not kwargs
                                   and not defaults)
        if self.use_simple_invoker:
            # As we're using the simple invoker we don't need to compute
            # placeholders.
            return

        self.has_windowed_inputs = (self.has_windowed_inputs
                                    or core.DoFn.WindowParam in defaults)

        # Try to prepare all the arguments that can just be filled in without
        # any additional work in the process function. Also cache all the
        # placeholders needed in the process function.

        # Fill in sideInputs if they are globally windowed
        if not self.has_windowed_inputs:
            self.args, self.kwargs = util.insert_values_in_args(
                args, kwargs, [si[global_window] for si in side_inputs])

        # Create placeholder for element parameter
        if core.DoFn.ElementParam not in defaults:
            # The element is the first positional parameter after `self`.
            args_to_pick = len(arguments) - len(defaults) - 1 - self_in_args
            final_args = [ArgPlaceholder(core.DoFn.ElementParam)] + \
                         self.args[:args_to_pick]
        else:
            args_to_pick = len(arguments) - len(defaults) - self_in_args
            final_args = self.args[:args_to_pick]

        # Defaults that are replaced by a per-element value at call time.
        # Compared with `==`, as the rest of this file does.
        placeholder_params = (core.DoFn.ElementParam, core.DoFn.ContextParam,
                              core.DoFn.WindowParam, core.DoFn.TimestampParam)

        # Fill the OtherPlaceholders for context, window or timestamp.
        # `defaults` is non-empty or the loop is a no-op: with no defaults the
        # slice is the whole list but zip() stops immediately.
        remaining_args = iter(self.args[args_to_pick:])
        for arg_name, default in zip(arguments[-len(defaults):], defaults):
            if any(default == param for param in placeholder_params):
                final_args.append(ArgPlaceholder(default))
                continue
            try:
                # Use the next() builtin rather than the Python 2-only
                # `.next()` method, which raises AttributeError on Python 3.
                final_args.append(next(remaining_args))
            except StopIteration:
                # If no more args are present then the value must be passed
                # via kwarg. That is mandatory for side inputs; any other
                # parameter simply keeps its declared default.
                if (default == core.DoFn.SideInputParam
                        and arg_name not in self.kwargs):
                    raise ValueError(
                        "Value for sideinput %s not provided" % arg_name)
        final_args.extend(remaining_args)
        self.args = final_args

        # Stash the list of placeholder positions for performance
        self.placeholders = [(i, x.placeholder)
                             for (i, x) in enumerate(self.args)
                             if isinstance(x, ArgPlaceholder)]
Beispiel #24
0
    def _invoke_process_per_window(
        self,
        windowed_value,  # type: WindowedValue
        additional_args,
        additional_kwargs,
    ):
        # type: (...) -> Optional[SplitResultResidual]
        """Run the process method on one windowed value and emit its outputs.

        Resolves side inputs, fills every cached placeholder slot with the
        value for this element, merges ``additional_kwargs`` into the keyword
        arguments, calls ``self.process_method`` and hands the result to
        ``self.output_processor.process_outputs``.

        Args:
          windowed_value: the element to process. When
            ``self.has_windowed_inputs`` is set it must carry exactly one
            window.
          additional_args: extra positional values appended after the
            per-window side input values (only used on the windowed path).
          additional_kwargs: extra keyword arguments for the process method.

        Returns:
          A ``SplitResultResidual`` when this invoker is splittable and the
          restriction tracker reports a deferred status; otherwise ``None``.

        Raises:
          ValueError: if a key is needed (user state or KeyParam) but the
            element value cannot be unpacked into a 2-tuple.
        """
        if self.has_windowed_inputs:
            # Single-element unpacking: fails unless there is exactly one
            # window.
            (window, ) = windowed_value.windows
            side_inputs = [view[window] for view in self.side_inputs]
            side_inputs.extend(additional_args)
            args_for_process, kwargs_for_process = util.insert_values_in_args(
                self.args_for_process, self.kwargs_for_process, side_inputs)
        else:
            if self.cache_globally_windowed_args:
                # First element with only globally windowed inputs: resolve
                # the side inputs once and keep the result on the instance.
                self.cache_globally_windowed_args = False
                global_window = GlobalWindow()
                self.args_for_process, self.kwargs_for_process = (
                    util.insert_values_in_args(
                        self.args_for_process, self.kwargs_for_process,
                        [view[global_window] for view in self.side_inputs]))
            args_for_process = self.args_for_process
            kwargs_for_process = self.kwargs_for_process

        # Extract key in the case of a stateful DoFn. Per the original note,
        # a stateful DoFn has self.has_windowed_inputs set to True during
        # __init__, so windows are exploded before this method is reached and
        # `window` is bound above.
        if self.user_state_context or self.is_key_param_required:
            try:
                key, _unused = windowed_value.value
            except (TypeError, ValueError):
                raise ValueError(
                    ("Input value to a stateful DoFn or KeyParam must be a "
                     "KV tuple; instead, got '%s'.") %
                    (windowed_value.value, ))

        params = core.DoFn
        for index, placeholder in self.placeholders:
            if params.ElementParam == placeholder:
                args_for_process[index] = windowed_value.value
            elif params.KeyParam == placeholder:
                args_for_process[index] = key
            elif params.WindowParam == placeholder:
                args_for_process[index] = window
            elif params.TimestampParam == placeholder:
                args_for_process[index] = windowed_value.timestamp
            elif params.PaneInfoParam == placeholder:
                args_for_process[index] = windowed_value.pane_info
            elif isinstance(placeholder, params.StateParam):
                assert self.user_state_context is not None
                args_for_process[index] = self.user_state_context.get_state(
                    placeholder.state_spec, key, window)
            elif isinstance(placeholder, params.TimerParam):
                assert self.user_state_context is not None
                args_for_process[index] = self.user_state_context.get_timer(
                    placeholder.timer_spec, key, window)
            elif params.BundleFinalizerParam == placeholder:
                args_for_process[index] = self.bundle_finalizer_param

        if additional_kwargs:
            if kwargs_for_process is None:
                kwargs_for_process = additional_kwargs
            else:
                for name in additional_kwargs:
                    kwargs_for_process[name] = additional_kwargs[name]

        emit = self.output_processor.process_outputs
        if kwargs_for_process:
            emit(windowed_value,
                 self.process_method(*args_for_process, **kwargs_for_process))
        else:
            emit(windowed_value, self.process_method(*args_for_process))

        if not self.is_splittable:
            return None

        tracker = self.threadsafe_restriction_tracker
        assert tracker is not None
        # TODO: Consider calling check_done right after SDF.Process() finishing.
        # In order to do this, we need to know that current invoking dofn is
        # ProcessSizedElementAndRestriction.
        tracker.check_done()
        deferred_status = tracker.deferred_status()
        current_watermark = (
            self.watermark_estimator.current_watermark()
            if self.watermark_estimator else None)
        if not deferred_status:
            return None

        deferred_restriction, deferred_timestamp = deferred_status
        element = windowed_value.value
        restriction_provider = self.signature.get_restriction_provider()
        size = restriction_provider.restriction_size(
            element, deferred_restriction)
        return SplitResultResidual(
            residual_value=windowed_value.with_value(
                ((element, deferred_restriction), size)),
            current_watermark=current_watermark,
            deferred_timestamp=deferred_timestamp)
Beispiel #25
0
  def __init__(self,
               fn,
               args,
               kwargs,
               side_inputs,
               windowing,
               context=None,
               tagged_receivers=None,
               logger=None,
               step_name=None,
               # Preferred alternative to logger
               # TODO(robertwb): Remove once all runners are updated.
               logging_context=None,
               # Preferred alternative to context
               # TODO(robertwb): Remove once all runners are updated.
               state=None,
               scoped_metrics_container=None):
    """Initializes a DoFnRunner.

    Besides storing its collaborators, this precomputes ``self.args``, the
    positional argument list for ``fn.process``. Slots whose value is only
    known per element hold an ``ArgPlaceholder``, and their positions are
    cached in ``self.placeholders`` as ``(index, placeholder)`` pairs. When
    there are no side inputs, args, kwargs or defaults,
    ``self.use_simple_invoker`` is set and no placeholders are computed.

    Args:
      fn: user DoFn to invoke
      args: positional side input arguments (static and placeholder), if any
      kwargs: keyword side input arguments (static and placeholder), if any
      side_inputs: list of sideinput.SideInputMaps for deferred side inputs
      windowing: windowing properties of the output PCollection(s)
      context: a DoFnContext to use (deprecated); must be given exactly when
        ``state`` is falsy
      tagged_receivers: a dict of tag name to Receiver objects; the entry
        under ``None`` is used as the main receiver
      logger: a logging module (deprecated)
      step_name: the name of this step
      logging_context: a LoggingContext object
      state: handle for accessing DoFn state
      scoped_metrics_container: Context switcher for metrics container

    Raises:
      ValueError: if a ``process`` parameter whose default is
        ``core.DoFn.SideInputParam`` receives a value neither positionally
        nor through ``kwargs``.
    """
    self.step_name = step_name
    self.window_fn = windowing.windowfn
    self.tagged_receivers = tagged_receivers
    self.scoped_metrics_container = (scoped_metrics_container
                                     or ScopedMetricsContainer())

    global_window = GlobalWindow()

    # Need to support multiple iterations.
    side_inputs = list(side_inputs)

    if logging_context:
      self.logging_context = logging_context
    else:
      self.logging_context = get_logging_context(logger, step_name=step_name)

    # Optimize for the common case.
    self.main_receivers = as_receiver(tagged_receivers[None])

    # TODO(sourabh): Deprecate the use of context
    if state:
      assert context is None
      self.context = DoFnContext(self.step_name, state=state)
    else:
      assert context is not None
      self.context = context

    class ArgPlaceholder(object):
      # Marks a positional slot that is filled in per element.
      def __init__(self, placeholder):
        self.placeholder = placeholder

    # Stash values for use in dofn_process.
    self.side_inputs = side_inputs
    self.has_windowed_inputs = not all(
        si.is_globally_windowed() for si in self.side_inputs)

    self.args = args if args else []
    self.kwargs = kwargs if kwargs else {}
    self.dofn = fn
    self.dofn_process = fn.process

    arguments, _, _, defaults = self.dofn.get_function_arguments('process')
    defaults = defaults if defaults else []
    # 1 when `self` is part of `arguments` and must be skipped, else 0.
    self_in_args = int(self.dofn.is_process_bounded())

    self.use_simple_invoker = (
        not side_inputs and not args and not kwargs and not defaults)
    if self.use_simple_invoker:
      # As we're using the simple invoker we don't need to compute placeholders
      return

    self.has_windowed_inputs = (self.has_windowed_inputs or
                                core.DoFn.WindowParam in defaults)

    # Try to prepare all the arguments that can just be filled in without any
    # additional work in the process function. Also cache all the
    # placeholders needed in the process function.

    # Fill in sideInputs if they are globally windowed
    if not self.has_windowed_inputs:
      self.args, self.kwargs = util.insert_values_in_args(
          args, kwargs, [si[global_window] for si in side_inputs])

    # Create placeholder for element parameter
    if core.DoFn.ElementParam not in defaults:
      # The element is the first positional parameter after `self`.
      args_to_pick = len(arguments) - len(defaults) - 1 - self_in_args
      final_args = [ArgPlaceholder(core.DoFn.ElementParam)] + \
                   self.args[:args_to_pick]
    else:
      args_to_pick = len(arguments) - len(defaults) - self_in_args
      final_args = self.args[:args_to_pick]

    # Defaults that are replaced by a per-element value at call time.
    # Compared with `==`, as the rest of this file does.
    placeholder_params = (core.DoFn.ElementParam, core.DoFn.ContextParam,
                          core.DoFn.WindowParam, core.DoFn.TimestampParam)

    # Fill the OtherPlaceholders for context, window or timestamp.
    # `defaults` is non-empty or the loop is a no-op: with no defaults the
    # slice is the whole list but zip() stops immediately.
    remaining_args = iter(self.args[args_to_pick:])
    for arg_name, default in zip(arguments[-len(defaults):], defaults):
      if any(default == param for param in placeholder_params):
        final_args.append(ArgPlaceholder(default))
        continue
      try:
        # Use the next() builtin rather than the Python 2-only `.next()`
        # method, which raises AttributeError on Python 3.
        final_args.append(next(remaining_args))
      except StopIteration:
        # If no more args are present then the value must be passed via
        # kwarg. That is mandatory for side inputs; any other parameter
        # simply keeps its declared default.
        if (default == core.DoFn.SideInputParam and
            arg_name not in self.kwargs):
          raise ValueError("Value for sideinput %s not provided" % arg_name)
    final_args.extend(remaining_args)
    self.args = final_args

    # Stash the list of placeholder positions for performance
    self.placeholders = [(i, x.placeholder) for (i, x) in enumerate(self.args)
                         if isinstance(x, ArgPlaceholder)]