def call(self):
  self._call_count += 1
  assert self._call_count <= (1 + len(self._applied_transform.side_inputs))
  metrics_container = MetricsContainer(self._applied_transform.full_label)
  scoped_metrics_container = ScopedMetricsContainer(metrics_container)

  for side_input in self._applied_transform.side_inputs:
    if side_input not in self._side_input_values:
      has_result, value = (
          self._evaluation_context.get_value_or_schedule_after_output(
              side_input, self))
      if not has_result:
        # Monitor task will reschedule this executor once the side input is
        # available.
        return
      self._side_input_values[side_input] = value

  side_input_values = [self._side_input_values[side_input]
                       for side_input in self._applied_transform.side_inputs]

  try:
    evaluator = self._transform_evaluator_registry.for_application(
        self._applied_transform, self._input_bundle,
        side_input_values, scoped_metrics_container)

    if self._input_bundle:
      for value in self._input_bundle.get_elements_iterable():
        evaluator.process_element(value)

    with scoped_metrics_container:
      result = evaluator.finish_bundle()
      result.logical_metric_updates = metrics_container.get_cumulative()

    if self._evaluation_context.has_cache:
      for uncommitted_bundle in result.uncommitted_output_bundles:
        self._evaluation_context.append_to_cache(
            self._applied_transform, uncommitted_bundle.tag,
            uncommitted_bundle.get_elements_iterable())
      undeclared_tag_values = result.undeclared_tag_values
      if undeclared_tag_values:
        for tag, value in undeclared_tag_values.iteritems():
          self._evaluation_context.append_to_cache(
              self._applied_transform, tag, value)

    self._completion_callback.handle_result(self._input_bundle, result)
    return result
  except Exception as e:  # pylint: disable=broad-except
    logging.warning('Task failed: %s', traceback.format_exc(), exc_info=True)
    self._completion_callback.handle_exception(e)
  finally:
    self._evaluation_context.metrics().commit_physical(
        self._input_bundle, metrics_container.get_cumulative())
    self._transform_evaluation_state.complete(self)

def __init__(self, operation_name, spec, counter_factory, state_sampler):
  """Initializes a worker operation instance.

  Args:
    operation_name: The system name assigned by the runner for this
      operation.
    spec: An operation_specs.Worker* instance.
    counter_factory: The CounterFactory to use for our counters.
    state_sampler: The StateSampler for the current operation.
  """
  self.operation_name = operation_name
  self.spec = spec
  self.counter_factory = counter_factory
  self.consumers = collections.defaultdict(list)

  # These are overwritten in the legacy harness.
  self.step_name = operation_name

  self.metrics_container = MetricsContainer(self.step_name)
  self.scoped_metrics_container = ScopedMetricsContainer(
      self.metrics_container)

  self.state_sampler = state_sampler
  self.scoped_start_state = self.state_sampler.scoped_state(
      self.operation_name, 'start')
  self.scoped_process_state = self.state_sampler.scoped_state(
      self.operation_name, 'process')
  self.scoped_finish_state = self.state_sampler.scoped_state(
      self.operation_name, 'finish')
  # TODO(ccy): the '-abort' state can be added when the abort is supported in
  # Operations.
  self.receivers = []

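# The scoped_start/process/finish states created above are context managers
# handed out by the StateSampler; subclasses bracket each phase with them so
# execution time is attributed to the right state. A minimal sketch of that
# pattern follows (TrivialScope is an illustrative stand-in, not the real
# StateSampler API).
import time

class TrivialScope(object):
  """Stand-in for a StateSampler scoped state: accumulates wall time."""

  def __init__(self, name):
    self.name = name
    self.total = 0.0

  def __enter__(self):
    self._start = time.time()

  def __exit__(self, *unused_exc_info):
    self.total += time.time() - self._start

scoped_process_state = TrivialScope('process')
for element in range(3):
  with scoped_process_state:
    pass  # Per-element work would run here, timed against 'process'.
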
def __init__(self, operation_name, spec, counter_factory, state_sampler):
  super(CombineOperation, self).__init__(
      operation_name, spec, counter_factory, state_sampler)
  # Combiners do not accept deferred side inputs (the ignored fourth
  # argument), so the code to handle the extra args/kwargs is simpler than
  # for the DoFns of ParDo.
  fn, args, kwargs = pickler.loads(self.spec.serialized_fn)[:3]
  self.phased_combine_fn = (
      PhasedCombineFnExecutor(self.spec.phase, fn, args, kwargs))
  self.scoped_metrics_container = ScopedMetricsContainer()

def test_scoped_container(self):
  c1 = MetricsContainer('mystep')
  c2 = MetricsContainer('myinternalstep')
  with ScopedMetricsContainer(c1):
    self.assertEqual(c1, MetricsEnvironment.current_container())
    counter = Metrics.counter('ns', 'name')
    counter.inc(2)

    with ScopedMetricsContainer(c2):
      self.assertEqual(c2, MetricsEnvironment.current_container())
      counter = Metrics.counter('ns', 'name')
      counter.inc(3)
      self.assertEqual(
          list(c2.get_cumulative().counters.items()),
          [(MetricKey('myinternalstep', MetricName('ns', 'name')), 3)])

    self.assertEqual(c1, MetricsEnvironment.current_container())
    counter = Metrics.counter('ns', 'name')
    counter.inc(4)
    self.assertEqual(
        list(c1.get_cumulative().counters.items()),
        [(MetricKey('mystep', MetricName('ns', 'name')), 6)])

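# The assertions above rely on ScopedMetricsContainer swapping the process's
# "current container" on entry and restoring the previous one on exit, which
# is why c1 accumulates 2 + 4 = 6 while the nested c2 only sees 3. A minimal
# sketch of that push/pop pattern (illustrative only; Beam's actual
# implementation lives in apache_beam.metrics.execution):
import threading

class _CurrentContainer(threading.local):
  """Thread-local slot holding the active metrics container."""
  container = None

_current = _CurrentContainer()

class ScopedContainerSketch(object):
  """Installs a container for the duration of a with-block."""

  def __init__(self, container):
    self._container = container
    self._previous = None

  def __enter__(self):
    self._previous = _current.container
    _current.container = self._container
    return self._container

  def __exit__(self, *unused_exc_info):
    _current.container = self._previous
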
def __init__(self, name_context, spec, counter_factory, state_sampler):
  """Initializes a worker operation instance.

  Args:
    name_context: A NameContext instance or string (deprecated), with the
      name information for this operation.
    spec: An operation_specs.Worker* instance.
    counter_factory: The CounterFactory to use for our counters.
    state_sampler: The StateSampler for the current operation.
  """
  if isinstance(name_context, common.NameContext):
    # TODO(BEAM-4028): Clean this up once it's completely migrated.
    # We use the specific operation name that is used for metrics and state
    # sampling.
    self.name_context = name_context
  else:
    self.name_context = common.NameContext(name_context)

  # TODO(BEAM-4028): Remove the following two lines. Rely on name context.
  self.operation_name = self.name_context.step_name
  self.step_name = self.name_context.logging_name()

  self.spec = spec
  self.counter_factory = counter_factory
  self.consumers = collections.defaultdict(list)

  # These are overwritten in the legacy harness.
  self.metrics_container = MetricsContainer(self.name_context.metrics_name())
  # TODO(BEAM-4094): Remove ScopedMetricsContainer after Dataflow no longer
  # depends on it.
  self.scoped_metrics_container = ScopedMetricsContainer()

  self.state_sampler = state_sampler
  self.scoped_start_state = self.state_sampler.scoped_state(
      self.name_context.metrics_name(), 'start',
      metrics_container=self.metrics_container)
  self.scoped_process_state = self.state_sampler.scoped_state(
      self.name_context.metrics_name(), 'process',
      metrics_container=self.metrics_container)
  self.scoped_finish_state = self.state_sampler.scoped_state(
      self.name_context.metrics_name(), 'finish',
      metrics_container=self.metrics_container)
  # TODO(ccy): the '-abort' state can be added when the abort is supported in
  # Operations.
  self.receivers = []

def __init__(self,
             fn,
             args,
             kwargs,
             side_inputs,
             windowing,
             tagged_receivers=None,
             step_name=None,
             logging_context=None,
             state=None,
             scoped_metrics_container=None):
  """Initializes a DoFnRunner.

  Args:
    fn: user DoFn to invoke
    args: positional side input arguments (static and placeholder), if any
    kwargs: keyword side input arguments (static and placeholder), if any
    side_inputs: list of sideinput.SideInputMaps for deferred side inputs
    windowing: windowing properties of the output PCollection(s)
    tagged_receivers: a dict of tag name to Receiver objects
    step_name: the name of this step
    logging_context: a LoggingContext object
    state: handle for accessing DoFn state
    scoped_metrics_container: Context switcher for metrics container
  """
  # Need to support multiple iterations.
  side_inputs = list(side_inputs)

  from apache_beam.metrics.execution import ScopedMetricsContainer

  self.scoped_metrics_container = (
      scoped_metrics_container or ScopedMetricsContainer())
  self.step_name = step_name
  self.logging_context = logging_context or LoggingContext()
  self.context = DoFnContext(step_name, state=state)

  do_fn_signature = DoFnSignature(fn)

  # Optimize for the common case.
  main_receivers = tagged_receivers[None]
  output_processor = _OutputProcessor(
      windowing.windowfn, main_receivers, tagged_receivers)
  self.do_fn_invoker = DoFnInvoker.create_invoker(
      do_fn_signature, output_processor, self.context, side_inputs,
      args, kwargs)

def call(self):
  self._call_count += 1
  assert self._call_count <= (1 + len(self._applied_ptransform.side_inputs))
  metrics_container = MetricsContainer(self._applied_ptransform.full_label)
  scoped_metrics_container = ScopedMetricsContainer(metrics_container)

  for side_input in self._applied_ptransform.side_inputs:
    if side_input not in self._side_input_values:
      has_result, value = (
          self._evaluation_context.get_value_or_schedule_after_output(
              side_input, self))
      if not has_result:
        # Monitor task will reschedule this executor once the side input is
        # available.
        return
      self._side_input_values[side_input] = value
  side_input_values = [self._side_input_values[side_input]
                       for side_input in self._applied_ptransform.side_inputs]

  while self._retry_count < self._max_retries_per_bundle:
    try:
      self.attempt_call(metrics_container,
                        scoped_metrics_container,
                        side_input_values)
      break
    except Exception as e:
      self._retry_count += 1
      logging.error(
          'Exception at bundle %r, due to an exception.\n %s',
          self._input_bundle, traceback.format_exc())
      if self._retry_count == self._max_retries_per_bundle:
        logging.error('Giving up after %s attempts.',
                      self._max_retries_per_bundle)
        if self._retry_count == 1:
          logging.info(
              'Use the experimental flag --direct_runner_bundle_retry'
              ' to retry failed bundles (up to %d times).',
              TransformExecutor._MAX_RETRY_PER_BUNDLE)
        self._completion_callback.handle_exception(self, e)

  self._evaluation_context.metrics().commit_physical(
      self._input_bundle,
      metrics_container.get_cumulative())
  self._transform_evaluation_state.complete(self)

def call(self):
  self._call_count += 1
  assert self._call_count <= (1 + len(self._applied_ptransform.side_inputs))
  metrics_container = MetricsContainer(self._applied_ptransform.full_label)
  scoped_metrics_container = ScopedMetricsContainer(metrics_container)

  for side_input in self._applied_ptransform.side_inputs:
    # Find the projection of main's window onto the side input's window.
    window_mapping_fn = side_input._view_options().get(
        'window_mapping_fn', sideinputs._global_window_mapping_fn)
    main_onto_side_window = window_mapping_fn(self._latest_main_input_window)
    block_until = main_onto_side_window.end

    if side_input not in self._side_input_values:
      value = self._evaluation_context.get_value_or_block_until_ready(
          side_input, self, block_until)
      if not value:
        # Monitor task will reschedule this executor once the side input is
        # available.
        return
      self._side_input_values[side_input] = value
  side_input_values = [self._side_input_values[side_input]
                       for side_input in self._applied_ptransform.side_inputs]

  while self._retry_count < self._max_retries_per_bundle:
    try:
      self.attempt_call(metrics_container,
                        scoped_metrics_container,
                        side_input_values)
      break
    except Exception as e:
      self._retry_count += 1
      logging.error(
          'Exception at bundle %r, due to an exception.\n %s',
          self._input_bundle, traceback.format_exc())
      if self._retry_count == self._max_retries_per_bundle:
        logging.error('Giving up after %s attempts.',
                      self._max_retries_per_bundle)
        self._completion_callback.handle_exception(self, e)

  self._evaluation_context.metrics().commit_physical(
      self._input_bundle,
      metrics_container.get_cumulative())
  self._transform_evaluation_state.complete(self)

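# Both call() variants above share the same bounded-retry shape: attempt the
# bundle, count failures, and surface the exception only once the budget is
# exhausted, while the metrics commit in the epilogue runs regardless. A
# stripped-down sketch of that pattern (generic names, not the DirectRunner
# API):
import logging
import traceback

def run_with_retries(attempt, max_retries, on_failure):
  """Run attempt() up to max_retries times; report only the final failure."""
  retry_count = 0
  while retry_count < max_retries:
    try:
      return attempt()
    except Exception as e:  # pylint: disable=broad-except
      retry_count += 1
      logging.error('Attempt %d failed:\n%s', retry_count,
                    traceback.format_exc())
      if retry_count == max_retries:
        on_failure(e)
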
def create_operation(operation_name, spec, counter_factory, step_name,
                     state_sampler, test_shuffle_source=None,
                     test_shuffle_sink=None, is_streaming=False):
  """Create Operation object for given operation specification."""
  if isinstance(spec, operation_specs.WorkerRead):
    if isinstance(spec.source, iobase.SourceBundle):
      op = ReadOperation(operation_name, spec, counter_factory, state_sampler)
    else:
      from dataflow_worker.native_operations import NativeReadOperation
      op = NativeReadOperation(
          operation_name, spec, counter_factory, state_sampler)
  elif isinstance(spec, operation_specs.WorkerWrite):
    from dataflow_worker.native_operations import NativeWriteOperation
    op = NativeWriteOperation(
        operation_name, spec, counter_factory, state_sampler)
  elif isinstance(spec, operation_specs.WorkerCombineFn):
    op = CombineOperation(operation_name, spec, counter_factory, state_sampler)
  elif isinstance(spec, operation_specs.WorkerPartialGroupByKey):
    op = create_pgbk_op(operation_name, spec, counter_factory, state_sampler)
  elif isinstance(spec, operation_specs.WorkerDoFn):
    op = DoOperation(operation_name, spec, counter_factory, state_sampler)
  elif isinstance(spec, operation_specs.WorkerGroupingShuffleRead):
    from dataflow_worker.shuffle_operations import GroupedShuffleReadOperation
    op = GroupedShuffleReadOperation(
        operation_name, spec, counter_factory, state_sampler,
        shuffle_source=test_shuffle_source)
  elif isinstance(spec, operation_specs.WorkerUngroupedShuffleRead):
    from dataflow_worker.shuffle_operations import UngroupedShuffleReadOperation
    op = UngroupedShuffleReadOperation(
        operation_name, spec, counter_factory, state_sampler,
        shuffle_source=test_shuffle_source)
  elif isinstance(spec, operation_specs.WorkerInMemoryWrite):
    op = InMemoryWriteOperation(
        operation_name, spec, counter_factory, state_sampler)
  elif isinstance(spec, operation_specs.WorkerShuffleWrite):
    from dataflow_worker.shuffle_operations import ShuffleWriteOperation
    op = ShuffleWriteOperation(
        operation_name, spec, counter_factory, state_sampler,
        shuffle_sink=test_shuffle_sink)
  elif isinstance(spec, operation_specs.WorkerFlatten):
    op = FlattenOperation(operation_name, spec, counter_factory, state_sampler)
  elif isinstance(spec, operation_specs.WorkerMergeWindows):
    from dataflow_worker.shuffle_operations import BatchGroupAlsoByWindowsOperation
    from dataflow_worker.shuffle_operations import StreamingGroupAlsoByWindowsOperation
    if is_streaming:
      op = StreamingGroupAlsoByWindowsOperation(
          operation_name, spec, counter_factory, state_sampler)
    else:
      op = BatchGroupAlsoByWindowsOperation(
          operation_name, spec, counter_factory, state_sampler)
  elif isinstance(spec, operation_specs.WorkerReifyTimestampAndWindows):
    from dataflow_worker.shuffle_operations import ReifyTimestampAndWindowsOperation
    op = ReifyTimestampAndWindowsOperation(
        operation_name, spec, counter_factory, state_sampler)
  else:
    raise TypeError(
        'Expected an instance of operation_specs.Worker* class '
        'instead of %s' % (spec,))
  op.step_name = step_name
  op.metrics_container = MetricsContainer(step_name)
  op.scoped_metrics_container = ScopedMetricsContainer(op.metrics_container)
  return op

def __init__(self,
             fn,
             args,
             kwargs,
             side_inputs,
             windowing,
             context=None,
             tagged_receivers=None,
             logger=None,
             step_name=None,
             # Preferred alternative to logger
             # TODO(robertwb): Remove once all runners are updated.
             logging_context=None,
             # Preferred alternative to context
             # TODO(robertwb): Remove once all runners are updated.
             state=None,
             scoped_metrics_container=None):
  """Initializes a DoFnRunner.

  Args:
    fn: user DoFn to invoke
    args: positional side input arguments (static and placeholder), if any
    kwargs: keyword side input arguments (static and placeholder), if any
    side_inputs: list of sideinput.SideInputMaps for deferred side inputs
    windowing: windowing properties of the output PCollection(s)
    context: a DoFnContext to use (deprecated)
    tagged_receivers: a dict of tag name to Receiver objects
    logger: a logging module (deprecated)
    step_name: the name of this step
    logging_context: a LoggingContext object
    state: handle for accessing DoFn state
    scoped_metrics_container: Context switcher for metrics container
  """
  self.step_name = step_name
  self.window_fn = windowing.windowfn
  self.tagged_receivers = tagged_receivers
  self.scoped_metrics_container = (
      scoped_metrics_container or ScopedMetricsContainer())

  global_window = window.GlobalWindow()

  # Need to support multiple iterations.
  side_inputs = list(side_inputs)

  if logging_context:
    self.logging_context = logging_context
  else:
    self.logging_context = get_logging_context(logger, step_name=step_name)

  # Optimize for the common case.
  self.main_receivers = as_receiver(tagged_receivers[None])

  # TODO(sourabh): Deprecate the use of context
  if state:
    assert context is None
    self.context = DoFnContext(self.step_name, state=state)
  else:
    assert context is not None
    self.context = context

  # TODO(Sourabhbajaj): Remove the usage of OldDoFn
  if isinstance(fn, core.NewDoFn):
    self.is_new_dofn = True

    # Stash values for use in new_dofn_process.
    self.side_inputs = side_inputs
    self.has_windowed_side_inputs = not all(
        si.is_globally_windowed() for si in self.side_inputs)
    self.args = args if args else []
    self.kwargs = kwargs if kwargs else {}
    self.dofn = fn
  else:
    self.is_new_dofn = False
    self.has_windowed_side_inputs = False  # Set to True in one case below.
    if not args and not kwargs:
      self.dofn = fn
      self.dofn_process = fn.process
    else:
      if side_inputs and all(
          side_input.is_globally_windowed() for side_input in side_inputs):
        args, kwargs = util.insert_values_in_args(
            args, kwargs,
            [side_input[global_window] for side_input in side_inputs])
        side_inputs = []
      if side_inputs:
        self.has_windowed_side_inputs = True

        def process(context):
          w = context.windows[0]
          cur_args, cur_kwargs = util.insert_values_in_args(
              args, kwargs, [side_input[w] for side_input in side_inputs])
          return fn.process(context, *cur_args, **cur_kwargs)
        self.dofn_process = process
      elif kwargs:
        self.dofn_process = lambda context: fn.process(
            context, *args, **kwargs)
      else:
        self.dofn_process = lambda context: fn.process(context, *args)

      class CurriedFn(core.DoFn):
        start_bundle = staticmethod(fn.start_bundle)
        process = staticmethod(self.dofn_process)
        finish_bundle = staticmethod(fn.finish_bundle)
      self.dofn = CurriedFn()

def __init__(self,
             fn,
             args,
             kwargs,
             side_inputs,
             windowing,
             context=None,
             tagged_receivers=None,
             logger=None,
             step_name=None,
             # Preferred alternative to logger
             # TODO(robertwb): Remove once all runners are updated.
             logging_context=None,
             # Preferred alternative to context
             # TODO(robertwb): Remove once all runners are updated.
             state=None,
             scoped_metrics_container=None):
  """Initializes a DoFnRunner.

  Args:
    fn: user DoFn to invoke
    args: positional side input arguments (static and placeholder), if any
    kwargs: keyword side input arguments (static and placeholder), if any
    side_inputs: list of sideinput.SideInputMaps for deferred side inputs
    windowing: windowing properties of the output PCollection(s)
    context: a DoFnContext to use (deprecated)
    tagged_receivers: a dict of tag name to Receiver objects
    logger: a logging module (deprecated)
    step_name: the name of this step
    logging_context: a LoggingContext object
    state: handle for accessing DoFn state
    scoped_metrics_container: Context switcher for metrics container
  """
  self.scoped_metrics_container = (
      scoped_metrics_container or ScopedMetricsContainer())
  self.step_name = step_name

  # Need to support multiple iterations.
  side_inputs = list(side_inputs)

  if logging_context:
    self.logging_context = logging_context
  else:
    self.logging_context = get_logging_context(logger, step_name=step_name)

  # TODO(sourabh): Deprecate the use of context
  if state:
    assert context is None
    context = DoFnContext(step_name, state=state)
  else:
    assert context is not None
  self.context = context

  do_fn_signature = DoFnSignature(fn)

  # Optimize for the common case.
  main_receivers = as_receiver(tagged_receivers[None])
  output_processor = OutputProcessor(windowing.windowfn, main_receivers,
                                     tagged_receivers)
  self.do_fn_invoker = DoFnInvoker.create_invoker(
      output_processor, do_fn_signature, context, side_inputs, args, kwargs)

def __init__(self,
             fn,
             args,
             kwargs,
             side_inputs,
             windowing,
             context=None,
             tagged_receivers=None,
             logger=None,
             step_name=None,
             # Preferred alternative to logger
             # TODO(robertwb): Remove once all runners are updated.
             logging_context=None,
             # Preferred alternative to context
             # TODO(robertwb): Remove once all runners are updated.
             state=None,
             scoped_metrics_container=None):
  """Initializes a DoFnRunner.

  Args:
    fn: user DoFn to invoke
    args: positional side input arguments (static and placeholder), if any
    kwargs: keyword side input arguments (static and placeholder), if any
    side_inputs: list of sideinput.SideInputMaps for deferred side inputs
    windowing: windowing properties of the output PCollection(s)
    context: a DoFnContext to use (deprecated)
    tagged_receivers: a dict of tag name to Receiver objects
    logger: a logging module (deprecated)
    step_name: the name of this step
    logging_context: a LoggingContext object
    state: handle for accessing DoFn state
    scoped_metrics_container: Context switcher for metrics container
  """
  self.step_name = step_name
  self.window_fn = windowing.windowfn
  self.tagged_receivers = tagged_receivers
  self.scoped_metrics_container = (
      scoped_metrics_container or ScopedMetricsContainer())

  global_window = GlobalWindow()

  # Need to support multiple iterations.
  side_inputs = list(side_inputs)

  if logging_context:
    self.logging_context = logging_context
  else:
    self.logging_context = get_logging_context(logger, step_name=step_name)

  # Optimize for the common case.
  self.main_receivers = as_receiver(tagged_receivers[None])

  # TODO(sourabh): Deprecate the use of context
  if state:
    assert context is None
    self.context = DoFnContext(self.step_name, state=state)
  else:
    assert context is not None
    self.context = context

  class ArgPlaceholder(object):
    def __init__(self, placeholder):
      self.placeholder = placeholder

  # Stash values for use in dofn_process.
  self.side_inputs = side_inputs
  self.has_windowed_inputs = not all(
      si.is_globally_windowed() for si in self.side_inputs)
  self.args = args if args else []
  self.kwargs = kwargs if kwargs else {}
  self.dofn = fn
  self.dofn_process = fn.process

  arguments, _, _, defaults = self.dofn.get_function_arguments('process')
  defaults = defaults if defaults else []
  self_in_args = int(self.dofn.is_process_bounded())

  self.use_simple_invoker = (
      not side_inputs and not args and not kwargs and not defaults)
  if self.use_simple_invoker:
    # As we're using the simple invoker we don't need to compute placeholders.
    return

  self.has_windowed_inputs = (self.has_windowed_inputs or
                              core.DoFn.WindowParam in defaults)

  # Try to prepare all the arguments that can just be filled in without any
  # additional work in the process function.
  # Also cache all the placeholders needed in the process function.

  # Fill in side inputs if they are globally windowed.
  if not self.has_windowed_inputs:
    self.args, self.kwargs = util.insert_values_in_args(
        args, kwargs, [si[global_window] for si in side_inputs])

  # Create a placeholder for the element parameter.
  if core.DoFn.ElementParam not in defaults:
    args_to_pick = len(arguments) - len(defaults) - 1 - self_in_args
    final_args = ([ArgPlaceholder(core.DoFn.ElementParam)] +
                  self.args[:args_to_pick])
  else:
    args_to_pick = len(arguments) - len(defaults) - self_in_args
    final_args = self.args[:args_to_pick]

  # Fill in the other placeholders (context, window or timestamp).
  args = iter(self.args[args_to_pick:])
  for a, d in zip(arguments[-len(defaults):], defaults):
    if d == core.DoFn.ElementParam:
      final_args.append(ArgPlaceholder(d))
    elif d == core.DoFn.ContextParam:
      final_args.append(ArgPlaceholder(d))
    elif d == core.DoFn.WindowParam:
      final_args.append(ArgPlaceholder(d))
    elif d == core.DoFn.TimestampParam:
      final_args.append(ArgPlaceholder(d))
    elif d == core.DoFn.SideInputParam:
      # If no more args are present then the value must be passed via kwarg.
      try:
        final_args.append(args.next())
      except StopIteration:
        if a not in self.kwargs:
          raise ValueError('Value for sideinput %s not provided' % a)
    else:
      # If no more args are present then the value must be passed via kwarg.
      try:
        final_args.append(args.next())
      except StopIteration:
        pass
  final_args.extend(list(args))
  self.args = final_args

  # Stash the list of placeholder positions for performance.
  self.placeholders = [(i, x.placeholder) for (i, x) in enumerate(self.args)
                       if isinstance(x, ArgPlaceholder)]

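# At call time, the cached (index, placeholder) pairs above let the runner
# patch the element, window, and timestamp into a pre-built argument list
# instead of re-inspecting the DoFn for every element. A hypothetical sketch
# of that substitution step (_fill_placeholders is illustrative, not a real
# Beam function):
def _fill_placeholders(final_args, placeholders, element, window, timestamp):
  """Return a copy of final_args with each placeholder slot filled in."""
  filled = list(final_args)
  for i, param in placeholders:
    if param == core.DoFn.ElementParam:
      filled[i] = element
    elif param == core.DoFn.WindowParam:
      filled[i] = window
    elif param == core.DoFn.TimestampParam:
      filled[i] = timestamp
  return filled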