def test_hash_two_objects(self):
  """Checks hash() on CounterName for an equal pair and a differing pair."""
  # Two names built from identical components must hash identically.
  first = CounterName('counter_name', 'stage_name', 'step_name')
  twin = CounterName('counter_name', 'stage_name', 'step_name')
  self.assertEqual(hash(first), hash(twin))

  # Names that differ only in the step name are expected to hash differently.
  original = CounterName('counter_name', 'stage_name', 'step_name')
  truncated = CounterName('counter_name', 'stage_name', 'step_nam')
  self.assertNotEqual(hash(original), hash(truncated))
def test_equal_objects(self):
  """Checks CounterName equality, with and without an io_target."""
  # Plain names: identical components compare equal.
  left = CounterName('counter_name', 'stage_name', 'step_name')
  right = CounterName('counter_name', 'stage_name', 'step_name')
  self.assertEqual(left, right)

  # Plain names: a different step name breaks equality.
  left = CounterName('counter_name', 'stage_name', 'step_name')
  right = CounterName('counter_name', 'stage_name', 'step_nam')
  self.assertNotEqual(left, right)

  # Testing objects with an IOTarget.
  left = CounterName(
      'counter_name',
      'stage_name',
      'step_name',
      io_target=counters.side_input_id(1, 's9'))
  right = CounterName(
      'counter_name',
      'stage_name',
      'step_name',
      io_target=counters.side_input_id(1, 's9'))
  self.assertEqual(left, right)

  # Same name components, but the io_target differs ('s' vs 's9').
  left = CounterName(
      'counter_name',
      'stage_name',
      'step_name',
      io_target=counters.side_input_id(1, 's'))
  right = CounterName(
      'counter_name',
      'stage_name',
      'step_name',
      io_target=counters.side_input_id(1, 's9'))
  self.assertNotEqual(left, right)
def test_mean_counter(self):
  """Feeds 0..99 into a MEAN counter and expects its value to be 49."""
  name = CounterName('mean', 'stage_foo', 'step_bar')
  mean_counter = self.counter_factory.get_counter(name, counters.Counter.MEAN)
  for sample in range(100):
    mean_counter.update(sample)
  self.assertEqual(49, mean_counter.value())
def scoped_state(self,
                 name_context,
                 state_name,
                 io_target=None,
                 metrics_container=None):
  """Returns the ScopedState for a (step, state, io_target) combination.

  States are cached in ``self._states_by_name`` keyed by their CounterName,
  so repeated calls with equal arguments return the same object.

  Args:
    name_context: common.NameContext carrying the step name information. Any
      other value is wrapped in a common.NameContext first.
    state_name: str. The state name (e.g. process / start / finish). The
      backing counter is named ``state_name + '-msecs'``.
    io_target: optional value forwarded unchanged as the ``io_target`` of the
      CounterName that identifies the state.
    metrics_container: MetricsContainer. The step's metrics container.

  Returns:
    A ScopedState that keeps the execution context and is able to switch it
    for the execution thread.
  """
  if not isinstance(name_context, common.NameContext):
    name_context = common.NameContext(name_context)

  counter_name = CounterName(
      state_name + '-msecs',
      stage_name=self._prefix,
      step_name=name_context.metrics_name(),
      io_target=io_target)

  # Create and cache the state only on first request for this counter name.
  if counter_name not in self._states_by_name:
    output_counter = self._counter_factory.get_counter(
        counter_name, Counter.SUM)
    self._states_by_name[counter_name] = super(
        StateSampler, self)._scoped_state(
            counter_name, name_context, output_counter, metrics_container)
  return self._states_by_name[counter_name]
def test_sum_counter(self):
  """Feeds 0..99 into a SUM counter and expects 99 * 50 as the total."""
  name = CounterName('sum', 'stage_foo', 'step_bar')
  sum_counter = self.counter_factory.get_counter(name, counters.Counter.SUM)
  for sample in range(100):
    sum_counter.update(sample)
  self.assertEqual(99 * 50, sum_counter.value())
def test_distribution_counter(self):
  """Feeds 0..99 into a BEAM_DISTRIBUTION counter and checks its value."""
  name = CounterName('distribution', 'stage_foo', 'step_bar')
  distribution_counter = self.counter_factory.get_counter(
      name, counters.Counter.BEAM_DISTRIBUTION)
  for sample in range(100):
    distribution_counter.update(sample)
  # Presumably (mean, sum, count, min, max) -- confirm against the combiner.
  expected = (49, 4950, 100, 0, 99)
  self.assertEqual(expected, distribution_counter.value())
def _update_counters_for_requesting_step(self, step_name):
  """Points the side-input state and byte counter at ``step_name``.

  Both ``self.scoped_state`` and ``self.bytes_read_counter`` are replaced.
  They stay attached to ``self.declaring_step`` but carry an io_target built
  from ``step_name`` and ``self.input_index``.

  Args:
    step_name: name of the step passed to ``counters.side_input_id``.
  """
  io_target = counters.side_input_id(step_name, self.input_index)

  self.scoped_state = self._state_sampler.scoped_state(
      self.declaring_step, 'read-sideinput', io_target=io_target)

  byte_counter_name = CounterName(
      'read-sideinput-byte-count',
      step_name=self.declaring_step,
      io_target=io_target)
  self.bytes_read_counter = self._counter_factory.get_counter(
      byte_counter_name, Counter.SUM)
def test_name_string_representation(self):
  """Pins the str() and repr() formats of a CounterName."""
  counter_name = CounterName('counter_name', 'stage_name', 'step_name')

  # This string representation is utilized by the worker to report progress.
  # Change only if the worker code has also been changed.
  expected_str = 'stage_name-step_name-counter_name'
  self.assertEqual(expected_str, str(counter_name))

  # repr() ends with an object address, so only its prefix is checked.
  expected_repr_prefix = '<CounterName<stage_name-step_name-counter_name> at 0x'
  self.assertIn(expected_repr_prefix, repr(counter_name))
def __init__(self,
             fn,
             args,
             kwargs,
             side_inputs,
             windowing,
             tagged_receivers=None,
             step_name=None,
             logging_context=None,
             state=None,
             scoped_metrics_container=None,
             operation_name=None):
  """Initializes a DoFnRunner.

  Args:
    fn: user DoFn to invoke
    args: positional side input arguments (static and placeholder), if any
    kwargs: keyword side input arguments (static and placeholder), if any
    side_inputs: list of sideinput.SideInputMaps for deferred side inputs
    windowing: windowing properties of the output PCollection(s); its
      ``windowfn`` is handed to the output processor
    tagged_receivers: a dict of tag name to Receiver objects; the entry under
      the ``None`` key is used as the main receiver
    step_name: the name of this step
    logging_context: DEPRECATED [BEAM-4728]
    state: handle for accessing DoFn state; its ``_counter_factory`` is used
      when the 'outputs_per_element_counter' experiment is enabled
    scoped_metrics_container: DEPRECATED
    operation_name: The system name assigned by the runner for this
      operation.
  """
  # Materialize the side inputs: they need to support multiple iterations.
  side_inputs = list(side_inputs)

  self.step_name = step_name
  self.context = DoFnContext(step_name, state=state)

  do_fn_signature = DoFnSignature(fn)

  # Optimize for the common case.
  main_receivers = tagged_receivers[None]

  # The per-element output counter only exists behind an experiment flag.
  # TODO(BEAM-3937): Remove if block after output counter released.
  per_element_output_counter = None
  if 'outputs_per_element_counter' in RuntimeValueProvider.experiments:
    # TODO(BEAM-3955): Make step_name and operation_name less confused.
    output_counter_name = CounterName(
        'per-element-output-count', step_name=operation_name)
    per_element_output_counter = state._counter_factory.get_counter(
        output_counter_name, Counter.DATAFLOW_DISTRIBUTION).accumulator

  output_processor = _OutputProcessor(
      windowing.windowfn,
      main_receivers,
      tagged_receivers,
      per_element_output_counter)

  self.do_fn_invoker = DoFnInvoker.create_invoker(
      do_fn_signature,
      output_processor,
      self.context,
      side_inputs,
      args,
      kwargs)
def scoped_state(self, step_name, state_name, io_target=None):
  """Returns the ScopedState for a (step, state, io_target) combination.

  States are cached in ``self._states_by_name`` keyed by their CounterName,
  so repeated calls with equal arguments return the same object.

  Args:
    step_name: step name stored in the CounterName identifying the state.
    state_name: str. The backing counter is named ``state_name + '-msecs'``.
    io_target: optional value forwarded unchanged as the CounterName's
      ``io_target``.

  Returns:
    The cached or newly created scoped state for the counter name.
  """
  counter_name = CounterName(
      state_name + '-msecs',
      stage_name=self._prefix,
      step_name=step_name,
      io_target=io_target)

  # Create and cache the state only on first request for this counter name.
  if counter_name not in self._states_by_name:
    output_counter = self._counter_factory.get_counter(
        counter_name, Counter.SUM)
    self._states_by_name[counter_name] = super(
        StateSampler, self)._scoped_state(counter_name, output_counter)
  return self._states_by_name[counter_name]
def test_reset(self):
  """Checks that a reset counter reproduces its value on identical input."""
  name = CounterName(self.combiner.default_label, 'stage_foo', 'reset')
  counter = self.counter_factory.get_counter(name, self.combiner)

  def feed_inputs():
    # Same 0..99 sequence before and after the reset.
    for value in range(100):
      counter.update(value)

  feed_inputs()
  expected = counter.value()

  counter.reset()
  feed_inputs()

  self.assertEqual(expected, counter.value())
def test_basic_sampler(self):
  """Runs a three-state workload and checks the sampled state timings.

  The current state name is checked inside each scope. The per-state msecs
  counters are checked afterwards, within ``margin_of_error`` of their
  expected durations, and only when the fast sampler is in use.
  """

  def basic_step1(counter_label):
    # Every counter in this test lives under stage 'basic' and step 'step1'.
    return CounterName(counter_label, step_name='step1', stage_name='basic')

  # Set up state sampler.
  counter_factory = CounterFactory()
  sampler = statesampler.StateSampler(
      'basic', counter_factory, sampling_period_ms=1)

  # Duration of the fastest state. Total test duration is 6 times longer.
  state_duration_ms = 1000
  margin_of_error = 0.25

  # Run basic workload transitioning between 3 states.
  sampler.start()
  with sampler.scoped_state('step1', 'statea'):
    time.sleep(state_duration_ms / 1000)
    self.assertEqual(
        sampler.current_state().name, basic_step1('statea-msecs'))
    with sampler.scoped_state('step1', 'stateb'):
      time.sleep(state_duration_ms / 1000)
      self.assertEqual(
          sampler.current_state().name, basic_step1('stateb-msecs'))
      with sampler.scoped_state('step1', 'statec'):
        time.sleep(3 * state_duration_ms / 1000)
        self.assertEqual(
            sampler.current_state().name, basic_step1('statec-msecs'))
      # Back in 'stateb' after leaving 'statec'.
      time.sleep(state_duration_ms / 1000)

  sampler.stop()
  sampler.commit_counters()

  if not statesampler.FAST_SAMPLER:
    # The slow sampler does not implement sampling, so we won't test it.
    return

  # Test that sampled state timings are close to their expected values.
  # yapf: disable
  expected_counter_values = {
      basic_step1('statea-msecs'): state_duration_ms,
      basic_step1('stateb-msecs'): 2 * state_duration_ms,
      basic_step1('statec-msecs'): 3 * state_duration_ms,
  }
  # yapf: enable
  for counter in counter_factory.get_counters():
    self.assertIn(counter.name, expected_counter_values)
    expected_value = expected_counter_values[counter.name]
    actual_value = counter.value()
    deviation = float(abs(actual_value - expected_value)) / expected_value
    _LOGGER.info('Sampling deviation from expectation: %f', deviation)
    lower_bound = expected_value * (1.0 - margin_of_error)
    upper_bound = expected_value * (1.0 + margin_of_error)
    self.assertGreater(actual_value, lower_bound)
    self.assertLess(actual_value, upper_bound)
def test_basic_sampler(self):
  """Runs a three-state workload and checks the sampled state timings.

  The current state name is checked inside each scope. The per-state msecs
  counters are checked afterwards, within 25% of 100 / 200 / 300, and only
  when the fast sampler is in use.
  """

  def basic_step1(counter_label):
    # Every counter in this test lives under stage 'basic' and step 'step1'.
    return CounterName(counter_label, step_name='step1', stage_name='basic')

  # Set up state sampler.
  counter_factory = CounterFactory()
  sampler = statesampler.StateSampler(
      'basic', counter_factory, sampling_period_ms=1)

  # Run basic workload transitioning between 3 states.
  sampler.start()
  with sampler.scoped_state('step1', 'statea'):
    time.sleep(0.1)
    self.assertEqual(
        sampler.current_state().name, basic_step1('statea-msecs'))
    with sampler.scoped_state('step1', 'stateb'):
      # 'stateb' sleeps in two halves, one before and one after 'statec'.
      time.sleep(0.2 / 2)
      self.assertEqual(
          sampler.current_state().name, basic_step1('stateb-msecs'))
      with sampler.scoped_state('step1', 'statec'):
        time.sleep(0.3)
        self.assertEqual(
            sampler.current_state().name, basic_step1('statec-msecs'))
      time.sleep(0.2 / 2)

  sampler.stop()
  sampler.commit_counters()

  if not statesampler.FAST_SAMPLER:
    # The slow sampler does not implement sampling, so we won't test it.
    return

  # Test that sampled state timings are close to their expected values.
  expected_counter_values = {
      basic_step1('statea-msecs'): 100,
      basic_step1('stateb-msecs'): 200,
      basic_step1('statec-msecs'): 300,
  }
  for counter in counter_factory.get_counters():
    self.assertIn(counter.name, expected_counter_values)
    expected_value = expected_counter_values[counter.name]
    actual_value = counter.value()
    deviation = float(abs(actual_value - expected_value)) / expected_value
    logging.info('Sampling deviation from expectation: %f', deviation)
    lower_bound = expected_value * 0.75
    upper_bound = expected_value * 1.25
    self.assertGreater(actual_value, lower_bound)
    self.assertLess(actual_value, upper_bound)
def test_update_n(self):
  """Checks update_n(value, n) against n separate update(value) calls."""
  name = CounterName(self.combiner.default_label, 'stage_foo', 'update_n')
  counter = self.counter_factory.get_counter(name, self.combiner)

  # Reference run: value v is fed (100 - v) times, one update() per call.
  for value in range(100):
    for _ in range(100 - value):
      counter.update(value)
  expected = counter.value()

  counter.reset()

  # Same multiset of inputs, delivered through update_n.
  for value in range(100):
    counter.update_n(value, 100 - value)

  self.assertEqual(expected, counter.value())
def test_log_lull_in_bundle_processor(self):
  """Checks lull logging and when a full thread dump is requested.

  ``_log_lull_sampler_info`` is called three times with ``time.time`` patched
  to ``now``, ``now + 6 minutes`` and ``now + 21 minutes``. The first call
  must log a warning mentioning the state, the step and this test's frame.
  A full thread dump is expected on the first and third calls but not on the
  second.
  """
  bundle_processor_cache = mock.MagicMock()
  worker = SdkWorker(bundle_processor_cache)

  sampler_info = statesampler.StateSamplerInfo(
      CounterName('progress-msecs', 'stage_name', 'step_name'),
      1,
      400000000000,
      threading.current_thread())

  now = time.time()
  log_full_thread_dump_fn_name = (
      'apache_beam.runners.worker.sdk_worker.SdkWorker._log_full_thread_dump')

  # NOTE(review): the nesting of the three phases under the warning patch was
  # reconstructed from a flattened source line -- confirm against the file.
  with mock.patch('logging.Logger.warning') as warn_mock:
    # Phase 1: first lull report -> warning logged and thread dump taken.
    with mock.patch(log_full_thread_dump_fn_name) as log_full_thread_dump:
      with mock.patch('time.time') as time_mock:
        time_mock.return_value = now
        worker._log_lull_sampler_info(sampler_info)

        # Positional arguments of the most recent warning() call. Only read
        # them: assigning to warn_mock.call_args would corrupt the mock's
        # recorded call.
        warning_args = warn_mock.call_args[0]
        processing_template = warning_args[1]
        step_name_template = warning_args[2]
        traceback = warning_args[3]

        self.assertIn('progress-msecs', processing_template)
        self.assertIn('step_name', step_name_template)
        self.assertIn('test_log_lull_in_bundle_processor', traceback)

        log_full_thread_dump.assert_called_once_with()

    # Phase 2: 6 minutes later -> no new thread dump expected.
    with mock.patch(log_full_thread_dump_fn_name) as log_full_thread_dump:
      with mock.patch('time.time') as time_mock:
        time_mock.return_value = now + 6 * 60  # 6 minutes
        worker._log_lull_sampler_info(sampler_info)
        self.assertFalse(
            log_full_thread_dump.called,
            'log_full_thread_dump should not be called.')

    # Phase 3: 21 minutes after the first report -> thread dump taken again.
    with mock.patch(log_full_thread_dump_fn_name) as log_full_thread_dump:
      with mock.patch('time.time') as time_mock:
        time_mock.return_value = now + 21 * 60  # 21 minutes
        worker._log_lull_sampler_info(sampler_info)
        log_full_thread_dump.assert_called_once_with()
def update_current_step(self):
  """Update the current running step.

  Due to the fusion optimization, user code may choose to emit the data
  structure that holds side inputs (Iterable, Dict, or others). This call
  updates the current step, to attribute the data consumption to the step
  that is responsible for actual consumption.

  CounterName uses the io_target field for information pertinent to the
  consumption of side inputs.

  The step name is read from the state sampler's current state. Both
  ``self.scoped_state`` and ``self.bytes_read_counter`` are replaced; they
  stay attached to ``self.declaring_step`` and share one io_target.
  """
  current_state = self._state_sampler.current_state()
  operation_name = current_state.name.step_name

  # Build the side-input target once so the scoped state and the byte
  # counter are keyed by the same io_target.
  io_target = counters.side_input_id(operation_name, self.input_index)

  self.scoped_state = self._state_sampler.scoped_state(
      self.declaring_step, 'read-sideinput', io_target=io_target)
  self.bytes_read_counter = self._counter_factory.get_counter(
      CounterName(
          'read-sideinput-byte-count',
          step_name=self.declaring_step,
          io_target=io_target),
      Counter.SUM)
def __init__(
    self,
    fn,  # type: core.DoFn
    args,
    kwargs,
    side_inputs,  # type: Iterable[sideinputs.SideInputMap]
    windowing,
    tagged_receivers,  # type: Mapping[Optional[str], Receiver]
    step_name=None,  # type: Optional[str]
    logging_context=None,
    state=None,
    scoped_metrics_container=None,
    operation_name=None,
    user_state_context=None  # type: Optional[userstate.UserStateContext]
):
  """Initializes a DoFnRunner.

  Args:
    fn: user DoFn to invoke
    args: positional side input arguments (static and placeholder), if any
    kwargs: keyword side input arguments (static and placeholder), if any
    side_inputs: list of sideinput.SideInputMaps for deferred side inputs
    windowing: windowing properties of the output PCollection(s); its
      ``windowfn`` is handed to the output processor
    tagged_receivers: a dict of tag name to Receiver objects; the entry under
      the ``None`` key is used as the main receiver
    step_name: the name of this step
    logging_context: DEPRECATED [BEAM-4728]
    state: handle for accessing DoFn state; its ``_counter_factory`` is used
      when the 'outputs_per_element_counter' experiment is enabled
    scoped_metrics_container: DEPRECATED
    operation_name: The system name assigned by the runner for this
      operation.
    user_state_context: The UserStateContext instance for the current
      Stateful DoFn.

  Raises:
    Exception: if ``fn`` is a stateful DoFn and no ``user_state_context`` was
      supplied.
  """
  # Materialize the side inputs: they need to support multiple iterations.
  side_inputs = list(side_inputs)

  self.step_name = step_name
  self.context = DoFnContext(step_name, state=state)
  self.bundle_finalizer_param = DoFn.BundleFinalizerParam()

  do_fn_signature = DoFnSignature(fn)

  # Optimize for the common case.
  main_receivers = tagged_receivers[None]

  # The per-element output counter only exists behind an experiment flag.
  # TODO(BEAM-3937): Remove if block after output counter released.
  per_element_output_counter = None
  if 'outputs_per_element_counter' in RuntimeValueProvider.experiments:
    # TODO(BEAM-3955): Make step_name and operation_name less confused.
    output_counter_name = CounterName(
        'per-element-output-count', step_name=operation_name)
    per_element_output_counter = state._counter_factory.get_counter(
        output_counter_name, Counter.DATAFLOW_DISTRIBUTION).accumulator

  output_processor = _OutputProcessor(
      windowing.windowfn,
      main_receivers,
      tagged_receivers,
      per_element_output_counter)

  # A stateful DoFn cannot run without a user state context; fail early.
  missing_state_context = (
      do_fn_signature.is_stateful_dofn() and not user_state_context)
  if missing_state_context:
    raise Exception(
        'Requested execution of a stateful DoFn, but no user state context '
        'is available. This likely means that the current runner does not '
        'support the execution of stateful DoFns.')

  self.do_fn_invoker = DoFnInvoker.create_invoker(
      do_fn_signature,
      output_processor,
      self.context,
      side_inputs,
      args,
      kwargs,
      user_state_context=user_state_context,
      bundle_finalizer_param=self.bundle_finalizer_param)
def _get_state_sampler_info_for_lull(self, lull_duration_s):
  """Builds a StateSamplerInfo fixture for a lull of the given duration.

  Args:
    lull_duration_s: lull duration; going by the name, in seconds. It is
      multiplied by 1e9 before being passed on (presumably to nanoseconds --
      confirm against StateSamplerInfo).

  Returns:
    A statesampler.StateSamplerInfo for the fixed 'progress-msecs' counter
    name, bound to the calling thread.
  """
  state_name = CounterName('progress-msecs', 'stage_name', 'step_name')
  scaled_duration = lull_duration_s * 1e9
  return statesampler.StateSamplerInfo(
      state_name, 1, scaled_duration, threading.current_thread())