def process(
    self,
    element: Tuple[str, bytes],
    window=DoFn.WindowParam,
    values_state=DoFn.StateParam(VALUES_STATE),
    end_of_window_timer=DoFn.TimerParam(END_OF_WINDOW_TIMER)):
    """Buffers the element's payload in state and arms an end-of-window timer.

    Args:
      element: a (key, value) pair; only the value is buffered here.
      window: the window the element belongs to.
      values_state: bag state accumulating the raw byte payloads.
      end_of_window_timer: event-time timer fired at the window boundary.
    """
    logging.info('start process.')
    # The key is only used for state partitioning by the runner; the
    # value is what we buffer.
    _, payload = element
    values_state.add(payload)
    end_of_window_timer.set(window.end)
    logging.info('end process.')
def process(
    self,
    element,
    restriction_tracker=DoFn.RestrictionParam(ReadFilesProvider()),
    *args,
    **kwargs):
    """Splittable DoFn that yields stripped lines from the named file.

    Args:
      element: the file name (path) to read.
      restriction_tracker: tracker over byte offsets in the file; work is
        claimed one line at a time by its starting offset.

    Yields:
      Each non-empty line of the file, stripped of surrounding whitespace,
      until the restriction is exhausted or a checkpoint is taken.
    """
    file_name = element
    with open(file_name, 'rb') as file:
        pos = restriction_tracker.current_restriction().start
        if restriction_tracker.current_restriction().start > 0:
            # Restriction starts mid-file: back up one byte and consume the
            # remainder of the line that began before this restriction, so
            # we resume on a fresh line boundary.
            file.seek(restriction_tracker.current_restriction().start - 1)
            line = file.readline()
            pos = pos - 1 + len(line)
        output_count = 0
        while restriction_tracker.try_claim(pos):
            line = file.readline()
            len_line = len(line)
            line = line.strip()
            if not line:
                # EOF or a blank line: stop processing. Note: readline()
                # never returns None, so the original `if line is None`
                # branch was unreachable and has been removed.
                break
            yield line
            output_count += 1
            if self._resume_count and output_count == self._resume_count:
                # Emitted the configured number of lines for this bundle;
                # checkpoint and hand the rest of the restriction back.
                restriction_tracker.defer_remainder()
                break
            pos += len_line
def process(
    self,
    element,
    side1,
    side2,
    side3,
    window=beam.DoFn.WindowParam,
    restriction_tracker=DoFn.RestrictionParam(ExpandStringsProvider()),
    *args,
    **kwargs):
    """Expands `element` once per claimed restriction position and side value.

    Args:
      element: a (key, value) pair to expand.
      side1, side2, side3: iterable side inputs whose values are appended
        (':'-joined) to each output.
      window: the element's window; its start is recorded in the output
        when self._record_window is set.
      restriction_tracker: tracker over an integer offset range.

    Yields:
      One output per claimed offset (times one per side value, if any).
    """
    # Concatenate the three side inputs. `side` is already a list, so the
    # original redundant `side = list(side)` copy has been removed.
    side = []
    side.extend(side1)
    side.extend(side2)
    side.extend(side3)
    # Hoist the loop-invariant restriction lookup out of the loop.
    restriction = restriction_tracker.current_restriction()
    for i in range(restriction.start, restriction.stop):
        if not restriction_tracker.try_claim(i):
            break
        # The formatted base output does not depend on the side value, so
        # compute it once per claimed position rather than per side value.
        base = (
            element[0] + ':' + str(element[1]) + ':' + str(int(window.start))
            if self._record_window else element)
        if not side:
            yield base
        else:
            for val in side:
                yield base + ':' + val
def test_translate_portable_job_step_name(self):
    """Checks that portable (runner v2) step names map back to user names."""
    mock_client, mock_job_result = self.setup_mock_client_result(
        self.ONLY_COUNTERS_LIST)
    # Portable job submission requires the runner v2 experiments.
    pipeline_options = PipelineOptions([
        '--experiments=use_runner_v2',
        '--experiments=use_portable_job_submission',
        '--temp_location=gs://any-location/temp',
        '--project=dummy_project',
    ])
    pipeline = Pipeline(options=pipeline_options)
    pipeline | Create([1, 2, 3]) | 'MyTestParDo' >> ParDo(DoFn())  # pylint:disable=expression-not-assigned
    test_environment = DockerEnvironment(
        container_image='test_default_image')
    proto_pipeline, _ = pipeline.to_runner_api(
        return_context=True, default_environment=test_environment)
    job = apiclient.Job(pipeline_options, proto_pipeline)
    dm = dataflow_metrics.DataflowMetrics(mock_client, mock_job_result, job)
    # Portable step names look like 'ref_AppliedPTransform_<name>_<id>';
    # translation should recover the user-visible transform label.
    self.assertEqual(
        'MyTestParDo',
        dm._translate_step_name('ref_AppliedPTransform_MyTestParDo_14'))
def process(
    self,
    element,
    window=DoFn.WindowParam,
    element_state_0=DoFn.StateParam(ELEMENT_STATE_0),
    element_state_1=DoFn.StateParam(ELEMENT_STATE_1),
    element_state_2=DoFn.StateParam(ELEMENT_STATE_2),
    element_state_3=DoFn.StateParam(ELEMENT_STATE_3),
    count_state=DoFn.StateParam(COUNT_STATE),
    window_timer=DoFn.TimerParam(WINDOW_TIMER),
    buffering_timer=DoFn.TimerParam(BUFFERING_TIMER)):
    """Buffers the element and flushes once the batch size is reached.

    Elements are spread round-robin over four state cells; `count_state`
    tracks the batch size. A window timer always covers the window end,
    and a buffering timer bounds how long a partial batch may wait
    (presumably `max_buffering_duration_secs`, `clock` and `batch_size`
    come from the enclosing scope — confirm against the outer definition).
    """
    # Allowed lateness not supported in Python SDK
    # https://beam.apache.org/documentation/programming-guide/#watermarks-and-late-data
    window_timer.set(window.end)
    count_state.add(1)
    num_buffered = count_state.read()
    states = [
        element_state_0,
        element_state_1,
        element_state_2,
        element_state_3,
    ]
    # Round-robin the element into one of the four state cells.
    states[num_buffered % 4].add(element)
    if num_buffered == 1 and max_buffering_duration_secs > 0:
        # First element of a new batch: start the buffering-time limit if
        # one was configured.
        buffering_timer.set(clock() + max_buffering_duration_secs)
    if num_buffered >= batch_size:
        return self.flush_batch(states, count_state, buffering_timer)
def end_of_window(self, values_state=DoFn.StateParam(VALUES_STATE)):
    """Timer callback: tallies and logs the buffered values at window end.

    Args:
      values_state: bag state holding the byte payloads buffered by
        process().
    """
    logging.info('start end_of_window.')
    total_count = 0
    total_bytes = 0
    # Single pass over the buffered values, accumulating both totals.
    for buffered in values_state.read():
        total_count += 1
        total_bytes += len(buffered)
    logging.info('read_count: %s, read_bytes: %s', total_count, total_bytes)
    logging.info('end end_of_window.')
def on_window_timer(
    self,
    element_state_0=DoFn.StateParam(ELEMENT_STATE_0),
    element_state_1=DoFn.StateParam(ELEMENT_STATE_1),
    element_state_2=DoFn.StateParam(ELEMENT_STATE_2),
    element_state_3=DoFn.StateParam(ELEMENT_STATE_3),
    count_state=DoFn.StateParam(COUNT_STATE),
    buffering_timer=DoFn.TimerParam(BUFFERING_TIMER)):
    """Window-end timer callback: flushes whatever has been buffered."""
    states = [
        element_state_0,
        element_state_1,
        element_state_2,
        element_state_3,
    ]
    return self.flush_batch(states, count_state, buffering_timer)
def __init__(self, project, source_language_code, target_language_code):
    """Stores translation configuration for later use by the DoFn.

    Args:
      project: GCP project identifier.
      source_language_code: language code of the input text.
      target_language_code: language code to translate into.
    """
    # Use super() instead of the explicit DoFn.__init__(self) call — it is
    # equivalent here and cooperates correctly with the MRO.
    super().__init__()
    self._project = project
    self._source_language_code = source_language_code
    self._target_language_code = target_language_code