Beispiel #1
0
 def process(self,
             element: Tuple[str, bytes],
             window=DoFn.WindowParam,
             values_state=DoFn.StateParam(VALUES_STATE),
             end_of_window_timer=DoFn.TimerParam(END_OF_WINDOW_TIMER)):
     """Store the value of a keyed element and set the end-of-window timer.

     Args:
       element: A ``(key, value)`` pair; only the value is stored.
       window: Window of the element; defaults to ``DoFn.WindowParam``.
       values_state: State handle the value is added to.
       end_of_window_timer: Timer that is set to ``window.end``.
     """
     logging.info('start process.')
     # The key is not used here; unpacking still requires a 2-item element.
     _key, payload = element
     end_of_window_timer.set(window.end)
     values_state.add(payload)
     logging.info('end process.')
    def process(self,
                element,
                restriction_tracker=DoFn.RestrictionParam(ReadFilesProvider()),
                *args,
                **kwargs):
        """Yield stripped lines of a file for each position claimed.

        The file named by ``element`` is opened in binary mode. Reading
        starts at the ``start`` of the tracker's current restriction and a
        line is emitted only after ``try_claim`` accepts the byte offset at
        which that line begins.

        Args:
          element: Path of the file to read.
          restriction_tracker: Tracker used to read the current restriction,
            claim offsets and defer the remainder.
          *args: Unused.
          **kwargs: Unused.

        Yields:
          Each line as ``bytes`` with surrounding whitespace stripped.
        """
        file_name = element

        with open(file_name, 'rb') as file:
            start = restriction_tracker.current_restriction().start
            pos = start
            if start > 0:
                # Step back one byte and consume through the end of that
                # line, so reading resumes at the first line that begins at
                # or after `start`.
                file.seek(start - 1)
                pos = start - 1 + len(file.readline())

            output_count = 0
            while restriction_tracker.try_claim(pos):
                raw_line = file.readline()
                line = raw_line.strip()
                if not line:
                    # Covers end of file (readline returned b'').
                    # NOTE(review): a whitespace-only line also stops the
                    # read here - confirm that this is intended.
                    break

                yield line
                output_count += 1

                # Hand the rest of the restriction back once the configured
                # number of lines has been emitted.
                if self._resume_count and output_count == self._resume_count:
                    restriction_tracker.defer_remainder()
                    break

                # Advance by the unstripped length so offsets stay in bytes.
                pos += len(raw_line)
 def process(self,
             element,
             side1,
             side2,
             side3,
             window=beam.DoFn.WindowParam,
             restriction_tracker=DoFn.RestrictionParam(
                 ExpandStringsProvider()),
             *args,
             **kwargs):
     """Emit the element once per claimed position and per side value.

     The three side inputs are concatenated into one list. For every
     position of the current restriction that the tracker lets us claim,
     the output prefix is either the element itself or, when
     ``self._record_window`` is truthy,
     ``element[0]:element[1]:int(window.start)``. With no side values the
     prefix is yielded as is; otherwise ``prefix + ':' + value`` is yielded
     for each side value.
     """
     side = [*side1, *side2, *side3]
     first = restriction_tracker.current_restriction().start
     last = restriction_tracker.current_restriction().stop
     for i in range(first, last):
         # Stop at the first position the tracker refuses.
         if not restriction_tracker.try_claim(i):
             break

         if self._record_window:
             prefix = (element[0] + ':' + str(element[1]) + ':' +
                       str(int(window.start)))
         else:
             prefix = element

         if not side:
             yield prefix
             continue

         for val in side:
             yield prefix + ':' + val
Beispiel #4
0
    def test_translate_portable_job_step_name(self):
        """A portable transform id is translated back to its user step name."""
        mock_client, mock_job_result = self.setup_mock_client_result(
            self.ONLY_COUNTERS_LIST)

        option_flags = [
            '--experiments=use_runner_v2',
            '--experiments=use_portable_job_submission',
            '--temp_location=gs://any-location/temp',
            '--project=dummy_project',
        ]
        pipeline_options = PipelineOptions(option_flags)

        # Build a small pipeline containing a single named ParDo step.
        pipeline = Pipeline(options=pipeline_options)
        numbers = pipeline | Create([1, 2, 3])
        numbers | 'MyTestParDo' >> ParDo(DoFn())  # pylint:disable=expression-not-assigned

        # Convert the pipeline to its runner API proto form.
        test_environment = DockerEnvironment(
            container_image='test_default_image')
        proto_pipeline, _ = pipeline.to_runner_api(
            return_context=True, default_environment=test_environment)

        job = apiclient.Job(pipeline_options, proto_pipeline)
        metrics = dataflow_metrics.DataflowMetrics(
            mock_client, mock_job_result, job)

        translated = metrics._translate_step_name(
            'ref_AppliedPTransform_MyTestParDo_14')
        self.assertEqual('MyTestParDo', translated)
Beispiel #5
0
    def process(
        self,
        element,
        window=DoFn.WindowParam,
        element_state_0=DoFn.StateParam(ELEMENT_STATE_0),
        element_state_1=DoFn.StateParam(ELEMENT_STATE_1),
        element_state_2=DoFn.StateParam(ELEMENT_STATE_2),
        element_state_3=DoFn.StateParam(ELEMENT_STATE_3),
        count_state=DoFn.StateParam(COUNT_STATE),
        window_timer=DoFn.TimerParam(WINDOW_TIMER),
        buffering_timer=DoFn.TimerParam(BUFFERING_TIMER)):
      """Buffer one element and flush once ``batch_size`` is reached.

      Sets the window timer to ``window.end``, adds 1 to ``count_state`` and
      stores the element in one of the four element states, chosen by the
      count modulo 4. When the count reads 1 and
      ``max_buffering_duration_secs`` is positive, the buffering timer is set
      to ``clock() + max_buffering_duration_secs``.

      Returns:
        The result of ``self.flush_batch(...)`` when the count has reached
        ``batch_size``; otherwise ``None``.
      """
      # Allowed lateness not supported in Python SDK
      # https://beam.apache.org/documentation/programming-guide/#watermarks-and-late-data
      window_timer.set(window.end)

      count_state.add(1)
      seen = count_state.read()

      shards = [
          element_state_0, element_state_1, element_state_2, element_state_3
      ]
      target = shards[seen % 4]
      target.add(element)

      is_first_in_batch = seen == 1
      if is_first_in_batch and max_buffering_duration_secs > 0:
        # First element of the batch: start counting buffering time, since a
        # limit was configured.
        deadline = clock() + max_buffering_duration_secs
        buffering_timer.set(deadline)

      batch_is_full = seen >= batch_size
      if not batch_is_full:
        return None
      return self.flush_batch(shards, count_state, buffering_timer)
Beispiel #6
0
    def end_of_window(self, values_state=DoFn.StateParam(VALUES_STATE)):
        """Log the number of buffered values and their combined length.

        Args:
          values_state: State handle whose ``read()`` yields the values.
        """
        logging.info('start end_of_window.')

        # One pass over the state: keep each value's length, then derive the
        # count and the total from that list.
        sizes = [len(item) for item in values_state.read()]
        read_count = len(sizes)
        read_bytes = sum(sizes)

        logging.info('read_count: %s, read_bytes: %s', read_count, read_bytes)
        logging.info('end end_of_window.')
Beispiel #7
0
    def on_window_timer(
        self,
        element_state_0=DoFn.StateParam(ELEMENT_STATE_0),
        element_state_1=DoFn.StateParam(ELEMENT_STATE_1),
        element_state_2=DoFn.StateParam(ELEMENT_STATE_2),
        element_state_3=DoFn.StateParam(ELEMENT_STATE_3),
        count_state=DoFn.StateParam(COUNT_STATE),
        buffering_timer=DoFn.TimerParam(BUFFERING_TIMER)):
      """Flush the buffered elements by delegating to ``self.flush_batch``.

      Returns:
        Whatever ``self.flush_batch`` returns for the four element states,
        the count state and the buffering timer.
      """
      return self.flush_batch(
          [element_state_0, element_state_1, element_state_2, element_state_3],
          count_state,
          buffering_timer)
Beispiel #8
0
 def __init__(self, project, source_language_code, target_language_code):
     """Store the project and the source/target language codes.

     Args:
       project: Value kept as ``self._project``.
       source_language_code: Value kept as ``self._source_language_code``.
       target_language_code: Value kept as ``self._target_language_code``.
     """
     DoFn.__init__(self)
     self._project = project
     self._source_language_code, self._target_language_code = (
         source_language_code, target_language_code)