def deferred_status(self):
    # type: () -> Optional[Tuple[Any, Duration]]
    """Return the residual produced by ``defer_remainder()`` and its delay.

    After a self-checkpoint, the system needs the deferred residual together
    with a relative delay so that the runner can schedule the remaining work
    (the DelayedBundleApplication of a ProcessBundleResponse).

    The stored ``_deferred_timestamp`` is normalised to a ``Duration`` and
    written back onto the instance:

    * unset/falsy -> ``Duration()``;
    * an absolute ``Timestamp`` -> that timestamp minus ``Timestamp.now()``;
    * a ``Duration`` -> that duration minus the time elapsed since
      ``self._timestamp`` was recorded.

    NOTE(review): because the adjusted value replaces the stored one, a
    second call starts from the already-adjusted duration.

    Returns:
      ``(deferred_residual, time_delay)`` if there is a residual, else
      ``None``.
    """
    if not self._deferred_residual:
        return None

    requested = self._deferred_timestamp
    if not requested:
        # Nothing was requested: resume as soon as possible.
        self._deferred_timestamp = Duration()
    elif isinstance(requested, Timestamp):
        # Absolute resume time: convert to a delta from the current time.
        self._deferred_timestamp = requested - Timestamp.now()
    elif isinstance(requested, Duration):
        # Relative delay: discount the time already spent since the
        # checkpoint timestamp was recorded.
        elapsed = Timestamp.now() - self._timestamp
        self._deferred_timestamp -= elapsed

    return self._deferred_residual, self._deferred_timestamp
def defer_remainder(self, deferred_time=None):
    """Performs self-checkpoint on current processing restriction with an
    expected resuming time.

    Self-checkpoint could happen during processing elements. When executing
    a DoFn.process(), you may want to stop processing an element and resume
    later if the current element has been processed for quite a long time,
    or if you also want to have some outputs from other elements.
    ``defer_remainder()`` can be called on a per-element basis if needed.

    Args:
      deferred_time: A relative ``Duration`` that indicates the ideal time
        gap between now and resuming, or an absolute ``Timestamp`` for the
        resuming execution time. If ``deferred_time`` is None, the deferred
        work will be executed as soon as possible.

    Raises:
      ValueError: If ``deferred_time`` is truthy and is neither a
        ``Duration`` nor a ``Timestamp``.
    """
    # Validate before touching any state so that a rejected call leaves the
    # previously recorded timestamp untouched.
    if deferred_time and not isinstance(deferred_time, (Duration, Timestamp)):
        raise ValueError(
            'The timestamp of defer_remainder() should be a '
            'Duration or a Timestamp, or None.')

    with self._lock:
        # Record current time for calculating deferred_time later.
        self._timestamp = Timestamp.now()
        self._deferred_timestamp = deferred_time
        checkpoint = self.try_split(0)
        if checkpoint:
            # Only the second item of the split result is kept, as the
            # deferred residual.
            _, self._deferred_residual = checkpoint
def test_match_updated_files(self):
    """MatchContinuously with ``match_updated_files=True``.

    One temp file and a file named 'extra' exist before the pipeline runs.
    Every matched element re-creates 'extra' through the Map step. The
    expected output is the temp file once and the 'extra' path twice.
    """
    tempdir = '{}{}'.format(self._new_tempdir(), os.sep)
    extra_path = FileSystems.join(tempdir, 'extra')

    def _create_extra_file(element):
        # (Re)write the 'extra' file while the pipeline is running.
        FileSystems.create(extra_path).close()
        return element.path

    # Two files exist before the pipeline starts: a temp file and 'extra'.
    pre_existing = self._create_temp_file(dir=tempdir)
    FileSystems.create(extra_path).close()

    # 'extra' is expected twice: it is written again mid-pipeline.
    expected = [pre_existing, extra_path, extra_path]

    interval = 0.2
    start = Timestamp.now()
    stop = start + interval + 0.1

    with TestPipeline() as p:
        matched = (
            p
            | fileio.MatchContinuously(
                file_pattern=FileSystems.join(tempdir, '*'),
                interval=interval,
                start_timestamp=start,
                stop_timestamp=stop,
                match_updated_files=True)
            | beam.Map(_create_extra_file))

        assert_that(matched, equal_to(expected))
def _get_message_iter(self):
    """Returns an iterator of messages from the Spark server.

    Note that while message history is de-duped, this function's returned
    iterator may contain duplicate values.

    Each pass of the loop polls ``self._get_spark_status()``, yields a
    ``JobMessage`` when the response contains a ``'message'`` key, and then
    yields a ``(state, timestamp)`` tuple. The loop is ``while True`` with
    no break or return, so the consumer decides when to stop iterating.
    """
    # NOTE(review): the block nesting below was reconstructed from a
    # flattened copy of this function -- confirm that the final
    # ``yield state, timestamp`` is meant to run on every poll.
    sleep_secs = 1.0
    message_ix = 0
    while True:
        response = self._get_spark_status()
        state = self._get_beam_state(response)
        timestamp = Timestamp.now()
        message = None
        if 'message' in response:
            # Messages are reported as errors only when the job has FAILED.
            importance = (
                beam_job_api_pb2.JobMessage.MessageImportance.JOB_MESSAGE_ERROR
                if state == beam_job_api_pb2.JobState.FAILED else
                beam_job_api_pb2.JobMessage.MessageImportance.JOB_MESSAGE_BASIC)
            message = beam_job_api_pb2.JobMessage(
                message_id='message%d' % message_ix,
                time=str(int(timestamp)),
                importance=importance,
                message_text=response['message'])
            yield message
            message_ix += 1
            # TODO(BEAM-8983) In the event of a failure, query
            # additional info from Spark master and/or workers.
        # set_state() returns a timestamp only when the state changed, so
        # the history is appended to only on state transitions.
        check_timestamp = self.set_state(state)
        if check_timestamp is not None:
            if message:
                self._message_history.append(message)
            self._message_history.append((state, check_timestamp))
        yield state, timestamp
        # Back off: the delay grows by a factor of 1.2 per poll, capped at 60.
        sleep_secs = min(60, sleep_secs * 1.2)
        time.sleep(sleep_secs)
def __init__(self, job_id, job_name, pipeline, options):
    """Store the job's identity and inputs and seed its state history.

    The state history starts with a single ``(JobState.STOPPED, timestamp)``
    entry stamped with the current time.
    """
    self._job_id, self._job_name = job_id, job_name
    self._pipeline_proto, self._pipeline_options = pipeline, options

    initial_entry = (beam_job_api_pb2.JobState.STOPPED, Timestamp.now())
    self._state_history = [initial_entry]
def __init__(
        self,
        file_pattern,
        interval=360.0,
        has_deduplication=True,
        start_timestamp=None,
        stop_timestamp=MAX_TIMESTAMP,
        match_updated_files=False,
        apply_windowing=False):
    """Initializes a MatchContinuously transform.

    Args:
      file_pattern: The file path to read from.
      interval: Interval at which to check for files in seconds.
      has_deduplication: Whether files already read are discarded or not.
      start_timestamp: Timestamp for start file checking. Defaults to the
        time at which this transform is constructed.
      stop_timestamp: Timestamp after which no more files will be checked.
      match_updated_files: (When has_deduplication is set to True) whether
        match file with timestamp changes.
      apply_windowing: Whether each element should be assigned to
        individual window. If false, all elements will reside in global
        window.
    """
    # A ``Timestamp.now()`` default in the signature is evaluated only once,
    # when the function is defined, so every instance would share that
    # stale instant. Resolve the default per call instead.
    if start_timestamp is None:
        start_timestamp = Timestamp.now()

    self.file_pattern = file_pattern
    self.interval = interval
    self.has_deduplication = has_deduplication
    self.start_ts = start_timestamp
    self.stop_ts = stop_timestamp
    self.match_upd = match_updated_files
    self.apply_windowing = apply_windowing
def test_without_deduplication(self):
    """MatchContinuously with ``has_deduplication=False``.

    A temp file exists before the pipeline starts and a file named 'extra'
    is created by the Map step. The expected output lists the temp file
    twice and the 'extra' path once.
    """
    interval = 0.2
    start = Timestamp.now()
    stop = start + interval + 0.1

    tempdir = '{}{}'.format(self._new_tempdir(), os.sep)
    extra_path = FileSystems.join(tempdir, 'extra')

    # Present before the pipeline starts; listed twice because it will be
    # matched for every interval.
    existing = self._create_temp_file(dir=tempdir)
    # 'extra' only appears once the pipeline is running.
    expected = [existing, existing, extra_path]

    def _create_extra_file(element):
        FileSystems.create(extra_path).close()
        return element.path

    with TestPipeline() as p:
        matched = (
            p
            | fileio.MatchContinuously(
                file_pattern=FileSystems.join(tempdir, '*'),
                interval=interval,
                has_deduplication=False,
                start_timestamp=start,
                stop_timestamp=stop)
            | beam.Map(_create_extra_file))

        assert_that(matched, equal_to(expected))
def __init__(
        self,
        job_id,  # type: str
        job_name,  # type: Optional[str]
        pipeline,  # type: beam_runner_api_pb2.Pipeline
        options  # type: struct_pb2.Struct
):
    """Keep the job's id, name, pipeline proto and options.

    The state history is initialised with one entry: the STOPPED state
    paired with the current timestamp.
    """
    self._job_id, self._job_name = job_id, job_name
    self._pipeline_proto, self._pipeline_options = pipeline, options

    stopped_entry = (beam_job_api_pb2.JobState.STOPPED, Timestamp.now())
    self._state_history = [stopped_entry]
def set_state(self, new_state):
    """Record ``new_state`` in the state history if it differs from the
    latest recorded state.

    :param new_state: int latest state enum
    :return: the Timestamp recorded for the transition when the state
      changed; None when ``new_state`` equals the latest recorded state
      (nothing is appended in that case)
    """
    latest_state = self._state_history[-1][0]
    if new_state == latest_state:
        return None

    changed_at = Timestamp.now()
    self._state_history.append((new_state, changed_at))
    return changed_at
def __init__(
        self,
        start_timestamp=None,
        stop_timestamp=MAX_TIMESTAMP,
        fire_interval=360.0,
        apply_windowing=False):
    '''
    :param start_timestamp: Timestamp for first element. Defaults to the
      time at which this object is constructed.
    :param stop_timestamp: Timestamp after which no elements will be output.
    :param fire_interval: Interval at which to output elements.
    :param apply_windowing: Whether each element should be assigned to
      individual window. If false, all elements will reside in global window.
    '''
    # A ``Timestamp.now()`` default in the signature is evaluated only once,
    # when the function is defined, so it would not reflect the time of
    # construction. Resolve the default per call instead.
    if start_timestamp is None:
        start_timestamp = Timestamp.now()

    self.start_ts = start_timestamp
    self.stop_ts = stop_timestamp
    self.interval = fire_interval
    self.apply_windowing = apply_windowing
def __init__(
        self,
        file_pattern,
        interval=360.0,
        has_deduplication=True,
        start_timestamp=None,
        stop_timestamp=MAX_TIMESTAMP):
    """Initializes a MatchContinuously transform.

    Args:
      file_pattern: The file path to read from.
      interval: Interval at which to check for files in seconds.
      has_deduplication: Whether files already read are discarded or not.
      start_timestamp: Timestamp for start file checking. Defaults to the
        time at which this transform is constructed.
      stop_timestamp: Timestamp after which no more files will be checked.
    """
    # A ``Timestamp.now()`` default in the signature is evaluated only once,
    # when the function is defined, so every instance would share that
    # stale instant. Resolve the default per call instead.
    if start_timestamp is None:
        start_timestamp = Timestamp.now()

    self.file_pattern = file_pattern
    self.interval = interval
    self.has_deduplication = has_deduplication
    self.start_ts = start_timestamp
    self.stop_ts = stop_timestamp
def process(
        self,
        element,
        batch=DoFn.StateParam(BATCH),
        batchSize=DoFn.StateParam(BATCH_SIZE),
        flushTimer=DoFn.TimerParam(FLUSH_TIMER),
        endOfTime=DoFn.TimerParam(EOW_TIMER)):
    """Buffer ``element[1]`` and flush once the batch is full.

    The running count is kept in ``batchSize``. When no count is stored
    yet, both timers are armed: ``flushTimer`` at the current time plus
    ``self.maxWaitTime * 1000`` microseconds, and ``endOfTime`` at the
    global window's maximum timestamp.

    Returns:
      The result of ``self.flush(batch, batchSize)`` once the count reaches
      ``self.batchSize``; otherwise ``None``.
    """
    from apache_beam.utils.timestamp import Timestamp, Duration
    from apache_beam.transforms.window import GlobalWindow

    buffered = batchSize.read()
    if buffered:
        buffered += 1
    else:
        # First element of a new batch: start counting and arm the timers.
        buffered = 1
        # presumably maxWaitTime is in milliseconds -- TODO confirm
        max_wait = Duration(micros=self.maxWaitTime * 1000)
        flushTimer.set(Timestamp.now() + max_wait)
        endOfTime.set(GlobalWindow().max_timestamp())

    batchSize.write(buffered)
    batch.add(element[1])

    if buffered >= self.batchSize:
        return self.flush(batch, batchSize)
def test_now(self):
    """``Timestamp.now()`` returns a ``Timestamp`` instance."""
    now = Timestamp.now()
    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)), which only reports "False is not true".
    self.assertIsInstance(now, Timestamp)
def current_watermark(self):
    """Advance the stored timestamp to the current time and return it.

    The stored value never moves backwards: if ``Timestamp.now()`` is not
    greater than the stored timestamp, the stored timestamp is kept.
    """
    previous = self._timestamp
    wall_clock = Timestamp.now()
    self._timestamp = wall_clock if wall_clock > previous else previous
    return self._timestamp
def __init__(self, timestamp=None):
    """Start from ``timestamp`` if it is truthy, else from the current time."""
    if timestamp:
        self._timestamp = timestamp
    else:
        self._timestamp = Timestamp.now()
def test_advance_watermark_with_incorrect_sys_clock(self):
    """An estimator seeded ahead of the clock reports its seed unchanged.

    The initial timestamp is ``Duration(100)`` past ``Timestamp.now()``.
    Both the current watermark and the estimator state are expected to
    equal that initial value.
    """
    future_start = Timestamp.now() + Duration(100)
    estimator = WalltimeWatermarkEstimator(future_start)

    self.assertEqual(estimator.current_watermark(), future_start)
    self.assertEqual(estimator.get_estimator_state(), future_start)
def test_observe_timestamp(self):
    """Observing ``Timestamp(10)`` twice leaves the watermark at its seed.

    The estimator is seeded ``Duration(10)`` past ``Timestamp.now()`` and
    the watermark is expected to still equal that seed afterwards.
    """
    seeded_time = Timestamp.now() + Duration(10)
    estimator = WalltimeWatermarkEstimator(seeded_time)

    for _ in range(2):
        estimator.observe_timestamp(Timestamp(10))

    self.assertEqual(estimator.current_watermark(), seeded_time)
def test_initialization(self, mock_timestamp):
    """A default-constructed estimator starts from the mocked time.

    ``mock_timestamp`` is configured to return a fixed value, and the
    estimator state is expected to equal that value.
    """
    fixed_now = Timestamp.now() - Duration(10)

    def _fixed_clock():
        return fixed_now

    mock_timestamp.side_effect = _fixed_clock

    estimator = WalltimeWatermarkEstimator()

    self.assertIsInstance(estimator, WatermarkEstimator)
    self.assertEqual(estimator.get_estimator_state(), fixed_now)