def poll_status(self, context):
    retval = None
    if self.finished:
        logging.info("Job is already finished - skipping to result phase.")
        retval = self.return_value
    else:
        logging.info("Job is not finished - executing poll phase.")
        start_time = datetime.utcnow()
        i = 0
        # Bluecore App Engine backend instances timeout after an hour
        while retval is None:
            remaining_secs = self.appengine_timeout - (
                datetime.utcnow() - start_time).total_seconds()
            logging.info("%0.2f seconds remain until timeout" % remaining_secs)
            if remaining_secs <= 0:
                raise AirflowTaskTimeout()
            # try_xcom_pull allows us to distinguish between cases where the task
            # hasn't pushed an XCom and where the task pushed an XCom with value None.
            retval_tuple = try_xcom_pull(context=context, task_ids=self.task_id)
            # if XCom not yet pushed
            if not retval_tuple[0]:
                logging.info("XCom response not found. Sleeping.")
                # sleep with exponential backoff, capped at 60 seconds, and try again
                time.sleep(min(60, 2 ** i))
                i += 1
                continue
            # assign before logging, so the received value (not the stale None) is logged
            retval = retval_tuple[1]
            logging.info("XCom response received: %s" % str(retval))
            if retval == '__EXCEPTION__':
                self.retrieve_exception_details(context)
                break

    logging.info("Executing result phase.")
    if retval == '__EXCEPTION__':
        logging.error("Found exception %s: %s"
                      % (self.exc_type or '<UNKNOWN>',
                         self.exc_message or '<UNKNOWN>'))
        if self.exc_callstack:
            logging.error(str(self.exc_callstack))
        raise AirflowException(self.exc_message)
    logging.info("Remote task finished successfully.")
    return
def poll_status(self, context):
    start_time = datetime.utcnow()
    i = 0
    # Bluecore App Engine backend instances timeout after an hour
    while True:
        remaining_secs = self.appengine_timeout - (
            datetime.utcnow() - start_time).total_seconds()
        logging.info("%0.2f seconds remain until timeout" % remaining_secs)
        if remaining_secs <= 0:
            raise AirflowTaskTimeout()
        # try_xcom_pull allows us to distinguish between cases where the task
        # hasn't pushed an XCom and where the task pushed an XCom with value None.
        retval_tuple = try_xcom_pull(context=context, task_ids=self.task_id)
        # if XCom not yet pushed
        if not retval_tuple[0]:
            logging.info("XCom response not found. Sleeping.")
            # sleep for a while and try again
            time.sleep(min(60, 2 ** i))
            i += 1
            continue
        retval = retval_tuple[1]
        logging.info("XCom response received: %s" % str(retval))
        if retval == '__EXCEPTION__':
            exc_message = self.safe_xcom_pull(
                context=context, task_ids=self.task_id,
                key='__EXCEPTION_MESSAGE')
            exc_type = self.safe_xcom_pull(
                context=context, task_ids=self.task_id,
                key='__EXCEPTION_TYPE')
            exc_callstack = self.safe_xcom_pull(
                context=context, task_ids=self.task_id,
                key='__EXCEPTION_CALLSTACK')
            logging.error("Found exception %s: %s"
                          % (exc_type or '<UNKNOWN>',
                             exc_message or '<UNKNOWN>'))
            if exc_callstack:
                logging.error(str(exc_callstack))
            raise AirflowException(exc_message)
        return
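# Note: try_xcom_pull, used by both poll_status variants above, is a
# project-specific helper, not a standard Airflow API. A minimal sketch of
# its (found, value) contract, assuming an Airflow 1.x-era XCom.get_many;
# the helper's name, signature, and internals here are assumptions:
from airflow.models import XCom


def try_xcom_pull(context, task_ids, key='return_value'):
    # Plain ti.xcom_pull() returns None both when no XCom exists and when
    # the pushed value is None, so query the XCom table directly instead.
    ti = context['ti']
    rows = XCom.get_many(execution_date=ti.execution_date,
                         dag_ids=ti.dag_id,
                         task_ids=task_ids,
                         key=key,
                         limit=1)
    if not rows:
        return (False, None)  # nothing pushed yet
    return (True, rows[0].value)  # pushed; the value itself may be None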
def _monitor_logging(self, ci_hook, resource_group, name):
    last_state = None
    last_message_logged = None
    last_line_logged = None
    for _ in range(43200):  # roughly 12 hours at one poll per second
        try:
            state, exit_code, detail_status = ci_hook.get_state_exitcode_details(
                resource_group, name)
            if state != last_state:
                self.log.info("Container group state changed to %s", state)
                last_state = state

            messages = ci_hook.get_messages(resource_group, name)
            last_message_logged = self._log_last(messages, last_message_logged)

            if state in ["Running", "Terminated"]:
                try:
                    logs = ci_hook.get_logs(resource_group, name)
                    last_line_logged = self._log_last(logs, last_line_logged)
                except CloudError:
                    self.log.exception("Exception while getting logs from "
                                       "container instance, retrying...")

            if state == "Terminated":
                self.log.info("Container exited with detail_status %s",
                              detail_status)
                return exit_code

        except CloudError as err:
            if 'ResourceNotFound' in str(err):
                self.log.warning(
                    "ResourceNotFound, container is probably removed "
                    "by another process "
                    "(make sure that the name is unique).")
                return 1
            else:
                self.log.exception("Exception while getting container groups")
        except Exception:
            self.log.exception("Exception while getting container groups")

        sleep(1)

    # loop exhausted without a return -> the container is still running
    raise AirflowTaskTimeout("Did not complete on time")
def poll_status_files(self):
    success_file_name = '%s/succeeded' % self.job_id
    fail_file_name = '%s/failed' % self.job_id
    start_time = datetime.utcnow()
    i = 0
    # Bluecore App Engine backend instances timeout after an hour
    while (datetime.utcnow() - start_time).total_seconds() < 3600:
        # sleep with exponential backoff, capped at 60 seconds
        time.sleep(min(60, 5 * 2 ** i))
        i += 1
        if check_gcs_file_exists(success_file_name,
                                 self.google_cloud_conn_id, self.bucket):
            return
        if check_gcs_file_exists(fail_file_name,
                                 self.google_cloud_conn_id, self.bucket):
            raise AirflowException('found failure file %s/%s'
                                   % (self.bucket, fail_file_name))
    raise AirflowTaskTimeout()
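# check_gcs_file_exists above is likewise project-specific. A minimal
# sketch, assuming the Airflow 1.x contrib GoogleCloudStorageHook; the
# helper's name and argument order simply mirror the call sites above:
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook


def check_gcs_file_exists(object_name, google_cloud_conn_id, bucket):
    # Thin wrapper over the hook's exists() check for a single GCS object.
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=google_cloud_conn_id)
    return hook.exists(bucket, object_name)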
def wait_for_task_execution(self, task_execution_arn: str,
                            max_iterations: int = 2 * 180) -> bool:
    """
    Wait for Task Execution status to be complete (SUCCESS/ERROR).
    The ``task_execution_arn`` must exist, or a boto3 ClientError will be raised.

    :param str task_execution_arn: TaskExecutionArn
    :param int max_iterations: Maximum number of iterations before timing out.
    :return: Result of task execution.
    :rtype: bool
    :raises AirflowTaskTimeout: If maximum iterations is exceeded.
    :raises AirflowBadRequest: If ``task_execution_arn`` is empty.
    """
    if not task_execution_arn:
        raise AirflowBadRequest("task_execution_arn not specified")

    status = None
    iterations = max_iterations
    while status is None or status in self.TASK_EXECUTION_INTERMEDIATE_STATES:
        task_execution = self.get_conn().describe_task_execution(
            TaskExecutionArn=task_execution_arn)
        status = task_execution["Status"]
        self.log.info("status=%s", status)
        iterations -= 1
        if status in self.TASK_EXECUTION_FAILURE_STATES:
            break
        if status in self.TASK_EXECUTION_SUCCESS_STATES:
            break
        if iterations <= 0:
            break
        time.sleep(self.wait_interval_seconds)

    if status in self.TASK_EXECUTION_SUCCESS_STATES:
        return True
    if status in self.TASK_EXECUTION_FAILURE_STATES:
        return False
    if iterations <= 0:
        raise AirflowTaskTimeout("Max iterations exceeded!")
    raise AirflowException("Unknown status: %s" % status)  # Should never happen
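# A usage sketch for wait_for_task_execution, assuming an Airflow
# AWSDataSyncHook-style interface where start_task_execution returns the
# execution ARN; the import path varies by Airflow version, and the
# argument values here are illustrative only:
from airflow.providers.amazon.aws.hooks.datasync import AWSDataSyncHook

hook = AWSDataSyncHook(wait_interval_seconds=5)
task_execution_arn = hook.start_task_execution(task_arn=task_arn)
if hook.wait_for_task_execution(task_execution_arn):
    print("DataSync task execution succeeded")
else:
    print("DataSync task execution failed")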
def handle_timeout(self, signum, frame):
    _log.error("Process timed out")
    raise AirflowTaskTimeout(self.error_message)
def handle_timeout(self, signum, frame): """ Logs information and raises AirflowTaskTimeout. """ self.log.error("Process timed out, PID: %s", str(os.getpid())) raise AirflowTaskTimeout(self.error_message)
def handle_timeout(self, *args):  # pylint: disable=unused-argument
    """Logs information and raises AirflowTaskTimeout."""
    self.log.error("Process timed out, PID: %s", str(os.getpid()))
    raise AirflowTaskTimeout(self.error_message)
def handle_timeout(self, signum, frame):
    self.log.error("Process timed out, PID: %s", str(os.getpid()))
    raise AirflowTaskTimeout(self.error_message)
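# The handle_timeout variants above are SIGALRM handlers: in Airflow they
# live on a timeout context manager that arms an alarm on entry and disarms
# it on exit. A simplified sketch in that style (not a verbatim copy of
# airflow.utils.timeout):
import signal

from airflow.exceptions import AirflowTaskTimeout


class timeout:
    def __init__(self, seconds=1, error_message='Timeout'):
        self.seconds = seconds
        self.error_message = error_message

    def handle_timeout(self, signum, frame):
        raise AirflowTaskTimeout(self.error_message)

    def __enter__(self):
        # Signal handlers can only be installed in the main thread.
        signal.signal(signal.SIGALRM, self.handle_timeout)
        signal.alarm(self.seconds)

    def __exit__(self, exc_type, exc_value, tb):
        signal.alarm(0)  # disarm so the alarm cannot fire after the block


# e.g.:  with timeout(seconds=60, error_message='task timed out'):
#            run_task()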