Beispiel #1
0
    def wait_for_job_success(
        self,
        job_name,
        namespace,
        wait_timeout=DEFAULT_WAIT_TIMEOUT,
        wait_time_between_attempts=DEFAULT_WAIT_BETWEEN_ATTEMPTS,
        num_pods_to_wait_for=DEFAULT_JOB_POD_COUNT,
    ):
        '''Block until the named job exists and reports the expected number of succeeded pods.

        The wait has two phases that share one deadline measured with ``self.timer``
        from the moment this method starts: first the job has to show up in the
        namespace listing, then its status has to report the expected success count.

        Args:
            job_name (str): Name of the job to wait for.
            namespace (str): Namespace in which the job is located.
            wait_timeout (numeric, optional): Timeout after which to give up and raise exception.
                Defaults to DEFAULT_WAIT_TIMEOUT.
            wait_time_between_attempts (numeric, optional): Wait time between polling attempts.
                Defaults to DEFAULT_WAIT_BETWEEN_ATTEMPTS.
            num_pods_to_wait_for (int, optional): Value the job status ``succeeded`` count must
                equal for the wait to end. Defaults to DEFAULT_JOB_POD_COUNT.

        Raises:
            DagsterK8sError: Raised when wait_timeout is exceeded in either phase, or when the
                job status reports one or more failed pods.
        '''
        check.str_param(job_name, 'job_name')
        check.str_param(namespace, 'namespace')
        check.numeric_param(wait_timeout, 'wait_timeout')
        check.numeric_param(wait_time_between_attempts, 'wait_time_between_attempts')
        check.int_param(num_pods_to_wait_for, 'num_pods_to_wait_for')

        began = self.timer()

        def _out_of_time():
            # Both phases are measured against the same starting point.
            return self.timer() - began > wait_timeout

        # Phase 1: make sure the job we launched is visible in the namespace.
        launched = None
        while not launched:
            if _out_of_time():
                raise DagsterK8sError('Timed out while waiting for job to launch')

            listing = self.batch_api.list_namespaced_job(namespace=namespace)
            launched = None
            for candidate in listing.items:
                if candidate.metadata.name == job_name:
                    launched = candidate
                    break

            if not launched:
                self.logger('Job "{job_name}" not yet launched, waiting'.format(job_name=job_name))
                self.sleeper(wait_time_between_attempts)

        # Phase 2: poll the job status until it fails or enough pods have succeeded.
        while True:
            if _out_of_time():
                raise DagsterK8sError('Timed out while waiting for job to complete')

            # See: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.11/#jobstatus-v1-batch
            status = self.batch_api.read_namespaced_job_status(job_name, namespace=namespace).status

            # `failed` may be unset (None) while nothing has failed yet.
            if status.failed and status.failed > 0:
                raise DagsterK8sError('Encountered failed job pods with status: %s' % str(status))

            if status.succeeded == num_pods_to_wait_for:
                # done waiting for pod completion
                return

            self.sleeper(wait_time_between_attempts)
Beispiel #2
0
def retry_pg_connection_fn(fn, retry_limit=5, retry_wait=0.2):
    """Reusable retry logic for any psycopg2/sqlalchemy PG connection functions that may fail.

    Intended to be used anywhere we connect to PG, to gracefully handle transient connection issues.

    Args:
        fn (callable): Zero-argument callable to invoke.
        retry_limit (int): Number of retries allowed after the first failed call. Zero or a
            negative value means the first failure is raised without retrying.
        retry_wait (numeric): Seconds to sleep before each retry.

    Returns:
        Whatever ``fn`` returns on its first successful call.

    Raises:
        DagsterPostgresException: When ``fn`` still fails after the retries are used up; chained
            to the last underlying database error.
    """
    check.callable_param(fn, "fn")
    check.int_param(retry_limit, "retry_limit")
    check.numeric_param(retry_wait, "retry_wait")

    while True:
        try:
            return fn()
        except (
            # See: https://www.psycopg.org/docs/errors.html
            # These are broad, we may want to list out specific exceptions to capture
            psycopg2.DatabaseError,
            psycopg2.OperationalError,
            sqlalchemy.exc.DatabaseError,
            sqlalchemy.exc.OperationalError,
        ) as exc:
            # `<= 0` rather than `== 0`: a negative limit must give up, not retry forever.
            if retry_limit <= 0:
                raise DagsterPostgresException("too many retries for DB connection") from exc
            # Only announce a retry when one is actually going to happen.
            logging.warning("Retrying failed database connection")

        time.sleep(retry_wait)
        retry_limit -= 1
Beispiel #3
0
def retry_mysql_creation_fn(fn, retry_limit=5, retry_wait=0.2):
    """Call ``fn`` and retry it when it loses a table-creation race with another process.

    Retry logic to recover from the case where two processes are creating tables at the same
    time using sqlalchemy. Integrity errors and "table already exists" programming errors are
    retried; any other programming error is re-raised immediately.

    Args:
        fn (callable): Zero-argument callable to invoke.
        retry_limit (int): Number of retries allowed after the first failed call. Zero or a
            negative value means the first retryable failure is raised without retrying.
        retry_wait (numeric): Seconds to sleep before each retry.

    Returns:
        Whatever ``fn`` returns on its first successful call.

    Raises:
        DagsterMySQLException: When ``fn`` still fails with a retryable error after the retries
            are used up; chained to the last underlying error.
    """
    check.callable_param(fn, "fn")
    check.int_param(retry_limit, "retry_limit")
    check.numeric_param(retry_wait, "retry_wait")

    while True:
        try:
            return fn()
        except (
            mysql.ProgrammingError,
            mysql.IntegrityError,
            db.exc.ProgrammingError,
            db.exc.IntegrityError,
        ) as exc:
            # The only programming error worth retrying is "table already exists", which is
            # what a concurrent creator produces. Anything else is re-raised untouched.
            if (
                isinstance(exc, db.exc.ProgrammingError)
                and exc.orig
                and exc.orig.errno != mysql.errorcode.ER_TABLE_EXISTS_ERROR
            ) or (
                isinstance(exc, mysql.ProgrammingError)
                and exc.errno != mysql.errorcode.ER_TABLE_EXISTS_ERROR
            ):
                raise

            # `<= 0` rather than `== 0`: a negative limit must give up, not retry forever.
            if retry_limit <= 0:
                raise DagsterMySQLException("too many retries for DB creation") from exc
            # Only announce a retry when one is actually going to happen.
            logging.warning("Retrying failed database creation")

        time.sleep(retry_wait)
        retry_limit -= 1
Beispiel #4
0
    def wait_for_job(
        self,
        job_name,
        namespace,
        wait_timeout=DEFAULT_WAIT_TIMEOUT,
        wait_time_between_attempts=DEFAULT_WAIT_BETWEEN_ATTEMPTS,
        start_time=None,
    ):
        """Poll until a job with the given name is listed in the namespace.

        Args:
            job_name (str): Name of the job to wait for.
            namespace (str): Namespace in which the job is located.
            wait_timeout (numeric, optional): Timeout after which to give up and raise exception.
                Defaults to DEFAULT_WAIT_TIMEOUT.
            wait_time_between_attempts (numeric, optional): Wait time between polling attempts. Defaults
                to DEFAULT_WAIT_BETWEEN_ATTEMPTS. Also passed to k8s_api_retry as its ``timeout``.
            start_time (numeric, optional): Reading of ``self.timer`` to measure the timeout from.
                When None, the timer is read on entry.

        Raises:
            DagsterK8sTimeoutError: Raised when wait_timeout is exceeded before the job is listed.
        """
        check.str_param(job_name, "job_name")
        check.str_param(namespace, "namespace")
        check.numeric_param(wait_timeout, "wait_timeout")
        check.numeric_param(wait_time_between_attempts,
                            "wait_time_between_attempts")

        # Compare against None so that an explicit start_time of 0 is honoured.
        start = self.timer() if start_time is None else start_time

        # Defined once, outside the polling loop, since it does not depend on loop state.
        def _get_jobs_for_namespace():
            # Ask the API server for just the job with this name.
            jobs = self.batch_api.list_namespaced_job(
                namespace=namespace,
                field_selector="metadata.name={}".format(job_name))
            if not jobs.items:
                return None
            check.invariant(
                len(jobs.items) == 1,
                'There should only be one k8s job with name "{}", but got multiple jobs: {}'
                .format(job_name, jobs.items),
            )
            return jobs.items[0]

        job = None
        while not job:
            if self.timer() - start > wait_timeout:
                raise DagsterK8sTimeoutError(
                    "Timed out while waiting for job {job_name}"
                    " to launch".format(job_name=job_name))

            job = k8s_api_retry(_get_jobs_for_namespace,
                                max_retries=3,
                                timeout=wait_time_between_attempts)

            if not job:
                self.logger(
                    'Job "{job_name}" not yet launched, waiting'.format(
                        job_name=job_name))
                self.sleeper(wait_time_between_attempts)
Beispiel #5
0
def retry_mysql_connection_fn(fn, retry_limit=5, retry_wait=0.2):
    """Reusable retry logic for any MySQL connection functions that may fail.

    Intended to be used anywhere we connect to MySQL, to gracefully handle transient connection
    issues.

    Args:
        fn (callable): Zero-argument callable to invoke.
        retry_limit (int): Number of retries allowed after the first failed call. Zero or a
            negative value means the first failure is raised without retrying.
        retry_wait (numeric): Seconds to sleep before each retry.

    Returns:
        Whatever ``fn`` returns on its first successful call.

    Raises:
        DagsterMySQLException: When ``fn`` still fails after the retries are used up; chained to
            the last underlying database error.
    """
    check.callable_param(fn, "fn")
    check.int_param(retry_limit, "retry_limit")
    check.numeric_param(retry_wait, "retry_wait")

    while True:
        try:
            return fn()

        except (
            mysql.DatabaseError,
            mysql.OperationalError,
            db.exc.DatabaseError,
            db.exc.OperationalError,
            mysql.errors.InterfaceError,
        ) as exc:
            # `<= 0` rather than `== 0`: a negative limit must give up, not retry forever.
            if retry_limit <= 0:
                raise DagsterMySQLException("too many retries for DB connection") from exc
            # Only announce a retry when one is actually going to happen.
            logging.warning("Retrying failed database connection")

        time.sleep(retry_wait)
        retry_limit -= 1
Beispiel #6
0
def retry_pg_creation_fn(fn, retry_limit=5, retry_wait=0.2):
    """Call ``fn`` and retry it when it loses a table-creation race with another process.

    Retry logic to recover from the case where two processes are creating tables at the same
    time using sqlalchemy. Integrity errors and DUPLICATE_TABLE programming errors are retried;
    any other programming error is re-raised immediately.

    Args:
        fn (callable): Zero-argument callable to invoke.
        retry_limit (int): Number of retries allowed after the first failed call. Zero or a
            negative value means the first retryable failure is raised without retrying.
        retry_wait (numeric): Seconds to sleep before each retry.

    Returns:
        Whatever ``fn`` returns on its first successful call.

    Raises:
        DagsterPostgresException: When ``fn`` still fails with a retryable error after the
            retries are used up; chained to the last underlying error.
    """
    check.callable_param(fn, "fn")
    check.int_param(retry_limit, "retry_limit")
    check.numeric_param(retry_wait, "retry_wait")

    while True:
        try:
            return fn()
        except (
            psycopg2.ProgrammingError,
            psycopg2.IntegrityError,
            sqlalchemy.exc.ProgrammingError,
            sqlalchemy.exc.IntegrityError,
        ) as exc:
            # Only programming error we want to retry on is the DuplicateTable error
            if (
                isinstance(exc, sqlalchemy.exc.ProgrammingError)
                and exc.orig
                and exc.orig.pgcode != psycopg2.errorcodes.DUPLICATE_TABLE
            ) or (
                isinstance(exc, psycopg2.ProgrammingError)
                and exc.pgcode != psycopg2.errorcodes.DUPLICATE_TABLE
            ):
                raise

            # `<= 0` rather than `== 0`: a negative limit must give up, not retry forever.
            if retry_limit <= 0:
                raise DagsterPostgresException("too many retries for DB creation") from exc
            # Only announce a retry when one is actually going to happen.
            logging.warning("Retrying failed database creation")

        time.sleep(retry_wait)
        retry_limit -= 1
Beispiel #7
0
def retry_pg_connection_fn(fn, retry_limit=5, retry_wait=0.2):
    """
    Invoke ``fn`` until it succeeds, sleeping between failed attempts.

    Reusable retry logic for any psycopg2/sqlalchemy PG connection functions that may fail.
    Intended to be used anywhere we connect to PG, to gracefully handle transient connection issues.

    Every caught failure is logged. Once the number of attempts made exceeds ``retry_limit``
    a DagsterPostgresException chained to the last error is raised. The sleep between
    attempts is whatever ``calculate_delay`` returns for the attempt number, with
    ``retry_wait`` as the base delay.
    """
    check.callable_param(fn, "fn")
    check.int_param(retry_limit, "retry_limit")
    check.numeric_param(retry_wait, "retry_wait")

    attempts_made = 0
    while True:
        attempts_made += 1
        try:
            outcome = fn()
        except (
            # See: https://www.psycopg.org/docs/errors.html
            # These are broad, we may want to list out specific exceptions to capture
            psycopg2.DatabaseError,
            psycopg2.OperationalError,
            sqlalchemy.exc.DatabaseError,
            sqlalchemy.exc.OperationalError,
        ) as exc:
            logging.warning("Retrying failed database connection: %s", exc)
            if attempts_made > retry_limit:
                raise DagsterPostgresException("too many retries for DB connection") from exc
        else:
            return outcome

        pause = calculate_delay(
            attempt_num=attempts_made,
            base_delay=retry_wait,
            jitter=Jitter.PLUS_MINUS,
            backoff=Backoff.EXPONENTIAL,
        )
        time.sleep(pause)
Beispiel #8
0
    def run(self, interval_seconds=2):
        """
        Run the coordinator daemon.

        Calls ``attempt_to_launch_runs`` over and over, sleeping after each call.
        The loop has no exit condition, so this method does not return normally.

        Arguments:
            interval_seconds (float): time in seconds to wait between dequeuing attempts
        """
        pause = check.numeric_param(interval_seconds, "interval_seconds")

        while True:
            self.attempt_to_launch_runs()
            time.sleep(pause)
Beispiel #9
0
    def __init__(self, interval_seconds):
        """Set up the logger, the validated interval and empty bookkeeping fields.

        Args:
            interval_seconds (numeric): Stored on the instance after validation.
        """
        # The logger is named after the concrete class.
        own_class_name = type(self).__name__
        self._logger = get_default_daemon_logger(own_class_name)
        self.interval_seconds = check.numeric_param(interval_seconds, "interval_seconds")

        # All bookkeeping fields start out unset.
        self._last_iteration_time = self._last_heartbeat_time = None
        self._current_iteration_exceptions = self._last_iteration_exceptions = None
Beispiel #10
0
def k8s_api_retry(
    fn,
    max_retries,
    timeout,
    msg_fn=lambda: "Unexpected error encountered in Kubernetes API Client.",
):
    """Call ``fn``, retrying on Kubernetes ApiExceptions whose status code is whitelisted.

    Args:
        fn (callable): Zero-argument callable that performs the API request.
        max_retries (int): Number of retries allowed after the first attempt. A negative value
            is treated as zero, so ``fn`` is always called at least once.
        timeout (numeric): Seconds to sleep before each retry.
        msg_fn (callable): Zero-argument callable producing the message for the raised error.

    Returns:
        Whatever ``fn`` returns on its first successful call.

    Raises:
        DagsterK8sAPIRetryLimitExceeded: A whitelisted status code was still being returned on
            the last allowed attempt.
        DagsterK8sUnrecoverableAPIError: The ApiException status code is not whitelisted.
    """
    check.callable_param(fn, "fn")
    check.int_param(max_retries, "max_retries")
    check.numeric_param(timeout, "timeout")

    # Clamp so a negative max_retries cannot skip the loop and silently return None
    # without ever calling fn.
    remaining_attempts = 1 + max(max_retries, 0)
    while remaining_attempts > 0:
        remaining_attempts -= 1

        try:
            return fn()
        except kubernetes.client.rest.ApiException as e:
            # Only whitelisted ApiExceptions are eligible for a retry.
            if e.status not in WHITELISTED_TRANSIENT_K8S_STATUS_CODES:
                raise_from(
                    DagsterK8sUnrecoverableAPIError(
                        msg_fn(),
                        k8s_api_exception=e,
                        original_exc_info=sys.exc_info(),
                    ),
                    e,
                )

            if remaining_attempts == 0:
                raise_from(
                    DagsterK8sAPIRetryLimitExceeded(
                        msg_fn(),
                        k8s_api_exception=e,
                        max_retries=max_retries,
                        original_exc_info=sys.exc_info(),
                    ),
                    e,
                )

            # Attempts remain: swallow the error and try again after a pause.
            time.sleep(timeout)
Beispiel #11
0
    def wait_for_job(
        self,
        job_name,
        namespace,
        wait_timeout=DEFAULT_WAIT_TIMEOUT,
        wait_time_between_attempts=DEFAULT_WAIT_BETWEEN_ATTEMPTS,
    ):
        """Poll the namespace until a job with the given name is listed.

        Args:
            job_name (str): Name of the job to wait for.
            namespace (str): Namespace in which the job is located.
            wait_timeout (numeric, optional): Timeout after which to give up and raise exception.
                Defaults to DEFAULT_WAIT_TIMEOUT.
            wait_time_between_attempts (numeric, optional): Wait time between polling attempts. Defaults
                to DEFAULT_WAIT_BETWEEN_ATTEMPTS.

        Raises:
            DagsterK8sError: Raised when wait_timeout is exceeded before the job is listed.
        """
        check.str_param(job_name, "job_name")
        check.str_param(namespace, "namespace")
        check.numeric_param(wait_timeout, "wait_timeout")
        check.numeric_param(wait_time_between_attempts, "wait_time_between_attempts")

        began = self.timer()
        found = None

        # Keep listing the namespace until the job we launched shows up.
        while not found:
            elapsed = self.timer() - began
            if elapsed > wait_timeout:
                raise DagsterK8sError("Timed out while waiting for job to launch")

            listing = self.batch_api.list_namespaced_job(namespace=namespace)
            found = None
            for candidate in listing.items:
                if candidate.metadata.name == job_name:
                    found = candidate
                    break

            if found:
                continue

            self.logger('Job "{job_name}" not yet launched, waiting'.format(job_name=job_name))
            self.sleeper(wait_time_between_attempts)
Beispiel #12
0
    def __init__(self, interval_seconds):
        """Set up the logger, the validated interval and empty bookkeeping fields.

        Args:
            interval_seconds (numeric): Stored on the instance after validation.
        """
        # The logger is named after the concrete class.
        own_class_name = type(self).__name__
        self._logger = get_default_daemon_logger(own_class_name)
        self.interval_seconds = check.numeric_param(interval_seconds, "interval_seconds")

        # Timestamps start out unset.
        self._last_iteration_time = self._last_heartbeat_time = None

        # (SerializableErrorInfo, timestamp) tuples
        self._errors = list()
        self._first_error_logged = False
Beispiel #13
0
 def numeric_column(
         name,
         expected_dtypes,
         min_value=-float('inf'),
         max_value=float('inf'),
         non_nullable=False,
         unique=False,
 ):
     """Build a PandasColumn carrying a type constraint and a range constraint.

     Args:
         name (str): Column name; validated as a string.
         expected_dtypes: Passed unchanged to ColumnTypeConstraint.
         min_value (numeric): First bound given to InRangeColumnConstraint.
         max_value (numeric): Second bound given to InRangeColumnConstraint.
         non_nullable (bool): Forwarded to _construct_keyword_constraints.
         unique (bool): Forwarded to _construct_keyword_constraints.

     Returns:
         PandasColumn: Column whose constraints are the type constraint, the range
         constraint and whatever the keyword flags contribute, in that order.
     """
     column_name = check.str_param(name, 'name')
     dtype_constraint = ColumnTypeConstraint(expected_dtypes)
     lower_bound = check.numeric_param(min_value, 'min_value')
     upper_bound = check.numeric_param(max_value, 'max_value')
     range_constraint = InRangeColumnConstraint(lower_bound, upper_bound)
     keyword_constraints = _construct_keyword_constraints(non_nullable=non_nullable,
                                                          unique=unique)
     return PandasColumn(
         name=column_name,
         constraints=[dtype_constraint, range_constraint] + keyword_constraints,
     )
Beispiel #14
0
    def float_column(
        name,
        min_value=-float("inf"),
        max_value=float("inf"),
        non_nullable=False,
        unique=False,
        ignore_missing_vals=False,
        is_required=None,
    ):
        """
        Simple constructor for PandasColumns that expresses numeric constraints on float dtypes.

        Args:
            name (str): Name of the column. This must match up with the column name in the dataframe you
                expect to receive.
            min_value (Optional[Union[int,float]]): The lower bound for values you expect in this column.
                Defaults to -float('inf')
            max_value (Optional[Union[int,float]]): The upper bound for values you expect in this column.
                Defaults to float('inf')
            non_nullable (Optional[bool]): If true, this column will enforce a constraint that all values
                in the column ought to be non null values.
            unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the
                column values.
            ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true,
                the constraint will only evaluate non-null data. Ignore_missing_vals and non_nullable
                cannot both be True.
            is_required (Optional[bool]): Flag indicating the optional/required presence of the column.
                Forwarded unchanged to PandasColumn.
        """
        column_name = check.str_param(name, "name")

        # The dtype check comes first, then the range check, then the keyword-driven constraints.
        dtype_constraint = ColumnDTypeFnConstraint(is_float_dtype)
        lower_bound = check.numeric_param(min_value, "min_value")
        upper_bound = check.numeric_param(max_value, "max_value")
        range_constraint = InRangeColumnConstraint(
            lower_bound,
            upper_bound,
            ignore_missing_vals=ignore_missing_vals,
        )
        keyword_constraints = _construct_keyword_constraints(
            non_nullable=non_nullable,
            unique=unique,
            ignore_missing_vals=ignore_missing_vals,
        )

        return PandasColumn(
            name=column_name,
            constraints=[dtype_constraint, range_constraint] + keyword_constraints,
            is_required=is_required,
        )
Beispiel #15
0
 def numeric_column(
         cls,
         name,
         expected_dtypes,
         min_value=-float('inf'),
         max_value=float('inf'),
         exists=False,
         unique=False,
 ):
     """Build an instance of ``cls`` carrying a type constraint and a range constraint.

     Args:
         name (str): Column name; validated as a string.
         expected_dtypes: Passed unchanged to ColumnTypeConstraint.
         min_value (numeric): First bound given to InRangeColumnConstraint.
         max_value (numeric): Second bound given to InRangeColumnConstraint.
         exists (bool): Forwarded to ``cls.add_configurable_constraints``.
         unique (bool): Forwarded to ``cls.add_configurable_constraints``.

     Returns:
         An instance of ``cls`` whose constraints are the result of
         ``cls.add_configurable_constraints`` applied to the two base constraints.
     """
     column_name = check.str_param(name, 'name')
     dtype_constraint = ColumnTypeConstraint(expected_dtypes)
     lower_bound = check.numeric_param(min_value, 'min_value')
     upper_bound = check.numeric_param(max_value, 'max_value')
     base_constraints = [
         dtype_constraint,
         InRangeColumnConstraint(lower_bound, upper_bound),
     ]
     all_constraints = cls.add_configurable_constraints(
         base_constraints,
         exists=exists,
         unique=unique,
     )
     return cls(name=column_name, constraints=all_constraints)
Beispiel #16
0
    def __init__(self, address, timeout, inst_data=None):
        """Validate and store the connection settings.

        Args:
            address (str): URL to connect to; must parse with both a scheme and a network location.
            timeout (numeric): Stored on the instance after validation.
            inst_data (Optional[ConfigurableClassData]): Stored on the instance after validation.
        """
        self._inst_data = check.opt_inst_param(inst_data, 'inst_data', ConfigurableClassData)
        self._address = check.str_param(address, 'address')
        self._timeout = check.numeric_param(timeout, 'timeout')

        # Nothing is connected or validated yet.
        self._handle = self._instance = None
        self._validated = False

        # Reject an address that lacks either a scheme or a host, e.g. a bare "localhost".
        url_parts = urlparse(address)
        has_scheme_and_host = url_parts.scheme and url_parts.netloc
        check.invariant(
            has_scheme_and_host,
            'Address {address} is not a valid URL. Host URL should include scheme ie http://localhost'.format(
                address=self._address
            ),
        )
Beispiel #17
0
    def __init__(
        self,
        host,
        token,
        poll_interval_sec=10,
        max_wait_time_sec=_DEFAULT_RUN_MAX_WAIT_TIME_SEC,
    ):
        """Validate the settings and create the underlying DatabricksClient.

        Args:
            host (str): Databricks host, e.g. https://uksouth.azuredatabricks.net
            token (str): Databricks token
            poll_interval_sec (numeric): Stored on the instance after validation. Defaults to 10.
            max_wait_time_sec (int): Stored on the instance after validation; must be an int.
                Defaults to _DEFAULT_RUN_MAX_WAIT_TIME_SEC.
        """
        checked_host = check.str_param(host, "host")
        self.host = checked_host
        checked_token = check.str_param(token, "token")
        self.token = checked_token
        self.poll_interval_sec = check.numeric_param(poll_interval_sec, "poll_interval_sec")
        self.max_wait_time_sec = check.int_param(max_wait_time_sec, "max_wait_time_sec")

        # The client is built from the validated values stored above.
        self._client = DatabricksClient(host=self.host, token=self.token)
Beispiel #18
0
def wait_for_pod(
    pod_name,
    namespace,
    wait_for_state=WaitForPodState.Ready,
    wait_timeout=DEFAULT_WAIT_TIMEOUT,
    wait_time_between_attempts=DEFAULT_WAIT_BETWEEN_ATTEMPTS,
):
    '''Wait for a pod to launch and be running, or wait for termination (useful for job pods).

    Only the first entry of the pod's container statuses is inspected. A container that has
    terminated with exit code 0 ends the wait whichever state was requested.

    Args:
        pod_name (str): Name of the pod to wait for.
        namespace (str): Namespace in which the pod is located.
        wait_for_state (WaitForPodState, optional): Whether to wait for pod readiness or
            termination. Defaults to waiting for readiness.
        wait_timeout (numeric, optional): Timeout after which to give up and raise exception.
            Defaults to DEFAULT_WAIT_TIMEOUT.
        wait_time_between_attempts (numeric, optional): Wait time between polling attempts. Defaults
            to DEFAULT_WAIT_BETWEEN_ATTEMPTS.

    Raises:
        DagsterK8sError: Raised when wait_timeout is exceeded, when the container is waiting for
            a reason other than initialization or creation, or when it terminated with a
            non-zero exit code.
    '''
    check.str_param(pod_name, 'pod_name')
    check.str_param(namespace, 'namespace')
    check.inst_param(wait_for_state, 'wait_for_state', WaitForPodState)
    check.numeric_param(wait_timeout, 'wait_timeout')
    check.numeric_param(wait_time_between_attempts,
                        'wait_time_between_attempts')

    logging.info('Waiting for pod %s', pod_name)

    start = time.time()

    # The API client does not change between polls, so build it once instead of per iteration.
    core_api = kubernetes.client.CoreV1Api()

    while True:
        pods = core_api.list_namespaced_pod(
            namespace=namespace,
            field_selector='metadata.name=%s' % pod_name).items
        pod = pods[0] if pods else None

        # Checked after the lookup so the error can include the latest pod info.
        if time.time() - start > wait_timeout:
            raise DagsterK8sError(
                'Timed out while waiting for pod to become ready with pod info: %s'
                % str(pod))

        if pod is None:
            logging.info('Waiting for pod "%s" to launch...', pod_name)
            time.sleep(wait_time_between_attempts)
            continue

        if not pod.status.container_statuses:
            logging.info(
                'Waiting for pod container status to be set by kubernetes...')
            time.sleep(wait_time_between_attempts)
            continue

        # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#containerstatus-v1-core
        container_status = pod.status.container_statuses[0]

        # State checks below, see:
        # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#containerstate-v1-core
        state = container_status.state

        if state.running is not None:
            if wait_for_state == WaitForPodState.Ready:
                # ready is boolean field of container status
                if container_status.ready:
                    logging.info('Pod "%s" is ready, done waiting', pod_name)
                    return
                logging.info('Waiting for pod "%s" to become ready...', pod_name)
                time.sleep(wait_time_between_attempts)
                continue
            elif wait_for_state == WaitForPodState.Terminated:
                # Still running: keep polling until the container terminates.
                time.sleep(wait_time_between_attempts)
                continue
            else:
                raise DagsterK8sError('Unknown wait for state %s' %
                                      str(wait_for_state.value))

        elif state.waiting is not None:
            # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#containerstatewaiting-v1-core
            if state.waiting.reason == 'PodInitializing':
                logging.info('Waiting for pod "%s" to initialize...', pod_name)
                time.sleep(wait_time_between_attempts)
                continue
            elif state.waiting.reason == 'ContainerCreating':
                logging.info('Waiting for container creation...')
                time.sleep(wait_time_between_attempts)
                continue
            elif state.waiting.reason in [
                    'ErrImagePull',
                    'ImagePullBackOff',
                    'CrashLoopBackOff',
                    'RunContainerError',
            ]:
                raise DagsterK8sError('Failed: %s' % state.waiting.message)
            else:
                raise DagsterK8sError('Unknown issue: %s' % state.waiting)

        # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#containerstateterminated-v1-core
        elif state.terminated is not None:
            if not state.terminated.exit_code == 0:
                raw_logs = retrieve_pod_logs(pod_name, namespace)
                raise DagsterK8sError(
                    'Pod did not exit successfully. Failed with message: %s and pod logs: %s'
                    % (state.terminated.message, str(raw_logs)))
            # Clean exit: nothing left to wait for.
            return

        else:
            raise DagsterK8sError('Should not get here, unknown pod state')
Beispiel #19
0
 def __init__(self, interval_seconds):
     """Store the validated interval, then run the parent initializer.

     Args:
         interval_seconds (numeric): Stored on the instance after validation.
     """
     checked_interval = check.numeric_param(interval_seconds, "interval_seconds")
     self.interval_seconds = checked_interval
     super().__init__()
Beispiel #20
0
    def __init__(
        self,
        instance,
        daemons,
        gen_workspace,
        heartbeat_interval_seconds=DEFAULT_HEARTBEAT_INTERVAL_SECONDS,
        heartbeat_tolerance_seconds=DEFAULT_DAEMON_HEARTBEAT_TOLERANCE_SECONDS,
        error_interval_seconds=DEFAULT_DAEMON_ERROR_INTERVAL_SECONDS,
        handler="default",
    ):
        """Validate the configuration and start one thread per daemon.

        Args:
            instance (DagsterInstance): Instance whose ref is handed to every daemon thread.
            daemons (List[DagsterDaemon]): Daemons to run, keyed internally by ``daemon_type()``.
            gen_workspace (callable): Validated as callable and passed to every daemon thread.
            heartbeat_interval_seconds (numeric): Validated, stored, and passed to every thread.
            heartbeat_tolerance_seconds (numeric): Validated and stored.
            error_interval_seconds: Passed to every thread as given (not validated here).
            handler (str): Forwarded to ``configure_loggers``.

        Raises:
            Exception: If ``daemons`` is empty.
        """
        self._daemon_uuid = str(uuid.uuid4())

        self._daemons = {}
        self._daemon_threads = {}

        self._instance = check.inst_param(instance, "instance", DagsterInstance)
        checked_daemons = check.list_param(daemons, "daemons", of_type=DagsterDaemon)
        self._daemons = {each.daemon_type(): each for each in checked_daemons}

        self._gen_workspace = check.callable_param(gen_workspace, "gen_workspace")
        self._heartbeat_interval_seconds = check.numeric_param(
            heartbeat_interval_seconds, "heartbeat_interval_seconds"
        )
        self._heartbeat_tolerance_seconds = check.numeric_param(
            heartbeat_tolerance_seconds, "heartbeat_tolerance_seconds"
        )

        if not self._daemons:
            raise Exception("No daemons configured on the DagsterInstance")

        # One event shared by all daemon threads.
        self._daemon_shutdown_event = threading.Event()

        configure_loggers(handler=handler)
        self._logger = logging.getLogger("dagster.daemon")

        # NOTE(review): `self.daemons` is not set in this method; presumably a property
        # defined elsewhere on the class. Confirm.
        daemon_class_names = _sorted_quoted(type(daemon).__name__ for daemon in self.daemons)
        self._logger.info(
            "instance is configured with the following daemons: {}".format(daemon_class_names)
        )

        self._last_healthy_heartbeat_times = {}

        for daemon_type, daemon in self._daemons.items():
            worker = threading.Thread(
                target=daemon.run_daemon_loop,
                args=(
                    self._instance.get_ref(),
                    self._daemon_uuid,
                    self._daemon_shutdown_event,
                    gen_workspace,
                    heartbeat_interval_seconds,
                    error_interval_seconds,
                ),
                name="dagster-daemon-{daemon_type}".format(daemon_type=daemon_type),
                daemon=True,  # Individual daemons should not outlive controller process
            )
            self._daemon_threads[daemon_type] = worker
            # Record a heartbeat time for the daemon just before its thread starts.
            self._last_healthy_heartbeat_times[daemon_type] = time.time()
            worker.start()

        self._start_time = pendulum.now("UTC")
Beispiel #21
0
    def wait_for_pod(
        self,
        pod_name,
        namespace,
        wait_for_state=WaitForPodState.Ready,
        wait_timeout=DEFAULT_WAIT_TIMEOUT,
        wait_time_between_attempts=DEFAULT_WAIT_BETWEEN_ATTEMPTS,
    ):
        """Wait for a pod to launch and be running, or wait for termination (useful for job pods).

        Args:
            pod_name (str): Name of the pod to wait for.
            namespace (str): Namespace in which the pod is located.
            wait_for_state (WaitForPodState, optional): Whether to wait for pod readiness or
                termination. Defaults to waiting for readiness.
            wait_timeout (numeric, optional): Timeout after which to give up and raise exception.
                Defaults to DEFAULT_WAIT_TIMEOUT.
            wait_time_between_attempts (numeric, optional): Wait time between polling attempts. Defaults
                to DEFAULT_WAIT_BETWEEN_ATTEMPTS.

        Raises:
            DagsterK8sError: Raised when wait_timeout is exceeded or an error is encountered
        """
        check.str_param(pod_name, "pod_name")
        check.str_param(namespace, "namespace")
        check.inst_param(wait_for_state, "wait_for_state", WaitForPodState)
        check.numeric_param(wait_timeout, "wait_timeout")
        check.numeric_param(wait_time_between_attempts,
                            "wait_time_between_attempts")

        self.logger('Waiting for pod "%s"' % pod_name)

        start = self.timer()

        while True:

            pods = self.core_api.list_namespaced_pod(
                namespace=namespace,
                field_selector="metadata.name=%s" % pod_name).items
            pod = pods[0] if pods else None

            if self.timer() - start > wait_timeout:
                raise DagsterK8sError(
                    "Timed out while waiting for pod to become ready with pod info: %s"
                    % str(pod))

            if pod is None:
                self.logger('Waiting for pod "%s" to launch...' % pod_name)
                self.sleeper(wait_time_between_attempts)
                continue

            if not pod.status.container_statuses:
                self.logger(
                    "Waiting for pod container status to be set by kubernetes..."
                )
                self.sleeper(wait_time_between_attempts)
                continue

            # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#containerstatus-v1-core
            container_status = pod.status.container_statuses[0]

            # State checks below, see:
            # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#containerstate-v1-core
            state = container_status.state

            if state.running is not None:
                if wait_for_state == WaitForPodState.Ready:
                    # ready is boolean field of container status
                    ready = container_status.ready
                    if not ready:
                        self.logger('Waiting for pod "%s" to become ready...' %
                                    pod_name)
                        self.sleeper(wait_time_between_attempts)
                        continue
                    else:
                        self.logger('Pod "%s" is ready, done waiting' %
                                    pod_name)
                        break
                else:
                    check.invariant(
                        wait_for_state == WaitForPodState.Terminated,
                        "New invalid WaitForPodState")
                    self.sleeper(wait_time_between_attempts)
                    continue

            elif state.waiting is not None:
                # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#containerstatewaiting-v1-core
                if state.waiting.reason == KubernetesWaitingReasons.PodInitializing:
                    self.logger('Waiting for pod "%s" to initialize...' %
                                pod_name)
                    self.sleeper(wait_time_between_attempts)
                    continue
                if state.waiting.reason == KubernetesWaitingReasons.CreateContainerConfigError:
                    self.logger(
                        'Pod "%s" is waiting due to a CreateContainerConfigError with message "%s" - trying again to see if it recovers'
                        % (pod_name, state.waiting.message))
                    self.sleeper(wait_time_between_attempts)
                    continue
                elif state.waiting.reason == KubernetesWaitingReasons.ContainerCreating:
                    self.logger("Waiting for container creation...")
                    self.sleeper(wait_time_between_attempts)
                    continue
                elif state.waiting.reason in [
                        KubernetesWaitingReasons.ErrImagePull,
                        KubernetesWaitingReasons.ImagePullBackOff,
                        KubernetesWaitingReasons.CrashLoopBackOff,
                        KubernetesWaitingReasons.RunContainerError,
                ]:
                    raise DagsterK8sError(
                        'Failed: Reason="{reason}" Message="{message}"'.format(
                            reason=state.waiting.reason,
                            message=state.waiting.message))
                else:
                    raise DagsterK8sError("Unknown issue: %s" % state.waiting)

            # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#containerstateterminated-v1-core
            elif state.terminated is not None:
                if not state.terminated.exit_code == 0:
                    raw_logs = self.retrieve_pod_logs(pod_name, namespace)
                    message = state.terminated.message
                    raise DagsterK8sError(
                        f'Pod did not exit successfully. Failed with message: "{message}" '
                        f'and pod logs: "{raw_logs}"')
                else:
                    self.logger("Pod {pod_name} exitted successfully".format(
                        pod_name=pod_name))
                break

            else:
                raise DagsterK8sError("Should not get here, unknown pod state")
Beispiel #22
0
    def wait_for_job_success(
        self,
        job_name,
        namespace,
        instance=None,
        run_id=None,
        wait_timeout=DEFAULT_WAIT_TIMEOUT,
        wait_time_between_attempts=DEFAULT_WAIT_BETWEEN_ATTEMPTS,
        num_pods_to_wait_for=DEFAULT_JOB_POD_COUNT,
    ):
        """Block until a job reports the expected number of succeeded pods.

        First delegates to ``self.wait_for_job`` and then polls the job status,
        sleeping ``wait_time_between_attempts`` between polls.

        Args:
            job_name (str): Name of the job to wait for.
            namespace (str): Namespace in which the job is located.
            instance (DagsterInstance, optional): When supplied together with ``run_id``,
                the run is looked up on every polling iteration and must still be in
                the STARTED status.
            run_id (str, optional): Id of the run to look up on ``instance``.
            wait_timeout (numeric, optional): Maximum elapsed time, as measured by
                ``self.timer()``, before giving up. Defaults to DEFAULT_WAIT_TIMEOUT.
            wait_time_between_attempts (numeric, optional): Pause between polling attempts;
                also passed as ``timeout`` to ``k8s_api_retry``. Defaults to
                DEFAULT_WAIT_BETWEEN_ATTEMPTS.
            num_pods_to_wait_for (int, optional): Value that ``status.succeeded`` must equal
                for the job to count as done. Defaults to DEFAULT_JOB_POD_COUNT.

        Raises:
            DagsterK8sTimeoutError: ``wait_timeout`` elapsed before the job completed.
            DagsterK8sError: The job status reports one or more failed pods.
            DagsterK8sPipelineStatusException: ``instance`` and ``run_id`` were given and
                the run could not be found or is no longer STARTED.
        """
        check.str_param(job_name, "job_name")
        check.str_param(namespace, "namespace")
        check.opt_inst_param(instance, "instance", DagsterInstance)
        check.opt_str_param(run_id, "run_id")
        check.numeric_param(wait_timeout, "wait_timeout")
        check.numeric_param(wait_time_between_attempts, "wait_time_between_attempts")
        check.int_param(num_pods_to_wait_for, "num_pods_to_wait_for")

        started_at = self.timer()

        # The same start timestamp is handed to wait_for_job and reused for the
        # timeout check in the polling loop below.
        self.wait_for_job(
            job_name,
            namespace,
            wait_timeout=wait_timeout,
            wait_time_between_attempts=wait_time_between_attempts,
            start_time=started_at,
        )

        def _fetch_status():
            # The API call returns the job object; only its status is of interest.
            return self.batch_api.read_namespaced_job_status(
                job_name, namespace=namespace
            ).status

        # Poll the job status once per wait_time_between_attempts until it
        # succeeds, fails, or the timeout is hit.
        while True:
            elapsed = self.timer() - started_at
            if elapsed > wait_timeout:
                raise DagsterK8sTimeoutError(
                    "Timed out while waiting for job {job_name} to complete".format(
                        job_name=job_name
                    )
                )

            job_status = k8s_api_retry(
                _fetch_status, max_retries=3, timeout=wait_time_between_attempts
            )

            # succeeded is the number of pods which reached phase Succeeded.
            if job_status.succeeded == num_pods_to_wait_for:
                return

            # failed is the number of pods which reached phase Failed.
            if job_status.failed and job_status.failed > 0:
                raise DagsterK8sError(
                    "Encountered failed job pods for job {job_name} with status: {status}, "
                    "in namespace {namespace}".format(
                        job_name=job_name, status=job_status, namespace=namespace
                    )
                )

            # Stop waiting as soon as the run has disappeared or left STARTED.
            if instance and run_id:
                run = instance.get_run_by_id(run_id)
                if not run or run.status != PipelineRunStatus.STARTED:
                    raise DagsterK8sPipelineStatusException()

            self.sleeper(wait_time_between_attempts)
Beispiel #23
0
    def wait_for_job_success(
        self,
        job_name,
        namespace,
        instance=None,
        run_id=None,
        wait_timeout=DEFAULT_WAIT_TIMEOUT,
        wait_time_between_attempts=DEFAULT_WAIT_BETWEEN_ATTEMPTS,
        num_pods_to_wait_for=DEFAULT_JOB_POD_COUNT,
    ):
        '''Poll a job for successful completion.

        Waits for the job to appear in the namespace, then polls its status until
        the expected number of pods have succeeded.

        Args:
            job_name (str): Name of the job to wait for.
            namespace (str): Namespace in which the job is located.
            instance (DagsterInstance, optional): When supplied together with run_id, the
                run is looked up on every polling iteration and must still be STARTED.
            run_id (str, optional): Id of the run to look up on instance.
            wait_timeout (numeric, optional): Timeout after which to give up and raise exception.
                Covers both the launch wait and the completion wait. Defaults to
                DEFAULT_WAIT_TIMEOUT.
            wait_time_between_attempts (numeric, optional): Wait time between polling attempts. Defaults
                to DEFAULT_WAIT_BETWEEN_ATTEMPTS.
            num_pods_to_wait_for (int, optional): Value that status.succeeded must equal for the
                job to count as done. Defaults to DEFAULT_JOB_POD_COUNT.

        Raises:
            DagsterK8sError: Raised when wait_timeout is exceeded or the job reports failed pods.
            DagsterK8sPipelineStatusException: Raised when instance and run_id are given and the
                run cannot be found or is no longer in the STARTED status.
        '''
        check.str_param(job_name, 'job_name')
        check.str_param(namespace, 'namespace')
        check.opt_inst_param(instance, 'instance', DagsterInstance)
        check.opt_str_param(run_id, 'run_id')
        check.numeric_param(wait_timeout, 'wait_timeout')
        check.numeric_param(wait_time_between_attempts, 'wait_time_between_attempts')
        check.int_param(num_pods_to_wait_for, 'num_pods_to_wait_for')

        job = None
        start = self.timer()

        # Ensure we found the job that we launched
        while not job:
            if self.timer() - start > wait_timeout:
                raise DagsterK8sError('Timed out while waiting for job to launch')

            jobs = self.batch_api.list_namespaced_job(namespace=namespace)
            job = next((j for j in jobs.items if j.metadata.name == job_name), None)

            if not job:
                self.logger('Job "{job_name}" not yet launched, waiting'.format(job_name=job_name))
                self.sleeper(wait_time_between_attempts)

        # Wait for job completed status. The timer is not reset, so the launch
        # wait above counts against the same wait_timeout.
        while True:
            if self.timer() - start > wait_timeout:
                raise DagsterK8sError('Timed out while waiting for job to complete')

            # See: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.11/#jobstatus-v1-batch
            status = self.batch_api.read_namespaced_job_status(job_name, namespace=namespace).status

            if status.failed and status.failed > 0:
                # Collect the logs of every pod belonging to the job so the error
                # is actionable; a failed log fetch is recorded instead of raised.
                pods = self.core_api.list_namespaced_pod(
                    label_selector='job-name=={}'.format(job_name), namespace=namespace
                )
                logs = {}
                for pod in pods.items:
                    pod_name = pod.metadata.name
                    try:
                        logs[pod_name] = self.core_api.read_namespaced_pod_log(
                            name=pod_name, namespace=namespace
                        )
                    except kubernetes.client.rest.ApiException as e:
                        logs[pod_name] = e

                raise DagsterK8sError(
                    'Encountered failed job pods with status: {}, and logs: {}'.format(status, logs)
                )

            # done waiting for pod completion
            if status.succeeded == num_pods_to_wait_for:
                break

            if instance and run_id:
                pipeline_run = instance.get_run_by_id(run_id)
                # Guard against a missing run so the status exception is raised
                # rather than an AttributeError from dereferencing None.
                if not pipeline_run or pipeline_run.status != PipelineRunStatus.STARTED:
                    raise DagsterK8sPipelineStatusException()

            self.sleeper(wait_time_between_attempts)
Beispiel #24
0
    def wait_for_job_success(
        self,
        job_name,
        namespace,
        instance=None,
        run_id=None,
        wait_timeout=DEFAULT_WAIT_TIMEOUT,
        wait_time_between_attempts=DEFAULT_WAIT_BETWEEN_ATTEMPTS,
        num_pods_to_wait_for=DEFAULT_JOB_POD_COUNT,
    ):
        """Block until a job has launched and reports the expected number of succeeded pods.

        Runs two polling phases that share a single start time: first wait for the
        job to show up in the namespace, then wait for its status to report success.

        Args:
            job_name (str): Name of the job to wait for.
            namespace (str): Namespace in which the job is located.
            instance (DagsterInstance, optional): When supplied together with ``run_id``,
                the run is looked up on every completion-polling iteration and must
                still be in the STARTED status.
            run_id (str, optional): Id of the run to look up on ``instance``.
            wait_timeout (numeric, optional): Maximum elapsed time, as measured by
                ``self.timer()``, across both phases. Defaults to DEFAULT_WAIT_TIMEOUT.
            wait_time_between_attempts (numeric, optional): Pause between polling attempts.
                Defaults to DEFAULT_WAIT_BETWEEN_ATTEMPTS.
            num_pods_to_wait_for (int, optional): Value that ``status.succeeded`` must equal
                for the job to count as done. Defaults to DEFAULT_JOB_POD_COUNT.

        Raises:
            DagsterK8sTimeoutError: ``wait_timeout`` elapsed before the job launched or
                completed.
            DagsterK8sError: The job status reports one or more failed pods.
            DagsterK8sPipelineStatusException: ``instance`` and ``run_id`` were given and
                the run could not be found or is no longer STARTED.
        """
        check.str_param(job_name, "job_name")
        check.str_param(namespace, "namespace")
        check.opt_inst_param(instance, "instance", DagsterInstance)
        check.opt_str_param(run_id, "run_id")
        check.numeric_param(wait_timeout, "wait_timeout")
        check.numeric_param(wait_time_between_attempts, "wait_time_between_attempts")
        check.int_param(num_pods_to_wait_for, "num_pods_to_wait_for")

        started_at = self.timer()

        def _find_job():
            # Ask the API for jobs in the namespace filtered down to our name.
            listing = self.batch_api.list_namespaced_job(
                namespace=namespace,
                field_selector="metadata.name={}".format(job_name),
            )
            if not listing.items:
                return None
            check.invariant(
                len(listing.items) == 1,
                'There should only be one k8s job with name "{}", but got multiple jobs:" {}'.format(
                    job_name, listing.items
                ),
            )
            return listing.items[0]

        def _fetch_status():
            # The API call returns the job object; only its status is of interest.
            return self.batch_api.read_namespaced_job_status(
                job_name, namespace=namespace
            ).status

        # Phase 1: wait for the job to launch.
        while True:
            if self.timer() - started_at > wait_timeout:
                raise DagsterK8sTimeoutError(
                    "Timed out while waiting for job {job_name} to launch".format(
                        job_name=job_name
                    )
                )

            if k8s_api_retry(_find_job, max_retries=3):
                break

            self.logger(
                'Job "{job_name}" not yet launched, waiting'.format(job_name=job_name)
            )
            self.sleeper(wait_time_between_attempts)

        # Phase 2: poll the job status once per wait_time_between_attempts until
        # it succeeds, fails, or the shared timeout is hit.
        while True:
            if self.timer() - started_at > wait_timeout:
                raise DagsterK8sTimeoutError(
                    "Timed out while waiting for job {job_name} to complete".format(
                        job_name=job_name
                    )
                )

            job_status = k8s_api_retry(_fetch_status, max_retries=3)

            # succeeded is the number of pods which reached phase Succeeded.
            if job_status.succeeded == num_pods_to_wait_for:
                return

            # failed is the number of pods which reached phase Failed.
            if job_status.failed and job_status.failed > 0:
                raise DagsterK8sError(
                    "Encountered failed job pods for job {job_name} with status: {status}".format(
                        job_name=job_name, status=job_status
                    )
                )

            # Stop waiting as soon as the run has disappeared or left STARTED.
            if instance and run_id:
                run = instance.get_run_by_id(run_id)
                if not run or run.status != PipelineRunStatus.STARTED:
                    raise DagsterK8sPipelineStatusException()

            self.sleeper(wait_time_between_attempts)