Example no. 1
0
    def wait(self, stdout_location, stderr_location, echo=None):
        """Wait for the AWS Batch job to finish while tailing its logs.

        Tails the job's stdout/stderr from S3 and emits each new line via
        *echo* until the job stops running, then drains any remaining lines.
        Raises ``BatchException`` with a human-readable explanation of the
        exit code if the job crashed.

        Parameters
        ----------
        stdout_location : str
            S3 location of the task's stdout log.
        stderr_location : str
            S3 location of the task's stderr log.
        echo : callable
            Callback used to emit log and status lines.
        """
        def wait_for_launch(job):
            # Block until the job leaves its starting states, echoing a
            # status line on every status change (or every 30s heartbeat).
            status = job.status
            echo('Task is starting (status %s)...' % status,
                 'stderr',
                 batch_id=job.id)
            t = time.time()
            while True:
                if status != job.status or (time.time() - t) > 30:
                    status = job.status
                    echo('Task is starting (status %s)...' % status,
                         'stderr',
                         batch_id=job.id)
                    t = time.time()
                if job.is_running or job.is_done or job.is_crashed:
                    break
                select.poll().poll(200)

        prefix = b'[%s] ' % util.to_bytes(self.job.id)

        def _print_available(tail, stream, should_persist=False):
            # print the latest batch of lines from S3Tail
            try:
                for line in tail:
                    if should_persist:
                        line = set_should_persist(line)
                    else:
                        line = refine(line, prefix=prefix)
                    echo(line.strip().decode('utf-8', errors='replace'),
                         stream)
            except Exception as ex:
                # Log fetching is best-effort: a transient S3 error should
                # not abort the wait loop.
                echo('[ temporary error in fetching logs: %s ]' % ex,
                     'stderr',
                     batch_id=self.job.id)

        stdout_tail = S3Tail(stdout_location)
        stderr_tail = S3Tail(stderr_location)

        # 1) Loop until the job has started
        wait_for_launch(self.job)

        # 2) Loop until the job has finished
        start_time = time.time()
        is_running = True
        next_log_update = start_time
        log_update_delay = 1

        while is_running:
            if time.time() > next_log_update:
                _print_available(stdout_tail, 'stdout')
                _print_available(stderr_tail, 'stderr')
                now = time.time()
                log_update_delay = update_delay(now - start_time)
                next_log_update = now + log_update_delay
                is_running = self.job.is_running

            # This sleep should never delay log updates. On the other hand,
            # we should exit this loop when the task has finished without
            # a long delay, regardless of the log tailing schedule
            d = min(log_update_delay, 5.0)
            select.poll().poll(d * 1000)

        # 3) Fetch remaining logs
        #
        # It is possible that we exit the loop above before all logs have been
        # shown.
        #
        # TODO if we notice AWS Batch failing to upload logs to S3, we can add a
        # HEAD request here to ensure that the file exists prior to calling
        # S3Tail and note the user about truncated logs if it doesn't
        _print_available(stdout_tail, 'stdout')
        _print_available(stderr_tail, 'stderr')
        # In case of hard crashes (OOM), the final save_logs won't happen.
        # We fetch the remaining logs from AWS CloudWatch and persist them to
        # Amazon S3.
        #
        # TODO: AWS CloudWatch fetch logs

        if self.job.is_crashed:
            # Map well-known shell exit codes to a human-readable message.
            # BUG FIX: the previous chain of independent `if`s left `msg`
            # unbound for uncovered codes (e.g. 3-125, 129), so the raise
            # below died with NameError instead of BatchException. An elif
            # chain with a fallback keeps `msg` always defined, and the
            # no-op `next(... if msg is not None)` wrappers around constant
            # strings are dropped.
            status_code = self.job.status_code
            if status_code == 1:
                msg = ('Task failed with a general error. You may have a miscellaneous error(s), '
                       'such as "divide by zero" and other impermissible operations.')
            elif status_code == 2:
                msg = ('Misuse of shell builtins. You may have a missing keyword or command, '
                       'or permission problem (and diff return code on a failed binary file '
                       'comparison).')
            elif status_code == 126:
                msg = ('Command invoked cannot execute. May be a permission problem or command is '
                       'not an executable.')
            elif status_code == 127:
                msg = 'Command not Found. May be a possible problem with $PATH or a type.'
            elif status_code == 128:
                msg = 'Invalid argument to exit.'
            elif status_code > 128:
                # 128 + N conventionally means "killed by signal N"; the old
                # check used `> 129`, which silently skipped 129 (SIGHUP).
                msg = 'Signal related error'
            else:
                msg = 'Task crashed.'
            raise BatchException('%s '
                                 'This could be a transient error. '
                                 'Use @retry to retry.' % msg)
        else:
            if self.job.is_running:
                # Kill the job if it is still running by throwing an exception.
                raise BatchException("Task failed!")
            echo('Task finished with exit code %s.' % self.job.status_code,
                 'stderr',
                 batch_id=self.job.id)
Example no. 2
0
    def wait(self, stdout_location, stderr_location, echo=None):
        """Wait for the AWS Batch job to complete while tailing its logs.

        Streams the task's stdout/stderr from S3 through *echo* until the
        job stops running, then drains any lines that arrived late.
        Raises ``BatchException`` if the job crashed or is still running
        when tailing ends.

        Parameters
        ----------
        stdout_location : str
            S3 location of the task's stdout log.
        stderr_location : str
            S3 location of the task's stderr log.
        echo : callable
            Callback used to emit log and status lines.
        """

        def wait_for_launch(job):
            # Poll until the job leaves its queued/starting states, echoing
            # a status line on every change (or every 30s as a heartbeat).
            last_status = job.status
            echo(
                "Task is starting (status %s)..." % last_status,
                "stderr",
                batch_id=job.id,
            )
            last_echo = time.time()
            while True:
                current = job.status
                if current != last_status or (time.time() - last_echo) > 30:
                    last_status = current
                    echo(
                        "Task is starting (status %s)..." % last_status,
                        "stderr",
                        batch_id=job.id,
                    )
                    last_echo = time.time()
                if job.is_running or job.is_done or job.is_crashed:
                    break
                select.poll().poll(200)

        line_prefix = b"[%s] " % util.to_bytes(self.job.id)

        def _flush_new_lines(tail, stream, should_persist=False):
            # Emit whatever fresh lines S3Tail has buffered. Errors while
            # fetching are reported but never abort the wait loop.
            try:
                for raw in tail:
                    cooked = (
                        set_should_persist(raw)
                        if should_persist
                        else refine(raw, prefix=line_prefix)
                    )
                    echo(cooked.strip().decode("utf-8", errors="replace"),
                         stream)
            except Exception as ex:
                echo(
                    "[ temporary error in fetching logs: %s ]" % ex,
                    "stderr",
                    batch_id=self.job.id,
                )

        stdout_tail = S3Tail(stdout_location)
        stderr_tail = S3Tail(stderr_location)

        # Phase 1: wait for the job to actually launch.
        wait_for_launch(self.job)

        # Phase 2: tail logs until the job stops running.
        started = time.time()
        running = True
        next_flush = started
        flush_delay = 1

        while running:
            if time.time() > next_flush:
                _flush_new_lines(stdout_tail, "stdout")
                _flush_new_lines(stderr_tail, "stderr")
                ts = time.time()
                flush_delay = update_delay(ts - started)
                next_flush = ts + flush_delay
                running = self.job.is_running

            # Cap the sleep so job completion is noticed promptly even when
            # the log-tailing schedule has backed off; the sleep never
            # delays a due log flush.
            select.poll().poll(min(flush_delay, 5.0) * 1000)

        # Phase 3: drain logs that arrived after the loop exited.
        #
        # It is possible to leave the loop above before every line has been
        # shown.
        #
        # TODO if we notice AWS Batch failing to upload logs to S3, we can add
        # a HEAD request here to ensure that the file exists prior to calling
        # S3Tail and note the user about truncated logs if it doesn't
        _flush_new_lines(stdout_tail, "stdout")
        _flush_new_lines(stderr_tail, "stderr")
        # After a hard crash (e.g. OOM) the in-task final save_logs never
        # runs; recovering those lines would require fetching from AWS
        # CloudWatch and persisting them to Amazon S3.

        if self.job.is_crashed:
            # Prefer the most specific non-None explanation available.
            candidates = [
                self.job.reason,
                self.job.status_reason,
                "Task crashed.",
            ]
            msg = next(c for c in candidates if c is not None)
            raise BatchException(
                "%s "
                "This could be a transient error. "
                "Use @retry to retry." % msg
            )

        if self.job.is_running:
            # Kill the job if it is still running by throwing an exception.
            raise BatchException("Task failed!")
        echo(
            "Task finished with exit code %s." % self.job.status_code,
            "stderr",
            batch_id=self.job.id,
        )
Example no. 3
0
    def wait(self, stdout_location, stderr_location, echo=None):
        """Wait for the Kubernetes job to finish while tailing its logs.

        Tails the task's stdout/stderr from S3 and emits each new line via
        *echo* until the job stops running, then drains any remaining lines.
        Raises ``KubernetesException`` if the job failed.

        Parameters
        ----------
        stdout_location : str
            S3 location of the task's stdout log.
        stderr_location : str
            S3 location of the task's stderr log.
        echo : callable
            Callback used to emit log and status lines.
        """
        def wait_for_launch(job):
            # Block until the job is running or done, echoing a status line
            # on every status change (or every 30s as a heartbeat).
            status = job.status
            echo(
                "Task is starting (Status %s)..." % status,
                "stderr",
                job_id=job.id,
            )
            t = time.time()
            while True:
                new_status = job.status
                if status != new_status or (time.time() - t) > 30:
                    status = new_status
                    echo(
                        "Task is starting (Status %s)..." % status,
                        "stderr",
                        job_id=job.id,
                    )
                    t = time.time()
                if job.is_running or job.is_done:
                    break
                time.sleep(1)

        def _print_available(tail, stream, should_persist=False):
            # print the latest batch of lines from S3Tail
            prefix = b"[%s] " % util.to_bytes(self._job.id)
            try:
                for line in tail:
                    if should_persist:
                        line = set_should_persist(line)
                    else:
                        line = refine(line, prefix=prefix)
                    echo(line.strip().decode("utf-8", errors="replace"), stream)
            except Exception as ex:
                # Fetching logs is best-effort; a transient error is reported
                # but does not abort the wait loop.
                echo(
                    "[ temporary error in fetching logs: %s ]" % ex,
                    "stderr",
                    job_id=self._job.id,
                )

        stdout_tail = S3Tail(stdout_location)
        stderr_tail = S3Tail(stderr_location)

        # 1) Loop until the job has started
        wait_for_launch(self._job)

        # 2) Loop until the job has finished
        start_time = time.time()
        is_running = True
        next_log_update = start_time
        log_update_delay = 1

        while is_running:
            if time.time() > next_log_update:
                _print_available(stdout_tail, "stdout")
                _print_available(stderr_tail, "stderr")
                now = time.time()
                # update_delay presumably backs the tailing interval off as
                # the task runs longer — TODO confirm against its definition.
                log_update_delay = update_delay(now - start_time)
                next_log_update = now + log_update_delay
                is_running = self._job.is_running

            # This sleep should never delay log updates. On the other hand,
            # we should exit this loop when the task has finished without
            # a long delay, regardless of the log tailing schedule
            time.sleep(min(log_update_delay, 5.0))

        # 3) Fetch remaining logs
        #
        # It is possible that we exit the loop above before all logs have been
        # shown.
        #
        # TODO (savin): If we notice Kubernetes failing to upload logs to S3,
        #               we can add a HEAD request here to ensure that the file
        #               exists prior to calling S3Tail and note the user about
        #               truncated logs if it doesn't.
        # TODO (savin): For hard crashes, we can fetch logs from the pod.
        _print_available(stdout_tail, "stdout")
        _print_available(stderr_tail, "stderr")

        if self._job.has_failed:
            # NOTE: in this variant self._job.reason is a (exit_code, reason)
            # pair, unlike the AWS Batch variants where it is a string.
            exit_code, reason = self._job.reason
            msg = next(
                msg
                for msg in [
                    reason,
                    "Task crashed",
                ]
                if msg is not None
            )
            if exit_code:
                # 139 == 128 + SIGSEGV, i.e. the container segfaulted.
                if int(exit_code) == 139:
                    raise KubernetesException("Task failed with a segmentation fault.")
                else:
                    msg = "%s (exit code %s)" % (msg, exit_code)
            raise KubernetesException(
                "%s. This could be a transient error. " "Use @retry to retry." % msg
            )

        exit_code, _ = self._job.reason
        echo(
            "Task finished with exit code %s." % exit_code,
            "stderr",
            job_id=self._job.id,
        )