Code example #1
File: batch.py  Project: sappier/metaflow
    def wait(self, stdout_location, stderr_location, echo=None):
        def wait_for_launch(job):
            status = job.status
            echo(
                "Task is starting (status %s)..." % status,
                "stderr",
                batch_id=job.id,
            )
            t = time.time()
            while True:
                if status != job.status or (time.time() - t) > 30:
                    status = job.status
                    echo(
                        "Task is starting (status %s)..." % status,
                        "stderr",
                        batch_id=job.id,
                    )
                    t = time.time()
                if job.is_running or job.is_done or job.is_crashed:
                    break
                select.poll().poll(200)

        prefix = b"[%s] " % util.to_bytes(self.job.id)
        stdout_tail = S3Tail(stdout_location)
        stderr_tail = S3Tail(stderr_location)

        # 1) Loop until the job has started
        wait_for_launch(self.job)

        # 2) Tail logs until the job has finished
        tail_logs(
            prefix=prefix,
            stdout_tail=stdout_tail,
            stderr_tail=stderr_tail,
            echo=echo,
            has_log_updates=lambda: self.job.is_running,
        )

        # In case of hard crashes (OOM), the final save_logs won't happen.
        # We can fetch the remaining logs from AWS CloudWatch and persist them
        # to Amazon S3.

        if self.job.is_crashed:
            msg = next(msg for msg in [
                self.job.reason,
                self.job.status_reason,
                "Task crashed.",
            ] if msg is not None)
            raise BatchException("%s "
                                 "This could be a transient error. "
                                 "Use @retry to retry." % msg)
        else:
            if self.job.is_running:
                # Kill the job if it is still running by throwing an exception.
                raise BatchException("Task failed!")
            echo(
                "Task finished with exit code %s." % self.job.status_code,
                "stderr",
                batch_id=self.job.id,
            )
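
A note on the launch loop above: select.poll().poll(200) is used as a short pause between status checks. A poll object with no registered file descriptors simply blocks for the given timeout in milliseconds, so the call behaves like a ~200 ms sleep. A minimal standalone sketch of that behaviour (POSIX only, since select.poll is not available on Windows; the timing printout is just illustrative):

import select
import time

start = time.time()
# No file descriptors are registered, so poll() just waits out the timeout.
select.poll().poll(200)
print("waited %.3f s" % (time.time() - start))
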
Code example #2
    def wait(self, stdout_location, stderr_location, echo=None):
        def update_delay(secs_since_start):
            # this sigmoid function reaches
            # - 0.1 after 11 minutes
            # - 0.5 after 15 minutes
            # - 1.0 after 23 minutes
            # in other words, the user will see very frequent updates
            # during the first 10 minutes
            sigmoid = 1.0 / (1.0 + math.exp(-0.01 * secs_since_start + 9.0))
            return 0.5 + sigmoid * 30.0

        def wait_for_launch(job):
            status = job.status
            echo(
                "Task is starting (%s)..." % status,
                "stderr",
                job_id=job.id,
            )
            t = time.time()
            start_time = time.time()
            while job.is_waiting:
                new_status = job.status
                if status != new_status or (time.time() - t) > 30:
                    status = new_status
                    echo(
                        "Task is starting (%s)..." % status,
                        "stderr",
                        job_id=job.id,
                    )
                    t = time.time()
                time.sleep(update_delay(time.time() - start_time))

        prefix = b"[%s] " % util.to_bytes(self._job.id)
        stdout_tail = S3Tail(stdout_location)
        stderr_tail = S3Tail(stderr_location)

        # 1) Loop until the job has started
        wait_for_launch(self._job)

        # 2) Tail logs until the job has finished
        tail_logs(
            prefix=prefix,
            stdout_tail=stdout_tail,
            stderr_tail=stderr_tail,
            echo=echo,
            has_log_updates=lambda: self._job.is_running,
        )

        # 3) Fetch remaining logs
        #
        # It is possible that we exit the loop above before all logs have been
        # shown.
        #
        # TODO : If we notice Kubernetes failing to upload logs to S3,
        #        we can add a HEAD request here to ensure that the file
        #        exists prior to calling S3Tail and notify the user about
        #        truncated logs if it doesn't.
        # TODO : For hard crashes, we can fetch logs from the pod.

        if self._job.has_failed:
            exit_code, reason = self._job.reason
            msg = next(msg for msg in [
                reason,
                "Task crashed",
            ] if msg is not None)
            if exit_code:
                if int(exit_code) == 139:
                    raise KubernetesException(
                        "Task failed with a segmentation fault.")
                if int(exit_code) == 137:
                    raise KubernetesException(
                        "Task ran out of memory. "
                        "Increase the available memory by specifying "
                        "@resource(memory=...) for the step. ")
                else:
                    msg = "%s (exit code %s)" % (msg, exit_code)
            raise KubernetesException(
                "%s. This could be a transient error. Use @retry to retry." %
                msg)

        exit_code, _ = self._job.reason
        echo(
            "Task finished with exit code %s." % exit_code,
            "stderr",
            job_id=self._job.id,
        )
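
For reference, the update_delay helper above ramps the polling interval from about half a second right after launch to roughly 30 seconds after some 20 minutes. A quick standalone check of the values (the function body is copied from the example; the loop is only illustrative):

import math

def update_delay(secs_since_start):
    # same sigmoid as in the example above
    sigmoid = 1.0 / (1.0 + math.exp(-0.01 * secs_since_start + 9.0))
    return 0.5 + sigmoid * 30.0

for minutes in (1, 5, 11, 15, 23, 30):
    print("after %2d min: poll every %4.1f s" % (minutes, update_delay(minutes * 60)))
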
Code example #3
File: batch.py  Project: cclauss/metaflow
    def wait(self, stdout_location, stderr_location, echo=None):
        def wait_for_launch(job):
            status = job.status
            echo(
                "Task is starting (status %s)..." % status,
                "stderr",
                batch_id=job.id,
            )
            t = time.time()
            while True:
                if status != job.status or (time.time() - t) > 30:
                    status = job.status
                    echo(
                        "Task is starting (status %s)..." % status,
                        "stderr",
                        batch_id=job.id,
                    )
                    t = time.time()
                if job.is_running or job.is_done or job.is_crashed:
                    break
                select.poll().poll(200)

        prefix = b"[%s] " % util.to_bytes(self.job.id)

        def _print_available(tail, stream, should_persist=False):
            # print the latest batch of lines from S3Tail
            try:
                for line in tail:
                    if should_persist:
                        line = set_should_persist(line)
                    else:
                        line = refine(line, prefix=prefix)
                    echo(line.strip().decode("utf-8", errors="replace"),
                         stream)
            except Exception as ex:
                echo(
                    "[ temporary error in fetching logs: %s ]" % ex,
                    "stderr",
                    batch_id=self.job.id,
                )

        stdout_tail = S3Tail(stdout_location)
        stderr_tail = S3Tail(stderr_location)

        # 1) Loop until the job has started
        wait_for_launch(self.job)

        # 2) Loop until the job has finished
        start_time = time.time()
        is_running = True
        next_log_update = start_time
        log_update_delay = 1

        while is_running:
            if time.time() > next_log_update:
                _print_available(stdout_tail, "stdout")
                _print_available(stderr_tail, "stderr")
                now = time.time()
                log_update_delay = update_delay(now - start_time)
                next_log_update = now + log_update_delay
                is_running = self.job.is_running

            # This sleep should never delay log updates. On the other hand,
            # we should exit this loop when the task has finished without
            # a long delay, regardless of the log tailing schedule
            d = min(log_update_delay, 5.0)
            select.poll().poll(d * 1000)

        # 3) Fetch remaining logs
        #
        # It is possible that we exit the loop above before all logs have been
        # shown.
        #
        # TODO if we notice AWS Batch failing to upload logs to S3, we can add a
        # HEAD request here to ensure that the file exists prior to calling
        # S3Tail and notify the user about truncated logs if it doesn't
        _print_available(stdout_tail, "stdout")
        _print_available(stderr_tail, "stderr")
        # In case of hard crashes (OOM), the final save_logs won't happen.
        # We fetch the remaining logs from AWS CloudWatch and persist them to
        # Amazon S3.

        if self.job.is_crashed:
            msg = next(msg for msg in [
                self.job.reason,
                self.job.status_reason,
                "Task crashed.",
            ] if msg is not None)
            raise BatchException("%s "
                                 "This could be a transient error. "
                                 "Use @retry to retry." % msg)
        else:
            if self.job.is_running:
                # Kill the job if it is still running by throwing an exception.
                raise BatchException("Task failed!")
            echo(
                "Task finished with exit code %s." % self.job.status_code,
                "stderr",
                batch_id=self.job.id,
            )
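
The tailing loop in this example runs on two schedules: log pulls happen on the growing update_delay interval, while the sleep itself is capped at 5 seconds so that job completion is noticed promptly. A minimal restatement of that pattern, with fetch_logs, still_running and update_delay as hypothetical stand-ins for _print_available(...), self.job.is_running and the sigmoid helper:

import time

def tail_until_done(fetch_logs, still_running, update_delay):
    start = time.time()
    next_update = start
    delay = 1
    running = True
    while running:
        if time.time() > next_update:
            fetch_logs()
            now = time.time()
            delay = update_delay(now - start)
            next_update = now + delay
            running = still_running()
        # Sleep in short slices so the loop exits soon after the job finishes,
        # even once the log-update delay has grown to ~30 seconds.
        time.sleep(min(delay, 5.0))
    # One final pull for anything written after the last scheduled update.
    fetch_logs()
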
Code example #4
File: batch.py  Project: parampavar/metaflow
    def wait(self, stdout_location, stderr_location, echo=None):
        def wait_for_launch(job, child_jobs):
            status = job.status
            echo(
                "Task is starting (status %s)..." % status,
                "stderr",
                batch_id=job.id,
            )
            t = time.time()
            while True:
                if status != job.status or (time.time() - t) > 30:
                    if not child_jobs:
                        child_statuses = ""
                    else:
                        status_keys = set(
                            [child_job.status for child_job in child_jobs])
                        status_counts = [(
                            status,
                            len([
                                child_job
                                for child_job in child_jobs
                                if child_job.status == status
                            ]),
                        ) for status in status_keys]
                        child_statuses = " (parallel node status: [{}])".format(
                            ", ".join([
                                "{}:{}".format(status, num)
                                for (status, num) in sorted(status_counts)
                            ]))
                    status = job.status
                    echo(
                        "Task is starting (status %s)... %s" %
                        (status, child_statuses),
                        "stderr",
                        batch_id=job.id,
                    )
                    t = time.time()
                if job.is_running or job.is_done or job.is_crashed:
                    break
                select.poll().poll(200)

        prefix = b"[%s] " % util.to_bytes(self.job.id)
        stdout_tail = S3Tail(stdout_location)
        stderr_tail = S3Tail(stderr_location)

        child_jobs = []
        if self.num_parallel > 1:
            for node in range(1, self.num_parallel):
                child_job = copy.copy(self.job)
                child_job._id = child_job._id + "#{}".format(node)
                child_jobs.append(child_job)

        # 1) Loop until the job has started
        wait_for_launch(self.job, child_jobs)

        # 2) Tail logs until the job has finished
        tail_logs(
            prefix=prefix,
            stdout_tail=stdout_tail,
            stderr_tail=stderr_tail,
            echo=echo,
            has_log_updates=lambda: self.job.is_running,
        )

        # In case of hard crashes (OOM), the final save_logs won't happen.
        # We can fetch the remaining logs from AWS CloudWatch and persist them
        # to Amazon S3.

        if self.job.is_crashed:
            msg = next(msg for msg in [
                self.job.reason,
                self.job.status_reason,
                "Task crashed.",
            ] if msg is not None)
            raise BatchException("%s "
                                 "This could be a transient error. "
                                 "Use @retry to retry." % msg)
        else:
            if self.job.is_running:
                # Kill the job if it is still running by throwing an exception.
                raise BatchException("Task failed!")
            echo(
                "Task finished with exit code %s." % self.job.status_code,
                "stderr",
                batch_id=self.job.id,
            )
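
The per-status counting for the parallel child jobs above can also be expressed with collections.Counter. The sketch below is an illustrative alternative, not code from the project; it assumes objects exposing a .status attribute like the Batch job objects in the example:

from collections import Counter

def summarize_child_statuses(child_jobs):
    # Count how many parallel nodes are in each status, e.g. "RUNNING:3, STARTING:1".
    status_counts = Counter(child_job.status for child_job in child_jobs)
    return " (parallel node status: [{}])".format(
        ", ".join(
            "{}:{}".format(status, num)
            for status, num in sorted(status_counts.items())
        )
    )
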
Code example #5
File: kubernetes.py  Project: sappier/metaflow
    def wait(self, stdout_location, stderr_location, echo=None):
        def wait_for_launch(job):
            status = job.status
            echo(
                "Task is starting (Status %s)..." % status,
                "stderr",
                job_id=job.id,
            )
            t = time.time()
            while True:
                new_status = job.status
                if status != new_status or (time.time() - t) > 30:
                    status = new_status
                    echo(
                        "Task is starting (Status %s)..." % status,
                        "stderr",
                        job_id=job.id,
                    )
                    t = time.time()
                if job.is_running or job.is_done:
                    break
                time.sleep(1)

        prefix = b"[%s] " % util.to_bytes(self._job.id)
        stdout_tail = S3Tail(stdout_location)
        stderr_tail = S3Tail(stderr_location)

        # 1) Loop until the job has started
        wait_for_launch(self._job)

        # 2) Tail logs until the job has finished
        tail_logs(
            prefix=prefix,
            stdout_tail=stdout_tail,
            stderr_tail=stderr_tail,
            echo=echo,
            has_log_updates=lambda: self._job.is_running,
        )

        # 3) Fetch remaining logs
        #
        # It is possible that we exit the loop above before all logs have been
        # shown.
        #
        # TODO (savin): If we notice Kubernetes failing to upload logs to S3,
        #               we can add a HEAD request here to ensure that the file
        #               exists prior to calling S3Tail and notify the user about
        #               truncated logs if it doesn't.
        # TODO (savin): For hard crashes, we can fetch logs from the pod.

        if self._job.has_failed:
            exit_code, reason = self._job.reason
            msg = next(msg for msg in [
                reason,
                "Task crashed",
            ] if msg is not None)
            if exit_code:
                if int(exit_code) == 139:
                    raise KubernetesException(
                        "Task failed with a segmentation fault.")
                else:
                    msg = "%s (exit code %s)" % (msg, exit_code)
            raise KubernetesException("%s. This could be a transient error. "
                                      "Use @retry to retry." % msg)

        exit_code, _ = self._job.reason
        echo(
            "Task finished with exit code %s." % exit_code,
            "stderr",
            job_id=self._job.id,
        )
Code example #6
    def wait(self, stdout_location, stderr_location, echo=None):
        def wait_for_launch(job):
            status = job.status
            echo(
                "Task is starting (Status %s)..." % status,
                "stderr",
                job_id=job.id,
            )
            t = time.time()
            while True:
                new_status = job.status
                if status != new_status or (time.time() - t) > 30:
                    status = new_status
                    echo(
                        "Task is starting (Status %s)..." % status,
                        "stderr",
                        job_id=job.id,
                    )
                    t = time.time()
                if job.is_running or job.is_done:
                    break
                time.sleep(1)

        def _print_available(tail, stream, should_persist=False):
            # print the latest batch of lines from S3Tail
            prefix = b"[%s] " % util.to_bytes(self._job.id)
            try:
                for line in tail:
                    if should_persist:
                        line = set_should_persist(line)
                    else:
                        line = refine(line, prefix=prefix)
                    echo(line.strip().decode("utf-8", errors="replace"), stream)
            except Exception as ex:
                echo(
                    "[ temporary error in fetching logs: %s ]" % ex,
                    "stderr",
                    job_id=self._job.id,
                )

        stdout_tail = S3Tail(stdout_location)
        stderr_tail = S3Tail(stderr_location)

        # 1) Loop until the job has started
        wait_for_launch(self._job)

        # 2) Loop until the job has finished
        start_time = time.time()
        is_running = True
        next_log_update = start_time
        log_update_delay = 1

        while is_running:
            if time.time() > next_log_update:
                _print_available(stdout_tail, "stdout")
                _print_available(stderr_tail, "stderr")
                now = time.time()
                log_update_delay = update_delay(now - start_time)
                next_log_update = now + log_update_delay
                is_running = self._job.is_running

            # This sleep should never delay log updates. On the other hand,
            # we should exit this loop when the task has finished without
            # a long delay, regardless of the log tailing schedule
            time.sleep(min(log_update_delay, 5.0))

        # 3) Fetch remaining logs
        #
        # It is possible that we exit the loop above before all logs have been
        # shown.
        #
        # TODO (savin): If we notice Kubernetes failing to upload logs to S3,
        #               we can add a HEAD request here to ensure that the file
        #               exists prior to calling S3Tail and notify the user about
        #               truncated logs if it doesn't.
        # TODO (savin): For hard crashes, we can fetch logs from the pod.
        _print_available(stdout_tail, "stdout")
        _print_available(stderr_tail, "stderr")

        if self._job.has_failed:
            exit_code, reason = self._job.reason
            msg = next(
                msg
                for msg in [
                    reason,
                    "Task crashed",
                ]
                if msg is not None
            )
            if exit_code:
                if int(exit_code) == 139:
                    raise KubernetesException("Task failed with a segmentation fault.")
                else:
                    msg = "%s (exit code %s)" % (msg, exit_code)
            raise KubernetesException(
                "%s. This could be a transient error. " "Use @retry to retry." % msg
            )

        exit_code, _ = self._job.reason
        echo(
            "Task finished with exit code %s." % exit_code,
            "stderr",
            job_id=self._job.id,
        )
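
The exit codes singled out in the Kubernetes examples follow the usual container convention of 128 plus the terminating signal number: 139 is 128 + SIGSEGV (11) and 137 is 128 + SIGKILL (9), which in practice usually means the kernel OOM killer. A small illustrative helper along those lines (not part of the examples above):

import signal

def describe_exit_code(exit_code):
    # Container runtimes report 128 + <signal number> when a process is
    # killed by a signal, e.g. 139 = 128 + SIGSEGV, 137 = 128 + SIGKILL.
    exit_code = int(exit_code)
    if exit_code > 128:
        try:
            name = signal.Signals(exit_code - 128).name
        except ValueError:
            name = "unknown signal %d" % (exit_code - 128)
        return "terminated by %s" % name
    return "exited with code %d" % exit_code

print(describe_exit_code(139))  # terminated by SIGSEGV
print(describe_exit_code(137))  # terminated by SIGKILL (often the OOM killer)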