Ejemplo n.º 1
0
    def _transition_from_PREPARING(self, run_state):
        """
        1- Request the docker image from docker image manager
            - if image is failed, move to CLEANING_UP state
        2- Request the dependencies from dependency manager
            - if any are failed, move to CLEANING_UP state
        3- If all dependencies and docker image are ready:
            - Set up the local filesystem for the run
            - Create symlinks to dependencies
            - Allocate resources and prepare the docker container
            - Start the docker container
        4- If all is successful, move to RUNNING state
        """
        def mount_dependency(dependency, shared_file_system):
            if not shared_file_system:
                # Set up symlinks for the content at dependency path
                Path(dependency.child_path).parent.mkdir(parents=True,
                                                         exist_ok=True)
                os.symlink(dependency.docker_path, dependency.child_path)
            # The following will be converted into a Docker volume binding like:
            #   dependency_path:docker_dependency_path:ro
            docker_dependencies.append(
                (dependency.parent_path, dependency.docker_path))

        if run_state.is_killed or run_state.is_restaged:
            log_bundle_transition(
                bundle_uuid=run_state.bundle.uuid,
                previous_stage=run_state.stage,
                next_stage=RunStage.CLEANING_UP,
                reason=
                f'the bundle was {"killed" if run_state.is_killed else "restaged"}',
            )
            return run_state._replace(stage=RunStage.CLEANING_UP)

        # Check CPU and GPU availability
        try:
            cpuset, gpuset = self.assign_cpu_and_gpu_sets_fn(
                run_state.resources.cpus, run_state.resources.gpus)
        except Exception as e:
            message = "Unexpectedly unable to assign enough resources to bundle {}: {}".format(
                run_state.bundle.uuid, str(e))
            logger.error(message)
            logger.error(traceback.format_exc())
            return run_state._replace(run_status=message)

        dependencies_ready = True
        status_messages = []

        if not self.shared_file_system:
            # No need to download dependencies if we're in the shared FS,
            # since they're already in our FS
            for dep in run_state.bundle.dependencies:
                dep_key = DependencyKey(dep.parent_uuid, dep.parent_path)
                dependency_state = self.dependency_manager.get(
                    run_state.bundle.uuid, dep_key)
                if dependency_state.stage == DependencyStage.DOWNLOADING:
                    status_messages.append(
                        'Downloading dependency %s: %s done (archived size)' %
                        (dep.child_path, size_str(
                            dependency_state.size_bytes)))
                    dependencies_ready = False
                elif dependency_state.stage == DependencyStage.FAILED:
                    # Failed to download dependency; -> CLEANING_UP
                    log_bundle_transition(
                        bundle_uuid=run_state.bundle.uuid,
                        previous_stage=run_state.stage,
                        next_stage=RunStage.CLEANING_UP,
                        reason=
                        f'Dependency has failed for this bundle. Dependency child uuid: {dep.child_uuid}. Dependency child path: {dep.child_path}',
                    )
                    return run_state._replace(
                        stage=RunStage.CLEANING_UP,
                        failure_message='Failed to download dependency %s: %s'
                        % (dep.child_path, dependency_state.message),
                    )

        # get the docker image
        docker_image = run_state.resources.docker_image
        image_state = self.image_manager.get(docker_image)
        if image_state.stage == DependencyStage.DOWNLOADING:
            status_messages.append('Pulling docker image %s %s' %
                                   (docker_image, image_state.message))
            dependencies_ready = False
        elif image_state.stage == DependencyStage.FAILED:
            # Failed to pull image; -> CLEANING_UP
            message = 'Failed to download Docker image: %s' % image_state.message
            logger.error(message)
            return run_state._replace(stage=RunStage.CLEANING_UP,
                                      failure_message=message)

        # stop proceeding if dependency and image downloads aren't all done
        if not dependencies_ready:
            status_message = status_messages.pop()
            if status_messages:
                status_message += "(and downloading %d other dependencies and docker images)" % len(
                    status_messages)
            logger.info(
                f'bundle is not ready yet. uuid: {run_state.bundle.uuid}. status message: {status_message}'
            )
            return run_state._replace(run_status=status_message)

        # All dependencies ready! Set up directories, symlinks and container. Start container.
        # 1) Set up a directory to store the bundle.
        if self.shared_file_system:
            if not os.path.exists(run_state.bundle_path):
                if run_state.bundle_dir_wait_num_tries == 0:
                    message = (
                        "Bundle directory cannot be found on the shared filesystem. "
                        "Please ensure the shared fileystem between the server and "
                        "your worker is mounted properly or contact your administrators."
                    )
                    log_bundle_transition(
                        bundle_uuid=run_state.bundle.uuid,
                        previous_stage=run_state.stage,
                        next_stage=RunStage.CLEANING_UP,
                        reason=
                        "Bundle directory cannot be found on the shared filesystem.",
                    )
                    return run_state._replace(stage=RunStage.CLEANING_UP,
                                              failure_message=message)
                next_bundle_dir_wait_num_tries = run_state.bundle_dir_wait_num_tries - 1
                logger.info(
                    f'Waiting for bundle directory to be created by the server, uuid: {run_state.bundle.uuid}, bundle_dir_wait_num_tries: {next_bundle_dir_wait_num_tries}'
                )
                return run_state._replace(
                    run_status=
                    "Waiting for bundle directory to be created by the server",
                    bundle_dir_wait_num_tries=next_bundle_dir_wait_num_tries,
                )
        else:
            remove_path(run_state.bundle_path)
            os.makedirs(run_state.bundle_path)

        # 2) Set up symlinks
        docker_dependencies = []
        docker_dependencies_path = (
            RunStateMachine._ROOT + run_state.bundle.uuid +
            ('_dependencies' if not self.shared_file_system else ''))

        for dep in run_state.bundle.dependencies:
            full_child_path = os.path.normpath(
                os.path.join(run_state.bundle_path, dep.child_path))
            to_mount = []
            dependency_path = self._get_dependency_path(run_state, dep)

            if dep.child_path == RunStateMachine._CURRENT_DIRECTORY:
                # Mount all the content of the dependency_path to the top-level of the bundle
                for child in os.listdir(dependency_path):
                    child_path = os.path.normpath(
                        os.path.join(run_state.bundle_path, child))
                    to_mount.append(
                        DependencyToMount(
                            docker_path=os.path.join(docker_dependencies_path,
                                                     child),
                            child_path=child_path,
                            parent_path=os.path.join(dependency_path, child),
                        ))
                    run_state = run_state._replace(
                        paths_to_remove=(run_state.paths_to_remove or []) +
                        [child_path])
            else:
                to_mount.append(
                    DependencyToMount(
                        docker_path=os.path.join(docker_dependencies_path,
                                                 dep.child_path),
                        child_path=full_child_path,
                        parent_path=dependency_path,
                    ))

                first_element_of_path = Path(dep.child_path).parts[0]
                if first_element_of_path == RunStateMachine._ROOT:
                    run_state = run_state._replace(
                        paths_to_remove=(run_state.paths_to_remove or []) +
                        [full_child_path])
                else:
                    # child_path can be a nested path, so later remove everything from the first element of the path
                    path_to_remove = os.path.join(run_state.bundle_path,
                                                  first_element_of_path)
                    run_state = run_state._replace(
                        paths_to_remove=(run_state.paths_to_remove or []) +
                        [path_to_remove])
            for dependency in to_mount:
                try:
                    mount_dependency(dependency, self.shared_file_system)
                except OSError as e:
                    log_bundle_transition(
                        bundle_uuid=run_state.bundle.uuid,
                        previous_stage=run_state.stage,
                        next_stage=RunStage.CLEANING_UP,
                        reason=str(e.__class__),
                        level=logging.ERROR,
                    )
                    return run_state._replace(stage=RunStage.CLEANING_UP,
                                              failure_message=str(e))

        if run_state.resources.network:
            docker_network = self.docker_network_external.name
        else:
            docker_network = self.docker_network_internal.name

        # 3) Start container
        try:
            container = docker_utils.start_bundle_container(
                run_state.bundle_path,
                run_state.bundle.uuid,
                docker_dependencies,
                run_state.bundle.command,
                run_state.resources.docker_image,
                network=docker_network,
                cpuset=cpuset,
                gpuset=gpuset,
                memory_bytes=run_state.resources.memory,
                runtime=self.docker_runtime,
            )
            self.worker_docker_network.connect(container)
        except docker_utils.DockerUserErrorException as e:
            message = 'Cannot start Docker container: {}'.format(e)
            log_bundle_transition(
                bundle_uuid=run_state.bundle.uuid,
                previous_stage=run_state.stage,
                next_stage=RunStage.CLEANING_UP,
                reason='Cannot start Docker container.',
                level=logging.ERROR,
            )
            return run_state._replace(stage=RunStage.CLEANING_UP,
                                      failure_message=message)
        except Exception as e:
            message = 'Cannot start container: {}'.format(e)
            logger.error(message)
            logger.error(traceback.format_exc())
            raise

        return run_state._replace(
            stage=RunStage.RUNNING,
            run_status='Running job in container',
            container_id=container.id,
            container=container,
            docker_image=image_state.digest,
            has_contents=True,
            cpuset=cpuset,
            gpuset=gpuset,
        )
    def _transition_from_PREPARING(self, run_state):
        """
        1- Request the docker image from docker image manager
            - if image is failed, move to CLEANING_UP state
        2- Request the dependencies from dependency manager
            - if any are failed, move to CLEANING_UP state
        3- If all dependencies and docker image are ready:
            - Set up the local filesystem for the run
            - Create symlinks to dependencies
            - Allocate resources and prepare the docker container
            - Start the docker container
        4- If all is successful, move to RUNNING state
        """
        if run_state.is_killed:
            return run_state._replace(stage=RunStage.CLEANING_UP)

        dependencies_ready = True
        status_messages = []

        if not self.shared_file_system:
            # No need to download dependencies if we're in the shared FS since they're already in our FS
            for dep_key, dep in run_state.bundle.dependencies.items():
                dependency_state = self.dependency_manager.get(
                    run_state.bundle.uuid, dep_key)
                if dependency_state.stage == DependencyStage.DOWNLOADING:
                    status_messages.append(
                        'Downloading dependency %s: %s done (archived size)' %
                        (dep.child_path, size_str(
                            dependency_state.size_bytes)))
                    dependencies_ready = False
                elif dependency_state.stage == DependencyStage.FAILED:
                    # Failed to download dependency; -> CLEANING_UP
                    return run_state._replace(
                        stage=RunStage.CLEANING_UP,
                        failure_message='Failed to download dependency %s: %s'
                        % (dep.child_path, dependency_state.message),
                    )

        # get the docker image
        docker_image = run_state.resources.docker_image
        image_state = self.docker_image_manager.get(docker_image)
        if image_state.stage == DependencyStage.DOWNLOADING:
            status_messages.append('Pulling docker image: ' +
                                   (image_state.message or docker_image or ""))
            dependencies_ready = False
        elif image_state.stage == DependencyStage.FAILED:
            # Failed to pull image; -> CLEANING_UP
            message = 'Failed to download Docker image: %s' % image_state.message
            logger.error(message)
            return run_state._replace(stage=RunStage.CLEANING_UP,
                                      failure_message=message)

        # stop proceeding if dependency and image downloads aren't all done
        if not dependencies_ready:
            status_message = status_messages.pop()
            if status_messages:
                status_message += "(and downloading %d other dependencies and docker images)" % len(
                    status_messages)
            return run_state._replace(run_status=status_message)

        # All dependencies ready! Set up directories, symlinks and container. Start container.
        # 1) Set up a directory to store the bundle.
        if self.shared_file_system:
            if not os.path.exists(run_state.bundle_path):
                if run_state.bundle_dir_wait_num_tries == 0:
                    message = (
                        "Bundle directory cannot be found on the shared filesystem. "
                        "Please ensure the shared fileystem between the server and "
                        "your worker is mounted properly or contact your administrators."
                    )
                    logger.error(message)
                    return run_state._replace(stage=RunStage.CLEANING_UP,
                                              failure_message=message)
                return run_state._replace(
                    run_status=
                    "Waiting for bundle directory to be created by the server",
                    bundle_dir_wait_num_tries=run_state.
                    bundle_dir_wait_num_tries - 1,
                )
        else:
            remove_path(run_state.bundle_path)
            os.mkdir(run_state.bundle_path)

        # 2) Set up symlinks
        docker_dependencies = []
        docker_dependencies_path = (
            '/' + run_state.bundle.uuid +
            ('_dependencies' if not self.shared_file_system else ''))
        for dep_key, dep in run_state.bundle.dependencies.items():
            full_child_path = os.path.normpath(
                os.path.join(run_state.bundle_path, dep.child_path))
            if not full_child_path.startswith(run_state.bundle_path):
                # Dependencies should end up in their bundles (ie prevent using relative paths like ..
                # to get out of their parent bundles
                message = 'Invalid key for dependency: %s' % (dep.child_path)
                logger.error(message)
                return run_state._replace(stage=RunStage.CLEANING_UP,
                                          failure_message=message)
            docker_dependency_path = os.path.join(docker_dependencies_path,
                                                  dep.child_path)
            if self.shared_file_system:
                # On a shared FS, we know where the dep is stored and can get the contents directly
                dependency_path = os.path.realpath(
                    os.path.join(dep.location, dep.parent_path))
            else:
                # On a dependency_manager setup ask the manager where the dependency is
                dependency_path = os.path.join(
                    self.dependency_manager.dependencies_dir,
                    self.dependency_manager.get(run_state.bundle.uuid,
                                                dep_key).path,
                )
                os.symlink(docker_dependency_path, full_child_path)
            # These are turned into docker volume bindings like:
            #   dependency_path:docker_dependency_path:ro
            docker_dependencies.append(
                (dependency_path, docker_dependency_path))

        # 3) Set up container
        if run_state.resources.network:
            docker_network = self.docker_network_external.name
        else:
            docker_network = self.docker_network_internal.name

        try:
            cpuset, gpuset = self.assign_cpu_and_gpu_sets_fn(
                run_state.resources.cpus, run_state.resources.gpus)
        except Exception as e:
            message = "Cannot assign enough resources: %s" % str(e)
            logger.error(message)
            logger.error(traceback.format_exc())
            return run_state._replace(run_status=message)

        # 4) Start container
        try:
            container = docker_utils.start_bundle_container(
                run_state.bundle_path,
                run_state.bundle.uuid,
                docker_dependencies,
                run_state.bundle.command,
                run_state.resources.docker_image,
                network=docker_network,
                cpuset=cpuset,
                gpuset=gpuset,
                memory_bytes=run_state.resources.memory,
                runtime=self.docker_runtime,
            )
            self.worker_docker_network.connect(container)
        except Exception as e:
            message = 'Cannot start Docker container: {}'.format(e)
            logger.error(message)
            logger.error(traceback.format_exc())
            raise

        return run_state._replace(
            stage=RunStage.RUNNING,
            run_status='Running job in Docker container',
            container_id=container.id,
            container=container,
            docker_image=image_state.digest,
            has_contents=True,
            cpuset=cpuset,
            gpuset=gpuset,
        )