예제 #1
0
 def test_all_dependencies(self):
     """Requesting two distinct dependency keys yields two entries in all_dependencies."""
     # (dependent uuid, parent uuid, parent path) for each request made below
     requests = (
         ("0x2", "0x1", "parent"),
         ("0x4", "0x3", "parent2"),
     )
     for dependent_uuid, parent_uuid, parent_path in requests:
         key = DependencyKey(parent_uuid=parent_uuid, parent_path=parent_path)
         self.dependency_manager.get(dependent_uuid, key)

     self.assertEqual(len(self.dependency_manager.all_dependencies), 2)
예제 #2
0
 def test_get_has(self):
     """A freshly requested dependency is known to the manager and starts out DOWNLOADING."""
     manager = self.dependency_manager
     requester = "0x2"
     key = DependencyKey(parent_uuid="0x1", parent_path="parent")

     fetched = manager.get(requester, key)

     self.assertTrue(manager.has(key))
     self.assertEqual(fetched.stage, "DOWNLOADING")
     self.assertEqual(fetched.path, "0x1_parent")
     # The requester is the only dependent registered so far
     self.assertEqual(fetched.dependents, {requester})
예제 #3
0
def task(work_dir, state_path, random_file_path):
    """
    Runs the end-to-end workflow of the Dependency Manager.
    Note: ProcessPoolExecutor must serialize everything before sending it to the worker,
          so this function needs to be defined at the top-level.

    :param work_dir: directory handed to the DependencyManager as its worker_dir
    :param state_path: path handed to the DependencyManager as its commit_file
    :param random_file_path: path of a file that is opened in binary mode and returned
                             by the mocked bundle service as the bundle contents
    :raises AssertionError: if the run is not registered as a dependent after `get`,
                            or is still listed as a dependent after `release`
    """
    process_id = os.getpid()

    # The mocked bundle service hands out this very file object, so it is kept open
    # until the manager has been stopped and is closed by the `with` afterwards
    # (previously the handle was never closed).
    with open(random_file_path, "rb") as file_obj:
        # Mock Bundle Service to return a random file object
        mock_bundle_service = MagicMock()
        mock_bundle_service.get_bundle_info = MagicMock(
            return_value={'type': "file"})
        mock_bundle_service.get_bundle_contents = MagicMock(
            return_value=file_obj)

        # Create and start a dependency manager
        print(f"{process_id}: Starting a DependencyManager...")
        dependency_manager = DependencyManager(
            commit_file=state_path,
            bundle_service=mock_bundle_service,
            worker_dir=work_dir,
            max_cache_size_bytes=2048,
            download_dependencies_max_retries=1,
        )
        dependency_manager.start()
        try:
            print(f"{process_id}: Started with work directory: {work_dir}.")

            # Register a run's UUID as a dependent of a parent bundle with UUID 0x1
            dependency_key = DependencyKey(parent_uuid="0x1",
                                           parent_path="parent")
            run_uuid = f"0x{process_id}"
            state = dependency_manager.get(run_uuid, dependency_key)
            assert (
                run_uuid in state.dependents
            ), f"{process_id}: Expected {run_uuid} as one of the dependents."

            # Release the run bundle as a dependent
            dependency_manager.release(run_uuid, dependency_key)
            dependencies = dependency_manager._fetch_dependencies()
            if dependency_key in dependencies:
                state = dependencies[dependency_key]
                print(
                    f"{process_id}: Checking {run_uuid} in {state.dependents}")
                assert (
                    run_uuid not in state.dependents
                ), f"{process_id}: Dependent should not be in the list of dependents after unregistering."

            # Keep the dependency manager running for some time to test the loop
            time.sleep(30)
        finally:
            # Stop the Dependency Manager even when an assertion above failed, so a
            # started manager is never left running in the worker process.
            print(f"{process_id}: Stopping DependencyManger...")
            dependency_manager.stop()
    print(f"{process_id}: Done.")
 def _get_dependency_path(self, run_state, dependency):
     """
     Return the path on this machine where the contents of `dependency` can be found.

     On a shared file system the path is resolved directly from the dependency's
     location; otherwise the dependency manager is asked for the path it assigned,
     which is joined onto the manager's dependencies directory.
     """
     if not self.shared_file_system:
         # On a dependency_manager setup, ask the manager where the dependency is
         key = DependencyKey(dependency.parent_uuid, dependency.parent_path)
         dependencies_root = self.dependency_manager.dependencies_dir
         relative_path = self.dependency_manager.get(run_state.bundle.uuid,
                                                     key).path
         return os.path.join(dependencies_root, relative_path)

     # On a shared FS, we know where the dependency is stored and can get the contents directly
     stored_at = os.path.join(dependency.location, dependency.parent_path)
     return os.path.realpath(stored_at)
예제 #5
0
    def _transition_from_CLEANING_UP(self, run_state):
        """
        Clean up after a run and decide which stage comes next.

        1- remove the container if it still exists (killing it first if it has not finished)
        2- release each dependency in the dependency manager (skipped on a shared file system)
        3- remove each dependency's child path from the bundle directory
        4- If the run was restaged, move to RESTAGED state
           Else if not on a shared file system and the bundle has contents to upload,
            move to UPLOADING_RESULTS state
           Otherwise return the result of finalize_run
        """
        if run_state.container_id is not None:
            while docker_utils.container_exists(run_state.container):
                try:
                    done, _, _ = docker_utils.check_finished(
                        run_state.container)
                    if not done:
                        # Still running: try to kill it and re-check on the next iteration
                        try:
                            run_state.container.kill()
                        except docker.errors.APIError:
                            logger.error(traceback.format_exc())
                            time.sleep(1)
                        continue
                    run_state.container.remove(force=True)
                    run_state = run_state._replace(container=None,
                                                   container_id=None)
                    break
                except docker.errors.APIError:
                    logger.error(traceback.format_exc())
                    time.sleep(1)

        for dependency in run_state.bundle.dependencies:
            key = DependencyKey(dependency.parent_uuid,
                                dependency.parent_path)
            # No dependencies to release if shared fs worker
            if not self.shared_file_system:
                self.dependency_manager.release(run_state.bundle.uuid, key)

            mounted_at = os.path.join(run_state.bundle_path,
                                      dependency.child_path)
            try:
                remove_path(mounted_at)
            except Exception:
                # Removal is best effort: log and carry on with the next dependency
                logger.error(traceback.format_exc())

        if run_state.is_restaged:
            return run_state._replace(stage=RunStage.RESTAGED)

        if self.shared_file_system or not run_state.has_contents:
            # Nothing to upload: either results are written directly to the bundle
            # store (shared file system) or the run has no contents
            return self.finalize_run(run_state)

        return run_state._replace(stage=RunStage.UPLOADING_RESULTS,
                                  run_status='Uploading results',
                                  container=None)
예제 #6
0
    def test_release(self):
        """Releasing dependents one at a time shrinks the dependents set down to empty."""
        manager = self.dependency_manager
        key = DependencyKey(parent_uuid="0x1", parent_path="parent")

        def dependents_after_release(dependent_uuid):
            # Release the dependent, then read the stored state back while holding the state lock
            manager.release(dependent_uuid, key)
            with manager._state_lock:
                stored = manager._fetch_dependencies()
            return stored[key].dependents

        manager.get("0x2", key)
        latest = manager.get("0x3", key)
        # Passing in the same dependency key with a different dependent, will just add the dependent
        self.assertEqual(latest.dependents, {"0x2", "0x3"})

        # Release 0x2 as a dependent
        self.assertEqual(dependents_after_release("0x2"), {"0x3"})

        # Release 0x3 as a dependent - should be left with no dependents
        self.assertEqual(len(dependents_after_release("0x3")), 0)
예제 #7
0
    def _transition_from_CLEANING_UP(self, run_state):
        """
        Clean up after a run and decide which stage comes next.

        1- remove the container if it still exists (killing it first if it has not finished)
        2- release the dependencies in dependency manager (skipped on a shared file system)
        3- remove every path recorded in run_state.paths_to_remove
        4- If the run was restaged, move to RESTAGED state
           Else if not on a shared file system and the bundle has contents to upload,
            move to UPLOADING_RESULTS state
           Otherwise delete files matching the bundle's exclude_patterns and
            return the result of finalize_run
        """
        def best_effort_remove(target):
            # Log instead of raising so one bad path does not abort the clean-up
            try:
                remove_path(target)
            except Exception:
                logger.error(traceback.format_exc())

        if run_state.container_id is not None:
            while docker_utils.container_exists(run_state.container):
                try:
                    done, _, _ = docker_utils.check_finished(
                        run_state.container)
                    if not done:
                        # Still running: try to kill it and re-check on the next iteration
                        try:
                            run_state.container.kill()
                        except docker.errors.APIError:
                            logger.error(traceback.format_exc())
                            time.sleep(1)
                        continue
                    run_state.container.remove(force=True)
                    run_state = run_state._replace(container=None,
                                                   container_id=None)
                    break
                except docker.errors.APIError:
                    logger.error(traceback.format_exc())
                    time.sleep(1)

        for dependency in run_state.bundle.dependencies:
            if self.shared_file_system:
                # No dependencies if shared fs worker
                continue
            key = DependencyKey(dependency.parent_uuid,
                                dependency.parent_path)
            self.dependency_manager.release(run_state.bundle.uuid, key)

        # Clean up dependencies paths
        for stale_path in run_state.paths_to_remove or []:
            best_effort_remove(stale_path)
        run_state = run_state._replace(paths_to_remove=[])

        if run_state.is_restaged:
            log_bundle_transition(
                bundle_uuid=run_state.bundle.uuid,
                previous_stage=run_state.stage,
                next_stage=RunStage.RESTAGED,
                reason=self.RESTAGED_REASON,
            )
            return run_state._replace(stage=RunStage.RESTAGED)

        if not self.shared_file_system and run_state.has_contents:
            log_bundle_transition(
                bundle_uuid=run_state.bundle.uuid,
                previous_stage=run_state.stage,
                next_stage=RunStage.UPLOADING_RESULTS,
            )
            return run_state._replace(stage=RunStage.UPLOADING_RESULTS,
                                      run_status='Uploading results',
                                      container=None)

        # No need to upload results since results are directly written to bundle store
        # Delete any files that match the exclude_patterns .
        for pattern in run_state.bundle.metadata["exclude_patterns"]:
            anchored_pattern = os.path.join(run_state.bundle_path, pattern)
            for match in glob.glob(anchored_pattern, recursive=True):
                # Only remove files that are subpaths of run_state.bundle_path, in case
                # that exclude_pattern is something like "../../../".
                inside_bundle = path_is_parent(
                    parent_path=run_state.bundle_path, child_path=match)
                if inside_bundle:
                    remove_path(match)
        return self.finalize_run(run_state)
예제 #8
0
    def _transition_from_PREPARING(self, run_state):
        """
        Advance a run that is currently in the PREPARING stage.

        If the run was killed or restaged, move straight to CLEANING_UP state. Otherwise:

        1- Request the docker image from docker image manager
            - if image is failed, move to CLEANING_UP state
        2- Request the dependencies from dependency manager
            - if any are failed, move to CLEANING_UP state
        3- If all dependencies and docker image are ready:
            - Set up the local filesystem for the run
            - Create symlinks to dependencies
            - Allocate resources and prepare the docker container
            - Start the docker container
        4- If all is successful, move to RUNNING state

        Note: in the code below the dependencies are requested before the docker image.

        Returns a new run state built with `run_state._replace(...)`. While downloads are
        pending, or CPU/GPU sets cannot be assigned, the stage is left unchanged and only
        run_status is updated.

        Raises: any exception other than docker_utils.DockerUserErrorException raised
        while starting the container is logged and re-raised.
        """
        def mount_dependency(dependency, shared_file_system):
            """
            Record the Docker volume binding for `dependency` and, when not on a shared
            file system, create a symlink at dependency.child_path pointing to
            dependency.docker_path (creating missing parent directories first).

            Appends to `docker_dependencies`, a list of the enclosing method that is
            assigned further down, before this helper is first called.
            """
            if not shared_file_system:
                # Set up symlinks for the content at dependency path
                Path(dependency.child_path).parent.mkdir(parents=True,
                                                         exist_ok=True)
                os.symlink(dependency.docker_path, dependency.child_path)
            # The following will be converted into a Docker volume binding like:
            #   dependency_path:docker_dependency_path:ro
            docker_dependencies.append(
                (dependency.parent_path, dependency.docker_path))

        # Nothing to prepare for a run that was killed or restaged; -> CLEANING_UP
        if run_state.is_killed or run_state.is_restaged:
            log_bundle_transition(
                bundle_uuid=run_state.bundle.uuid,
                previous_stage=run_state.stage,
                next_stage=RunStage.CLEANING_UP,
                reason=
                f'the bundle was {"killed" if run_state.is_killed else "restaged"}',
            )
            return run_state._replace(stage=RunStage.CLEANING_UP)

        # Check CPU and GPU availability
        try:
            cpuset, gpuset = self.assign_cpu_and_gpu_sets_fn(
                run_state.resources.cpus, run_state.resources.gpus)
        except Exception as e:
            message = "Unexpectedly unable to assign enough resources to bundle {}: {}".format(
                run_state.bundle.uuid, str(e))
            logger.error(message)
            logger.error(traceback.format_exc())
            # The stage is not changed here; only the status message is updated
            return run_state._replace(run_status=message)

        # Set to False as soon as any dependency or the docker image is still downloading
        dependencies_ready = True
        # One human-readable message per pending download
        status_messages = []

        if not self.shared_file_system:
            # No need to download dependencies if we're in the shared FS,
            # since they're already in our FS
            for dep in run_state.bundle.dependencies:
                dep_key = DependencyKey(dep.parent_uuid, dep.parent_path)
                dependency_state = self.dependency_manager.get(
                    run_state.bundle.uuid, dep_key)
                if dependency_state.stage == DependencyStage.DOWNLOADING:
                    status_messages.append(
                        'Downloading dependency %s: %s done (archived size)' %
                        (dep.child_path, size_str(
                            dependency_state.size_bytes)))
                    dependencies_ready = False
                elif dependency_state.stage == DependencyStage.FAILED:
                    # Failed to download dependency; -> CLEANING_UP
                    log_bundle_transition(
                        bundle_uuid=run_state.bundle.uuid,
                        previous_stage=run_state.stage,
                        next_stage=RunStage.CLEANING_UP,
                        reason=
                        f'Dependency has failed for this bundle. Dependency child uuid: {dep.child_uuid}. Dependency child path: {dep.child_path}',
                    )
                    return run_state._replace(
                        stage=RunStage.CLEANING_UP,
                        failure_message='Failed to download dependency %s: %s'
                        % (dep.child_path, dependency_state.message),
                    )

        # get the docker image
        docker_image = run_state.resources.docker_image
        image_state = self.image_manager.get(docker_image)
        if image_state.stage == DependencyStage.DOWNLOADING:
            status_messages.append('Pulling docker image %s %s' %
                                   (docker_image, image_state.message))
            dependencies_ready = False
        elif image_state.stage == DependencyStage.FAILED:
            # Failed to pull image; -> CLEANING_UP
            message = 'Failed to download Docker image: %s' % image_state.message
            logger.error(message)
            return run_state._replace(stage=RunStage.CLEANING_UP,
                                      failure_message=message)

        # stop proceeding if dependency and image downloads aren't all done
        if not dependencies_ready:
            # Report the most recently appended message in full and only count the others
            status_message = status_messages.pop()
            if status_messages:
                status_message += "(and downloading %d other dependencies and docker images)" % len(
                    status_messages)
            logger.info(
                f'bundle is not ready yet. uuid: {run_state.bundle.uuid}. status message: {status_message}'
            )
            return run_state._replace(run_status=status_message)

        # All dependencies ready! Set up directories, symlinks and container. Start container.
        # 1) Set up a directory to store the bundle.
        if self.shared_file_system:
            if not os.path.exists(run_state.bundle_path):
                # bundle_dir_wait_num_tries is decremented on every pass through here;
                # once it has reached 0 the run gives up and moves to CLEANING_UP
                if run_state.bundle_dir_wait_num_tries == 0:
                    message = (
                        "Bundle directory cannot be found on the shared filesystem. "
                        "Please ensure the shared fileystem between the server and "
                        "your worker is mounted properly or contact your administrators."
                    )
                    log_bundle_transition(
                        bundle_uuid=run_state.bundle.uuid,
                        previous_stage=run_state.stage,
                        next_stage=RunStage.CLEANING_UP,
                        reason=
                        "Bundle directory cannot be found on the shared filesystem.",
                    )
                    return run_state._replace(stage=RunStage.CLEANING_UP,
                                              failure_message=message)
                next_bundle_dir_wait_num_tries = run_state.bundle_dir_wait_num_tries - 1
                logger.info(
                    f'Waiting for bundle directory to be created by the server, uuid: {run_state.bundle.uuid}, bundle_dir_wait_num_tries: {next_bundle_dir_wait_num_tries}'
                )
                return run_state._replace(
                    run_status=
                    "Waiting for bundle directory to be created by the server",
                    bundle_dir_wait_num_tries=next_bundle_dir_wait_num_tries,
                )
        else:
            # Not on a shared file system: start from a fresh, empty bundle directory
            remove_path(run_state.bundle_path)
            os.makedirs(run_state.bundle_path)

        # 2) Set up symlinks
        # Filled in by mount_dependency with (parent_path, docker_path) pairs
        docker_dependencies = []
        # Path prefix used for the docker_path of every dependency; the '_dependencies'
        # suffix is only added when not on a shared file system
        docker_dependencies_path = (
            RunStateMachine._ROOT + run_state.bundle.uuid +
            ('_dependencies' if not self.shared_file_system else ''))

        for dep in run_state.bundle.dependencies:
            full_child_path = os.path.normpath(
                os.path.join(run_state.bundle_path, dep.child_path))
            to_mount = []
            dependency_path = self._get_dependency_path(run_state, dep)

            if dep.child_path == RunStateMachine._CURRENT_DIRECTORY:
                # Mount all the content of the dependency_path to the top-level of the bundle
                for child in os.listdir(dependency_path):
                    child_path = os.path.normpath(
                        os.path.join(run_state.bundle_path, child))
                    to_mount.append(
                        DependencyToMount(
                            docker_path=os.path.join(docker_dependencies_path,
                                                     child),
                            child_path=child_path,
                            parent_path=os.path.join(dependency_path, child),
                        ))
                    # Remember each mounted child in paths_to_remove
                    run_state = run_state._replace(
                        paths_to_remove=(run_state.paths_to_remove or []) +
                        [child_path])
            else:
                to_mount.append(
                    DependencyToMount(
                        docker_path=os.path.join(docker_dependencies_path,
                                                 dep.child_path),
                        child_path=full_child_path,
                        parent_path=dependency_path,
                    ))

                first_element_of_path = Path(dep.child_path).parts[0]
                if first_element_of_path == RunStateMachine._ROOT:
                    # child_path starts at the root: record the full child path itself
                    run_state = run_state._replace(
                        paths_to_remove=(run_state.paths_to_remove or []) +
                        [full_child_path])
                else:
                    # child_path can be a nested path, so later remove everything from the first element of the path
                    path_to_remove = os.path.join(run_state.bundle_path,
                                                  first_element_of_path)
                    run_state = run_state._replace(
                        paths_to_remove=(run_state.paths_to_remove or []) +
                        [path_to_remove])
            for dependency in to_mount:
                try:
                    mount_dependency(dependency, self.shared_file_system)
                except OSError as e:
                    # Could not create the directories or the symlink; -> CLEANING_UP
                    log_bundle_transition(
                        bundle_uuid=run_state.bundle.uuid,
                        previous_stage=run_state.stage,
                        next_stage=RunStage.CLEANING_UP,
                        reason=str(e.__class__),
                        level=logging.ERROR,
                    )
                    return run_state._replace(stage=RunStage.CLEANING_UP,
                                              failure_message=str(e))

        # Choose the docker network based on whether the run requested network access
        if run_state.resources.network:
            docker_network = self.docker_network_external.name
        else:
            docker_network = self.docker_network_internal.name

        # 3) Start container
        try:
            container = docker_utils.start_bundle_container(
                run_state.bundle_path,
                run_state.bundle.uuid,
                docker_dependencies,
                run_state.bundle.command,
                run_state.resources.docker_image,
                network=docker_network,
                cpuset=cpuset,
                gpuset=gpuset,
                memory_bytes=run_state.resources.memory,
                runtime=self.docker_runtime,
            )
            self.worker_docker_network.connect(container)
        except docker_utils.DockerUserErrorException as e:
            # This exception type fails the run with a message; -> CLEANING_UP
            message = 'Cannot start Docker container: {}'.format(e)
            log_bundle_transition(
                bundle_uuid=run_state.bundle.uuid,
                previous_stage=run_state.stage,
                next_stage=RunStage.CLEANING_UP,
                reason='Cannot start Docker container.',
                level=logging.ERROR,
            )
            return run_state._replace(stage=RunStage.CLEANING_UP,
                                      failure_message=message)
        except Exception as e:
            # Any other failure is logged and propagated to the caller
            message = 'Cannot start container: {}'.format(e)
            logger.error(message)
            logger.error(traceback.format_exc())
            raise

        # Container started; -> RUNNING
        return run_state._replace(
            stage=RunStage.RUNNING,
            run_status='Running job in container',
            container_id=container.id,
            container=container,
            docker_image=image_state.digest,
            has_contents=True,
            cpuset=cpuset,
            gpuset=gpuset,
        )
예제 #9
0
    def _transition_from_PREPARING(self, run_state):
        """
        Advance a run that is currently in the PREPARING stage.

        If the run was killed, move straight to CLEANING_UP state. Otherwise:

        1- Request the docker image from docker image manager
            - if image is failed, move to CLEANING_UP state
        2- Request the dependencies from dependency manager
            - if any are failed, move to CLEANING_UP state
        3- If all dependencies and docker image are ready:
            - Set up the local filesystem for the run
            - Create symlinks to dependencies
            - Allocate resources and prepare the docker container
            - Start the docker container
        4- If all is successful, move to RUNNING state

        Note: in the code below the dependencies are requested before the docker image.

        Returns a new run state built with `run_state._replace(...)`. While downloads are
        pending, or CPU/GPU sets cannot be assigned, the stage is left unchanged and only
        run_status is updated.

        Raises: any exception other than docker_utils.DockerUserErrorException raised
        while starting the container is logged and re-raised.
        """
        if run_state.is_killed:
            return run_state._replace(stage=RunStage.CLEANING_UP)

        # Check CPU and GPU availability
        try:
            cpuset, gpuset = self.assign_cpu_and_gpu_sets_fn(
                run_state.resources.cpus, run_state.resources.gpus)
        except Exception as e:
            message = "Unexpectedly unable to assign enough resources: %s" % str(
                e)
            logger.error(message)
            logger.error(traceback.format_exc())
            # The stage is not changed here; only the status message is updated
            return run_state._replace(run_status=message)

        # Set to False as soon as any dependency or the docker image is still downloading
        dependencies_ready = True
        # One human-readable message per pending download
        status_messages = []

        if not self.shared_file_system:
            # No need to download dependencies if we're in the shared FS since they're already in our FS
            for dep in run_state.bundle.dependencies:
                dep_key = DependencyKey(dep.parent_uuid, dep.parent_path)
                dependency_state = self.dependency_manager.get(
                    run_state.bundle.uuid, dep_key)
                if dependency_state.stage == DependencyStage.DOWNLOADING:
                    status_messages.append(
                        'Downloading dependency %s: %s done (archived size)' %
                        (dep.child_path, size_str(
                            dependency_state.size_bytes)))
                    dependencies_ready = False
                elif dependency_state.stage == DependencyStage.FAILED:
                    # Failed to download dependency; -> CLEANING_UP
                    return run_state._replace(
                        stage=RunStage.CLEANING_UP,
                        failure_message='Failed to download dependency %s: %s'
                        % (dep.child_path, dependency_state.message),
                    )

        # get the docker image
        docker_image = run_state.resources.docker_image
        image_state = self.docker_image_manager.get(docker_image)
        if image_state.stage == DependencyStage.DOWNLOADING:
            status_messages.append('Pulling docker image: ' +
                                   (image_state.message or docker_image or ""))
            dependencies_ready = False
        elif image_state.stage == DependencyStage.FAILED:
            # Failed to pull image; -> CLEANING_UP
            message = 'Failed to download Docker image: %s' % image_state.message
            logger.error(message)
            return run_state._replace(stage=RunStage.CLEANING_UP,
                                      failure_message=message)

        # stop proceeding if dependency and image downloads aren't all done
        if not dependencies_ready:
            # Report the most recently appended message in full and only count the others
            status_message = status_messages.pop()
            if status_messages:
                status_message += "(and downloading %d other dependencies and docker images)" % len(
                    status_messages)
            return run_state._replace(run_status=status_message)

        # All dependencies ready! Set up directories, symlinks and container. Start container.
        # 1) Set up a directory to store the bundle.
        if self.shared_file_system:
            if not os.path.exists(run_state.bundle_path):
                # bundle_dir_wait_num_tries is decremented on every pass through here;
                # once it has reached 0 the run gives up and moves to CLEANING_UP
                if run_state.bundle_dir_wait_num_tries == 0:
                    message = (
                        "Bundle directory cannot be found on the shared filesystem. "
                        "Please ensure the shared fileystem between the server and "
                        "your worker is mounted properly or contact your administrators."
                    )
                    logger.error(message)
                    return run_state._replace(stage=RunStage.CLEANING_UP,
                                              failure_message=message)
                return run_state._replace(
                    run_status=
                    "Waiting for bundle directory to be created by the server",
                    bundle_dir_wait_num_tries=run_state.
                    bundle_dir_wait_num_tries - 1,
                )
        else:
            # Not on a shared file system: start from a fresh, empty bundle directory
            remove_path(run_state.bundle_path)
            os.mkdir(run_state.bundle_path)

        # 2) Set up symlinks
        docker_dependencies = []
        docker_dependencies_path = (
            '/' + run_state.bundle.uuid +
            ('_dependencies' if not self.shared_file_system else ''))

        # Normalised bundle directory, plus the same path with a trailing separator.
        # Comparing against the separator-terminated form keeps a sibling directory
        # that merely shares the name prefix (e.g. "<bundle_path>_x") from passing
        # the containment check below, which a bare startswith() would allow.
        bundle_root = os.path.normpath(run_state.bundle_path)
        bundle_root_prefix = os.path.join(bundle_root, '')

        for dep in run_state.bundle.dependencies:
            dep_key = DependencyKey(dep.parent_uuid, dep.parent_path)
            full_child_path = os.path.normpath(
                os.path.join(run_state.bundle_path, dep.child_path))
            inside_bundle = (full_child_path == bundle_root
                             or full_child_path.startswith(bundle_root_prefix))
            if not inside_bundle:
                # Dependencies should end up in their bundles (ie prevent using relative paths like ..
                # to get out of their parent bundles
                message = 'Invalid key for dependency: %s' % (dep.child_path)
                logger.error(message)
                return run_state._replace(stage=RunStage.CLEANING_UP,
                                          failure_message=message)
            docker_dependency_path = os.path.join(docker_dependencies_path,
                                                  dep.child_path)
            if self.shared_file_system:
                # On a shared FS, we know where the dep is stored and can get the contents directly
                dependency_path = os.path.realpath(
                    os.path.join(dep.location, dep.parent_path))
            else:
                # On a dependency_manager setup ask the manager where the dependency is
                dependency_path = os.path.join(
                    self.dependency_manager.dependencies_dir,
                    self.dependency_manager.get(run_state.bundle.uuid,
                                                dep_key).path,
                )
                # Symlink created at full_child_path, pointing to docker_dependency_path
                os.symlink(docker_dependency_path, full_child_path)
            # These are turned into docker volume bindings like:
            #   dependency_path:docker_dependency_path:ro
            docker_dependencies.append(
                (dependency_path, docker_dependency_path))

        # Choose the docker network based on whether the run requested network access
        if run_state.resources.network:
            docker_network = self.docker_network_external.name
        else:
            docker_network = self.docker_network_internal.name

        # 3) Start container
        try:
            container = docker_utils.start_bundle_container(
                run_state.bundle_path,
                run_state.bundle.uuid,
                docker_dependencies,
                run_state.bundle.command,
                run_state.resources.docker_image,
                network=docker_network,
                cpuset=cpuset,
                gpuset=gpuset,
                memory_bytes=run_state.resources.memory,
                runtime=self.docker_runtime,
            )
            self.worker_docker_network.connect(container)
        except docker_utils.DockerUserErrorException as e:
            # This exception type fails the run with a message; -> CLEANING_UP
            message = 'Cannot start Docker container: {}'.format(e)
            logger.warning(message)
            return run_state._replace(stage=RunStage.CLEANING_UP,
                                      failure_message=message)
        except Exception as e:
            # Any other failure is logged and propagated to the caller
            message = 'Cannot start Docker container: {}'.format(e)
            logger.error(message)
            logger.error(traceback.format_exc())
            raise

        # Container started; -> RUNNING
        return run_state._replace(
            stage=RunStage.RUNNING,
            run_status='Running job in Docker container',
            container_id=container.id,
            container=container,
            docker_image=image_state.digest,
            has_contents=True,
            cpuset=cpuset,
            gpuset=gpuset,
        )