def un_tar_directory(fileobj, directory_path, compression='', force=False):
    """
    Extracts the given file-like object containing a tar archive into the given
    directory, which is created by this function and should not already exist.

    If the directory already exists and `force` is `False`, `os.mkdir` raises
    an error. If it already exists and `force` is `True`, the existing path is
    removed first and the directory is recreated.

    compression specifies the compression scheme and can be one of '', 'gz' or
    'bz2'.

    Raises tarfile.TarError if the archive is not valid, or if any member
    would be extracted outside of `directory_path`.
    """
    directory_path = os.path.realpath(directory_path)
    if force:
        # Imported inside the function, as in the original implementation.
        from codalab.worker.file_util import remove_path

        remove_path(directory_path)
    os.mkdir(directory_path)
    with tarfile.open(fileobj=fileobj, mode='r|' + compression) as tar:
        for member in tar:
            # Make sure that there is no trickery going on (see note in
            # TarFile.extractall() documentation).
            member_path = os.path.realpath(os.path.join(directory_path, member.name))
            # Compare whole path components. A plain string-prefix test would
            # wrongly accept a sibling such as "<dir>_evil/x" for "<dir>".
            try:
                is_inside = os.path.commonpath([directory_path, member_path]) == directory_path
            except ValueError:
                # e.g. paths on different drives: definitely not inside.
                is_inside = False
            if not is_inside:
                raise tarfile.TarError('Archive member extracts outside the directory.')
            tar.extract(member, directory_path)
Exemple #2
0
 def _try_start_bundle(self, workers, worker, bundle):
     """
     Attempt to start the bundle on the given worker.

     Returns True when the run message was delivered to the worker. Returns
     False when the model refuses to mark the bundle as starting, or when
     the message could not be delivered (in which case the bundle is
     restaged in both the model and `workers`).
     """
     if not self._model.set_starting_bundle(
             bundle, worker['user_id'], worker['worker_id']):
         return False

     workers.set_starting(bundle.uuid, worker)

     if (self._worker_model.shared_file_system
             and worker['user_id'] == self._model.root_user_id):
         # On a shared file system we create the path here to avoid NFS
         # directory cache issues.
         bundle_dir = self._bundle_store.get_bundle_location(bundle.uuid)
         remove_path(bundle_dir)
         os.mkdir(bundle_dir)

     delivered = self._worker_model.send_json_message(
         worker['socket_id'],
         self._construct_run_message(worker, bundle),
         0.2,
     )
     if not delivered:
         # The worker did not take the message: put the bundle back.
         self._model.restage_bundle(bundle)
         workers.restage(bundle.uuid)
         return False

     logger.info('Starting run bundle %s', bundle.uuid)
     return True
    def _try_start_bundle(self, workers, worker, bundle, bundle_resources):
        """
        Attempt to start the bundle on the given worker.

        Returns False if the worker's user lacks run permission on the bundle,
        if the model cannot transition the bundle to starting, or if the run
        message cannot be delivered (the bundle is then moved back to staged).
        Returns True once the run message has been sent.
        """
        worker_user = self._model.get_user(worker['user_id'])
        if not check_bundle_have_run_permission(self._model, worker_user, bundle):
            return False
        if not self._model.transition_bundle_starting(
                bundle, worker['user_id'], worker['worker_id']):
            return False

        workers.set_starting(bundle.uuid, worker['worker_id'])

        if worker['shared_file_system']:
            # On a shared file system we create the path here to avoid NFS
            # directory cache issues.
            # TODO(Ashwin): fix for --link
            bundle_dir = self._bundle_store.get_bundle_location(bundle.uuid)
            remove_path(bundle_dir)
            os.mkdir(bundle_dir)

        delivered = self._worker_model.send_json_message(
            worker['socket_id'],
            self._construct_run_message(worker['shared_file_system'], bundle,
                                        bundle_resources),
            0.2,
        )
        if not delivered:
            # Delivery failed: return the bundle to the staged state.
            self._model.transition_bundle_staged(bundle)
            workers.restage(bundle.uuid)
            return False

        logger.info('Starting run bundle {} on worker {}'.format(
            bundle.uuid, worker['worker_id']))
        return True
    def test_empty(self):
        """Archiving an empty directory and unarchiving it yields an empty directory."""
        source_dir = tempfile.mkdtemp()
        self.addCleanup(remove_path, source_dir)
        scratch_dir = tempfile.mkdtemp()
        self.addCleanup(remove_path, scratch_dir)

        output_dir = os.path.join(scratch_dir, 'output')
        archive = self.archive(source_dir)
        self.unarchive(archive, output_dir, 'gz')
        self.assertEqual(os.listdir(output_dir), [])
Exemple #5
0
    def test_tar_empty(self):
        """A tar.gz of an empty directory extracts to an empty directory."""
        source_dir = tempfile.mkdtemp()
        self.addCleanup(remove_path, source_dir)
        scratch_dir = tempfile.mkdtemp()
        self.addCleanup(remove_path, scratch_dir)

        output_dir = os.path.join(scratch_dir, 'output')
        archive = tar_gzip_directory(source_dir)
        un_tar_directory(archive, output_dir, 'gz')
        self.assertEqual(os.listdir(output_dir), [])
Exemple #6
0
    def _make_bundle(self, bundle):
        """
        Build the contents of `bundle` on disk by copying its dependencies
        into the bundle's location in the bundle store.

        Each dependency must resolve to an existing path (or symlink) inside
        its parent bundle's directory, and each target (child) path must lie
        inside this bundle's directory; otherwise the bundle is failed.

        On success the disk metadata is updated and the bundle is moved to
        State.READY. On any exception the bundle is moved to State.FAILED with
        the exception text as `failure_message`. In all cases the bundle's
        uuid is removed from `self._make_uuids` under `self._make_uuids_lock`.
        """

        def is_within(candidate, root):
            # True only if `candidate` is `root` itself or lies beneath it.
            # A bare startswith() would also accept siblings that merely share
            # a name prefix (e.g. "/store/0xab" vs "/store/0xabc/secret").
            root = os.path.normpath(root)
            prefix = root if root.endswith(os.sep) else root + os.sep
            return candidate == root or candidate.startswith(prefix)

        try:
            bundle_location = self._bundle_store.get_bundle_location(
                bundle.uuid)
            path = os.path.normpath(bundle_location)

            deps = []
            for dep in bundle.dependencies:
                parent_bundle_path = os.path.normpath(
                    self._bundle_store.get_bundle_location(dep.parent_uuid))
                dependency_path = os.path.normpath(
                    os.path.join(parent_bundle_path, dep.parent_path))
                # The source must stay inside the parent bundle and must exist
                # (a dangling symlink still counts as existing).
                if not is_within(dependency_path, parent_bundle_path) or (
                        not os.path.islink(dependency_path)
                        and not os.path.exists(dependency_path)):
                    raise Exception('Invalid dependency %s' %
                                    (path_util.safe_join(
                                        dep.parent_uuid, dep.parent_path)))

                child_path = os.path.normpath(
                    os.path.join(path, dep.child_path))
                # The destination must stay inside this bundle's directory.
                if not is_within(child_path, path):
                    raise Exception('Invalid key for dependency: %s' %
                                    (dep.child_path))

                deps.append((dependency_path, child_path))

            # Start from a clean slate before copying anything in.
            remove_path(path)

            if len(deps) == 1 and deps[0][1] == path:
                # A single dependency mapped onto the bundle root becomes the
                # bundle itself.
                path_util.copy(deps[0][0], path, follow_symlinks=False)
            else:
                os.mkdir(path)
                for dependency_path, child_path in deps:
                    path_util.copy(dependency_path,
                                   child_path,
                                   follow_symlinks=False)

            self._model.update_disk_metadata(bundle,
                                             bundle_location,
                                             enforce_disk_quota=True)
            logger.info('Finished making bundle %s', bundle.uuid)
            self._model.update_bundle(bundle, {'state': State.READY})
        except Exception as e:
            logger.info('Failing bundle %s: %s', bundle.uuid, str(e))
            self._model.update_bundle(bundle, {
                'state': State.FAILED,
                'metadata': {
                    'failure_message': str(e)
                }
            })
        finally:
            with self._make_uuids_lock:
                self._make_uuids.remove(bundle.uuid)
 def _transition_from_FINALIZING(self, run_state):
     """
     If a full worker cycle has passed since we got into FINALIZING we already reported to
     server so can move on to FINISHED. Can also remove bundle_path now
     """
     if run_state.finalized:
         if not self.shared_file_system:
             remove_path(run_state.bundle_path)  # don't remove bundle if shared FS
         return run_state._replace(stage=RunStage.FINISHED, run_status='Finished')
     else:
         return run_state
Exemple #8
0
    def _transition_from_CLEANING_UP(self, run_state):
        """
        Clean up after a run and pick the next stage:

        1- delete the container if it still exists (killing it first if it
           has not finished), retrying after a one-second pause on Docker
           API errors
        2- release the dependencies in the dependency manager (skipped on a
           shared file system) and remove them from the bundle directory
        3- if the run is restaged, move to RESTAGED
        4- if not on a shared file system and the bundle has contents, move
           to UPLOADING_RESULTS; otherwise hand over to self.finalize_run
        """
        if run_state.container_id is not None:
            while docker_utils.container_exists(run_state.container):
                try:
                    finished, _, _ = docker_utils.check_finished(
                        run_state.container)
                    if not finished:
                        # Still running: ask Docker to kill it, then re-check.
                        try:
                            run_state.container.kill()
                        except docker.errors.APIError:
                            logger.error(traceback.format_exc())
                            time.sleep(1)
                        continue
                    run_state.container.remove(force=True)
                    run_state = run_state._replace(container=None,
                                                   container_id=None)
                    break
                except docker.errors.APIError:
                    logger.error(traceback.format_exc())
                    time.sleep(1)

        bundle = run_state.bundle
        for dep in bundle.dependencies:
            dep_key = DependencyKey(dep.parent_uuid, dep.parent_path)
            # No dependencies if shared fs worker
            if not self.shared_file_system:
                self.dependency_manager.release(bundle.uuid, dep_key)

            dep_location = os.path.join(run_state.bundle_path, dep.child_path)
            try:
                remove_path(dep_location)
            except Exception:
                # Removal is best-effort; record the failure and keep going.
                logger.error(traceback.format_exc())

        if run_state.is_restaged:
            return run_state._replace(stage=RunStage.RESTAGED)

        # On a shared file system results are written directly to the bundle
        # store, so there is nothing to upload.
        needs_upload = not self.shared_file_system and run_state.has_contents
        if not needs_upload:
            return self.finalize_run(run_state)
        return run_state._replace(stage=RunStage.UPLOADING_RESULTS,
                                  run_status='Uploading results',
                                  container=None)
Exemple #9
0
 def _delete_dependency(self, dependency_key):
     """
     Remove the given dependency from the manager's state.
     Also delete any known files on the filesystem if any exist.

     Does nothing if `self._acquire_if_exists(dependency_key)` is falsy.
     File removal is best-effort: a failure is logged but never raised,
     and the dependency entry is dropped and its lock released regardless.
     """
     if not self._acquire_if_exists(dependency_key):
         return

     try:
         path_to_remove = self._dependencies[dependency_key].path
         self._paths.remove(path_to_remove)
         remove_path(path_to_remove)
     except Exception:
         # Keep the cleanup best-effort, but leave a trace instead of
         # silently hiding a failed removal (e.g. files left on disk).
         import logging

         logging.getLogger(__name__).warning(
             'Failed to fully remove files for dependency %s',
             dependency_key,
             exc_info=True,
         )
     finally:
         del self._dependencies[dependency_key]
         self._dependency_locks[dependency_key].release()
 def _transition_from_FINALIZING(self, run_state):
     """
     If a full worker cycle has passed since we got into the FINALIZING state we already reported to
     server, if bundle is going be sent back to the server, move on to the RESTAGED state. Otherwise,
     move on to the FINISHED state. Can also remove bundle_path now.
     """
     if run_state.is_restaged:
         return run_state._replace(stage=RunStage.RESTAGED)
     elif run_state.finalized:
         if not self.shared_file_system:
             remove_path(
                 run_state.bundle_path)  # don't remove bundle if shared FS
         return run_state._replace(stage=RunStage.FINISHED,
                                   run_status='Finished')
     else:
         return run_state
Exemple #11
0
    def _sync_state(self):
        """
        Reconcile the loaded dependency state with the local file system.

        Two sources are compared:
        1. self._dependencies, self._dependency_locks and self._paths, which
           are populated from dependencies-state.json in _load_state()
        2. the directories found under self.dependencies_dir

        Only what appears in both survives: dependencies whose path is not on
        disk are dropped from the state, directories not referenced by the
        state are removed from disk, and the result is saved back.
        """
        # Directories on disk, and the paths recorded in the loaded state.
        on_disk = set(os.listdir(self.dependencies_dir))
        recorded_paths = [
            state.path for state in self._dependencies.values()
        ]
        self._paths = self._paths.intersection(recorded_paths).intersection(
            on_disk)

        # Forget dependencies whose path did not survive the intersection.
        orphaned_keys = [
            key for key, state in self._dependencies.items()
            if state.path not in self._paths
        ]
        for key in orphaned_keys:
            missing_path = os.path.join(self.dependencies_dir,
                                        self._dependencies[key].path)
            logger.info(
                "Dependency {} in dependency state but its path {} doesn't exist on the local file system. "
                "Removing it from dependency state.".format(key, missing_path))
            del self._dependencies[key]
            del self._dependency_locks[key]

        # Delete directories on disk that the state no longer references.
        for entry in on_disk - self._paths:
            full_path = os.path.join(self.dependencies_dir, entry)
            logger.info(
                "Remove orphaned directory {} from the local file system.".
                format(full_path))
            remove_path(full_path)

        # The state may have changed above, so persist it to the state file.
        self._save_state()
 def _transition_from_FINALIZING(self, run_state):
     """
     If a full worker cycle has passed since we got into the FINALIZING state we already reported to
     server, if bundle is going be sent back to the server, move on to the RESTAGED state. Otherwise,
     move on to the FINISHED state. Can also remove bundle_path now.
     """
     if run_state.is_restaged:
         log_bundle_transition(
             bundle_uuid=run_state.bundle.uuid,
             previous_stage=run_state.stage,
             next_stage=RunStage.RESTAGED,
             reason='the bundle is restaged, as `pass-down-termination` is specified for worker',
         )
         return run_state._replace(stage=RunStage.RESTAGED)
     elif run_state.finalized:
         if not self.shared_file_system:
             remove_path(run_state.bundle_path)  # don't remove bundle if shared FS
         return run_state._replace(stage=RunStage.FINISHED, run_status='Finished')
     else:
         return run_state
Exemple #13
0
    def test_always_ignore(self):
        """Always-ignored entries ('._*' files, '__MACOSX') are left out of the archive."""
        scratch_dir = tempfile.mkdtemp()
        self.addCleanup(remove_path, scratch_dir)
        output_dir = os.path.join(scratch_dir, 'output')

        archive = self.archive(IGNORE_TEST_DIR)
        self.unarchive(archive, output_dir, 'gz')

        top_level = os.listdir(output_dir)
        self.assertNotIn('._ignored', top_level)
        self.assertIn('dir', top_level)
        self.assertNotIn('__MACOSX', top_level)

        # The same rules apply inside sub-directories.
        nested_macosx = os.path.join(output_dir, 'dir', '__MACOSX')
        nested_dotfile = os.path.join(output_dir, 'dir', '._ignored2')
        self.assertFalse(os.path.exists(nested_macosx))
        self.assertFalse(os.path.exists(nested_dotfile))
    def _delete_dependency(self, dep_key, dependencies, paths):
        """
        Remove the given dependency from the manager's state.
        Modifies `dependencies` and `paths` that are passed in.
        Also deletes any known files on the filesystem if any exist.

        Does nothing if `dep_key` is not in `dependencies`. Removal of the
        path from `paths` and from disk is best-effort: a failure is logged
        but never raised, and the entry is dropped from `dependencies`
        regardless.

        NOT NFS-SAFE - Caller should acquire self._state_lock before calling this method.
        """
        assert self._state_lock.is_locked

        if dep_key not in dependencies:
            return

        try:
            path_to_remove = dependencies[dep_key].path
            paths.remove(path_to_remove)
            # Deletes dependency content from disk
            remove_path(path_to_remove)
        except Exception:
            # Keep the cleanup best-effort, but leave a trace instead of
            # silently hiding a failed removal (e.g. files left on disk).
            logger.warning(
                "Failed to fully remove files for dependency %s",
                dep_key,
                exc_info=True,
            )
        finally:
            del dependencies[dep_key]
            logger.info(f"Deleted dependency {dep_key}.")
Exemple #15
0
    def test_exclude_ignore(self):
        """Entries matched by the '.tarignore' ignore file are excluded from the archive."""
        scratch_dir = tempfile.mkdtemp()
        self.addCleanup(remove_path, scratch_dir)
        output_dir = os.path.join(scratch_dir, 'output')

        archive = self.archive(IGNORE_TEST_DIR, ignore_file='.tarignore')
        self.unarchive(archive, output_dir, 'gz')

        top_level = os.listdir(output_dir)
        self.assertIn('not_ignored.txt', top_level)
        self.assertIn('dir', top_level)
        self.assertNotIn('ignored.txt', top_level)
        self.assertNotIn('ignored_dir', top_level)

        # The ignore rules also apply inside sub-directories.
        nested_dir = os.path.join(output_dir, 'dir')
        self.assertTrue(os.path.exists(os.path.join(nested_dir, 'not_ignored2.txt')))
        self.assertFalse(os.path.exists(os.path.join(nested_dir, 'ignored2.txt')))
Exemple #16
0
    def test_has_files(self):
        """Archiving FILES_DIR round-trips files and symlinks while honouring the given name lists."""
        scratch_dir = tempfile.mkdtemp()
        self.addCleanup(remove_path, scratch_dir)

        output_dir = os.path.join(scratch_dir, 'output')
        archive = self.archive(FILES_DIR, False, ['f2'], ['f1', 'b.txt'])
        self.unarchive(archive, output_dir, 'gz')

        top_level = os.listdir(output_dir)
        self.assertIn('dir1', top_level)
        self.assertIn('a.txt', top_level)
        self.assertNotIn('b.txt', top_level)

        nested_dir = os.path.join(output_dir, 'dir1')
        self.assertTrue(os.path.exists(os.path.join(nested_dir, 'f1')))
        self.assertFalse(os.path.exists(os.path.join(nested_dir, 'f2')))
        # Symlinks must be kept as symlinks.
        self.assertTrue(os.path.islink(os.path.join(output_dir, 'a-symlink.txt')))
Exemple #17
0
    def test_tar_always_ignore(self):
        """Always-ignored entries ('._*' files, '__MACOSX') are left out of the tar.gz."""
        tests_root = os.path.dirname(os.path.dirname(__file__))
        source_dir = os.path.join(tests_root, 'files/ignore_test')
        scratch_dir = tempfile.mkdtemp()
        self.addCleanup(remove_path, scratch_dir)
        output_dir = os.path.join(scratch_dir, 'output')

        archive = tar_gzip_directory(source_dir)
        un_tar_directory(archive, output_dir, 'gz')

        top_level = os.listdir(output_dir)
        self.assertNotIn('._ignored', top_level)
        self.assertIn('dir', top_level)
        self.assertNotIn('__MACOSX', top_level)

        # The same rules apply inside sub-directories.
        nested_macosx = os.path.join(output_dir, 'dir', '__MACOSX')
        nested_dotfile = os.path.join(output_dir, 'dir', '._ignored2')
        self.assertFalse(os.path.exists(nested_macosx))
        self.assertFalse(os.path.exists(nested_dotfile))
Exemple #18
0
    def test_tar_has_files(self):
        """A tar.gz of the 'files' fixture round-trips files and symlinks while honouring the given name lists."""
        tests_root = os.path.dirname(os.path.dirname(__file__))
        source_dir = os.path.join(tests_root, 'files')
        scratch_dir = tempfile.mkdtemp()
        self.addCleanup(remove_path, scratch_dir)

        output_dir = os.path.join(scratch_dir, 'output')
        archive = tar_gzip_directory(source_dir, False, ['f2'], ['f1', 'b.txt'])
        un_tar_directory(archive, output_dir, 'gz')

        top_level = os.listdir(output_dir)
        self.assertIn('dir1', top_level)
        self.assertIn('a.txt', top_level)
        self.assertNotIn('b.txt', top_level)

        nested_dir = os.path.join(output_dir, 'dir1')
        self.assertTrue(os.path.exists(os.path.join(nested_dir, 'f1')))
        self.assertFalse(os.path.exists(os.path.join(nested_dir, 'f2')))
        # Symlinks must be kept as symlinks.
        self.assertTrue(
            os.path.islink(os.path.join(output_dir, 'a-symlink.txt')))
Exemple #19
0
    def test_tar_exclude_ignore(self):
        """Entries matched by the '.tarignore' ignore file are excluded from the tar.gz."""
        tests_root = os.path.dirname(os.path.dirname(__file__))
        source_dir = os.path.join(tests_root, 'files/ignore_test')
        scratch_dir = tempfile.mkdtemp()
        self.addCleanup(remove_path, scratch_dir)
        output_dir = os.path.join(scratch_dir, 'output')

        archive = tar_gzip_directory(source_dir, ignore_file='.tarignore')
        un_tar_directory(archive, output_dir, 'gz')

        top_level = os.listdir(output_dir)
        self.assertIn('not_ignored.txt', top_level)
        self.assertIn('dir', top_level)
        self.assertNotIn('ignored.txt', top_level)
        self.assertNotIn('ignored_dir', top_level)

        # The ignore rules also apply inside sub-directories.
        nested_dir = os.path.join(output_dir, 'dir')
        self.assertTrue(
            os.path.exists(os.path.join(nested_dir, 'not_ignored2.txt')))
        self.assertFalse(
            os.path.exists(os.path.join(nested_dir, 'ignored2.txt')))
    def _make_bundle(self, bundle):
        """
        Build the contents of `bundle` on disk by copying its dependencies
        into the bundle's location.

        The bundle's location is its `link_url` metadata if set, otherwise
        its location in the bundle store; the same rule is applied to each
        parent bundle. Each dependency must resolve to an existing path (or
        symlink) inside its parent bundle's directory, and each target
        (child) path must lie inside this bundle's directory; otherwise the
        bundle is failed.

        On success the disk metadata is updated and the bundle is moved to
        State.READY. On any exception the bundle is moved to State.FAILED with
        the exception text and traceback stored in its metadata. In all cases
        the bundle's uuid is removed from `self._make_uuids` under
        `self._make_uuids_lock`.
        """

        def is_within(candidate, root):
            # True only if `candidate` is `root` itself or lies beneath it.
            # A bare startswith() would also accept siblings that merely share
            # a name prefix (e.g. "/store/0xab" vs "/store/0xabc/secret").
            root = os.path.normpath(root)
            prefix = root if root.endswith(os.sep) else root + os.sep
            return candidate == root or candidate.startswith(prefix)

        try:
            bundle_link_url = getattr(bundle.metadata, "link_url", None)
            bundle_location = bundle_link_url or self._bundle_store.get_bundle_location(
                bundle.uuid)
            path = os.path.normpath(bundle_location)

            deps = []
            parent_bundle_link_urls = self._model.get_bundle_metadata(
                [dep.parent_uuid for dep in bundle.dependencies], "link_url")
            for dep in bundle.dependencies:
                parent_bundle_link_url = parent_bundle_link_urls.get(
                    dep.parent_uuid)
                try:
                    parent_bundle_path = parent_bundle_link_url or os.path.normpath(
                        self._bundle_store.get_bundle_location(
                            dep.parent_uuid))
                except NotFoundError:
                    raise Exception('Invalid dependency %s' %
                                    (path_util.safe_join(
                                        dep.parent_uuid, dep.parent_path)))
                # TODO(Ashwin): make this logic non-fs specific.
                dependency_path = os.path.normpath(
                    os.path.join(parent_bundle_path, dep.parent_path))
                # The source must stay inside the parent bundle and must exist
                # (a dangling symlink still counts as existing).
                if not is_within(dependency_path, parent_bundle_path) or (
                        not os.path.islink(dependency_path)
                        and not os.path.exists(dependency_path)):
                    raise Exception('Invalid dependency %s' %
                                    (path_util.safe_join(
                                        dep.parent_uuid, dep.parent_path)))

                child_path = os.path.normpath(
                    os.path.join(path, dep.child_path))
                # The destination must stay inside this bundle's directory.
                if not is_within(child_path, path):
                    raise Exception('Invalid key for dependency: %s' %
                                    (dep.child_path))

                deps.append((dependency_path, child_path))

            # Start from a clean slate before copying anything in.
            remove_path(path)

            if len(deps) == 1 and deps[0][1] == path:
                # A single dependency mapped onto the bundle root becomes the
                # bundle itself.
                path_util.copy(deps[0][0], path, follow_symlinks=False)
            else:
                os.mkdir(path)
                for dependency_path, child_path in deps:
                    path_util.copy(dependency_path,
                                   child_path,
                                   follow_symlinks=False)

            # TODO(Ashwin): fix
            self._model.update_disk_metadata(bundle,
                                             bundle_location,
                                             enforce_disk_quota=True)
            logger.info('Finished making bundle %s', bundle.uuid)
            self._model.update_bundle(bundle, {'state': State.READY})
        except Exception as e:
            logger.info('Failing bundle %s: %s', bundle.uuid, str(e))
            self._model.update_bundle(
                bundle,
                {
                    'state': State.FAILED,
                    'metadata': {
                        'failure_message': str(e),
                        'error_traceback': traceback.format_exc(),
                    },
                },
            )
        finally:
            with self._make_uuids_lock:
                self._make_uuids.remove(bundle.uuid)
Exemple #21
0
 def remove_path_no_fail(path):
     """
     Remove `path` via remove_path, never raising: any exception is caught
     and its traceback is logged as an error instead.
     """
     try:
         remove_path(path)
     except Exception:
         failure_trace = traceback.format_exc()
         logger.error(failure_trace)
Exemple #22
0
    def _transition_from_PREPARING(self, run_state):
        """
        1- Request the docker image from docker image manager
            - if image is failed, move to CLEANING_UP state
        2- Request the dependencies from dependency manager
            - if any are failed, move to CLEANING_UP state
        3- If all dependencies and docker image are ready:
            - Set up the local filesystem for the run
            - Create symlinks to dependencies
            - Allocate resources and prepare the docker container
            - Start the docker container
        4- If all is successful, move to RUNNING state
        """
        def mount_dependency(dependency, shared_file_system):
            if not shared_file_system:
                # Set up symlinks for the content at dependency path
                Path(dependency.child_path).parent.mkdir(parents=True,
                                                         exist_ok=True)
                os.symlink(dependency.docker_path, dependency.child_path)
            # The following will be converted into a Docker volume binding like:
            #   dependency_path:docker_dependency_path:ro
            docker_dependencies.append(
                (dependency.parent_path, dependency.docker_path))

        if run_state.is_killed or run_state.is_restaged:
            log_bundle_transition(
                bundle_uuid=run_state.bundle.uuid,
                previous_stage=run_state.stage,
                next_stage=RunStage.CLEANING_UP,
                reason=
                f'the bundle was {"killed" if run_state.is_killed else "restaged"}',
            )
            return run_state._replace(stage=RunStage.CLEANING_UP)

        # Check CPU and GPU availability
        try:
            cpuset, gpuset = self.assign_cpu_and_gpu_sets_fn(
                run_state.resources.cpus, run_state.resources.gpus)
        except Exception as e:
            message = "Unexpectedly unable to assign enough resources to bundle {}: {}".format(
                run_state.bundle.uuid, str(e))
            logger.error(message)
            logger.error(traceback.format_exc())
            return run_state._replace(run_status=message)

        dependencies_ready = True
        status_messages = []

        if not self.shared_file_system:
            # No need to download dependencies if we're in the shared FS,
            # since they're already in our FS
            for dep in run_state.bundle.dependencies:
                dep_key = DependencyKey(dep.parent_uuid, dep.parent_path)
                dependency_state = self.dependency_manager.get(
                    run_state.bundle.uuid, dep_key)
                if dependency_state.stage == DependencyStage.DOWNLOADING:
                    status_messages.append(
                        'Downloading dependency %s: %s done (archived size)' %
                        (dep.child_path, size_str(
                            dependency_state.size_bytes)))
                    dependencies_ready = False
                elif dependency_state.stage == DependencyStage.FAILED:
                    # Failed to download dependency; -> CLEANING_UP
                    log_bundle_transition(
                        bundle_uuid=run_state.bundle.uuid,
                        previous_stage=run_state.stage,
                        next_stage=RunStage.CLEANING_UP,
                        reason=
                        f'Dependency has failed for this bundle. Dependency child uuid: {dep.child_uuid}. Dependency child path: {dep.child_path}',
                    )
                    return run_state._replace(
                        stage=RunStage.CLEANING_UP,
                        failure_message='Failed to download dependency %s: %s'
                        % (dep.child_path, dependency_state.message),
                    )

        # get the docker image
        docker_image = run_state.resources.docker_image
        image_state = self.image_manager.get(docker_image)
        if image_state.stage == DependencyStage.DOWNLOADING:
            status_messages.append('Pulling docker image %s %s' %
                                   (docker_image, image_state.message))
            dependencies_ready = False
        elif image_state.stage == DependencyStage.FAILED:
            # Failed to pull image; -> CLEANING_UP
            message = 'Failed to download Docker image: %s' % image_state.message
            logger.error(message)
            return run_state._replace(stage=RunStage.CLEANING_UP,
                                      failure_message=message)

        # stop proceeding if dependency and image downloads aren't all done
        if not dependencies_ready:
            status_message = status_messages.pop()
            if status_messages:
                status_message += "(and downloading %d other dependencies and docker images)" % len(
                    status_messages)
            logger.info(
                f'bundle is not ready yet. uuid: {run_state.bundle.uuid}. status message: {status_message}'
            )
            return run_state._replace(run_status=status_message)

        # All dependencies ready! Set up directories, symlinks and container. Start container.
        # 1) Set up a directory to store the bundle.
        if self.shared_file_system:
            if not os.path.exists(run_state.bundle_path):
                if run_state.bundle_dir_wait_num_tries == 0:
                    message = (
                        "Bundle directory cannot be found on the shared filesystem. "
                        "Please ensure the shared fileystem between the server and "
                        "your worker is mounted properly or contact your administrators."
                    )
                    log_bundle_transition(
                        bundle_uuid=run_state.bundle.uuid,
                        previous_stage=run_state.stage,
                        next_stage=RunStage.CLEANING_UP,
                        reason=
                        "Bundle directory cannot be found on the shared filesystem.",
                    )
                    return run_state._replace(stage=RunStage.CLEANING_UP,
                                              failure_message=message)
                next_bundle_dir_wait_num_tries = run_state.bundle_dir_wait_num_tries - 1
                logger.info(
                    f'Waiting for bundle directory to be created by the server, uuid: {run_state.bundle.uuid}, bundle_dir_wait_num_tries: {next_bundle_dir_wait_num_tries}'
                )
                return run_state._replace(
                    run_status=
                    "Waiting for bundle directory to be created by the server",
                    bundle_dir_wait_num_tries=next_bundle_dir_wait_num_tries,
                )
        else:
            remove_path(run_state.bundle_path)
            os.makedirs(run_state.bundle_path)

        # 2) Set up symlinks
        docker_dependencies = []
        docker_dependencies_path = (
            RunStateMachine._ROOT + run_state.bundle.uuid +
            ('_dependencies' if not self.shared_file_system else ''))

        for dep in run_state.bundle.dependencies:
            full_child_path = os.path.normpath(
                os.path.join(run_state.bundle_path, dep.child_path))
            to_mount = []
            dependency_path = self._get_dependency_path(run_state, dep)

            if dep.child_path == RunStateMachine._CURRENT_DIRECTORY:
                # Mount all the content of the dependency_path to the top-level of the bundle
                for child in os.listdir(dependency_path):
                    child_path = os.path.normpath(
                        os.path.join(run_state.bundle_path, child))
                    to_mount.append(
                        DependencyToMount(
                            docker_path=os.path.join(docker_dependencies_path,
                                                     child),
                            child_path=child_path,
                            parent_path=os.path.join(dependency_path, child),
                        ))
                    run_state = run_state._replace(
                        paths_to_remove=(run_state.paths_to_remove or []) +
                        [child_path])
            else:
                to_mount.append(
                    DependencyToMount(
                        docker_path=os.path.join(docker_dependencies_path,
                                                 dep.child_path),
                        child_path=full_child_path,
                        parent_path=dependency_path,
                    ))

                first_element_of_path = Path(dep.child_path).parts[0]
                if first_element_of_path == RunStateMachine._ROOT:
                    run_state = run_state._replace(
                        paths_to_remove=(run_state.paths_to_remove or []) +
                        [full_child_path])
                else:
                    # child_path can be a nested path, so later remove everything from the first element of the path
                    path_to_remove = os.path.join(run_state.bundle_path,
                                                  first_element_of_path)
                    run_state = run_state._replace(
                        paths_to_remove=(run_state.paths_to_remove or []) +
                        [path_to_remove])
            for dependency in to_mount:
                try:
                    mount_dependency(dependency, self.shared_file_system)
                except OSError as e:
                    log_bundle_transition(
                        bundle_uuid=run_state.bundle.uuid,
                        previous_stage=run_state.stage,
                        next_stage=RunStage.CLEANING_UP,
                        reason=str(e.__class__),
                        level=logging.ERROR,
                    )
                    return run_state._replace(stage=RunStage.CLEANING_UP,
                                              failure_message=str(e))

        if run_state.resources.network:
            docker_network = self.docker_network_external.name
        else:
            docker_network = self.docker_network_internal.name

        # 3) Start container
        try:
            container = docker_utils.start_bundle_container(
                run_state.bundle_path,
                run_state.bundle.uuid,
                docker_dependencies,
                run_state.bundle.command,
                run_state.resources.docker_image,
                network=docker_network,
                cpuset=cpuset,
                gpuset=gpuset,
                memory_bytes=run_state.resources.memory,
                runtime=self.docker_runtime,
            )
            self.worker_docker_network.connect(container)
        except docker_utils.DockerUserErrorException as e:
            message = 'Cannot start Docker container: {}'.format(e)
            log_bundle_transition(
                bundle_uuid=run_state.bundle.uuid,
                previous_stage=run_state.stage,
                next_stage=RunStage.CLEANING_UP,
                reason='Cannot start Docker container.',
                level=logging.ERROR,
            )
            return run_state._replace(stage=RunStage.CLEANING_UP,
                                      failure_message=message)
        except Exception as e:
            message = 'Cannot start container: {}'.format(e)
            logger.error(message)
            logger.error(traceback.format_exc())
            raise

        return run_state._replace(
            stage=RunStage.RUNNING,
            run_status='Running job in container',
            container_id=container.id,
            container=container,
            docker_image=image_state.digest,
            has_contents=True,
            cpuset=cpuset,
            gpuset=gpuset,
        )
Exemple #23
0
    def _transition_from_CLEANING_UP(self, run_state):
        """
        Tear down whatever the run left behind and pick the next stage.

        Steps, in order:
        1- remove the run's container while it still exists (killing it first
           if it has not finished)
        2- release every dependency in the dependency manager (skipped on a
           shared file system)
        3- delete the paths recorded in run_state.paths_to_remove
        4- choose the next stage:
            - RESTAGED if the run is flagged as restaged
            - UPLOADING_RESULTS if not on a shared file system and the run has contents
            - otherwise delete files matching the bundle's exclude_patterns
              and return the result of self.finalize_run

        Returns the updated run_state.
        """
        # 1) Get rid of the container. The loop keeps going until the container
        # no longer exists or has been removed successfully.
        if run_state.container_id is not None:
            while docker_utils.container_exists(run_state.container):
                try:
                    container_done, _, _ = docker_utils.check_finished(
                        run_state.container)
                    if not container_done:
                        # Still running: kill it and re-check on the next pass.
                        run_state.container.kill()
                        continue
                    run_state.container.remove(force=True)
                    run_state = run_state._replace(container=None,
                                                   container_id=None)
                    break
                except docker.errors.APIError:
                    # Docker API hiccup (from check/remove/kill): log, back off, retry.
                    logger.error(traceback.format_exc())
                    time.sleep(1)

        # 2) Release dependencies; a shared fs worker holds none.
        for dependency in run_state.bundle.dependencies:
            if self.shared_file_system:
                continue
            key = DependencyKey(dependency.parent_uuid, dependency.parent_path)
            self.dependency_manager.release(run_state.bundle.uuid, key)

        # 3) Best-effort removal of dependency paths: a failure is logged and
        # does not stop the cleanup of the remaining paths.
        for stale_path in run_state.paths_to_remove or []:
            try:
                remove_path(stale_path)
            except Exception:
                logger.error(traceback.format_exc())
        run_state = run_state._replace(paths_to_remove=[])

        # 4) Decide where to go next.
        if run_state.is_restaged:
            log_bundle_transition(
                bundle_uuid=run_state.bundle.uuid,
                previous_stage=run_state.stage,
                next_stage=RunStage.RESTAGED,
                reason=self.RESTAGED_REASON,
            )
            return run_state._replace(stage=RunStage.RESTAGED)

        if not self.shared_file_system and run_state.has_contents:
            log_bundle_transition(
                bundle_uuid=run_state.bundle.uuid,
                previous_stage=run_state.stage,
                next_stage=RunStage.UPLOADING_RESULTS,
            )
            return run_state._replace(stage=RunStage.UPLOADING_RESULTS,
                                      run_status='Uploading results',
                                      container=None)

        # Nothing to upload (shared file system, or the run has no contents).
        # Delete any files that match the exclude_patterns.
        bundle_dir = run_state.bundle_path
        for pattern in run_state.bundle.metadata["exclude_patterns"]:
            matches = glob.glob(os.path.join(bundle_dir, pattern),
                                recursive=True)
            for matched_path in matches:
                # Skip anything outside the bundle directory, in case the
                # pattern is something like "../../../".
                if not path_is_parent(parent_path=run_state.bundle_path,
                                      child_path=matched_path):
                    continue
                remove_path(matched_path)
        return self.finalize_run(run_state)
    def _transition_from_PREPARING(self, run_state):
        """
        1- Request the docker image from docker image manager
            - if image is failed, move to CLEANING_UP state
        2- Request the dependencies from dependency manager
            - if any are failed, move to CLEANING_UP state
        3- If all dependencies and docker image are ready:
            - Set up the local filesystem for the run
            - Create symlinks to dependencies
            - Allocate resources and prepare the docker container
            - Start the docker container
        4- If all is successful, move to RUNNING state

        Returns a copy of run_state (built with _replace) that either:
            - stays in the current stage with an updated run_status (still
              downloading, waiting for the bundle directory, or resources
              could not be assigned),
            - moves to CLEANING_UP with a failure_message, or
            - moves to RUNNING with the started container attached.

        Any exception raised while starting the container is logged and re-raised.
        """
        if run_state.is_killed:
            return run_state._replace(stage=RunStage.CLEANING_UP)

        dependencies_ready = True
        status_messages = []

        if not self.shared_file_system:
            # No need to download dependencies if we're in the shared FS since they're already in our FS
            for dep_key, dep in run_state.bundle.dependencies.items():
                dependency_state = self.dependency_manager.get(
                    run_state.bundle.uuid, dep_key)
                if dependency_state.stage == DependencyStage.DOWNLOADING:
                    status_messages.append(
                        'Downloading dependency %s: %s done (archived size)' %
                        (dep.child_path, size_str(
                            dependency_state.size_bytes)))
                    dependencies_ready = False
                elif dependency_state.stage == DependencyStage.FAILED:
                    # Failed to download dependency; -> CLEANING_UP
                    return run_state._replace(
                        stage=RunStage.CLEANING_UP,
                        failure_message='Failed to download dependency %s: %s'
                        % (dep.child_path, dependency_state.message),
                    )

        # get the docker image
        docker_image = run_state.resources.docker_image
        image_state = self.docker_image_manager.get(docker_image)
        if image_state.stage == DependencyStage.DOWNLOADING:
            status_messages.append('Pulling docker image: ' +
                                   (image_state.message or docker_image or ""))
            dependencies_ready = False
        elif image_state.stage == DependencyStage.FAILED:
            # Failed to pull image; -> CLEANING_UP
            message = 'Failed to download Docker image: %s' % image_state.message
            logger.error(message)
            return run_state._replace(stage=RunStage.CLEANING_UP,
                                      failure_message=message)

        # stop proceeding if dependency and image downloads aren't all done
        if not dependencies_ready:
            # At least one message was appended whenever dependencies_ready was cleared.
            status_message = status_messages.pop()
            if status_messages:
                status_message += "(and downloading %d other dependencies and docker images)" % len(
                    status_messages)
            return run_state._replace(run_status=status_message)

        # All dependencies ready! Set up directories, symlinks and container. Start container.
        # 1) Set up a directory to store the bundle.
        if self.shared_file_system:
            if not os.path.exists(run_state.bundle_path):
                if run_state.bundle_dir_wait_num_tries == 0:
                    message = (
                        "Bundle directory cannot be found on the shared filesystem. "
                        "Please ensure the shared fileystem between the server and "
                        "your worker is mounted properly or contact your administrators."
                    )
                    logger.error(message)
                    return run_state._replace(stage=RunStage.CLEANING_UP,
                                              failure_message=message)
                # Directory not visible yet: stay in this stage and use up one retry.
                return run_state._replace(
                    run_status=
                    "Waiting for bundle directory to be created by the server",
                    bundle_dir_wait_num_tries=run_state.
                    bundle_dir_wait_num_tries - 1,
                )
        else:
            remove_path(run_state.bundle_path)
            os.mkdir(run_state.bundle_path)

        # 2) Set up symlinks
        docker_dependencies = []
        docker_dependencies_path = (
            '/' + run_state.bundle.uuid +
            ('_dependencies' if not self.shared_file_system else ''))
        # Containment is checked against the normalized bundle directory followed by a
        # path separator. A bare string-prefix test would also accept sibling paths that
        # merely share the prefix (e.g. "<bundle_path>_other" reached via "../<name>_other").
        bundle_root = os.path.normpath(run_state.bundle_path)
        bundle_root_prefix = os.path.join(bundle_root, '')
        for dep_key, dep in run_state.bundle.dependencies.items():
            full_child_path = os.path.normpath(
                os.path.join(run_state.bundle_path, dep.child_path))
            if (full_child_path != bundle_root
                    and not full_child_path.startswith(bundle_root_prefix)):
                # Dependencies should end up in their bundles (ie prevent using relative paths like ..
                # to get out of their parent bundles
                message = 'Invalid key for dependency: %s' % (dep.child_path)
                logger.error(message)
                return run_state._replace(stage=RunStage.CLEANING_UP,
                                          failure_message=message)
            docker_dependency_path = os.path.join(docker_dependencies_path,
                                                  dep.child_path)
            if self.shared_file_system:
                # On a shared FS, we know where the dep is stored and can get the contents directly
                dependency_path = os.path.realpath(
                    os.path.join(dep.location, dep.parent_path))
            else:
                # On a dependency_manager setup ask the manager where the dependency is
                dependency_path = os.path.join(
                    self.dependency_manager.dependencies_dir,
                    self.dependency_manager.get(run_state.bundle.uuid,
                                                dep_key).path,
                )
                # The link inside the bundle directory points at the in-container mount path.
                os.symlink(docker_dependency_path, full_child_path)
            # These are turned into docker volume bindings like:
            #   dependency_path:docker_dependency_path:ro
            docker_dependencies.append(
                (dependency_path, docker_dependency_path))

        # 3) Set up container
        if run_state.resources.network:
            docker_network = self.docker_network_external.name
        else:
            docker_network = self.docker_network_internal.name

        try:
            cpuset, gpuset = self.assign_cpu_and_gpu_sets_fn(
                run_state.resources.cpus, run_state.resources.gpus)
        except Exception as e:
            # Stay in the current stage; only the status message is updated.
            message = "Cannot assign enough resources: %s" % str(e)
            logger.error(message)
            logger.error(traceback.format_exc())
            return run_state._replace(run_status=message)

        # 4) Start container
        try:
            container = docker_utils.start_bundle_container(
                run_state.bundle_path,
                run_state.bundle.uuid,
                docker_dependencies,
                run_state.bundle.command,
                run_state.resources.docker_image,
                network=docker_network,
                cpuset=cpuset,
                gpuset=gpuset,
                memory_bytes=run_state.resources.memory,
                runtime=self.docker_runtime,
            )
            self.worker_docker_network.connect(container)
        except Exception as e:
            message = 'Cannot start Docker container: {}'.format(e)
            logger.error(message)
            logger.error(traceback.format_exc())
            raise

        return run_state._replace(
            stage=RunStage.RUNNING,
            run_status='Running job in Docker container',
            container_id=container.id,
            container=container,
            docker_image=image_state.digest,
            has_contents=True,
            cpuset=cpuset,
            gpuset=gpuset,
        )
    def _sync_state(self):
        """
        Reconcile the persisted dependency state with what is actually on disk.

        Three sources are compared:
        1. the dependencies and paths loaded from the state file
        2. the paths referenced by the loaded dependency states
        3. the entries found under self.dependencies_dir

        Only paths present in all of them are kept. Dependencies whose path did not
        survive are dropped from the state, directory entries that did not survive
        are deleted from the local file system, and the result is committed back.
        """
        with self._state_lock:
            if not self._state_committer.state_file_exists:
                dependencies: Dict[DependencyKey, DependencyState] = dict()
                paths: Set[str] = set()
                logger.info(
                    f'State file did not exist. Will create one at path {self._state_committer.path}.'
                )
            else:
                # The state file exists, so it is read without a fallback default: this
                # method prunes dependencies, and doing that on top of an unreadable state
                # file would be destructive. A failed read must fail immediately.
                dependencies, paths = self._fetch_state()
                found_message = 'Found {} dependencies, {} paths from cache.'.format(
                    len(dependencies), len(paths)
                )
                logger.info(found_message)

            # Keep only the paths known to the loaded path set, the loaded dependency
            # states, and the local file system all at once.
            local_directories = set(os.listdir(self.dependencies_dir))
            paths_in_loaded_state = [state.path for state in dependencies.values()]
            paths = paths.intersection(paths_in_loaded_state, local_directories)

            # Drop dependencies whose path is not among the surviving paths. A snapshot
            # of the items is iterated because entries are deleted along the way.
            for dep_key, dep_state in list(dependencies.items()):
                if dep_state.path in paths:
                    continue
                missing_path = os.path.join(self.dependencies_dir, dep_state.path)
                logger.info(
                    "Dependency {} in dependency state but its path {} doesn't exist on the local file system. "
                    "Removing it from dependency state.".format(dep_key, missing_path)
                )
                del dependencies[dep_key]

            # Delete local entries that no surviving path accounts for.
            for orphan in local_directories.difference(paths):
                orphan_path = os.path.join(self.dependencies_dir, orphan)
                if not os.path.exists(orphan_path):
                    continue
                logger.info(
                    "Remove orphaned directory {} from the local file system.".format(orphan_path)
                )
                remove_path(orphan_path)

            # Persist the reconciled state, since the steps above may have changed it.
            self._commit_state(dependencies, paths)
 def tearDown(self):
     """Remove the path stored in self.temp_dir."""
     scratch_path = self.temp_dir
     remove_path(scratch_path)