Code example #1
class JsonStateCommitterTest(unittest.TestCase):
    def setUp(self):
        self.test_dir = tempfile.mkdtemp()
        self.state_file = 'test-state.json'
        self.state_path = os.path.join(self.test_dir, self.state_file)
        self.committer = JsonStateCommitter(self.state_path)

    def tearDown(self):
        try:
            os.remove(self.state_path)
        except OSError:
            pass
        os.rmdir(self.test_dir)

    def test_path_parsing(self):
        """ Simple test to ensure we don't mess up the state file path"""
        self.assertEqual(self.committer._state_file, self.state_path)

    def test_commit(self):
        """Make sure state is committed correctly"""
        test_state = {'state': 'value'}
        test_state_json_str = '{"state": "value"}'
        self.committer.commit(test_state)
        with open(self.state_path) as f:
            self.assertEqual(test_state_json_str, f.read())

    def test_load(self):
        """ Make sure load loads the state file if it exists """
        test_state = {'state': 'value'}
        test_state_json_str = '{"state": "value"}'
        with open(self.state_path, 'w') as f:
            f.write(test_state_json_str)
        loaded_state = self.committer.load()
        self.assertDictEqual(test_state, loaded_state)

    def test_default(self):
        """ Make sure load with a default works if state file doesn't exist """
        default_state = {'state': 'value'}
        loaded_state = self.committer.load(default=default_state)
        self.assertDictEqual(default_state, loaded_state)
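
For reference, below is a minimal sketch of the JsonStateCommitter interface these tests exercise. The method names (commit, load(default=...)) and the _state_file attribute come from the tests above; the temp-file-then-rename write is only an assumption about how commits are made durable, not necessarily what the real class does.

import json
import os
import tempfile


class JsonStateCommitter(object):
    """Sketch: persists a JSON-serializable state object to a single file."""

    def __init__(self, json_path):
        self._state_file = json_path

    def load(self, default=None):
        # Fall back to the caller-supplied default if no state file exists yet
        if not os.path.exists(self._state_file):
            return default
        with open(self._state_file) as f:
            return json.load(f)

    def commit(self, state):
        # Assumption: write to a temporary file and rename it into place so a
        # crash mid-write cannot leave a truncated state file behind
        fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(self._state_file))
        with os.fdopen(fd, 'w') as f:
            json.dump(state, f)
        os.rename(tmp_path, self._state_file)
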
Code example #2
class LocalFileSystemDependencyManager(StateTransitioner, BaseDependencyManager):
    """
    This dependency manager downloads dependency bundles from the Codalab server
    to the local filesystem. It caches all downloaded dependencies and cleans up
    old ones once disk usage hits the given threshold.

    For this class, dependencies are uniquely identified by (parent_uuid, parent_path).
    """

    DEPENDENCIES_DIR_NAME = 'dependencies'
    DEPENDENCY_FAILURE_COOLDOWN = 10
    # TODO(bkgoksel): The server writes these to the worker_dependencies table, which stores the dependencies
    # json as a SqlAlchemy LargeBinary, which defaults to MySQL BLOB, which has a size limit of
    # 65K. For now we limit this value to about 58K to avoid any issues but we probably want to do
    # something better (either specify MEDIUMBLOB in the SqlAlchemy definition of the table or change
    # the data format of how we store this)
    MAX_SERIALIZED_LEN = 60000

    def __init__(self, commit_file, bundle_service, worker_dir, max_cache_size_bytes):
        super(LocalFileSystemDependencyManager, self).__init__()
        self.add_transition(DependencyStage.DOWNLOADING, self._transition_from_DOWNLOADING)
        self.add_terminal(DependencyStage.READY)
        self.add_terminal(DependencyStage.FAILED)

        self._state_committer = JsonStateCommitter(commit_file)
        self._bundle_service = bundle_service
        self._max_cache_size_bytes = max_cache_size_bytes
        self.dependencies_dir = os.path.join(
            worker_dir, LocalFileSystemDependencyManager.DEPENDENCIES_DIR_NAME
        )
        if not os.path.exists(self.dependencies_dir):
            logger.info('{} doesn\'t exist, creating.'.format(self.dependencies_dir))
            os.makedirs(self.dependencies_dir, 0o770)

        # Locks for concurrency
        self._dependency_locks = dict()  # (parent_uuid, parent_path) -> threading.RLock
        self._global_lock = threading.RLock()  # Used for add/remove actions
        self._paths_lock = threading.RLock()  # Used for path name computations

        # File paths that are currently being used to store dependencies. Used to prevent conflicts
        self._paths = set()
        # (parent_uuid, parent_path) -> DependencyState
        self._dependencies = dict()
        # (parent_uuid, parent_path) -> WorkerThread(thread, success, failure_message)
        self._downloading = ThreadDict(fields={'success': False, 'failure_message': None})
        self._load_state()

        self._stop = False
        self._main_thread = None

    def _save_state(self):
        with self._global_lock, self._paths_lock:
            self._state_committer.commit({'dependencies': self._dependencies, 'paths': self._paths})

    def _load_state(self):
        state = self._state_committer.load(default={'dependencies': {}, 'paths': set()})
        dependencies = {}
        dependency_locks = {}
        paths = set()
        for dep, dep_state in state['dependencies'].items():
            full_path = os.path.join(self.dependencies_dir, dep_state.path)
            if os.path.exists(full_path):
                dependencies[dep] = dep_state
                dependency_locks[dep] = threading.RLock()
            else:
                logger.info(
                    "Dependency {} in loaded state but its path {} doesn't exist in the filesystem".format(
                        dep, full_path
                    )
                )
            if dep_state.path not in state['paths']:
                state['paths'].add(dep_state.path)
                logger.info(
                    "Dependency {} in loaded state but its path {} is not in the loaded paths {}".format(
                        dep, dep_state.path, state['paths']
                    )
                )
        for path in state['paths']:
            full_path = os.path.join(self.dependencies_dir, path)
            if os.path.exists(full_path):
                paths.add(path)
            else:
                logger.info(
                    "Path {} in loaded state but doesn't exist in the filesystem".format(full_path)
                )

        with self._global_lock, self._paths_lock:
            self._dependencies = dependencies
            self._dependency_locks = dependency_locks
            self._paths = paths
        logger.info(
            '{} dependencies, {} paths in cache.'.format(len(self._dependencies), len(self._paths))
        )

    def start(self):
        def loop(self):
            while not self._stop:
                try:
                    self._process_dependencies()
                    self._save_state()
                    self._cleanup()
                    self._save_state()
                except Exception:
                    traceback.print_exc()

        self._main_thread = threading.Thread(target=loop, args=[self])
        self._main_thread.start()

    def stop(self):
        logger.info("Stopping local dependency manager")
        self._stop = True
        self._downloading.stop()
        self._main_thread.join()
        logger.info("Stopped local dependency manager. Exiting.")

    def _process_dependencies(self):
        for entry, state in self._dependencies.items():
            with self._dependency_locks[entry]:
                self._dependencies[entry] = self.transition(state)

    def _prune_failed_dependencies(self):
        """
        Prune failed dependencies older than DEPENDENCY_FAILURE_COOLDOWN seconds so that further runs
        get to retry the download. Without pruning, any future run depending on a
        failed dependency would automatically fail indefinitely.
        """
        with self._global_lock:
            self._acquire_all_locks()
            failed_deps = {
                dep: state
                for dep, state in self._dependencies.items()
                if state.stage == DependencyStage.FAILED
                and time.time() - state.last_used
                > LocalFileSystemDependencyManager.DEPENDENCY_FAILURE_COOLDOWN
            }
            for dependency, dependency_state in failed_deps.items():
                self._delete_dependency(dependency)
            self._release_all_locks()

    def _cleanup(self):
        """
        Prune failed dependencies older than DEPENDENCY_FAILURE_COOLDOWN seconds.
        Limit the disk usage of the dependencies (both the bundle files and the serialized state file size)
        Deletes oldest failed dependencies first and then oldest finished dependencies.
        Doesn't touch downloading dependencies.
        """
        self._prune_failed_dependencies()
        # Acquire all the locks (should be fast if no cleanup is needed); otherwise make sure nothing is corrupted
        while True:
            with self._global_lock:
                self._acquire_all_locks()
                bytes_used = sum(dep.size_bytes for dep in self._dependencies.values())
                serialized_length = len(codalabworker.pyjson.dumps(self._dependencies))
                if (
                    bytes_used > self._max_cache_size_bytes
                    or serialized_length > LocalFileSystemDependencyManager.MAX_SERIALIZED_LEN
                ):
                    logger.debug(
                        '%d dependencies in cache, disk usage: %s (max %s), serialized size: %s (max %s)',
                        len(self._dependencies),
                        size_str(bytes_used),
                        size_str(self._max_cache_size_bytes),
                        size_str(serialized_length),
                        LocalFileSystemDependencyManager.MAX_SERIALIZED_LEN,
                    )
                    ready_deps = {
                        dep: state
                        for dep, state in self._dependencies.items()
                        if state.stage == DependencyStage.READY and not state.dependents
                    }
                    failed_deps = {
                        dep: state
                        for dep, state in self._dependencies.items()
                        if state.stage == DependencyStage.FAILED
                    }
                    if failed_deps:
                        dep_to_remove = min(
                            failed_deps.items(), key=lambda dep_state: dep_state[1].last_used
                        )[0]
                    elif ready_deps:
                        dep_to_remove = min(
                            ready_deps.items(), key=lambda dep_state: dep_state[1].last_used
                        )[0]
                    else:
                        logger.info(
                            'Dependency quota full but there are only downloading dependencies, not cleaning up until downloads are over'
                        )
                        # Release the dependency locks acquired above before bailing out
                        self._release_all_locks()
                        break
                    if dep_to_remove:
                        self._delete_dependency(dep_to_remove)
                    self._release_all_locks()
                else:
                    self._release_all_locks()
                    break

    def _delete_dependency(self, dependency):
        """
        Remove the given dependency from the manager's state
        Also delete any known files on the filesystem if any exist
        """
        if self._acquire_if_exists(dependency):
            try:
                path_to_remove = self._dependencies[dependency].path
                self._paths.remove(path_to_remove)
                # Stored paths are relative to the dependencies directory
                remove_path(os.path.join(self.dependencies_dir, path_to_remove))
            except Exception:
                pass
            finally:
                del self._dependencies[dependency]
                self._dependency_locks[dependency].release()

    def has(self, dependency):
        """
        Takes a dependency = (parent_uuid, parent_path)
        Returns true if the manager has processed this dependency
        """
        with self._global_lock:
            return dependency in self._dependencies

    def get(self, uuid, dependency):
        """
        Request the dependency for the run with uuid, registering uuid as a dependent of this dependency
        """
        now = time.time()
        if not self._acquire_if_exists(dependency):  # add dependency state if it does not exist
            with self._global_lock:
                self._dependency_locks[dependency] = threading.RLock()
                self._dependency_locks[dependency].acquire()
                self._dependencies[dependency] = DependencyState(
                    stage=DependencyStage.DOWNLOADING,
                    dependency=dependency,
                    path=self._assign_path(dependency),
                    size_bytes=0,
                    dependents=set([uuid]),
                    last_used=now,
                    message="Starting download",
                    killed=False,
                )

        # update last_used as long as it isn't in FAILED
        if self._dependencies[dependency].stage != DependencyStage.FAILED:
            self._dependencies[dependency].dependents.add(uuid)
            self._dependencies[dependency] = self._dependencies[dependency]._replace(last_used=now)
        self._dependency_locks[dependency].release()
        return self._dependencies[dependency]

    def release(self, uuid, dependency):
        """
        Register that the run with uuid is no longer dependent on this dependency
        If no more runs are dependent on this dependency, kill it
        """
        if self._acquire_if_exists(dependency):
            dep_state = self._dependencies[dependency]
            if uuid in dep_state.dependents:
                dep_state.dependents.remove(uuid)
            if not dep_state.dependents:
                dep_state = dep_state._replace(killed=True)
                self._dependencies[dependency] = dep_state
            self._dependency_locks[dependency].release()

    def _acquire_if_exists(self, dependency):
        """
        Safely acquires a lock for the given dependency if it exists
        Returns True if the dependency exists, False otherwise
        Callers should remember to release the lock
        """
        with self._global_lock:
            if dependency in self._dependencies:
                self._dependency_locks[dependency].acquire()
                return True
            else:
                return False

    def _acquire_all_locks(self):
        """
        Acquires all dependency locks in the thread it's called from
        """
        with self._global_lock:
            for dependency, lock in self._dependency_locks.items():
                lock.acquire()

    def _release_all_locks(self):
        """
        Releases all dependency locks in the thread it's called from
        """
        with self._global_lock:
            for dependency, lock in self._dependency_locks.items():
                lock.release()

    def _assign_path(self, dependency):
        """
        Normalize the path for the dependency by replacing / with _, avoiding conflicts
        """
        parent_uuid, parent_path = dependency
        if parent_path:
            path = os.path.join(parent_uuid, parent_path)
        else:
            path = parent_uuid
        path = path.replace(os.path.sep, '_')

        # You could have a conflict between, for example a/b_c and
        # a_b/c. We have to avoid those.
        with self._paths_lock:
            while path in self._paths:
                path = path + '_'
            self._paths.add(path)
        return path

    def _store_dependency(self, dependency_path, fileobj, target_type):
        """
        Copy the dependency fileobj to its path in the local filesystem
        Overwrite existing files by the same name if found
        (may happen if filesystem modified outside the dependency manager,
         for example during an update if the state gets reset but filesystem
         doesn't get cleared)
        """
        if os.path.exists(dependency_path):
            logger.info('Path %s already exists, overwriting', dependency_path)
            if os.path.isdir(dependency_path):
                shutil.rmtree(dependency_path)
            else:
                os.remove(dependency_path)
        if target_type == 'directory':
            un_tar_directory(fileobj, dependency_path, 'gz')
        else:
            with open(dependency_path, 'wb') as f:
                logger.debug('copying file to %s', dependency_path)
                shutil.copyfileobj(fileobj, f)

    @property
    def all_dependencies(self):
        with self._global_lock:
            return list(self._dependencies.keys())

    def _transition_from_DOWNLOADING(self, dependency_state):
        def download():
            def update_state_and_check_killed(bytes_downloaded):
                """
                Callback for the bundle service client: updates the dependency state and
                raises DownloadAbortedException if the download has been killed by the dependency manager
                """
                with self._dependency_locks[dependency]:
                    state = self._dependencies[dependency]
                    if state.killed:
                        raise DownloadAbortedException("Aborted by user")
                    self._dependencies[dependency] = state._replace(
                        size_bytes=bytes_downloaded,
                        message="Downloading dependency: %s downloaded"
                        % size_str(bytes_downloaded),
                    )

            dependency_path = os.path.join(self.dependencies_dir, dependency_state.path)
            logger.debug('Downloading dependency %s/%s', parent_uuid, parent_path)
            try:
                # Start async download to the fileobj
                fileobj, target_type = self._bundle_service.get_bundle_contents(
                    parent_uuid, parent_path
                )
                with closing(fileobj):
                    # "Bug" the fileobj's read function so that we can keep
                    # track of the number of bytes downloaded so far.
                    old_read_method = fileobj.read
                    bytes_downloaded = [0]

                    def interruptable_read(*args, **kwargs):
                        data = old_read_method(*args, **kwargs)
                        bytes_downloaded[0] += len(data)
                        update_state_and_check_killed(bytes_downloaded[0])
                        return data

                    fileobj.read = interruptable_read

                    # Start copying the fileobj to filesystem dependency path
                    self._store_dependency(dependency_path, fileobj, target_type)

                logger.debug(
                    'Finished downloading %s dependency %s/%s to %s',
                    target_type,
                    parent_uuid,
                    parent_path,
                    dependency_path,
                )
                with self._dependency_locks[dependency]:
                    self._downloading[dependency]['success'] = True

            except Exception as e:
                with self._dependency_locks[dependency]:
                    self._downloading[dependency]['success'] = False
                    self._downloading[dependency][
                        'failure_message'
                    ] = "Dependency download failed: %s " % str(e)

        dependency = dependency_state.dependency
        parent_uuid, parent_path = dependency
        self._downloading.add_if_new(dependency, threading.Thread(target=download, args=[]))

        if self._downloading[dependency].is_alive():
            return dependency_state

        success = self._downloading[dependency]['success']
        failure_message = self._downloading[dependency]['failure_message']

        self._downloading.remove(dependency)
        if success:
            return dependency_state._replace(
                stage=DependencyStage.READY, message="Download complete"
            )
        else:
            with self._paths_lock:
                self._paths.remove(dependency_state.path)
            return dependency_state._replace(stage=DependencyStage.FAILED, message=failure_message)
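
The manager above treats DependencyState as an immutable record: it constructs one in get() and updates it with _replace(). The following namedtuple is only a sketch of that record, with the field list inferred from the constructor call in get(); the canonical definition elsewhere in the worker codebase may differ.

from collections import namedtuple

# Field list inferred from the DependencyState(...) call in get() above; a sketch only
DependencyState = namedtuple(
    'DependencyState',
    [
        'stage',        # DependencyStage value (DOWNLOADING / READY / FAILED)
        'dependency',   # the (parent_uuid, parent_path) key
        'path',         # storage path relative to the dependencies directory
        'size_bytes',   # bytes downloaded so far (final size once READY)
        'dependents',   # set of run uuids currently depending on this dependency
        'last_used',    # timestamp used for the failure cooldown and LRU cleanup
        'message',      # human-readable status message
        'killed',       # True once the download should be aborted
    ],
)
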
Code example #3
class LocalFileSystemDependencyManager(StateTransitioner, BaseDependencyManager):
    """
    This dependency manager downloads dependency bundles from the Codalab server
    to the local filesystem. It caches all downloaded dependencies and cleans up
    old ones once disk usage hits the given threshold.

    For this class, dependencies are uniquely identified by (parent_uuid, parent_path).
    """

    DEPENDENCIES_DIR_NAME = 'dependencies'
    DEPENDENCY_FAILURE_COOLDOWN = 10
    # TODO(bkgoksel): The server writes these to the worker_dependencies table, which stores the dependencies
    # json as a SqlAlchemy LargeBinary, which defaults to MySQL BLOB, which has a size limit of
    # 65K. For now we limit this value to about 58K to avoid any issues but we probably want to do
    # something better (either specify MEDIUMBLOB in the SqlAlchemy definition of the table or change
    # the data format of how we store this)
    MAX_SERIALIZED_LEN = 60000

    def __init__(self, commit_file, bundle_service, worker_dir, max_cache_size_bytes):
        super(LocalFileSystemDependencyManager, self).__init__()
        self.add_transition(DependencyStage.DOWNLOADING, self._transition_from_DOWNLOADING)
        self.add_terminal(DependencyStage.READY)
        self.add_terminal(DependencyStage.FAILED)

        self._state_committer = JsonStateCommitter(commit_file)
        self._bundle_service = bundle_service
        self._max_cache_size_bytes = max_cache_size_bytes
        self.dependencies_dir = os.path.join(
            worker_dir, LocalFileSystemDependencyManager.DEPENDENCIES_DIR_NAME
        )
        if not os.path.exists(self.dependencies_dir):
            logger.info('{} doesn\'t exist, creating.'.format(self.dependencies_dir))
            os.makedirs(self.dependencies_dir, 0o770)

        # Locks for concurrency
        self._dependency_locks = dict()  # (parent_uuid, parent_path) -> threading.RLock
        self._global_lock = threading.RLock()  # Used for add/remove actions
        self._paths_lock = threading.RLock()  # Used for path name computations

        # File paths that are currently being used to store dependencies. Used to prevent conflicts
        self._paths = set()
        # (parent_uuid, parent_path) -> DependencyState
        self._dependencies = dict()
        # (parent_uuid, parent_path) -> WorkerThread(thread, success, failure_message)
        self._downloading = ThreadDict(fields={'success': False, 'failure_message': None})
        self._load_state()

        self._stop = False
        self._main_thread = None

    def _save_state(self):
        with self._global_lock, self._paths_lock:
            self._state_committer.commit({'dependencies': self._dependencies, 'paths': self._paths})

    def _load_state(self):
        state = self._state_committer.load(default={'dependencies': {}, 'paths': set()})
        dependencies = {}
        dependency_locks = {}
        paths = set()
        for dep, dep_state in state['dependencies'].items():
            full_path = os.path.join(self.dependencies_dir, dep_state.path)
            if os.path.exists(full_path):
                dependencies[dep] = dep_state
                dependency_locks[dep] = threading.RLock()
            else:
                logger.info(
                    "Dependency {} in loaded state but its path {} doesn't exist in the filesystem".format(
                        dep, full_path
                    )
                )
            if dep_state.path not in state['paths']:
                state['paths'].add(dep_state.path)
                logger.info(
                    "Dependency {} in loaded state but its path {} is not in the loaded paths {}".format(
                        dep, dep_state.path, state['paths']
                    )
                )
        for path in state['paths']:
            full_path = os.path.join(self.dependencies_dir, path)
            if os.path.exists(full_path):
                paths.add(path)
            else:
                logger.info(
                    "Path {} in loaded state but doesn't exist in the filesystem".format(full_path)
                )

        with self._global_lock, self._paths_lock:
            self._dependencies = dependencies
            self._dependency_locks = dependency_locks
            self._paths = paths
        logger.info(
            '{} dependencies, {} paths in cache.'.format(len(self._dependencies), len(self._paths))
        )

    def start(self):
        logger.info('Starting local dependency manager')

        def loop(self):
            while not self._stop:
                try:
                    self._process_dependencies()
                    self._save_state()
                    self._cleanup()
                    self._save_state()
                except Exception:
                    traceback.print_exc()
                time.sleep(1)

        self._main_thread = threading.Thread(target=loop, args=[self])
        self._main_thread.start()

    def stop(self):
        logger.info('Stopping local dependency manager')
        self._stop = True
        self._downloading.stop()
        self._main_thread.join()
        logger.info('Stopped local dependency manager')

    def _process_dependencies(self):
        for entry, state in self._dependencies.items():
            with self._dependency_locks[entry]:
                self._dependencies[entry] = self.transition(state)

    def _prune_failed_dependencies(self):
        """
        Prune failed dependencies older than DEPENDENCY_FAILURE_COOLDOWN seconds so that further runs
        get to retry the download. Without pruning, any future run depending on a
        failed dependency would automatically fail indefinitely.
        """
        with self._global_lock:
            self._acquire_all_locks()
            failed_deps = {
                dep: state
                for dep, state in self._dependencies.items()
                if state.stage == DependencyStage.FAILED
                and time.time() - state.last_used
                > LocalFileSystemDependencyManager.DEPENDENCY_FAILURE_COOLDOWN
            }
            for dependency, dependency_state in failed_deps.items():
                self._delete_dependency(dependency)
            self._release_all_locks()

    def _cleanup(self):
        """
        Prune failed dependencies older than DEPENDENCY_FAILURE_COOLDOWN seconds.
        Limit the disk usage of the dependencies (both the bundle files and the serialized state file size)
        Deletes oldest failed dependencies first and then oldest finished dependencies.
        Doesn't touch downloading dependencies.
        """
        self._prune_failed_dependencies()
        # Acquire all the locks (should be fast if no cleanup is needed); otherwise make sure nothing is corrupted
        while True:
            with self._global_lock:
                self._acquire_all_locks()
                bytes_used = sum(dep.size_bytes for dep in self._dependencies.values())
                serialized_length = len(codalabworker.pyjson.dumps(self._dependencies))
                if (
                    bytes_used > self._max_cache_size_bytes
                    or serialized_length > LocalFileSystemDependencyManager.MAX_SERIALIZED_LEN
                ):
                    logger.debug(
                        '%d dependencies in cache, disk usage: %s (max %s), serialized size: %s (max %s)',
                        len(self._dependencies),
                        size_str(bytes_used),
                        size_str(self._max_cache_size_bytes),
                        size_str(serialized_length),
                        LocalFileSystemDependencyManager.MAX_SERIALIZED_LEN,
                    )
                    ready_deps = {
                        dep: state
                        for dep, state in self._dependencies.items()
                        if state.stage == DependencyStage.READY and not state.dependents
                    }
                    failed_deps = {
                        dep: state
                        for dep, state in self._dependencies.items()
                        if state.stage == DependencyStage.FAILED
                    }
                    if failed_deps:
                        dep_to_remove = min(
                            failed_deps.items(), key=lambda dep_state: dep_state[1].last_used
                        )[0]
                    elif ready_deps:
                        dep_to_remove = min(
                            ready_deps.items(), key=lambda dep_state: dep_state[1].last_used
                        )[0]
                    else:
                        logger.info(
                            'Dependency quota full but there are only downloading dependencies, not cleaning up until downloads are over'
                        )
                        # Release the dependency locks acquired above before bailing out
                        self._release_all_locks()
                        break
                    if dep_to_remove:
                        self._delete_dependency(dep_to_remove)
                    self._release_all_locks()
                else:
                    self._release_all_locks()
                    break

    def _delete_dependency(self, dependency):
        """
        Remove the given dependency from the manager's state
        Also delete any known files on the filesystem if any exist
        """
        if self._acquire_if_exists(dependency):
            try:
                path_to_remove = self._dependencies[dependency].path
                self._paths.remove(path_to_remove)
                # Stored paths are relative to the dependencies directory
                remove_path(os.path.join(self.dependencies_dir, path_to_remove))
            except Exception:
                pass
            finally:
                del self._dependencies[dependency]
                self._dependency_locks[dependency].release()

    def has(self, dependency):
        """
        Takes a dependency = (parent_uuid, parent_path)
        Returns true if the manager has processed this dependency
        """
        with self._global_lock:
            return dependency in self._dependencies

    def get(self, uuid, dependency):
        """
        Request the dependency for the run with uuid, registering uuid as a dependent of this dependency
        """
        now = time.time()
        if not self._acquire_if_exists(dependency):  # add dependency state if it does not exist
            with self._global_lock:
                self._dependency_locks[dependency] = threading.RLock()
                self._dependency_locks[dependency].acquire()
                self._dependencies[dependency] = DependencyState(
                    stage=DependencyStage.DOWNLOADING,
                    dependency=dependency,
                    path=self._assign_path(dependency),
                    size_bytes=0,
                    dependents=set([uuid]),
                    last_used=now,
                    message="Starting download",
                    killed=False,
                )

        # update last_used as long as it isn't in FAILED
        if self._dependencies[dependency].stage != DependencyStage.FAILED:
            self._dependencies[dependency].dependents.add(uuid)
            self._dependencies[dependency] = self._dependencies[dependency]._replace(last_used=now)
        self._dependency_locks[dependency].release()
        return self._dependencies[dependency]

    def release(self, uuid, dependency):
        """
        Register that the run with uuid is no longer dependent on this dependency
        If no more runs are dependent on this dependency, kill it
        """
        if self._acquire_if_exists(dependency):
            dep_state = self._dependencies[dependency]
            if uuid in dep_state.dependents:
                dep_state.dependents.remove(uuid)
            if not dep_state.dependents:
                dep_state = dep_state._replace(killed=True)
                self._dependencies[dependency] = dep_state
            self._dependency_locks[dependency].release()

    def _acquire_if_exists(self, dependency):
        """
        Safely acquires a lock for the given dependency if it exists
        Returns True if the dependency exists, False otherwise
        Callers should remember to release the lock
        """
        with self._global_lock:
            if dependency in self._dependencies:
                self._dependency_locks[dependency].acquire()
                return True
            else:
                return False

    def _acquire_all_locks(self):
        """
        Acquires all dependency locks in the thread it's called from
        """
        with self._global_lock:
            for dependency, lock in self._dependency_locks.items():
                lock.acquire()

    def _release_all_locks(self):
        """
        Releases all dependency locks in the thread it's called from
        """
        with self._global_lock:
            for dependency, lock in self._dependency_locks.items():
                lock.release()

    def _assign_path(self, dependency):
        """
        Normalize the path for the dependency by replacing / with _, avoiding conflicts
        """
        parent_uuid, parent_path = dependency
        if parent_path:
            path = os.path.join(parent_uuid, parent_path)
        else:
            path = parent_uuid
        path = path.replace(os.path.sep, '_')

        # You could have a conflict between, for example a/b_c and
        # a_b/c. We have to avoid those.
        with self._paths_lock:
            while path in self._paths:
                path = path + '_'
            self._paths.add(path)
        return path

    def _store_dependency(self, dependency_path, fileobj, target_type):
        """
        Copy the dependency fileobj to its path in the local filesystem
        Overwrite existing files by the same name if found
        (may happen if filesystem modified outside the dependency manager,
         for example during an update if the state gets reset but filesystem
         doesn't get cleared)
        """
        if os.path.exists(dependency_path):
            logger.info('Path %s already exists, overwriting', dependency_path)
            if os.path.isdir(dependency_path):
                shutil.rmtree(dependency_path)
            else:
                os.remove(dependency_path)
        if target_type == 'directory':
            un_tar_directory(fileobj, dependency_path, 'gz')
        else:
            with open(dependency_path, 'wb') as f:
                logger.debug('copying file to %s', dependency_path)
                shutil.copyfileobj(fileobj, f)

    @property
    def all_dependencies(self):
        with self._global_lock:
            return list(self._dependencies.keys())

    def _transition_from_DOWNLOADING(self, dependency_state):
        def download():
            def update_state_and_check_killed(bytes_downloaded):
                """
                Callback for the bundle service client: updates the dependency state and
                raises DownloadAbortedException if the download has been killed by the dependency manager
                """
                with self._dependency_locks[dependency]:
                    state = self._dependencies[dependency]
                    if state.killed:
                        raise DownloadAbortedException("Aborted by user")
                    self._dependencies[dependency] = state._replace(
                        size_bytes=bytes_downloaded,
                        message="Downloading dependency: %s downloaded"
                        % size_str(bytes_downloaded),
                    )

            dependency_path = os.path.join(self.dependencies_dir, dependency_state.path)
            logger.debug('Downloading dependency %s/%s', parent_uuid, parent_path)
            try:
                # Start async download to the fileobj
                fileobj, target_type = self._bundle_service.get_bundle_contents(
                    parent_uuid, parent_path
                )
                with closing(fileobj):
                    # "Bug" the fileobj's read function so that we can keep
                    # track of the number of bytes downloaded so far.
                    old_read_method = fileobj.read
                    bytes_downloaded = [0]

                    def interruptable_read(*args, **kwargs):
                        data = old_read_method(*args, **kwargs)
                        bytes_downloaded[0] += len(data)
                        update_state_and_check_killed(bytes_downloaded[0])
                        return data

                    fileobj.read = interruptable_read

                    # Start copying the fileobj to filesystem dependency path
                    self._store_dependency(dependency_path, fileobj, target_type)

                logger.debug(
                    'Finished downloading %s dependency %s/%s to %s',
                    target_type,
                    parent_uuid,
                    parent_path,
                    dependency_path,
                )
                with self._dependency_locks[dependency]:
                    self._downloading[dependency]['success'] = True

            except Exception as e:
                with self._dependency_locks[dependency]:
                    self._downloading[dependency]['success'] = False
                    self._downloading[dependency][
                        'failure_message'
                    ] = "Dependency download failed: %s " % str(e)

        dependency = dependency_state.dependency
        parent_uuid, parent_path = dependency
        self._downloading.add_if_new(dependency, threading.Thread(target=download, args=[]))

        if self._downloading[dependency].is_alive():
            return dependency_state

        success = self._downloading[dependency]['success']
        failure_message = self._downloading[dependency]['failure_message']

        self._downloading.remove(dependency)
        if success:
            return dependency_state._replace(
                stage=DependencyStage.READY, message="Download complete"
            )
        else:
            with self._paths_lock:
                self._paths.remove(dependency_state.path)
            return dependency_state._replace(stage=DependencyStage.FAILED, message=failure_message)
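
Both versions of the manager lean on a ThreadDict helper for download bookkeeping. Judging only from the calls above (add_if_new, remove, stop, is_alive() on an entry, and per-entry status fields such as 'success' and 'failure_message'), a plausible sketch looks like the following; the real helper in the worker codebase may be implemented differently.

import threading


class _ThreadEntry(dict):
    """One ThreadDict entry: a dict of status fields that also proxies is_alive() to its thread."""

    def __init__(self, thread, fields):
        super(_ThreadEntry, self).__init__(fields)
        self.thread = thread

    def is_alive(self):
        return self.thread.is_alive()


class ThreadDict(dict):
    """Sketch of the assumed ThreadDict: maps a key to a worker thread plus per-key status fields."""

    def __init__(self, fields=None, lock=False):
        super(ThreadDict, self).__init__()
        self._fields = dict(fields or {})
        self._with_lock = lock  # if True, every entry also carries its own RLock under 'lock'

    def add_if_new(self, key, thread):
        # Track and start the thread only the first time this key is seen
        if key not in self:
            fields = dict(self._fields)
            if self._with_lock:
                fields['lock'] = threading.RLock()
            self[key] = _ThreadEntry(thread, fields)
            thread.start()

    def remove(self, key):
        self.pop(key, None)

    def stop(self):
        # Wait for every tracked thread to finish before returning
        for entry in list(self.values()):
            entry.thread.join()
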
Code example #4
class LocalRunManager(BaseRunManager):
    """
    LocalRunManager executes the runs locally, each one in its own Docker
    container. It manages its cache of local Docker images and its own local
    Docker network.
    """

    # Network buffer size to use while proxying with netcat
    NETCAT_BUFFER_SIZE = 4096
    # Number of seconds to wait for bundle kills to propagate before forcing kill
    KILL_TIMEOUT = 100
    # Directory name to store running bundles in worker filesystem
    BUNDLES_DIR_NAME = 'runs'

    def __init__(
            self,
            worker,  # type: Worker
            image_manager,  # type: DockerImageManager
            dependency_manager,  # type: LocalFileSystemDependencyManager
            commit_file,  # type: str
            cpuset,  # type: Set[str]
            gpuset,  # type: Set[str]
            work_dir,  # type: str
            docker_runtime=docker_utils.DEFAULT_RUNTIME,  # type: str
            docker_network_prefix='codalab_worker_network',  # type: str
    ):
        self._worker = worker
        self._state_committer = JsonStateCommitter(commit_file)
        self._reader = LocalReader()
        self._docker = docker.from_env()
        self._bundles_dir = os.path.join(work_dir,
                                         LocalRunManager.BUNDLES_DIR_NAME)
        if not os.path.exists(self._bundles_dir):
            logger.info('{} doesn\'t exist, creating.'.format(
                self._bundles_dir))
            os.makedirs(self._bundles_dir, 0o770)

        self._image_manager = image_manager
        self._dependency_manager = dependency_manager
        self._cpuset = cpuset
        self._gpuset = gpuset
        self._stop = False
        self._work_dir = work_dir

        self._runs = {}  # bundle_uuid -> LocalRunState
        self._lock = threading.RLock()
        self._init_docker_networks(docker_network_prefix)
        self._run_state_manager = LocalRunStateMachine(
            docker_image_manager=self._image_manager,
            dependency_manager=self._dependency_manager,
            worker_docker_network=self.worker_docker_network,
            docker_network_internal=self.docker_network_internal,
            docker_network_external=self.docker_network_external,
            docker_runtime=docker_runtime,
            upload_bundle_callback=self._worker.upload_bundle_contents,
            assign_cpu_and_gpu_sets_fn=self.assign_cpu_and_gpu_sets,
        )

    def _init_docker_networks(self, docker_network_prefix):
        """
        Set up the docker networks for runs: a shared worker network, plus one network with external access and one without
        """
        def create_or_get_network(name, internal):
            try:
                logger.debug('Creating docker network %s', name)
                return self._docker.networks.create(name,
                                                    internal=internal,
                                                    check_duplicate=True)
            except docker.errors.APIError:
                logger.debug('Network %s already exists, reusing', name)
                return self._docker.networks.list(names=[name])[0]

        self.worker_docker_network = create_or_get_network(
            docker_network_prefix, True)
        self.docker_network_external = create_or_get_network(
            docker_network_prefix + "_ext", False)
        self.docker_network_internal = create_or_get_network(
            docker_network_prefix + "_int", True)

    def save_state(self):
        # Remove complex container objects from the state before serializing; they can be retrieved from the Docker API on load
        simple_runs = {
            uuid: state._replace(container=None)
            for uuid, state in self._runs.items()
        }
        self._state_committer.commit(simple_runs)

    def load_state(self):
        runs = self._state_committer.load()
        # Retrieve the complex container objects from the Docker API
        for uuid, run_state in runs.items():
            if run_state.container_id:
                try:
                    run_state = run_state._replace(
                        container=self._docker.containers.get(
                            run_state.container_id))
                except docker.errors.NotFound as ex:
                    logger.debug('Error getting the container for the run: %s',
                                 ex)
                    run_state = run_state._replace(container_id=None)
                finally:
                    self._runs[uuid] = run_state

    def start(self):
        """
        Load your state from disk, and start your sub-managers
        """
        self.load_state()
        self._image_manager.start()
        self._dependency_manager.start()

    def stop(self):
        """
        Starts any necessary cleanup and propagates the stop to its sub-managers
        Blocks until cleanup is complete and it is safe to quit
        """
        logger.info("Stopping Local Run Manager")
        self._stop = True
        self._image_manager.stop()
        self._dependency_manager.stop()
        self._run_state_manager.stop()
        self.save_state()
        try:
            self.docker_network_internal.remove()
            self.docker_network_external.remove()
        except docker.errors.APIError as e:
            logger.error("Cannot clear docker networks: {}".format(str(e)))

        logger.info("Stopped Local Run Manager. Exiting")

    def kill_all(self):
        """
        Kills all runs
        """
        logger.debug("Killing all bundles")
        # Set all bundle statuses to killed
        with self._lock:
            for uuid in self._runs.keys():
                run_state = self._runs[uuid]
                run_state.info['kill_message'] = 'Worker stopped'
                run_state = run_state._replace(info=run_state.info,
                                               is_killed=True)
                self._runs[uuid] = run_state
        # Wait until all runs have finished or KILL_TIMEOUT seconds pass
        for attempt in range(LocalRunManager.KILL_TIMEOUT):
            with self._lock:
                self._runs = {
                    k: v
                    for k, v in self._runs.items()
                    if v.stage != LocalRunStage.FINISHED
                }
                if len(self._runs) > 0:
                    logger.debug(
                        "Waiting for {} more bundles. {} seconds until force quit."
                        .format(len(self._runs),
                                LocalRunManager.KILL_TIMEOUT - attempt))
            time.sleep(1)

    def process_runs(self):
        """ Transition each run then filter out finished runs """
        with self._lock:
            # transition all runs
            for bundle_uuid in self._runs.keys():
                run_state = self._runs[bundle_uuid]
                self._runs[bundle_uuid] = self._run_state_manager.transition(
                    run_state)

            # filter out finished runs
            finished_container_ids = [
                run.container_id for run in self._runs.values()
                if (run.stage == LocalRunStage.FINISHED or run.stage ==
                    LocalRunStage.FINALIZING) and run.container_id is not None
            ]
            for container_id in finished_container_ids:
                try:
                    container = self._docker.containers.get(container_id)
                    container.remove(force=True)
                except (docker.errors.NotFound, docker.errors.NullResource):
                    pass
            self._runs = {
                k: v
                for k, v in self._runs.items()
                if v.stage != LocalRunStage.FINISHED
            }

    def create_run(self, bundle, resources):
        """
        Creates and starts processing a new run with the given bundle and
        resources
        """
        if self._stop:
            # Run Manager stopped, refuse more runs
            return
        bundle_uuid = bundle['uuid']
        bundle_path = os.path.join(self._bundles_dir, bundle_uuid)
        now = time.time()
        run_state = LocalRunState(
            stage=LocalRunStage.PREPARING,
            run_status='',
            bundle=bundle,
            bundle_path=os.path.realpath(bundle_path),
            resources=resources,
            start_time=now,
            container_id=None,
            container=None,
            docker_image=None,
            is_killed=False,
            has_contents=False,
            cpuset=None,
            gpuset=None,
            time_used=0,
            max_memory=0,
            disk_utilization=0,
            info={},
        )
        with self._lock:
            self._runs[bundle_uuid] = run_state

    def assign_cpu_and_gpu_sets(self, request_cpus, request_gpus):
        """
        Propose a cpuset and gpuset to a bundle based on given requested resources.
        Note: no side effects (this is important: we don't want to maintain more state than necessary)

        Arguments:
            request_cpus: integer
            request_gpus: integer

        Returns a 2-tuple:
            cpuset: assigned cpuset (str indices).
            gpuset: assigned gpuset (str indices).

        Throws an exception if unsuccessful.
        """
        cpuset, gpuset = set(self._cpuset), set(self._gpuset)

        with self._lock:
            for run_state in self._runs.values():
                if run_state.stage == LocalRunStage.RUNNING:
                    cpuset -= run_state.cpuset
                    gpuset -= run_state.gpuset

        if len(cpuset) < request_cpus or len(gpuset) < request_gpus:
            raise Exception("Not enough cpus or gpus to assign!")

        def propose_set(resource_set, request_count):
            return set(str(el) for el in list(resource_set)[:request_count])

        return propose_set(cpuset,
                           request_cpus), propose_set(gpuset, request_gpus)

    def get_run(self, uuid):
        """
        Returns the state of the run with the given UUID if it is managed
        by this RunManager, returns None otherwise
        """
        with self._lock:
            return self._runs.get(uuid, None)

    def mark_finalized(self, uuid):
        """
        Marks the run as finalized server-side so it can be discarded
        """
        if uuid in self._runs:
            with self._lock:
                self._runs[uuid].info['finalized'] = True

    def read(self, run_state, path, dep_paths, args, reply):
        """
        Use your Reader helper to invoke the given read command
        """
        self._reader.read(run_state, path, dep_paths, args, reply)

    def write(self, run_state, path, dep_paths, string):
        """
        Write string to path in bundle with uuid
        """
        if os.path.normpath(path) in dep_paths:
            return
        with open(os.path.join(run_state.bundle_path, path), 'w') as f:
            f.write(string)

    def netcat(self, run_state, port, message, reply):
        """
        Write message to port of bundle with uuid and read the response.
        Returns a stream with the response contents
        """
        container_ip = docker_utils.get_container_ip(
            self.worker_docker_network.name, run_state.container)
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.connect((container_ip, port))
        s.sendall(message)

        total_data = []
        while True:
            data = s.recv(LocalRunManager.NETCAT_BUFFER_SIZE)
            if not data:
                break
            total_data.append(data)
        s.close()
        reply(None, {}, b''.join(total_data).decode())

    def kill(self, run_state):
        """
        Kill bundle with uuid
        """
        with self._lock:
            run_state.info['kill_message'] = 'Kill requested'
            run_state = run_state._replace(info=run_state.info, is_killed=True)
            self._runs[run_state.bundle['uuid']] = run_state

    @property
    def all_runs(self):
        """
        Returns a list of all the runs managed by this RunManager
        """
        with self._lock:
            result = {
                bundle_uuid: {
                    'run_status': run_state.run_status,
                    'start_time': run_state.start_time,
                    'docker_image': run_state.docker_image,
                    'info': run_state.info,
                    'state': LocalRunStage.WORKER_STATE_TO_SERVER_STATE[run_state.stage],
                    'remote': self._worker.id,
                }
                for bundle_uuid, run_state in self._runs.items()
            }
            return result

    @property
    def all_dependencies(self):
        """
        Returns a list of all dependencies available in this RunManager
        """
        return self._dependency_manager.all_dependencies

    @property
    def cpus(self):
        """
        Total number of CPUs this RunManager has
        """
        return len(self._cpuset)

    @property
    def gpus(self):
        """
        Total number of GPUs this RunManager has
        """
        return len(self._gpuset)

    @property
    def memory_bytes(self):
        """
        Total installed memory (in bytes) of the machine this RunManager runs on
        """
        try:
            return os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
        except ValueError:
            # Fallback to sysctl when os.sysconf('SC_PHYS_PAGES') fails on OS X
            return int(check_output(['sysctl', '-n', 'hw.memsize']).strip())

    @property
    def free_disk_bytes(self):
        """
        Available disk space, in bytes, of this RunManager's work directory.
        """
        error_msg = "Failed to run command {}".format("df " + self._work_dir)
        try:
            p = Popen(["df", self._work_dir], stdout=PIPE)
            output, error = p.communicate()
            # Return None when there is an error.
            if error:
                logger.error(error.strip())
                return None

            if output:
                lines = output.decode().split("\n")
                index = lines[0].split().index("Available")
                # We convert the original result from df command in unit of 1KB blocks into bytes.
                return int(lines[1].split()[index]) * 1024

        except Exception as e:
            logger.error("{}: {}".format(error_msg, str(e)))
            return None
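
create_run() above populates a LocalRunState record with a fixed set of fields. As a reading aid, here is a namedtuple sketch of that record inferred from the constructor call; the actual definition lives elsewhere in the worker codebase and may differ.

from collections import namedtuple

# Field list inferred from the LocalRunState(...) call in create_run() above; a sketch only
LocalRunState = namedtuple(
    'LocalRunState',
    [
        'stage',             # LocalRunStage value (PREPARING / RUNNING / FINALIZING / FINISHED)
        'run_status',        # human-readable status string
        'bundle',            # bundle dict as received from the server
        'bundle_path',       # real path of the bundle directory on the worker filesystem
        'resources',         # resources requested for the run
        'start_time',        # timestamp when the run was created
        'container_id',      # Docker container id (kept when the state is serialized)
        'container',         # Docker Container object (stripped before committing state)
        'docker_image',      # resolved docker image for the run
        'is_killed',         # True once a kill has been requested
        'has_contents',      # whether the bundle directory has contents
        'cpuset',            # set of cpu indices assigned to the run
        'gpuset',            # set of gpu indices assigned to the run
        'time_used',         # seconds of runtime used so far
        'max_memory',        # peak memory usage observed
        'disk_utilization',  # disk usage of the bundle directory
        'info',              # mutable dict for extra info (kill_message, finalized, ...)
    ],
)
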
Code example #5
class DockerImageManager:
    def __init__(self, commit_file, max_image_cache_size):
        """
        Initializes a DockerImageManager
        :param commit_file: String path to where the state file should be committed
        :param max_image_cache_size: Total size in bytes that the image cache can use
        """
        self._state_committer = JsonStateCommitter(commit_file)  # type: JsonStateCommitter
        self._docker = docker.from_env()  # type: DockerClient
        self._image_cache = {}  # type: Dict[str, ImageCacheEntry]
        self._downloading = ThreadDict(
            fields={'success': False, 'status': 'Download starting.'}, lock=True
        )
        self._max_image_cache_size = max_image_cache_size
        self._lock = threading.RLock()

        self._stop = False
        self._sleep_secs = 10
        self._cleanup_thread = None

        self._load_state()

    def _save_state(self):
        with self._lock:
            self._state_committer.commit(self._image_cache)

    def _load_state(self):
        with self._lock:
            self._image_cache = self._state_committer.load()

    def start(self):
        if self._max_image_cache_size:

            def cleanup_loop(self):
                while not self._stop:
                    try:
                        self._cleanup()
                        self._save_state()
                    except Exception:
                        traceback.print_exc()
                    time.sleep(self._sleep_secs)

            self._cleanup_thread = threading.Thread(target=cleanup_loop, args=[self])
            self._cleanup_thread.start()

    def stop(self):
        logger.info("Stopping docker image manager")
        self._stop = True
        logger.debug("Stopping docker image manager: stop the downloads threads")
        self._downloading.stop()
        if self._cleanup_thread:
            logger.debug("Stopping docker image manager: stop the cleanup thread")
            self._cleanup_thread.join()
        logger.info("Stopped docker image manager.")

    def _cleanup(self):
        """
        Prunes the image cache for runs
        1. Only care about images we (this DockerImageManager) downloaded and know about
        2. We use sum of VirtualSize's, which is an upper bound on the disk use of our images:
            in case no images share any intermediate layers, this will be the real disk use,
            however if images share layers, the virtual size will count that layer's size for each
            image that uses it, even though it's stored only once in the disk. The 'Size' field
            accounts for the marginal size each image adds on top of the shared layers, but summing
            those is not accurate either since the shared base layers need to be counted once to get
            the total size. (i.e. summing marginal sizes would give us a lower bound on the total disk
            use of images). Calling df gives us an accurate disk use of ALL the images on the machine
            but because of (1) we don't want to use that.
        """
        while not self._stop:
            time.sleep(self._sleep_secs)
            deletable_entries = set(self._image_cache.values())
            disk_use = sum(cache_entry.virtual_size for cache_entry in deletable_entries)
            while disk_use > self._max_image_cache_size:
                entry_to_remove = min(deletable_entries, key=lambda entry: entry.last_used)
                logger.info(
                    'Disk use (%s) > max cache size (%s), pruning image: %s',
                    disk_use,
                    self._max_image_cache_size,
                    entry_to_remove.digest,
                )
                try:
                    image_to_delete = self._docker.images.get(entry_to_remove.id)
                    tags_to_delete = image_to_delete.tags
                    for tag in tags_to_delete:
                        self._docker.images.remove(tag)
                    # if we successfully removed the image also remove its cache entry
                    del self._image_cache[entry_to_remove.digest]
                except docker.errors.NotFound:
                    # image doesn't exist anymore for some reason, stop tracking it
                    del self._image_cache[entry_to_remove.digest]
                except docker.errors.APIError as err:
                    # Maybe we can't delete this image because its container is still running
                    # (think a run that takes 4 days so this is the oldest image but still in use)
                    # In that case we just continue with our lives, hoping it will get deleted once
                    # it's no longer in use and the cache becomes full again
                    logger.error(
                        "Cannot remove image %s from cache: %s", entry_to_remove.digest, err
                    )
                deletable_entries.remove(entry_to_remove)
                disk_use = sum(entry.virtual_size for entry in deletable_entries)
        logger.debug("Stopping docker image manager cleanup")

    def get(self, image_spec):
        """
        Request the Docker image identified by image_spec and update its entry in the image cache
        :param image_spec: Repo[:tag] or digest spec of the Docker image being requested
        :returns: An ImageAvailabilityState object with the state of the Docker image
        """
        if ':' not in image_spec:
            # Both digest and repo:tag specs include the ':' character; the only case without it is
            # when a repo is given with no tag at all.
            # In that case, different Docker images API methods behave differently:
            # - pull pulls all tags of the image
            # - get tries to get the `latest` tag by default
            # So if someone requests an image without a tag and that image has no `latest` tag pushed
            # to Docker Hub, pull will succeed (it pulls all the other tags), but later get calls will
            # fail because the `latest` tag isn't present on the system.
            # We don't want to guess which tag the user meant, so the pull step should fail if no tag
            # is specified and there is no `latest` tag on Docker Hub.
            # Hence, right here at the start we append ':latest' to the image spec whenever no tag is given.
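            # For example (hypothetical specs): 'codalab/default-cpu' becomes 'codalab/default-cpu:latest',
            # while 'ubuntu:16.04' or a digest spec such as 'ubuntu@sha256:<digest>' is left unchanged.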
            image_spec += ':latest'
        try:
            image = self._docker.images.get(image_spec)
            digest = image.attrs.get('RepoDigests', [image_spec])[0]
            with self._lock:
                self._image_cache[digest] = ImageCacheEntry(
                    id=image.id,
                    digest=digest,
                    last_used=time.time(),
                    virtual_size=image.attrs['VirtualSize'],
                    marginal_size=image.attrs['Size'],
                )
            # We can remove the download thread if it still exists
            if image_spec in self._downloading:
                self._downloading.remove(image_spec)
            return ImageAvailabilityState(
                digest=digest, stage=DependencyStage.READY, message='Image ready'
            )
        except docker.errors.ImageNotFound:
            return self._pull_or_report(image_spec)  # type: ImageAvailabilityState
        except Exception as ex:
            return ImageAvailabilityState(
                digest=None, stage=DependencyStage.FAILED, message=str(ex)
            )

    def _pull_or_report(self, image_spec):
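        # If a download thread for this image_spec already exists, report its progress (or, if it
        # has finished, turn its recorded success/failure into a final state and clean the thread
        # up); otherwise spawn a new download thread and report that the download is starting.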
        if image_spec in self._downloading:
            with self._downloading[image_spec]['lock']:
                if self._downloading[image_spec].is_alive():
                    return ImageAvailabilityState(
                        digest=None,
                        stage=DependencyStage.DOWNLOADING,
                        message=self._downloading[image_spec]['status'],
                    )
                else:
                    if self._downloading[image_spec]['success']:
                        digest = self._docker.images.get(image_spec).attrs.get(
                            'RepoDigests', [image_spec]
                        )[0]
                        status = ImageAvailabilityState(
                            digest=digest,
                            stage=DependencyStage.READY,
                            message=self._downloading[image_spec]['message'],
                        )
                    else:
                        status = ImageAvailabilityState(
                            digest=None,
                            stage=DependencyStage.FAILED,
                            message=self._downloading[image_spec]['message'],
                        )
                    self._downloading.remove(image_spec)
                    return status
        else:

            def download():
                logger.debug('Downloading Docker image %s', image_spec)
                try:
                    self._docker.images.pull(image_spec)
                    logger.debug('Download for Docker image %s complete', image_spec)
                    self._downloading[image_spec]['success'] = True
                    self._downloading[image_spec]['message'] = "Downloading image"
                except (docker.errors.APIError, docker.errors.ImageNotFound) as ex:
                    logger.debug('Download for Docker image %s failed: %s', image_spec, ex)
                    self._downloading[image_spec]['success'] = False
                    self._downloading[image_spec]['message'] = "Can't download image: {}".format(ex)

            self._downloading.add_if_new(image_spec, threading.Thread(target=download, args=[]))
            return ImageAvailabilityState(
                digest=None,
                stage=DependencyStage.DOWNLOADING,
                message=self._downloading[image_spec]['status'],
            )
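For orientation, here is a minimal polling sketch of how a caller might drive this manager. The commit file path, cache cap, polling interval, and image spec are hypothetical; DockerImageManager and DependencyStage refer to the definitions shown in this listing.

import time

manager = DockerImageManager('/tmp/image-state.json', max_image_cache_size=10 * 1024 ** 3)
manager.start()
try:
    state = manager.get('ubuntu:16.04')
    while state.stage == DependencyStage.DOWNLOADING:
        time.sleep(5)  # arbitrary polling interval for this sketch
        state = manager.get('ubuntu:16.04')
    if state.stage == DependencyStage.READY:
        print('image ready, digest:', state.digest)
    else:
        print('image failed:', state.message)
finally:
    manager.stop()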
Code example #7
0
class LocalRunManager(BaseRunManager):
    """
    LocalRunManager executes the runs locally, each one in its own Docker
    container. It manages its cache of local Docker images and its own local
    Docker network.
    """

    # Network buffer size to use while proxying with netcat
    NETCAT_BUFFER_SIZE = 4096
    # Number of seconds to wait for bundle kills to propagate before forcing kill
    KILL_TIMEOUT = 100
    # Directory name to store running bundles in worker filesystem
    BUNDLES_DIR_NAME = 'runs'

    def __init__(
        self,
        worker,  # type: Worker
        image_manager,  # type: DockerImageManager
        dependency_manager,  # type: LocalFileSystemDependencyManager
        commit_file,  # type: str
        cpuset,  # type: Set[str]
        gpuset,  # type: Set[str]
        work_dir,  # type: str
        docker_runtime=docker_utils.DEFAULT_RUNTIME,  # type: str
        docker_network_prefix='codalab_worker_network',  # type: str
    ):
        self._worker = worker
        self._state_committer = JsonStateCommitter(commit_file)
        self._reader = LocalReader()
        self._docker = docker.from_env()
        self._bundles_dir = os.path.join(work_dir, LocalRunManager.BUNDLES_DIR_NAME)
        if not os.path.exists(self._bundles_dir):
            logger.info('{} doesn\'t exist, creating.'.format(self._bundles_dir))
            os.makedirs(self._bundles_dir, 0o770)

        self._image_manager = image_manager
        self._dependency_manager = dependency_manager
        self._cpuset = cpuset
        self._gpuset = gpuset
        self._stop = False

        self._runs = {}  # bundle_uuid -> LocalRunState
        self._lock = threading.RLock()
        self._init_docker_networks(docker_network_prefix)
        self._run_state_manager = LocalRunStateMachine(
            docker_image_manager=self._image_manager,
            dependency_manager=self._dependency_manager,
            worker_docker_network=self.worker_docker_network,
            docker_network_internal=self.docker_network_internal,
            docker_network_external=self.docker_network_external,
            docker_runtime=docker_runtime,
            upload_bundle_callback=self._worker.upload_bundle_contents,
            assign_cpu_and_gpu_sets_fn=self.assign_cpu_and_gpu_sets,
        )

    def _init_docker_networks(self, docker_network_prefix):
        """
        Set up docker networks for runs: one with external network access and one without
        """

        def create_or_get_network(name, internal):
            try:
                logger.debug('Creating docker network %s', name)
                return self._docker.networks.create(name, internal=internal, check_duplicate=True)
            except docker.errors.APIError:
                logger.debug('Network %s already exists, reusing', name)
                return self._docker.networks.list(names=[name])[0]

        self.worker_docker_network = create_or_get_network(docker_network_prefix, True)
        self.docker_network_external = create_or_get_network(docker_network_prefix + "_ext", False)
        self.docker_network_internal = create_or_get_network(docker_network_prefix + "_int", True)
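        # Access summary: the worker network and the '_int' network are created with internal=True
        # (no outside connectivity); only the '_ext' network gives containers external network access.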

    def save_state(self):
        # Remove complex container objects from the state before serializing; they are re-fetched from the Docker API in load_state
        simple_runs = {uuid: state._replace(container=None) for uuid, state in self._runs.items()}
        self._state_committer.commit(simple_runs)

    def load_state(self):
        runs = self._state_committer.load()
        # Retrieve the complex container objects from the Docker API
        for uuid, run_state in runs.items():
            if run_state.container_id:
                try:
                    run_state = run_state._replace(
                        container=self._docker.containers.get(run_state.container_id)
                    )
                except docker.errors.NotFound as ex:
                    logger.debug('Error getting the container for the run: %s', ex)
                    run_state = run_state._replace(container_id=None)
                finally:
                    self._runs[uuid] = run_state

    def start(self):
        """
        Load your state from disk, and start your sub-managers
        """
        self.load_state()
        self._image_manager.start()
        self._dependency_manager.start()

    def stop(self):
        """
        Starts any necessary cleanup and propagates to its other managers
        Blocks until cleanup is complete and it is safe to quit
        """
        logger.info("Stopping Local Run Manager")
        self._stop = True
        self._image_manager.stop()
        self._dependency_manager.stop()
        self._run_state_manager.stop()
        self.save_state()
        try:
            self.docker_network_internal.remove()
            self.docker_network_external.remove()
        except docker.errors.APIError as e:
            logger.error("Cannot clear docker networks: {}".format(e.message))

        logger.info("Stopped Local Run Manager. Exiting")

    def kill_all(self):
        """
        Kills all runs
        """
        logger.debug("Killing all bundles")
        # Set all bundle statuses to killed
        with self._lock:
            for uuid in self._runs.keys():
                run_state = self._runs[uuid]
                run_state.info['kill_message'] = 'Worker stopped'
                run_state = run_state._replace(info=run_state.info, is_killed=True)
                self._runs[uuid] = run_state
        # Wait until all runs have finished or KILL_TIMEOUT seconds pass
        for attempt in range(LocalRunManager.KILL_TIMEOUT):
            with self._lock:
                self._runs = {
                    k: v for k, v in self._runs.items() if v.stage != LocalRunStage.FINISHED
                }
                if len(self._runs) > 0:
                    logger.debug(
                        "Waiting for {} more bundles. {} seconds until force quit.".format(
                            len(self._runs), LocalRunManager.KILL_TIMEOUT - attempt
                        )
                    )
            time.sleep(1)

    def process_runs(self):
        """ Transition each run then filter out finished runs """
        with self._lock:
            # transition all runs
            for bundle_uuid in self._runs.keys():
                run_state = self._runs[bundle_uuid]
                self._runs[bundle_uuid] = self._run_state_manager.transition(run_state)

            # filter out finished runs
            finished_container_ids = [
                run.container_id
                for run in self._runs.values()
                if (run.stage == LocalRunStage.FINISHED or run.stage == LocalRunStage.FINALIZING)
                and run.container_id is not None
            ]
            for container_id in finished_container_ids:
                try:
                    container = self._docker.containers.get(container_id)
                    container.remove(force=True)
                except (docker.errors.NotFound, docker.errors.NullResource):
                    pass
            self._runs = {k: v for k, v in self._runs.items() if v.stage != LocalRunStage.FINISHED}

    def create_run(self, bundle, resources):
        """
        Creates and starts processing a new run with the given bundle and
        resources
        """
        if self._stop:
            # Run Manager stopped, refuse more runs
            return
        bundle_uuid = bundle['uuid']
        bundle_path = os.path.join(self._bundles_dir, bundle_uuid)
        now = time.time()
        run_state = LocalRunState(
            stage=LocalRunStage.PREPARING,
            run_status='',
            bundle=bundle,
            bundle_path=os.path.realpath(bundle_path),
            resources=resources,
            start_time=now,
            container_id=None,
            container=None,
            docker_image=None,
            is_killed=False,
            has_contents=False,
            cpuset=None,
            gpuset=None,
            time_used=0,
            max_memory=0,
            disk_utilization=0,
            info={},
        )
        with self._lock:
            self._runs[bundle_uuid] = run_state

    def assign_cpu_and_gpu_sets(self, request_cpus, request_gpus):
        """
        Propose a cpuset and gpuset to a bundle based on given requested resources.
        Note: no side effects (this is important: we don't want to maintain more state than necessary)

        Arguments:
            request_cpus: integer
            request_gpus: integer

        Returns a 2-tuple:
            cpuset: assigned cpuset (str indices).
            gpuset: assigned gpuset (str indices).

        Throws an exception if unsuccessful.
        """
        cpuset, gpuset = set(self._cpuset), set(self._gpuset)

        with self._lock:
            for run_state in self._runs.values():
                if run_state.stage == LocalRunStage.RUNNING:
                    cpuset -= run_state.cpuset
                    gpuset -= run_state.gpuset

        if len(cpuset) < request_cpus or len(gpuset) < request_gpus:
            raise Exception("Not enough cpus or gpus to assign!")

        def propose_set(resource_set, request_count):
            return set(str(el) for el in list(resource_set)[:request_count])

        return propose_set(cpuset, request_cpus), propose_set(gpuset, request_gpus)

    def get_run(self, uuid):
        """
        Returns the state of the run with the given UUID if it is managed
        by this RunManager, returns None otherwise
        """
        with self._lock:
            return self._runs.get(uuid, None)

    def mark_finalized(self, uuid):
        """
        Marks the run as finalized server-side so it can be discarded
        """
        if uuid in self._runs:
            with self._lock:
                self._runs[uuid].info['finalized'] = True

    def read(self, run_state, path, dep_paths, args, reply):
        """
        Use your Reader helper to invoke the given read command
        """
        self._reader.read(run_state, path, dep_paths, args, reply)

    def write(self, run_state, path, dep_paths, string):
        """
        Write string to path in bundle with uuid
        """
        if os.path.normpath(path) in dep_paths:
            return
        with open(os.path.join(run_state.bundle_path, path), 'w') as f:
            f.write(string)

    def netcat(self, run_state, port, message, reply):
        """
        Write message to the given port of the run's container and read the response.
        Passes the response contents to the reply callback
        """
        container_ip = docker_utils.get_container_ip(
            self.worker_docker_network.name, run_state.container
        )
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.connect((container_ip, port))
        s.sendall(message)

        total_data = []
        while True:
            data = s.recv(LocalRunManager.NETCAT_BUFFER_SIZE)
            if not data:
                break
            total_data.append(data)
        s.close()
        reply(None, {}, b''.join(total_data))

    def kill(self, run_state):
        """
        Kill bundle with uuid
        """
        with self._lock:
            run_state.info['kill_message'] = 'Kill requested'
            run_state = run_state._replace(info=run_state.info, is_killed=True)
            self._runs[run_state.bundle['uuid']] = run_state

    @property
    def all_runs(self):
        """
        Returns a dict mapping bundle UUIDs to the status of each run managed by this RunManager
        """
        with self._lock:
            result = {
                bundle_uuid: {
                    'run_status': run_state.run_status,
                    'start_time': run_state.start_time,
                    'docker_image': run_state.docker_image,
                    'info': run_state.info,
                    'state': LocalRunStage.WORKER_STATE_TO_SERVER_STATE[run_state.stage],
                }
                for bundle_uuid, run_state in self._runs.items()
            }
            return result

    @property
    def all_dependencies(self):
        """
        Returns a list of all dependencies available in this RunManager
        """
        return self._dependency_manager.all_dependencies

    @property
    def cpus(self):
        """
        Total number of CPUs this RunManager has
        """
        return len(self._cpuset)

    @property
    def gpus(self):
        """
        Total number of GPUs this RunManager has
        """
        return len(self._gpuset)

    @property
    def memory_bytes(self):
        """
        Total installed memory of this RunManager
        """
        try:
            return os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
        except ValueError:
            # Fallback to sysctl when os.sysconf('SC_PHYS_PAGES') fails on OS X
            return int(check_output(['sysctl', '-n', 'hw.memsize']).strip())
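To tie the pieces together, a rough lifecycle sketch, assuming a fully constructed run_manager (the Worker, DockerImageManager, and LocalFileSystemDependencyManager wiring is omitted) and a hypothetical bundle dict and resources object supplied by the server:

import time

run_manager.start()                        # loads saved state, starts sub-managers
run_manager.create_run(bundle, resources)  # bundle is assumed to carry a 'uuid' key
while run_manager.get_run(bundle['uuid']) is not None:
    run_manager.process_runs()             # transition run states, reap finished containers
    run_manager.save_state()               # persist run state between iterations
    time.sleep(1)                          # arbitrary polling interval for this sketch
    # (the server-side call to mark_finalized that lets a run leave the FINALIZING stage
    # is omitted here)
run_manager.stop()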