def _cleanup(self):
    """
    Prune failed dependencies older than 10 seconds and enforce disk quotas.

    Limits the disk usage of the dependencies (both the bundle files and the
    serialized state file size). Deletes oldest failed dependencies first and
    then oldest finished dependencies. Doesn't touch downloading dependencies.
    """
    self._prune_failed_dependencies()
    # With all the locks (should be fast if no cleanup needed, otherwise make
    # sure nothing is corrupted
    while True:
        with self._global_lock:
            self._acquire_all_locks()
            bytes_used = sum(dep.size_bytes for dep in self._dependencies.values())
            serialized_length = len(codalabworker.pyjson.dumps(self._dependencies))
            if (bytes_used > self._max_cache_size_bytes
                    or serialized_length > LocalFileSystemDependencyManager.MAX_SERIALIZED_LEN):
                logger.debug(
                    '%d dependencies in cache, disk usage: %s (max %s), serialized size: %s (max %s)',
                    len(self._dependencies),
                    size_str(bytes_used),
                    size_str(self._max_cache_size_bytes),
                    size_str(serialized_length),
                    LocalFileSystemDependencyManager.MAX_SERIALIZED_LEN,
                )
                # Only READY deps with no active dependents are evictable.
                ready_deps = {
                    dep: state
                    for dep, state in self._dependencies.items()
                    if state.stage == DependencyStage.READY and not state.dependents
                }
                failed_deps = {
                    dep: state
                    for dep, state in self._dependencies.items()
                    if state.stage == DependencyStage.FAILED
                }
                if failed_deps:
                    # BUGFIX: dict.iteritems() is Python 2 only (AttributeError on
                    # Python 3); use items(), matching the comprehensions above.
                    dep_to_remove = min(
                        failed_deps.items(), key=lambda dep_state: dep_state[1].last_used)[0]
                elif ready_deps:
                    dep_to_remove = min(
                        ready_deps.items(), key=lambda dep_state: dep_state[1].last_used)[0]
                else:
                    logger.info(
                        'Dependency quota full but there are only downloading dependencies, not cleaning up until downloads are over'
                    )
                    # BUGFIX: release the per-dependency locks before bailing out;
                    # the original broke out of the loop while still holding every
                    # lock taken by _acquire_all_locks().
                    self._release_all_locks()
                    break
                if dep_to_remove:
                    self._delete_dependency(dep_to_remove)
                    self._release_all_locks()
            else:
                self._release_all_locks()
                break
def _cleanup(self):
    """
    Prune failed dependencies older than 10 seconds and enforce disk quotas.

    Limits the disk usage of the dependencies (both the bundle files and the
    serialized state file size). Deletes oldest failed dependencies first and
    then oldest finished dependencies. Doesn't touch downloading dependencies.
    """
    self._prune_failed_dependencies()
    # With all the locks (should be fast if no cleanup needed, otherwise make
    # sure nothing is corrupted
    while True:
        with self._global_lock:
            self._acquire_all_locks()
            bytes_used = sum(dep.size_bytes for dep in self._dependencies.values())
            serialized_length = len(codalabworker.pyjson.dumps(self._dependencies))
            if (
                bytes_used > self._max_cache_size_bytes
                or serialized_length > LocalFileSystemDependencyManager.MAX_SERIALIZED_LEN
            ):
                logger.debug(
                    '%d dependencies in cache, disk usage: %s (max %s), serialized size: %s (max %s)',
                    len(self._dependencies),
                    size_str(bytes_used),
                    size_str(self._max_cache_size_bytes),
                    size_str(serialized_length),
                    LocalFileSystemDependencyManager.MAX_SERIALIZED_LEN,
                )
                # Only READY deps with no active dependents are evictable.
                ready_deps = {
                    dep: state
                    for dep, state in self._dependencies.items()
                    if state.stage == DependencyStage.READY and not state.dependents
                }
                failed_deps = {
                    dep: state
                    for dep, state in self._dependencies.items()
                    if state.stage == DependencyStage.FAILED
                }
                if failed_deps:
                    # BUGFIX: dict.iteritems() is Python 2 only (AttributeError on
                    # Python 3); use items(), matching the comprehensions above.
                    dep_to_remove = min(
                        failed_deps.items(), key=lambda dep_state: dep_state[1].last_used
                    )[0]
                elif ready_deps:
                    dep_to_remove = min(
                        ready_deps.items(), key=lambda dep_state: dep_state[1].last_used
                    )[0]
                else:
                    logger.info(
                        'Dependency quota full but there are only downloading dependencies, not cleaning up until downloads are over'
                    )
                    # BUGFIX: release the per-dependency locks before bailing out;
                    # the original broke out of the loop while still holding every
                    # lock taken by _acquire_all_locks().
                    self._release_all_locks()
                    break
                if dep_to_remove:
                    self._delete_dependency(dep_to_remove)
                    self._release_all_locks()
            else:
                self._release_all_locks()
                break
def check_resource_utilization(run_state):
    """
    Refresh the resource-usage fields on run_state and flag the run for
    killing when any requested limit (time / memory / disk) is exceeded.

    Note: `self` and `bundle_uuid` are free variables captured from the
    enclosing scope. Returns the (possibly updated) run_state namedtuple.
    """
    container_stats = docker_utils.get_container_stats(run_state.container)

    # Fold all three usage updates into one _replace: none of the new values
    # depends on another field being replaced first.
    run_state = run_state._replace(
        time_used=time.time() - run_state.start_time,
        max_memory=max(run_state.max_memory, container_stats.get('memory', 0)),
        disk_utilization=self.disk_utilization[bundle_uuid]['disk_utilization'],
    )

    kill_messages = []

    request_time = run_state.resources['request_time']
    if request_time and run_state.time_used > request_time:
        kill_messages.append('Time limit %s exceeded.' % duration_str(request_time))

    request_memory = run_state.resources['request_memory']
    oom_exit = run_state.info.get('exitcode', '0') == '137'
    if run_state.max_memory > request_memory or oom_exit:
        kill_messages.append('Memory limit %s exceeded.' % size_str(request_memory))

    request_disk = run_state.resources['request_disk']
    if request_disk and run_state.disk_utilization > request_disk:
        kill_messages.append('Disk limit %sb exceeded.' % size_str(request_disk))

    if kill_messages:
        new_info = run_state.info
        new_info['kill_message'] = ' '.join(kill_messages)
        run_state = run_state._replace(info=new_info, is_killed=True)
    return run_state
def check_resource_utilization(run_state):
    """
    Refresh the resource-usage fields on run_state and flag the run for
    killing when any requested limit (time / memory / disk) is exceeded.

    Note: `self` and `bundle_uuid` are free variables captured from the
    enclosing scope. Returns the (possibly updated) run_state namedtuple.
    """
    container_stats = docker_utils.get_container_stats(run_state.container)

    # Fold all three usage updates into one _replace: none of the new values
    # depends on another field being replaced first.
    run_state = run_state._replace(
        time_used=time.time() - run_state.start_time,
        max_memory=max(run_state.max_memory, container_stats.get('memory', 0)),
        disk_utilization=self.disk_utilization[bundle_uuid]['disk_utilization'],
    )

    kill_messages = []

    request_time = run_state.resources['request_time']
    if request_time and run_state.time_used > request_time:
        kill_messages.append('Time limit %s exceeded.' % duration_str(request_time))

    request_memory = run_state.resources['request_memory']
    oom_exit = run_state.info.get('exitcode', '0') == '137'
    if run_state.max_memory > request_memory or oom_exit:
        kill_messages.append('Memory limit %s exceeded.' % size_str(request_memory))

    request_disk = run_state.resources['request_disk']
    if request_disk and run_state.disk_utilization > request_disk:
        kill_messages.append('Disk limit %sb exceeded.' % size_str(request_disk))

    if kill_messages:
        new_info = run_state.info
        new_info['kill_message'] = ' '.join(kill_messages)
        run_state = run_state._replace(info=new_info, is_killed=True)
    return run_state
def update_state_and_check_killed(bytes_downloaded):
    """
    Callback for the bundle service client download.

    Records download progress on the dependency's state; raises
    DownloadAbortedException if the dependency manager marked the
    download as killed. (`self` and `dependency` are captured from
    the enclosing scope.)
    """
    with self._dependency_locks[dependency]:
        current = self._dependencies[dependency]
        if current.killed:
            raise DownloadAbortedException("Aborted by user")
        progress = "Downloading dependency: %s downloaded" % size_str(bytes_downloaded)
        self._dependencies[dependency] = current._replace(
            size_bytes=bytes_downloaded, message=progress
        )
def progress_callback(bytes_uploaded):
    """
    Record upload progress for this bundle (`self` and `bundle_uuid` are
    captured from the enclosing scope); always return True so the upload
    continues.
    """
    self.uploading[bundle_uuid]['run_status'] = (
        'Uploading results: %s done (archived size)' % size_str(bytes_uploaded)
    )
    return True
def _transition_from_PREPARING(self, run_state):
    """
    Move a run from PREPARING toward RUNNING.

    1- Request the docker image from docker image manager
        - if image is failed, move to CLEANING_UP state
    2- Request the dependencies from dependency manager
        - if any are failed, move to CLEANING_UP state
    3- If all dependencies and docker image are ready:
        - Set up the local filesystem for the run
        - Create symlinks to dependencies
        - Allocate resources and prepare the docker container
        - Start the docker container
    4- If all is successful, move to RUNNING state

    :param run_state: run-state namedtuple for the bundle being prepared
    :return: a new run-state namedtuple (original is never mutated in place,
        except for its mutable `info` dict)
    """
    if run_state.is_killed:
        return run_state._replace(stage=LocalRunStage.CLEANING_UP, container_id=None)

    dependencies_ready = True
    status_messages = []
    bundle_uuid = run_state.bundle['uuid']

    # get dependencies
    for dep in run_state.bundle['dependencies']:
        dependency = (dep['parent_uuid'], dep['parent_path'])
        dependency_state = self.dependency_manager.get(bundle_uuid, dependency)
        if dependency_state.stage == DependencyStage.DOWNLOADING:
            status_messages.append(
                'Downloading dependency %s: %s done (archived size)'
                % (dep['child_path'], size_str(dependency_state.size_bytes)))
            dependencies_ready = False
        elif dependency_state.stage == DependencyStage.FAILED:
            # Failed to download dependency; -> CLEANING_UP
            # BUGFIX: the original formatted an empty string ('') into the second
            # placeholder; surface the dependency manager's failure message instead.
            run_state.info['failure_message'] = 'Failed to download dependency %s: %s' % (
                dep['child_path'],
                dependency_state.message,
            )
            return run_state._replace(stage=LocalRunStage.CLEANING_UP, info=run_state.info)

    # get the docker image
    docker_image = run_state.resources['docker_image']
    image_state = self.docker_image_manager.get(docker_image)
    if image_state.stage == DependencyStage.DOWNLOADING:
        status_messages.append('Pulling docker image: ' +
                               (image_state.message or docker_image or ""))
        dependencies_ready = False
    elif image_state.stage == DependencyStage.FAILED:
        # Failed to pull image; -> CLEANING_UP
        run_state.info['failure_message'] = image_state.message
        return run_state._replace(stage=LocalRunStage.CLEANING_UP, info=run_state.info)

    # stop proceeding if dependency and image downloads aren't all done
    if not dependencies_ready:
        status_message = status_messages.pop()
        if status_messages:
            status_message += "(and downloading %d other dependencies and docker images)" % len(
                status_messages)
        return run_state._replace(run_status=status_message)

    # All dependencies ready! Set up directories, symlinks and container. Start container.
    # 1) Set up a directory to store the bundle.
    remove_path(run_state.bundle_path)
    os.mkdir(run_state.bundle_path)

    # 2) Set up symlinks
    dependencies = []
    docker_dependencies_path = '/' + bundle_uuid + '_dependencies'
    for dep in run_state.bundle['dependencies']:
        child_path = os.path.normpath(
            os.path.join(run_state.bundle_path, dep['child_path']))
        # Reject keys that escape the bundle directory (e.g. '../..')
        if not child_path.startswith(run_state.bundle_path):
            raise Exception('Invalid key for dependency: %s' % (dep['child_path']))
        dependency_path = self.dependency_manager.get(
            bundle_uuid, (dep['parent_uuid'], dep['parent_path'])).path
        dependency_path = os.path.join(
            self.dependency_manager.dependencies_dir, dependency_path)
        docker_dependency_path = os.path.join(docker_dependencies_path, dep['child_path'])
        os.symlink(docker_dependency_path, child_path)
        # These are turned into docker volume bindings like:
        #   dependency_path:docker_dependency_path:ro
        dependencies.append((dependency_path, docker_dependency_path))

    # 3) Set up container
    if run_state.resources['request_network']:
        docker_network = self.docker_network_external_name
    else:
        docker_network = self.docker_network_internal_name

    try:
        cpuset, gpuset = self.assign_cpu_and_gpu_sets_fn(
            run_state.resources['request_cpus'], run_state.resources['request_gpus'])
    except Exception:
        # Broad catch is deliberate: any allocation failure means the run
        # cannot proceed right now.
        run_state.info['failure_message'] = "Cannot assign enough resources"
        return run_state._replace(stage=LocalRunStage.CLEANING_UP, info=run_state.info)

    # 4) Start container
    try:
        container = docker_utils.start_bundle_container(
            run_state.bundle_path,
            bundle_uuid,
            dependencies,
            run_state.bundle['command'],
            run_state.resources['docker_image'],
            network=docker_network,
            cpuset=cpuset,
            gpuset=gpuset,
            memory_bytes=run_state.resources['request_memory'],
            runtime=self.docker_runtime,
        )
    except docker_utils.DockerException as e:
        run_state.info['failure_message'] = 'Cannot start Docker container: {}'.format(e)
        return run_state._replace(stage=LocalRunStage.CLEANING_UP, info=run_state.info)

    return run_state._replace(
        stage=LocalRunStage.RUNNING,
        start_time=time.time(),
        run_status='Running job in Docker container',
        container_id=container.id,
        container=container,
        docker_image=image_state.digest,
        has_contents=True,
        cpuset=cpuset,
        gpuset=gpuset,
    )
def progress_callback(bytes_uploaded):
    """
    Record upload progress for this bundle (`self` and `bundle_uuid` are
    captured from the enclosing scope); always return True so the upload
    continues.
    """
    self.uploading[bundle_uuid]['run_status'] = (
        'Uploading results: %s done (archived size)' % size_str(bytes_uploaded)
    )
    return True
def _transition_from_PREPARING(self, run_state):
    """
    Move a run from PREPARING toward RUNNING.

    1- Request the docker image from docker image manager
        - if image is failed, move to CLEANING_UP state
    2- Request the dependencies from dependency manager
        - if any are failed, move to CLEANING_UP state
    3- If all dependencies and docker image are ready:
        - Set up the local filesystem for the run
        - Create symlinks to dependencies
        - Allocate resources and prepare the docker container
        - Start the docker container
    4- If all is successful, move to RUNNING state

    :param run_state: run-state namedtuple for the bundle being prepared
    :return: a new run-state namedtuple (original is never mutated in place,
        except for its mutable `info` dict)
    """
    if run_state.is_killed:
        return run_state._replace(stage=LocalRunStage.CLEANING_UP, container_id=None)

    dependencies_ready = True
    status_messages = []
    bundle_uuid = run_state.bundle['uuid']

    # get dependencies
    for dep in run_state.bundle['dependencies']:
        dependency = (dep['parent_uuid'], dep['parent_path'])
        dependency_state = self.dependency_manager.get(bundle_uuid, dependency)
        if dependency_state.stage == DependencyStage.DOWNLOADING:
            status_messages.append(
                'Downloading dependency %s: %s done (archived size)'
                % (dep['child_path'], size_str(dependency_state.size_bytes))
            )
            dependencies_ready = False
        elif dependency_state.stage == DependencyStage.FAILED:
            # Failed to download dependency; -> CLEANING_UP
            # BUGFIX: the original formatted an empty string ('') into the second
            # placeholder; surface the dependency manager's failure message instead.
            run_state.info['failure_message'] = 'Failed to download dependency %s: %s' % (
                dep['child_path'],
                dependency_state.message,
            )
            return run_state._replace(stage=LocalRunStage.CLEANING_UP, info=run_state.info)

    # get the docker image
    docker_image = run_state.resources['docker_image']
    image_state = self.docker_image_manager.get(docker_image)
    if image_state.stage == DependencyStage.DOWNLOADING:
        status_messages.append(
            'Pulling docker image: ' + (image_state.message or docker_image or "")
        )
        dependencies_ready = False
    elif image_state.stage == DependencyStage.FAILED:
        # Failed to pull image; -> CLEANING_UP
        run_state.info['failure_message'] = image_state.message
        return run_state._replace(stage=LocalRunStage.CLEANING_UP, info=run_state.info)

    # stop proceeding if dependency and image downloads aren't all done
    if not dependencies_ready:
        status_message = status_messages.pop()
        if status_messages:
            status_message += "(and downloading %d other dependencies and docker images)" % len(
                status_messages
            )
        return run_state._replace(run_status=status_message)

    # All dependencies ready! Set up directories, symlinks and container. Start container.
    # 1) Set up a directory to store the bundle.
    remove_path(run_state.bundle_path)
    os.mkdir(run_state.bundle_path)

    # 2) Set up symlinks
    dependencies = []
    docker_dependencies_path = '/' + bundle_uuid + '_dependencies'
    for dep in run_state.bundle['dependencies']:
        child_path = os.path.normpath(os.path.join(run_state.bundle_path, dep['child_path']))
        # Reject keys that escape the bundle directory (e.g. '../..')
        if not child_path.startswith(run_state.bundle_path):
            raise Exception('Invalid key for dependency: %s' % (dep['child_path']))
        dependency_path = self.dependency_manager.get(
            bundle_uuid, (dep['parent_uuid'], dep['parent_path'])
        ).path
        dependency_path = os.path.join(
            self.dependency_manager.dependencies_dir, dependency_path
        )
        docker_dependency_path = os.path.join(docker_dependencies_path, dep['child_path'])
        os.symlink(docker_dependency_path, child_path)
        # These are turned into docker volume bindings like:
        #   dependency_path:docker_dependency_path:ro
        dependencies.append((dependency_path, docker_dependency_path))

    # 3) Set up container
    if run_state.resources['request_network']:
        docker_network = self.docker_network_external.name
    else:
        docker_network = self.docker_network_internal.name

    try:
        cpuset, gpuset = self.assign_cpu_and_gpu_sets_fn(
            run_state.resources['request_cpus'], run_state.resources['request_gpus']
        )
    except Exception:
        # Broad catch is deliberate: any allocation failure means the run
        # cannot proceed right now.
        run_state.info['failure_message'] = "Cannot assign enough resources"
        return run_state._replace(stage=LocalRunStage.CLEANING_UP, info=run_state.info)

    # 4) Start container
    try:
        container = docker_utils.start_bundle_container(
            run_state.bundle_path,
            bundle_uuid,
            dependencies,
            run_state.bundle['command'],
            run_state.resources['docker_image'],
            network=docker_network,
            cpuset=cpuset,
            gpuset=gpuset,
            memory_bytes=run_state.resources['request_memory'],
            runtime=self.docker_runtime,
        )
        # Attach the new container to the worker's docker network as well.
        self.worker_docker_network.connect(container)
    except docker_utils.DockerException as e:
        run_state.info['failure_message'] = 'Cannot start Docker container: {}'.format(e)
        return run_state._replace(stage=LocalRunStage.CLEANING_UP, info=run_state.info)

    return run_state._replace(
        stage=LocalRunStage.RUNNING,
        start_time=time.time(),
        run_status='Running job in Docker container',
        container_id=container.id,
        container=container,
        docker_image=image_state.digest,
        has_contents=True,
        cpuset=cpuset,
        gpuset=gpuset,
    )