def un_tar_directory(fileobj, directory_path, compression='', force=False):
    """
    Extract a tar archive read from a file-like object into a new directory.

    The target directory is created by this function and should not already
    exist. If it already exists and `force` is `False`, `os.mkdir` raises an
    error. If it already exists and `force` is `True`, the directory is
    removed and recreated.

    :param fileobj: file-like object containing the tar archive (read as a stream).
    :param directory_path: directory to create and extract into.
    :param compression: compression scheme; one of '', 'gz' or 'bz2'.
    :param force: remove an existing `directory_path` before extracting.

    Raises tarfile.TarError if the archive is not valid or if a member would
    be extracted outside of `directory_path`.
    """
    directory_path = os.path.realpath(directory_path)
    if force:
        from codalab.worker.file_util import remove_path

        remove_path(directory_path)
    os.mkdir(directory_path)

    # Prefix that every extracted path below the target must start with. The
    # trailing separator matters: a bare startswith(directory_path) would also
    # accept sibling paths such as "<directory_path>-evil/...".
    directory_prefix = os.path.join(directory_path, '')

    with tarfile.open(fileobj=fileobj, mode='r|' + compression) as tar:
        for member in tar:
            # Make sure that there is no trickery going on (see note in
            # TarFile.extractall() documentation).
            member_path = os.path.realpath(os.path.join(directory_path, member.name))
            inside_target = member_path == directory_path or member_path.startswith(
                directory_prefix
            )
            if not inside_target:
                raise tarfile.TarError('Archive member extracts outside the directory.')
            tar.extract(member, directory_path)
def _try_start_bundle(self, workers, worker, bundle):
    """
    Attempt to start `bundle` on `worker`.

    Returns True when the run message was delivered to the worker, and False
    when the bundle could not be marked as starting or the message could not
    be sent (in which case the bundle is restaged).
    """
    marked_starting = self._model.set_starting_bundle(
        bundle, worker['user_id'], worker['worker_id']
    )
    if not marked_starting:
        return False

    workers.set_starting(bundle.uuid, worker)

    if self._worker_model.shared_file_system and worker['user_id'] == self._model.root_user_id:
        # On a shared file system we create the path here to avoid NFS
        # directory cache issues.
        bundle_dir = self._bundle_store.get_bundle_location(bundle.uuid)
        remove_path(bundle_dir)
        os.mkdir(bundle_dir)

    socket_id = worker['socket_id']
    run_message = self._construct_run_message(worker, bundle)
    if self._worker_model.send_json_message(socket_id, run_message, 0.2):
        logger.info('Starting run bundle %s', bundle.uuid)
        return True

    # The worker did not accept the message: undo the starting state.
    self._model.restage_bundle(bundle)
    workers.restage(bundle.uuid)
    return False
def _try_start_bundle(self, workers, worker, bundle, bundle_resources):
    """
    Attempt to start `bundle` on `worker`.

    Returns False when the worker's user lacks run permission on the bundle,
    when the bundle cannot be transitioned to starting, or when the run
    message cannot be delivered (the bundle is then moved back to staged).
    Returns True once the run message has been sent.
    """
    worker_user = self._model.get_user(worker['user_id'])
    if not check_bundle_have_run_permission(self._model, worker_user, bundle):
        return False
    if not self._model.transition_bundle_starting(
        bundle, worker['user_id'], worker['worker_id']
    ):
        return False

    workers.set_starting(bundle.uuid, worker['worker_id'])

    if worker['shared_file_system']:
        # On a shared file system we create the path here to avoid NFS
        # directory cache issues.
        # TODO(Ashwin): fix for --link
        bundle_dir = self._bundle_store.get_bundle_location(bundle.uuid)
        remove_path(bundle_dir)
        os.mkdir(bundle_dir)

    socket_id = worker['socket_id']
    run_message = self._construct_run_message(
        worker['shared_file_system'], bundle, bundle_resources
    )
    if self._worker_model.send_json_message(socket_id, run_message, 0.2):
        logger.info(
            'Starting run bundle {} on worker {}'.format(bundle.uuid, worker['worker_id'])
        )
        return True

    # Delivery failed: put the bundle back so it can be scheduled again.
    self._model.transition_bundle_staged(bundle)
    workers.restage(bundle.uuid)
    return False
def test_empty(self):
    """Archiving then unarchiving an empty directory yields an empty directory."""
    source_dir = tempfile.mkdtemp()
    self.addCleanup(remove_path, source_dir)
    scratch_dir = tempfile.mkdtemp()
    self.addCleanup(remove_path, scratch_dir)

    destination = os.path.join(scratch_dir, 'output')
    packed = self.archive(source_dir)
    self.unarchive(packed, destination, 'gz')

    extracted_entries = os.listdir(destination)
    self.assertEqual(extracted_entries, [])
def test_tar_empty(self):
    """A tar.gz of an empty directory extracts to an empty directory."""
    source_dir = tempfile.mkdtemp()
    self.addCleanup(remove_path, source_dir)
    scratch_dir = tempfile.mkdtemp()
    self.addCleanup(remove_path, scratch_dir)

    destination = os.path.join(scratch_dir, 'output')
    packed = tar_gzip_directory(source_dir)
    un_tar_directory(packed, destination, 'gz')

    extracted_entries = os.listdir(destination)
    self.assertEqual(extracted_entries, [])
def _make_bundle(self, bundle):
    """
    Build the contents of a make bundle by copying its dependencies into place.

    Each dependency is validated (it must resolve inside its parent bundle and
    exist on disk, and its target must resolve inside this bundle), the bundle
    location is cleared, and the dependencies are copied in. On success the
    bundle's disk metadata is updated and its state set to READY; on any
    exception the bundle is marked FAILED with the exception text as the
    failure message. The bundle uuid is always removed from `self._make_uuids`.
    """

    def is_within(root, candidate):
        # True when `candidate` is `root` itself or lies underneath it. A plain
        # startswith(root) would also accept siblings such as "<root>-other".
        return candidate == root or candidate.startswith(os.path.join(root, ''))

    try:
        bundle_location = self._bundle_store.get_bundle_location(bundle.uuid)
        path = os.path.normpath(bundle_location)

        deps = []
        for dep in bundle.dependencies:
            parent_bundle_path = os.path.normpath(
                self._bundle_store.get_bundle_location(dep.parent_uuid)
            )
            dependency_path = os.path.normpath(
                os.path.join(parent_bundle_path, dep.parent_path)
            )
            # A dangling symlink is acceptable (islink), a missing path is not.
            if not is_within(parent_bundle_path, dependency_path) or (
                not os.path.islink(dependency_path) and not os.path.exists(dependency_path)
            ):
                raise Exception(
                    'Invalid dependency %s'
                    % (path_util.safe_join(dep.parent_uuid, dep.parent_path))
                )

            child_path = os.path.normpath(os.path.join(path, dep.child_path))
            if not is_within(path, child_path):
                raise Exception('Invalid key for dependency: %s' % (dep.child_path))

            deps.append((dependency_path, child_path))

        remove_path(path)

        if len(deps) == 1 and deps[0][1] == path:
            # A single dependency mapped onto the bundle root becomes the bundle itself.
            path_util.copy(deps[0][0], path, follow_symlinks=False)
        else:
            os.mkdir(path)
            for dependency_path, child_path in deps:
                path_util.copy(dependency_path, child_path, follow_symlinks=False)

        self._model.update_disk_metadata(bundle, bundle_location, enforce_disk_quota=True)
        logger.info('Finished making bundle %s', bundle.uuid)
        self._model.update_bundle(bundle, {'state': State.READY})
    except Exception as e:
        logger.info('Failing bundle %s: %s', bundle.uuid, str(e))
        self._model.update_bundle(
            bundle, {'state': State.FAILED, 'metadata': {'failure_message': str(e)}}
        )
    finally:
        with self._make_uuids_lock:
            self._make_uuids.remove(bundle.uuid)
def _transition_from_FINALIZING(self, run_state):
    """
    Move a finalized run on to FINISHED.

    While `run_state.finalized` is false the state is returned unchanged.
    Once it is set, the bundle path is removed (unless the worker is on a
    shared file system) and the run moves to the FINISHED stage.
    """
    if not run_state.finalized:
        return run_state

    if not self.shared_file_system:
        # don't remove bundle if shared FS
        remove_path(run_state.bundle_path)
    return run_state._replace(stage=RunStage.FINISHED, run_status='Finished')
def _transition_from_CLEANING_UP(self, run_state):
    """
    Clean up after a run and pick the next stage.

    1- delete the container if still existent
    2- clean up the dependencies from bundle directory
    3- release the dependencies in dependency manager
    4- If bundle has contents to upload (i.e. was RUNNING at some point), move to UPLOADING_RESULTS state
       Otherwise move to FINALIZING state

    Returns the updated run state: RESTAGED when `run_state.is_restaged` is
    set, UPLOADING_RESULTS when not on a shared file system and the run has
    contents, otherwise whatever `self.finalize_run` returns.
    """
    if run_state.container_id is not None:
        # Keep trying until the container no longer exists or has been removed.
        while docker_utils.container_exists(run_state.container):
            try:
                finished, _, _ = docker_utils.check_finished(run_state.container)
                if finished:
                    run_state.container.remove(force=True)
                    run_state = run_state._replace(container=None, container_id=None)
                    break
                else:
                    # Still running: ask Docker to kill it, then poll again
                    # after a short pause.
                    try:
                        run_state.container.kill()
                    except docker.errors.APIError:
                        logger.error(traceback.format_exc())
                    time.sleep(1)
            except docker.errors.APIError:
                # Docker API failure while checking/removing: log and retry.
                logger.error(traceback.format_exc())
                time.sleep(1)

    for dep in run_state.bundle.dependencies:
        dep_key = DependencyKey(dep.parent_uuid, dep.parent_path)
        if not self.shared_file_system:  # No dependencies if shared fs worker
            self.dependency_manager.release(run_state.bundle.uuid, dep_key)

        # Remove the dependency's entry from the bundle directory; failures
        # are logged and do not abort the cleanup.
        child_path = os.path.join(run_state.bundle_path, dep.child_path)
        try:
            remove_path(child_path)
        except Exception:
            logger.error(traceback.format_exc())

    if run_state.is_restaged:
        return run_state._replace(stage=RunStage.RESTAGED)

    if not self.shared_file_system and run_state.has_contents:
        # Results exist only on this worker, so they have to be uploaded.
        return run_state._replace(
            stage=RunStage.UPLOADING_RESULTS, run_status='Uploading results', container=None
        )
    else:
        # No need to upload results since results are directly written to bundle store
        return self.finalize_run(run_state)
def _delete_dependency(self, dependency_key):
    """
    Drop a dependency from the manager's state and delete its files.

    Does nothing when `self._acquire_if_exists` reports that the dependency
    could not be acquired. Errors while removing the path are ignored; the
    state entry is deleted and the dependency lock released regardless.
    """
    if not self._acquire_if_exists(dependency_key):
        return

    try:
        dependency_path = self._dependencies[dependency_key].path
        self._paths.remove(dependency_path)
        remove_path(dependency_path)
    except Exception:
        # Best effort: the state entry is removed below either way.
        pass
    finally:
        del self._dependencies[dependency_key]
        self._dependency_locks[dependency_key].release()
def _transition_from_FINALIZING(self, run_state):
    """
    Leave the FINALIZING stage.

    A restaged run moves to RESTAGED. Otherwise, once `run_state.finalized`
    is set, the bundle path is removed (unless the worker is on a shared file
    system) and the run moves to FINISHED. Until then the state is returned
    unchanged.
    """
    if run_state.is_restaged:
        return run_state._replace(stage=RunStage.RESTAGED)

    if not run_state.finalized:
        return run_state

    if not self.shared_file_system:
        # don't remove bundle if shared FS
        remove_path(run_state.bundle_path)
    return run_state._replace(stage=RunStage.FINISHED, run_status='Finished')
def _sync_state(self):
    """
    Reconcile the in-memory dependency state with the dependencies directory.

    The two sources being reconciled are:
        1. self._dependencies, self._dependency_locks and self._paths
        2. the entries found directly under self.dependencies_dir

    Only what appears in both survives: state entries whose path is not on
    disk are dropped, and directories on disk that are not in the state are
    deleted. The resulting state is saved via self._save_state().
    """
    on_disk = set(os.listdir(self.dependencies_dir))
    state_paths = [state.path for state in self._dependencies.values()]

    # Keep only paths known to the state, to the loaded path set and to the disk.
    self._paths = self._paths.intersection(state_paths).intersection(on_disk)

    # Drop state entries (and their locks) whose path did not survive.
    stale_keys = [
        key for key, state in self._dependencies.items() if state.path not in self._paths
    ]
    for key in stale_keys:
        missing_path = os.path.join(self.dependencies_dir, self._dependencies[key].path)
        logger.info(
            "Dependency {} in dependency state but its path {} doesn't exist on the local file system. "
            "Removing it from dependency state.".format(key, missing_path)
        )
        del self._dependencies[key]
        del self._dependency_locks[key]

    # Delete directories on disk that the state does not know about.
    for orphan in on_disk - self._paths:
        full_path = os.path.join(self.dependencies_dir, orphan)
        logger.info(
            "Remove orphaned directory {} from the local file system.".format(full_path)
        )
        remove_path(full_path)

    # The state may have changed above, so write it back to the state file.
    self._save_state()
def _transition_from_FINALIZING(self, run_state):
    """
    Leave the FINALIZING stage.

    A restaged run is logged and moved to RESTAGED. Otherwise, once
    `run_state.finalized` is set, the bundle path is removed (unless the
    worker is on a shared file system) and the run moves to FINISHED. Until
    then the state is returned unchanged.
    """
    if run_state.is_restaged:
        log_bundle_transition(
            bundle_uuid=run_state.bundle.uuid,
            previous_stage=run_state.stage,
            next_stage=RunStage.RESTAGED,
            reason='the bundle is restaged, as `pass-down-termination` is specified for worker',
        )
        return run_state._replace(stage=RunStage.RESTAGED)

    if not run_state.finalized:
        return run_state

    if not self.shared_file_system:
        # don't remove bundle if shared FS
        remove_path(run_state.bundle_path)
    return run_state._replace(stage=RunStage.FINISHED, run_status='Finished')
def test_always_ignore(self):
    """'._*' and '__MACOSX' entries are absent after an archive round trip."""
    scratch_dir = tempfile.mkdtemp()
    self.addCleanup(remove_path, scratch_dir)

    extracted = os.path.join(scratch_dir, 'output')
    packed = self.archive(IGNORE_TEST_DIR)
    self.unarchive(packed, extracted, 'gz')

    top_level = os.listdir(extracted)
    self.assertNotIn('._ignored', top_level)
    self.assertIn('dir', top_level)
    self.assertNotIn('__MACOSX', top_level)

    nested_dir = os.path.join(extracted, 'dir')
    for ignored_name in ('__MACOSX', '._ignored2'):
        self.assertFalse(os.path.exists(os.path.join(nested_dir, ignored_name)))
def _delete_dependency(self, dep_key, dependencies, paths):
    """
    Remove the given dependency from the manager's state.

    Modifies the `dependencies` and `paths` collections that are passed in,
    and also deletes the dependency's known files from the filesystem.
    Removal is best-effort: a failure is logged and the dependency is still
    dropped from `dependencies`. Nothing happens if `dep_key` is not present.

    NOT NFS-SAFE - Caller should acquire self._state_lock before calling this method.

    :param dep_key: key of the dependency to delete.
    :param dependencies: mapping of dependency key to dependency state.
    :param paths: collection of dependency paths; the deleted path is removed from it.
    """
    assert self._state_lock.is_locked
    if dep_key in dependencies:
        try:
            path_to_remove = dependencies[dep_key].path
            paths.remove(path_to_remove)
            # Deletes dependency content from disk
            remove_path(path_to_remove)
        except Exception:
            # Keep going (the state entry is dropped below), but leave a trace
            # so leftover files on disk can be diagnosed.
            logger.exception(f"Failed to remove files of dependency {dep_key}.")
        finally:
            del dependencies[dep_key]
            logger.info(f"Deleted dependency {dep_key}.")
def test_exclude_ignore(self):
    """Entries excluded via the '.tarignore' ignore file are absent after a round trip."""
    scratch_dir = tempfile.mkdtemp()
    self.addCleanup(remove_path, scratch_dir)

    extracted = os.path.join(scratch_dir, 'output')
    packed = self.archive(IGNORE_TEST_DIR, ignore_file='.tarignore')
    self.unarchive(packed, extracted, 'gz')

    top_level = os.listdir(extracted)
    self.assertIn('not_ignored.txt', top_level)
    self.assertIn('dir', top_level)
    self.assertNotIn('ignored.txt', top_level)
    self.assertNotIn('ignored_dir', top_level)

    nested_dir = os.path.join(extracted, 'dir')
    kept_file = os.path.join(nested_dir, 'not_ignored2.txt')
    dropped_file = os.path.join(nested_dir, 'ignored2.txt')
    self.assertTrue(os.path.exists(kept_file))
    self.assertFalse(os.path.exists(dropped_file))
def test_has_files(self):
    """Round-trip FILES_DIR through archive/unarchive and check which entries survive."""
    scratch_dir = tempfile.mkdtemp()
    self.addCleanup(remove_path, scratch_dir)

    extracted = os.path.join(scratch_dir, 'output')
    packed = self.archive(FILES_DIR, False, ['f2'], ['f1', 'b.txt'])
    self.unarchive(packed, extracted, 'gz')

    top_level = os.listdir(extracted)
    self.assertIn('dir1', top_level)
    self.assertIn('a.txt', top_level)
    self.assertNotIn('b.txt', top_level)

    nested_dir = os.path.join(extracted, 'dir1')
    self.assertTrue(os.path.exists(os.path.join(nested_dir, 'f1')))
    self.assertFalse(os.path.exists(os.path.join(nested_dir, 'f2')))

    symlink_path = os.path.join(extracted, 'a-symlink.txt')
    self.assertTrue(os.path.islink(symlink_path))
def test_tar_always_ignore(self):
    """'._*' and '__MACOSX' entries are absent after a tar.gz round trip."""
    grandparent_dir = os.path.dirname(os.path.dirname(__file__))
    fixture_dir = os.path.join(grandparent_dir, 'files/ignore_test')

    scratch_dir = tempfile.mkdtemp()
    self.addCleanup(remove_path, scratch_dir)

    extracted = os.path.join(scratch_dir, 'output')
    packed = tar_gzip_directory(fixture_dir)
    un_tar_directory(packed, extracted, 'gz')

    top_level = os.listdir(extracted)
    self.assertNotIn('._ignored', top_level)
    self.assertIn('dir', top_level)
    self.assertNotIn('__MACOSX', top_level)

    nested_dir = os.path.join(extracted, 'dir')
    for ignored_name in ('__MACOSX', '._ignored2'):
        self.assertFalse(os.path.exists(os.path.join(nested_dir, ignored_name)))
def test_tar_has_files(self):
    """Round-trip the 'files' fixture through tar.gz and check which entries survive."""
    grandparent_dir = os.path.dirname(os.path.dirname(__file__))
    fixture_dir = os.path.join(grandparent_dir, 'files')

    scratch_dir = tempfile.mkdtemp()
    self.addCleanup(remove_path, scratch_dir)

    extracted = os.path.join(scratch_dir, 'output')
    packed = tar_gzip_directory(fixture_dir, False, ['f2'], ['f1', 'b.txt'])
    un_tar_directory(packed, extracted, 'gz')

    top_level = os.listdir(extracted)
    self.assertIn('dir1', top_level)
    self.assertIn('a.txt', top_level)
    self.assertNotIn('b.txt', top_level)

    nested_dir = os.path.join(extracted, 'dir1')
    self.assertTrue(os.path.exists(os.path.join(nested_dir, 'f1')))
    self.assertFalse(os.path.exists(os.path.join(nested_dir, 'f2')))

    symlink_path = os.path.join(extracted, 'a-symlink.txt')
    self.assertTrue(os.path.islink(symlink_path))
def test_tar_exclude_ignore(self):
    """Entries excluded via the '.tarignore' ignore file are absent after a tar.gz round trip."""
    grandparent_dir = os.path.dirname(os.path.dirname(__file__))
    fixture_dir = os.path.join(grandparent_dir, 'files/ignore_test')

    scratch_dir = tempfile.mkdtemp()
    self.addCleanup(remove_path, scratch_dir)

    extracted = os.path.join(scratch_dir, 'output')
    packed = tar_gzip_directory(fixture_dir, ignore_file='.tarignore')
    un_tar_directory(packed, extracted, 'gz')

    top_level = os.listdir(extracted)
    self.assertIn('not_ignored.txt', top_level)
    self.assertIn('dir', top_level)
    self.assertNotIn('ignored.txt', top_level)
    self.assertNotIn('ignored_dir', top_level)

    nested_dir = os.path.join(extracted, 'dir')
    kept_file = os.path.join(nested_dir, 'not_ignored2.txt')
    dropped_file = os.path.join(nested_dir, 'ignored2.txt')
    self.assertTrue(os.path.exists(kept_file))
    self.assertFalse(os.path.exists(dropped_file))
def _make_bundle(self, bundle):
    """
    Build the contents of a make bundle by copying its dependencies into place.

    The bundle location is the bundle's `link_url` metadata when set,
    otherwise the bundle store location; the same rule is applied to each
    parent bundle. Each dependency is validated (it must resolve inside its
    parent bundle and exist on disk, and its target must resolve inside this
    bundle), the bundle location is cleared, and the dependencies are copied
    in. On success the disk metadata is updated and the state set to READY;
    on any exception the bundle is marked FAILED with the exception text and
    traceback recorded in its metadata. The bundle uuid is always removed
    from `self._make_uuids`.
    """

    def is_within(root, candidate):
        # True when `candidate` is `root` itself or lies underneath it. A plain
        # startswith(root) would also accept siblings such as "<root>-other".
        return candidate == root or candidate.startswith(os.path.join(root, ''))

    try:
        bundle_link_url = getattr(bundle.metadata, "link_url", None)
        bundle_location = bundle_link_url or self._bundle_store.get_bundle_location(
            bundle.uuid
        )
        path = os.path.normpath(bundle_location)

        deps = []
        parent_bundle_link_urls = self._model.get_bundle_metadata(
            [dep.parent_uuid for dep in bundle.dependencies], "link_url"
        )
        for dep in bundle.dependencies:
            parent_bundle_link_url = parent_bundle_link_urls.get(dep.parent_uuid)
            try:
                parent_bundle_path = parent_bundle_link_url or os.path.normpath(
                    self._bundle_store.get_bundle_location(dep.parent_uuid)
                )
            except NotFoundError:
                raise Exception(
                    'Invalid dependency %s'
                    % (path_util.safe_join(dep.parent_uuid, dep.parent_path))
                )

            # TODO(Ashwin): make this logic non-fs specific.
            dependency_path = os.path.normpath(
                os.path.join(parent_bundle_path, dep.parent_path)
            )
            # A dangling symlink is acceptable (islink), a missing path is not.
            if not is_within(parent_bundle_path, dependency_path) or (
                not os.path.islink(dependency_path) and not os.path.exists(dependency_path)
            ):
                raise Exception(
                    'Invalid dependency %s'
                    % (path_util.safe_join(dep.parent_uuid, dep.parent_path))
                )

            child_path = os.path.normpath(os.path.join(path, dep.child_path))
            if not is_within(path, child_path):
                raise Exception('Invalid key for dependency: %s' % (dep.child_path))

            deps.append((dependency_path, child_path))

        remove_path(path)

        if len(deps) == 1 and deps[0][1] == path:
            # A single dependency mapped onto the bundle root becomes the bundle itself.
            path_util.copy(deps[0][0], path, follow_symlinks=False)
        else:
            os.mkdir(path)
            for dependency_path, child_path in deps:
                path_util.copy(dependency_path, child_path, follow_symlinks=False)

        # TODO(Ashwin): fix
        self._model.update_disk_metadata(bundle, bundle_location, enforce_disk_quota=True)
        logger.info('Finished making bundle %s', bundle.uuid)
        self._model.update_bundle(bundle, {'state': State.READY})
    except Exception as e:
        logger.info('Failing bundle %s: %s', bundle.uuid, str(e))
        self._model.update_bundle(
            bundle,
            {
                'state': State.FAILED,
                'metadata': {
                    'failure_message': str(e),
                    'error_traceback': traceback.format_exc(),
                },
            },
        )
    finally:
        with self._make_uuids_lock:
            self._make_uuids.remove(bundle.uuid)
def remove_path_no_fail(path):
    """Remove `path`; if removal raises, log the traceback instead of propagating."""
    try:
        remove_path(path)
    except Exception:
        failure_trace = traceback.format_exc()
        logger.error(failure_trace)
def _transition_from_PREPARING(self, run_state):
    """
    Prepare a run and start its container.

    1- Request the docker image from docker image manager
        - if image is failed, move to CLEANING_UP state
    2- Request the dependencies from dependency manager
        - if any are failed, move to CLEANING_UP state
    3- If all dependencies and docker image are ready:
        - Set up the local filesystem for the run
        - Create symlinks to dependencies
        - Allocate resources and prepare the docker container
        - Start the docker container
    4- If all is successful, move to RUNNING state

    Returns the updated run state. The stage is left unchanged (only
    `run_status` is updated) while resources, downloads or the shared bundle
    directory are still pending. Unexpected errors from starting the
    container are logged and re-raised.
    """

    def mount_dependency(dependency, shared_file_system):
        # Registers one dependency for mounting; appends to the enclosing
        # `docker_dependencies` list, which is defined before this is called.
        if not shared_file_system:
            # Set up symlinks for the content at dependency path
            Path(dependency.child_path).parent.mkdir(parents=True, exist_ok=True)
            os.symlink(dependency.docker_path, dependency.child_path)
        # The following will be converted into a Docker volume binding like:
        #   dependency_path:docker_dependency_path:ro
        docker_dependencies.append((dependency.parent_path, dependency.docker_path))

    if run_state.is_killed or run_state.is_restaged:
        log_bundle_transition(
            bundle_uuid=run_state.bundle.uuid,
            previous_stage=run_state.stage,
            next_stage=RunStage.CLEANING_UP,
            reason=f'the bundle was {"killed" if run_state.is_killed else "restaged"}',
        )
        return run_state._replace(stage=RunStage.CLEANING_UP)

    # Check CPU and GPU availability
    try:
        cpuset, gpuset = self.assign_cpu_and_gpu_sets_fn(
            run_state.resources.cpus, run_state.resources.gpus
        )
    except Exception as e:
        # Stay in the current stage; only the status message is updated.
        message = "Unexpectedly unable to assign enough resources to bundle {}: {}".format(
            run_state.bundle.uuid, str(e)
        )
        logger.error(message)
        logger.error(traceback.format_exc())
        return run_state._replace(run_status=message)

    dependencies_ready = True
    status_messages = []

    if not self.shared_file_system:
        # No need to download dependencies if we're in the shared FS,
        # since they're already in our FS
        for dep in run_state.bundle.dependencies:
            dep_key = DependencyKey(dep.parent_uuid, dep.parent_path)
            dependency_state = self.dependency_manager.get(run_state.bundle.uuid, dep_key)
            if dependency_state.stage == DependencyStage.DOWNLOADING:
                status_messages.append(
                    'Downloading dependency %s: %s done (archived size)'
                    % (dep.child_path, size_str(dependency_state.size_bytes))
                )
                dependencies_ready = False
            elif dependency_state.stage == DependencyStage.FAILED:
                # Failed to download dependency; -> CLEANING_UP
                log_bundle_transition(
                    bundle_uuid=run_state.bundle.uuid,
                    previous_stage=run_state.stage,
                    next_stage=RunStage.CLEANING_UP,
                    reason=f'Dependency has failed for this bundle. Dependency child uuid: {dep.child_uuid}. Dependency child path: {dep.child_path}',
                )
                return run_state._replace(
                    stage=RunStage.CLEANING_UP,
                    failure_message='Failed to download dependency %s: %s'
                    % (dep.child_path, dependency_state.message),
                )

    # get the docker image
    docker_image = run_state.resources.docker_image
    image_state = self.image_manager.get(docker_image)
    if image_state.stage == DependencyStage.DOWNLOADING:
        status_messages.append(
            'Pulling docker image %s %s' % (docker_image, image_state.message)
        )
        dependencies_ready = False
    elif image_state.stage == DependencyStage.FAILED:
        # Failed to pull image; -> CLEANING_UP
        message = 'Failed to download Docker image: %s' % image_state.message
        logger.error(message)
        return run_state._replace(stage=RunStage.CLEANING_UP, failure_message=message)

    # stop proceeding if dependency and image downloads aren't all done
    if not dependencies_ready:
        # Report the most recently added message and summarize the rest by count.
        status_message = status_messages.pop()
        if status_messages:
            status_message += "(and downloading %d other dependencies and docker images)" % len(
                status_messages
            )
        logger.info(
            f'bundle is not ready yet. uuid: {run_state.bundle.uuid}. status message: {status_message}'
        )
        return run_state._replace(run_status=status_message)

    # All dependencies ready! Set up directories, symlinks and container. Start container.
    # 1) Set up a directory to store the bundle.
    if self.shared_file_system:
        if not os.path.exists(run_state.bundle_path):
            # The directory is not created here on a shared file system; wait
            # for it to appear, giving up once the retry counter reaches zero.
            if run_state.bundle_dir_wait_num_tries == 0:
                message = (
                    "Bundle directory cannot be found on the shared filesystem. "
                    "Please ensure the shared fileystem between the server and "
                    "your worker is mounted properly or contact your administrators."
                )
                log_bundle_transition(
                    bundle_uuid=run_state.bundle.uuid,
                    previous_stage=run_state.stage,
                    next_stage=RunStage.CLEANING_UP,
                    reason="Bundle directory cannot be found on the shared filesystem.",
                )
                return run_state._replace(stage=RunStage.CLEANING_UP, failure_message=message)
            next_bundle_dir_wait_num_tries = run_state.bundle_dir_wait_num_tries - 1
            logger.info(
                f'Waiting for bundle directory to be created by the server, uuid: {run_state.bundle.uuid}, bundle_dir_wait_num_tries: {next_bundle_dir_wait_num_tries}'
            )
            return run_state._replace(
                run_status="Waiting for bundle directory to be created by the server",
                bundle_dir_wait_num_tries=next_bundle_dir_wait_num_tries,
            )
    else:
        # Start from a fresh, empty bundle directory.
        remove_path(run_state.bundle_path)
        os.makedirs(run_state.bundle_path)

    # 2) Set up symlinks
    docker_dependencies = []
    docker_dependencies_path = (
        RunStateMachine._ROOT
        + run_state.bundle.uuid
        + ('_dependencies' if not self.shared_file_system else '')
    )

    for dep in run_state.bundle.dependencies:
        full_child_path = os.path.normpath(os.path.join(run_state.bundle_path, dep.child_path))
        to_mount = []
        dependency_path = self._get_dependency_path(run_state, dep)

        if dep.child_path == RunStateMachine._CURRENT_DIRECTORY:
            # Mount all the content of the dependency_path to the top-level of the bundle
            for child in os.listdir(dependency_path):
                child_path = os.path.normpath(os.path.join(run_state.bundle_path, child))
                to_mount.append(
                    DependencyToMount(
                        docker_path=os.path.join(docker_dependencies_path, child),
                        child_path=child_path,
                        parent_path=os.path.join(dependency_path, child),
                    )
                )
                # Record each mounted path in paths_to_remove on the run state.
                run_state = run_state._replace(
                    paths_to_remove=(run_state.paths_to_remove or []) + [child_path]
                )
        else:
            to_mount.append(
                DependencyToMount(
                    docker_path=os.path.join(docker_dependencies_path, dep.child_path),
                    child_path=full_child_path,
                    parent_path=dependency_path,
                )
            )

            first_element_of_path = Path(dep.child_path).parts[0]
            if first_element_of_path == RunStateMachine._ROOT:
                run_state = run_state._replace(
                    paths_to_remove=(run_state.paths_to_remove or []) + [full_child_path]
                )
            else:
                # child_path can be a nested path, so later remove everything from the first element of the path
                path_to_remove = os.path.join(run_state.bundle_path, first_element_of_path)
                run_state = run_state._replace(
                    paths_to_remove=(run_state.paths_to_remove or []) + [path_to_remove]
                )

        for dependency in to_mount:
            try:
                mount_dependency(dependency, self.shared_file_system)
            except OSError as e:
                # Could not create the directory/symlink for this dependency.
                log_bundle_transition(
                    bundle_uuid=run_state.bundle.uuid,
                    previous_stage=run_state.stage,
                    next_stage=RunStage.CLEANING_UP,
                    reason=str(e.__class__),
                    level=logging.ERROR,
                )
                return run_state._replace(stage=RunStage.CLEANING_UP, failure_message=str(e))

    if run_state.resources.network:
        docker_network = self.docker_network_external.name
    else:
        docker_network = self.docker_network_internal.name

    # 3) Start container
    try:
        container = docker_utils.start_bundle_container(
            run_state.bundle_path,
            run_state.bundle.uuid,
            docker_dependencies,
            run_state.bundle.command,
            run_state.resources.docker_image,
            network=docker_network,
            cpuset=cpuset,
            gpuset=gpuset,
            memory_bytes=run_state.resources.memory,
            runtime=self.docker_runtime,
        )
        self.worker_docker_network.connect(container)
    except docker_utils.DockerUserErrorException as e:
        # This exception type fails the bundle instead of propagating.
        message = 'Cannot start Docker container: {}'.format(e)
        log_bundle_transition(
            bundle_uuid=run_state.bundle.uuid,
            previous_stage=run_state.stage,
            next_stage=RunStage.CLEANING_UP,
            reason='Cannot start Docker container.',
            level=logging.ERROR,
        )
        return run_state._replace(stage=RunStage.CLEANING_UP, failure_message=message)
    except Exception as e:
        # Anything else is logged and re-raised to the caller.
        message = 'Cannot start container: {}'.format(e)
        logger.error(message)
        logger.error(traceback.format_exc())
        raise

    return run_state._replace(
        stage=RunStage.RUNNING,
        run_status='Running job in container',
        container_id=container.id,
        container=container,
        docker_image=image_state.digest,
        has_contents=True,
        cpuset=cpuset,
        gpuset=gpuset,
    )
def _transition_from_CLEANING_UP(self, run_state):
    """
    Clean up after a run and pick the next stage.

    1- delete the container if still existent
    2- clean up the dependencies from bundle directory
    3- release the dependencies in dependency manager
    4- If bundle has contents to upload (i.e. was RUNNING at some point), move to UPLOADING_RESULTS state
       Otherwise move to FINALIZING state

    Returns the updated run state: RESTAGED when `run_state.is_restaged` is
    set, UPLOADING_RESULTS when not on a shared file system and the run has
    contents, otherwise whatever `self.finalize_run` returns.
    """

    def remove_path_no_fail(path):
        # Best-effort removal: log the traceback instead of raising.
        try:
            remove_path(path)
        except Exception:
            logger.error(traceback.format_exc())

    if run_state.container_id is not None:
        # Keep trying until the container no longer exists or has been removed.
        while docker_utils.container_exists(run_state.container):
            try:
                finished, _, _ = docker_utils.check_finished(run_state.container)
                if finished:
                    run_state.container.remove(force=True)
                    run_state = run_state._replace(container=None, container_id=None)
                    break
                else:
                    # Still running: ask Docker to kill it, then poll again
                    # after a short pause.
                    try:
                        run_state.container.kill()
                    except docker.errors.APIError:
                        logger.error(traceback.format_exc())
                    time.sleep(1)
            except docker.errors.APIError:
                # Docker API failure while checking/removing: log and retry.
                logger.error(traceback.format_exc())
                time.sleep(1)

    for dep in run_state.bundle.dependencies:
        if not self.shared_file_system:  # No dependencies if shared fs worker
            dep_key = DependencyKey(dep.parent_uuid, dep.parent_path)
            self.dependency_manager.release(run_state.bundle.uuid, dep_key)

    # Clean up dependencies paths
    for path in run_state.paths_to_remove or []:
        remove_path_no_fail(path)
    run_state = run_state._replace(paths_to_remove=[])

    if run_state.is_restaged:
        log_bundle_transition(
            bundle_uuid=run_state.bundle.uuid,
            previous_stage=run_state.stage,
            next_stage=RunStage.RESTAGED,
            reason=self.RESTAGED_REASON,
        )
        return run_state._replace(stage=RunStage.RESTAGED)

    if not self.shared_file_system and run_state.has_contents:
        # Results exist only on this worker, so they have to be uploaded.
        log_bundle_transition(
            bundle_uuid=run_state.bundle.uuid,
            previous_stage=run_state.stage,
            next_stage=RunStage.UPLOADING_RESULTS,
        )
        return run_state._replace(
            stage=RunStage.UPLOADING_RESULTS, run_status='Uploading results', container=None
        )
    else:
        # No need to upload results since results are directly written to bundle store
        # Delete any files that match the exclude_patterns.
        for exclude_pattern in run_state.bundle.metadata["exclude_patterns"]:
            full_pattern = os.path.join(run_state.bundle_path, exclude_pattern)
            for file_path in glob.glob(full_pattern, recursive=True):
                # Only remove files that are subpaths of run_state.bundle_path, in case
                # that exclude_pattern is something like "../../../".
                if path_is_parent(parent_path=run_state.bundle_path, child_path=file_path):
                    remove_path(file_path)
        return self.finalize_run(run_state)
def _transition_from_PREPARING(self, run_state):
    """
    Prepare a run and start its container.

    1- Request the docker image from docker image manager
        - if image is failed, move to CLEANING_UP state
    2- Request the dependencies from dependency manager
        - if any are failed, move to CLEANING_UP state
    3- If all dependencies and docker image are ready:
        - Set up the local filesystem for the run
        - Create symlinks to dependencies
        - Allocate resources and prepare the docker container
        - Start the docker container
    4- If all is successful, move to RUNNING state

    Returns the updated run state. The stage is left unchanged (only
    `run_status` is updated) while downloads, the shared bundle directory or
    resources are still pending. Errors from starting the container are
    logged and re-raised.
    """
    if run_state.is_killed:
        return run_state._replace(stage=RunStage.CLEANING_UP)

    dependencies_ready = True
    status_messages = []

    if not self.shared_file_system:
        # No need to download dependencies if we're in the shared FS since they're already in our FS
        for dep_key, dep in run_state.bundle.dependencies.items():
            dependency_state = self.dependency_manager.get(run_state.bundle.uuid, dep_key)
            if dependency_state.stage == DependencyStage.DOWNLOADING:
                status_messages.append(
                    'Downloading dependency %s: %s done (archived size)'
                    % (dep.child_path, size_str(dependency_state.size_bytes))
                )
                dependencies_ready = False
            elif dependency_state.stage == DependencyStage.FAILED:
                # Failed to download dependency; -> CLEANING_UP
                return run_state._replace(
                    stage=RunStage.CLEANING_UP,
                    failure_message='Failed to download dependency %s: %s'
                    % (dep.child_path, dependency_state.message),
                )

    # get the docker image
    docker_image = run_state.resources.docker_image
    image_state = self.docker_image_manager.get(docker_image)
    if image_state.stage == DependencyStage.DOWNLOADING:
        status_messages.append(
            'Pulling docker image: ' + (image_state.message or docker_image or "")
        )
        dependencies_ready = False
    elif image_state.stage == DependencyStage.FAILED:
        # Failed to pull image; -> CLEANING_UP
        message = 'Failed to download Docker image: %s' % image_state.message
        logger.error(message)
        return run_state._replace(stage=RunStage.CLEANING_UP, failure_message=message)

    # stop proceeding if dependency and image downloads aren't all done
    if not dependencies_ready:
        # Report the most recently added message and summarize the rest by count.
        status_message = status_messages.pop()
        if status_messages:
            status_message += "(and downloading %d other dependencies and docker images)" % len(
                status_messages
            )
        return run_state._replace(run_status=status_message)

    # All dependencies ready! Set up directories, symlinks and container. Start container.
    # 1) Set up a directory to store the bundle.
    if self.shared_file_system:
        if not os.path.exists(run_state.bundle_path):
            # The directory is not created here on a shared file system; wait
            # for it to appear, giving up once the retry counter reaches zero.
            if run_state.bundle_dir_wait_num_tries == 0:
                message = (
                    "Bundle directory cannot be found on the shared filesystem. "
                    "Please ensure the shared fileystem between the server and "
                    "your worker is mounted properly or contact your administrators."
                )
                logger.error(message)
                return run_state._replace(stage=RunStage.CLEANING_UP, failure_message=message)
            return run_state._replace(
                run_status="Waiting for bundle directory to be created by the server",
                bundle_dir_wait_num_tries=run_state.bundle_dir_wait_num_tries - 1,
            )
    else:
        # Start from a fresh, empty bundle directory.
        remove_path(run_state.bundle_path)
        os.mkdir(run_state.bundle_path)

    # 2) Set up symlinks
    docker_dependencies = []
    docker_dependencies_path = (
        '/'
        + run_state.bundle.uuid
        + ('_dependencies' if not self.shared_file_system else '')
    )
    for dep_key, dep in run_state.bundle.dependencies.items():
        full_child_path = os.path.normpath(os.path.join(run_state.bundle_path, dep.child_path))
        # NOTE(review): this is a plain string-prefix test, so a sibling path
        # sharing the prefix (e.g. "<bundle_path>-x") would also pass - confirm
        # whether a separator-aware check is wanted here.
        if not full_child_path.startswith(run_state.bundle_path):
            # Dependencies should end up in their bundles (i.e. prevent using relative
            # paths like .. to get out of their parent bundles)
            message = 'Invalid key for dependency: %s' % (dep.child_path)
            logger.error(message)
            return run_state._replace(stage=RunStage.CLEANING_UP, failure_message=message)
        docker_dependency_path = os.path.join(docker_dependencies_path, dep.child_path)
        if self.shared_file_system:
            # On a shared FS, we know where the dep is stored and can get the contents directly
            dependency_path = os.path.realpath(os.path.join(dep.location, dep.parent_path))
        else:
            # On a dependency_manager setup ask the manager where the dependency is
            dependency_path = os.path.join(
                self.dependency_manager.dependencies_dir,
                self.dependency_manager.get(run_state.bundle.uuid, dep_key).path,
            )
            os.symlink(docker_dependency_path, full_child_path)
        # These are turned into docker volume bindings like:
        #   dependency_path:docker_dependency_path:ro
        docker_dependencies.append((dependency_path, docker_dependency_path))

    # 3) Set up container
    if run_state.resources.network:
        docker_network = self.docker_network_external.name
    else:
        docker_network = self.docker_network_internal.name

    try:
        cpuset, gpuset = self.assign_cpu_and_gpu_sets_fn(
            run_state.resources.cpus, run_state.resources.gpus
        )
    except Exception as e:
        # Stay in the current stage; only the status message is updated.
        message = "Cannot assign enough resources: %s" % str(e)
        logger.error(message)
        logger.error(traceback.format_exc())
        return run_state._replace(run_status=message)

    # 4) Start container
    try:
        container = docker_utils.start_bundle_container(
            run_state.bundle_path,
            run_state.bundle.uuid,
            docker_dependencies,
            run_state.bundle.command,
            run_state.resources.docker_image,
            network=docker_network,
            cpuset=cpuset,
            gpuset=gpuset,
            memory_bytes=run_state.resources.memory,
            runtime=self.docker_runtime,
        )
        self.worker_docker_network.connect(container)
    except Exception as e:
        # Logged for diagnosis, then re-raised to the caller.
        message = 'Cannot start Docker container: {}'.format(e)
        logger.error(message)
        logger.error(traceback.format_exc())
        raise

    return run_state._replace(
        stage=RunStage.RUNNING,
        run_status='Running job in Docker container',
        container_id=container.id,
        container=container,
        docker_image=image_state.digest,
        has_contents=True,
        cpuset=cpuset,
        gpuset=gpuset,
    )
def _sync_state(self):
    """
    Reconcile the persisted dependency state with the directories on disk.

    While holding self._state_lock, three sources are compared:
        * the dependency -> state mapping loaded from the state file,
        * the set of paths loaded from the state file,
        * the entries listed under self.dependencies_dir.

    Only paths present in all three are kept. Dependencies whose path was not
    kept are deleted from the mapping, listed directories that were not kept
    are deleted with remove_path, and the resulting (dependencies, paths) pair
    is handed to self._commit_state.
    """
    with self._state_lock:
        if not self._state_committer.state_file_exists:
            # Nothing has been persisted yet, so start from an empty state.
            dependencies: Dict[DependencyKey, DependencyState] = {}
            paths: Set[str] = set()
            logger.info(
                f'State file did not exist. Will create one at path {self._state_committer.path}.'
            )
        else:
            # No default is supplied on purpose: this method prunes, so if the
            # existing state file cannot be read we want to fail right here
            # rather than continue with an empty state.
            dependencies, paths = self._fetch_state()
            logger.info(
                'Found {} dependencies, {} paths from cache.'.format(
                    len(dependencies), len(paths)
                )
            )

        # Keep only the paths that the loaded path set, the loaded dependency
        # states and the dependencies directory listing all agree on.
        dirs_on_disk = set(os.listdir(self.dependencies_dir))
        referenced_paths = [state.path for state in dependencies.values()]
        paths = paths.intersection(referenced_paths, dirs_on_disk)

        # Drop every dependency whose path was not kept. Iterate over a
        # snapshot because entries are deleted from the mapping as we go.
        for dep, dep_state in list(dependencies.items()):
            if dep_state.path in paths:
                continue
            logger.info(
                "Dependency {} in dependency state but its path {} doesn't exist on the local file system. "
                "Removing it from dependency state.".format(
                    dep, os.path.join(self.dependencies_dir, dep_state.path)
                )
            )
            del dependencies[dep]

        # Delete every listed directory that was not kept.
        for orphan in dirs_on_disk - paths:
            orphan_path = os.path.join(self.dependencies_dir, orphan)
            if not os.path.exists(orphan_path):
                continue
            logger.info(
                "Remove orphaned directory {} from the local file system.".format(orphan_path)
            )
            remove_path(orphan_path)

        # Persist the reconciled state, since pruning above may have changed it.
        self._commit_state(dependencies, paths)
def tearDown(self):
    """Clean up by passing self.temp_dir to remove_path."""
    temp_dir = self.temp_dir
    remove_path(temp_dir)