def _try_start_bundle(self, workers, worker, bundle): """ Tries to start running the bundle on the given worker, returning False if that failed. """ if self._model.set_starting_bundle(bundle, worker['user_id'], worker['worker_id']): workers.set_starting(bundle.uuid, worker) if ( self._worker_model.shared_file_system and worker['user_id'] == self._model.root_user_id ): # On a shared file system we create the path here to avoid NFS # directory cache issues. path = self._bundle_store.get_bundle_location(bundle.uuid) remove_path(path) os.mkdir(path) if self._worker_model.send_json_message( worker['socket_id'], self._construct_run_message(worker, bundle), 0.2 ): logger.info('Starting run bundle %s', bundle.uuid) return True else: self._model.restage_bundle(bundle) workers.restage(bundle.uuid) return False else: return False
def _try_start_bundle(self, workers, worker, bundle): """ Tries to start running the bundle on the given worker, returning False if that failed. """ if self._model.set_starting_bundle(bundle, worker['user_id'], worker['worker_id']): workers.set_starting(bundle.uuid, worker) if (self._worker_model.shared_file_system and worker['user_id'] == self._model.root_user_id): # On a shared file system we create the path here to avoid NFS # directory cache issues. path = self._bundle_store.get_bundle_location(bundle.uuid) remove_path(path) os.mkdir(path) if self._worker_model.send_json_message( worker['socket_id'], self._construct_run_message(worker, bundle), 0.2): logger.info('Starting run bundle %s', bundle.uuid) return True else: self._model.restage_bundle(bundle) workers.restage(bundle.uuid) return False else: return False
def test_tar_empty(self):
    """Archiving then extracting an empty directory yields an empty directory."""
    source_dir = tempfile.mkdtemp()
    self.addCleanup(lambda: remove_path(source_dir))
    scratch_dir = tempfile.mkdtemp()
    self.addCleanup(lambda: remove_path(scratch_dir))
    target_dir = os.path.join(scratch_dir, 'output')
    un_tar_directory(tar_gzip_directory(source_dir), target_dir, 'gz')
    self.assertEqual(os.listdir(target_dir), [])
def test_tar_empty(self):
    """Archiving then extracting an empty directory yields an empty directory."""
    # Renamed from 'dir', which shadowed the builtin.
    empty_dir = tempfile.mkdtemp()
    self.addCleanup(lambda: remove_path(empty_dir))
    temp_dir = tempfile.mkdtemp()
    self.addCleanup(lambda: remove_path(temp_dir))
    output_dir = os.path.join(temp_dir, 'output')
    un_tar_directory(tar_gzip_directory(empty_dir), output_dir, 'gz')
    # Fixed: assertEquals is a deprecated alias of assertEqual (removed in
    # Python 3.12); use the canonical name.
    self.assertEqual(os.listdir(output_dir), [])
def _transition_from_FINALIZING(self, run_state): """ If a full worker cycle has passed since we got into FINALIZING we already reported to server so can move on to FINISHED. Can also remove bundle_path now """ if run_state.info['finalized']: remove_path(run_state.bundle_path) return run_state._replace(stage=LocalRunStage.FINISHED, run_status='Finished') else: return run_state
def _transition_from_FINALIZING(self, run_state): """ If a full worker cycle has passed since we got into FINALIZING we already reported to server so can move on to FINISHED. Can also remove bundle_path now """ if run_state.info['finalized']: remove_path(run_state.bundle_path) return run_state._replace(stage=LocalRunStage.FINISHED, run_status='Finished') else: return run_state
def _make_bundle(self, bundle):
    # Materialize a "make" bundle on disk by copying each dependency's
    # contents into the bundle location, then mark the bundle READY
    # (or FAILED, recording the error message in its metadata).
    try:
        path = os.path.normpath(
            self._bundle_store.get_bundle_location(bundle.uuid))
        deps = []
        for dep in bundle.dependencies:
            parent_bundle_path = os.path.normpath(
                self._bundle_store.get_bundle_location(dep.parent_uuid))
            dependency_path = os.path.normpath(
                os.path.join(parent_bundle_path, dep.parent_path))
            # Reject dependency paths that escape the parent bundle
            # (e.g. via '..') or that don't exist on disk at all.
            if not dependency_path.startswith(parent_bundle_path) or (
                    not os.path.islink(dependency_path)
                    and not os.path.exists(dependency_path)):
                raise Exception('Invalid dependency %s' % (path_util.safe_join(
                    dep.parent_uuid, dep.parent_path)))
            child_path = os.path.normpath(
                os.path.join(path, dep.child_path))
            # Likewise, the child key must resolve inside the bundle dir.
            if not child_path.startswith(path):
                raise Exception('Invalid key for dependency: %s' %
                                (dep.child_path))
            deps.append((dependency_path, child_path))
        remove_path(path)
        if len(deps) == 1 and deps[0][1] == path:
            # Single dependency mapped to the bundle root: copy it directly
            # instead of creating a directory around it.
            path_util.copy(deps[0][0], path, follow_symlinks=False)
        else:
            os.mkdir(path)
            for dependency_path, child_path in deps:
                path_util.copy(dependency_path, child_path,
                               follow_symlinks=False)
        self._upload_manager.update_metadata_and_save(
            bundle, enforce_disk_quota=True)
        logger.info('Finished making bundle %s', bundle.uuid)
        self._model.update_bundle(bundle, {'state': State.READY})
    except Exception as e:
        logger.info('Failing bundle %s: %s', bundle.uuid, str(e))
        self._model.update_bundle(bundle, {
            'state': State.FAILED,
            'metadata': {
                'failure_message': str(e)
            }
        })
    finally:
        # Always drop the uuid from the in-flight set, success or failure.
        with self._make_uuids_lock:
            self._make_uuids.remove(bundle.uuid)
def _delete_dependency(self, dependency):
    """
    Remove the given dependency from the manager's state
    Also delete any known files on the filesystem if any exist
    """
    # Only proceed if the dependency is currently tracked; acquiring its
    # lock guards against concurrent access while we tear it down.
    if self._acquire_if_exists(dependency):
        try:
            path_to_remove = self._dependencies[dependency].path
            self._paths.remove(path_to_remove)
            remove_path(path_to_remove)
        except Exception:
            # Best-effort cleanup: ignore bookkeeping/filesystem errors.
            pass
        finally:
            # Drop the entry and release its lock even if deletion failed.
            del self._dependencies[dependency]
            self._dependency_locks[dependency].release()
def _delete_dependency(self, dependency):
    """
    Remove the given dependency from the manager's state
    Also delete any known files on the filesystem if any exist
    """
    # _acquire_if_exists takes the dependency's lock only when the entry
    # is still tracked, so the teardown below runs exclusively.
    if self._acquire_if_exists(dependency):
        try:
            path_to_remove = self._dependencies[dependency].path
            self._paths.remove(path_to_remove)
            remove_path(path_to_remove)
        except Exception:
            # Deliberate best-effort: deletion failures are swallowed.
            pass
        finally:
            # Entry removal and lock release must happen regardless of
            # whether the filesystem cleanup above succeeded.
            del self._dependencies[dependency]
            self._dependency_locks[dependency].release()
def _make_bundle(self, bundle):
    # Materialize a "make" bundle: validate and copy every dependency into
    # the bundle's location, then flip the bundle to READY, or to FAILED
    # with the failure message stored in metadata.
    try:
        path = os.path.normpath(self._bundle_store.get_bundle_location(bundle.uuid))
        deps = []
        for dep in bundle.dependencies:
            parent_bundle_path = os.path.normpath(
                self._bundle_store.get_bundle_location(dep.parent_uuid)
            )
            dependency_path = os.path.normpath(
                os.path.join(parent_bundle_path, dep.parent_path)
            )
            # Guard against paths escaping the parent bundle ('..' tricks)
            # and against dependencies missing from disk.
            if not dependency_path.startswith(parent_bundle_path) or (
                not os.path.islink(dependency_path)
                and not os.path.exists(dependency_path)
            ):
                raise Exception(
                    'Invalid dependency %s'
                    % (path_util.safe_join(dep.parent_uuid, dep.parent_path))
                )
            child_path = os.path.normpath(os.path.join(path, dep.child_path))
            # The child key must also stay inside the bundle directory.
            if not child_path.startswith(path):
                raise Exception('Invalid key for dependency: %s' % (dep.child_path))
            deps.append((dependency_path, child_path))
        remove_path(path)
        if len(deps) == 1 and deps[0][1] == path:
            # Single dependency mapped to the bundle root: copy directly.
            path_util.copy(deps[0][0], path, follow_symlinks=False)
        else:
            os.mkdir(path)
            for dependency_path, child_path in deps:
                path_util.copy(dependency_path, child_path, follow_symlinks=False)
        self._upload_manager.update_metadata_and_save(bundle, enforce_disk_quota=True)
        logger.info('Finished making bundle %s', bundle.uuid)
        self._model.update_bundle(bundle, {'state': State.READY})
    except Exception as e:
        logger.info('Failing bundle %s: %s', bundle.uuid, str(e))
        self._model.update_bundle(
            bundle, {'state': State.FAILED, 'metadata': {'failure_message': str(e)}}
        )
    finally:
        # Always remove the uuid from the in-flight set, success or failure.
        with self._make_uuids_lock:
            self._make_uuids.remove(bundle.uuid)
def _transition_from_CLEANING_UP(self, run_state):
    """
    1- delete the container if still existent
    2- clean up the dependencies from bundle folder
    3- release the dependencies in dependency manager
    4- If bundle has contents to upload (i.e. was RUNNING at some point),
       move to UPLOADING_RESULTS state; otherwise move to FINALIZING state
    """
    bundle_uuid = run_state.bundle['uuid']
    if run_state.container_id is not None:
        # Loop until the container is confirmed finished and removed;
        # transient Docker API errors are logged and retried after 1s.
        while True:
            try:
                finished, _, _ = docker_utils.check_finished(
                    run_state.container)
                if finished:
                    run_state.container.remove(force=True)
                    break
            except docker.errors.APIError:
                traceback.print_exc()
                time.sleep(1)
    for dep in run_state.bundle['dependencies']:
        # Tell the dependency manager this bundle no longer needs the dep,
        # then delete the symlink/copy from the bundle folder.
        self.dependency_manager.release(
            bundle_uuid, (dep['parent_uuid'], dep['parent_path']))
        child_path = os.path.join(run_state.bundle_path,
                                  dep['child_path'])
        try:
            remove_path(child_path)
        except Exception:
            # Best-effort: a failed removal shouldn't block cleanup.
            traceback.print_exc()
    if run_state.has_contents:
        return run_state._replace(
            stage=LocalRunStage.UPLOADING_RESULTS,
            run_status='Uploading results',
            container=None,
        )
    else:
        return self.finalize_run(run_state)
def _transition_from_CLEANING_UP(self, run_state):
    """
    1- delete the container if still existent
    2- clean up the dependencies from bundle folder
    3- release the dependencies in dependency manager
    4- If bundle has contents to upload (i.e. was RUNNING at some point),
       move to UPLOADING_RESULTS state; otherwise move to FINALIZING state
    """
    bundle_uuid = run_state.bundle['uuid']
    if run_state.container_id is not None:
        # Keep polling until the container has finished, then force-remove
        # it; Docker API errors are printed and retried after a 1s sleep.
        while True:
            try:
                finished, _, _ = docker_utils.check_finished(run_state.container)
                if finished:
                    run_state.container.remove(force=True)
                    break
            except docker.errors.APIError:
                traceback.print_exc()
                time.sleep(1)
    for dep in run_state.bundle['dependencies']:
        # Release the dependency reference, then remove its mount point
        # from the bundle folder.
        self.dependency_manager.release(bundle_uuid, (dep['parent_uuid'], dep['parent_path']))
        child_path = os.path.join(run_state.bundle_path, dep['child_path'])
        try:
            remove_path(child_path)
        except Exception:
            # Best-effort cleanup; log and continue with remaining deps.
            traceback.print_exc()
    if run_state.has_contents:
        return run_state._replace(
            stage=LocalRunStage.UPLOADING_RESULTS,
            run_status='Uploading results',
            container=None,
        )
    else:
        return self.finalize_run(run_state)
def test_tar_has_files(self):
    """tar_gzip_directory applies exclude patterns and preserves symlinks."""
    source_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'files')
    scratch_dir = tempfile.mkdtemp()
    self.addCleanup(lambda: remove_path(scratch_dir))
    output_dir = os.path.join(scratch_dir, 'output')
    archive = tar_gzip_directory(source_dir, False, ['f2'], ['f1', 'b.txt'])
    un_tar_directory(archive, output_dir, 'gz')
    entries = os.listdir(output_dir)
    self.assertIn('dir1', entries)
    self.assertIn('a.txt', entries)
    self.assertNotIn('b.txt', entries)
    self.assertTrue(os.path.exists(os.path.join(output_dir, 'dir1', 'f1')))
    self.assertFalse(os.path.exists(os.path.join(output_dir, 'dir1', 'f2')))
    self.assertTrue(os.path.islink(os.path.join(output_dir, 'a-symlink.txt')))
def test_tar_has_files(self):
    """Round-trip a fixture directory through tar/untar and check contents."""
    fixtures = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'files')
    work_dir = tempfile.mkdtemp()
    self.addCleanup(lambda: remove_path(work_dir))
    extracted = os.path.join(work_dir, 'output')
    un_tar_directory(
        tar_gzip_directory(fixtures, False, ['f2'], ['f1', 'b.txt']), extracted, 'gz'
    )
    top_level = os.listdir(extracted)
    # 'f2' matches an exclude pattern; 'b.txt' was explicitly ignored.
    self.assertIn('dir1', top_level)
    self.assertIn('a.txt', top_level)
    self.assertNotIn('b.txt', top_level)
    self.assertTrue(os.path.exists(os.path.join(extracted, 'dir1', 'f1')))
    self.assertFalse(os.path.exists(os.path.join(extracted, 'dir1', 'f2')))
    # Symlinks survive the archive round-trip as symlinks.
    self.assertTrue(os.path.islink(os.path.join(extracted, 'a-symlink.txt')))
def _transition_from_PREPARING(self, run_state):
    """
    1- Request the docker image from docker image manager
        - if image is failed, move to CLEANING_UP state
    2- Request the dependencies from dependency manager
        - if any are failed, move to CLEANING_UP state
    3- If all dependencies and docker image are ready:
        - Set up the local filesystem for the run
        - Create symlinks to dependencies
        - Allocate resources and prepare the docker container
        - Start the docker container
    4- If all is successful, move to RUNNING state
    """
    if run_state.is_killed:
        return run_state._replace(stage=LocalRunStage.CLEANING_UP, container_id=None)
    dependencies_ready = True
    status_messages = []
    bundle_uuid = run_state.bundle['uuid']

    # get dependencies
    for dep in run_state.bundle['dependencies']:
        dependency = (dep['parent_uuid'], dep['parent_path'])
        dependency_state = self.dependency_manager.get(bundle_uuid, dependency)
        if dependency_state.stage == DependencyStage.DOWNLOADING:
            status_messages.append(
                'Downloading dependency %s: %s done (archived size)'
                % (dep['child_path'], size_str(dependency_state.size_bytes))
            )
            dependencies_ready = False
        elif dependency_state.stage == DependencyStage.FAILED:
            # Failed to download dependency; -> CLEANING_UP
            # NOTE(review): the second format argument is an empty string,
            # so the message ends with ': ' — looks like a lost detail
            # (perhaps dependency_state.message); confirm against history.
            run_state.info['failure_message'] = 'Failed to download dependency %s: %s' % (
                dep['child_path'],
                '',
            )
            return run_state._replace(stage=LocalRunStage.CLEANING_UP, info=run_state.info)

    # get the docker image
    docker_image = run_state.resources['docker_image']
    image_state = self.docker_image_manager.get(docker_image)
    if image_state.stage == DependencyStage.DOWNLOADING:
        status_messages.append(
            'Pulling docker image: ' + (image_state.message or docker_image or "")
        )
        dependencies_ready = False
    elif image_state.stage == DependencyStage.FAILED:
        # Failed to pull image; -> CLEANING_UP
        run_state.info['failure_message'] = image_state.message
        return run_state._replace(stage=LocalRunStage.CLEANING_UP, info=run_state.info)

    # stop proceeding if dependency and image downloads aren't all done
    if not dependencies_ready:
        status_message = status_messages.pop()
        if status_messages:
            status_message += "(and downloading %d other dependencies and docker images)" % len(
                status_messages
            )
        return run_state._replace(run_status=status_message)

    # All dependencies ready! Set up directories, symlinks and container. Start container.
    # 1) Set up a directory to store the bundle.
    remove_path(run_state.bundle_path)
    os.mkdir(run_state.bundle_path)

    # 2) Set up symlinks
    dependencies = []
    docker_dependencies_path = '/' + bundle_uuid + '_dependencies'
    for dep in run_state.bundle['dependencies']:
        child_path = os.path.normpath(os.path.join(run_state.bundle_path, dep['child_path']))
        # Reject keys that would escape the bundle directory.
        if not child_path.startswith(run_state.bundle_path):
            raise Exception('Invalid key for dependency: %s' % (dep['child_path']))
        dependency_path = self.dependency_manager.get(
            bundle_uuid, (dep['parent_uuid'], dep['parent_path'])
        ).path
        dependency_path = os.path.join(
            self.dependency_manager.dependencies_dir, dependency_path
        )
        docker_dependency_path = os.path.join(docker_dependencies_path, dep['child_path'])
        os.symlink(docker_dependency_path, child_path)
        # These are turned into docker volume bindings like:
        # dependency_path:docker_dependency_path:ro
        dependencies.append((dependency_path, docker_dependency_path))

    # 3) Set up container
    if run_state.resources['request_network']:
        docker_network = self.docker_network_external.name
    else:
        docker_network = self.docker_network_internal.name
    try:
        cpuset, gpuset = self.assign_cpu_and_gpu_sets_fn(
            run_state.resources['request_cpus'], run_state.resources['request_gpus']
        )
    except Exception:
        run_state.info['failure_message'] = "Cannot assign enough resources"
        return run_state._replace(stage=LocalRunStage.CLEANING_UP, info=run_state.info)

    # 4) Start container
    try:
        container = docker_utils.start_bundle_container(
            run_state.bundle_path,
            bundle_uuid,
            dependencies,
            run_state.bundle['command'],
            run_state.resources['docker_image'],
            network=docker_network,
            cpuset=cpuset,
            gpuset=gpuset,
            memory_bytes=run_state.resources['request_memory'],
            runtime=self.docker_runtime,
        )
        self.worker_docker_network.connect(container)
    except docker_utils.DockerException as e:
        run_state.info['failure_message'] = 'Cannot start Docker container: {}'.format(e)
        return run_state._replace(stage=LocalRunStage.CLEANING_UP, info=run_state.info)

    return run_state._replace(
        stage=LocalRunStage.RUNNING,
        start_time=time.time(),
        run_status='Running job in Docker container',
        container_id=container.id,
        container=container,
        docker_image=image_state.digest,
        has_contents=True,
        cpuset=cpuset,
        gpuset=gpuset,
    )
def tearDown(self):
    # Delete the test's temporary directory (self.temp_dir) from disk.
    remove_path(self.temp_dir)
def _clear_torque_logs(self, job_handle):
    """Delete the Torque stdout and stderr log files for the given job handle."""
    for stream in ('stdout', 'stderr'):
        remove_path(os.path.join(self._torque_log_dir, stream + '.' + job_handle))
def tearDown(self):
    # Delete the test's working directory (self.work_dir) from disk.
    remove_path(self.work_dir)
def _transition_from_PREPARING(self, run_state):
    """
    1- Request the docker image from docker image manager
        - if image is failed, move to CLEANING_UP state
    2- Request the dependencies from dependency manager
        - if any are failed, move to CLEANING_UP state
    3- If all dependencies and docker image are ready:
        - Set up the local filesystem for the run
        - Create symlinks to dependencies
        - Allocate resources and prepare the docker container
        - Start the docker container
    4- If all is successful, move to RUNNING state
    """
    if run_state.is_killed:
        return run_state._replace(stage=LocalRunStage.CLEANING_UP,
                                  container_id=None)
    dependencies_ready = True
    status_messages = []
    bundle_uuid = run_state.bundle['uuid']

    # get dependencies
    for dep in run_state.bundle['dependencies']:
        dependency = (dep['parent_uuid'], dep['parent_path'])
        dependency_state = self.dependency_manager.get(
            bundle_uuid, dependency)
        if dependency_state.stage == DependencyStage.DOWNLOADING:
            status_messages.append(
                'Downloading dependency %s: %s done (archived size)' %
                (dep['child_path'], size_str(dependency_state.size_bytes)))
            dependencies_ready = False
        elif dependency_state.stage == DependencyStage.FAILED:
            # Failed to download dependency; -> CLEANING_UP
            # NOTE(review): the second format argument is an empty string,
            # so the message ends with ': ' — looks like a lost detail;
            # confirm against version history.
            run_state.info[
                'failure_message'] = 'Failed to download dependency %s: %s' % (
                    dep['child_path'],
                    '',
                )
            return run_state._replace(stage=LocalRunStage.CLEANING_UP,
                                      info=run_state.info)

    # get the docker image
    docker_image = run_state.resources['docker_image']
    image_state = self.docker_image_manager.get(docker_image)
    if image_state.stage == DependencyStage.DOWNLOADING:
        status_messages.append('Pulling docker image: ' +
                               (image_state.message or docker_image or ""))
        dependencies_ready = False
    elif image_state.stage == DependencyStage.FAILED:
        # Failed to pull image; -> CLEANING_UP
        run_state.info['failure_message'] = image_state.message
        return run_state._replace(stage=LocalRunStage.CLEANING_UP,
                                  info=run_state.info)

    # stop proceeding if dependency and image downloads aren't all done
    if not dependencies_ready:
        status_message = status_messages.pop()
        if status_messages:
            status_message += "(and downloading %d other dependencies and docker images)" % len(
                status_messages)
        return run_state._replace(run_status=status_message)

    # All dependencies ready! Set up directories, symlinks and container. Start container.
    # 1) Set up a directory to store the bundle.
    remove_path(run_state.bundle_path)
    os.mkdir(run_state.bundle_path)

    # 2) Set up symlinks
    dependencies = []
    docker_dependencies_path = '/' + bundle_uuid + '_dependencies'
    for dep in run_state.bundle['dependencies']:
        child_path = os.path.normpath(
            os.path.join(run_state.bundle_path, dep['child_path']))
        # Reject keys that would escape the bundle directory.
        if not child_path.startswith(run_state.bundle_path):
            raise Exception('Invalid key for dependency: %s' %
                            (dep['child_path']))
        dependency_path = self.dependency_manager.get(
            bundle_uuid, (dep['parent_uuid'], dep['parent_path'])).path
        dependency_path = os.path.join(
            self.dependency_manager.dependencies_dir, dependency_path)
        docker_dependency_path = os.path.join(docker_dependencies_path,
                                              dep['child_path'])
        os.symlink(docker_dependency_path, child_path)
        # These are turned into docker volume bindings like:
        # dependency_path:docker_dependency_path:ro
        dependencies.append((dependency_path, docker_dependency_path))

    # 3) Set up container
    if run_state.resources['request_network']:
        docker_network = self.docker_network_external_name
    else:
        docker_network = self.docker_network_internal_name
    try:
        cpuset, gpuset = self.assign_cpu_and_gpu_sets_fn(
            run_state.resources['request_cpus'],
            run_state.resources['request_gpus'])
    except Exception:
        run_state.info[
            'failure_message'] = "Cannot assign enough resources"
        return run_state._replace(stage=LocalRunStage.CLEANING_UP,
                                  info=run_state.info)

    # 4) Start container
    try:
        container = docker_utils.start_bundle_container(
            run_state.bundle_path,
            bundle_uuid,
            dependencies,
            run_state.bundle['command'],
            run_state.resources['docker_image'],
            network=docker_network,
            cpuset=cpuset,
            gpuset=gpuset,
            memory_bytes=run_state.resources['request_memory'],
            runtime=self.docker_runtime,
        )
    except docker_utils.DockerException as e:
        run_state.info[
            'failure_message'] = 'Cannot start Docker container: {}'.format(
                e)
        return run_state._replace(stage=LocalRunStage.CLEANING_UP,
                                  info=run_state.info)

    return run_state._replace(
        stage=LocalRunStage.RUNNING,
        start_time=time.time(),
        run_status='Running job in Docker container',
        container_id=container.id,
        container=container,
        docker_image=image_state.digest,
        has_contents=True,
        cpuset=cpuset,
        gpuset=gpuset,
    )