예제 #1
0
 def _try_start_bundle(self, workers, worker, bundle):
     """
     Attempt to hand the bundle over to the given worker.

     The bundle is first marked as starting in the model and in `workers`,
     then the run message is sent to the worker's socket. If the message is
     not delivered, the bundle is restaged in both places.

     Returns True if the run message was delivered, and False if either the
     model could not mark the bundle as starting or the message failed.
     """
     owner_id = worker['user_id']
     if not self._model.set_starting_bundle(bundle, owner_id, worker['worker_id']):
         return False

     workers.set_starting(bundle.uuid, worker)

     on_shared_fs = self._worker_model.shared_file_system
     if on_shared_fs and owner_id == self._model.root_user_id:
         # On a shared file system the bundle directory is created here to
         # avoid NFS directory cache issues.
         bundle_dir = self._bundle_store.get_bundle_location(bundle.uuid)
         remove_path(bundle_dir)
         os.mkdir(bundle_dir)

     delivered = self._worker_model.send_json_message(
         worker['socket_id'], self._construct_run_message(worker, bundle), 0.2
     )
     if delivered:
         logger.info('Starting run bundle %s', bundle.uuid)
         return True

     # The worker never received the message: undo the "starting" marks.
     self._model.restage_bundle(bundle)
     workers.restage(bundle.uuid)
     return False
예제 #2
0
 def _try_start_bundle(self, workers, worker, bundle):
     """
     Try to start `bundle` on `worker`.

     Returns False when the model cannot mark the bundle as starting, or
     when the run message cannot be sent to the worker (in which case the
     bundle is restaged). Returns True once the message has been sent.
     """
     model = self._model
     worker_model = self._worker_model

     claimed = model.set_starting_bundle(bundle, worker['user_id'],
                                         worker['worker_id'])
     if not claimed:
         return False
     workers.set_starting(bundle.uuid, worker)

     needs_directory = (worker_model.shared_file_system
                        and worker['user_id'] == model.root_user_id)
     if needs_directory:
         # Creating the path here avoids NFS directory cache issues on a
         # shared file system.
         location = self._bundle_store.get_bundle_location(bundle.uuid)
         remove_path(location)
         os.mkdir(location)

     socket_id = worker['socket_id']
     run_message = self._construct_run_message(worker, bundle)
     if not worker_model.send_json_message(socket_id, run_message, 0.2):
         # Sending failed, so put the bundle back to staged everywhere.
         model.restage_bundle(bundle)
         workers.restage(bundle.uuid)
         return False

     logger.info('Starting run bundle %s', bundle.uuid)
     return True
예제 #3
0
    def test_tar_empty(self):
        """Archiving an empty directory and extracting it gives an empty directory."""
        source_dir = tempfile.mkdtemp()
        self.addCleanup(remove_path, source_dir)
        scratch_dir = tempfile.mkdtemp()
        self.addCleanup(remove_path, scratch_dir)

        extracted_dir = os.path.join(scratch_dir, 'output')
        archive = tar_gzip_directory(source_dir)
        un_tar_directory(archive, extracted_dir, 'gz')

        self.assertEqual(os.listdir(extracted_dir), [])
예제 #4
0
    def test_tar_empty(self):
        """
        Round-trip an empty directory through tar_gzip_directory and
        un_tar_directory and check that the extracted directory is empty.
        """
        # Named source_dir rather than `dir` so the builtin is not shadowed.
        source_dir = tempfile.mkdtemp()
        self.addCleanup(lambda: remove_path(source_dir))
        temp_dir = tempfile.mkdtemp()
        self.addCleanup(lambda: remove_path(temp_dir))

        output_dir = os.path.join(temp_dir, 'output')
        un_tar_directory(tar_gzip_directory(source_dir), output_dir, 'gz')
        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual(os.listdir(output_dir), [])
예제 #5
0
 def _transition_from_FINALIZING(self, run_state):
     """
     Leave the FINALIZING stage once the run has been marked finalized.

     When run_state.info['finalized'] is set (a full worker cycle has passed
     and the server has already been told), the bundle path is removed and
     the run moves to FINISHED. Otherwise the state is returned unchanged.
     """
     if not run_state.info['finalized']:
         return run_state

     remove_path(run_state.bundle_path)
     return run_state._replace(stage=LocalRunStage.FINISHED, run_status='Finished')
예제 #6
0
 def _transition_from_FINALIZING(self, run_state):
     """
     Move a finalized run on to FINISHED and delete its bundle path.

     A run whose info['finalized'] flag is not yet set stays as it is; the
     same run_state object is returned.
     """
     already_reported = run_state.info['finalized']
     if already_reported:
         # The server already knows about the result, so the local copy
         # of the bundle can go.
         remove_path(run_state.bundle_path)
         next_state = run_state._replace(
             stage=LocalRunStage.FINISHED,
             run_status='Finished',
         )
     else:
         next_state = run_state
     return next_state
예제 #7
0
    def _make_bundle(self, bundle):
        """
        Assemble the contents of `bundle` by copying its dependencies into the
        bundle's location in the bundle store.

        Each dependency source must resolve to a path inside its parent
        bundle's directory and must exist (or be a symlink). Each destination
        must resolve to a path inside this bundle's directory. When the only
        dependency targets the bundle path itself, it is copied directly onto
        the bundle path; otherwise a directory is created and every dependency
        is copied to its child path.

        On success the metadata is saved and the state is set to State.READY.
        Any exception is logged and the bundle is set to State.FAILED with the
        message stored under 'failure_message'. In every case the bundle's
        uuid is removed from self._make_uuids.
        """

        def is_within(candidate, root):
            # A bare startswith() test also accepts sibling paths that merely
            # share a name prefix with root ('/b/0x1' vs '/b/0x1_other'), so
            # require equality or a path-separator boundary.
            return candidate == root or candidate.startswith(
                root.rstrip(os.sep) + os.sep)

        try:
            path = os.path.normpath(
                self._bundle_store.get_bundle_location(bundle.uuid))

            deps = []
            for dep in bundle.dependencies:
                parent_bundle_path = os.path.normpath(
                    self._bundle_store.get_bundle_location(dep.parent_uuid))
                dependency_path = os.path.normpath(
                    os.path.join(parent_bundle_path, dep.parent_path))
                if not is_within(dependency_path, parent_bundle_path) or (
                        not os.path.islink(dependency_path)
                        and not os.path.exists(dependency_path)):
                    raise Exception('Invalid dependency %s' %
                                    (path_util.safe_join(
                                        dep.parent_uuid, dep.parent_path)))

                child_path = os.path.normpath(
                    os.path.join(path, dep.child_path))
                if not is_within(child_path, path):
                    raise Exception('Invalid key for dependency: %s' %
                                    (dep.child_path))

                deps.append((dependency_path, child_path))

            # Start from a clean slate before copying anything in.
            remove_path(path)

            if len(deps) == 1 and deps[0][1] == path:
                # A single dependency mapped onto the bundle root becomes the
                # bundle's contents directly.
                path_util.copy(deps[0][0], path, follow_symlinks=False)
            else:
                os.mkdir(path)
                for dependency_path, child_path in deps:
                    path_util.copy(dependency_path,
                                   child_path,
                                   follow_symlinks=False)

            self._upload_manager.update_metadata_and_save(
                bundle, enforce_disk_quota=True)
            logger.info('Finished making bundle %s', bundle.uuid)
            self._model.update_bundle(bundle, {'state': State.READY})
        except Exception as e:
            logger.info('Failing bundle %s: %s', bundle.uuid, str(e))
            self._model.update_bundle(bundle, {
                'state': State.FAILED,
                'metadata': {
                    'failure_message': str(e)
                }
            })
        finally:
            with self._make_uuids_lock:
                self._make_uuids.remove(bundle.uuid)
예제 #8
0
 def _delete_dependency(self, dependency):
     """
     Remove the given dependency from the manager's state.
     Also delete any known files on the filesystem if any exist.

     Nothing happens when self._acquire_if_exists(dependency) returns a falsy
     value. Otherwise the dependency's path is dropped from self._paths and
     removed from disk on a best-effort basis; a failure there is printed
     as a traceback rather than raised. The entry in self._dependencies is
     always deleted and the dependency's lock is always released.
     """
     # Imported locally so the method does not rely on a module-level import.
     import traceback

     if not self._acquire_if_exists(dependency):
         return

     try:
         path_to_remove = self._dependencies[dependency].path
         self._paths.remove(path_to_remove)
         remove_path(path_to_remove)
     except Exception:
         # File removal stays best-effort, but the failure is reported
         # instead of being silently discarded.
         traceback.print_exc()
     finally:
         try:
             del self._dependencies[dependency]
         finally:
             # Release even if the entry could not be deleted, so the lock
             # is never left held.
             self._dependency_locks[dependency].release()
 def _delete_dependency(self, dependency):
     """
     Forget `dependency` and delete its files from disk if there are any.

     Only acts when self._acquire_if_exists(dependency) succeeds. Removing
     the path from self._paths and from the filesystem is best-effort: an
     error is printed with its traceback and does not propagate. Afterwards
     the dependency is always deleted from self._dependencies and its lock
     in self._dependency_locks is always released.
     """
     # Function-scope import keeps this method independent of the module's
     # import section.
     import traceback

     if self._acquire_if_exists(dependency):
         try:
             stale_path = self._dependencies[dependency].path
             self._paths.remove(stale_path)
             remove_path(stale_path)
         except Exception:
             # Do not fail the deletion over leftover files, but make the
             # problem visible instead of swallowing it.
             traceback.print_exc()
         finally:
             try:
                 del self._dependencies[dependency]
             finally:
                 # The lock must be released even when the delete above
                 # raises.
                 self._dependency_locks[dependency].release()
예제 #10
0
    def _make_bundle(self, bundle):
        """
        Populate the bundle's directory from its dependencies.

        Every dependency is validated first: its source must stay inside the
        parent bundle's directory and exist (or be a symlink), and its target
        must stay inside this bundle's directory. The bundle path is then
        removed and rebuilt by copying the dependencies in. A single
        dependency that targets the bundle path itself is copied straight
        onto it.

        The bundle ends up in State.READY on success or State.FAILED (with
        'failure_message' metadata) on any exception. The bundle's uuid is
        always removed from self._make_uuids.
        """
        try:
            path = os.path.normpath(self._bundle_store.get_bundle_location(bundle.uuid))
            # Containment is checked against "<root><sep>" rather than the bare
            # root: plain startswith(root) would also accept a sibling such as
            # "<root>_other" that only shares a name prefix.
            path_prefix = path.rstrip(os.sep) + os.sep

            deps = []
            for dep in bundle.dependencies:
                parent_bundle_path = os.path.normpath(
                    self._bundle_store.get_bundle_location(dep.parent_uuid)
                )
                parent_prefix = parent_bundle_path.rstrip(os.sep) + os.sep
                dependency_path = os.path.normpath(
                    os.path.join(parent_bundle_path, dep.parent_path)
                )
                escapes_parent = (
                    dependency_path != parent_bundle_path
                    and not dependency_path.startswith(parent_prefix)
                )
                if escapes_parent or not (
                    os.path.islink(dependency_path) or os.path.exists(dependency_path)
                ):
                    raise Exception(
                        'Invalid dependency %s'
                        % (path_util.safe_join(dep.parent_uuid, dep.parent_path))
                    )

                child_path = os.path.normpath(os.path.join(path, dep.child_path))
                if child_path != path and not child_path.startswith(path_prefix):
                    raise Exception('Invalid key for dependency: %s' % (dep.child_path))

                deps.append((dependency_path, child_path))

            # Clear whatever is currently at the bundle path before rebuilding.
            remove_path(path)

            if len(deps) == 1 and deps[0][1] == path:
                # The lone dependency is the bundle's whole content.
                path_util.copy(deps[0][0], path, follow_symlinks=False)
            else:
                os.mkdir(path)
                for dependency_path, child_path in deps:
                    path_util.copy(dependency_path, child_path, follow_symlinks=False)

            self._upload_manager.update_metadata_and_save(bundle, enforce_disk_quota=True)
            logger.info('Finished making bundle %s', bundle.uuid)
            self._model.update_bundle(bundle, {'state': State.READY})
        except Exception as e:
            logger.info('Failing bundle %s: %s', bundle.uuid, str(e))
            self._model.update_bundle(
                bundle, {'state': State.FAILED, 'metadata': {'failure_message': str(e)}}
            )
        finally:
            with self._make_uuids_lock:
                self._make_uuids.remove(bundle.uuid)
예제 #11
0
    def _transition_from_CLEANING_UP(self, run_state):
        """
        1- delete the container if still existent (waits until it has finished)
        2- for each dependency: release it in the dependency manager and
           remove its path from the bundle folder
        3- If bundle has contents to upload (i.e. was RUNNING at some point),
            move to UPLOADING_RESULTS state
           Otherwise return the result of self.finalize_run(run_state)

        Failures while removing a dependency path are printed and ignored.
        """
        bundle_uuid = run_state.bundle['uuid']
        if run_state.container_id is not None:
            while True:
                try:
                    finished, _, _ = docker_utils.check_finished(
                        run_state.container)
                    if finished:
                        run_state.container.remove(force=True)
                        break
                except docker.errors.APIError:
                    traceback.print_exc()
                # Either the container has not finished yet or the Docker API
                # call failed. Wait before polling again so this loop does not
                # spin at full speed against the Docker daemon.
                time.sleep(1)

        for dep in run_state.bundle['dependencies']:
            self.dependency_manager.release(
                bundle_uuid, (dep['parent_uuid'], dep['parent_path']))

            child_path = os.path.join(run_state.bundle_path, dep['child_path'])
            try:
                remove_path(child_path)
            except Exception:
                # Best effort: a leftover dependency path must not block the
                # state transition.
                traceback.print_exc()

        if run_state.has_contents:
            return run_state._replace(
                stage=LocalRunStage.UPLOADING_RESULTS,
                run_status='Uploading results',
                container=None,
            )
        else:
            return self.finalize_run(run_state)
예제 #12
0
    def _transition_from_CLEANING_UP(self, run_state):
        """
        Clean up after a run and pick the next stage.

        1- If the run has a container id, wait for the container to finish
           and remove it, retrying on Docker API errors
        2- Release every dependency in the dependency manager and remove its
           path from the bundle folder (removal errors are printed, not raised)
        3- If the bundle has contents to upload, move to UPLOADING_RESULTS;
           otherwise return whatever self.finalize_run(run_state) returns
        """
        bundle_uuid = run_state.bundle['uuid']

        if run_state.container_id is not None:
            container_removed = False
            while not container_removed:
                try:
                    finished, _, _ = docker_utils.check_finished(run_state.container)
                    if finished:
                        run_state.container.remove(force=True)
                        container_removed = True
                        continue
                except docker.errors.APIError:
                    traceback.print_exc()
                # Reached when the container is still running or the Docker
                # call failed: pause between polls instead of busy-waiting.
                time.sleep(1)

        for dep in run_state.bundle['dependencies']:
            dependency_key = (dep['parent_uuid'], dep['parent_path'])
            self.dependency_manager.release(bundle_uuid, dependency_key)

            child_path = os.path.join(run_state.bundle_path, dep['child_path'])
            try:
                remove_path(child_path)
            except Exception:
                # Keep going; failing to remove one path should not stop cleanup.
                traceback.print_exc()

        if not run_state.has_contents:
            return self.finalize_run(run_state)

        return run_state._replace(
            stage=LocalRunStage.UPLOADING_RESULTS,
            run_status='Uploading results',
            container=None,
        )
예제 #13
0
    def test_tar_has_files(self):
        """
        Archive the 'files' fixture directory and extract it again, then check
        which entries made it through: 'dir1', 'a.txt' and 'dir1/f1' must be
        present, 'b.txt' and 'dir1/f2' must be absent, and 'a-symlink.txt'
        must still be a symlink.
        """
        tests_root = os.path.dirname(os.path.dirname(__file__))
        fixtures_dir = os.path.join(tests_root, 'files')
        scratch_dir = tempfile.mkdtemp()
        self.addCleanup(lambda: remove_path(scratch_dir))

        extracted_dir = os.path.join(scratch_dir, 'output')
        archive = tar_gzip_directory(fixtures_dir, False, ['f2'], ['f1', 'b.txt'])
        un_tar_directory(archive, extracted_dir, 'gz')

        entries = os.listdir(extracted_dir)
        self.assertIn('dir1', entries)
        self.assertIn('a.txt', entries)
        self.assertNotIn('b.txt', entries)

        kept_file = os.path.join(extracted_dir, 'dir1', 'f1')
        self.assertTrue(os.path.exists(kept_file))
        dropped_file = os.path.join(extracted_dir, 'dir1', 'f2')
        self.assertFalse(os.path.exists(dropped_file))
        symlink = os.path.join(extracted_dir, 'a-symlink.txt')
        self.assertTrue(os.path.islink(symlink))
예제 #14
0
    def test_tar_has_files(self):
        """
        Round-trip the 'files' fixture directory through tar_gzip_directory
        and un_tar_directory and verify the extracted tree: some entries are
        expected to survive, others to be left out, and the symlink to remain
        a symlink.
        """
        source = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'files')
        workspace = tempfile.mkdtemp()
        self.addCleanup(lambda: remove_path(workspace))
        output_root = os.path.join(workspace, 'output')

        def extracted(*parts):
            # Path of an entry below the extraction directory.
            return os.path.join(output_root, *parts)

        un_tar_directory(
            tar_gzip_directory(source, False, ['f2'], ['f1', 'b.txt']),
            output_root,
            'gz',
        )

        top_level = os.listdir(output_root)
        for expected in ('dir1', 'a.txt'):
            self.assertIn(expected, top_level)
        self.assertNotIn('b.txt', top_level)
        self.assertTrue(os.path.exists(extracted('dir1', 'f1')))
        self.assertFalse(os.path.exists(extracted('dir1', 'f2')))
        self.assertTrue(os.path.islink(extracted('a-symlink.txt')))
예제 #15
0
    def _transition_from_PREPARING(self, run_state):
        """
        1- Request the docker image from docker image manager
            - if image is failed, move to CLEANING_UP state
        2- Request the dependencies from dependency manager
            - if any are failed, move to CLEANING_UP state
        3- If all dependencies and docker image are ready:
            - Set up the local filesystem for the run
            - Create symlinks to dependencies
            - Allocate resources and prepare the docker container
            - Start the docker container
        4- If all is successful, move to RUNNING state

        Returns the next run state. While downloads are still in progress the
        stage is left unchanged and only run_status is updated. On any handled
        failure a 'failure_message' is written into run_state.info and the
        stage becomes CLEANING_UP.

        Raises Exception if a dependency's child_path resolves outside the
        bundle path; that exception is not handled here.
        """
        # A killed run skips preparation entirely.
        if run_state.is_killed:
            return run_state._replace(stage=LocalRunStage.CLEANING_UP, container_id=None)

        dependencies_ready = True
        status_messages = []
        bundle_uuid = run_state.bundle['uuid']

        # get dependencies
        for dep in run_state.bundle['dependencies']:
            dependency = (dep['parent_uuid'], dep['parent_path'])
            dependency_state = self.dependency_manager.get(bundle_uuid, dependency)
            if dependency_state.stage == DependencyStage.DOWNLOADING:
                status_messages.append(
                    'Downloading dependency %s: %s done (archived size)'
                    % (dep['child_path'], size_str(dependency_state.size_bytes))
                )
                dependencies_ready = False
            elif dependency_state.stage == DependencyStage.FAILED:
                # Failed to download dependency; -> CLEANING_UP
                # NOTE: the second format argument is an empty string, so the
                # message carries no failure detail after the colon.
                # run_state.info is mutated in place and passed back as-is.
                run_state.info['failure_message'] = 'Failed to download dependency %s: %s' % (
                    dep['child_path'],
                    '',
                )
                return run_state._replace(stage=LocalRunStage.CLEANING_UP, info=run_state.info)

        # get the docker image
        docker_image = run_state.resources['docker_image']
        image_state = self.docker_image_manager.get(docker_image)
        if image_state.stage == DependencyStage.DOWNLOADING:
            status_messages.append(
                'Pulling docker image: ' + (image_state.message or docker_image or "")
            )
            dependencies_ready = False
        elif image_state.stage == DependencyStage.FAILED:
            # Failed to pull image; -> CLEANING_UP
            run_state.info['failure_message'] = image_state.message
            return run_state._replace(stage=LocalRunStage.CLEANING_UP, info=run_state.info)

        # stop proceeding if dependency and image downloads aren't all done
        if not dependencies_ready:
            # Report the most recently appended message and summarise the
            # remaining ones as a count.
            # NOTE(review): no space is inserted before "(and downloading ...".
            status_message = status_messages.pop()
            if status_messages:
                status_message += "(and downloading %d other dependencies and docker images)" % len(
                    status_messages
                )
            return run_state._replace(run_status=status_message)

        # All dependencies ready! Set up directories, symlinks and container. Start container.
        # 1) Set up a directory to store the bundle.
        # remove_path presumably clears anything left at bundle_path so that
        # os.mkdir does not fail on an existing path -- TODO confirm.
        remove_path(run_state.bundle_path)
        os.mkdir(run_state.bundle_path)

        # 2) Set up symlinks
        dependencies = []
        docker_dependencies_path = '/' + bundle_uuid + '_dependencies'
        for dep in run_state.bundle['dependencies']:
            child_path = os.path.normpath(os.path.join(run_state.bundle_path, dep['child_path']))
            # NOTE(review): this is a plain string-prefix check, so a sibling
            # path sharing the bundle_path prefix would also pass -- confirm
            # whether a separator-aware check is wanted.
            if not child_path.startswith(run_state.bundle_path):
                raise Exception('Invalid key for dependency: %s' % (dep['child_path']))

            # The manager's .path is joined onto its dependencies_dir to get
            # the location that is bound into the container.
            dependency_path = self.dependency_manager.get(
                bundle_uuid, (dep['parent_uuid'], dep['parent_path'])
            ).path
            dependency_path = os.path.join(
                self.dependency_manager.dependencies_dir, dependency_path
            )

            docker_dependency_path = os.path.join(docker_dependencies_path, dep['child_path'])

            # child_path becomes a symlink pointing at the in-container
            # location of the dependency.
            os.symlink(docker_dependency_path, child_path)
            # These are turned into docker volume bindings like:
            #   dependency_path:docker_dependency_path:ro
            dependencies.append((dependency_path, docker_dependency_path))

        # 3) Set up container
        # The network is chosen by the run's 'request_network' resource flag.
        if run_state.resources['request_network']:
            docker_network = self.docker_network_external.name
        else:
            docker_network = self.docker_network_internal.name

        try:
            cpuset, gpuset = self.assign_cpu_and_gpu_sets_fn(
                run_state.resources['request_cpus'], run_state.resources['request_gpus']
            )
        except Exception:
            # Any failure to assign CPUs/GPUs is reported as a resource shortage.
            run_state.info['failure_message'] = "Cannot assign enough resources"
            return run_state._replace(stage=LocalRunStage.CLEANING_UP, info=run_state.info)

        # 4) Start container
        try:
            container = docker_utils.start_bundle_container(
                run_state.bundle_path,
                bundle_uuid,
                dependencies,
                run_state.bundle['command'],
                run_state.resources['docker_image'],
                network=docker_network,
                cpuset=cpuset,
                gpuset=gpuset,
                memory_bytes=run_state.resources['request_memory'],
                runtime=self.docker_runtime,
            )
            # The started container is also connected to self.worker_docker_network.
            self.worker_docker_network.connect(container)
        except docker_utils.DockerException as e:
            # Only docker_utils.DockerException is handled; other exception
            # types propagate to the caller.
            run_state.info['failure_message'] = 'Cannot start Docker container: {}'.format(e)
            return run_state._replace(stage=LocalRunStage.CLEANING_UP, info=run_state.info)

        # Container started: record it, the image digest and the assigned
        # cpu/gpu sets, and mark the run as having contents.
        return run_state._replace(
            stage=LocalRunStage.RUNNING,
            start_time=time.time(),
            run_status='Running job in Docker container',
            container_id=container.id,
            container=container,
            docker_image=image_state.digest,
            has_contents=True,
            cpuset=cpuset,
            gpuset=gpuset,
        )
예제 #16
0
 def tearDown(self):
     """Remove the path stored in self.temp_dir."""
     leftover = self.temp_dir
     remove_path(leftover)
예제 #17
0
 def _clear_torque_logs(self, job_handle):
     """
     Remove the 'stdout.<job_handle>' and 'stderr.<job_handle>' paths from
     self._torque_log_dir.
     """
     for stream_prefix in ('stdout.', 'stderr.'):
         log_path = os.path.join(self._torque_log_dir, stream_prefix + job_handle)
         remove_path(log_path)
예제 #18
0
 def tearDown(self):
     """Remove the path stored in self.work_dir."""
     work_dir = self.work_dir
     remove_path(work_dir)
예제 #19
0
    def _transition_from_PREPARING(self, run_state):
        """
        1- Request the docker image from docker image manager
            - if image is failed, move to CLEANING_UP state
        2- Request the dependencies from dependency manager
            - if any are failed, move to CLEANING_UP state
        3- If all dependencies and docker image are ready:
            - Set up the local filesystem for the run
            - Create symlinks to dependencies
            - Allocate resources and prepare the docker container
            - Start the docker container
        4- If all is successful, move to RUNNING state

        Returns the next run state. While any download is still in progress
        only run_status changes. Handled failures store a 'failure_message'
        in run_state.info and move the run to CLEANING_UP.

        Raises Exception when a dependency's child_path resolves outside the
        bundle path; nothing in this method catches it.
        """
        # Killed runs go straight to cleanup.
        if run_state.is_killed:
            return run_state._replace(stage=LocalRunStage.CLEANING_UP,
                                      container_id=None)

        dependencies_ready = True
        status_messages = []
        bundle_uuid = run_state.bundle['uuid']

        # get dependencies
        for dep in run_state.bundle['dependencies']:
            dependency = (dep['parent_uuid'], dep['parent_path'])
            dependency_state = self.dependency_manager.get(
                bundle_uuid, dependency)
            if dependency_state.stage == DependencyStage.DOWNLOADING:
                status_messages.append(
                    'Downloading dependency %s: %s done (archived size)' %
                    (dep['child_path'], size_str(dependency_state.size_bytes)))
                dependencies_ready = False
            elif dependency_state.stage == DependencyStage.FAILED:
                # Failed to download dependency; -> CLEANING_UP
                # NOTE: the detail slot of the message is filled with an empty
                # string. run_state.info is updated in place.
                run_state.info[
                    'failure_message'] = 'Failed to download dependency %s: %s' % (
                        dep['child_path'],
                        '',
                    )
                return run_state._replace(stage=LocalRunStage.CLEANING_UP,
                                          info=run_state.info)

        # get the docker image
        docker_image = run_state.resources['docker_image']
        image_state = self.docker_image_manager.get(docker_image)
        if image_state.stage == DependencyStage.DOWNLOADING:
            status_messages.append('Pulling docker image: ' +
                                   (image_state.message or docker_image or ""))
            dependencies_ready = False
        elif image_state.stage == DependencyStage.FAILED:
            # Failed to pull image; -> CLEANING_UP
            run_state.info['failure_message'] = image_state.message
            return run_state._replace(stage=LocalRunStage.CLEANING_UP,
                                      info=run_state.info)

        # stop proceeding if dependency and image downloads aren't all done
        if not dependencies_ready:
            # The last message appended is shown; the others are only counted.
            # NOTE(review): the suffix is appended without a leading space.
            status_message = status_messages.pop()
            if status_messages:
                status_message += "(and downloading %d other dependencies and docker images)" % len(
                    status_messages)
            return run_state._replace(run_status=status_message)

        # All dependencies ready! Set up directories, symlinks and container. Start container.
        # 1) Set up a directory to store the bundle.
        # remove_path is presumably there so os.mkdir starts from a
        # non-existent path -- TODO confirm its behavior on missing paths.
        remove_path(run_state.bundle_path)
        os.mkdir(run_state.bundle_path)

        # 2) Set up symlinks
        dependencies = []
        docker_dependencies_path = '/' + bundle_uuid + '_dependencies'
        for dep in run_state.bundle['dependencies']:
            child_path = os.path.normpath(
                os.path.join(run_state.bundle_path, dep['child_path']))
            # NOTE(review): string-prefix containment check; a path that only
            # shares the bundle_path prefix would pass too -- confirm intent.
            if not child_path.startswith(run_state.bundle_path):
                raise Exception('Invalid key for dependency: %s' %
                                (dep['child_path']))

            # Join the manager's .path onto its dependencies_dir to get the
            # location that is bound into the container.
            dependency_path = self.dependency_manager.get(
                bundle_uuid, (dep['parent_uuid'], dep['parent_path'])).path
            dependency_path = os.path.join(
                self.dependency_manager.dependencies_dir, dependency_path)

            docker_dependency_path = os.path.join(docker_dependencies_path,
                                                  dep['child_path'])

            # The link at child_path points to the dependency's path as seen
            # inside the container.
            os.symlink(docker_dependency_path, child_path)
            # These are turned into docker volume bindings like:
            #   dependency_path:docker_dependency_path:ro
            dependencies.append((dependency_path, docker_dependency_path))

        # 3) Set up container
        # Pick the network name according to the 'request_network' resource flag.
        if run_state.resources['request_network']:
            docker_network = self.docker_network_external_name
        else:
            docker_network = self.docker_network_internal_name

        try:
            cpuset, gpuset = self.assign_cpu_and_gpu_sets_fn(
                run_state.resources['request_cpus'],
                run_state.resources['request_gpus'])
        except Exception:
            # Every assignment failure is reported with the same message.
            run_state.info[
                'failure_message'] = "Cannot assign enough resources"
            return run_state._replace(stage=LocalRunStage.CLEANING_UP,
                                      info=run_state.info)

        # 4) Start container
        try:
            container = docker_utils.start_bundle_container(
                run_state.bundle_path,
                bundle_uuid,
                dependencies,
                run_state.bundle['command'],
                run_state.resources['docker_image'],
                network=docker_network,
                cpuset=cpuset,
                gpuset=gpuset,
                memory_bytes=run_state.resources['request_memory'],
                runtime=self.docker_runtime,
            )
        except docker_utils.DockerException as e:
            # Only docker_utils.DockerException is handled here; anything else
            # propagates to the caller.
            run_state.info[
                'failure_message'] = 'Cannot start Docker container: {}'.format(
                    e)
            return run_state._replace(stage=LocalRunStage.CLEANING_UP,
                                      info=run_state.info)

        # Record the started container, the image digest and the assigned
        # cpu/gpu sets on the RUNNING state.
        return run_state._replace(
            stage=LocalRunStage.RUNNING,
            start_time=time.time(),
            run_status='Running job in Docker container',
            container_id=container.id,
            container=container,
            docker_image=image_state.digest,
            has_contents=True,
            cpuset=cpuset,
            gpuset=gpuset,
        )