def _retrieve_logs(self):
    from foundations_contrib.global_state import config_manager
    from foundations_core_cli.job_submission.config import load
    from foundations_internal.change_directory import ChangeDirectory
    import os

    arguments = self._cli.arguments()
    env_name = arguments.scheduler_config
    job_id = arguments.job_id

    # Load the scheduler configuration relative to the current working directory
    current_directory = os.getcwd()
    with ChangeDirectory(current_directory):
        load(arguments.scheduler_config or "scheduler")

    # Resolve the deployment class registered for this scheduler
    job_deployment_class = config_manager["deployment_implementation"][
        "deployment_type"
    ]
    job_deployment = job_deployment_class(job_id, None, None)

    job_status = job_deployment.get_job_status()

    if job_status is None:
        self._cli._fail_with_message(
            "Error: Job `{}` does not exist for environment `{}`".format(
                job_id, env_name
            )
        )
    elif job_status == "queued":
        self._cli._fail_with_message(
            "Error: Job `{}` is queued and has not produced any logs".format(job_id)
        )
    else:
        logs = job_deployment.get_job_logs()
        print(logs)
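# `_fail_with_message` is called throughout these handlers but is not defined in
# this section. A minimal sketch of what it plausibly does, assuming the usual CLI
# convention of printing the message and exiting with a non-zero status (the same
# print-and-exit(1) behaviour the `load` tests at the end of this section assert
# for missing configs). The body below is an assumption, not the actual method:
def _fail_with_message(self, message):
    import sys

    print(message)
    sys.exit(1)  # assumed exit code, mirroring the exit(1) asserted in the load tests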
def test_loads_config_into_config_manager_when_config_present(self):
    from foundations_local_docker_scheduler_plugin.config.scheduler import translate

    self._set_up_config()
    load(self.config_name)
    self.mock_config_listing.update_config_manager_with_config.assert_called_with(
        self.config_name, translate
    )
def _stop(self):
    from foundations_contrib.global_state import config_manager
    from foundations_core_cli.job_submission.config import load
    from foundations_internal.change_directory import ChangeDirectory
    import os

    arguments = self._cli.arguments()
    env_name = arguments.scheduler_config
    job_id = arguments.job_id

    current_directory = os.getcwd()
    with ChangeDirectory(current_directory):
        load(arguments.scheduler_config or "scheduler")

    job_deployment_class = config_manager["deployment_implementation"][
        "deployment_type"
    ]
    job_deployment = job_deployment_class(job_id, None, None)

    try:
        job_status = job_deployment.get_job_status()

        if job_status is None:
            self._cli._fail_with_message(
                "Error: Job `{}` does not exist for environment `{}`".format(
                    job_id, env_name
                )
            )
        elif job_status == "queued":
            self._cli._fail_with_message(
                "Error: Job `{}` is queued and cannot be stopped".format(job_id)
            )
        elif job_status == "completed":
            self._cli._fail_with_message(
                "Error: Job `{}` is completed and cannot be stopped".format(job_id)
            )
        else:
            if job_deployment.stop_running_job():
                print("Stopped running job {}".format(job_id))
            else:
                print("Error stopping job {}".format(job_id))
    except AttributeError:
        # Deployments that do not implement stop_running_job raise AttributeError
        print("The specified scheduler does not support this functionality")
def _delete_job(self):
    from foundations_contrib.global_state import config_manager
    from foundations_core_cli.job_submission.config import load
    from foundations_internal.change_directory import ChangeDirectory
    import os

    arguments = self._cli.arguments()
    env_name = arguments.scheduler_config
    job_id = arguments.job_id

    current_directory = os.getcwd()
    with ChangeDirectory(current_directory):
        load(arguments.scheduler_config or "scheduler")

    job_deployment_class = config_manager["deployment_implementation"][
        "deployment_type"
    ]
    job_deployment = job_deployment_class(job_id, None, None)

    job_status = job_deployment.get_job_status()

    if job_status is None:
        self._cli._fail_with_message(
            "Error: Job `{}` does not exist for environment `{}`".format(
                job_id, env_name
            )
        )
    elif job_status in ("queued", "running", "pending"):
        self._cli._fail_with_message(
            "Error: Job `{}` has status `{}` and cannot be deleted".format(
                job_id, job_status
            )
        )
    else:
        # cancel_jobs returns a mapping of job id to a success flag
        if job_deployment.cancel_jobs([job_id])[job_id]:
            print(f"Job {job_id} successfully deleted")
        else:
            print(
                f"Could not completely delete job {job_id}. Please make sure that the job bundle exists under ~/.foundations/job_data/"
            )
def _clear_queue(self):
    from foundations_contrib.global_state import config_manager
    from foundations_core_cli.job_submission.config import load
    from foundations_internal.change_directory import ChangeDirectory
    import os

    arguments = self._cli.arguments()

    current_directory = os.getcwd()
    with ChangeDirectory(current_directory):
        load(arguments.scheduler_config or "scheduler")

    job_deployment_class = config_manager["deployment_implementation"][
        "deployment_type"
    ]

    try:
        # clear_queue is invoked on the class itself; no job-specific instance is needed
        num_jobs_dequeued = job_deployment_class.clear_queue()
        print("Removed {} job(s) from queue".format(num_jobs_dequeued))
    except AttributeError:
        print("The specified scheduler does not support this functionality")
def _retrieve_artifacts(self):
    from foundations_contrib.global_state import config_manager
    from foundations_core_cli.job_submission.config import load
    from foundations_internal.change_directory import ChangeDirectory
    import os

    arguments = self._cli.arguments()
    env_name = arguments.scheduler_config
    job_id = arguments.job_id

    current_directory = os.getcwd()
    # Default the save directory to ./<job_id> when none is given
    if arguments.save_dir is None:
        arguments.save_dir = os.path.join(current_directory, str(job_id))

    with ChangeDirectory(current_directory):
        load(arguments.scheduler_config or "scheduler")

    job_deployment_class = config_manager["deployment_implementation"][
        "deployment_type"
    ]
    job_deployment = job_deployment_class(job_id, None, None)

    job_status = job_deployment.get_job_status()

    if job_status is None:
        self._cli._fail_with_message(
            "Error: Job `{}` does not exist for environment `{}`".format(
                job_id, env_name
            )
        )
    else:
        if job_deployment.get_job_archive():
            print(f"Successfully retrieved Job {job_id} from archive store")
        else:
            print(f"Error: Could not download Job {job_id}")
def submit(arguments):
    from foundations_core_cli.job_submission.config import load
    from foundations_core_cli.job_submission.deployment import deploy
    from foundations_core_cli.job_submission.logs import stream_job_logs
    from foundations_internal.change_directory import ChangeDirectory
    from foundations_contrib.global_state import config_manager, log_manager
    from foundations_contrib.set_job_resources import set_job_resources
    from jsonschema import validate
    import os
    import os.path
    import yaml  # required below to parse job.config.yaml

    current_directory = os.getcwd()
    with ChangeDirectory(arguments.job_directory or current_directory):
        load(arguments.scheduler_config or 'scheduler')

        # Read optional per-job configuration from the job directory
        job_config = {}
        if os.path.exists('job.config.yaml'):
            with open('job.config.yaml') as file:
                job_config = yaml.load(file.read(), Loader=yaml.FullLoader)

        # validate(instance=job_config, schema=_job_schema)

        job_resource_args = {}

        if 'log_level' in job_config:
            config_manager['log_level'] = job_config['log_level']
        if 'worker' in job_config:
            config_manager['worker_container_overrides'].update(job_config['worker'])
        if 'num_gpus' in job_config:
            job_resource_args['num_gpus'] = job_config['num_gpus']
        if 'ram' in job_config:
            job_resource_args['ram'] = job_config['ram']

        logger = log_manager.get_logger(__name__)

        if arguments.command:
            config_manager['worker_container_overrides']['args'] = arguments.command
            if not os.path.exists(arguments.command[0]):
                logger.warning(
                    f"Your command '{arguments.command[0]}' is not an existing file in your current directory. "
                    "If you are using Atlas's advanced custom docker image functionality and know what you are doing, "
                    "you can ignore this message."
                )
        else:
            logger.warning('No command was specified.')

        # Command-line resource arguments take precedence over job.config.yaml
        if arguments.num_gpus is not None:
            job_resource_args['num_gpus'] = arguments.num_gpus
        if arguments.ram is not None:
            job_resource_args['ram'] = arguments.ram
        set_job_resources(**job_resource_args)

        from foundations.global_state import current_foundations_context

        try:
            # Remember the current job id so it can be restored after deployment
            cur_job_id = current_foundations_context().pipeline_context().file_name
        except ValueError:
            cur_job_id = None

        deployment = deploy(
            arguments.project_name or job_config.get('project_name'),
            arguments.entrypoint or job_config.get('entrypoint'),
            arguments.params or job_config.get('params'))

        if arguments.stream_job_logs:
            try:
                stream_job_logs(deployment)
            except KeyboardInterrupt:
                pass

        if cur_job_id is not None:
            current_foundations_context().pipeline_context().file_name = cur_job_id

        return deployment
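# For reference, the shape `submit` expects from job.config.yaml, shown as the dict
# that yaml.load would produce. Only the keys read above (log_level, worker,
# num_gpus, ram) and the fallbacks passed to deploy (project_name, entrypoint,
# params) are consumed by submit; all values here are illustrative assumptions:
example_job_config = {
    'project_name': 'my-project',                # fallback for arguments.project_name
    'entrypoint': 'main.py',                     # fallback for arguments.entrypoint
    'params': {'learning_rate': 0.001},          # fallback for arguments.params
    'log_level': 'INFO',                         # copied into config_manager['log_level']
    'worker': {'image': 'custom/image:latest'},  # merged into worker_container_overrides
    'num_gpus': 1,                               # forwarded to set_job_resources
    'ram': 4.0,                                  # forwarded to set_job_resources
}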
def test_does_not_print_error_when_config_present(self):
    self._set_up_config()
    load(self.config_name)
    self.print_mock.assert_not_called()
def test_prints_warning_message_when_config_missing(self):
    load(self.config_name)
    self.print_mock.assert_called_with(
        f"Could not find submission configuration with name: `{self.config_name}`"
    )
def test_exits_when_config_missing(self):
    load(self.config_name)
    self.exit_mock.assert_called_with(1)
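# A minimal sketch of the `load` function these tests exercise, inferred from the
# assertions above: when the named submission config cannot be found, a warning is
# printed and the process exits with status 1; when it is found, it is pushed into
# the config manager through the scheduler's `translate` hook. The
# `TypedConfigListing('submission')` collaborator and its `config_path` method are
# assumptions matching `self.mock_config_listing` in the tests; only the printed
# message, the exit code, and the update_config_manager_with_config(name, translate)
# call are pinned down by the assertions:
def load(config_name):
    from foundations_core_cli.typed_config_listing import TypedConfigListing  # assumed import path
    from foundations_local_docker_scheduler_plugin.config.scheduler import translate

    config_listing = TypedConfigListing('submission')
    if config_listing.config_path(config_name) is None:
        print(f'Could not find submission configuration with name: `{config_name}`')
        exit(1)
    else:
        config_listing.update_config_manager_with_config(config_name, translate)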