def test_run_local_minimum_conf():
    batches_data_dir = tempfile.mkdtemp(prefix="hyperctl-test-batches")
    config_dict = {
        "jobs": [{
            "params": {"learning_rate": 0.1},
            "execution": {"command": "pwd"}
        }],
        "daemon": {
            "exit_on_finish": True,
            "port": 8063
        }
    }
    print("Config:")
    print(config_dict)

    daemon.run_batch(config_dict, batches_data_dir)

    executor_manager = get_context().executor_manager
    batch = get_context().batch
    batch_name = batch.name
    jobs_name = [j.name for j in batch.jobs]

    assert isinstance(executor_manager, LocalExecutorManager)
    assert_batch_finished(batch, batch_name, jobs_name, ShellJob.STATUS_SUCCEED)
    assert_local_job_succeed(batch.jobs)

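# A minimal sketch of what the `assert_batch_finished` helper used in these tests
# might check. The real helper lives elsewhere in the test module; the checks
# below are assumptions inferred from how the tests call it (batch name, job
# names, and a terminal job status).
def _assert_batch_finished_sketch(batch, expected_batch_name, expected_job_names, expected_status):
    assert batch.name == expected_batch_name
    assert {j.name for j in batch.jobs} == set(expected_job_names)
    for job in batch.jobs:
        # every job is expected to end in the given terminal status
        assert job.status == expected_status
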
def test_run_local():
    batches_data_dir = tempfile.mkdtemp(prefix="hyperctl-test-batches")

    job_name0 = "eVqNV5Uo0"
    job_name1 = "eVqNV5Uo1"
    batch_name = "eVqNV5Ut"

    local_example_script = Path("hypernets/tests/hyperctl/local-example-script.py").absolute()
    print(local_example_script)

    config_dict = {
        "jobs": [{
            "name": job_name0,
            "params": {"learning_rate": 0.1},
            "resource": {"cpu": 2},
            "execution": {
                "command": f"{sys.executable} {local_example_script}",
                "working_dir": "/tmp"
            }
        }, {
            "name": job_name1,
            "params": {"learning_rate": 0.1},
            "resource": {"cpu": 2},
            "execution": {
                "command": f"{sys.executable} {local_example_script}",
                "working_dir": "/tmp"
            }
        }],
        "backend": {
            "type": "local",
            "conf": {}
        },
        "name": batch_name,
        "daemon": {
            "port": 8061,
            "exit_on_finish": True
        },
        "version": 2.5
    }
    print("Config:")
    print(config_dict)

    daemon.run_batch(config_dict, batches_data_dir)

    executor_manager = get_context().executor_manager
    assert isinstance(executor_manager, LocalExecutorManager)
    assert_batch_finished(get_context().batch, batch_name, [job_name0, job_name1], ShellJob.STATUS_SUCCEED)

def test_run_remote():
    job1_name = "eVqNV5Uo1"
    job2_name = "eVqNV5Uo2"
    batch_name = "eVqNV5Ut"

    jobs_data_dir = tempfile.mkdtemp(prefix="hyperctl-test-jobs")
    config_dict = {
        "jobs": [{
            "name": job1_name,
            "params": {"learning_rate": 0.1},
            "resource": {},
            "execution": {
                "command": "sleep 3",
                "working_dir": "/tmp",
                "data_dir": jobs_data_dir
            }
        }, {
            "name": job2_name,
            "params": {"learning_rate": 0.1},
            "resource": {},
            "execution": {
                "command": "sleep 3",
                "working_dir": "/tmp",
                "data_dir": jobs_data_dir
            }
        }],
        "backend": {
            "type": "remote",
            "conf": {
                "machines": ssh_utils_test.get_ssh_test_config(use_password=True, use_rsa_file=False)
            }
        },
        "name": batch_name,
        "daemon": {
            "port": 8060,
            "exit_on_finish": True
        },
        "version": 2.5
    }

    batches_data_dir = tempfile.mkdtemp(prefix="hyperctl-test-batches")
    daemon.run_batch(config_dict, batches_data_dir)

    executor_manager = get_context().executor_manager
    batch = get_context().batch
    assert isinstance(executor_manager, RemoteSSHExecutorManager)
    assert len(executor_manager.machines) == 2
    assert_batch_finished(batch, batch_name, [job1_name, job2_name], ShellJob.STATUS_SUCCEED)

def post(self, job_name, operation, **kwargs):
    # request_body = self.get_request_as_dict()
    if operation not in [self.OPT_KILL]:
        raise ValueError(f"unknown operation: {operation}")

    # look up the job
    job: ShellJob = dao.get_job_by_name(job_name)
    if job is None:
        raise ValueError(f"job {job_name} does not exist")

    if operation == self.OPT_KILL:
        logger.debug(f"trying to kill job {job_name}, its status is {job.status}")

        # only running jobs can be killed
        if job.status != job.STATUS_RUNNING:
            raise RuntimeError(
                f"job {job_name} is not in {job.STATUS_RUNNING} status but {job.status}")

        # find the executor of the job and close it
        em: RemoteSSHExecutorManager = get_context().executor_manager
        executor = em.get_executor(job)
        logger.debug(f"found executor {executor} of job {job_name}")
        if executor is not None:
            em.kill_executor(executor)
            logger.debug(f"write failed status file for {job_name}")
            dao.change_job_status(job, job.STATUS_FAILED)
            self.response({"msg": f"{job.name} killed"})
        else:
            raise ValueError(f"no executor found for job {job.name}")

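# A hedged client-side sketch of calling this handler over HTTP with only the
# standard library. The route (".../hyperctl/api/job/<name>/kill") and the empty
# JSON payload are assumptions for illustration; the tests below go through
# `runtime.kill_job` instead of building the request by hand.
def _kill_job_request_sketch(daemon_portal, job_name):
    import json
    from urllib import request

    url = f"{daemon_portal}/hyperctl/api/job/{job_name}/kill"  # assumed route
    req = request.Request(url,
                          data=json.dumps({}).encode("utf-8"),
                          headers={"Content-Type": "application/json"},
                          method="POST")
    with request.urlopen(req) as resp:
        return json.loads(resp.read())
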
def test_kill_local_job():
    batches_data_dir = tempfile.mkdtemp(prefix="hyperctl-test-batches")

    job_name = "eVqNV5Uo0"
    batch_name = "eVqNV5Ut"
    daemon_port = 8062

    config_dict = {
        "jobs": [{
            "name": job_name,
            "params": {"learning_rate": 0.1},
            "resource": {},
            "execution": {
                "command": "sleep 8",
                "working_dir": "/tmp"
            }
        }],
        "backend": {
            "type": "local",
            "conf": {}
        },
        "name": batch_name,
        "daemon": {
            "port": daemon_port,
            "exit_on_finish": True
        },
        "version": 2.5
    }

    def send_kill_request():
        # the job runs `sleep 8`; ask the daemon to kill it after 6 seconds,
        # while it is still running
        time.sleep(6)
        runtime.kill_job(f'http://localhost:{daemon_port}', job_name)

    _thread.start_new_thread(send_kill_request, ())

    print("Config:")
    print(config_dict)

    daemon.run_batch(config_dict, batches_data_dir)

    batch = get_context().batch
    assert_batch_finished(get_context().batch, batch_name, [job_name], ShellJob.STATUS_FAILED)
    assert_local_job_finished(batch.jobs)

def _make_run_shell_content(self):
    # substitute job and daemon settings into the run-shell template;
    # the daemon portal defaults to http://localhost:8060
    placeholders = {
        consts.KEY_ENV_JOB_DATA_DIR: self.job.job_data_dir,
        consts.KEY_ENV_JOB_NAME: self.job.name,
        consts.KEY_ENV_JOB_EXECUTION_WORKING_DIR: self.job.execution.working_dir,
        consts.KEY_TEMPLATE_COMMAND: self.job.execution.command,
        consts.KEY_ENV_DAEMON_PORTAL: get_context().batch.daemon_conf.portal,
    }
    run_shell = str(consts.RUN_SH_TEMPLATE)
    for k, v in placeholders.items():
        run_shell = run_shell.replace(f"#{k}#", v)
    return run_shell

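# A small, self-contained illustration of the `#KEY#` substitution performed in
# `_make_run_shell_content` above. The template text and placeholder names here
# are made-up stand-ins for `consts.RUN_SH_TEMPLATE` and the `consts.KEY_*`
# constants, not the project's real values.
def _render_run_shell_sketch():
    template = (
        "export HYPERCTL_JOB_NAME=#KEY_ENV_JOB_NAME#\n"
        "cd #KEY_ENV_JOB_EXECUTION_WORKING_DIR#\n"
        "#KEY_TEMPLATE_COMMAND#\n"
    )
    values = {
        "KEY_ENV_JOB_NAME": "eVqNV5Uo0",
        "KEY_ENV_JOB_EXECUTION_WORKING_DIR": "/tmp",
        "KEY_TEMPLATE_COMMAND": "pwd",
    }
    rendered = template
    for k, v in values.items():
        # same replacement scheme as the method above
        rendered = rendered.replace(f"#{k}#", v)
    return rendered
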
def schedule(self):
    c = get_context()
    executor_manager = c.executor_manager
    jobs = c.batch.jobs

    # if all jobs have finished, stop scheduling
    if c.batch.is_finished():
        batch_summary = json.dumps(c.batch.summary())
        logger.info("all jobs finished, stop scheduler:\n" + batch_summary)
        self._timer.stop()  # stop the periodic timer
        if self.exit_on_finish:
            logger.info("stop ioloop")
            ioloop.IOLoop.instance().stop()
        return

    self._check_executors(executor_manager)
    self._dispatch_jobs(executor_manager, jobs)

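# A minimal sketch of how `schedule` could be driven periodically with Tornado,
# assuming `self._timer` is a `tornado.ioloop.PeriodicCallback`. The interval
# and wiring below are illustrative, not the project's actual setup.
class _SchedulerLoopSketch:
    def __init__(self, schedule_callback, interval_ms=5000):
        from tornado import ioloop
        self._ioloop = ioloop.IOLoop.instance()
        self._timer = ioloop.PeriodicCallback(schedule_callback, interval_ms)

    def start(self):
        self._timer.start()   # invoke `schedule` every interval_ms milliseconds
        self._ioloop.start()  # blocks until IOLoop.stop() is called (see above)
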
def run(self):
    # warn if the daemon host will not be reachable from the remote machines
    daemon_host = get_context().batch.daemon_conf.host
    if consts.HOST_LOCALHOST == daemon_host:
        logger.warning(
            "the daemon host is currently \"localhost\"; it is recommended to set an IP "
            "address that the remote machines can reach (via `daemon.host`), otherwise "
            "tasks executed on remote machines may fail because they cannot contact the "
            "daemon server")

    # create the remote data dir
    execution_data_dir = Path(self.job.execution.data_dir).as_posix()
    with ssh_utils.sftp_client(**self.connections) as sftp_client:
        logger.debug(f"create remote job data dir {execution_data_dir}")
        ssh_utils.makedirs(sftp_client, execution_data_dir)

    # write the run shell script to a local temp file
    fd_run_file, run_file = tempfile.mkstemp(prefix=f'hyperctl_run_{self.job.name}_', suffix='.sh')
    os.close(fd_run_file)
    self._write_run_shell_script(run_file)

    # copy the script to the remote machine
    with ssh_utils.sftp_client(**self.connections) as sftp_client:
        logger.debug(f'upload {run_file} to {self.job.run_file_path}')
        sftp_client: SFTPClient = sftp_client
        ssh_utils.copy_from_local_to_remote(sftp_client, run_file, self.job.run_file_path)

    # execute the command asynchronously
    self._command_ssh_client = ssh_utils.create_ssh_client(**self.connections)
    command = f'sh {self.job.run_file_path}'
    logger.debug(f'execute command {command}')
    self._remote_process = self._command_ssh_client.exec_command(command, get_pty=True)

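# A hedged sketch of checking whether the remote process started above has
# finished: paramiko's `exec_command` returns an (stdin, stdout, stderr) tuple,
# and the channel attached to stdout exposes the exit status. How the real
# executor tracks completion may differ.
def _remote_process_finished_sketch(remote_process):
    _stdin, stdout, _stderr = remote_process
    channel = stdout.channel
    if channel.exit_status_ready():
        return True, channel.recv_exit_status()  # exit code of `sh run.sh`
    return False, None
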
def get_job_by_name(job_name):
    for job in get_context().batch.jobs:
        if job.name == job_name:
            return job
    return None

def get_jobs():
    return get_context().batch.jobs

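# Illustrative-only usage of the dao helpers above, e.g. from a request handler
# that reports a job's status; `job.name` and `job.status` are attributes the
# surrounding code already relies on.
def _job_status_sketch(job_name):
    job = get_job_by_name(job_name)
    if job is None:
        # unknown job: report the names the batch does know about
        return {"known_jobs": [j.name for j in get_jobs()]}
    return {"name": job.name, "status": job.status}
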