Example 1
def test_run_local_minimum_conf():
    batches_data_dir = tempfile.mkdtemp(prefix="hyperctl-test-batches")

    config_dict = {
        "jobs": [{
            "params": {
                "learning_rate": 0.1
            },
            "execution": {
                "command": "pwd"
            }
        }],
        "daemon": {
            'exit_on_finish': True,
            'port': 8063
        }
    }
    print("Config:")
    print(config_dict)

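    # with `exit_on_finish: True` the daemon's IO loop stops once every job has
    # finished (see the scheduler in Example 7), so run_batch only returns after
    # the whole batch is complete and the assertions below can inspect the result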
    daemon.run_batch(config_dict, batches_data_dir)

    executor_manager = get_context().executor_manager
    batch = get_context().batch
    batch_name = batch.name
    jobs_name = [j.name for j in batch.jobs]

    assert isinstance(executor_manager, LocalExecutorManager)

    assert_batch_finished(batch, batch_name, jobs_name,
                          ShellJob.STATUS_SUCCEED)

    assert_local_job_succeed(batch.jobs)
Example 2
def test_run_local():
    batches_data_dir = tempfile.mkdtemp(prefix="hyperctl-test-batches")
    job_name0 = "eVqNV5Uo0"
    job_name1 = "eVqNV5Uo1"
    batch_name = "eVqNV5Ut"
    local_example_script = Path(
        "hypernets/tests/hyperctl/local-example-script.py").absolute()
    print(local_example_script)
    config_dict = {
        "jobs": [{
            "name": job_name0,
            "params": {
                "learning_rate": 0.1
            },
            "resource": {
                "cpu": 2
            },
            "execution": {
                "command": f"{sys.executable} {local_example_script}",
                "working_dir": "/tmp"
            }
        }, {
            "name": job_name1,
            "params": {
                "learning_rate": 0.1
            },
            "resource": {
                "cpu": 2
            },
            "execution": {
                "command": f"{sys.executable} {local_example_script}",
                "working_dir": "/tmp",
            }
        }],
        "backend": {
            "type": "local",
            "conf": {}
        },
        "name":
        batch_name,
        "daemon": {
            "port": 8061,
            "exit_on_finish": True
        },
        "version":
        2.5
    }

    print("Config:")
    print(config_dict)

    daemon.run_batch(config_dict, batches_data_dir)

    executor_manager = get_context().executor_manager
    assert isinstance(executor_manager, LocalExecutorManager)

    assert_batch_finished(get_context().batch, batch_name,
                          [job_name0, job_name1], ShellJob.STATUS_SUCCEED)
Example 3
def test_run_remote():
    job1_name = "eVqNV5Uo1"
    job2_name = "eVqNV5Uo2"
    batch_name = "eVqNV5Ut"
    jobs_data_dir = tempfile.mkdtemp(prefix="hyperctl-test-jobs")
    config_dict = {
        "jobs": [{
            "name": job1_name,
            "params": {
                "learning_rate": 0.1
            },
            "resource": {},
            "execution": {
                "command": "sleep 3",
                "working_dir": "/tmp",
                "data_dir": jobs_data_dir
            }
        }, {
            "name": job2_name,
            "params": {
                "learning_rate": 0.1
            },
            "resource": {},
            "execution": {
                "command": "sleep 3",
                "working_dir": "/tmp",
                "data_dir": jobs_data_dir
            }
        }],
        "backend": {
            "type": "remote",
            "conf": {
                "machines":
                ssh_utils_test.get_ssh_test_config(use_password=True,
                                                   use_rsa_file=False)
            }
        },
        "name":
        batch_name,
        "daemon": {
            "port": 8060,
            "exit_on_finish": True
        },
        "version":
        2.5
    }

    batches_data_dir = tempfile.mkdtemp(prefix="hyperctl-test-batches")

    daemon.run_batch(config_dict, batches_data_dir)
    executor_manager = get_context().executor_manager
    batch = get_context().batch
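    # ssh_utils_test.get_ssh_test_config is expected to describe two test
    # machines, which the RemoteSSHExecutorManager schedules jobs onto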
    assert isinstance(executor_manager, RemoteSSHExecutorManager)
    assert len(executor_manager.machines) == 2

    assert_batch_finished(batch, batch_name, [job1_name, job2_name],
                          ShellJob.STATUS_SUCCEED)
Example 4
    def post(self, job_name, operation, **kwargs):
        # job_name
        # request_body = self.get_request_as_dict()

        if operation not in [self.OPT_KILL]:
            raise ValueError(f"unknown operation {operation} ")

        # look up the job
        job: ShellJob = dao.get_job_by_name(job_name)
        if job is None:
            raise ValueError(f'job {job_name} does not exist')

        if operation == self.OPT_KILL:  # do kill
            logger.debug(
                f"trying to kill job {job_name}, its status is {job.status}")
            # check job status
            if job.status != job.STATUS_RUNNING:
                raise RuntimeError(
                    f"job {job_name} is not in {job.STATUS_RUNNING} status but is {job.status}"
                )

            # find executor and close
            em: RemoteSSHExecutorManager = get_context().executor_manager
            executor = em.get_executor(job)
            logger.debug(f"find executor {executor} of job {job_name}")
            if executor is not None:
                em.kill_executor(executor)
                logger.debug(f"write failed status file for {job_name}")
                dao.change_job_status(job, job.STATUS_FAILED)
                self.response({"msg": f"{job.name} killed"})
            else:
                raise ValueError(f"no executor found for job {job.name}")
Example 5
def test_kill_local_job():
    batches_data_dir = tempfile.mkdtemp(prefix="hyperctl-test-batches")
    job_name = "eVqNV5Uo0"
    batch_name = "eVqNV5Ut"
    daemon_port = 8062

    config_dict = {
        "jobs": [{
            "name": job_name,
            "params": {
                "learning_rate": 0.1
            },
            "resource": {},
            "execution": {
                "command": "sleep 8",
                "working_dir": "/tmp"
            }
        }],
        "backend": {
            "type": "local",
            "conf": {}
        },
        "name":
        batch_name,
        "daemon": {
            "port": daemon_port,
            "exit_on_finish": True
        },
        "version":
        2.5
    }

    def send_kill_request():
        time.sleep(6)
        runtime.kill_job(f'http://localhost:{daemon_port}', job_name)

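    # kill the job from a background thread while `sleep 8` is still running;
    # the killed job is expected to end up in STATUS_FAILED (checked below)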
    _thread.start_new_thread(send_kill_request, ())

    print("Config:")
    print(config_dict)

    daemon.run_batch(config_dict, batches_data_dir)
    batch = get_context().batch
    assert_batch_finished(get_context().batch, batch_name, [job_name],
                          ShellJob.STATUS_FAILED)
    assert_local_job_finished(batch.jobs)
Example 6
    def _make_run_shell_content(self):
        # default http://localhost:8060
        vars = {
            consts.KEY_ENV_JOB_DATA_DIR: self.job.job_data_dir,
            consts.KEY_ENV_JOB_NAME: self.job.name,
            consts.KEY_ENV_JOB_EXECUTION_WORKING_DIR:
            self.job.execution.working_dir,  # default value
            consts.KEY_TEMPLATE_COMMAND: self.job.execution.command,
            consts.KEY_ENV_DAEMON_PORTAL:
            get_context().batch.daemon_conf.portal,
        }

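        # substitute each `#KEY#` placeholder in the run.sh template with the
        # job-specific value collected above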
        run_shell = str(consts.RUN_SH_TEMPLATE)
        for k, v in vars.items():
            run_shell = run_shell.replace(f"#{k}#", v)

        return run_shell
Example 7
    def schedule(self):
        c = get_context()
        executor_manager = c.executor_manager
        jobs = c.batch.jobs

        # check all jobs finished
        job_finished = c.batch.is_finished()
        if job_finished:
            batch_summary = json.dumps(c.batch.summary())
            logger.info("all jobs finished, stop scheduler:\n" + batch_summary)
            self._timer.stop()  # stop the timer
            if self.exit_on_finish:
                logger.info("stop ioloop")
                ioloop.IOLoop.instance().stop()
            return

        self._check_executors(executor_manager)
        self._dispatch_jobs(executor_manager, jobs)
Example 8
    def run(self):
        # check remote host setting
        daemon_host = get_context().batch.daemon_conf.host
        if consts.HOST_LOCALHOST == daemon_host:
            logger.warning(
                "it is recommended to set an IP address that can be accessed from the remote machines, "
                "but it is currently \"localhost\", so tasks executed on remote machines "
                "may fail because they cannot get information from the daemon server; "
                "you can set it in `daemon.host`")

        # create remote data dir
        execution_data_dir = Path(self.job.execution.data_dir).as_posix()
        with ssh_utils.sftp_client(**self.connections) as sftp_client:
            logger.debug(f"create remote job data dir {execution_data_dir} ")
            ssh_utils.makedirs(sftp_client, execution_data_dir)

        # create run shell file
        fd_run_file, run_file = tempfile.mkstemp(
            prefix=f'hyperctl_run_{self.job.name}_', suffix='.sh')
        os.close(fd_run_file)

        self._write_run_shell_script(run_file)

        # copy file to remote
        with ssh_utils.sftp_client(**self.connections) as sftp_client:
            logger.debug(f'upload {run_file} to {self.job.run_file_path}')
            sftp_client: SFTPClient = sftp_client
            ssh_utils.copy_from_local_to_remote(sftp_client, run_file,
                                                self.job.run_file_path)

        # execute command in async
        self._command_ssh_client = ssh_utils.create_ssh_client(
            **self.connections)
        command = f'sh {self.job.run_file_path}'
        logger.debug(f'execute command {command}')
        self._remote_process = self._command_ssh_client.exec_command(
            command, get_pty=True)
Example 9
def get_job_by_name(job_name):
    for job in get_context().batch.jobs:
        if job.name == job_name:
            return job
    return None
Example 10
def get_jobs():
    return get_context().batch.jobs