def test_start_worker(self):
    """End-to-end worker lifecycle: start workers, run tasks, copy a file, stop."""
    client.initialize_server_files("dist_job_mgr.json_model.ModelAdapter",
                                   config_dir=self.dir)
    conn = client.get_local_connection(self.dir)
    node = conn.create_node(getpass.getuser(), 20000, "localhost",
                            public_ip="127.0.0.1")
    job = conn.start_job("test", JobType.ONE_TIME_JOB, 1,
                         "test of start worker",
                         requested_nodes=["localhost"])
    (status, results) = conn.run_task_on_all_nodes_of_job(job, "StartWorker",
                                                          "StartWorker")
    self.assertEqual(status, TaskStatus.TASK_SUCCESSFUL)
    # Run a simple command task against the single node directly.
    task_res = conn.run_command_task(job, node, "ls", ["/bin/ls"])
    self.assertEqual(task_res.status, TaskStatus.TASK_SUCCESSFUL)
    # The same command, fanned out to all nodes of the job.
    (status, results) = conn.run_task_on_all_nodes_of_job(job, "Command", "ls",
                                                          ["/bin/ls"], cwd=None)
    self.assertEqual(status, TaskStatus.TASK_SUCCESSFUL)
    (status, results) = conn.run_task_on_all_nodes_of_job(job, "StopWorker",
                                                          "StopWorker")
    self.assertEqual(status, TaskStatus.TASK_SUCCESSFUL)
    # Copy this test file into the config dir and verify it arrived.
    copy_dest = os.path.join(self.dir, os.path.basename(__file__))
    task_res = conn.run_copy_task(job, node, "copy files", __file__, copy_dest)
    self.assertEqual(task_res.status, TaskStatus.TASK_SUCCESSFUL)
    self.assertTrue(os.path.exists(copy_dest))
    conn.stop_job(job, JobStatus.JOB_SUCCESSFUL, comment="done")
    conn.delete_job(job)
    conn.delete_node("localhost")
def test_cleanup_crashed_jobs(self):
    """Simulate a crashed job coordinator and check cleanup-crashed-jobs deletes it."""
    rc = djmctl.main(["add-node", "--server-config-dir=%s" % self.dir,
                      "--private-ip=127.0.0.1", "--bootstrap", "localhost"])
    self.assertEqual(rc, 0)
    conn = client.get_local_connection(self.dir)
    job_id = conn.start_job("test", JobType.ONE_TIME_JOB, 1,
                            "this is a test job",
                            requested_nodes=["localhost"])
    conn.run_task(job_id, "Test", "test task", "localhost")
    # Reach below the client abstraction -- it is the only good way to get a
    # task that has been started but never finished, which is exactly what a
    # crashed coordinator leaves behind.
    model = conn.model
    model.begin_transaction()
    node = model.find_node(name="localhost")
    job = model.query_jobs(job_id=job_id)[0]
    model.create_task("bad task", job, "Test", node,
                      "this task will not be stopped")
    model.commit_transaction()
    # Simulate the coordinator process crashing by dropping its job lock.
    lock = conn.job_locks[job_id]
    lock.release()
    del conn.job_locks[job_id]
    rc = djmctl.main(["cleanup-crashed-jobs",
                      "--server-config-dir=%s" % self.dir,
                      "--debug", "--delete"])
    self.assertEqual(rc, 0)
    # The job was deleted, so listing it by id must now fail.
    rc = djmctl.main(["list-jobs", "--server-config-dir=%s" % self.dir,
                      "--debug", "--job-id=%s" % job_id])
    self.assertEqual(rc, 1)
def test_bootstrap(self):
    """Bootstrap a worker onto this machine (twice, to check idempotence) and
    verify the installed worker can be started and stopped."""
    djm_path = os.path.abspath(os.path.expanduser(WORKER_DIR))
    if os.path.exists(djm_path):
        # There is an existing worker install: stop it and remove it so the
        # bootstrap starts from a clean slate.
        worker_exe = os.path.join(djm_path, "python/bin/djm-worker")
        if os.path.exists(worker_exe):
            # BUGFIX: this used shell=True with an argument list. On POSIX,
            # shell=True with a list runs only the first element as the
            # command and "stop" is never passed, so the worker was not
            # actually stopped. Run the list directly (shell=False).
            subprocess.check_call([worker_exe, "stop"])
        shutil.rmtree(djm_path)
    client.initialize_server_files("dist_job_mgr.json_model.ModelAdapter",
                                   config_dir=self.dir,
                                   djm_package=self.djm_package)
    c = client.get_local_connection(self.dir)
    n = c.create_node(getpass.getuser(), 20000, "localhost",
                      public_ip="127.0.0.1")
    j = c.start_job("test", JobType.ONE_TIME_JOB, 1, "test of start worker",
                    requested_nodes=["localhost"])
    tr = c.run_task(j, "BootstrapWorker", "bootstrap test", "localhost")
    self.assertEqual(tr.status, TaskStatus.TASK_SUCCESSFUL)
    # Bootstrapping again with the worker already present must also succeed.
    tr = c.run_task(j, "BootstrapWorker",
                    "bootstrap test with worker already present", "localhost")
    self.assertEqual(tr.status, TaskStatus.TASK_SUCCESSFUL)
    worker_script = os.path.join(djm_path, "python/bin/djm-worker")
    self.assertTrue(os.path.exists(worker_script),
                    "Worker script %s does not exist" % worker_script)
    tr = c.run_task(j, "StartWorker",
                    "check that bootstrapped worker is startable", "localhost")
    self.assertEqual(tr.status, TaskStatus.TASK_SUCCESSFUL)
    tr = c.run_task(j, "StopWorker", "cleanup", "localhost")
    self.assertEqual(tr.status, TaskStatus.TASK_SUCCESSFUL)
    c.stop_job(j, JobStatus.JOB_SUCCESSFUL, comment="done")
def test_delete_job_with_force(self):
    """Check that delete-job --force removes a job whose tasks never finished."""
    rc = djmctl.main(["add-node", "--server-config-dir=%s" % self.dir,
                      "--private-ip=127.0.0.1", "--bootstrap", "localhost"])
    self.assertEqual(rc, 0)
    conn = client.get_local_connection(self.dir)
    job_id = conn.start_job("test", JobType.ONE_TIME_JOB, 1,
                            "this is a test job",
                            requested_nodes=["localhost"])
    conn.run_task(job_id, "Test", "test task", "localhost")
    # Reach below the client abstraction -- it is the only good way to get a
    # task that has been started but never finished.
    model = conn.model
    model.begin_transaction()
    node = model.find_node(name="localhost")
    job = model.query_jobs(job_id=job_id)[0]
    model.create_task("bad task", job, "Test", node,
                      "this task will not be stopped")
    model.commit_transaction()
    # We must release our own lock on the job before forcing the delete.
    lock = conn.job_locks[job_id]
    lock.release()
    rc = djmctl.main(["delete-job", "--server-config-dir=%s" % self.dir,
                      "--force", job_id])
    self.assertEqual(rc, 0)
    rc = djmctl.main(["list-jobs", "--server-config-dir=%s" % self.dir])
    self.assertEqual(rc, 0)
def run(self, options, args):
    """Create the 'master' node for the local machine, optionally
    bootstrapping a worker onto it. Returns 0 on success, 1 on failure."""
    info = utils.get_machine_info()
    conn = client.get_local_connection(options.server_config_dir)
    node = conn.create_node(info["username"], options.port, name="master",
                            hostname=info["hostname"],
                            public_ip=info["public_ip"],
                            private_ip=info["private_ip"],
                            pool_name=options.pool)
    if not options.bootstrap:
        print("Successfully created node 'master'")
        return 0
    print("Bootstrapping worker on node...")
    # Run the bootstrap as a one-node, one-time job against the new node.
    job = conn.start_job("Bootstrap-node", JobType.ONE_TIME_JOB, 1,
                         "Bootstrapping %s" % "master",
                         requested_nodes=["master"],
                         node_pool_name=options.pool)
    task_res = conn.run_task(job, "BootstrapWorker",
                             "bootstrap %s" % "master", "master")
    conn.stop_job(job, task_result_to_job_status(task_res))
    conn.delete_job(job)
    if task_res.status != TaskStatus.TASK_SUCCESSFUL:
        # Roll back the node creation if the bootstrap did not take.
        conn.delete_node(node)
        print("Bootstrap failed, node not created")
        return 1
    print("Successfully created and bootstrapped node 'master'")
    return 0
def run(self, options, args):
    """Create a new node from the command-line options, probing it first to
    verify it is reachable, and optionally bootstrap a worker onto it.

    Returns 0 on success, 1 if the node is unreachable or the bootstrap fails.
    """
    c = client.get_local_connection(options.server_config_dir)
    name = args[0] if len(args) == 1 else None
    # Pick the address used to probe the machine: private ip first, then
    # public ip, then hostname.
    if options.private_ip:
        test_address = options.private_ip
    elif options.public_ip:
        test_address = options.public_ip
    else:
        test_address = options.hostname
    if ((test_address in ["127.0.0.1", "localhost", task.HOSTNAME,
                          task.LOCAL_IP])
            and task.USERNAME == options.os_user):
        # Target is this machine and this user: no remote probe needed.
        home_dir = os.path.abspath(os.path.expanduser("~/"))
    else:
        home_dir = utils.get_remote_home_directory(options.os_user,
                                                   test_address)
    if home_dir is None:  # was "== None"; identity check is the idiom
        print("Unable to reach node via address %s" % test_address)
        return 1
    private_ip = options.private_ip
    if (not private_ip) and (not options.no_check_for_private_ip):
        private_ip = utils.look_for_private_ip(options.os_user, test_address)
        # BUGFIX: only report an address when one was actually found;
        # previously this printed "Found private network address None"
        # when the lookup came up empty.
        if private_ip:
            print("Found private network address %s" % private_ip)
    n = c.create_node(options.os_user, options.port, name=name,
                      hostname=options.hostname,
                      public_ip=options.public_ip,
                      private_ip=private_ip,
                      pool_name=options.pool)
    print("Successfully created node %s" % n)
    if options.bootstrap:
        print("Bootstrapping worker on node...")
        j = c.start_job("Bootstrap-node", JobType.ONE_TIME_JOB, 1,
                        "Bootstrapping %s" % n,
                        requested_nodes=[n],
                        node_pool_name=options.pool)
        tr = c.run_task(j, "BootstrapWorker", "bootstrap %s" % n, n)
        c.stop_job(j, task_result_to_job_status(tr))
        c.delete_job(j)
        if tr.status != TaskStatus.TASK_SUCCESSFUL:
            # Roll back the node creation when the bootstrap fails.
            c.delete_node(n)
            print("Bootstrap failed for node %s, node not created" % n)
            return 1
        else:
            print("Successfully created and bootstrapped node %s" % n)
    else:
        print("Successfully created node %s" % n)
    return 0
def run(self, options, args):
    """Print the name, current size, and available size of every node pool."""
    conn = client.get_local_connection(options.server_config_dir)
    for pool in conn.list_pools():
        print("%s current_size: %3d available_size: %3d" %
              (_pad_right(pool["name"], 12),
               pool["current_size"],
               pool["current_available_size"]))
    return 0
def run(self, options, args):
    """Delete the node named in args[0].

    Returns 0 on success, 1 if the node does not exist or --destroy was
    requested (destroying the remote install is not implemented).
    """
    name = args[0]
    c = client.get_local_connection(options.server_config_dir)
    node = c.find_node_by_name(name)
    if node is None:  # was "== None"; identity check is the idiom for None
        print("Node %s does not exist" % name)
        return 1
    if options.destroy:
        print("Destroy option currently not supported")
        return 1
    c.delete_node(name)
    return 0
def run(self, options, args):
    """Clean up jobs whose coordinator died, optionally deleting them too."""
    conn = client.get_local_connection(options.server_config_dir)
    job_ids = conn.cleanup_dead_coordinators()
    if not job_ids:
        print("No jobs to clean up")
        return 0
    if options.delete:
        for job_id in job_ids:
            conn.delete_job(job_id)
        print("Cleaned up and deleted the jobs: %s" % " ".join(job_ids))
    else:
        print("Cleaned up the jobs: %s" % " ".join(job_ids))
    return 0
def run(self, options, args):
    """Print a one-line summary of every task belonging to the job in args[0]."""
    job_id = args[0]
    conn = client.get_local_connection(options.server_config_dir)
    print(" Id Name Type Node Name Result")
    # Renamed the loop variable from "task" to avoid shadowing the task module.
    for t in conn.get_tasks_for_job(job_id):
        # Tasks without a recorded result are still running.
        result = t["result_status"] if t["result_status"] else "In progress"
        print("%s %s %s %s %s" % (_pad_left(t["task_id"], 4),
                                  _pad_right(t["name"], 12),
                                  _pad_right(t["task_type"], 10),
                                  _pad_right(t["node_name"], 10),
                                  result))
    return 0
def test_local_client(self):
    """Smoke-test the in-memory model through the full local-client API:
    pool and node creation, a job with one task, then teardown."""
    client.initialize_server_files("dist_job_mgr.mem_model.ModelAdapter",
                                   config_dir=self.dir)
    conn = client.get_local_connection(self.dir)
    conn.create_static_pool("p1")
    conn.create_node("joe", 20000, "n1", hostname="localhost",
                     pool_name="p1")
    conn.create_node("joe", 20001, "n2", private_ip="127.0.0.1",
                     pool_name="p1")
    job = conn.start_job("test", JobType.ONE_TIME_JOB, 2,
                         "this is a test job", node_pool_name="p1")
    conn.run_task(job, "Test", "test task", "n1", "arg1", kwarg1=1)
    conn.stop_job(job, JobStatus.JOB_SUCCESSFUL, comment="testing")
    conn.delete_job(job)
    conn.delete_node("n1")
    conn.delete_static_pool("p1")
def run(self, options, args):
    """List jobs, filtered by the job-id / job-name / pool-name options.

    Returns 1 when a specific job id was requested but not found.
    """
    conn = client.get_local_connection(options.server_config_dir)
    jobs = conn.query_jobs(job_id=options.job_id,
                           job_name=options.job_name,
                           pool_name=options.pool_name)
    if options.job_id and not jobs:
        print("No job with id %s found" % options.job_id)
        return 1
    # Size the id column to the widest job id (minimum width 1).
    id_width = max([1] + [len(job["job_id"]) for job in jobs])
    print("%s Type Pool Status" % _pad_right("Id", id_width))
    for job in jobs:
        print("%s %s %s %s" % (_pad_right(job["job_id"], id_width),
                               _pad_right(job["job_type"], 12),
                               _pad_right(job["node_pool"], 10),
                               job["status"]))
    return 0
def _run_task(self, options, name, task_type, description,
              *task_args, **task_kwargs):
    """Run a task either on every node of the pool called `name`
    (when --pool is given) or on the single node called `name`.

    Creates a one-time job, runs the task, stops the job, and returns 0
    when the task(s) succeeded, 1 otherwise. Exits via parser.error() when
    the named pool/node does not exist.
    """
    c = client.get_local_connection(options.server_config_dir)
    # Make task output visible even when logging was never configured.
    logger = logging.getLogger()
    if len(logger.handlers) == 0:
        logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setLevel(logging.INFO)
        logger.addHandler(handler)
    if options.pool:
        # `name` is a pool name: fan the task out to every node in it.
        pools = c.list_pools()
        node_cnt = None
        for p in pools:
            if p["name"] == name:
                node_cnt = p["current_size"]
                break
        if node_cnt is None:  # was "== None"
            self.parser.error("Pool name %s does not exist" % name)
        j = c.start_job(self.NAME, JobType.ONE_TIME_JOB, node_cnt,
                        description, node_pool_name=name)
        (s, r) = c.run_task_on_all_nodes_of_job(j, task_type, self.NAME,
                                                *task_args, **task_kwargs)
        for result in r:
            if result.reason:
                print("Task %d, status was %s, reason=%s" %
                      (result.task_id, result.status, result.reason))
            else:
                print("Task %d, status was %s" %
                      (result.task_id, result.status))
        if s == TaskStatus.TASK_SUCCESSFUL:
            status = JobStatus.JOB_SUCCESSFUL
        else:
            status = JobStatus.JOB_FAILED
    else:
        # `name` is a node name: run the task on just that node.
        node = c.find_node_by_name(name)
        if not node:
            # BUGFIX: was a bare "parser.error(...)", which raised a
            # NameError at runtime -- the parser lives on self, exactly as
            # used in the pool branch above.
            self.parser.error("Node %s does not exist" % name)
        j = c.start_job(self.NAME, JobType.ONE_TIME_JOB, 1, description,
                        node_pool_name=node["pool"], requested_nodes=[name])
        r = c.run_task(j, task_type, self.NAME, name,
                       *task_args, **task_kwargs)
        if r.status == TaskStatus.TASK_SUCCESSFUL:
            status = JobStatus.JOB_SUCCESSFUL
        else:
            status = JobStatus.JOB_FAILED
    c.stop_job(j, status)
    return 0 if status == JobStatus.JOB_SUCCESSFUL else 1
def run(self, options, args):
    """Print the full details of the node named in args[0].

    Returns 0 on success, 1 if the node does not exist.
    """
    def _show(value):
        # Render falsy fields as the literal string "None".
        return value if value else "None"
    conn = client.get_local_connection(options.server_config_dir)
    node_name = args[0]
    node = conn.find_node_by_name(node_name)
    if not node:
        print("Node %s does not exist." % node_name)
        return 1
    print("name: %s" % node_name)
    print("os username: %s" % node["os_username"])
    print("worker port: %d" % node["worker_port"])
    print("contact address: %s" % node["contact_address"])
    print("hostname: %s" % _show(node["hostname"]))
    print("public ip: %s" % _show(node["public_ip"]))
    print("private ip: %s" % _show(node["private_ip"]))
    print("pool: %s" % _show(node["pool"]))
    print("Current job: %s" % _show(node["job_id"]))
    print("Current task: %s" % _show(node["task_id"]))
    return 0
def run(self, options, args):
    """Delete the job named in args[0].

    If the job has not completed, --force marks any unfinished tasks as
    stopped and fails the job before deleting it; without --force an
    incomplete job is left alone. Returns 0 on success, 1 on error.
    """
    job_id = args[0]
    c = client.get_local_connection(options.server_config_dir)
    job_list = c.query_jobs(job_id=job_id)
    if len(job_list) != 1:
        print("Job %s does not exist" % job_id)
        return 1
    job = job_list[0]
    # A job with no final status has not completed (e.g. its coordinator
    # crashed mid-run).
    if job["status"] is None:  # was "== None"
        if not options.force:
            print("Job has not completed and --force was not specified")
            return 1
        for t in c.get_tasks_for_job(job_id):
            if t["result_status"] is None:  # was "== None"
                print("Marking task %d as stopped" % t["task_id"])
                c.mark_task_as_stopped(job_id, t["task_id"],
                                       TaskStatus.TASK_UNKNOWN)
        c.stop_job(job_id, JobStatus.JOB_FAILED)
    c.delete_job(job_id)
    return 0
def run(self, options, args):
    """List all nodes, optionally restricted to the pool named in args[0]."""
    conn = client.get_local_connection(options.server_config_dir)
    pool_name = args[0] if len(args) == 1 else None
    nodes = conn.query_nodes(pool_name=pool_name)
    if not nodes:
        print("No nodes found.")
        return 0
    print("Name Address Port OS_User Pool Job Task")
    for node in nodes:
        print("%s %s %s %s %s %s %s" %
              (_pad_right(node["name"], 12),
               _pad_right(node["contact_address"], 12),
               _pad_left(node["worker_port"], 5),
               _pad_right(node["os_username"], 10),
               _pad_right(node["pool"], 10),
               _pad_right(node["job_id"], 10),
               _pad_left(node["task_id"], 4)))
    return 0
def run(self, options, args):
    """Delete every job, optionally restricted to the --pool-name pool."""
    conn = client.get_local_connection(options.server_config_dir)
    for job in conn.query_jobs(pool_name=options.pool_name):
        conn.delete_job(job["job_id"])
    return 0
def run(self, options, args):
    """Create a static node pool named by args[0]."""
    conn = client.get_local_connection(options.server_config_dir)
    pool_name = args[0]
    conn.create_static_pool(pool_name)
    print("Created static pool %s" % pool_name)
    return 0