コード例 #1
0
ファイル: test_client.py プロジェクト: quaddra/dist_job_mgr
 def test_start_worker(self):
     """Start a worker on localhost, run tasks and a file copy, then clean up.

     Every task result is expected to be TaskStatus.TASK_SUCCESSFUL, and the
     copied file must exist at its destination.
     """
     client.initialize_server_files("dist_job_mgr.json_model.ModelAdapter",
                                    config_dir=self.dir)
     conn = client.get_local_connection(self.dir)
     node = conn.create_node(getpass.getuser(), 20000, "localhost",
                             public_ip="127.0.0.1")
     job_id = conn.start_job("test", JobType.ONE_TIME_JOB, 1,
                             "test of start worker",
                             requested_nodes=["localhost"])
     expected = TaskStatus.TASK_SUCCESSFUL

     # Bring the worker up on every node of the job.
     status, _results = conn.run_task_on_all_nodes_of_job(
         job_id, "StartWorker", "StartWorker")
     self.assertEqual(status, expected)

     # Run the same command once on a single node and once on all nodes.
     single_result = conn.run_command_task(job_id, node, "ls", ["/bin/ls"])
     self.assertEqual(single_result.status, expected)
     status, _results = conn.run_task_on_all_nodes_of_job(
         job_id, "Command", "ls", ["/bin/ls"], cwd=None)
     self.assertEqual(status, expected)

     status, _results = conn.run_task_on_all_nodes_of_job(
         job_id, "StopWorker", "StopWorker")
     self.assertEqual(status, expected)

     # Copy this test file into the test directory and check it arrived.
     copy_target = os.path.join(self.dir, os.path.basename(__file__))
     copy_result = conn.run_copy_task(job_id, node, "copy files",
                                      __file__, copy_target)
     self.assertEqual(copy_result.status, expected)
     self.assertTrue(os.path.exists(copy_target))

     conn.stop_job(job_id, JobStatus.JOB_SUCCESSFUL, comment="done")
     conn.delete_job(job_id)
     conn.delete_node("localhost")
コード例 #2
0
ファイル: test_djmctl.py プロジェクト: quaddra/dist_job_mgr
 def test_cleanup_crashed_jobs(self):
     """Check that cleanup-crashed-jobs --delete removes an abandoned job.

     A job with an unfinished task is created, its lock is dropped to mimic
     a crashed process, and after the cleanup command list-jobs for that
     job id is expected to return 1.
     """
     config_opt = "--server-config-dir=%s" % self.dir
     add_rc = djmctl.main(["add-node",
                           config_opt,
                           "--private-ip=127.0.0.1",
                           "--bootstrap",
                           "localhost"])
     self.assertEqual(add_rc, 0)
     conn = client.get_local_connection(self.dir)
     job_id = conn.start_job("test", JobType.ONE_TIME_JOB, 1,
                             "this is a test job",
                             requested_nodes=["localhost"])
     conn.run_task(job_id, "Test", "test task", "localhost")
     # This really breaks the abstraction, but it's the only good way
     # to get a task that's started but not finished.
     model = conn.model
     model.begin_transaction()
     node = model.find_node(name="localhost")
     job = model.query_jobs(job_id=job_id)[0]
     model.create_task("bad task", job, "Test", node,
                       "this task will not be stopped")
     model.commit_transaction()
     # Simulate the crashing of the process: release the job lock and
     # forget about it.
     conn.job_locks[job_id].release()
     del conn.job_locks[job_id]
     cleanup_rc = djmctl.main(["cleanup-crashed-jobs",
                               config_opt,
                               "--debug",
                               "--delete"])
     self.assertEqual(cleanup_rc, 0)
     list_rc = djmctl.main(["list-jobs",
                            config_opt,
                            "--debug",
                            "--job-id=%s" % job_id])
     self.assertEqual(list_rc, 1)
コード例 #3
0
ファイル: test_client.py プロジェクト: quaddra/dist_job_mgr
 def test_bootstrap(self):
     """Bootstrap a worker on localhost twice, then start and stop it.

     Any worker installation already present under WORKER_DIR is stopped
     and removed first so the bootstrap starts from a clean directory.
     """
     djm_path = os.path.abspath(os.path.expanduser(WORKER_DIR))
     worker_script = os.path.join(djm_path, "python/bin/djm-worker")
     if os.path.exists(djm_path): # there is an existing worker
         if os.path.exists(worker_script):
             # Pass the argument list without shell=True. With a shell and
             # a list, only the first item is the command; "stop" would go
             # to the shell itself and never reach the worker script.
             subprocess.check_call([worker_script, "stop"])
         shutil.rmtree(djm_path)
     client.initialize_server_files("dist_job_mgr.json_model.ModelAdapter",
                                    config_dir=self.dir,
                                    djm_package=self.djm_package)
     c = client.get_local_connection(self.dir)
     c.create_node(getpass.getuser(), 20000, "localhost",
                   public_ip="127.0.0.1")
     j = c.start_job("test", JobType.ONE_TIME_JOB, 1,
                     "test of start worker",
                     requested_nodes=["localhost"])
     tr = c.run_task(j, "BootstrapWorker", "bootstrap test", "localhost")
     self.assertEqual(tr.status, TaskStatus.TASK_SUCCESSFUL)
     # A second bootstrap must also succeed when the worker already exists.
     tr = c.run_task(j, "BootstrapWorker",
                     "bootstrap test with worker already present", "localhost")
     self.assertEqual(tr.status, TaskStatus.TASK_SUCCESSFUL)
     self.assertTrue(os.path.exists(worker_script),
                     "Worker script %s does not exist" % worker_script)
     tr = c.run_task(j, "StartWorker",
                     "check that bootstrapped worker is startable",
                     "localhost")
     self.assertEqual(tr.status, TaskStatus.TASK_SUCCESSFUL)
     tr = c.run_task(j, "StopWorker", "cleanup",
                     "localhost")
     self.assertEqual(tr.status, TaskStatus.TASK_SUCCESSFUL)
     c.stop_job(j, JobStatus.JOB_SUCCESSFUL, comment="done")
コード例 #4
0
ファイル: test_djmctl.py プロジェクト: quaddra/dist_job_mgr
 def test_delete_job_with_force(self):
     """Check that delete-job --force removes a job with an unfinished task.

     Both the forced delete and the following list-jobs command are
     expected to return 0.
     """
     config_opt = "--server-config-dir=%s" % self.dir
     add_rc = djmctl.main(["add-node",
                           config_opt,
                           "--private-ip=127.0.0.1",
                           "--bootstrap",
                           "localhost"])
     self.assertEqual(add_rc, 0)
     conn = client.get_local_connection(self.dir)
     job_id = conn.start_job("test", JobType.ONE_TIME_JOB, 1,
                             "this is a test job",
                             requested_nodes=["localhost"])
     conn.run_task(job_id, "Test", "test task", "localhost")
     # This really breaks the abstraction, but it's the only good way
     # to get a task that's started but not finished.
     model = conn.model
     model.begin_transaction()
     node = model.find_node(name="localhost")
     job = model.query_jobs(job_id=job_id)[0]
     model.create_task("bad task", job, "Test", node,
                       "this task will not be stopped")
     model.commit_transaction()
     # We need to release our lock on the job before running the command.
     conn.job_locks[job_id].release()
     delete_rc = djmctl.main(["delete-job",
                              config_opt,
                              "--force",
                              job_id])
     self.assertEqual(delete_rc, 0)
     list_rc = djmctl.main(["list-jobs",
                            config_opt])
     self.assertEqual(list_rc, 0)
コード例 #5
0
ファイル: djmctl.py プロジェクト: quaddra/dist_job_mgr
 def run(self, options, args):
     """Create the node named "master" and optionally bootstrap a worker on it.

     The node's user, hostname and addresses come from
     utils.get_machine_info(). Returns 0 on success, or 1 when the
     bootstrap task fails (the node is deleted again in that case).
     """
     node_name = "master"
     machine = utils.get_machine_info()
     conn = client.get_local_connection(options.server_config_dir)
     node = conn.create_node(
         machine["username"],
         options.port,
         name=node_name,
         hostname=machine["hostname"],
         public_ip=machine["public_ip"],
         private_ip=machine["private_ip"],
         pool_name=options.pool,
     )
     if not options.bootstrap:
         print("Successfully created node 'master'")
         return 0

     print("Bootstrapping worker on node...")
     job_id = conn.start_job(
         "Bootstrap-node",
         JobType.ONE_TIME_JOB,
         1,
         "Bootstrapping %s" % node_name,
         requested_nodes=[node_name],
         node_pool_name=options.pool,
     )
     result = conn.run_task(job_id, "BootstrapWorker",
                            "bootstrap %s" % node_name, node_name)
     # The one-off bootstrap job is closed and removed whatever the outcome.
     conn.stop_job(job_id, task_result_to_job_status(result))
     conn.delete_job(job_id)
     if result.status != TaskStatus.TASK_SUCCESSFUL:
         conn.delete_node(node)
         print("Bootstrap failed, node not created")
         return 1
     print("Successfully created and bootstrapped node 'master'")
     return 0
コード例 #6
0
ファイル: djmctl.py プロジェクト: quaddra/dist_job_mgr
 def run(self, options, args):
     """Create a node from the command-line options, optionally bootstrapping it.

     args may hold a single node name; otherwise the name passed to
     create_node is None. Returns 0 on success, 1 when the node cannot be
     reached or when the bootstrap task fails (the node is then deleted).
     """
     c = client.get_local_connection(options.server_config_dir)
     name = args[0] if len(args) == 1 else None
     # Address used to probe the node: private ip first, then public ip,
     # then hostname.
     test_address = options.private_ip or options.public_ip or options.hostname
     is_local = (
         test_address in ["127.0.0.1", "localhost", task.HOSTNAME, task.LOCAL_IP]
     ) and task.USERNAME == options.os_user
     if is_local:
         home_dir = os.path.abspath(os.path.expanduser("~/"))
     else:
         home_dir = utils.get_remote_home_directory(options.os_user, test_address)
     if home_dir is None:
         print("Unable to reach node via address %s" % test_address)
         return 1
     private_ip = options.private_ip
     if (not private_ip) and (not options.no_check_for_private_ip):
         private_ip = utils.look_for_private_ip(options.os_user, test_address)
         print("Found private network address %s" % private_ip)
     n = c.create_node(
         options.os_user,
         options.port,
         name=name,
         hostname=options.hostname,
         public_ip=options.public_ip,
         private_ip=private_ip,
         pool_name=options.pool,
     )
     if not options.bootstrap:
         # Exactly one outcome message is printed per run; success is no
         # longer announced before a bootstrap that may still fail.
         print("Successfully created node %s" % n)
         return 0

     print("Bootstrapping worker on node...")
     j = c.start_job(
         "Bootstrap-node",
         JobType.ONE_TIME_JOB,
         1,
         "Bootstrapping %s" % n,
         requested_nodes=[n],
         node_pool_name=options.pool,
     )
     tr = c.run_task(j, "BootstrapWorker", "bootstrap %s" % n, n)
     # The one-off bootstrap job is closed and removed whatever the outcome.
     c.stop_job(j, task_result_to_job_status(tr))
     c.delete_job(j)
     if tr.status != TaskStatus.TASK_SUCCESSFUL:
         c.delete_node(n)
         print("Bootstrap failed for node %s, node not created" % n)
         return 1
     print("Successfully created and bootstrapped node %s" % n)
     return 0
コード例 #7
0
ファイル: djmctl.py プロジェクト: quaddra/dist_job_mgr
 def run(self, options, args):
     """Print one line per pool with its current and available size; return 0."""
     conn = client.get_local_connection(options.server_config_dir)
     row_format = "%s current_size: %3d      available_size: %3d"
     for entry in conn.list_pools():
         padded_name = _pad_right(entry["name"], 12)
         print(row_format % (
             padded_name,
             entry["current_size"],
             entry["current_available_size"],
         ))
     return 0
コード例 #8
0
ファイル: djmctl.py プロジェクト: quaddra/dist_job_mgr
 def run(self, options, args):
     """Delete the node named by args[0].

     Returns 1 when the node does not exist or when options.destroy is
     set (that option is not supported), otherwise 0 after the delete.
     """
     name = args[0]
     c = client.get_local_connection(options.server_config_dir)
     node = c.find_node_by_name(name)
     # Identity test: None is the "not found" marker being checked here.
     if node is None:
         print("Node %s does not exist" % name)
         return 1
     if options.destroy:
         print("Destroy option currently not supported")
         return 1
     c.delete_node(name)
     return 0
コード例 #9
0
ファイル: djmctl.py プロジェクト: quaddra/dist_job_mgr
 def run(self, options, args):
     """Clean up jobs reported by cleanup_dead_coordinators(); always return 0.

     With options.delete set, each cleaned-up job is deleted as well.
     """
     conn = client.get_local_connection(options.server_config_dir)
     cleaned_ids = conn.cleanup_dead_coordinators()
     if len(cleaned_ids) == 0:
         print("No jobs to clean up")
         return 0
     if not options.delete:
         print("Cleaned up the jobs: %s" % " ".join(cleaned_ids))
         return 0
     for cleaned_id in cleaned_ids:
         conn.delete_job(cleaned_id)
     print("Cleaned up and deleted the jobs: %s" % " ".join(cleaned_ids))
     return 0
コード例 #10
0
ファイル: djmctl.py プロジェクト: quaddra/dist_job_mgr
 def run(self, options, args):
     """Print a table of the tasks belonging to the job id in args[0]; return 0.

     A task without a result status is shown as "In progress".
     """
     job_id = args[0]
     conn = client.get_local_connection(options.server_config_dir)
     entries = conn.get_tasks_for_job(job_id)
     print("  Id  Name          Type        Node Name   Result")
     row_format = "%s  %s  %s  %s  %s"
     for entry in entries:
         result_text = entry["result_status"] or "In progress"
         print(row_format % (
             _pad_left(entry["task_id"], 4),
             _pad_right(entry["name"], 12),
             _pad_right(entry["task_type"], 10),
             _pad_right(entry["node_name"], 10),
             result_text,
         ))
     return 0
コード例 #11
0
ファイル: test_client.py プロジェクト: quaddra/dist_job_mgr
 def test_local_client(self):
     """Exercise pool, node, job and task calls against the in-memory model."""
     client.initialize_server_files("dist_job_mgr.mem_model.ModelAdapter",
                                    config_dir=self.dir)
     conn = client.get_local_connection(self.dir)
     pool = "p1"
     conn.create_static_pool(pool)
     conn.create_node("joe", 20000, "n1",
                      hostname="localhost", pool_name=pool)
     conn.create_node("joe", 20001, "n2",
                      private_ip="127.0.0.1", pool_name=pool)
     job_id = conn.start_job("test", JobType.ONE_TIME_JOB, 2,
                             "this is a test job",
                             node_pool_name=pool)
     conn.run_task(job_id, "Test", "test task", "n1", "arg1", kwarg1=1)
     conn.stop_job(job_id, JobStatus.JOB_SUCCESSFUL, comment="testing")
     conn.delete_job(job_id)
     # NOTE(review): only n1 is deleted; n2 is still registered when its
     # pool is removed - confirm this is intended.
     conn.delete_node("n1")
     conn.delete_static_pool(pool)
コード例 #12
0
ファイル: djmctl.py プロジェクト: quaddra/dist_job_mgr
 def run(self, options, args):
     """Print a table of jobs matching the id, name and pool filters.

     Returns 1 when a job id was requested and nothing matched, else 0.
     """
     conn = client.get_local_connection(options.server_config_dir)
     jobs = conn.query_jobs(
         job_id=options.job_id,
         job_name=options.job_name,
         pool_name=options.pool_name,
     )
     if options.job_id and len(jobs) == 0:
         print("No job with id %s found" % options.job_id)
         return 1
     # The Id column is as wide as the longest job id, at least 1.
     id_width = max([1] + [len(entry["job_id"]) for entry in jobs])
     print("%s  Type          Pool        Status" % _pad_right("Id", id_width))
     row_format = "%s  %s  %s  %s"
     for entry in jobs:
         print(row_format % (
             _pad_right(entry["job_id"], id_width),
             _pad_right(entry["job_type"], 12),
             _pad_right(entry["node_pool"], 10),
             entry["status"],
         ))
     return 0
コード例 #13
0
ファイル: djmctl.py プロジェクト: quaddra/dist_job_mgr
 def _run_task(self, options, name, task_type, description, *task_args, **task_kwargs):
     """Run a task as a one-time job on a whole pool or on a single node.

     When options.pool is set, ``name`` is looked up as a pool name and the
     task runs on all nodes of a job sized to that pool. Otherwise ``name``
     is looked up as a node name and the task runs on that node only.
     Extra positional and keyword arguments are passed through to the task.

     Returns 0 when the job ends as JobStatus.JOB_SUCCESSFUL, else 1.
     An unknown pool or node name is reported through self.parser.error().
     """
     c = client.get_local_connection(options.server_config_dir)
     logger = logging.getLogger()
     if len(logger.handlers) == 0:
         # Nothing has configured the root logger yet: send INFO and above
         # to stdout.
         logger.setLevel(logging.INFO)
         handler = logging.StreamHandler(sys.stdout)
         handler.setLevel(logging.INFO)
         logger.addHandler(handler)
     if options.pool:
         node_cnt = None
         for p in c.list_pools():
             if p["name"] == name:
                 node_cnt = p["current_size"]
                 break
         # A pool's current_size may be 0, so test for None explicitly.
         if node_cnt is None:
             self.parser.error("Pool name %s does not exist" % name)
         j = c.start_job(self.NAME, JobType.ONE_TIME_JOB, node_cnt, description, node_pool_name=name)
         (s, r) = c.run_task_on_all_nodes_of_job(j, task_type, self.NAME, *task_args, **task_kwargs)
         for result in r:
             if result.reason:
                 print("Task %d, status was %s, reason=%s" % (result.task_id, result.status, result.reason))
             else:
                 print("Task %d, status was %s" % (result.task_id, result.status))
         if s == TaskStatus.TASK_SUCCESSFUL:
             status = JobStatus.JOB_SUCCESSFUL
         else:
             status = JobStatus.JOB_FAILED
     else:  # we are running for a specific node
         node = c.find_node_by_name(name)
         if not node:
             # Use the instance's parser, as the pool branch does; the bare
             # name ``parser`` is not defined in this method.
             self.parser.error("Node %s does not exist" % name)
         j = c.start_job(
             self.NAME, JobType.ONE_TIME_JOB, 1, description, node_pool_name=node["pool"], requested_nodes=[name]
         )
         r = c.run_task(j, task_type, self.NAME, name, *task_args, **task_kwargs)
         if r.status == TaskStatus.TASK_SUCCESSFUL:
             status = JobStatus.JOB_SUCCESSFUL
         else:
             status = JobStatus.JOB_FAILED
     c.stop_job(j, status)
     return 0 if status == JobStatus.JOB_SUCCESSFUL else 1
コード例 #14
0
ファイル: djmctl.py プロジェクト: quaddra/dist_job_mgr
    def run(self, options, args):
        """Print the stored details of the node named by args[0].

        Returns 1 when the node does not exist, otherwise 0. Empty optional
        fields are printed as the text "None".
        """
        def _or_none(value):
            # Falsy values are displayed as the literal text "None".
            return value or "None"

        conn = client.get_local_connection(options.server_config_dir)
        node_name = args[0]
        node = conn.find_node_by_name(node_name)
        if not node:
            print("Node %s does not exist." % node_name)
            return 1

        # Fields that are printed as stored.
        print("name:            %s" % node_name)
        print("os username:     %s" % node["os_username"])
        print("worker port:     %d" % node["worker_port"])
        print("contact address: %s" % node["contact_address"])
        # Optional fields.
        print("hostname:        %s" % _or_none(node["hostname"]))
        print("public ip:       %s" % _or_none(node["public_ip"]))
        print("private ip:      %s" % _or_none(node["private_ip"]))
        print("pool:            %s" % _or_none(node["pool"]))
        print("Current job:     %s" % _or_none(node["job_id"]))
        print("Current task:    %s" % _or_none(node["task_id"]))
        return 0
コード例 #15
0
ファイル: djmctl.py プロジェクト: quaddra/dist_job_mgr
 def run(self, options, args):
     """Delete the job whose id is args[0].

     A job whose status is None has not completed. It is only deleted when
     options.force is set, in which case its tasks without a result status
     are marked TaskStatus.TASK_UNKNOWN and the job is stopped as
     JobStatus.JOB_FAILED first.

     Returns 1 when the job does not exist or is incomplete without
     --force, otherwise 0.
     """
     job_id = args[0]
     c = client.get_local_connection(options.server_config_dir)
     job_list = c.query_jobs(job_id=job_id)
     if len(job_list) != 1:
         print("Job %s does not exist" % job_id)
         return 1
     job = job_list[0]
     if job["status"] is None:
         if not options.force:
             print("Job has not completed and --force was not specified")
             return 1
         # Close every task that never reported a result, then fail the job
         # so it can be deleted.
         for t in c.get_tasks_for_job(job_id):
             if t["result_status"] is None:
                 print("Marking task %d as stopped" % t["task_id"])
                 c.mark_task_as_stopped(job_id, t["task_id"], TaskStatus.TASK_UNKNOWN)
         c.stop_job(job_id, JobStatus.JOB_FAILED)
     c.delete_job(job_id)
     return 0
コード例 #16
0
ファイル: djmctl.py プロジェクト: quaddra/dist_job_mgr
 def run(self, options, args):
     """Print a table of nodes, limited to the pool in args[0] if given.

     Always returns 0; prints "No nodes found." when the query is empty.
     """
     conn = client.get_local_connection(options.server_config_dir)
     pool_filter = args[0] if len(args) == 1 else None
     nodes = conn.query_nodes(pool_name=pool_filter)
     if len(nodes) == 0:
         print("No nodes found.")
         return 0
     print("Name          Address        Port  OS_User     Pool        Job         Task")
     row_format = "%s  %s  %s  %s  %s  %s  %s"
     for entry in nodes:
         columns = (
             _pad_right(entry["name"], 12),
             _pad_right(entry["contact_address"], 12),
             _pad_left(entry["worker_port"], 5),
             _pad_right(entry["os_username"], 10),
             _pad_right(entry["pool"], 10),
             _pad_right(entry["job_id"], 10),
             _pad_left(entry["task_id"], 4),
         )
         print(row_format % columns)
     return 0
コード例 #17
0
ファイル: djmctl.py プロジェクト: quaddra/dist_job_mgr
 def run(self, options, args):
     """Delete every job returned by a query on options.pool_name; return 0."""
     conn = client.get_local_connection(options.server_config_dir)
     matching_jobs = conn.query_jobs(pool_name=options.pool_name)
     for entry in matching_jobs:
         job_id = entry["job_id"]
         conn.delete_job(job_id)
     return 0
コード例 #18
0
ファイル: djmctl.py プロジェクト: quaddra/dist_job_mgr
 def run(self, options, args):
     """Create a static pool named by args[0], report it, and return 0."""
     conn = client.get_local_connection(options.server_config_dir)
     pool_name = args[0]
     conn.create_static_pool(pool_name)
     print("Created static pool %s" % pool_name)
     return 0