Beispiel #1
0
    def step3_job_monitoring(self):
        to_screen("""\
testing REST APIs related to querying a job, including
- rest_api_job_list
- rest_api_job_info
        """)
        client = ClusterList().load().get_client(
            get_defaults()["cluster-alias"])
        self.cmd_exec(['opai', 'job', 'list'])
        job_list = client.rest_api_job_list(
            client.user)  # ! only jobs from current user to reduce time
        job_list = [job['name'] for job in job_list]
        assert self.job_name in job_list, job_list
        to_screen(f"testing job monitoring with {self.job_name}")
        status = client.rest_api_job_info(self.job_name)
        to_screen(
            f"retrieving job status and get its state {JobStatusParser.state(status)}"
        )
        client.rest_api_job_info(self.job_name, 'config')
        to_screen("retrieving job config")
        logs = JobStatusParser.all_tasks_logs(status)
        assert logs, f"failed to read logs from status \n{status}"
        for k, v in logs.items():
            for t, content in v.items():
                to_screen(
                    f"reading logs {k} for {t} and get {len(content)} Bytes")
Beispiel #2
0
 def check(self):
     """Merge cluster info and virtual clusters into ``self.config``.

     Returns ``self`` so calls can be chained.
     """
     info = self.rest_api_cluster_info()
     virtual_clusters = self.virtual_clusters()
     info["virtual_clusters"] = virtual_clusters
     self.config.update(info)
     # ! will check authentication types according to AAD enabled or not
     message = "succeeded to connect cluster {}".format(self.alias)
     to_screen(message)
     return self
Beispiel #3
0
 def wait(self,
          t_sleep: float = 10,
          timeout: float = 3600,
          silent: bool = False):
     """Poll the job through a ``Retry`` helper until it is ready or done.

     Jobs carrying one of the notebook tags are polled via
     ``self.connect_jupyter`` until the reported state is a completed one
     or a ``"notebook"`` entry is present; all other jobs are polled via
     ``self.get_status`` until the parsed state is a completed one.

     Returns:
         Whatever ``Retry.retry`` returns for the chosen poller.
     """
     exit_states = __job_states__["completed"]
     repeater = Retry(timeout=timeout, t_sleep=t_sleep, silent=silent)
     interactive_nb = self.has_tag(__internal_tags__["interactive_nb"])
     batch_nb = self.has_tag(__internal_tags__["batch_nb"])

     def _job_completed(status):
         # status: job status as returned by self.get_status
         return JobStatusParser.state(status) in exit_states

     def _notebook_ready(info):
         if info.get('state', None) in exit_states:
             return True
         return info.get("notebook", None) is not None

     if not (interactive_nb or batch_nb):
         to_screen("wait until job to be completed ({})".format(exit_states))
         return repeater.retry(_job_completed, self.get_status)

     if interactive_nb:
         to_screen(
             "{} is recognized to be an interactive jupyter notebook job"
             .format(self.name))
         to_screen(
             "notebook job needs to be RUNNING state and the kernel started"
         )
     if batch_nb:
         to_screen(
             "{} is recognized to be a silent jupyter notebook job".
             format(self.name))
         to_screen(
             "notebook job needs to be SUCCEEDED state and the output is ready"
         )
     return repeater.retry(_notebook_ready, self.connect_jupyter)
Beispiel #4
0
 def download_webhdfs(self, remote_path: str, local_path: str, **kwargs):
     """Download ``remote_path`` to ``local_path`` through ``self.client``.

     ``mkdir_for`` is called on ``local_path`` first; the download always
     passes ``overwrite=True`` and forwards any extra keyword arguments.
     Returns the result of ``self.client.download``.
     """
     mkdir_for(local_path)
     announcement = "download %s -> %s" % (remote_path, local_path)
     to_screen(announcement)
     outcome = self.client.download(
         local_path=local_path,
         hdfs_path=remote_path,
         overwrite=True,
         **kwargs)
     return outcome
Beispiel #5
0
 def run_steps(self):
     """Run each ``(name, step)`` from ``self.get_steps()`` in order.

     Any ``Exception`` raised by a step is reported through ``self.fail``
     together with the step name.
     """
     for name, step in self.get_steps():
         try:
             to_screen(f"\n==== begin to test {name} ====")
             step()
         except Exception as err:
             reason = "test {} failed ({}: {})".format(
                 name, type(err), repr(err))
             self.fail(reason)
Beispiel #6
0
 def submit(self, cluster_alias: str = None, virtual_cluster: str = None):
     """Validate the job, submit it, and return its link and name.

     ``cluster_alias`` is resolved with ``na`` against the job's own
     ``cluster_alias`` parameter before the cluster is selected.

     Returns:
         dict with keys ``"job_link"`` and ``"job_name"``.
     """
     fallback_alias = self.param("cluster_alias", None)
     cluster_alias = na(cluster_alias, fallback_alias)
     self.select_cluster(cluster_alias, virtual_cluster)
     validated = self.validate()
     validated.local_process()
     to_screen("submit job %s to cluster %s" % (self.name, cluster_alias))
     self.client.rest_api_submit(self.get_config())
     summary = {"job_link": self.client.get_job_link(self.name)}
     summary["job_name"] = self.name
     return summary
Beispiel #7
0
 def do_action_list(self, args):
     """List jobs on the selected cluster as ``"name [state]"`` strings.

     When ``args.user`` is empty it is replaced by the client's own user
     and a hint is printed; the special value ``'__all__'`` is replaced by
     ``None`` before the query. Note that ``args.user`` is modified in
     place.

     Returns:
         list of strings, one per job; a job without a ``"state"`` entry
         is shown as ``UNKNOWN``.
     """
     client = self.__clusters__.get_client(args.cluster_alias)
     if not args.user:
         args.user = client.user
         # hint text fixed: "use `--user __all__`" (was misspelled "user")
         to_screen("if not set, only your job will be listed, use `--user __all__` to list jobs of all users")
     if args.user == '__all__':
         args.user = None
     jobs = client.rest_api_job_list(user=args.user)
     return ["%s [%s]" % (j["name"], j.get("state", "UNKNOWN")) for j in jobs]
Beispiel #8
0
 def check_arguments_notebook(self, args):
     """Validate notebook-related arguments and fill in a default job name.

     After ``self.check_essentials(args)``, either ``args.notebook`` or
     ``args.interactive`` must be truthy. When ``args.job_name`` is empty
     it is derived from the notebook file name (extension stripped) plus a
     random suffix, or ``jupyter_server_<random>`` when no notebook is
     given. A warning is printed for interactive mode without a token.

     Raises:
         AssertionError: neither a notebook nor interactive mode is given.
     """
     self.check_essentials(args)
     assert args.notebook or args.interactive, "must specify a notebook name unless in interactive mode"
     if not args.job_name:
         # The assertion above already guarantees notebook-or-interactive,
         # so no second check is needed here.
         if args.notebook:
             stem = os.path.splitext(os.path.basename(args.notebook))[0]
             args.job_name = stem + "_" + randstr().hex
         else:
             args.job_name = "jupyter_server_{}".format(randstr().hex)
     if args.interactive and not args.token:
         to_screen("no authentication token is set", _type="warn")
Beispiel #9
0
 def get_random_var_name(self):
     """Pick one random key of ``LayeredSettings`` that is not append-type.

     Candidates are the keys for which ``LayeredSettings.act_append``
     is falsy; the chosen key and the candidate list are printed.
     """
     import random
     from openpaisdk import LayeredSettings
     lst = []
     for name in LayeredSettings.keys():
         if LayeredSettings.act_append(name):
             continue
         lst.append(name)
     # randrange(n) draws from the same range as randint(0, n - 1)
     ret = lst[random.randrange(len(lst))]
     to_screen(f"random select {ret} in {lst}")
     return ret
Beispiel #10
0
 def wrapper(*args, **kwargs):
     # Call ``fn`` and return its result. When it raises ``err_type``,
     # print a warning (``err_msg`` if truthy, else the exception's repr)
     # and return ``default`` instead. Every other exception propagates
     # untouched, so no catch-and-re-raise clause is needed.
     try:
         return fn(*args, **kwargs)
     except err_type as e:
         to_screen(err_msg if err_msg else repr(e), _type="warn")
         return default
0
 def remove(self, target):
     """Delete every element selected by ``self.filter_index(target)``.

     Prints one message per removed element, or a single notice when
     nothing matches. Returns ``self``.
     """
     indexes = self.filter_index(target)
     if indexes:
         # walk from the highest index down so pending indexes stay valid
         for position in sorted(indexes, reverse=True):
             del self[position]
             to_screen(f"OrganizedList: {self._key} = {target} removed")
     else:
         to_screen(
             f"OrganizedList: {self._key} = {target} cannot be deleted due to non-existence"
         )
     return self
Beispiel #12
0
 def func(*args, **kwargs):
     # Run ``method`` inside a scratch directory named after it, then try
     # to remove that directory whether or not ``method`` raised.
     # Exceptions propagate on their own; ``finally`` alone guarantees
     # the cleanup, so no explicit re-raise is required.
     dir_name = 'utdir_' + method.__name__
     os.makedirs(dir_name, exist_ok=True)
     try:
         with safe_chdir(dir_name):
             method(*args, **kwargs)
     finally:
         to_screen(f"trying to remove {dir_name}")
         # ! rmtree not work on windows
         os.system(f'rm -rf {dir_name}')
Beispiel #13
0
 def tabulate_resources(dic: dict):
     """Print a resource table for a ``{cluster: {virtual_cluster: info}}`` mapping.

     Each row holds the cluster, the info's optional ``uri`` and ``user``,
     the virtual-cluster name, and the required ``GPUs``, ``vCores`` and
     ``memory`` entries. Returns ``dic`` unchanged.
     """
     headers = [
         "cluster", "uri", "user", "virtual-cluster", "GPUs",
         "vCores", "memory"
     ]
     rows = []
     for cluster, virtual_clusters in dic.items():
         for vc_name, info in virtual_clusters.items():
             rows.append([
                 cluster,
                 info.get("uri", None),
                 info.get("user", None),
                 vc_name,
                 info["GPUs"],
                 info["vCores"],
                 info["memory"],
             ])
     to_screen(rows, _type="table", headers=headers)
     return dic
Beispiel #14
0
    def step2_submit_job(self):
        import time
        to_screen("""\
testing REST APIs related to submitting a job, including
- rest_api_submit
        """)
        self.job_name = 'ut_test_' + randstr(10)
        self.cmd_exec([
            'opai', 'job', 'sub', '-i', 'python:3', '-j', self.job_name,
            'opai cluster resources'
        ])
        time.sleep(10)
Beispiel #15
0
    def step1_init_clusters(self):
        to_screen("""\
testing REST APIs related to retrieving cluster info, including
- rest_api_cluster_info
- rest_api_user
- rest_api_token
- rest_api_virtual_clusters
        """)
        with open(self.ut_init_shell) as fn:
            for line in fn:
                if line.startswith('#'):
                    continue
                self.cmd_exec(line)
        alias = get_defaults()["cluster-alias"]
        self.assertTrue(alias, "not specify a cluster")
        self.cmd_exec('opai cluster resources')
Beispiel #16
0
def job_spider(cluster, jobs: list = None):
    """Attach status, config and logs to every job entry and return the list.

    ``jobs`` is resolved through ``na_lazy`` with ``cluster.rest_api_job_list``
    as the fallback. Status and config are fetched per job with
    ``concurrent_map``; logs are derived from each status. Every job entry
    gains the keys ``'status'``, ``'config'`` and ``'logs'``.
    """
    jobs = na_lazy(jobs, cluster.rest_api_job_list)
    to_screen("{} jobs to be captured in the cluster {}".format(
        len(jobs), cluster.alias))

    def _fetch_status(entry):
        return cluster.rest_api_job_info(
            entry['name'], info=None, user=entry['username'])

    def _fetch_config(entry):
        return cluster.rest_api_job_info(
            entry['name'], info='config', user=entry['username'])

    statuses = concurrent_map(_fetch_status, jobs)
    configs = concurrent_map(_fetch_config, jobs)
    all_logs = concurrent_map(JobStatusParser.all_tasks_logs, statuses)
    for entry, status, config, logs in zip(jobs, statuses, configs, all_logs):
        entry['status'] = status
        entry['config'] = config
        entry['logs'] = logs
    return jobs
Beispiel #17
0
 def check(self):
     """Query the cluster and store info, storages and virtual clusters.

     Each storage without a ``"storage_alias"`` gets one built from its
     protocol and position. When the cluster reports ``"OIDC"`` as its
     ``authnMethod``, a token must be present in ``self.config``.
     Returns ``self``.
     """
     to_screen("try to connect cluster {}".format(self.alias))
     storages = self.rest_api_storages()
     for i, storage in enumerate(storages):
         default_alias = storage["protocol"] + f'-{i}'
         storage.setdefault("storage_alias", default_alias)
     cluster_info = na(self.rest_api_cluster_info(), {})
     uses_oidc = cluster_info.get("authnMethod", "basic") == "OIDC"
     if uses_oidc:
         assert self.config[
             "token"], "must use authentication token (instead of password) in OIDC mode"
     virtual_clusters = self.virtual_clusters()
     self.config.update(
         info=cluster_info,
         storages=storages,
         virtual_clusters=virtual_clusters,
     )
     # ! will check authentication types according to AAD enabled or not
     return self
Beispiel #18
0
 def retry(self, f_exit, func, *args, **kwargs):
     """Call ``func`` repeatedly until ``f_exit`` accepts its result.

     Args:
         f_exit: predicate applied to each value returned by ``func``.
         func: callable invoked as ``func(*args, **kwargs)`` per attempt.

     Returns:
         The first result for which ``f_exit`` is truthy, or ``None`` once
         ``self.max_try`` attempts have been made or the accumulated sleep
         time reaches ``self.timeout`` (either limit is skipped if falsy).

     A ``NotReadyError`` raised during an attempt counts as "not ready";
     other exceptions propagate.
     """
     t, i = 0, 0
     while True:
         # Reset per attempt: when the attempt raises NotReadyError there
         # is no fresh result, and the message below must not hit an
         # unbound name or show the value of a previous attempt.
         x = None
         try:
             x = func(*args, **kwargs)
             if f_exit(x):
                 if not self.silent:
                     to_screen("ready: {}".format(x))
                 return x
         except NotReadyError as identifier:
             # pass the exception as a lazy %-argument with a placeholder
             __logger__.debug("condition not satisfied: %s", identifier)
         if not self.silent:
             to_screen("not ready yet: {}".format(x))
         # t counts scheduled sleep time, not wall-clock time
         i, t = i + 1, t + self.t_sleep
         if self.max_try and i >= self.max_try or self.timeout and t >= self.timeout:
             return None
         if self.t_sleep:
             time.sleep(self.t_sleep)
Beispiel #19
0
 def add(lst: list,
         key: str,
         elem: dict,
         getter=dict.get,
         silent: bool = False) -> list:
     """Merge ``elem`` into the first matching element of ``lst``, or append it.

     Args:
         lst: list that is modified in place.
         key: name of the field used to identify elements.
         elem: element to merge or append.
         getter: callable ``getter(elem, key)`` yielding the key value.
         silent: when true, no message is printed.

     Returns:
         ``lst`` itself. (The previous ``-> bool`` annotation and docstring
         did not match what the code returns.)
     """
     target = getter(elem, key)
     m = OrganizedList.filter(lst, key, target)  # type: dict, matches
     for x in m["matches"]:
         # only the first match is updated; the loop returns immediately
         x.update(elem)
         if not silent:
             # report the value obtained via ``getter`` so the message
             # cannot raise KeyError when ``elem[key]`` is unavailable
             to_screen("%s = %s already exists, update it" %
                       (key, target))
         return lst
     lst.append(elem)
     if not silent:
         to_screen("%s = %s added" % (key, target))
     return lst
Beispiel #20
0
 def plugin_uploadFiles(self, plugin: dict):
     """Archive the plugin's files and upload the archive to storage.

     The paths in ``plugin["parameters"]["files"]`` are made relative to
     the current directory and added to a gzip tar at ``self.temp_archive``.
     The archive is then uploaded to ``<work_directory>/source/<archive
     basename>`` with ``overwrite=True``.

     Raises:
         AssertionError: the ``work_directory`` parameter is not set.
     """
     import tarfile
     to_screen("archiving and uploading ...")
     work_directory = self.param("work_directory")
     assert work_directory, "must specify a storage to upload"
     with safe_open(self.temp_archive, "w:gz", func=tarfile.open) as fn:
         for src in plugin["parameters"]["files"]:
             src = os.path.relpath(src)
             if os.path.dirname(src) != "":
                 # ``warning`` replaces the deprecated ``warn`` alias
                 __logger__.warning(
                     "files not in current folder may cause wrong location in the container, please check it {}"
                     .format(src))
             fn.add(src)
             to_screen("{} archived and wait to be uploaded".format(src))
     self.client.get_storage().upload(local_path=self.temp_archive,
                                      remote_path="{}/source/{}".format(
                                          work_directory,
                                          os.path.basename(
                                              self.temp_archive)),
                                      overwrite=True)
Beispiel #21
0
 def submit(self, cluster_alias: str = None, virtual_cluster: str = None):
     """Validate and submit the job; report and re-raise on failure.

     ``cluster_alias`` is resolved with ``na`` against the job's own
     ``cluster_alias`` parameter. If submitting or fetching the job link
     raises, the error and the job config are printed before the
     exception is raised again.

     Returns:
         dict with keys ``"job_link"`` and ``"job_name"``.
     """
     fallback_alias = self.param("cluster_alias", None)
     cluster_alias = na(cluster_alias, fallback_alias)
     self.select_cluster(cluster_alias, virtual_cluster)
     validated = self.validate()
     validated.local_process()
     to_screen("submit job %s to cluster %s" % (self.name, cluster_alias))
     try:
         self.client.rest_api_submit(self.get_config())
         summary = {"job_link": self.client.get_job_link(self.name)}
         summary["job_name"] = self.name
         return summary
     except Exception as identifier:
         to_screen(f"submit failed due to {repr(identifier)}",
                   _type="error")
         # dump the config that was rejected to help diagnosis
         to_screen(self.get_config())
         raise identifier
Beispiel #22
0
def main():
    """Run the command-line engine on ``sys.argv[1:]`` and return an exit code.

    A truthy result from ``Engine.process`` is printed.

    Returns:
        0 on success, 1 when an ``AssertionError`` was raised, 2 for any
        other ``Exception``.
    """
    # Every path of the try body returns, so a trailing ``else`` clause
    # could never run and is intentionally absent.
    try:
        eng = Engine()
        result = eng.process(sys.argv[1:])
        if result:
            to_screen(result)
        return 0
    except AssertionError as identifier:
        to_screen(f"Value error: {repr(identifier)}", _type="error")
        return 1
    except Exception as identifier:
        to_screen(f"Error: {repr(identifier)}", _type="error")
        return 2
Beispiel #23
0
 def add(self,
         elem: dict,
         getter=dict.get,
         silent: bool = False,
         replace: bool = False):
     """Insert ``elem``, or replace/update the first element with the same key.

     Args:
         elem: element to add.
         getter: unused here; kept so existing callers keep working.
         silent: when true, no message is printed.
         replace: when an element with the same key exists, overwrite it
             with ``elem`` instead of merging ``elem`` into it.

     Returns:
         ``self``.
     """
     key_value = self._fn_get(elem)  # computed once, reused in messages
     for i in self.filter_index(key_value):
         if replace:
             self[i] = elem
             if not silent:
                 to_screen(
                     f"OrganizedList: {self._key} = {key_value} already exists, replace it"
                 )
         else:
             self[i].update(elem)
             if not silent:
                 # prefix corrected from "OrderedDict" to match the others
                 to_screen(
                     f"OrganizedList: {self._key} = {key_value} already exists, update it"
                 )
         return self  # only the first match is touched
     self.append(elem)
     if not silent:
         to_screen(
             f"OrganizedList: {self._key} = {key_value} added")
     return self
Beispiel #24
0
 def do_action_connect(self, args):
     """Load the job named in ``args`` and return ``self.connect_notebook()``."""
     to_screen("retrieving job config from cluster")
     job = self.__job__
     job.load(
         job_name=args.job_name,
         cluster_alias=args.cluster_alias,
     )
     return self.connect_notebook()
Beispiel #25
0
 def upload_webhdfs(self, local_path: str, remote_path: str, **kwargs):
     """Upload ``local_path`` to ``remote_path`` through ``self.client``.

     Extra keyword arguments are forwarded to ``self.client.upload``,
     whose result is returned.
     """
     announcement = "upload %s -> %s" % (local_path, remote_path)
     to_screen(announcement)
     outcome = self.client.upload(
         local_path=local_path,
         hdfs_path=remote_path,
         **kwargs)
     return outcome
Beispiel #26
0
    def update(self, key: str, value=None, delete: bool = False):
        """Set, append or delete ``key`` in ``self.values`` and persist the result.

        Args:
            key: variable name; ignored with a message unless
                ``self.allow(key)`` is truthy.
            value: value to set, append or remove.
            delete: when true, remove instead of set/append. The whole key
                is removed unless the key is append-type and ``value`` is
                truthy, in which case only ``value`` is removed from the
                key's list.

        When ``self.file`` is truthy, ``self.values`` is written to it
        afterwards (this is skipped only for unrecognized keys).
        """
        if not self.allow(key):
            to_screen(f"{key} is not a recognized default variable, ignored")
            return
        dic = self.values
        if delete:
            if key not in dic:
                to_screen(f"key {key} not found in {self.name}, ignored")
            elif not self.act_append(
                    key) or not value:  # delete the key when not append action
                del dic[key]
                to_screen(
                    f"key {key} removed completely from {self.name} successfully"
                )
            elif value not in dic[key]:
                # mirror the missing-key case instead of letting
                # list.remove raise a bare ValueError
                to_screen(
                    f"{value} not found in {key} under {self.name}, ignored")
            else:
                dic[key].remove(value)
                to_screen(
                    f"{value} removed in {key} under {self.name} successfully")
        elif self.act_append(key):
            # append-type keys hold a list without duplicates
            items = dic.setdefault(key, [])
            if value not in items:
                items.append(value)
            to_screen(
                f"{value} added to {key} under {self.name} successfully")
        else:
            dic[key] = value
            to_screen(
                f"{key} set to {value} under {self.name} successfully")
        if self.file:
            to_file(self.values, self.file)
Beispiel #27
0
 def do_action_delete(self, args):
     """Delete the cluster named by ``args.cluster_alias``.

     A confirmation is printed when the delete call returns a truthy
     value. Always returns ``None``.
     """
     deleted = self.__clusters__.delete(args.cluster_alias)
     if deleted:
         confirmation = "cluster %s deleted" % args.cluster_alias
         to_screen(confirmation)
     return None
Beispiel #28
0
 def process_args(self, args):
     """Dispatch parsed arguments to the scene named by ``args.scene``.

     Without a scene, the parser's help is printed and ``None`` returned.
     """
     to_screen(f'Parsed arguments {args}', _type="debug")
     if args.scene:
         scene = self.scenes[args.scene]
         return scene.process(args)
     self.parser.print_help()
     return None
Beispiel #29
0
 def process(self, a: list):
     """Parse the raw argument list ``a`` and hand it to ``process_args``."""
     to_screen(f'Received arguments {a}', _type="debug")
     parsed = self.parser.parse_args(a)
     outcome = self.process_args(parsed)
     return outcome
Beispiel #30
0
 def do_action_stop(self, args):
     """Send a ``"STOP"`` request for each name in ``args.job_names``.

     The response of every request is printed.
     """
     client = self.__clusters__.get_client(args.cluster_alias)
     for job_name in args.job_names:
         response = client.rest_api_execute_job(job_name, "STOP")
         to_screen(response)