Code example #1
import io
import os
import sys

from akrr.util import log


def generate_batch_job_for_testing(resource, appkernel, nodes, dry_run=False):
    """
    Generate batch job script for testing purposes
    """
    from akrr import cfg
    from akrr.util.log import verbose

    resource = cfg.find_resource_by_name(resource)
    app = cfg.find_app_by_name(appkernel)

    str_io = io.StringIO()
    if not verbose:
        sys.stdout = sys.stderr = str_io

    from akrr.akrr_task import AkrrTaskHandlerAppKer
    task_handler = AkrrTaskHandlerAppKer(1, resource['name'], app['name'],
                                         "{'nnodes':%s}" % (nodes, ), "{}",
                                         "{}")
    if dry_run:
        task_handler.generate_batch_job_script()
    else:
        task_handler.create_batch_job_script_and_submit_it(
            do_not_submit_to_queue=True)
    sys.stdout = sys.__stdout__
    sys.stderr = sys.__stderr__

    if task_handler.status.count("ERROR") > 0:
        log.error('Batch job script was not generated, see the log below!')
        print(str_io.getvalue())
        log.error('Batch job script was not generated, see the log above!')

    job_script_full_path = os.path.join(task_handler.taskDir, "jobfiles",
                                        task_handler.JobScriptName)
    if os.path.isfile(job_script_full_path):
        with open(job_script_full_path, "r") as fin:
            job_script_content = fin.read()

        if dry_run:
            log.info('Below is content of generated batch job script:')
            print(job_script_content)
        else:
            log.info("Local copy of batch job script is " +
                     job_script_full_path)
            print()
            log.info("Application kernel working directory on " +
                     resource['name'] + " is " + task_handler.remoteTaskDir)
            log.info("Batch job script location on " + resource['name'] +
                     " is " + os.path.join(task_handler.remoteTaskDir,
                                           task_handler.JobScriptName))
    else:
        log.error('Batch job script was not generated, see the messages above!')
    if dry_run:
        log.info(
            'Removing generated files from the file system, since only printing of the batch job script was requested'
        )
        task_handler.delete_local_folder()
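
A minimal sketch of how this helper might be called directly from Python, assuming it can be imported from the module that defines it (the import path, resource name and app kernel name below are illustrative placeholders, not taken from the project):

# Assumed import path and placeholder arguments, for illustration only.
from akrr.cli.generate_batch_job import generate_batch_job_for_testing

# Print the generated batch job script for a 2-node test run; with dry_run=True
# the generated local files are removed again after printing.
generate_batch_job_for_testing("my_cluster", "xdmod.benchmark.hpcc", 2, dry_run=True)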
Code example #2
File: akrr_task_base.py Project: ubccr/akrr
    def __init__(self, task_id, resource_name, app_name, resource_param, app_param, task_param):
        self.resourceName = resource_name
        self.appName = app_name
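        # resource_param, app_param and task_param arrive as strings containing
        # Python literals (e.g. "{'nnodes':2}" and "{}", as in code example #1 above),
        # so they are evaluated here into ordinary dicts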
        self.resourceParam = eval(resource_param)
        self.appParam = eval(app_param)
        self.taskParam = copy.deepcopy(cfg.default_task_params)
        self.taskParam.update(eval(task_param))
        self.timeToSubmit = None
        self.repetition = None
        self.task_id = task_id

        self.resource = None
        self.app = None

        # just check that the resource and app exist
        self.resource = cfg.find_resource_by_name(self.resourceName)
        self.app = cfg.find_app_by_name(self.appName)
        # some resource settings can be overridden by the app configuration
        if 'batch_scheduler' in self.app:
            self.resource['batch_scheduler'] = self.app['batch_scheduler']

        self.resourceDir = None
        self.appDir = None
        self.taskDir = None

        self.timeStamp = self.create_local_directory_for_task()
        # set directory names; the task directory should already exist
        self.set_dir_names(cfg.data_dir)
        self.remoteTaskDir = self.get_remote_task_dir(self.resource['akrr_data'], self.appName, self.timeStamp)

        self.JobScriptName = None

        self.LastPickledState = -1
        self.fatal_errors_count = 0

        self._method_to_run_next = "first_step"
        self.status = "Activated"
        self.status_info = "Activated"
        self._old_method_to_run_next = "Does not exist"
        self._old_status = "Does not exist"
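
For reference, code example #1 above constructs a concrete subclass of this handler; a minimal sketch of the same call pattern, with placeholder argument values, could look like this (AkrrTaskHandlerAppKer is imported as in example #1, everything else is illustrative):

from akrr.akrr_task import AkrrTaskHandlerAppKer

# task_id, resource name, app kernel name and parameter strings are placeholders
handler = AkrrTaskHandlerAppKer(1, "my_cluster", "xdmod.benchmark.hpcc",
                                "{'nnodes':2}", "{}", "{}")
# after __init__ the local and remote task directories are already set
print(handler.taskDir, handler.remoteTaskDir)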
Code example #3
File: app_validate.py Project: ubccr/akrr
def app_validate(resource, appkernel, nnodes):
    from akrr.util.log import verbose
    resource_name = resource
    app_name = appkernel

    error_count = 0
    warning_count = 0

    log.info("Validating " + app_name +
             " application kernel installation on " + resource_name)

    from akrr import get_akrr_dirs

    akrr_dirs = get_akrr_dirs()

    default_resource_param_filename = os.path.abspath(
        os.path.join(akrr_dirs['default_dir'], "default.resource.conf"))
    resource_param_filename = os.path.abspath(
        os.path.join(akrr_dirs['cfg_dir'], "resources", resource_name,
                     "resource.conf"))

    default_app_param_filename = os.path.abspath(
        os.path.join(akrr_dirs['default_dir'], "default.app.conf"))
    app_ker_param_filename = os.path.abspath(
        os.path.join(akrr_dirs['default_dir'], app_name + ".app.conf"))
    ###############################################################################################
    # validating resource parameter file

    log.info("#" * 80)
    log.info("Validating %s parameters from %s" %
             (resource_name, resource_param_filename))

    if not os.path.isfile(resource_param_filename):
        log.error("resource parameters file (%s) do not exists!" %
                  (resource_param_filename, ))
        exit(1)

    # check syntax
    try:
        tmp = {}
        exec(
            compile(
                open(default_resource_param_filename).read(),
                default_resource_param_filename, 'exec'), tmp)
        exec(
            compile(
                open(resource_param_filename).read(), resource_param_filename,
                'exec'), tmp)
    except Exception:
        log.exception("Can not load resource from "
                      "" + resource_param_filename + "\n" +
                      "Probably invalid syntax.")
        exit(1)
    # check syntax
    try:
        tmp = {}
        exec(
            compile(
                open(default_app_param_filename).read(),
                default_app_param_filename, 'exec'), tmp)
        exec(
            compile(
                open(app_ker_param_filename).read(), app_ker_param_filename,
                'exec'), tmp)
    except Exception:
        log.exception("Can not load application kernel from "
                      "" + app_ker_param_filename + "\n" +
                      "Probably invalid syntax")
        exit(1)

    # now we can load akrr
    from akrr import cfg
    from akrr import akrrrestclient
    from akrr.cli.resource_deploy import make_results_summary
    from akrr.cfg_util import load_app_default, load_app_on_resource

    resource = cfg.find_resource_by_name(resource_name)
    log.info(
        "Syntax of %s is correct and all necessary parameters are present." %
        resource_param_filename)

    cfg.find_app_by_name(app_name)
    try:
        app_default = load_app_default(app_name)
        app = load_app_on_resource(app_name, resource_name, resource,
                                   app_default)

        pprint.pprint(app)
    except Exception as e:  # pylint: disable=broad-except
        log.exception("Exception occurred during updated app loading:" +
                      str(e))
        exit(1)
    log.info(
        "Syntax of %s is correct and all necessary parameters are present." %
        app_ker_param_filename)

    # check if AK is in DB
    if True:
        # add entry to mod_appkernel.resource
        db_ak, cur_ak = akrr.db.get_ak_db(True)

        cur_ak.execute(
            '''SELECT * FROM app_kernel_def WHERE ak_base_name=%s''',
            (app_name, ))
        ak_in_akdb = cur_ak.fetchall()
        if len(ak_in_akdb) == 0:
            cur_ak.execute(
                '''INSERT INTO app_kernel_def (name,ak_base_name,processor_unit,enabled, description, visible)
                        VALUES(%s,%s,'node',0,%s,0);''',
                (app_name, app_name, app_name))
            db_ak.commit()
        cur_ak.execute(
            '''SELECT * FROM app_kernel_def WHERE ak_base_name=%s''',
            (app_name, ))
        ak_in_akdb = cur_ak.fetchall()[0]
        # add entry to mod_akrr.resource
        db, cur = akrr.db.get_akrr_db(True)

        cur.execute('''SELECT * FROM app_kernels WHERE name=%s''',
                    (app_name, ))
        ak_in_db = cur.fetchall()
        if len(ak_in_db) == 0:
            cur.execute(
                '''INSERT INTO app_kernels (id,name,enabled,nodes_list)
                        VALUES(%s,%s,0,'1,2,4,8');''',
                (ak_in_akdb['ak_def_id'], app_name))
            db.commit()

    ###############################################################################################
    # connect to resource
    log.info("#" * 80)
    log.info("Validating resource accessibility. Connecting to %s." %
             (resource['name']))
    if resource['ssh_private_key_file'] is not None and os.path.isfile(
            resource['ssh_private_key_file']) is False:
        log.error("Can not access ssh private key (%s)"
                  "" % (resource['ssh_private_key_file'], ))
        exit(1)

    str_io = io.StringIO()
    try:
        sys.stdout = sys.stderr = str_io
        # Connect to resource
        # Spin up the instance before ssh'ing into it
        if resource['batch_scheduler'].lower() == "openstack":
            # Start instance if it is cloud
            openstack_server = akrr.util.openstack.OpenStackServer(
                resource=resource)
            resource['openstack_server'] = openstack_server
            openstack_server.create()
            resource['remote_access_node'] = openstack_server.ip
        if resource['batch_scheduler'].lower() == "googlecloud":
            # Start instance if it is cloud
            googlecloud_server = akrr.util.googlecloud.GoogleCloudServer(
                resource=resource)
            resource['googlecloud_server'] = googlecloud_server
            googlecloud_server.create()
            resource['remote_access_node'] = googlecloud_server.ip

        rsh = akrr.util.ssh.ssh_resource(resource)

        sys.stdout = sys.__stdout__
        sys.stderr = sys.__stderr__
    except Exception as e:
        msg2 = str_io.getvalue()
        msg2 += "\n" + traceback.format_exc()
        sys.stdout = sys.__stdout__
        sys.stderr = sys.__stderr__
        msg = "Can not connect to """ + resource['name'] + "\n" + \
              "Probably invalid credential, see full error report below\n" + msg2
        log.error(msg)
        raise e
    print("=" * 80)
    log.info("Successfully connected to %s\n\n" % (resource['name']))

    ###############################################################################################
    log.info("Checking directory locations\n")

    d = resource['akrr_data']
    log.info("Checking: %s:%s" % (resource['remote_access_node'], d))
    status, msg = check_dir(rsh, d, exit_on_fail=True, try_to_create=True)
    log.info(msg + "\n")

    d = resource['appkernel_dir']
    log.info("Checking: %s:%s" % (resource['remote_access_node'], d))
    status, msg = check_dir(rsh, d, exit_on_fail=True, try_to_create=True)
    log.info(msg + "\n")

    d = resource['network_scratch']
    log.info("Checking: %s:%s" % (resource['remote_access_node'], d))
    status, msg = check_dir(rsh, d, exit_on_fail=False, try_to_create=False)
    if status is True:
        log.info(msg)
    else:
        log.warning(msg)
        log.warning(
            ("WARNING %d: network scratch might have a different location "
             + "on the head node; if that is by design, it is ok") %
            (warning_count + 1))
        warning_count += 1
    log.info("")

    d = resource['local_scratch']
    log.info("Checking: %s:%s" % (resource['remote_access_node'], d))
    status, msg = check_dir(rsh, d, exit_on_fail=False, try_to_create=False)
    if status is True:
        log.info(msg)
    else:
        log.warning(msg)
        log.warning(
            ("WARNING %d: local scratch might have a different location " +
             "on the head node; if that is by design, it is ok") %
            (warning_count + 1))
        warning_count += 1
    log.info("")

    # close the connection, we don't need it any more
    rsh.close(force=True)
    del rsh

    # Delete openstack instance after tests
    if resource['batch_scheduler'].lower() == "openstack":
        # delete instance if it is cloud
        resource['openstack_server'].delete()
        resource['remote_access_node'] = None
    if resource['batch_scheduler'].lower() == "googlecloud":
        # delete instance if it is cloud
        resource['googlecloud_server'].delete()
        resource['remote_access_node'] = None

    ###############################################################################################
    # send test job to queue

    log.info("#" * 80)
    log.info(
        "Will send a test job to the queue, wait until it is executed, and analyze the output"
    )

    print("Will use AKRR REST API at", akrrrestclient.restapi_host)
    # get check connection
    try:
        r = akrrrestclient.get('/scheduled_tasks')
        if r.status_code != 200:
            log.error(
                "Can not get token for AKRR REST API ( "
                "" + akrrrestclient.restapi_host + " )\n" +
                "See server response below:\n %s",
                json.dumps(r.json(), indent=4))
            exit(1)
    except Exception:
        log.error("Can not connect to AKRR REST API ( "
                  "" + akrrrestclient.restapi_host + " )\n" +
                  "Is it running?\n" + "See full error report below:\n" +
                  traceback.format_exc())
        exit(1)

    # check if the test job is already submitted
    task_id = None
    test_job_lock_filename = os.path.join(
        cfg.data_dir, resource_name + "_" + app_name + "_test_task.dat")
    if os.path.isfile(test_job_lock_filename):
        with open(test_job_lock_filename, "r") as fin:
            task_id = int(fin.readline())

        r = akrrrestclient.get('/tasks/' + str(task_id))
        if r.status_code != 200:
            task_id = None
        else:
            log.warning(
                "\nWARNING %d: Seems this is a rerun of this script, will monitor the task with task_id = "
                % (warning_count + 1) + str(task_id))
            log.warning("To submit a new task, delete " + test_job_lock_filename +
                        "\n")
            warning_count += 1
        # check how old it is
    # submit test job
    if task_id is None:
        try:
            payload = {
                'resource': resource_name,
                'app': app_name,
                'resource_param': "{'nnodes':%d}" % nnodes,
                'task_param': "{'test_run':True}"
            }
            r = akrrrestclient.post('/scheduled_tasks', data=payload)
            if r.status_code != 200:
                log.error(
                    "Can not submit task through AKRR REST API ( "
                    "" + akrrrestclient.restapi_host +
                    " )\n" + "See server response below",
                    json.dumps(r.json(), indent=4))
                exit(1)
            task_id = r.json()['data']['data']['task_id']
        except Exception:
            log.error("Can not submit task through AKRR REST API ( "
                      "" + akrrrestclient.restapi_host + " )\n" +
                      "Is it still running?\n" +
                      "See full error report below:\n" +
                      traceback.format_exc())
            exit(1)
        # write file with task_id
        with open(test_job_lock_filename, "w") as fout:
            print(task_id, file=fout)
        log.info("\nSubmitted test job to AKRR, task_id is " + str(task_id) +
                 "\n")
    # now wait till job is done
    msg_body0 = ""
    while True:
        t = datetime.datetime.now()
        # try:
        r = akrrrestclient.get('/tasks/' + str(task_id))

        if r.status_code == 200:
            response_json = r.json()

            msg_body = "=" * 80
            msg_body += "\nTast status:\n"

            if response_json["data"]["queue"] == "scheduled_tasks":
                msg_body += "Task is in scheduled_tasks queue.\n"
                msg_body += "It schedule to be started on " + response_json[
                    "data"]["data"]['time_to_start'] + "\n"
            elif response_json["data"]["queue"] == "active_tasks":
                msg_body += "Task is in active_tasks queue.\n"
                msg_body += "Status: " + str(
                    response_json["data"]["data"]['status']) + "\n"
                msg_body += "Status info:\n" + str(
                    response_json["data"]["data"]['status_info']) + "\n"
            elif response_json["data"]["queue"] == "completed_tasks":
                msg_body += "Task is completed!\n"
                completed_tasks = r.json()['data']['data']['completed_tasks']
                akrr_xdmod_instanceinfo = r.json(
                )['data']['data']['akrr_xdmod_instanceinfo']
                if verbose:
                    msg_body += "completed_tasks table entry:\n" + pp.pformat(
                        completed_tasks) + "\n"
                    msg_body += "akrr_xdmod_instanceinfo table entry:\n" + pp.pformat(
                        akrr_xdmod_instanceinfo) + "\n"
                    msg_body += 'output parsing results:\n' + akrr_xdmod_instanceinfo[
                        'body'] + "\n"
                else:
                    msg_body += "\tstatus: " + str(
                        akrr_xdmod_instanceinfo['status']) + "\n"
                    if akrr_xdmod_instanceinfo['status'] == 0:
                        msg_body += "\tstatus2: " + completed_tasks[
                            'status'] + "\n"
                    msg_body += "\tstatus_info: " + completed_tasks[
                        'status_info'] + "\n"
            else:
                msg_body += r.text + "\n"

            tail_msg = "time: " + t.strftime("%Y-%m-%d %H:%M:%S")

            if msg_body != msg_body0:
                print("\n\n" + msg_body)
                print(tail_msg, end=' ')
                sys.stdout.flush()
            else:
                print("\r" + tail_msg, end=' ')
                sys.stdout.flush()

            msg_body0 = copy.deepcopy(msg_body)

            if response_json["data"]["queue"] == "completed_tasks":
                break
        # try to update:
        try:
            payload = {'next_check_time': ''}
            akrrrestclient.put('/active_tasks/' + str(task_id), data=payload)
        except Exception:
            pass
        time.sleep(5)
    ###############################################################################################
    # analyzing the output
    log.info("Test job is completed, analyzing the output\n")
    r = akrrrestclient.get('/tasks/' + str(task_id))
    if r.status_code != 200:
        log.error(
            "Can not get information about task\n" +
            "See full error report below", "AKRR server response:\n" + r.text)
        exit(1)
    completed_tasks = r.json()['data']['data']['completed_tasks']
    akrr_xdmod_instanceinfo = r.json(
    )['data']['data']['akrr_xdmod_instanceinfo']
    akrr_errmsg = r.json()['data']['data']['akrr_errmsg']

    results_summary = make_results_summary(resource_name, app_name,
                                           completed_tasks,
                                           akrr_xdmod_instanceinfo,
                                           akrr_errmsg)
    # execution was not successful
    if completed_tasks['status'].count("ERROR") > 0:
        if completed_tasks['status'].count(
                "ERROR Can not created batch job script and submit it to remote queue"
        ) > 0:
            log.error(
                "Can not create the batch job script and/or submit it to the remote queue\n"
                + "See full error report below:\n" + results_summary)
            os.remove(test_job_lock_filename)
            exit(1)
        else:
            log.error(completed_tasks['status'] + "\n" +
                      "See full error report below:\n" + results_summary)
            os.remove(test_job_lock_filename)
            exit(1)

    # execution was not successful
    if akrr_xdmod_instanceinfo['status'] == 0:
        log.error("Task execution was not successful\n" +
                  "See full error report below:\n" + results_summary)
        os.remove(test_job_lock_filename)
        exit(1)
    # see what is in the report
    elm_perf = XMLElementTree.fromstring(akrr_xdmod_instanceinfo['body'])
    elm_perf.find('benchmark').find('parameters')
    elm_perf.find('benchmark').find('statistics')

    log.info("\nTest kernel execution summary:")
    print(results_summary)
    print()
    # log.info("\nThe output looks good.\n")
    if error_count == 0:
        # enabling resource for execution
        log.info("\nEnabling %s on %s for execution\n" %
                 (app_name, resource_name))
        try:
            result = akrrrestclient.put('/resources/%s/on' % (resource_name, ),
                                        data={'application': app_name})
            if result.status_code == 200:
                log.info("Successfully enabled %s on %s" %
                         (app_name, resource_name))
            else:
                if result is not None:
                    log.error(
                        "Can not turn-on %s on %s" % (app_name, resource_name),
                        result.text)
                else:
                    log.error("Can not turn-on %s on %s" %
                              (app_name, resource_name))
                exit(1)
            if True:
                # add entry to mod_appkernel.resource
                db_ak, cur_ak = akrr.db.get_ak_db(True)

                cur_ak.execute(
                    '''SELECT * FROM app_kernel_def WHERE ak_base_name=%s''',
                    (app_name, ))
                ak_in_akdb = cur_ak.fetchall()
                if len(ak_in_akdb) == 0:
                    cur_ak.execute(
                        "INSERT INTO app_kernel_def (name,ak_base_name,processor_unit,enabled, description, visible)"
                        "VALUES(%s,%s,'node',0,%s,0);",
                        (app_name, app_name, app_name))
                    db_ak.commit()
                cur_ak.execute(
                    '''UPDATE app_kernel_def SET enabled=1,visible=1  WHERE ak_base_name=%s''',
                    (app_name, ))
                db_ak.commit()
                # add entry to mod_akrr.resource
                db, cur = akrr.db.get_akrr_db(True)

                cur.execute('''SELECT * FROM app_kernels WHERE name=%s''',
                            (app_name, ))
                ak_in_db = cur.fetchall()
                if len(ak_in_db) == 0:
                    cur.execute(
                        '''INSERT INTO app_kernels (id,name,enabled,nodes_list)
                                VALUES(%s,%s,0,'1,2,4,8');''',
                        (ak_in_akdb['ak_def_id'], app_name))
                    db.commit()
                cur.execute(
                    '''UPDATE app_kernels SET enabled=1  WHERE name=%s''',
                    (app_name, ))
                db.commit()
        except Exception:
            log.exception("Can not turn-on %s on %s", app_name, resource_name)
            exit(1)

    if error_count > 0:
        log.error("There are %d errors, fix them.", error_count)
    if warning_count > 0:
        log.warning(
            "\nThere are %d warnings.\nIf the warnings make sense (highlighted in yellow), you can move to the next step!\n"
            % warning_count)
    if error_count == 0 and warning_count == 0:
        log.info("\nDONE, you can move to next step!\n")
    os.remove(test_job_lock_filename)
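
A minimal sketch of invoking this validation step directly, assuming app_validate can be imported from the file named above (the module path is inferred from the file name and may differ; in practice the function is normally driven through the akrr command-line interface, and the resource and app kernel names below are placeholders):

# Assumed import path and placeholder arguments, for illustration only.
from akrr.cli.app_validate import app_validate

# Validate a 2-node test run of the app kernel on the target resource.
app_validate("my_cluster", "xdmod.benchmark.hpcc", 2)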