Example #1
def test_scp(testconfig, tmpdir):
    """
    Copy file to and from resource
    """
    import os
    import akrr.util.ssh as ssh
    sshcfg = testconfig['ssh']
    resource = prep_resource_dict(testconfig)
    content = u"this is test.\ntest is this!\n"

    sh = ssh.ssh_resource(resource)
    pwd = ssh.ssh_command(sh, 'pwd').strip()

    p = tmpdir / "testfile1.txt"
    p.write_text(content, encoding='utf8')
    ssh.scp_to_resource(resource, str(p), pwd)

    ssh.ssh_command(sh, "cp testfile1.txt testfile2.txt")

    p = tmpdir / "testfile2.txt"
    ssh.scp_from_resource(resource, os.path.join(pwd, "testfile2.txt"), str(p))

    assert p.read_text(encoding='utf8').strip() == content.strip()
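
The same fixtures can also exercise recursive copies: the task examples below pass a trailing "-r" option to scp_from_resource when pulling whole directories back. A minimal sketch of such a test, assuming the same testconfig fixture and prep_resource_dict helper as above and standard scp -r destination semantics (hypothetical, not part of the AKRR test suite):

def test_scp_recursive(testconfig, tmpdir):
    """
    Copy a directory tree back from the resource using the trailing "-r" flag.
    """
    import os
    import akrr.util.ssh as ssh
    resource = prep_resource_dict(testconfig)

    sh = ssh.ssh_resource(resource)
    pwd = ssh.ssh_command(sh, 'pwd').strip()

    # create a small directory with one file on the remote side
    ssh.ssh_command(sh, "mkdir -p testdir1 && echo hello > testdir1/hello.txt")

    # pull the whole directory back into tmpdir; "-r" is passed through to scp
    ssh.scp_from_resource(resource, os.path.join(pwd, "testdir1"), str(tmpdir), "-r")

    # clean up the remote side
    ssh.ssh_command(sh, "rm -rf testdir1")

    assert (tmpdir / "testdir1" / "hello.txt").read_text(encoding='utf8').strip() == "hello"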
Example #2
    def check_the_job_on_remote_machine(self):
        sh = None
        try:
            print("### Checking the job status on remote machine")
            from string import Template
            m_wait_expr = wait_expressions[self.resource['batch_scheduler']]
            cmd = Template(m_wait_expr[0]).substitute(jobId=str(self.RemoteJobID))
            rege = Template(m_wait_expr[2]).substitute(jobId=str(self.RemoteJobID))

            sh = ssh_resource(self.resource)
            msg = ssh_command(sh, cmd)
            sh.sendline("exit")
            sh.close(force=True)
            del sh
            sh = None

            match_obj = m_wait_expr[1](rege, msg, m_wait_expr[3])
            if match_obj:
                print("Still in queue. Either waiting or running")
                if datetime.datetime.today() - self.TimeJobSubmetedToRemoteQueue > \
                        self.taskParam.get('MaxTimeInQueue', cfg.max_time_in_queue):
                    print("ERROR:")
                    print("Job exceeds the maximal time in queue (%s) and will be terminated."
                          % str(self.taskParam.get('MaxTimeInQueue', cfg.max_time_in_queue)))
                    print("Removing job from remote queue.")
                    self.terminate()
                    print("copying files from remote machine")
                    scp_from_resource(self.resource,
                                      os.path.join(self.remoteTaskDir, "*"),
                                      os.path.join(self.taskDir, "jobfiles"),
                                      "-r")
                    # print msg
                    print("Deleting all files from remote machine")
                    self.delete_remote_folder()
                    self.status = "ERROR: Job exceeds the maximal time in queue (%s) and was terminated." % (
                        str(
                            self.taskParam.get('MaxTimeInQueue',
                                               cfg.max_time_in_queue)))
                    self.status_info = "\nLast Status report:\n" + msg
                    self.ReportFormat = "Error"
                    self.ToDoNextString = "check_if_subtasks_done_proccessing_results"

                    self.update_sub_tasks()
                    # del self.RemoteJobID
                    return datetime.timedelta(seconds=3)

                self.status = "Still in queue. Either waiting or running"
                self.status_info = msg
                return active_task_default_attempt_repeat
            else:
                print("Not in queue. Either exited with error or executed successfully.")
                print("copying files from remote machine")
                scp_from_resource(self.resource,
                                  os.path.join(self.remoteTaskDir, "*"),
                                  os.path.join(self.taskDir, "jobfiles"), "-r")

                print("Deleting all files from remote machine")
                self.delete_remote_folder()
                self.status = "Not in queue. Either exited with error or executed successfully. " \
                    "Copied all files to local machine. Deleted all files from remote machine"
                self.status_info = "Not in queue. Either exited with error or executed successfully. " \
                    "Copied all files to local machine. Deleted all files from remote machine"
                self.ToDoNextString = "check_if_subtasks_done_proccessing_results"
                self.update_sub_tasks()
                # del self.RemoteJobID
                self.TimeJobPossiblyCompleted = datetime.datetime.today()
                return datetime.timedelta(seconds=3)
            # print msg
        except Exception:
            if sh is not None:
                sh.sendline("exit")
                sh.close(force=True)
                del sh
            self.status = "ERROR Can not check the status of the job on remote resource"
            self.status_info = traceback.format_exc()
            self.fatal_errors_count += 1
            akrr.util.log.log_traceback(self.status)
            self.ToDoNextString = "check_the_job_on_remote_machine"
            return active_task_default_attempt_repeat
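
For context, check_the_job_on_remote_machine indexes wait_expressions by scheduler and unpacks a 4-tuple: a status-command template, a matcher callable, a regex template, and regex flags. The real table is defined elsewhere in AKRR; the self-contained sketch below uses a hypothetical SLURM entry only to illustrate how the templates and matcher fit together:

import re
from string import Template

# Hypothetical entry; the actual wait_expressions table lives in the AKRR sources.
wait_expressions = {
    'slurm': (
        "squeue -j $jobId",    # [0] status command run on the resource
        re.search,             # [1] matcher, called as matcher(pattern, text, flags)
        r"^\s*$jobId\s",       # [2] pattern template; a match means "still in queue"
        re.M,                  # [3] flags handed to the matcher
    ),
}

remote_job_id = 123456
entry = wait_expressions['slurm']
cmd = Template(entry[0]).substitute(jobId=str(remote_job_id))
pattern = Template(entry[2]).substitute(jobId=str(remote_job_id))

# stand-in for msg = ssh_command(sh, cmd)
msg = " JOBID PARTITION NAME USER ST\n 123456 general   ak   user  R\n"
still_in_queue = entry[1](pattern, msg, entry[3]) is not None
print(cmd, still_in_queue)    # squeue -j 123456 True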
Example #3
    def check_the_job_on_remote_machine(self):
        sh = None
        try:
            print("### Checking the job status on remote machine")
            from string import Template

            sh = ssh.ssh_resource(self.resource)

            # if it is subtask get master task id from job.id file (it should be replaced by master task)
            if self.RemoteJobID == 0:
                try:
                    self.RemoteJobID = int(
                        ssh.ssh_command(sh, "cat %s" % (os.path.join(self.remoteTaskDir, "job.id"))))
                except Exception as e:
                    log.error("Can not get remote job ID: %s", str(e))
                    self.RemoteJobID = 0

            m_wait_expression = wait_expressions[self.resource['batch_scheduler']]
            cmd = Template(m_wait_expression[0]).substitute(jobId=str(self.RemoteJobID))
            rege = Template(m_wait_expression[2]).substitute(jobId=str(self.RemoteJobID))

            msg = ssh.ssh_command(sh, cmd)
            sh.sendline("exit")
            sh.close(force=True)
            del sh

            if self.RemoteJobID == 0:
                return active_task_default_attempt_repeat

            match_obj = m_wait_expression[1](rege, msg, m_wait_expression[3])
            if match_obj:
                log.info("Still in queue. Either waiting or running")
                if datetime.datetime.today() - self.TimeJobSubmetedToRemoteQueue > \
                        self.taskParam.get('MaxTimeInQueue', cfg.max_time_in_queue):
                    log.error("Job exceeds the maximal time in queue (%s). And will be terminated. "
                              "Removing job from remote queue." % (
                               str(self.taskParam.get('MaxTimeInQueue', cfg.max_time_in_queue))))
                    self.terminate()

                    log.info("copying files from remote machine")
                    ssh.scp_from_resource(self.resource, os.path.join(self.remoteTaskDir, "*"),
                                          os.path.join(self.taskDir, "jobfiles"), "-r")
                    # print msg
                    log.info("Deleting all files from remote machine")
                    self.delete_remote_folder()

                    self.set_method_to_run_next(
                        "process_results",
                        "ERROR: Job exceeds the maximal time in queue (%s) and was terminated." %
                        str(self.taskParam.get('MaxTimeInQueue', cfg.max_time_in_queue)),
                        "Last Status report:\n" + msg)
                    self.ReportFormat = "Error"
                    # del self.RemoteJobID
                    return datetime.timedelta(seconds=3)

                self.set_method_to_run_next(None, "Still in queue. Either waiting or running", msg)
                return active_task_default_attempt_repeat

            log.info("Not in queue. Either exited with error or executed successfully.")
            log.info("copying files from remote machine")
            msg = ssh.scp_from_resource(self.resource, os.path.join(self.remoteTaskDir, "*"),
                                        os.path.join(self.taskDir, "jobfiles"), "-r")

            log.info("Deleting all files from remote machine")
            self.delete_remote_folder()
            self.set_method_to_run_next(
                "process_results",
                "Not in queue. Either exited with error or executed successfully. "
                "Copied all files to local machine. Deleted all files from remote machine")

            self.TimeJobPossiblyCompleted = datetime.datetime.today()
            return datetime.timedelta(seconds=3)

        except Exception as e:
            if sh is not None:
                sh.sendline("exit")
                sh.close(force=True)
                del sh
            log.error("Got exception during check_the_job_on_remote_machine: %s", e)
            self.set_method_to_run_next(
                None, "ERROR Can not check the status of the job on remote resource", traceback.format_exc())
            self.fatal_errors_count += 1
            akrr.util.log.log_traceback(self.status)
            return active_task_default_attempt_repeat
Example #4
    def create_batch_job_script_and_submit_it(self):
        self.JobScriptName = self.appName + ".job"
        print("### Creating batch job script and submitting it to remote machine")

        # as a current bypass will create a job script remotely and copy it here
        # get ssh to remote resource

        sh = None
        try:
            sh = ssh_resource(self.resource)

            # Create remote directories if needed
            # akrr_data
            check_dir(sh, self.resource['akrr_data'], raise_on_fail=True)
            # dir for app
            check_dir(sh,
                      os.path.join(self.resource['akrr_data'], self.appName),
                      raise_on_fail=True)
            # dir for task
            check_dir(sh, self.remoteTaskDir, raise_on_fail=True)
            # CheckAndCreateDir(self,sh,os.path.join(self.remoteTaskDir,"batchJob_pl"))

            # cd to remoteTaskDir
            ssh_command(sh, "cd %s" % self.remoteTaskDir)

            # get walltime from DB
            dbdefaults = {}
            try:
                db, cur = akrr.db.get_akrr_db()

                cur.execute(
                    '''SELECT resource,app,resource_param,app_param FROM active_tasks
                WHERE task_id=%s ;''', (self.task_id, ))
                raw = cur.fetchall()
                (resource, app, resource_param, app_param) = raw[0]

                cur.execute(
                    """SELECT walltime_limit
                    FROM akrr_default_walltime_limit
                    WHERE resource=%s AND app=%s AND resource_param=%s AND app_param=%s """,
                    (resource, app, resource_param, app_param))
                raw = cur.fetchall()

                if len(raw) > 0:
                    dbdefaults['walltime_limit'] = raw[0][0]

                # db.commit()
                cur.close()
                del db
            except MySQLError:
                pass

            # create job-script
            batchvars = {}

            # print "#"*80
            for di in [
                    self.resource, self.app, dbdefaults, self.resourceParam,
                    self.appParam
            ]:
                batchvars.update(di)

            # stack the subtasks
            sub_task_info = self.get_sub_task_info()
            if batchvars['shuffleSubtasks']:
                random.shuffle(sub_task_info)
            sub_tasks_execution = ""
            for subtask_id, subtask_status, subtask_datetime_stamp, \
                    subtask_resource, subtask_app, subtask_task_param in sub_task_info:
                remote_sub_task_dir = self.get_remote_task_dir(
                    self.resource['akrr_data'], subtask_app,
                    subtask_datetime_stamp)
                sub_task_job_script_name = self.get_job_script_name(
                    subtask_app)
                sub_task_job_script_path = os.path.join(
                    remote_sub_task_dir, sub_task_job_script_name)

                sub_tasks_execution += "cd " + remote_sub_task_dir + "\n"
                # subTasksExecution+="cp "+os.path.join(self.remoteTaskDir,"job.id ")+"./\n"
                sub_tasks_execution += "echo Starting " + subtask_app + "\n"
                sub_tasks_execution += self.resource[
                    'shell'] + " " + sub_task_job_script_path + " > stdout 2> stderr\n"
                sub_tasks_execution += "echo Done with " + subtask_app + "\n" + "\n"

            batchvars['subTasksExecution'] = sub_tasks_execution

            # calculate NNodes and NCores
            if 'nnodes' in batchvars:
                tmp_num_nodes = batchvars['nnodes']
                tmp_num_cores = tmp_num_nodes * batchvars['ppn']
            else:
                tmp_num_cores = batchvars['ncores']
                # use integer division so the node count stays an int under Python 3
                if tmp_num_cores % batchvars['ppn'] == 0:
                    tmp_num_nodes = tmp_num_cores // batchvars['ppn']
                else:
                    tmp_num_nodes = tmp_num_cores // batchvars['ppn'] + 1

            batchvars['akrr_num_of_cores'] = tmp_num_cores
            batchvars['akrr_num_of_nodes'] = tmp_num_nodes

            # Set batchvars remaps
            batchvars['akrr_ppn'] = batchvars['ppn']
            batchvars['akrrNCoresToBorder'] = batchvars[
                'akrr_ppn'] * batchvars['akrr_num_of_nodes']
            batchvars['akrr_task_work_dir'] = self.remoteTaskDir
            batchvars['akrr_walltime_limit'] = "%02d:%02d:00" % (
                int(batchvars['walltime_limit']) // 60,
                int(batchvars['walltime_limit']) % 60)
            batchvars['localPATH'] = ssh_command(sh, "echo $PATH").strip()
            batchvars['akrr_appkernel_name'] = self.app['name']
            batchvars['akrr_resource_name'] = self.resource['name']
            batchvars['akrr_time_stamp'] = self.timeStamp
            if batchvars['akrr_num_of_nodes'] == 1:
                batchvars['akk_ppn_or_cores_on_one_node'] = batchvars[
                    'akrr_num_of_cores']
            else:
                batchvars['akk_ppn_or_cores_on_one_node'] = batchvars[
                    'akrr_ppn']
            if 'node_list_setter_template' not in batchvars:
                batchvars['node_list_setter_template'] = batchvars[
                    'node_list_setter'][batchvars['batch_scheduler']]

            # process templates
            batchvars['akrrCommonCommands'] = akrr.util.format_recursively(
                batchvars['akrr_common_commands_template'],
                batchvars,
                keep_double_brackets=True)
            batchvars['akrrCommonCleanup'] = akrr.util.format_recursively(
                batchvars['akrr_common_cleanup_template'],
                batchvars,
                keep_double_brackets=True)

            # do parameters adjustment
            if 'process_params' in batchvars:
                batchvars['process_params'](batchvars)
            # generate job script
            job_script = akrr.util.format_recursively(
                self.resource["batch_job_template"], batchvars)
            with open(os.path.join(self.taskDir, "jobfiles", self.JobScriptName), "w") as fout:
                fout.write(job_script)
            scp_to_resource(
                self.resource,
                os.path.join(self.taskDir, "jobfiles", self.JobScriptName),
                os.path.join(self.remoteTaskDir))

            ssh_command(sh, "cat %s " % self.JobScriptName)

            # send to queue
            from string import Template
            send_to_queue = Template(
                submit_commands[self.resource['batch_scheduler']]).substitute(
                    scriptPath=self.JobScriptName)
            msg = ssh_command(sh, send_to_queue)
            match_obj = re.search(
                job_id_extract_patterns[self.resource['batch_scheduler']], msg,
                re.M | re.S)

            if match_obj:
                try:
                    job_id = int(match_obj.group(1))
                except (ValueError, IndexError):
                    raise AkrrError("Can't get job id. " + msg)
            else:
                raise AkrrError("Can't get job id. " + msg)

            ssh_command(sh, "echo %d > job.id" % job_id)

            # cp job id to subtasks
            for subtask_id, subtask_status, subtask_datetime_stamp, subtask_resource, \
                    subtask_app, subtask_task_param in sub_task_info:
                remote_sub_task_dir = self.get_remote_task_dir(
                    self.resource['akrr_data'], subtask_app,
                    subtask_datetime_stamp)
                ssh_command(sh, "cp job.id %s" % remote_sub_task_dir)

            self.RemoteJobID = job_id
            self.TimeJobSubmetedToRemoteQueue = datetime.datetime.today()

            sh.sendline("exit")
            sh.close(force=True)
            del sh
            sh = None
            print("\nRemoteJobID=", self.RemoteJobID)
            print("copying files from remote machine")
            scp_from_resource(self.resource,
                              os.path.join(self.remoteTaskDir, "*"),
                              os.path.join(self.taskDir, "jobfiles"), "-r")

            # update DB time_submitted_to_queue
            db, cur = akrr.db.get_akrr_db()

            cur.execute(
                '''UPDATE active_tasks
            SET time_submitted_to_queue=%s
            WHERE task_id=%s ;''',
                (datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S"),
                 self.task_id))

            cur.close()
            del db

            self.set_method_to_run_next(
                "check_the_job_on_remote_machine",
                "Created batch job script and have submitted it to remote queue.",
                "Remote job ID is %d" % self.RemoteJobID)

            # check first time in 1 minute
            return datetime.timedelta(days=0, hours=0, minutes=1)
        except Exception:
            if sh is not None:
                sh.sendline("exit")
                sh.close(force=True)
                del sh
            self.set_method_to_run_next(
                None,
                "ERROR Can not created batch job script and submit it to remote queue",
                traceback.format_exc())
            if cfg.max_fails_to_submit_to_the_queue >= 0:
                if hasattr(self, "fails_to_submit_to_the_queue"):
                    self.fails_to_submit_to_the_queue += 1
                    if self.fails_to_submit_to_the_queue > cfg.max_fails_to_submit_to_the_queue:
                        # Stop execution of the task and submit results to db
                        self.set_method_to_run_next("push_to_db")
                        result_file = os.path.join(self.taskDir, "result.xml")
                        self.write_error_xml(result_file)
                        return datetime.timedelta(seconds=3)
                else:
                    self.fails_to_submit_to_the_queue = 1
            else:
                self.fatal_errors_count += 1

            akrr.util.log.log_traceback(self.status)
            return cfg.repeat_after_fails_to_submit_to_the_queue
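
create_batch_job_script_and_submit_it likewise looks up per-scheduler entries in submit_commands and job_id_extract_patterns. A self-contained sketch with hypothetical SLURM values (the real dictionaries are defined elsewhere in AKRR) shows how the job id is recovered from the submission output:

import re
from string import Template

# Hypothetical SLURM entries; treat the strings as placeholders.
submit_commands = {'slurm': "sbatch $scriptPath"}
job_id_extract_patterns = {'slurm': r"Submitted batch job (\d+)"}

scheduler = 'slurm'
send_to_queue = Template(submit_commands[scheduler]).substitute(scriptPath="ior.job")

# stand-in for msg = ssh_command(sh, send_to_queue)
msg = "Submitted batch job 987654\n"
match_obj = re.search(job_id_extract_patterns[scheduler], msg, re.M | re.S)
job_id = int(match_obj.group(1)) if match_obj else None
print(send_to_queue, job_id)    # sbatch ior.job 987654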
Example #5
    def create_batch_job_script_and_submit_it(self, do_not_submit_to_queue=False):
        self.JobScriptName = self.get_job_script_name(self.appName)
        log.info("Creating batch job script and submitting it to remote machine")
        # as a current bypass will create a job script remotely and copy it here
        # get ssh to remote resource

        sh = None
        try:
            sh = ssh.ssh_resource(self.resource)

            # akrr_data
            ssh.check_dir(sh, self.resource['akrr_data'], try_to_create=True)
            # dir for app
            ssh.check_dir(sh, os.path.join(self.resource['akrr_data'], self.appName), try_to_create=True)
            # dir for task
            ssh.check_dir(sh, self.remoteTaskDir, try_to_create=True)
            # cd to remoteTaskDir
            ssh.ssh_command(sh, "cd %s" % self.remoteTaskDir)

            # generate_batch_job_script
            self.generate_batch_job_script()

            ssh.scp_to_resource(self.resource, os.path.join(self.taskDir, "jobfiles", self.JobScriptName),
                                os.path.join(self.remoteTaskDir))
            if do_not_submit_to_queue:
                return

            ssh.ssh_command(sh, "cat %s " % self.JobScriptName)

            # send to queue
            from string import Template
            job_id = 0
            if 'masterTaskID' not in self.taskParam:
                # i.e. submit to queue only if task is independent
                send_to_queue = Template(submit_commands[self.resource['batch_scheduler']]).substitute(
                    scriptPath=self.JobScriptName)
                msg = ssh.ssh_command(sh, send_to_queue)
                match_obj = re.search(job_id_extract_patterns[self.resource['batch_scheduler']], msg, re.M | re.S)

                if match_obj:
                    try:
                        job_id = int(match_obj.group(1))
                    except (ValueError, TypeError, IndexError):
                        raise AkrrError("Can't get job id:\n" + msg)
                else:
                    raise AkrrError("Can't get job id:\n" + msg)

                # report
                if self.resource["gateway_reporting"]:
                    ssh.ssh_command(sh, "module load gateway-usage-reporting")
                    ssh.ssh_command(sh, r'gateway_submit_attributes -gateway_user ' + self.resource[
                        "gateway_user"] + r''' -submit_time "`date '+%F %T %:z'`" -jobid ''' + str(job_id))

            ssh.ssh_command(sh, "echo %d > job.id" % job_id)

            self.RemoteJobID = job_id
            self.TimeJobSubmetedToRemoteQueue = datetime.datetime.today()

            sh.sendline("exit")
            sh.close(force=True)
            del sh
            sh = None
            print("\nRemoteJobID=", self.RemoteJobID)
            print("copying files from remote machine")
            ssh.scp_from_resource(self.resource, os.path.join(self.remoteTaskDir, "*"),
                                  os.path.join(self.taskDir, "jobfiles"), "-r")

            # update DB time_submitted_to_queue
            db, cur = akrr.db.get_akrr_db()

            cur.execute('''UPDATE active_tasks
            SET time_submitted_to_queue=%s
            WHERE task_id=%s ;''', (datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S"), self.task_id))

            cur.close()
            del db

            if 'masterTaskID' not in self.taskParam:
                # i.e. independent task
                self.set_method_to_run_next(
                    "check_the_job_on_remote_machine",
                    "Created batch job script and have submitted it to remote queue.",
                    "Remote job ID is %d" % self.RemoteJobID)

                # check first time in 1 minute
                return datetime.timedelta(days=0, hours=0, minutes=1)
            else:
                # i.e. this is subtask
                # i.e. dependent task
                self.set_method_to_run_next(
                    "check_the_job_on_remote_machine",
                    "Created batch job script.",
                    "Created batch job script. Waiting for master task to execute it.")

                # master task will update the time when it will finish task execution
                return datetime.timedelta(days=111 * 365)

        except Exception as e:
            if sh is not None:
                sh.sendline("exit")
                sh.close(force=True)
                del sh

            self.set_method_to_run_next(
                None, "ERROR Can not created batch job script and submit it to remote queue", traceback.format_exc())

            log.error("Got exception during attempt to create and submit job: %s", str(e))

            if cfg.max_fails_to_submit_to_the_queue >= 0:
                self.fails_to_submit_to_the_queue += 1

                if self.fails_to_submit_to_the_queue > cfg.max_fails_to_submit_to_the_queue or \
                        (self.taskParam['test_run'] is True and self.fails_to_submit_to_the_queue >= 2):
                    # Stop execution of the task and submit results to db
                    self.set_method_to_run_next("push_to_db")
                    result_file = os.path.join(self.taskDir, "result.xml")
                    self.write_error_xml(result_file)
                    return datetime.timedelta(seconds=3)
            else:
                self.fatal_errors_count += 1

            akrr.util.log.log_traceback(self.status)
            return cfg.repeat_after_fails_to_submit_to_the_queue