Example #1
    def scale_in(self, size):
        ''' Cancel up to `size` of the tracked resources via scancel and
        return the number of successful cancellations.
        '''
        count = 0
        if not self.resources:
            print("No resources online, cannot scale down")

        else:
            for resource in self.resources[0:size]:
                print("Cancelling : ", resource['job_id'])
                retcode, stdout, stderr = execute_wait(
                    "scancel {0}".format(resource['job_id']), 1)
                print(retcode, stdout, stderr)
                if retcode == 0:
                    count += 1

        return count
Example #2
    def status(self):
        ''' Update the status of all tracked jobs by querying squeue. '''

        job_id_list = ','.join([j['job_id'] for j in self.resources])
        retcode, stdout, stderr = execute_wait(
            "squeue --job {0}".format(job_id_list), 1)
        for line in stdout.split('\n'):
            parts = line.split()
            # Skip blank lines and the squeue header row
            if parts and parts[0] != 'JOBID':
                print("Parts : ", parts)
                job_id = parts[0]
                # Column 5 (index 4) holds the squeue state code, e.g. PD, R, CD
                status = translate_table.get(parts[4], 'UNKNOWN')
                for job in self.resources:
                    if job['job_id'] == job_id:
                        job['status'] = status
        print(self.resources)
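
All of the examples on this page lean on two helpers that are not shown here: an execute_wait function that runs a shell command and hands back (retcode, stdout, stderr), and a translate_table dict that maps scheduler state codes to the provider's own status strings. The following is a minimal sketch of what they might look like, assuming a blocking subprocess call and Slurm-style state codes; the real implementations may differ in names, timeout handling, and the set of codes covered.

    import subprocess

    # Hypothetical stand-in for the execute_wait helper used in these examples:
    # run a shell command, wait up to `walltime` seconds, and return
    # (retcode, stdout, stderr) as the examples expect.
    def execute_wait(cmd_string, walltime):
        proc = subprocess.Popen(cmd_string, shell=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        try:
            stdout, stderr = proc.communicate(timeout=walltime)
        except subprocess.TimeoutExpired:
            proc.kill()
            stdout, stderr = proc.communicate()
        return proc.returncode, stdout.decode(), stderr.decode()

    # Hypothetical state mapping: scheduler state codes -> provider status strings.
    # Only the codes referenced on this page are shown; the real table is larger.
    translate_table = {
        'PD': 'PENDING',
        'R': 'RUNNING',
        'CA': 'CANCELLED',
        'CD': 'COMPLETED',
        'F': 'FAILED',
    }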
Example #3
    def submit(self, cmd_string, blocksize, job_name="parsl.auto"):
        ''' Submits the cmd_string to a Local Resource Manager job of blocksize parallel elements.

        example file with the complex case of multiple submits per job:
            Universe =vanilla
            output = out.$(Cluster).$(Process)
            error = err.$(Cluster).$(Process)
            log = log.$(Cluster)
            leave_in_queue = true
            executable = test.sh
            queue 5
            executable = foo
            queue 1

        $ condor_submit test.sub
        Submitting job(s)......
        5 job(s) submitted to cluster 118907.
        1 job(s) submitted to cluster 118908.
        '''

        # Note: blocksize is overridden here; each call makes a single submission.
        blocksize = 1
        # Assumed (not shown in this snippet): the provider options act as the
        # base job config, mirroring the pattern in Example #8 below.
        job_config = self.config["execution"]["options"]
        job_config["nodes"] = 1
        job_config["condor_overrides"] = job_config.get("condor_overrides", '')
        job_config["user_script"] = cmd_string

        # template_string and script_path are assumed to be defined elsewhere
        # (a module-level submit template and a per-job script path).
        ret = self._write_submit_script(template_string, script_path, job_name, job_config)

        retcode, stdout, stderr = execute_wait("condor_submit {0}".format(script_path), 3)
        logger.debug("Retcode:%s STDOUT:%s STDERR:%s", retcode,
                     stdout.strip(), stderr.strip())

        job_id = []

        if retcode == 0:
            for line in stdout.split('\n'):
                if re.match('^[0-9]', line) is not None:
                    cluster = line.split(" ")[5]
                    # We know the first job id ("process" in condor terms) within a
                    # cluster is 0 and we know the total number of jobs from
                    # condor_submit, so we use some list comprehensions to expand
                    # the condor_submit output into job IDs
                    # e.g., ['118907.0', '118907.1', '118907.2', '118907.3', '118907.4', '118908.0']
                    # The job count is the first whitespace-delimited token on the line.
                    job_count = int(line.split(" ")[0])
                    processes = [str(x) for x in range(0, job_count)]
                    job_id += [cluster + process for process in processes]

        return job_id
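
To make the job-ID expansion in the loop above concrete, here is a small standalone sketch that runs the sample condor_submit output from the docstring through the same parsing logic. The helper name expand_condor_output is purely illustrative.

    import re

    # Illustrative helper mirroring the parsing loop above: expand condor_submit
    # output into "<cluster>.<process>" job IDs.
    def expand_condor_output(stdout):
        job_id = []
        for line in stdout.split('\n'):
            if re.match('^[0-9]', line) is not None:
                cluster = line.split(" ")[5]       # e.g. "118907."
                count = int(line.split(" ")[0])    # e.g. 5
                job_id += [cluster + str(x) for x in range(count)]
        return job_id

    sample = ("Submitting job(s)......\n"
              "5 job(s) submitted to cluster 118907.\n"
              "1 job(s) submitted to cluster 118908.\n")
    print(expand_condor_output(sample))
    # ['118907.0', '118907.1', '118907.2', '118907.3', '118907.4', '118908.0']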
Example #4
    def scale_out(self, size, name=None):
        ''' Fill in the midway submit template, write it out as a script, and
        submit it with sbatch, recording the new job in self.resources.
        '''
        from datetime import datetime

        # Read the IPython controller's engine connection file so the engines
        # started inside the batch job can connect back to the controller.
        ipengine_json = None
        with open(
                os.path.expanduser(
                    "~/.ipython/profile_default/security/ipcontroller-engine.json"
                ), 'r') as f:
            ipengine_json = f.read()

        job_name = "midway.parsl_auto.{0}".format(
            datetime.now().strftime("%Y-%m-%d_%H:%M:%S"))
        script_name = job_name + ".submit"
        submit_script = None

        with open(
                os.path.join(os.path.dirname(__file__),
                             './midway.template.submit'), 'r') as f:
            submit_script = Template(
                f.read()).safe_substitute(**self.config,
                                          nodes=1,
                                          jobname=job_name,
                                          ipengine_json=ipengine_json)

        with open(script_name, 'w') as f:
            f.write(submit_script)

        retcode, stdout, stderr = execute_wait(
            "sbatch {0}".format(script_name), 1)
        print("retcode : ", retcode)
        print("stdout  : ", stdout)
        print("stderr  : ", stderr)
        if retcode == 0:
            for line in stdout.split('\n'):
                if line.startswith("Submitted batch job"):
                    job_id = line.split("Submitted batch job")[1]
                    self.resources.extend([{
                        'job_id': job_id.strip(),
                        'status': 'submitted',
                        'size': size
                    }])
        else:
            print("Submission of command to scale_out failed")
Example #5
    def cancel(self, job_ids):
        ''' Cancels the jobs specified by a list of job ids

        Args:
        job_ids : [<job_id> ...]

        Returns:
        [True/False...] : If the cancel operation fails, the entire list will be False.
        '''

        job_id_list = ' '.join(job_ids)
        retcode, stdout, stderr = execute_wait("condor_rm {0}".format(job_id_list), 3)
        rets = None
        if retcode == 0:
            for jid in job_ids:
                self.resources[jid]['status'] = translate_table['CA']  # Setting state to cancelled
            rets = [True for i in job_ids]
        else:
            rets = [False for i in job_ids]

        return rets
Example #6
    def _status(self):
        ''' Internal: Do not call. Updates the status of all tracked jobs in
        self.resources by parsing `qstat -u $USER` output.

        Args:
              self

        Returns:
              None (statuses are updated in place on self.resources)
        '''

        # Track which of our jobs qstat fails to report on; anything left in this
        # list after parsing is assumed to have finished.
        jobs_missing = list(self.resources.keys())

        retcode, stdout, stderr = execute_wait("qstat -u $USER", 3)
        for line in stdout.split('\n'):
            if line.startswith('='): continue

            parts = line.upper().split()
            if parts and parts[0] != 'JOBID':
                job_id = parts[0]
                print(parts)

                if job_id not in self.resources: continue

                status = translate_table.get(parts[4], 'UNKNOWN')

                self.resources[job_id]['status'] = status
                jobs_missing.remove(job_id)

        print("Jobs list : ", self.resources)
        # qstat does not report on jobs that have finished, so we fill in the
        # blanks for missing jobs; we may lose some information about why a job failed.
        for missing_job in jobs_missing:
            if self.resources[missing_job]['status'] in ['PENDING', 'RUNNING']:
                self.resources[missing_job]['status'] = translate_table['CD']
Example #7
    def _get_job_status(self, job_id):
        ''' Query squeue for the status of a single job id. '''
        retcode, stdout, stderr = execute_wait(
            "squeue --job {0}".format(job_id), 1)
        print("Stdout : ", stdout)
Example #8
    def submit(self, cmd_string, blocksize, job_name="parsl.auto"):
        ''' Submits the cmd_string to a Local Resource Manager job of blocksize parallel elements.
        Submit returns an ID that corresponds to the task that was just submitted.

        If tasks_per_node < 1:
             This is illegal; tasks_per_node must be a positive integer.

        If tasks_per_node == 1:
             A single node is provisioned.

        If tasks_per_node > 1:
             ceil(blocksize / tasks_per_node) nodes are provisioned.

        Args:
             - cmd_string (String): Commandline invocation to be made on the remote side.
             - blocksize (float): Number of parallel elements requested.

        Kwargs:
             - job_name (String): Name for the job, must be unique.

        Returns:
             - None: At capacity, cannot provision more
             - job_id (string): Identifier for the job

        '''

        if self.current_blocksize >= self.config["execution"]["options"][
                "max_parallelism"]:
            logger.warn("[%s] at capacity, cannot add more blocks now",
                        self.sitename)
            return None

        # Note: Fix this later to avoid confusing behavior.
        # We should always allocate blocks in integer counts of node_granularity
        if blocksize < self.config["execution"]["options"]["node_granularity"]:
            blocksize = self.config["execution"]["options"]["node_granularity"]

        account_opt = "-A {0}".format(self.config["execution"]["options"].get(
            "account", ''))

        job_name = "parsl.{0}.{1}".format(job_name, time.time())

        script_path = "{0}/{1}.submit".format(
            self.config["execution"]["options"]["submit_script_dir"], job_name)

        nodes = math.ceil(
            float(blocksize) /
            self.config["execution"]["options"]["tasks_per_node"])
        logger.debug("Requesting blocksize:%s tasks_per_node:%s nodes:%s",
                     blocksize,
                     self.config["execution"]["options"]["tasks_per_node"],
                     nodes)

        job_config = self.config["execution"]["options"]
        job_config["nodes"] = nodes
        job_config["slurm_overrides"] = job_config.get("slurm_overrides", '')
        job_config["user_script"] = cmd_string

        ret = self._write_submit_script(template_string, script_path, job_name,
                                        job_config)

        retcode, stdout, stderr = execute_wait(
            "qsub -n {0} -t {1} {2} {3}".format(nodes, self.max_walltime,
                                                account_opt, script_path), 3)
        logger.debug("Retcode:%s STDOUT:%s STDERR:%s", retcode, stdout.strip(),
                     stderr.strip())

        job_id = None

        if retcode == 0:
            for line in stdout.split('\n'):
                if line.startswith("Submitted batch job"):
                    job_id = line.split("Submitted batch job")[1].strip()
                    self.resources[job_id] = {
                        'job_id': job_id,
                        'status': 'PENDING',
                        'blocksize': blocksize
                    }
        else:
            print("Submission of command failed")

        return job_id
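
The blocksize-to-nodes arithmetic in the body above is simply a ceiling division, nodes = ceil(blocksize / tasks_per_node). A quick worked example with an assumed tasks_per_node of 4:

    import math

    # Worked example of the node calculation used above.
    tasks_per_node = 4
    for blocksize in [1, 4, 5]:
        nodes = math.ceil(float(blocksize) / tasks_per_node)
        print(blocksize, tasks_per_node, nodes)
    # -> 1 4 1
    #    4 4 1
    #    5 4 2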