Exemple #1
0
    def status(self, job_ids):
        ''' Refresh the status of every tracked job and report the requested ones.

        Each entry in ``self.resources`` is probed by running ``ps -p <remote_pid>``
        through ``self.channel``. The probe echoes a ``STATUS:<exit code>`` line;
        an exit code of "0" marks the job RUNNING, anything else marks it FAILED.
        Note that all tracked jobs are probed, not only those in ``job_ids``.

        Args:
            - job_ids (List of ids) : Identifiers of the jobs to report on

        Returns:
            - List of JobStatus objects, in the same order as ``job_ids``.

        '''

        logger.debug("Checking status of: {0}".format(job_ids))

        probe = 'ps -p {} > /dev/null 2> /dev/null; echo "STATUS:$?" '
        for record in self.resources.values():
            _, stdout, _ = self.channel.execute_wait(
                probe.format(record['remote_pid']), self.cmd_timeout)

            for line in stdout.split('\n'):
                if not line.startswith("STATUS:"):
                    continue
                exit_code = line.split("STATUS:")[1].strip()
                state = JobState.RUNNING if exit_code == "0" else JobState.FAILED
                record['status'] = JobStatus(state)

        return [self.resources[jid]['status'] for jid in job_ids]
Exemple #2
0
    def _status(self):
        ''' Internal: Do not call. Refresh the cached status of every tracked job.

        Runs ``squeue --job <ids>`` for all job ids in ``self.resources`` and
        stores a fresh JobStatus under each job's ``'status'`` key. The fifth
        column of each squeue row is translated through ``translate_table``
        (falling back to JobState.UNKNOWN). Tracked jobs that squeue does not
        list are marked COMPLETED.

        Args:
              self

        Returns:
              None. Statuses are updated in place in ``self.resources``; nothing
              is updated when no jobs are tracked or when squeue exits non-zero.
        '''
        if not self.resources:
            # Nothing is tracked, so there is no job list to hand to squeue.
            return

        job_id_list = ','.join(self.resources.keys())
        cmd = "squeue --job {0}".format(job_id_list)
        logger.debug("Executing sqeueue")
        retcode, stdout, stderr = self.execute_wait(cmd)
        logger.debug("sqeueue returned")

        # Execute_wait failed. Do not update
        if retcode != 0:
            logger.warning("squeue failed with non-zero exit code {} - see https://github.com/Parsl/parsl/issues/1588".format(retcode))
            return

        reported = set()
        for line in stdout.split('\n'):
            parts = line.split()
            if not parts or parts[0] == 'JOBID':
                # Blank line or the squeue header row.
                continue
            if len(parts) < 5:
                # The state is read from the fifth column; a shorter row cannot be parsed.
                logger.debug("Ignoring unparseable squeue line: {}".format(line))
                continue

            job_id = parts[0]
            if job_id not in self.resources:
                # Only jobs this provider tracks have a record to update.
                logger.debug("Ignoring squeue entry for untracked job {}".format(job_id))
                continue

            status = translate_table.get(parts[4], JobState.UNKNOWN)
            logger.debug("Updating job {} with slurm status {} to parsl status {}".format(job_id, parts[4], status))
            self.resources[job_id]['status'] = JobStatus(status)
            # A set tolerates the same job id showing up on more than one line.
            reported.add(job_id)

        # squeue does not report on jobs that are not running. So we are filling in the
        # blanks for missing jobs, we might lose some information about why the jobs failed.
        for missing_job in self.resources:
            if missing_job in reported:
                continue
            logger.debug("Updating missing job {} to completed status".format(missing_job))
            self.resources[missing_job]['status'] = JobStatus(JobState.COMPLETED)
Exemple #3
0
    def _least_loaded(self):
        """ Endlessly yield channels that currently have no running job.

        On each pass the number of RUNNING jobs per channel is counted from
        ``self.resources``. Every channel with a count of zero is then yielded
        in turn; counts are only recomputed once the pass is finished.

        Yields
        ------
        channel : Channel object
            A channel with no running jobs in the current pass.
        None
            When every channel has at least one running job.
        """
        while True:
            load = dict.fromkeys(self.channels, 0)
            for record in self.resources.values():
                chan = record['channel']
                if record['status'] == JobStatus(JobState.RUNNING):
                    load[chan] = load.get(chan, 0) + 1
                else:
                    # Make sure channels seen only via jobs still get an entry.
                    load.setdefault(chan, 0)

            logger.debug("Channel_counts : {}".format(load))

            if all(count != 0 for count in load.values()):
                yield None

            for chan, count in load.items():
                if count == 0:
                    yield chan
Exemple #4
0
 def _fail_job_async(self, job_id: Any, message: str):
     """Record a FAILED status with the given message for a job.

     The status is stored in ``self._simulated_status`` under ``job_id``. When
     ``job_id`` is None, a placeholder id of the form ``failed-block-<n>`` is
     generated from ``self._generated_job_id_counter``, which is then advanced.
     Presumably status() later reports these entries - confirm against status().
     """
     if job_id is None:
         sequence = self._generated_job_id_counter
         self._generated_job_id_counter = sequence + 1
         job_id = "failed-block-{}".format(sequence)

     failure = JobStatus(JobState.FAILED, message)
     self._simulated_status[job_id] = failure
Exemple #5
0
    def submit(self, command, tasks_per_node, job_name="parsl.localprovider"):
        ''' Launch the command as a background job and return its identifier.

        The command is wrapped with ``self.worker_init`` and ``self.launcher``,
        written to a submit script under ``self.script_dir``, optionally pushed
        through the channel, and started with bash in the background. The PID
        echoed by the launching shell is used as the job id.

        Args:
             - command  :(String) Commandline invocation to be made on the remote side.
             - tasks_per_node (int) : Passed to the launcher together with
               ``self.nodes_per_block`` to build the wrapped command.

        Kwargs:
             - job_name (String): Name for job; a timestamp is appended to it
               to name the submit script.

        Returns:
             - None: The channel did not report a PID for the launched command.
               Nothing is recorded in ``self.resources`` in that case.
             - job_id: (string) Identifier for the job (the remote PID)

        '''

        job_name = "{0}.{1}".format(job_name, time.time())

        # Set script path
        script_path = "{0}/{1}.sh".format(self.script_dir, job_name)
        script_path = os.path.abspath(script_path)

        wrap_command = self.worker_init + '\n' + self.launcher(command, tasks_per_node, self.nodes_per_block)

        self._write_submit_script(wrap_command, script_path)

        # Push the script when move_files is set, or when it is unset and the
        # channel is anything other than a LocalChannel.
        if (self.move_files is None and not isinstance(self.channel, LocalChannel)) or (self.move_files):
            logger.debug("Pushing start script")
            script_path = self.channel.push_file(script_path, self.channel.script_dir)

        logger.debug("Launching in remote mode")
        # The launching shell would not return until the job's streams are closed,
        # so redirect them to an .out file and run the job in the background.
        cmd = 'bash {0} > {0}.out 2>&1 & \n echo "PID:$!" '.format(script_path)
        retcode, stdout, stderr = self.channel.execute_wait(cmd, self.cmd_timeout)

        remote_pid = None
        for line in stdout.split('\n'):
            if line.startswith("PID:"):
                remote_pid = line.split("PID:")[1].strip()

        if remote_pid is None:
            # Without a PID there is nothing to poll; registering a record keyed
            # by None with a RUNNING status would make status() probe `ps -p None`.
            logger.warning("Channel failed to start remote command/retrieve PID")
            return None

        job_id = remote_pid
        self.resources[job_id] = {'job_id': job_id, 'status': JobStatus(JobState.RUNNING),
                                  'remote_pid': remote_pid}

        return job_id
Exemple #6
0
 def status(self, job_ids):
     """Look up the current status of each requested job.

     Every job id is fetched as a virtual machine from ``self.group_name``
     with its instance view expanded. The display status of the second
     instance-view status entry is mapped through ``translate_table``,
     falling back to JobState.UNKNOWN for unrecognised values.

     Parameters
     ----------
     job_ids : list of str
         Identifiers for the jobs.

     Returns
     -------
     list of JobStatus
         One status per requested job, in the order of ``job_ids``.
     """
     logger.info('List VMs in resource group')
     results = []
     for vm_name in job_ids:
         try:
             machine = self.compute_client.virtual_machines.get(
                 self.group_name, vm_name, expand='instanceView')
             display = machine.instance_view.statuses[1].display_status
             job_status = JobStatus(translate_table.get(display, JobState.UNKNOWN))
         except IndexError:
             # This only happens when it is in ProvisionState/Pending
             job_status = JobStatus(JobState.PENDING)
         results.append(job_status)
     return results