Example #1
    def test_unsupported(self):
        with self.assertRaises(Exception) as cm:
            get_queue_adapter({
                'config': {
                    'scheduler': {
                        'type': 'foo'
                    }
                }
            }, None)

        self.assertIsNotNone(cm.exception)
Example #2
def terminate_job(cluster, job, log_write_url=None, girder_token=None):
    script_filepath = None
    headers = {'Girder-Token':  girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:

        with get_connection(girder_token, cluster) as conn:
            if AbstractQueueAdapter.QUEUE_JOB_ID in job:
                queue_adapter = get_queue_adapter(cluster, conn)
                output = queue_adapter.terminate_job(job)
            else:
                r = requests.patch(status_url, headers=headers,
                                   json={'status': JobState.TERMINATED})
                check_status(r)

            if 'onTerminate' in job:
                commands = '\n'.join(job['onTerminate']['commands']) + '\n'
                commands = Template(commands) \
                    .render(cluster=cluster,
                            job=job,
                            base_url=cumulus.config.girder.baseUrl)

                on_terminate = _put_script(conn, commands + '\n')

                terminate_output = '%s.terminate.out' % job_id
                terminate_cmd = 'nohup %s  &> %s  &\n' % (on_terminate,
                                                          terminate_output)
                terminate_cmd = _put_script(conn, terminate_cmd)
                output = conn.execute(terminate_cmd)

                conn.remove(on_terminate)
                conn.remove(terminate_cmd)

                if len(output) != 1:
                    raise Exception('PID not returned by execute command')

                try:
                    pid = int(output[0])
                except ValueError:
                    raise Exception('Unable to extract PID from: %s'
                                    % output)

                output_message = 'onTerminate error: %s'
                monitor_process.delay(cluster, job, pid, terminate_output,
                                      log_write_url=log_write_url,
                                      output_message=output_message,
                                      girder_token=girder_token)

    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
    finally:
        if script_filepath and os.path.exists(script_filepath):
            os.remove(script_filepath)
Example #3
def terminate_job(cluster, job, log_write_url=None, girder_token=None):
    script_filepath = None
    headers = {'Girder-Token':  girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:

        with get_connection(girder_token, cluster) as conn:
            if AbstractQueueAdapter.QUEUE_JOB_ID in job:
                queue_adapter = get_queue_adapter(cluster, conn)
                output = queue_adapter.terminate_job(job)
            else:
                r = requests.patch(status_url, headers=headers,
                                   json={'status': JobState.TERMINATED})
                check_status(r)

            if 'onTerminate' in job:
                commands = '\n'.join(job['onTerminate']['commands']) + '\n'
                commands = Template(commands) \
                    .render(cluster=cluster,
                            job=job,
                            base_url=cumulus.config.girder.baseUrl)

                on_terminate = _put_script(conn, commands + '\n')

                terminate_output = '%s.terminate.out' % job_id
                terminate_cmd = 'nohup %s  &> %s  &\n' % (on_terminate,
                                                          terminate_output)
                terminate_cmd = _put_script(conn, terminate_cmd)
                output = conn.execute(terminate_cmd)

                conn.remove(on_terminate)
                conn.remove(terminate_cmd)

                if len(output) != 1:
                    raise Exception('PID not returned by execute command')

                try:
                    pid = int(output[0])
                except ValueError:
                    raise Exception('Unable to extract PID from: %s'
                                    % output)

                output_message = 'onTerminate error: %s'
                monitor_process.delay(cluster, job, pid, terminate_output,
                                      log_write_url=log_write_url,
                                      output_message=output_message,
                                      girder_token=girder_token)

    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
    finally:
        if script_filepath and os.path.exists(script_filepath):
            os.remove(script_filepath)
Example #4
    def setUp(self):
        self._cluster_connection = mock.MagicMock()
        self._adapter = get_queue_adapter({
            'config': {
                'scheduler': {
                    'type': QueueType.SLURM
                }
            },
            'type': 'trad'
        }, self._cluster_connection)
Example #5
    def setUp(self):
        self._cluster_connection = mock.MagicMock()
        self._adapter = get_queue_adapter(
            {
                'config': {
                    'scheduler': {
                        'type': QueueType.PBS
                    }
                },
                'type': 'trad'
            }, self._cluster_connection)
Example #6
def _monitor_jobs(task, cluster, jobs, log_write_url=None, girder_token=None,
                  monitor_interval=5):
    headers = {'Girder-Token':  girder_token}

    cluster_url = '%s/clusters/%s' % (
        cumulus.config.girder.baseUrl, cluster['_id'])
    try:
        with get_connection(girder_token, cluster) as conn:

            try:
                job_queue_states \
                    = get_queue_adapter(cluster, conn).job_statuses(jobs)

                new_states = set()
                for (job, state) in job_queue_states:
                    job_id = job['_id']
                    # First get the current status
                    status_url = '%s/jobs/%s/status' % (
                        cumulus.config.girder.baseUrl, job_id)
                    r = requests.get(status_url, headers=headers)
                    check_status(r)
                    current_status = r.json()['status']

                    if current_status == JobState.TERMINATED:
                        continue

                    job_status = from_string(current_status, task=task,
                                             cluster=cluster, job=job,
                                             log_write_url=log_write_url,
                                             girder_token=girder_token,
                                             conn=conn)
                    job_status = job_status.next(state)
                    job['status'] = str(job_status)
                    job_status.run()
                    json = {
                        'status': str(job_status),
                        'timings': job.get('timings', {}),
                        'output': job['output']
                    }
                    job_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl,
                                              job['_id'])
                    r = requests.patch(job_url, headers=headers, json=json)
                    check_status(r)

                    new_states.add(job['status'])

                # Now see if we still have jobs to monitor
                running_states = set(
                    [JobState.CREATED, JobState.QUEUED,
                     JobState.RUNNING, JobState.TERMINATING]
                )

                # Do we have any job still in a running state?
                if new_states & running_states:
                    task.retry(countdown=monitor_interval)
            except EOFError:
                # Try again
                task.retry(countdown=5)
                return
            except paramiko.ssh_exception.NoValidConnectionsError:
                # Try again
                task.retry(countdown=5)
                return
    # Ensure that the Retry exception will get through
    except Retry:
        raise
    except paramiko.ssh_exception.NoValidConnectionsError as ex:
        r = requests.patch(cluster_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_cluster_logger(cluster, girder_token).exception(str(ex))

    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(cluster_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_cluster_logger(cluster, girder_token).exception(str(ex))
        raise
Example #7
def submit_job(cluster, job, log_write_url=None, girder_token=None,
               monitor=True):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token':  girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)
    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        script_name = job['name']

        with get_connection(girder_token, cluster) as conn:
            job_params = {}
            if 'params' in job:
                job_params = job['params']

            output = conn.execute('pwd')
            if len(output) != 1:
                raise Exception('Unable to fetch users home directory.')

            user_home = output[0].strip()
            job_dir = job_directory(cluster, job, user_home=user_home)
            job['dir'] = job_dir

            slots = -1

            # Try job parameters first
            slots = int(job_params.get('numberOfSlots', slots))

            if slots == -1:
                # Try the cluster
                slots = int(cluster['config'].get('numberOfSlots', slots))

            parallel_env = _get_parallel_env(cluster, job)
            if parallel_env:
                job_params['parallelEnvironment'] = parallel_env

                # If the number of slots has not been provided we will get
                # the number of slots from the parallel environment
                if slots == -1:
                    slots = int(get_queue_adapter(cluster, conn)
                                .number_of_slots(parallel_env))
                    if slots > 0:
                        job_params['numberOfSlots'] = slots

            script = _generate_submission_script(job, cluster, job_params)

            conn.mkdir(job_dir, ignore_failure=True)
            # put the script to master
            conn.put(StringIO(script), os.path.join(job_dir, script_name))

            if slots > -1:
                log.info('We have %s slots available' % slots)

            # Now submit the job
            queue_job_id \
                = get_queue_adapter(cluster, conn).submit_job(job,
                                                              script_name)

            # Update the state and queue job id
            job[AbstractQueueAdapter.QUEUE_JOB_ID] = queue_job_id
            patch_data = {
                'status': JobState.QUEUED,
                AbstractQueueAdapter.QUEUE_JOB_ID: queue_job_id,
                'dir': job_dir
            }

            r = requests.patch(status_url, headers=headers, json=patch_data)
            check_status(r)
            job = r.json()
            job['queuedTime'] = time.time()

            # Now monitor the jobs progress
            if monitor:
                monitor_job.s(
                    cluster, job, log_write_url=log_write_url,
                    girder_token=girder_token).apply_async(countdown=5)

        # Now update the status of the job
        headers = {'Girder-Token':  girder_token}
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.QUEUED})
        check_status(r)
    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
Example #8
def _monitor_jobs(task,
                  cluster,
                  jobs,
                  log_write_url=None,
                  girder_token=None,
                  monitor_interval=5):
    headers = {'Girder-Token': girder_token}

    cluster_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                      cluster['_id'])
    try:
        with get_connection(girder_token, cluster) as conn:

            try:
                job_queue_states \
                    = get_queue_adapter(cluster, conn).job_statuses(jobs)

                new_states = set()
                for (job, state) in job_queue_states:
                    job_id = job['_id']
                    # First get the current status
                    status_url = '%s/jobs/%s/status' % (
                        cumulus.config.girder.baseUrl, job_id)
                    r = requests.get(status_url, headers=headers)
                    check_status(r)
                    current_status = r.json()['status']

                    if current_status == JobState.TERMINATED:
                        continue

                    job_status = from_string(current_status,
                                             task=task,
                                             cluster=cluster,
                                             job=job,
                                             log_write_url=log_write_url,
                                             girder_token=girder_token,
                                             conn=conn)
                    job_status = job_status.next(state)
                    job['status'] = str(job_status)
                    job_status.run()
                    json = {
                        'status': str(job_status),
                        'timings': job.get('timings', {}),
                        'output': job['output']
                    }
                    job_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl,
                                              job['_id'])
                    r = requests.patch(job_url, headers=headers, json=json)
                    check_status(r)

                    new_states.add(job['status'])

                # Now see if we still have jobs to monitor
                running_states = set([
                    JobState.CREATED, JobState.QUEUED, JobState.RUNNING,
                    JobState.TERMINATING
                ])

                # Do we have any job still in a running state?
                if new_states & running_states:
                    task.retry(countdown=monitor_interval)
            except EOFError:
                # Try again
                task.retry(countdown=5)
                return
            except paramiko.ssh_exception.NoValidConnectionsError:
                # Try again
                task.retry(countdown=5)
                return
    # Ensure that the Retry exception will get through
    except Retry:
        raise
    except paramiko.ssh_exception.NoValidConnectionsError as ex:
        r = requests.patch(cluster_url,
                           headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_cluster_logger(cluster, girder_token).exception(str(ex))

    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(cluster_url,
                           headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_cluster_logger(cluster, girder_token).exception(str(ex))
        raise
Example #9
def submit_job(cluster,
               job,
               log_write_url=None,
               girder_token=None,
               monitor=True):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)
    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        script_name = job['name']

        with get_connection(girder_token, cluster) as conn:
            job_params = {}
            if 'params' in job:
                job_params = job['params']

            output = conn.execute('pwd')
            if len(output) != 1:
                raise Exception('Unable to fetch users home directory.')

            user_home = output[0].strip()
            job_dir = job_directory(cluster, job, user_home=user_home)
            job['dir'] = job_dir

            slots = -1

            # Try job parameters first
            slots = int(job_params.get('numberOfSlots', slots))

            if slots == -1:
                # Try the cluster
                slots = int(cluster['config'].get('numberOfSlots', slots))

            parallel_env = _get_parallel_env(cluster, job)
            if parallel_env:
                job_params['parallelEnvironment'] = parallel_env

                # If the number of slots has not been provided we will get
                # the number of slots from the parallel environment
                if slots == -1:
                    slots = int(
                        get_queue_adapter(cluster,
                                          conn).number_of_slots(parallel_env))
                    if slots > 0:
                        job_params['numberOfSlots'] = slots

            script = _generate_submission_script(job, cluster, job_params)

            conn.makedirs(job_dir)
            # put the script to master
            conn.put(StringIO(script), os.path.join(job_dir, script_name))

            if slots > -1:
                log.info('We have %s slots available' % slots)

            # Now submit the job
            queue_job_id \
                = get_queue_adapter(cluster, conn).submit_job(job,
                                                              script_name)

            # Update the state and queue job id
            job[AbstractQueueAdapter.QUEUE_JOB_ID] = queue_job_id
            patch_data = {
                'status': JobState.QUEUED,
                AbstractQueueAdapter.QUEUE_JOB_ID: queue_job_id,
                'dir': job_dir
            }

            r = requests.patch(status_url, headers=headers, json=patch_data)
            check_status(r)
            job = r.json()
            job['queuedTime'] = time.time()

            # Now monitor the jobs progress
            if monitor:
                monitor_job.s(
                    cluster,
                    job,
                    log_write_url=log_write_url,
                    girder_token=girder_token).apply_async(countdown=5)

        # Now update the status of the job
        headers = {'Girder-Token': girder_token}
        r = requests.patch(status_url,
                           headers=headers,
                           json={'status': JobState.QUEUED})
        check_status(r)
    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(status_url,
                           headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
Example #10
    def test_unsupported(self):
        with self.assertRaises(Exception) as cm:
            get_queue_adapter({'config': {'scheduler': {'type': 'foo'}}}, None)

        self.assertIsNotNone(cm.exception)
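
Taken together, the examples call get_queue_adapter(cluster, connection) to obtain a scheduler-specific adapter: the scheduler type is read from cluster['config']['scheduler']['type'] (QueueType.SLURM, QueueType.PBS), and an unrecognised type raises an exception, as Examples #1 and #10 assert. The snippet below is a minimal sketch of that dispatch pattern only; the adapter class names, the QueueType string values and the _ADAPTERS registry are illustrative assumptions, not the cumulus implementation.

# Minimal sketch of the dispatch performed by get_queue_adapter(); the
# adapter classes, QueueType values and _ADAPTERS registry are assumptions.
class QueueType(object):
    SLURM = 'slurm'
    PBS = 'pbs'


class _SketchAdapter(object):
    def __init__(self, cluster, connection):
        self._cluster = cluster
        self._connection = connection


class SlurmQueueAdapter(_SketchAdapter):
    pass


class PbsQueueAdapter(_SketchAdapter):
    pass


_ADAPTERS = {
    QueueType.SLURM: SlurmQueueAdapter,
    QueueType.PBS: PbsQueueAdapter,
}


def get_queue_adapter(cluster, connection):
    # Read the configured scheduler type and map it to an adapter class,
    # raising for unsupported types as exercised in Examples #1 and #10.
    queue_type = cluster['config']['scheduler']['type']
    if queue_type not in _ADAPTERS:
        raise Exception('Unsupported queue type: %s' % queue_type)

    return _ADAPTERS[queue_type](cluster, connection)

With these assumptions, passing {'config': {'scheduler': {'type': QueueType.PBS}}} returns a PbsQueueAdapter instance, while a scheduler type of 'foo' raises, matching the tests above.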