Example #1
def get_assetstore_id(girder_token, cluster):
    if "assetstoreId" not in cluster:
        headers = {"Girder-Token": girder_token}
        url_base = get_assetstore_url_base(cluster)
        create_url = "%s/%s" % (cumulus.config.girder.baseUrl, url_base)
        body = {
            "name": cluster["_id"],
            "host": cluster["config"]["host"],
            "machine": cluster["config"]["host"],
            "authKey": cluster["_id"],
        }

        user = parse("config.ssh.user").find(cluster)
        if user:
            body["user"] = user[0].value

        r = requests.post(create_url, json=body, headers=headers)
        check_status(r)

        cluster["assetstoreId"] = r.json()["_id"]

        cluster_url = "%s/clusters/%s" % (cumulus.config.girder.baseUrl, cluster["_id"])
        body = {"assetstoreId": cluster["assetstoreId"]}
        r = requests.patch(cluster_url, json=body, headers=headers)
        check_status(r)

    return cluster["assetstoreId"]
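All of these examples funnel their HTTP responses through check_status, which is not reproduced in this listing. A minimal sketch of such a helper, assuming it simply raises on any non-2xx response so callers can treat a normal return as success (the real cumulus implementation may differ):

import requests


def check_status(request):
    # Hypothetical sketch: surface any non-2xx response as an exception,
    # including the response body in the error message.
    if request.status_code not in (200, 201):
        raise requests.HTTPError(
            '%s: %s' % (request.status_code, request.content),
            response=request)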
Example #2
def test_connection(cluster, log_write_url=None, girder_token=None):
    cluster_id = cluster['_id']
    cluster_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                      cluster_id)
    log = get_cluster_logger(cluster, girder_token)
    headers = {'Girder-Token': girder_token}

    try:
        # First fetch the cluster with this 'admin' token so we get the
        # passphrase filled out.
        r = requests.get(cluster_url, headers=headers)
        check_status(r)
        cluster = r.json()

        with get_connection(girder_token, cluster) as conn:
            status = 'running'
            # Test that we can connect to the cluster
            output = conn.execute('pwd')
        if len(output) < 1:
            log.error('Unable to connect to cluster')
            status = 'error'

        r = requests.patch(cluster_url,
                           headers=headers,
                           json={'status': status})
        check_status(r)
    except Exception as ex:
        r = requests.patch(cluster_url,
                           headers=headers,
                           json={'status': 'error'})
        # Log the error message
        log.exception(ex)
Example #3
def check_ansible_return_code(returncode, cluster, girder_token):
    if returncode != 0:
        check_status(
            requests.patch('%s/clusters/%s' %
                           (cumulus.config.girder.baseUrl, cluster['_id']),
                           headers={'Girder-Token': girder_token},
                           json={'status': 'error'}))
Example #4
def get_assetstore_id(girder_token, cluster):
    if 'assetstoreId' not in cluster:
        headers = {'Girder-Token':  girder_token}
        url_base = get_assetstore_url_base(cluster)
        create_url = '%s/%s' % (cumulus.config.girder.baseUrl, url_base)
        body = {
            'name': cluster['_id'],
            'host': cluster['config']['host'],
            'machine': cluster['config']['host'],
            'authKey': cluster['_id']
        }

        user = parse('config.ssh.user').find(cluster)
        if user:
            body['user'] = user[0].value

        r = requests.post(create_url, json=body, headers=headers)
        check_status(r)

        cluster['assetstoreId'] = r.json()['_id']

        cluster_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                          cluster['_id'])
        body = {
            'assetstoreId': cluster['assetstoreId']
        }
        r = requests.patch(cluster_url, json=body, headers=headers)
        check_status(r)

    return cluster['assetstoreId']
Example #5
def check_ansible_return_code(returncode, cluster, girder_token):
    if returncode != 0:
        check_status(requests.patch('%s/clusters/%s' %
                                    (cumulus.config.girder.baseUrl,
                                     cluster['_id']),
                                    headers={'Girder-Token': girder_token},
                                    json={'status': 'error'}))
Example #6
def launch_cluster(playbook, cluster, profile, secret_key, extra_vars,
                   girder_token, log_write_url, post_status):
    playbook = get_playbook_path(playbook)
    playbook_variables = get_playbook_variables(cluster, profile, extra_vars)

    env = os.environ.copy()
    env.update({'AWS_ACCESS_KEY_ID': profile['accessKeyId'],
                'AWS_SECRET_ACCESS_KEY': secret_key,
                'GIRDER_TOKEN': girder_token,
                'LOG_WRITE_URL': log_write_url,
                'CLUSTER_ID': cluster['_id']})

    inventory = simple_inventory('localhost')

    with inventory.to_tempfile() as inventory_path:
        ansible = run_playbook(playbook, inventory_path, playbook_variables,
                               env=env, verbose=3)

    p = CloudProvider(dict(secretAccessKey=secret_key, **profile))

    master = p.get_master_instance(cluster['_id'])

    status_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                     cluster['_id'])
    updates = {
        'config': {
            'host': master['public_ip']
        }
    }
    headers = {'Girder-Token':  girder_token}
    r = requests.patch(status_url, headers=headers, json=updates)
    check_status(r)

    check_ansible_return_code(ansible, cluster, girder_token)
    check_girder_cluster_status(cluster, girder_token, post_status)
Example #7
def test_connection(cluster, log_write_url=None, girder_token=None):
    cluster_id = cluster['_id']
    cluster_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl, cluster_id)
    log = get_cluster_logger(cluster, girder_token)
    headers = {'Girder-Token':  girder_token}

    try:
        # First fetch the cluster with this 'admin' token so we get the
        # passphrase filled out.
        r = requests.get(cluster_url, headers=headers)
        check_status(r)
        cluster = r.json()

        with get_connection(girder_token, cluster) as conn:
            status = 'running'
            # Test that we can connect to the cluster
            output = conn.execute('pwd')
        if len(output) < 1:
            log.error('Unable to connect to cluster')
            status = 'error'

        r = requests.patch(
            cluster_url, headers=headers, json={'status': status})
        check_status(r)
    except Exception as ex:
        r = requests.patch(cluster_url, headers=headers,
                           json={'status': 'error'})
        # Log the error message
        log.exception(ex)
Example #8
def terminate_job(cluster, job, log_write_url=None, girder_token=None):
    script_filepath = None
    headers = {'Girder-Token':  girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:

        with get_connection(girder_token, cluster) as conn:
            if AbstractQueueAdapter.QUEUE_JOB_ID in job:
                queue_adapter = get_queue_adapter(cluster, conn)
                output = queue_adapter.terminate_job(job)
            else:
                r = requests.patch(status_url, headers=headers,
                                   json={'status': JobState.TERMINATED})
                check_status(r)

            if 'onTerminate' in job:
                commands = '\n'.join(job['onTerminate']['commands']) + '\n'
                commands = Template(commands) \
                    .render(cluster=cluster,
                            job=job,
                            base_url=cumulus.config.girder.baseUrl)

                on_terminate = _put_script(conn, commands + '\n')

                terminate_output = '%s.terminate.out' % job_id
                terminate_cmd = 'nohup %s  &> %s  &\n' % (on_terminate,
                                                          terminate_output)
                terminate_cmd = _put_script(conn, terminate_cmd)
                output = conn.execute(terminate_cmd)

                conn.remove(on_terminate)
                conn.remove(terminate_cmd)

                if len(output) != 1:
                    raise Exception('PID not returned by execute command')

                try:
                    pid = int(output[0])
                except ValueError:
                    raise Exception('Unable to extract PID from: %s'
                                    % output)

                output_message = 'onTerminate error: %s'
                monitor_process.delay(cluster, job, pid, terminate_output,
                                      log_write_url=log_write_url,
                                      output_message=output_message,
                                      girder_token=girder_token)

    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
    finally:
        if script_filepath and os.path.exists(script_filepath):
            os.remove(script_filepath)
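The terminate_job example above and several job examples below stage shell commands with a _put_script helper that is not reproduced here. A minimal sketch, assuming it writes the commands to a uniquely named file in the remote home directory, marks it executable, and returns the remote path so the caller can execute and later remove it:

import uuid
from io import StringIO


def _put_script(conn, commands):
    # Hypothetical sketch: stage the commands as an executable script on
    # the cluster and hand back its path for later execute()/remove() calls.
    script_name = 'script-%s.sh' % uuid.uuid4().hex
    conn.put(StringIO(commands), script_name)
    conn.execute('chmod u+x ./%s' % script_name)
    return './%s' % script_name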
Example #9
def terminate_job(cluster, job, log_write_url=None, girder_token=None):
    script_filepath = None
    headers = {'Girder-Token':  girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:

        with get_connection(girder_token, cluster) as conn:
            if AbstractQueueAdapter.QUEUE_JOB_ID in job:
                queue_adapter = get_queue_adapter(cluster, conn)
                output = queue_adapter.terminate_job(job)
            else:
                r = requests.patch(status_url, headers=headers,
                                   json={'status': JobState.TERMINATED})
                check_status(r)

            if 'onTerminate' in job:
                commands = '\n'.join(job['onTerminate']['commands']) + '\n'
                commands = Template(commands) \
                    .render(cluster=cluster,
                            job=job,
                            base_url=cumulus.config.girder.baseUrl)

                on_terminate = _put_script(conn, commands + '\n')

                terminate_output = '%s.terminate.out' % job_id
                terminate_cmd = 'nohup %s  &> %s  &\n' % (on_terminate,
                                                          terminate_output)
                terminate_cmd = _put_script(conn, terminate_cmd)
                output = conn.execute(terminate_cmd)

                conn.remove(on_terminate)
                conn.remove(terminate_cmd)

                if len(output) != 1:
                    raise Exception('PID not returned by execute command')

                try:
                    pid = int(output[0])
                except ValueError:
                    raise Exception('Unable to extract PID from: %s'
                                    % output)

                output_message = 'onTerminate error: %s'
                monitor_process.delay(cluster, job, pid, terminate_output,
                                      log_write_url=log_write_url,
                                      output_message=output_message,
                                      girder_token=girder_token)

    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
    finally:
        if script_filepath and os.path.exists(script_filepath):
            os.remove(script_filepath)
Example #10
def _is_terminating(job, girder_token):
    headers = {'Girder-Token': girder_token}
    status_url = '%s/jobs/%s/status' % (cumulus.config.girder.baseUrl,
                                        job['_id'])
    r = requests.get(status_url, headers=headers)
    check_status(r)
    current_status = r.json()['status']

    return current_status in [JobState.TERMINATED, JobState.TERMINATING]
Example #11
    def terminate_job(self, job):
        url = '%s/queue/%s/%s' % (NEWT_BASE_URL, self._machine,
                                  job['queueJobId'])
        r = self._session.delete(url)
        check_status(r)
        json_response = r.json()

        if json_response['status'] != 'OK' or json_response['error']:
            raise Exception(json_response['error'])
Example #12
    def terminate_job(self, job):
        url = '%s/queue/%s/%s' % (NEWT_BASE_URL,
                                  self._machine, job['queueJobId'])
        r = self._session.delete(url)
        check_status(r)
        json_response = r.json()

        if json_response['status'] != 'OK' or json_response['error']:
            raise Exception(json_response['error'])
Example #13
def _is_terminating(job, girder_token):
    headers = {'Girder-Token':  girder_token}
    status_url = '%s/jobs/%s/status' % (cumulus.config.girder.baseUrl,
                                        job['_id'])
    r = requests.get(status_url, headers=headers)
    check_status(r)
    current_status = r.json()['status']

    return current_status in [JobState.TERMINATED, JobState.TERMINATING]
Example #14
def download_job_input_items(cluster, job, log_write_url=None,
                             girder_token=None):
    headers = {'Girder-Token':  girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        with get_connection(girder_token, cluster) as conn:
            # First put girder client on master
            path = inspect.getsourcefile(cumulus.girderclient)
            with open(path, 'r') as fp:
                conn.put(fp, os.path.basename(path))

            r = requests.patch(status_url, json={'status': 'downloading'},
                               headers=headers)
            check_status(r)

            download_cmd = 'python girderclient.py --token %s --url "%s" ' \
                           'download --dir %s  --job %s' \
                % (girder_token, cumulus.config.girder.baseUrl,
                   job_directory(cluster, job), job_id)

            download_output = '%s.download.out' % job_id
            download_cmd = 'nohup %s  &> %s  &\n' % (download_cmd,
                                                     download_output)

            download_cmd = _put_script(conn, download_cmd)
            output = conn.execute(download_cmd)

            # Remove download script
            conn.remove(download_cmd)

        if len(output) != 1:
            raise Exception('PID not returned by execute command')

        try:
            pid = int(output[0])
        except ValueError:
            raise Exception('Unable to extract PID from: %s' % output)

        # When the download is complete submit the job
        on_complete = submit_job.s(cluster, job, log_write_url=log_write_url,
                                   girder_token=girder_token)

        monitor_process.delay(cluster, job, pid, download_output,
                              log_write_url=log_write_url,
                              on_complete=on_complete,
                              girder_token=girder_token)

    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
Example #15
def download_job_input_items(cluster, job, log_write_url=None,
                             girder_token=None):
    headers = {'Girder-Token':  girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        with get_connection(girder_token, cluster) as conn:
            # First put girder client on master
            path = inspect.getsourcefile(cumulus.girderclient)
            with open(path, 'r') as fp:
                conn.put(fp, os.path.basename(path))

            r = requests.patch(status_url, json={'status': 'downloading'},
                               headers=headers)
            check_status(r)

            download_cmd = 'python girderclient.py --token %s --url "%s" ' \
                           'download --dir %s  --job %s' \
                % (girder_token, cumulus.config.girder.baseUrl,
                   job_directory(cluster, job), job_id)

            download_output = '%s.download.out' % job_id
            download_cmd = 'nohup %s  &> %s  &\n' % (download_cmd,
                                                     download_output)

            download_cmd = _put_script(conn, download_cmd)
            output = conn.execute(download_cmd)

            # Remove download script
            conn.remove(download_cmd)

        if len(output) != 1:
            raise Exception('PID not returned by execute command')

        try:
            pid = int(output[0])
        except ValueError:
            raise Exception('Unable to extract PID from: %s' % output)

        # When the download is complete submit the job
        on_complete = submit_job.s(cluster, job, log_write_url=log_write_url,
                                   girder_token=girder_token)

        monitor_process.delay(cluster, job, pid, download_output,
                              log_write_url=log_write_url,
                              on_complete=on_complete,
                              girder_token=girder_token)

    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
Example #16
    def get(self, remote_path):
        url = '%s/file/%s/%s' % (NEWT_BASE_URL, self._machine, remote_path)
        params = {'view': 'read'}
        r = None

        try:
            r = self._session.get(url, params=params, stream=True)
            check_status(r)

            yield r.raw
        finally:
            if r:
                r.close()
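The yield/finally pairing in get() suggests it is wrapped as a context manager (for example via contextlib.contextmanager) so the HTTP response is closed once the caller is done with the raw stream. A hedged usage sketch, with the remote path invented for illustration:

# Assuming conn.get() behaves as a context manager, a caller can stream a
# remote file without leaking the underlying response:
with conn.get('/global/homes/u/example/job.download.out') as fp:
    contents = fp.read()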
Example #17
    def submit_job(self, job, job_script):
        url = '%s/queue/%s' % (NEWT_BASE_URL, self._machine)
        job_file_path = os.path.join(job['dir'], job_script)
        data = {'jobfile': job_file_path}

        r = self._session.post(url, data=data)
        check_status(r)
        json_response = r.json()

        if json_response['status'] != 'OK' or 'jobid' not in json_response:
            raise Exception(json_response['error'])

        return json_response['jobid']
Example #18
    def put(self, stream, remote_path):

        name = os.path.basename(remote_path)
        path = os.path.dirname(remote_path)

        # If not a full path then assume it is relative to the user's home
        if path[0] != '/':
            # Get the user's home directory
            path = os.path.abspath(os.path.join(self._home_dir(), path))

        files = {'file': (name, stream)}
        url = '%s/file/%s%s' % (NEWT_BASE_URL, self._machine, path)
        r = self._session.post(url, files=files)
        check_status(r)
Example #19
def get_assetstore_id(girder_token, cluster):
    if 'assetstoreId' in cluster:
        return cluster['assetstoreId']

    headers = {'Girder-Token':  girder_token}
    url_base = get_assetstore_url_base(cluster)
    create_url = '%s/%s' % (cumulus.config.girder.baseUrl, url_base)
    body = {
        'name': cluster['_id'],
        'host': cluster['config']['host'],
        'machine': cluster['config']['host'],
        'authKey': cluster['_id']
    }

    user = parse('config.ssh.user').find(cluster)
    if user:
        body['user'] = user[0].value

    r = requests.post(create_url, json=body, headers=headers)

    # If the assetstore has been created, patch the cluster to point to it
    if r.status_code == 200:
        assetstore_id = r.json()['_id']

        cluster_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                          cluster['_id'])
        body = {
            'assetstoreId': assetstore_id
        }
        r = requests.patch(cluster_url, json=body, headers=headers)
        check_status(r)

    # The assetstore may have already been created by another concurrently
    # running task flow. If that's the case, just fetch it and return its id.
    elif r.status_code == 400:
        body = r.json()
        if body.get('field') == 'name' and body.get('type') == 'validation':
            assetstores_url = '%s/assetstore/lookup' % (
                cumulus.config.girder.baseUrl)
            params = {'name': cluster['_id']}
            r = requests.get(assetstores_url, params=params, headers=headers)
            check_status(r)
            assetstores = r.json()
            if len(assetstores) == 0:
                raise Exception(
                    'Could not find assetstore with name "%s" even though '
                    'it should already exist.' % cluster['_id']
                )

            assetstore_id = assetstores[0]['_id']
        else:
            check_status(r)

    # Raise any other errors
    else:
        check_status(r)

    cluster['assetstoreId'] = assetstore_id

    return assetstore_id
Example #20
def generate_key_pair(cluster, girder_token=None):
    """
    Task to generate a new key pair for a user.
    """
    cluster_id = cluster['_id']
    status_url = '%s/clusters/%s' \
        % (cumulus.config.girder.baseUrl, cluster_id)
    log = get_cluster_logger(cluster, girder_token)
    headers = {'Girder-Token': girder_token}

    try:
        new_key = RSAKey.generate(bits=4096)
        passphrase = ''.join(
            random.SystemRandom().choice(string.ascii_uppercase +
                                         string.digits) for _ in range(64))
        key_path = os.path.join(cumulus.config.ssh.keyStore, cluster_id)

        new_key.write_private_key_file(key_path, password=passphrase)
        # Allow group read as well
        os.chmod(key_path, stat.S_IREAD | stat.S_IWRITE | stat.S_IRGRP)

        comment = 'cumulus generated access key'
        public_key = '%s %s %s' % (new_key.get_name(), new_key.get_base64(),
                                   comment)

        # Update passphrase and public key on cluster model
        config_update = {
            'config': {
                'ssh': {
                    'passphrase': passphrase,
                    'publicKey': public_key
                }
            },
            'status': 'created'
        }

        patch_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                        cluster_id)
        request = requests.patch(patch_url,
                                 json=config_update,
                                 headers=headers)
        check_status(request)
    except Exception as ex:
        r = requests.patch(status_url,
                           headers=headers,
                           json={'status': 'error'})
        check_status(r)
        # Log the error message
        log.error(ex)
Example #21
    def submit_job(self, job, job_script):
        url = '%s/queue/%s' % (NEWT_BASE_URL, self._machine)
        job_file_path = os.path.join(job['dir'], job_script)
        data = {
            'jobfile': job_file_path
        }

        r = self._session.post(url, data=data)
        check_status(r)
        json_response = r.json()

        if json_response['status'] != 'OK' or 'jobid' not in json_response:
            raise Exception(json_response['error'])

        return json_response['jobid']
Example #22
def _upload_file(cluster_connection, girder_client, file, path):
    """
    Upload a file to a cluster

    :param cluster_connection: The connection to access the cluster by.
    :param girder_client: The Girder client for Girder access.
    :param file: The Girder file object.
    :param path: The path on the cluster to upload to.
    """

    r = requests.get(
        '%s/file/%s/download' % (girder_client.urlBase, file['_id']),
        headers={'Girder-Token': girder_client.token}, stream=True)
    check_status(r)
    cluster_connection.put(r.raw, os.path.join(path, file['name']))
Example #23
    def get(self, remote_path):
        url = '%s/file/%s/%s' % (NEWT_BASE_URL, self._machine, remote_path)
        params = {
            'view': 'read'
        }
        r = None

        try:
            r = self._session.get(url, params=params, stream=True)
            check_status(r)

            yield r.raw
        finally:
            if r:
                r.close()
Example #24
def terminate_cluster(playbook, cluster, profile, secret_key, extra_vars,
                      girder_token, log_write_url, post_status):

    playbook = get_playbook_path(playbook)
    playbook_variables = get_playbook_variables(cluster, profile, extra_vars)

    env = os.environ.copy()
    env.update({
        'AWS_ACCESS_KEY_ID': profile['accessKeyId'],
        'AWS_SECRET_ACCESS_KEY': secret_key,
        'GIRDER_TOKEN': girder_token,
        'LOG_WRITE_URL': log_write_url,
        'CLUSTER_ID': cluster['_id']
    })

    # If there are any volumes, make sure to detach them first.
    if 'volumes' in cluster and len(cluster['volumes']):
        p = CloudProvider(dict(secretAccessKey=secret_key, **profile))
        master = p.get_master_instance(cluster['_id'])

        for volume_id in cluster['volumes']:
            r = requests.get('%s/volumes/%s' %
                             (cumulus.config.girder.baseUrl, volume_id),
                             headers={'Girder-Token': girder_token})
            check_status(r)
            volume = r.json()

            girder_callback_info = {
                'girder_api_url': cumulus.config.girder.baseUrl,
                'girder_token': girder_token
            }

            vol_log_url = '%s/volumes/%s/log' % (cumulus.config.girder.baseUrl,
                                                 volume_id)
            detach_volume(profile, cluster, master, volume, secret_key,
                          vol_log_url, girder_callback_info)

    inventory = simple_inventory('localhost')

    with inventory.to_tempfile() as inventory_path:
        ansible = run_playbook(playbook,
                               inventory_path,
                               playbook_variables,
                               env=env,
                               verbose=3)

    check_ansible_return_code(ansible, cluster, girder_token)
    check_girder_cluster_status(cluster, girder_token, post_status)
Example #25
def _upload_file(cluster_connection, girder_client, file, path):
    """
    Upload a file to a cluster

    :param cluster_connection: The connection to access the cluster by.
    :param girder_client: The Girder client for Girder access.
    :param file: The Girder file object.
    :param path: The path on the cluster to upload to.
    """

    r = requests.get('%s/file/%s/download' %
                     (girder_client.urlBase, file['_id']),
                     headers={'Girder-Token': girder_client.token},
                     stream=True)
    check_status(r)
    cluster_connection.put(r.raw, os.path.join(path, file['name']))
Example #26
    def put(self, stream, remote_path):

        name = os.path.basename(remote_path)
        path = os.path.dirname(remote_path)

        # If not a full path then assume it is relative to the user's home
        if path[0] != '/':
            # Get the user's home directory
            path = os.path.abspath(os.path.join(self._home_dir(), path))

        files = {
            'file': (name, stream)
        }
        url = '%s/file/%s%s' % (NEWT_BASE_URL, self._machine, path)
        r = self._session.post(url, files=files)
        check_status(r)
Example #27
def upload_job_output_to_folder(cluster, job, log_write_url=None, job_dir=None,
                                girder_token=None):
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job['_id'])
    headers = {'Girder-Token':  girder_token}
    assetstore_base_url = get_assetstore_url_base(cluster)
    assetstore_id = get_assetstore_id(girder_token, cluster)
    if not job_dir:
        job_dir = job['dir']

    try:
        with get_connection(girder_token, cluster) as conn:
            for output in job['output']:
                if 'folderId' in output and 'path' in output:
                    folder_id = output['folderId']
                    path = os.path.join(job_dir, output['path'])
                    download_path(conn, girder_token, folder_id, path,
                                  assetstore_base_url, assetstore_id)
    except HttpError as e:
        job['status'] = JobState.ERROR
        url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
        logger = get_post_logger('job', girder_token, url)
        logger.exception(e.responseText)
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.ERROR})
        check_status(r)

    if _get_on_complete(job) == 'terminate':
        cluster_log_url = '%s/clusters/%s/log' % \
            (cumulus.config.girder.baseUrl, cluster['_id'])
        command.send_task(
            'cumulus.tasks.cluster.terminate_cluster',
            args=(cluster,), kwargs={'log_write_url': cluster_log_url,
                                     'girder_token': girder_token})

    # If we were uploading, move the job to the complete state
    if job['status'] == JobState.UPLOADING:
        job_status = from_string(job['status'], task=None,
                                 cluster=cluster, job=job,
                                 log_write_url=log_write_url,
                                 girder_token=girder_token,
                                 conn=conn)
        job_status = Complete(job_status)
        job_status = job_status.next(JobQueueState.COMPLETE)
        job_status.run()
        r = requests.patch(status_url, headers=headers,
                           json={'status': str(job_status)})
        check_status(r)
Example #28
def upload_job_output_to_folder(cluster, job, log_write_url=None, job_dir=None,
                                girder_token=None):
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job['_id'])
    headers = {'Girder-Token':  girder_token}
    assetstore_base_url = get_assetstore_url_base(cluster)
    assetstore_id = get_assetstore_id(girder_token, cluster)
    if not job_dir:
        job_dir = job['dir']

    try:
        with get_connection(girder_token, cluster) as conn:
            for output in job['output']:
                if 'folderId' in output and 'path' in output:
                    folder_id = output['folderId']
                    path = os.path.join(job_dir, output['path'])
                    download_path(conn, girder_token, folder_id, path,
                                  assetstore_base_url, assetstore_id)
    except HttpError as e:
        job['status'] = JobState.ERROR
        url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
        logger = get_post_logger('job', girder_token, url)
        logger.exception(e.responseText)
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.ERROR})
        check_status(r)

    if _get_on_complete(job) == 'terminate':
        cluster_log_url = '%s/clusters/%s/log' % \
            (cumulus.config.girder.baseUrl, cluster['_id'])
        command.send_task(
            'cumulus.tasks.cluster.terminate_cluster',
            args=(cluster,), kwargs={'log_write_url': cluster_log_url,
                                     'girder_token': girder_token})

    # If we were uploading, move the job to the complete state
    if job['status'] == JobState.UPLOADING:
        job_status = from_string(job['status'], task=None,
                                 cluster=cluster, job=job,
                                 log_write_url=log_write_url,
                                 girder_token=girder_token,
                                 conn=conn)
        job_status = Complete(job_status)
        job_status = job_status.next(JobQueueState.COMPLETE)
        job_status.run()
        r = requests.patch(status_url, headers=headers,
                           json={'status': str(job_status)})
        check_status(r)
Example #29
    def execute(self, command, ignore_exit_status=False, source_profile=True):
        url = '%s/command/%s' % (NEWT_BASE_URL, self._machine)

        # NEWT requires every command to be issued using a full executable path
        for (name, full_path) in six.iteritems(commands):
            command = re.sub(r'^%s[ ]*' % name, '%s ' % full_path, command)

        data = {'executable': command, 'loginenv': source_profile}

        r = self._session.post(url, data=data)
        check_status(r)

        json_response = r.json()
        if json_response['error']:
            raise NewtException(json_response['error'])

        return json_response['output'].split('\n')
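The rewrite loop in execute() assumes a module-level commands mapping from bare command names to absolute executable paths, since NEWT will not run relative executables. A hypothetical table of that shape (names and paths are assumptions, not the real module's entries):

# Illustrative only: map bare command names to their absolute paths.
commands = {
    'sbatch': '/usr/bin/sbatch',
    'squeue': '/usr/bin/squeue',
    'scancel': '/usr/bin/scancel',
    'qstat': '/usr/bin/qstat'
}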
Example #30
def check_girder_cluster_status(cluster, girder_token, post_status):
    # Check status from girder
    cluster_id = cluster['_id']
    headers = {'Girder-Token': girder_token}
    status_url = '%s/clusters/%s/status' % (cumulus.config.girder.baseUrl,
                                            cluster_id)
    r = requests.get(status_url, headers=headers)
    status = r.json()['status']

    if status != 'error':
        # Update girder with the new status
        status_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                         cluster_id)
        updates = {'status': post_status}

        r = requests.patch(status_url, headers=headers, json=updates)
        check_status(r)
Example #31
def generate_key_pair(cluster, girder_token=None):
    '''
    Task to generate a new key pair for a user.
    '''
    cluster_id = cluster['_id']
    status_url = '%s/clusters/%s' \
        % (cumulus.config.girder.baseUrl, cluster_id)
    log = get_cluster_logger(cluster, girder_token)
    headers = {'Girder-Token':  girder_token}

    try:
        new_key = RSAKey.generate(bits=4096)
        passphrase = ''.join(random.SystemRandom()
                             .choice(string.ascii_uppercase +
                                     string.digits) for _ in range(64))
        key_path = os.path.join(cumulus.config.ssh.keyStore, cluster_id)

        new_key.write_private_key_file(key_path, password=passphrase)
        # Allow group read as well
        os.chmod(key_path, stat.S_IREAD | stat.S_IWRITE | stat.S_IRGRP)

        comment = 'cumulus generated access key'
        public_key = '%s %s %s' % (new_key.get_name(), new_key.get_base64(),
                                   comment)

        # Update passphrase and public key on cluster model
        config_update = {
            'config': {
                'ssh': {
                    'passphrase': passphrase,
                    'publicKey': public_key
                }
            },
            'status': 'created'
        }

        patch_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                        cluster_id)
        request = requests.patch(patch_url, json=config_update, headers=headers)
        check_status(request)
    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        # Log the error message
        log.error(str(ex))
Example #32
def check_girder_cluster_status(cluster, girder_token, post_status):
    # Check status from girder
    cluster_id = cluster['_id']
    headers = {'Girder-Token':  girder_token}
    status_url = '%s/clusters/%s/status' % (cumulus.config.girder.baseUrl,
                                            cluster_id)
    r = requests.get(status_url, headers=headers)
    status = r.json()['status']

    if status != 'error':
        # Update girder with the new status
        status_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                         cluster_id)
        updates = {
            'status': post_status
        }

        r = requests.patch(status_url, headers=headers, json=updates)
        check_status(r)
Example #33
    def list(self, remote_path):
        if remote_path[0] != '/':
            # Get the user's home directory
            remote_path = os.path.abspath(os.path.join(self._home_dir(),
                                                       remote_path))

        url = '%s/file/%s/%s' % (NEWT_BASE_URL, self._machine, remote_path)
        r = self._session.get(url)
        check_status(r)

        paths = r.json()

        for path in paths:
            perms = path['perms']
            del path['perms']
            del path['hardlinks']

            path['mode'] = self._perms_to_mode(perms)
            yield path
Example #34
    def __enter__(self):

        # Do we need to get the session id for this user
        if not self._newt_session_id:
            headers = {'Girder-Token':  self._girder_token}
            url = '%s/newt/sessionId' % cumulus.config.girder.baseUrl
            r = requests.get(url, headers=headers)
            check_status(r)

            session_id = parse('sessionId').find(r.json())

            if not session_id:
                raise Exception('No NEWT session ID present')

            self._session = requests.Session()
            self._newt_session_id = session_id[0].value
            self._session.cookies.set('newt_sessionid', self._newt_session_id)

        return self
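__enter__ opens a requests.Session bound to the NEWT session id; the matching __exit__ is not shown in this listing. A plausible counterpart, assuming __init__ initializes self._session to None and the exit path only needs to release the session:

    def __exit__(self, exc_type, exc_value, tb):
        # Hypothetical sketch: close the requests session opened in
        # __enter__ so the underlying connections are released.
        if self._session is not None:
            self._session.close()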
Example #35
    def __enter__(self):

        # Do we need to get the session id for this user
        if not self._newt_session_id:
            headers = {'Girder-Token': self._girder_token}
            url = '%s/newt/sessionId' % cumulus.config.girder.baseUrl
            r = requests.get(url, headers=headers)
            check_status(r)

            session_id = parse('sessionId').find(r.json())

            if not session_id:
                raise Exception('No NEWT session ID present')

            self._session = requests.Session()
            self._newt_session_id = session_id[0].value
            self._session.cookies.set('newt_sessionid', self._newt_session_id)

        return self
Example #36
    def list(self, remote_path):
        if remote_path[0] != '/':
            # Get the user's home directory
            remote_path = os.path.abspath(
                os.path.join(self._home_dir(), remote_path))

        url = '%s/file/%s/%s' % (NEWT_BASE_URL, self._machine, remote_path)
        r = self._session.get(url)
        check_status(r)

        paths = r.json()

        for path in paths:
            perms = path['perms']
            del path['perms']
            del path['hardlinks']

            path['mode'] = self._perms_to_mode(perms)
            path['size'] = int(path['size'])
            yield path
Example #37
    def job_statuses(self, jobs):
        user = parse('config.user').find(self._cluster)

        if not user:
            raise Exception('Unable to extract user from cluster '
                            'configuration.')

        user = user[0].value
        url = '%s/queue/%s?user=%s' % (NEWT_BASE_URL, self._machine, user)
        r = self._session.get(url)
        check_status(r)
        json_response = r.json()

        states = []
        for job in jobs:
            slurm_state = self._extract_job_status(json_response, job)
            state = self.to_job_queue_state(slurm_state)
            states.append((job, state))

        return states
Example #38
    def execute(self, command, ignore_exit_status=False, source_profile=True):
        url = '%s/command/%s' % (NEWT_BASE_URL, self._machine)

        # NEWT requires every command to be issued using a full executable path
        for (name, full_path) in commands.items():
            command = re.sub(r'^%s[ ]*' % name, '%s ' % full_path, command)

        data = {
            'executable': command,
            'loginenv': source_profile
        }

        r = self._session.post(url, data=data)
        check_status(r)

        json_response = r.json()
        if json_response['error']:
            raise NewtException(json_response['error'])

        return json_response['output'].split('\n')
Example #39
    def job_statuses(self, jobs):
        user = parse('config.user').find(self._cluster)

        if not user:
            raise Exception('Unable to extract user from cluster '
                            'configuration.')

        user = user[0].value
        url = '%s/queue/%s?user=%s' % (NEWT_BASE_URL, self._machine, user)
        r = self._session.get(url)
        check_status(r)
        json_response = r.json()

        states = []
        for job in jobs:
            slurm_state = self._extract_job_status(json_response, job)
            state = self.to_job_queue_state(slurm_state)
            states.append((job, state))

        return states
Example #40
def terminate_cluster(playbook, cluster, profile, secret_key, extra_vars,
                      girder_token, log_write_url, post_status):

    playbook = get_playbook_path(playbook)
    playbook_variables = get_playbook_variables(cluster, profile, extra_vars)

    env = os.environ.copy()
    env.update({'AWS_ACCESS_KEY_ID': profile['accessKeyId'],
                'AWS_SECRET_ACCESS_KEY': secret_key,
                'GIRDER_TOKEN': girder_token,
                'LOG_WRITE_URL': log_write_url,
                'CLUSTER_ID': cluster['_id']})

    # If there are any volumes, make sure to detach them first.
    if 'volumes' in cluster and len(cluster['volumes']):
        p = CloudProvider(dict(secretAccessKey=secret_key, **profile))
        master = p.get_master_instance(cluster['_id'])

        for volume_id in cluster['volumes']:
            r = requests.get('%s/volumes/%s' %
                             (cumulus.config.girder.baseUrl, volume_id),
                             headers={'Girder-Token':  girder_token})
            check_status(r)
            volume = r.json()

            girder_callback_info = {
                'girder_api_url': cumulus.config.girder.baseUrl,
                'girder_token': girder_token}

            detach_volume(profile, cluster, master, volume,
                          secret_key, girder_callback_info)

    inventory = simple_inventory('localhost')

    with inventory.to_tempfile() as inventory_path:
        ansible = run_playbook(playbook, inventory_path, playbook_variables,
                               env=env, verbose=3)

    check_ansible_return_code(ansible, cluster, girder_token)
    check_girder_cluster_status(cluster, girder_token, post_status)
Example #41
def launch_cluster(playbook, cluster, profile, secret_key, extra_vars,
                   girder_token, log_write_url, post_status):
    playbook = get_playbook_path(playbook)
    playbook_variables = get_playbook_variables(cluster, profile, extra_vars)

    env = os.environ.copy()
    env.update({
        'AWS_ACCESS_KEY_ID': profile['accessKeyId'],
        'AWS_SECRET_ACCESS_KEY': secret_key,
        'GIRDER_TOKEN': girder_token,
        'LOG_WRITE_URL': log_write_url,
        'CLUSTER_ID': cluster['_id']
    })

    inventory = simple_inventory('localhost')

    with inventory.to_tempfile() as inventory_path:
        ansible = run_playbook(playbook,
                               inventory_path,
                               playbook_variables,
                               env=env,
                               verbose=3)

    p = CloudProvider(dict(secretAccessKey=secret_key, **profile))

    master = p.get_master_instance(cluster['_id'])

    status_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                     cluster['_id'])
    updates = {'config': {'host': master['public_ip']}}
    headers = {'Girder-Token': girder_token}
    r = requests.patch(status_url, headers=headers, json=updates)
    check_status(r)

    check_ansible_return_code(ansible, cluster, girder_token)
    check_girder_cluster_status(cluster, girder_token, post_status)
Example #42
def generate_key_pair(aws_profile, girder_token):
    try:
        client = get_ec2_client(aws_profile)
        key_path = _key_path(aws_profile)
        key_pair = client.create_key_pair(KeyName=aws_profile['_id'])

        with open(key_path, 'wb') as fp:
            fp.write(key_pair['KeyMaterial'].encode('utf8'))
        os.chmod(key_path, stat.S_IRUSR)

        aws_profile['status'] = 'available'

    except Exception as ex:
        aws_profile['status'] = 'error'
        aws_profile['errorMessage'] = '%s: %s' % (type(ex).__name__, ex)
        traceback.print_exc()

    update_url = '%s/user/%s/aws/profiles/%s' % (cumulus.config.girder.baseUrl,
                                                 aws_profile['userId'],
                                                 aws_profile['_id'])

    headers = {'Girder-Token':  girder_token}
    r = requests.patch(update_url, json=aws_profile, headers=headers)
    check_status(r)
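_key_path is not shown in the example above; a plausible sketch, assuming profile keys live in the same configured key store used for cluster keys elsewhere in this listing and are named after the profile id:

import os

import cumulus


def _key_path(profile):
    # Hypothetical helper: keep private keys under the configured key
    # store, one file per profile id (mirrors the cluster key handling in
    # the cluster generate_key_pair task).
    return os.path.join(cumulus.config.ssh.keyStore, profile['_id'])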
Example #43
def monitor_process(task, cluster, job, pid, nohup_out_path,
                    log_write_url=None, on_complete=None,
                    output_message='Job download/upload error: %s',
                    girder_token=None):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token':  girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        with get_connection(girder_token, cluster) as conn:
            # See if the process is still running
            output = conn.execute('ps %s | grep %s' % (pid, pid),
                                  ignore_exit_status=True,
                                  source_profile=False)

            if len(output) > 0:
                # Process is still running so schedule self again in about 5
                # secs
                # N.B. throw=False to prevent Retry exception being raised
                task.retry(throw=False, countdown=5)
            else:
                try:
                    nohup_out_file_name = os.path.basename(nohup_out_path)

                    # Log the output
                    with conn.get(nohup_out_path) as fp:
                        output = fp.read()
                        if output.strip():
                            log.error(output_message % output)
                            # If we have output then set the error state on the
                            # job and return
                            r = requests.patch(status_url, headers=headers,
                                               json={'status': JobState.ERROR})
                            check_status(r)
                            return
                finally:
                    if nohup_out_file_name and \
                       os.path.exists(nohup_out_file_name):
                        os.remove(nohup_out_file_name)

                # Fire off the on_complete task if we have one
                if on_complete:
                    signature(on_complete).delay()

                # If we were uploading, move the job to the complete state
                if job['status'] == JobState.UPLOADING:
                    job_status = from_string(job['status'], task=task,
                                             cluster=cluster, job=job,
                                             log_write_url=log_write_url,
                                             girder_token=girder_token,
                                             conn=conn)
                    job_status = Complete(job_status)
                    job_status = job_status.next(JobQueueState.COMPLETE)
                    job_status.run()
                    r = requests.patch(status_url, headers=headers,
                                       json={'status': str(job_status)})
                    check_status(r)

    except EOFError:
        # Try again
        task.retry(throw=False, countdown=5)
    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
Example #44
def upload_job_output_to_item(cluster,
                              job,
                              log_write_url=None,
                              job_dir=None,
                              girder_token=None):
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        with get_connection(girder_token, cluster) as conn:
            # First put girder client on master
            path = inspect.getsourcefile(cumulus.girderclient)
            with open(path, 'r') as fp:
                conn.put(
                    fp,
                    os.path.normpath(
                        os.path.join(job_dir, '..', os.path.basename(path))))

            cmds = ['cd %s' % job_dir]
            upload_cmd = 'python ../girderclient.py --token %s --url "%s" ' \
                         'upload --job %s' \
                         % (girder_token,
                            cumulus.config.girder.baseUrl, job['_id'])

            upload_output = '%s.upload.out' % job_id
            upload_output_path = os.path.normpath(
                os.path.join(job_dir, '..', upload_output))
            cmds.append('nohup %s  &> ../%s  &\n' %
                        (upload_cmd, upload_output))

            upload_cmd = _put_script(conn, '\n'.join(cmds))
            output = conn.execute(upload_cmd)

            # Remove upload script
            conn.remove(upload_cmd)

        if len(output) != 1:
            raise Exception('PID not returned by execute command')

        try:
            pid = int(output[0])
        except ValueError:
            raise Exception('Unable to extract PID from: %s' % output)

        on_complete = None

        if _get_on_complete(job) == 'terminate':
            cluster_log_url = '%s/clusters/%s/log' % \
                (cumulus.config.girder.baseUrl, cluster['_id'])
            on_complete = signature('cumulus.tasks.cluster.terminate_cluster',
                                    args=(cluster, ),
                                    kwargs={
                                        'log_write_url': cluster_log_url,
                                        'girder_token': girder_token
                                    })

        monitor_process.delay(cluster,
                              job,
                              pid,
                              upload_output_path,
                              log_write_url=log_write_url,
                              on_complete=on_complete,
                              girder_token=girder_token)

    except Exception as ex:
        r = requests.patch(status_url,
                           headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
Example #45
def monitor_process(task,
                    cluster,
                    job,
                    pid,
                    nohup_out_path,
                    log_write_url=None,
                    on_complete=None,
                    output_message='Job download/upload error: %s',
                    girder_token=None):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        with get_connection(girder_token, cluster) as conn:
            # See if the process is still running
            output = conn.execute('ps %s | grep %s' % (pid, pid),
                                  ignore_exit_status=True,
                                  source_profile=False)

            if len(output) > 0:
                # Process is still running so schedule self again in about 5
                # secs
                # N.B. throw=False to prevent Retry exception being raised
                task.retry(throw=False, countdown=5)
            else:
                try:
                    nohup_out_file_name = os.path.basename(nohup_out_path)

                    # Log the output
                    with conn.get(nohup_out_path) as fp:
                        output = fp.read()
                        if output.strip():
                            log.error(output_message % output)
                            # If we have output then set the error state on the
                            # job and return
                            r = requests.patch(status_url,
                                               headers=headers,
                                               json={'status': JobState.ERROR})
                            check_status(r)
                            return
                finally:
                    if nohup_out_file_name and \
                       os.path.exists(nohup_out_file_name):
                        os.remove(nohup_out_file_name)

                # Fire off the on_complete task if we have one
                if on_complete:
                    signature(on_complete).delay()

                # If we were uploading, move the job to the complete state
                if job['status'] == JobState.UPLOADING:
                    job_status = from_string(job['status'],
                                             task=task,
                                             cluster=cluster,
                                             job=job,
                                             log_write_url=log_write_url,
                                             girder_token=girder_token,
                                             conn=conn)
                    job_status = Complete(job_status)
                    job_status = job_status.next(JobQueueState.COMPLETE)
                    job_status.run()
                    r = requests.patch(status_url,
                                       headers=headers,
                                       json={'status': str(job_status)})
                    check_status(r)

    except EOFError:
        # Try again
        task.retry(throw=False, countdown=5)
    except Exception as ex:
        r = requests.patch(status_url,
                           headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
Example #46
def submit_job(cluster, job, log_write_url=None, girder_token=None,
               monitor=True):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token':  girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)
    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        script_name = job['name']

        with get_connection(girder_token, cluster) as conn:
            job_params = {}
            if 'params' in job:
                job_params = job['params']

            output = conn.execute('pwd')
            if len(output) != 1:
                raise Exception("Unable to fetch user's home directory.")

            user_home = output[0].strip()
            job_dir = job_directory(cluster, job, user_home=user_home)
            job['dir'] = job_dir

            slots = -1

            # Try job parameters first
            slots = int(job_params.get('numberOfSlots', slots))

            if slots == -1:
                # Try the cluster
                slots = int(cluster['config'].get('numberOfSlots', slots))

            parallel_env = _get_parallel_env(cluster, job)
            if parallel_env:
                job_params['parallelEnvironment'] = parallel_env

                # If the number of slots has not been provided we will get
                # the number of slots from the parallel environment
                if slots == -1:
                    slots = int(get_queue_adapter(cluster, conn)
                                .number_of_slots(parallel_env))
                    if slots > 0:
                        job_params['numberOfSlots'] = slots

            script = _generate_submission_script(job, cluster, job_params)

            conn.mkdir(job_dir, ignore_failure=True)
            # Put the script on the master node
            conn.put(StringIO(script), os.path.join(job_dir, script_name))

            if slots > -1:
                log.info('We have %s slots available' % slots)

            # Now submit the job
            queue_job_id \
                = get_queue_adapter(cluster, conn).submit_job(job,
                                                              script_name)

            # Update the state and queue job id
            job[AbstractQueueAdapter.QUEUE_JOB_ID] = queue_job_id
            patch_data = {
                'status': JobState.QUEUED,
                AbstractQueueAdapter.QUEUE_JOB_ID: queue_job_id,
                'dir': job_dir
            }

            r = requests.patch(status_url, headers=headers, json=patch_data)
            check_status(r)
            job = r.json()
            job['queuedTime'] = time.time()

            # Now monitor the job's progress
            if monitor:
                monitor_job.s(
                    cluster, job, log_write_url=log_write_url,
                    girder_token=girder_token).apply_async(countdown=5)

        # Now update the status of the job
        headers = {'Girder-Token':  girder_token}
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.QUEUED})
        check_status(r)
    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
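# submit_job PATCHes the job status with the same requests/check_status
# boilerplate several times. A small hedged helper capturing that pattern;
# the function name is illustrative and raise_for_status() stands in for the
# check_status() used throughout these examples.
import requests


def patch_job_status(base_url, job_id, status, girder_token):
    """PATCH a Girder job's status and return the updated job document."""
    url = '%s/jobs/%s' % (base_url, job_id)
    r = requests.patch(url,
                       headers={'Girder-Token': girder_token},
                       json={'status': status})
    r.raise_for_status()
    return r.json()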
Example #47
0
def _monitor_jobs(task,
                  cluster,
                  jobs,
                  log_write_url=None,
                  girder_token=None,
                  monitor_interval=5):
    headers = {'Girder-Token': girder_token}

    cluster_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                      cluster['_id'])
    try:
        with get_connection(girder_token, cluster) as conn:

            try:
                job_queue_states \
                    = get_queue_adapter(cluster, conn).job_statuses(jobs)

                new_states = set()
                for (job, state) in job_queue_states:
                    job_id = job['_id']
                    # First get the current status
                    status_url = '%s/jobs/%s/status' % (
                        cumulus.config.girder.baseUrl, job_id)
                    r = requests.get(status_url, headers=headers)
                    check_status(r)
                    current_status = r.json()['status']

                    if current_status == JobState.TERMINATED:
                        continue

                    job_status = from_string(current_status,
                                             task=task,
                                             cluster=cluster,
                                             job=job,
                                             log_write_url=log_write_url,
                                             girder_token=girder_token,
                                             conn=conn)
                    job_status = job_status.next(state)
                    job['status'] = str(job_status)
                    job_status.run()
                    json = {
                        'status': str(job_status),
                        'timings': job.get('timings', {}),
                        'output': job['output']
                    }
                    job_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl,
                                              job['_id'])
                    r = requests.patch(job_url, headers=headers, json=json)
                    check_status(r)

                    new_states.add(job['status'])

                # Now see if we still have jobs to monitor
                running_states = set([
                    JobState.CREATED, JobState.QUEUED, JobState.RUNNING,
                    JobState.TERMINATING
                ])

                # Do we have any jobs still in a running state?
                if new_states & running_states:
                    task.retry(countdown=monitor_interval)
            except EOFError:
                # Try again
                task.retry(countdown=5)
                return
            except paramiko.ssh_exception.NoValidConnectionsError:
                # Try again
                task.retry(countdown=5)
                return
    # Ensure that the Retry exception will get through
    except Retry:
        raise
    except paramiko.ssh_exception.NoValidConnectionsError as ex:
        r = requests.patch(cluster_url,
                           headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_cluster_logger(cluster, girder_token).exception(str(ex))

    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(cluster_url,
                           headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_cluster_logger(cluster, girder_token).exception(str(ex))
        raise
Example #48
0
def _monitor_jobs(task, cluster, jobs, log_write_url=None, girder_token=None,
                  monitor_interval=5):
    headers = {'Girder-Token':  girder_token}

    cluster_url = '%s/clusters/%s' % (
        cumulus.config.girder.baseUrl, cluster['_id'])
    try:
        with get_connection(girder_token, cluster) as conn:

            try:
                job_queue_states \
                    = get_queue_adapter(cluster, conn).job_statuses(jobs)

                new_states = set()
                for (job, state) in job_queue_states:
                    job_id = job['_id']
                    # First get the current status
                    status_url = '%s/jobs/%s/status' % (
                        cumulus.config.girder.baseUrl, job_id)
                    r = requests.get(status_url, headers=headers)
                    check_status(r)
                    current_status = r.json()['status']

                    if current_status == JobState.TERMINATED:
                        continue

                    job_status = from_string(current_status, task=task,
                                             cluster=cluster, job=job,
                                             log_write_url=log_write_url,
                                             girder_token=girder_token,
                                             conn=conn)
                    job_status = job_status.next(state)
                    job['status'] = str(job_status)
                    job_status.run()
                    json = {
                        'status': str(job_status),
                        'timings': job.get('timings', {}),
                        'output': job['output']
                    }
                    job_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl,
                                              job['_id'])
                    r = requests.patch(job_url, headers=headers, json=json)
                    check_status(r)

                    new_states.add(job['status'])

                # Now see if we still have jobs to monitor
                running_states = set(
                    [JobState.CREATED, JobState.QUEUED,
                     JobState.RUNNING, JobState.TERMINATING]
                )

                # Do we have any jobs still in a running state?
                if new_states & running_states:
                    task.retry(countdown=monitor_interval)
            except EOFError:
                # Try again
                task.retry(countdown=5)
                return
            except paramiko.ssh_exception.NoValidConnectionsError:
                # Try again
                task.retry(countdown=5)
                return
    # Ensure that the Retry exception will get through
    except Retry:
        raise
    except paramiko.ssh_exception.NoValidConnectionsError as ex:
        r = requests.patch(cluster_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_cluster_logger(cluster, girder_token).exception(str(ex))

    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(cluster_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_cluster_logger(cluster, girder_token).exception(str(ex))
        raise
Example #49
0
def upload_job_output_to_item(cluster, job, log_write_url=None, job_dir=None,
                              girder_token=None):
    headers = {'Girder-Token':  girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # If the job is terminating, break out
        if _is_terminating(job, girder_token):
            return

        with get_connection(girder_token, cluster) as conn:
            # First put the girder client on the master node
            path = inspect.getsourcefile(cumulus.girderclient)
            with open(path, 'r') as fp:
                conn.put(fp,
                         os.path.normpath(os.path.join(job_dir, '..',
                                                       os.path.basename(path))))

            cmds = ['cd %s' % job_dir]
            upload_cmd = 'python ../girderclient.py --token %s --url "%s" ' \
                         'upload --job %s' \
                         % (girder_token,
                            cumulus.config.girder.baseUrl, job['_id'])

            upload_output = '%s.upload.out' % job_id
            upload_output_path = os.path.normpath(os.path.join(job_dir, '..',
                                                               upload_output))
            cmds.append('nohup %s  &> ../%s  &\n' % (upload_cmd, upload_output))

            upload_cmd = _put_script(conn, '\n'.join(cmds))
            output = conn.execute(upload_cmd)

            # Remove upload script
            conn.remove(upload_cmd)

        if len(output) != 1:
            raise Exception('PID not returned by execute command')

        try:
            pid = int(output[0])
        except ValueError:
            raise Exception('Unable to extract PID from: %s' % output)

        on_complete = None

        if _get_on_complete(job) == 'terminate':
            cluster_log_url = '%s/clusters/%s/log' % \
                (cumulus.config.girder.baseUrl, cluster['_id'])
            on_complete = signature(
                'cumulus.tasks.cluster.terminate_cluster',
                args=(cluster,), kwargs={'log_write_url': cluster_log_url,
                                         'girder_token': girder_token})

        monitor_process.delay(cluster, job, pid, upload_output_path,
                              log_write_url=log_write_url,
                              on_complete=on_complete,
                              girder_token=girder_token)

    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
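# upload_job_output_to_item launches the upload with nohup in the background
# and expects the remote command to print exactly one line: the PID of the
# detached process. A local, illustrative sketch of the same
# launch-and-capture-PID idea using subprocess; the command and log path are
# placeholders, not the remote mechanism used above.
import shlex
import subprocess


def launch_detached(command, log_path):
    """Start `command` detached from this process and return its PID."""
    with open(log_path, 'ab') as log:
        proc = subprocess.Popen(shlex.split(command),
                                stdout=log,
                                stderr=subprocess.STDOUT,
                                start_new_session=True)
    return proc.pid


# pid = launch_detached('python girderclient.py upload --job <job_id>',
#                       'upload.out')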
Example #50
0
def submit_job(cluster,
               job,
               log_write_url=None,
               girder_token=None,
               monitor=True):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)
    try:
        # If the job is terminating, break out
        if _is_terminating(job, girder_token):
            return

        script_name = job['name']

        with get_connection(girder_token, cluster) as conn:
            job_params = {}
            if 'params' in job:
                job_params = job['params']

            output = conn.execute('pwd')
            if len(output) != 1:
                raise Exception("Unable to fetch user's home directory.")

            user_home = output[0].strip()
            job_dir = job_directory(cluster, job, user_home=user_home)
            job['dir'] = job_dir

            slots = -1

            # Try job parameters first
            slots = int(job_params.get('numberOfSlots', slots))

            if slots == -1:
                # Try the cluster
                slots = int(cluster['config'].get('numberOfSlots', slots))

            parallel_env = _get_parallel_env(cluster, job)
            if parallel_env:
                job_params['parallelEnvironment'] = parallel_env

                # If the number of slots has not been provided, get it
                # from the parallel environment
                if slots == -1:
                    slots = int(
                        get_queue_adapter(cluster,
                                          conn).number_of_slots(parallel_env))
                    if slots > 0:
                        job_params['numberOfSlots'] = slots

            script = _generate_submission_script(job, cluster, job_params)

            conn.makedirs(job_dir)
            # Put the script on the master node
            conn.put(StringIO(script), os.path.join(job_dir, script_name))

            if slots > -1:
                log.info('We have %s slots available' % slots)

            # Now submit the job
            queue_job_id \
                = get_queue_adapter(cluster, conn).submit_job(job,
                                                              script_name)

            # Update the state and queue job id
            job[AbstractQueueAdapter.QUEUE_JOB_ID] = queue_job_id
            patch_data = {
                'status': JobState.QUEUED,
                AbstractQueueAdapter.QUEUE_JOB_ID: queue_job_id,
                'dir': job_dir
            }

            r = requests.patch(status_url, headers=headers, json=patch_data)
            check_status(r)
            job = r.json()
            job['queuedTime'] = time.time()

            # Now monitor the job's progress
            if monitor:
                monitor_job.s(
                    cluster,
                    job,
                    log_write_url=log_write_url,
                    girder_token=girder_token).apply_async(countdown=5)

        # Now update the status of the job
        headers = {'Girder-Token': girder_token}
        r = requests.patch(status_url,
                           headers=headers,
                           json={'status': JobState.QUEUED})
        check_status(r)
    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(status_url,
                           headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
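# Both submit_job variants resolve the slot count with the same precedence:
# explicit job parameters first, then the cluster configuration, then (when a
# parallel environment is configured) the environment itself. A compact
# sketch of that precedence; env_slots stands in for the queue adapter's
# number_of_slots(parallel_env) call.
def resolve_slots(job_params, cluster_config, env_slots=None):
    """Resolve the slot count: job params, then cluster config, then environment."""
    slots = int(job_params.get('numberOfSlots', -1))
    if slots == -1:
        slots = int(cluster_config.get('numberOfSlots', -1))
    if slots == -1 and env_slots is not None:
        slots = int(env_slots)
    return slots


# resolve_slots({'numberOfSlots': 4}, {})          -> 4
# resolve_slots({}, {'numberOfSlots': 8})          -> 8
# resolve_slots({}, {}, env_slots=16)              -> 16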