Example #1
def print_submit_result(cluster, response):
    """
    Parses a submission response from cluster and returns a corresponding message. Note that
    Cook Scheduler returns text when the submission was successful, and JSON when the submission
    failed. Also, in the case of failure, there are different possible shapes for the failure payload.
    """
    cluster_name = cluster['name']
    if response.status_code == 201:
        text = response.text.strip('"')
        if ' submitted groups' in text:
            group_index = text.index(' submitted groups')
            text = text[:group_index]
        uuids = [p for p in text.split() if is_valid_uuid(p)]
        print_info(submit_succeeded_message(cluster_name, uuids),
                   '\n'.join(uuids))
    else:
        try:
            data = response.json()
            if 'errors' in data:
                reason = json.dumps(data['errors'])
            elif 'error' in data:
                reason = data['error']
            else:
                reason = json.dumps(data)
        except json.decoder.JSONDecodeError:
            reason = '%s\n' % response.text
        print_error(submit_failed_message(cluster_name, reason))
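A minimal sketch of how the two branches above could be exercised; the SimpleNamespace stand-ins and body formats are inferred from the parsing logic rather than taken from real Cook responses, and print_submit_result and its helpers are assumed to be in scope.

import json
from types import SimpleNamespace

cluster = {'name': 'dev-cluster', 'url': 'http://localhost:12321'}  # hypothetical cluster entry

# 201: a quoted text body containing the submitted job UUIDs (optionally followed
# by " submitted groups ..."), which the success branch splits and filters.
ok = SimpleNamespace(status_code=201,
                     text='"5e0c7b2a-1c7a-4b3e-9d2f-8a6b4c2d1e0f"')

# non-201: a JSON body with the failure reason under 'errors' or 'error'.
error_body = {'error': 'cpus must be a positive number'}
bad = SimpleNamespace(status_code=400,
                      text=json.dumps(error_body),
                      json=lambda: error_body)

print_submit_result(cluster, ok)   # prints the submit-succeeded message plus the UUIDs
print_submit_result(cluster, bad)  # prints the submit-failed message with the reason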
Example #2
def filter_query_result_by_pools(query_result, pools):
    """Filter query result if pools are provided. Return warning message if some of the pools not found in any cluster"""

    clusters = []
    known_pools = []

    pools_set = set(pools)
    filtered_clusters = {}
    for cluster, cluster_usage in query_result['clusters'].items():
        clusters.append(cluster)
        if cluster_usage['using_pools']:
            filtered_pools = {}
            for pool, pool_usage in cluster_usage['pools'].items():
                known_pools.append(pool)
                if pool in pools_set:
                    filtered_pools[pool] = pool_usage
            if filtered_pools:
                filtered_clusters[cluster] = cluster_usage
                cluster_usage['pools'] = filtered_pools
    query_result['clusters'] = filtered_clusters

    missing_pools = pools_set.difference(known_pools)
    if missing_pools:
        if len(missing_pools) == 1:
            pools_text = f"{list(missing_pools)[0]} is not a valid pool in "
        else:
            pools_text = f"{' / '.join(missing_pools)} are not valid pools in "
        clusters_text = clusters[0] if len(clusters) == 1 else ' / '.join(clusters)
        print_error(pools_text + clusters_text + '.')
        if query_result['clusters']:
            print_error('')

    return query_result
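For reference, the query_result shape this function expects, inferred from the code above; the cluster name, pool names, and usage payloads are made up for illustration, and filter_query_result_by_pools is assumed to be in scope.

query_result = {
    'clusters': {
        'cluster-a': {                                   # hypothetical cluster name
            'using_pools': True,
            'pools': {
                'prod': {'usage': {'cpus': 4.0}},        # illustrative per-pool payloads
                'dev': {'usage': {'cpus': 1.0}},
            },
        },
    },
}

# Keeps only the 'prod' entry under cluster-a; asking for a pool that appears in
# no cluster (e.g. 'staging') would trigger the "not a valid pool" error above.
filtered = filter_query_result_by_pools(query_result, ['prod'])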
Example #3
def submit_federated(clusters, jobs, group, pool):
    """
    Attempts to submit the provided jobs to each cluster in clusters, until a cluster
    returns a "created" status code. If no cluster returns "created" status, throws.
    """
    messages = ""
    for cluster in clusters:
        cluster_name = cluster['name']
        cluster_url = cluster['url']
        try:
            print_info('Attempting to submit on %s cluster...' % terminal.bold(cluster_name))

            json_body = {'jobs': jobs}
            if group:
                json_body['groups'] = [group]
            if pool:
                json_body['pool'] = pool

            resp = http.post(cluster, 'jobs', json_body)
            print_submit_result(cluster, resp)
            if resp.status_code == 201:
                metrics.inc('command.submit.jobs', len(jobs))
                return 0
        except requests.exceptions.ReadTimeout as rt:
            logging.exception(rt)
            print_info(terminal.failed(
                f'Encountered read timeout with {cluster_name} ({cluster_url}). Your submission may have completed.'))
            return 1
        except IOError as ioe:
            logging.exception(ioe)
            reason = f'Cannot connect to {cluster_name} ({cluster_url})'
            message = submit_failed_message(cluster_name, reason)
            messages += message
    print_error(messages)
    raise Exception(terminal.failed('Job submission failed on all of your configured clusters.'))
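A sketch of a federated submit call; the cluster entries and the job payload are hypothetical, assuming only that configured-cluster dicts carry 'name' and 'url' as the code above reads.

clusters = [
    {'name': 'cluster-a', 'url': 'https://cook-a.example.com'},  # hypothetical clusters
    {'name': 'cluster-b', 'url': 'https://cook-b.example.com'},
]
jobs = [{'uuid': '5e0c7b2a-1c7a-4b3e-9d2f-8a6b4c2d1e0f',         # illustrative job spec
         'command': 'echo hello',
         'cpus': 1.0,
         'mem': 128}]

# Tries each cluster in order: returns 0 on the first 201 response, returns 1 on a
# read timeout, and raises once every cluster has failed or been unreachable.
submit_federated(clusters, jobs, group=None, pool=None)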
Example #4
def main(args=None):
    if args is None:
        args = sys.argv[1:]

    try:
        result = run(args)
        sys.exit(result)
    except Exception as e:
        logging.exception('exception when running with %s' % args)
        print_error(str(e))
        sys.exit(1)
Example #5
def make_data_request(cluster, make_request_fn):
    """
    Makes a request (using make_request_fn), parsing the
    assumed-to-be-JSON response and handling common errors
    """
    try:
        resp = make_request_fn()
        if resp.status_code == 200:
            return resp.json()
        elif resp.status_code == 401:
            print_error(f'Authentication failed on {cluster["name"]} ({cluster["url"]}).')
            return []
        elif resp.status_code == 500:
            print_error(f'Encountered server error while querying {cluster["name"]}.')
            # fall through to logging call below

        logging.warning(f'Unexpected response code {resp.status_code} for data request. Response body: {resp.text}')
    except requests.exceptions.ConnectionError as ce:
        logging.exception(ce)
        print_error(f'Encountered connection error with {cluster["name"]} ({cluster["url"]}).')
    except requests.exceptions.ReadTimeout as rt:
        logging.exception(rt)
        print_error(f'Encountered read timeout with {cluster["name"]} ({cluster["url"]}).')
    except IOError as ioe:
        logging.exception(ioe)
    except json.decoder.JSONDecodeError as jde:
        logging.exception(jde)
    return []
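A typical call, as the later examples here do, passes a zero-argument lambda that performs the actual HTTP request; the endpoint and user below are illustrative, and a cluster dict plus this module's http helper are assumed to be in scope.

usage_map = make_data_request(
    cluster, lambda: http.get(cluster, 'usage', params={'user': 'alice'}))
if not usage_map:
    # every error path above logs and returns [], so callers only need to
    # check for an empty result
    ...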
Example #6
File: http.py Project: dPeS/Cook
def make_data_request(cluster, make_request_fn):
    """
    Makes a request (using make_request_fn), parsing the
    assumed-to-be-JSON response and handling common errors
    """
    try:
        resp = make_request_fn()
        if resp.status_code == 200:
            return resp.json()
        elif resp.status_code == 401:
            print_error(
                f'Authentication failed on {cluster["name"]} ({cluster["url"]}).'
            )
    except requests.exceptions.ConnectionError as ce:
        logging.exception(ce)
        print_error(
            f'Encountered connection error with {cluster["name"]} ({cluster["url"]}).'
        )
    except requests.exceptions.ReadTimeout as rt:
        logging.exception(rt)
        print_error(
            f'Encountered read timeout with {cluster["name"]} ({cluster["url"]}).'
        )
    except IOError as ioe:
        logging.exception(ioe)
    except json.decoder.JSONDecodeError as jde:
        logging.exception(jde)

    return []
Example #7
def copy_limits(args, config_path):
    """Copies limits (share and quota) for a particular user from one cluster to another cluster"""
    user = args.get('user')

    from_cluster = args.get('from')
    from_url = args.get('from_url')
    if not from_cluster and not from_url:
        copy_limits_parser.print_help()
        print()
        raise Exception('You must provide either a from-cluster name (--from) or URL (--from-url).')

    to_cluster = args.get('to')
    to_url = args.get('to_url')
    if not to_cluster and not to_url:
        copy_limits_parser.print_help()
        print()
        raise Exception('You must provide either a to-cluster name (--to) or URL (--to-url).')

    _, config_map = configuration.load_config_with_defaults(config_path)
    from_clusters = load_target_clusters(config_map, from_url, from_cluster)
    to_clusters = load_target_clusters(config_map, to_url, to_cluster)
    assert len(from_clusters) == 1, 'Only a single from-cluster is supported.'
    assert len(to_clusters) == 1, 'Only a single to-cluster is supported.'
    from_cluster = from_clusters[0]
    to_cluster = to_clusters[0]
    from_cluster_name = from_cluster['name']
    to_cluster_name = to_cluster['name']
    print(f'Copying limits for {terminal.bold(user)} user '
          f'from {terminal.bold(from_cluster_name)} '
          f'to {terminal.bold(to_cluster_name)}:')
    from_pools = http.make_data_request(from_cluster, lambda: http.get(from_cluster, 'pools', params={}))
    to_pools = http.make_data_request(to_cluster, lambda: http.get(to_cluster, 'pools', params={}))
    from_pools_dict = {pool['name']: pool for pool in from_pools}
    to_pools_dict = {pool['name']: pool for pool in to_pools}
    for pool_name, from_pool in from_pools_dict.items():
        if pool_name in to_pools_dict and to_pools_dict[pool_name]['state'] != 'inactive':
            print(f'\n=== Pool: {pool_name} ===')
            query_result = query([from_cluster, to_cluster], user)
            query_result = filter_query_result_by_pools(query_result, [pool_name])
            print_formatted(query_result)
            answer = input(f'Copy limits for {terminal.bold(pool_name)} pool '
                           f'from {terminal.bold(from_cluster_name)} '
                           f'to {terminal.bold(to_cluster_name)}? ')
            should_copy = str2bool(answer)
            if should_copy:
                from_dict = query_result['clusters'][from_cluster_name]['pools'][pool_name]
                reason = f'Copying limits for {user} user from {from_cluster_name} to {to_cluster_name}'

                from_share = from_dict['share']
                resp = http.post(to_cluster,
                                 'share',
                                 {'pool': pool_name,
                                  'user': user,
                                  'reason': reason,
                                  'share': from_share})
                if resp.status_code != 201:
                    print_error(f'Setting share for {pool_name} on {to_cluster_name} '
                                f'failed with status code {resp.status_code}: {resp.text}')
                else:
                    print(terminal.success(f'Copied share for {pool_name} pool '
                                           f'from {from_cluster_name} '
                                           f'to {to_cluster_name}.'))

                from_quota = from_dict['quota']
                resp = http.post(to_cluster,
                                 'quota',
                                 {'pool': pool_name,
                                  'user': user,
                                  'reason': reason,
                                  'quota': from_quota})
                if resp.status_code != 201:
                    print_error(f'Setting quota for {pool_name} on {to_cluster_name} '
                                f'failed with status code {resp.status_code}: {resp.text}')
                else:
                    print(terminal.success(f'Copied quota for {pool_name} pool '
                                           f'from {from_cluster_name} '
                                           f'to {to_cluster_name}.'))
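The args mapping copy_limits reads, reconstructed from the .get calls above; the user and cluster names are placeholders, and either a cluster name or a URL may be supplied on each side.

args = {
    'user': 'alice',        # placeholder user
    'from': 'cluster-a',    # or 'from_url': 'https://cook-a.example.com'
    'to': 'cluster-b',      # or 'to_url': 'https://cook-b.example.com'
}
copy_limits(args, config_path='/path/to/config.json')  # config path is a placeholder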
Example #8
def get_usage_on_cluster(cluster, user):
    """Queries cluster for usage information for the given user"""
    params = {'user': user, 'group_breakdown': 'true'}
    usage_map = http.make_data_request(
        cluster, lambda: http.get(cluster, 'usage', params=params))
    if not usage_map:
        print_error(
            f'Unable to retrieve usage information on {cluster["name"]} ({cluster["url"]}).'
        )
        return {'count': 0}

    using_pools = 'pools' in usage_map
    pool_names = usage_map['pools'].keys() if using_pools else []

    share_map = http.make_data_request(
        cluster, lambda: http.get(cluster, 'share', params={'user': user}))
    if not share_map:
        print_error(
            f'Unable to retrieve share information on {cluster["name"]} ({cluster["url"]}).'
        )
        return {'count': 0}

    if using_pools != ('pools' in share_map):
        print_error(
            f'Share information on {cluster["name"]} ({cluster["url"]}) is invalid. '
            f'Usage information is{"" if using_pools else " not"} per pool, but share '
            f'is{"" if not using_pools else " not"}')
        return {'count': 0}
    if pool_names != (share_map['pools'].keys() if using_pools else []):
        print_error(
            f'Share information on {cluster["name"]} ({cluster["url"]}) is invalid. '
            f'Usage information has pools: {pool_names}, but share '
            f'has pools: {share_map["pools"].keys()}')
        return {'count': 0}

    def make_query_result(using_pools, usage_map, share_map, pool_data=None):
        query_result = {
            'using_pools': using_pools,
            'usage': usage_map['total_usage'],
            'share': share_map
        }
        query_result.update(get_job_data(cluster, usage_map))
        if pool_data:
            query_result.update(pool_data)
        return query_result

    if using_pools:
        pools = http.make_data_request(
            cluster, lambda: http.get(cluster, 'pools', params={}))
        pools_dict = {pool['name']: pool for pool in pools}
        for pool_name in pool_names:
            if pool_name not in pools_dict or 'state' not in pools_dict[pool_name]:
                print_error(
                    f'Pool information on {cluster["name"]} ({cluster["url"]}) is invalid. '
                    f"Can't determine the state of pool {pool_name}")
                return {'count': 0}
        query_result = {
            'using_pools': using_pools,
            'pools': {
                pool_name:
                make_query_result(using_pools, usage_map['pools'][pool_name],
                                  share_map['pools'][pool_name],
                                  {'state': pools_dict[pool_name]['state']})
                for pool_name in pool_names
            }
        }
        return query_result
    else:
        return make_query_result(using_pools, usage_map, share_map)
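For pool-aware clusters, the value this function returns has roughly the following shape, reconstructed from make_query_result above; the pool name and field values are illustrative.

{
    'using_pools': True,
    'pools': {
        'prod': {                      # one entry per pool reported by the cluster
            'using_pools': True,
            'usage': {...},            # usage_map['pools']['prod']['total_usage']
            'share': {...},            # share_map['pools']['prod']
            'state': 'active',         # from the 'pools' query above
            # plus whatever get_job_data contributes for that pool
        },
    },
}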