Exemple #1
0
def query_cluster(cluster, uuids, pred, timeout, interval, make_request_fn,
                  entity_type):
    """
    Fetches the entities for the given uuids from the given cluster and,
    when a predicate is supplied and at least one entity came back, waits
    until the predicate is satisfied.

    The predicate is first tried on the initial response; if it is not
    satisfied, the request is re-issued through wait_until with the given
    timeout and interval. A progress line is added while waiting and set
    to 'Done' once the predicate holds.

    Returns the entities from the initial request, or whatever wait_until
    returned if polling was needed. Raises Exception for an unknown
    entity_type and TimeoutError when wait_until returns a falsy value.
    """
    def fetch_entities():
        return http.make_data_request(lambda: make_request_fn(cluster, uuids))

    def poll():
        return pred(fetch_entities())

    entities = fetch_entities()
    if not pred or len(entities) == 0:
        # Nothing to wait for: hand back the raw response.
        return entities

    prefixes = (
        ('job', 'Waiting for the following jobs'),
        ('instance', 'Waiting for instances of the following jobs'),
        ('group', 'Waiting for the following job groups'),
    )
    prefix = next(
        (text for kind, text in prefixes if entity_type == kind), None)
    if prefix is None:
        raise Exception('Invalid entity type %s.' % entity_type)

    uuid_list = ', '.join(entity['uuid'] for entity in entities)
    cluster_name = colors.bold(cluster['name'])
    index = progress.add('%s on %s: %s' % (prefix, cluster_name, uuid_list))

    if not pred(entities):
        entities = wait_until(poll, timeout, interval)
        if not entities:
            raise TimeoutError('Timeout waiting for response.')

    progress.update(index, colors.bold('Done'))
    return entities
Exemple #2
0
def query_instances_on_cluster(cluster, status, start_ms, end_ms):
    """
    Queries cluster for instance stats with the given status / time.

    Requests 'stats/instances' with the status, start and end parameters and
    returns a dict whose 'count' is the 'count' field of the response's
    'overall' section, or 0 when that field is absent.
    """
    query_params = {'status': status, 'start': start_ms, 'end': end_ms}

    def fetch_stats():
        return http.get(cluster, 'stats/instances', params=query_params)

    overall = http.make_data_request(cluster, fetch_stats)['overall']
    return {'count': overall.get('count', 0)}
Exemple #3
0
def __query_cluster(cluster, uuids, pred, timeout, interval, make_request_fn, entity_type):
    """
    Fetches the entities for the given uuids from the given cluster and,
    when a predicate is supplied and at least one entity came back, waits
    until the predicate is satisfied.

    The predicate is first tried on the initial response; if it is not
    satisfied, the request is re-issued through wait_until with the given
    timeout and interval. A progress line showing the number of entities
    is added while waiting and set to 'Done' once the predicate holds.

    Returns the entities from the initial request, or whatever wait_until
    returned if polling was needed. Raises Exception for an unknown
    entity_type and TimeoutError when wait_until returns a falsy value.
    """

    def fetch_entities():
        return http.make_data_request(cluster, lambda: make_request_fn(cluster, uuids))

    def poll():
        return pred(fetch_entities())

    entities = fetch_entities()
    entity_count = len(entities)
    if not pred or entity_count == 0:
        # Nothing to wait for: hand back the raw response.
        return entities

    plural = '' if entity_count <= 1 else 's'
    bold_count = colors.bold(str(entity_count))
    subjects = (
        (Types.JOB, f'{bold_count} job{plural}'),
        (Types.INSTANCE, f'instances of {bold_count} job{plural}'),
        (Types.GROUP, f'{bold_count} job group{plural}'),
    )
    subject = next((text for kind, text in subjects if entity_type == kind), None)
    if subject is None:
        raise Exception(f'Invalid entity type {entity_type}.')

    index = progress.add(f'Waiting for {subject} on {colors.bold(cluster["name"])}')

    if not pred(entities):
        entities = wait_until(poll, timeout, interval)
        if not entities:
            raise TimeoutError('Timeout waiting for response.')

    progress.update(index, colors.bold('Done'))
    return entities
Exemple #4
0
def list_jobs_on_cluster(cluster, state, user, start_ms, end_ms, name, limit,
                         include_custom_executor):
    """
    Queries cluster for jobs with the given state / user / time / name.

    When include_custom_executor is truthy the 'jobs' endpoint is used, with
    the states passed as a sequence and the time range as 'start' / 'end'.
    Otherwise the 'list' endpoint is used, with the states joined by '+' and
    the time range as 'start-ms' / 'end-ms'. A state containing 'all' is
    expanded to waiting, running and completed.

    Returns a dict with the 'jobs' from the response and their 'count'.
    """
    states = ['waiting', 'running', 'completed'] if 'all' in state else state
    params = {'user': user, 'name': name, 'limit': limit}
    if include_custom_executor:
        endpoint = 'jobs'
        params.update({'state': states, 'start': start_ms, 'end': end_ms})
    else:
        endpoint = 'list'
        params.update({
            'state': '+'.join(states),
            'start-ms': start_ms,
            'end-ms': end_ms
        })

    jobs = http.make_data_request(
        cluster, lambda: http.get(cluster, endpoint, params=params))
    return {'jobs': jobs, 'count': len(jobs)}
Exemple #5
0
def list_jobs_on_cluster(cluster, state, user, start_ms, end_ms, name, limit):
    """
    Queries cluster for jobs with the given state / user / time / name.

    The states are sent joined by '+'; a state containing 'all' is expanded
    to waiting, running and completed. Returns a dict with the 'jobs' from
    the 'list' endpoint and their 'count'.
    """
    states = ('waiting', 'running', 'completed') if 'all' in state else state
    params = {
        'state': '+'.join(states),
        'user': user,
        'start-ms': start_ms,
        'end-ms': end_ms,
        'name': name,
        'limit': limit
    }

    def fetch_jobs():
        return http.get(cluster, 'list', params=params)

    jobs = http.make_data_request(cluster, fetch_jobs)
    return {'jobs': jobs, 'count': len(jobs)}
Exemple #6
0
def list_jobs_on_cluster(cluster, state, user, lookback_hours, name, limit):
    """
    Queries cluster for jobs with the given state / user / time / name.

    The start of the time window is the current time minus lookback_hours,
    expressed in milliseconds. The states are sent joined by '+'; a state
    containing 'all' is expanded to waiting, running and completed.
    Returns a dict with the 'jobs' from the 'list' endpoint and their
    'count'.
    """
    start_ms = int(round(time.time() * 1000)) - int(lookback_hours * MILLIS_PER_HOUR)
    states = ('waiting', 'running', 'completed') if 'all' in state else state
    params = {
        'state': '+'.join(states),
        'user': user,
        'start-ms': start_ms,
        'name': name,
        'limit': limit
    }

    def fetch_jobs():
        return http.get(cluster, 'list', params=params)

    jobs = http.make_data_request(fetch_jobs)
    return {'jobs': jobs, 'count': len(jobs)}
Exemple #7
0
 def satisfy_pred():
     """
     Re-issue the request and return the predicate's result for the response.

     Uses pred, cluster, make_request_fn and uuids from the enclosing scope.
     """
     response = http.make_data_request(
         cluster, lambda: make_request_fn(cluster, uuids))
     return pred(response)
Exemple #8
0
def copy_limits(args, config_path):
    """
    Copies limits (share and quota) for a particular user from one cluster to another cluster.

    Reads 'user', 'from', 'from_url', 'to' and 'to_url' from args, resolves
    exactly one source and one destination cluster from the configuration at
    config_path, and then walks the source cluster's pools. For every pool
    that also exists on the destination and is not 'inactive' there, the
    current limits on both clusters are printed and the user is asked
    interactively whether to copy. On confirmation the share and then the
    quota are POSTed to the destination cluster; each outcome is reported
    individually, and a failed POST does not stop the remaining copies.

    Raises:
        Exception: if neither a cluster name nor a URL was given for the
            source or for the destination (the parser help is printed first).
        AssertionError: if the source or the destination does not resolve to
            exactly one cluster.
    """
    user = args.get('user')

    from_cluster = args.get('from')
    from_url = args.get('from_url')
    if not from_cluster and not from_url:
        copy_limits_parser.print_help()
        print()
        raise Exception('You must provide either a from-cluster name (--from) or URL (--from-url).')

    to_cluster = args.get('to')
    to_url = args.get('to_url')
    if not to_cluster and not to_url:
        copy_limits_parser.print_help()
        print()
        raise Exception('You must provide either a to-cluster name (--to) or URL (--to-url).')

    _, config_map = configuration.load_config_with_defaults(config_path)
    from_clusters = load_target_clusters(config_map, from_url, from_cluster)
    to_clusters = load_target_clusters(config_map, to_url, to_cluster)
    # Raised explicitly instead of using `assert` so the checks still run
    # under `python -O`; AssertionError is kept so callers see the same type.
    if len(from_clusters) != 1:
        raise AssertionError('Only a single from-cluster is supported.')
    if len(to_clusters) != 1:
        raise AssertionError('Only a single to-cluster is supported.')
    from_cluster = from_clusters[0]
    to_cluster = to_clusters[0]
    from_cluster_name = from_cluster['name']
    to_cluster_name = to_cluster['name']
    print(f'Copying limits for {terminal.bold(user)} user '
          f'from {terminal.bold(from_cluster_name)} '
          f'to {terminal.bold(to_cluster_name)}:')
    from_pools = http.make_data_request(from_cluster, lambda: http.get(from_cluster, 'pools', params={}))
    to_pools = http.make_data_request(to_cluster, lambda: http.get(to_cluster, 'pools', params={}))
    from_pools_dict = {pool['name']: pool for pool in from_pools}
    to_pools_dict = {pool['name']: pool for pool in to_pools}
    for pool_name in from_pools_dict:
        # Only pools that exist and are usable on the destination are offered.
        if pool_name not in to_pools_dict or to_pools_dict[pool_name]['state'] == 'inactive':
            continue

        print(f'\n=== Pool: {pool_name} ===')
        query_result = query([from_cluster, to_cluster], user)
        query_result = filter_query_result_by_pools(query_result, [pool_name])
        print_formatted(query_result)
        answer = input(f'Copy limits for {terminal.bold(pool_name)} pool '
                       f'from {terminal.bold(from_cluster_name)} '
                       f'to {terminal.bold(to_cluster_name)}? ')
        if not str2bool(answer):
            continue

        from_dict = query_result['clusters'][from_cluster_name]['pools'][pool_name]
        reason = f'Copying limits for {user} user from {from_cluster_name} to {to_cluster_name}'

        # Share and quota use the same request shape: the endpoint name is
        # also the key that carries the value in the request body.
        for limit_type in ('share', 'quota'):
            resp = http.post(to_cluster,
                             limit_type,
                             {'pool': pool_name,
                              'user': user,
                              'reason': reason,
                              limit_type: from_dict[limit_type]})
            if resp.status_code != 201:
                print_error(f'Setting {limit_type} for {pool_name} on {to_cluster_name} '
                            f'failed with status code {resp.status_code}: {resp.text}')
            else:
                print(terminal.success(f'Copied {limit_type} for {pool_name} pool '
                                       f'from {from_cluster_name} '
                                       f'to {to_cluster_name}.'))
Exemple #9
0
def get_job_data(cluster, usage_map):
    """
    Gets data for jobs in usage map if it has any.

    Collects the running job uuids listed under usage_map['ungrouped'] and
    under every entry of usage_map['grouped'], fetches those jobs from the
    cluster in a single request, and totals cpus / mem / gpus per
    application and, within each application, per group. Jobs without an
    application or without a group are filed under the key None; only a
    job's first group uuid is used for grouping.

    Returns a dict with 'count' (the number of running job uuids collected)
    and 'applications' (the nested totals). When there are no running jobs
    no request is made and 'applications' is empty.
    """
    resources = ('cpus', 'mem', 'gpus')

    def zero_usage():
        return {resource: 0 for resource in resources}

    # Copy so the caller's list is not extended in place.
    uuids_to_fetch = list(usage_map['ungrouped']['running_jobs'])
    group_names = {}
    for entry in usage_map['grouped']:
        group_info = entry['group']
        uuids_to_fetch.extend(group_info['running_jobs'])
        group_names[group_info['uuid']] = group_info['name']

    job_count = len(uuids_to_fetch)
    applications = {}
    if job_count == 0:
        return {'count': job_count, 'applications': applications}

    jobs = http.make_data_request(
        cluster, lambda: make_job_request(cluster, uuids_to_fetch))
    for job in jobs:
        app_name = job['application']['name'] if 'application' in job else None

        group_uuids = job.get('groups')
        if group_uuids:
            first_uuid = group_uuids[0]
            group_label = f'{group_names[first_uuid]} ({first_uuid})'
        else:
            group_label = None

        app_entry = applications.setdefault(
            app_name, {'usage': zero_usage(), 'groups': {}})
        for resource in resources:
            app_entry['usage'][resource] += job[resource]

        group_entry = app_entry['groups'].setdefault(
            group_label, {'usage': zero_usage(), 'jobs': []})
        for resource in resources:
            group_entry['usage'][resource] += job[resource]
        group_entry['jobs'].append(job['uuid'])

    return {'count': job_count, 'applications': applications}
Exemple #10
0
def get_usage_on_cluster(cluster, user):
    """
    Queries cluster for usage information for the given user.

    Fetches the user's usage (with group breakdown) and share from the
    cluster and checks that both agree on whether they are reported per
    pool and, if so, on the set of pool names. When pools are in use, the
    cluster's pool list is fetched as well so each pool's state can be
    attached.

    Returns, for a cluster without pools, a dict with 'using_pools',
    'usage', 'share' plus the job data from get_job_data; for a cluster
    with pools, a dict with 'using_pools' and a 'pools' mapping of pool
    name to such a dict (extended with the pool's 'state'). On any failed
    retrieval or inconsistency an error is printed and {'count': 0} is
    returned.
    """
    def where():
        return f'{cluster["name"]} ({cluster["url"]})'

    def fail(message):
        print_error(message)
        return {'count': 0}

    def build_result(usage, share, extra=None):
        result = {
            'using_pools': using_pools,
            'usage': usage['total_usage'],
            'share': share
        }
        result.update(get_job_data(cluster, usage))
        if extra:
            result.update(extra)
        return result

    usage_params = {'user': user, 'group_breakdown': 'true'}
    usage_map = http.make_data_request(
        cluster, lambda: http.get(cluster, 'usage', params=usage_params))
    if not usage_map:
        return fail(f'Unable to retrieve usage information on {where()}.')

    using_pools = 'pools' in usage_map
    pool_names = usage_map['pools'].keys() if using_pools else []

    share_map = http.make_data_request(
        cluster, lambda: http.get(cluster, 'share', params={'user': user}))
    if not share_map:
        return fail(f'Unable to retrieve share information on {where()}.')

    share_has_pools = 'pools' in share_map
    if using_pools != share_has_pools:
        return fail(
            f'Share information on {where()} is invalid. '
            f'Usage information is{"" if using_pools else " not"} per pool, but share '
            f'is{"" if not using_pools else " not"}')

    share_pool_names = share_map['pools'].keys() if using_pools else []
    if pool_names != share_pool_names:
        return fail(
            f'Share information on {where()} is invalid. '
            f'Usage information has pools: {pool_names}, but share '
            f'has pools: {share_pool_names}')

    if not using_pools:
        return build_result(usage_map, share_map)

    pools = http.make_data_request(
        cluster, lambda: http.get(cluster, 'pools', params={}))
    pools_dict = {pool['name']: pool for pool in pools}
    for pool_name in pool_names:
        known_pool = pools_dict.get(pool_name)
        if known_pool is None or 'state' not in known_pool:
            return fail(
                f'Pool information on {where()} is invalid. '
                f'Can\'t determine the state of pool {pool_name}')

    per_pool = {}
    for pool_name in pool_names:
        per_pool[pool_name] = build_result(
            usage_map['pools'][pool_name],
            share_map['pools'][pool_name],
            {'state': pools_dict[pool_name]['state']})
    return {'using_pools': using_pools, 'pools': per_pool}