def query_cluster(cluster, uuids, pred, timeout, interval, make_request_fn, entity_type):
    """Queries the given cluster for the given uuids.

    If a predicate, pred, is provided, waits (up to timeout, polling every
    interval) until the predicate is satisfied by the queried entities,
    showing a progress entry while waiting.
    """

    def pred_satisfied():
        # Re-issue the request and test the predicate against the fresh data.
        return pred(http.make_data_request(lambda: make_request_fn(cluster, uuids)))

    entities = http.make_data_request(lambda: make_request_fn(cluster, uuids))
    if not (pred and len(entities) > 0):
        return entities

    wait_text_by_type = {'job': 'Waiting for the following jobs',
                         'instance': 'Waiting for instances of the following jobs',
                         'group': 'Waiting for the following job groups'}
    if entity_type not in wait_text_by_type:
        raise Exception('Invalid entity type %s.' % entity_type)

    uuid_text = ', '.join(e['uuid'] for e in entities)
    wait_text = '%s on %s: %s' % (wait_text_by_type[entity_type], colors.bold(cluster['name']), uuid_text)
    index = progress.add(wait_text)
    if not pred(entities):
        entities = wait_until(pred_satisfied, timeout, interval)
        if not entities:
            raise TimeoutError('Timeout waiting for response.')
    progress.update(index, colors.bold('Done'))
    return entities
def query_instances_on_cluster(cluster, status, start_ms, end_ms):
    """Queries cluster for instance stats with the given status / time.

    Parameters are passed straight through to the `stats/instances` endpoint:
    status is the instance status filter, start_ms/end_ms the time window in
    epoch milliseconds. Returns a dict with the overall instance 'count'
    (0 when the server reports no count).
    """
    params = {'status': status, 'start': start_ms, 'end': end_ms}
    stats = http.make_data_request(cluster, lambda: http.get(cluster, 'stats/instances', params=params))
    overall_stats = stats['overall']
    # dict.get replaces the original LBYL double lookup ('count' in ... else 0)
    data = {'count': overall_stats.get('count', 0)}
    return data
def __query_cluster(cluster, uuids, pred, timeout, interval, make_request_fn, entity_type):
    """Queries the given cluster for the given uuids.

    If a predicate, pred, is provided, waits (up to timeout, polling every
    interval) until the predicate is satisfied, displaying a progress entry
    while waiting.
    """

    def predicate_holds():
        # Re-fetch the entities and evaluate the predicate on the fresh data.
        return pred(http.make_data_request(cluster, lambda: make_request_fn(cluster, uuids)))

    entities = http.make_data_request(cluster, lambda: make_request_fn(cluster, uuids))
    num_entities = len(entities)
    if not pred or num_entities == 0:
        return entities

    s = 's' if num_entities > 1 else ''
    num_string = colors.bold(str(num_entities))
    if entity_type == Types.JOB:
        wait_text = f'Waiting for {num_string} job{s}'
    elif entity_type == Types.INSTANCE:
        wait_text = f'Waiting for instances of {num_string} job{s}'
    elif entity_type == Types.GROUP:
        wait_text = f'Waiting for {num_string} job group{s}'
    else:
        raise Exception(f'Invalid entity type {entity_type}.')

    wait_text = f'{wait_text} on {colors.bold(cluster["name"])}'
    index = progress.add(wait_text)
    if not pred(entities):
        entities = wait_until(predicate_holds, timeout, interval)
        if not entities:
            raise TimeoutError('Timeout waiting for response.')
    progress.update(index, colors.bold('Done'))
    return entities
def list_jobs_on_cluster(cluster, state, user, start_ms, end_ms, name, limit, include_custom_executor):
    """Queries cluster for jobs with the given state / user / time / name.

    include_custom_executor selects between the newer 'jobs' endpoint
    (start/end params) and the legacy 'list' endpoint (start-ms/end-ms,
    '+'-joined state string).
    """
    if 'all' in state:
        state = ['waiting', 'running', 'completed']
    params = {'user': user, 'name': name, 'limit': limit}
    if include_custom_executor:
        endpoint = 'jobs'
        params.update({'state': state, 'start': start_ms, 'end': end_ms})
    else:
        endpoint = 'list'
        params.update({'state': '+'.join(state), 'start-ms': start_ms, 'end-ms': end_ms})
    jobs = http.make_data_request(cluster, lambda: http.get(cluster, endpoint, params=params))
    return {'jobs': jobs, 'count': len(jobs)}
def list_jobs_on_cluster(cluster, state, user, start_ms, end_ms, name, limit):
    """Queries cluster for jobs with the given state / user / time / name.

    The 'list' endpoint expects the states joined with '+'; 'all' expands to
    every state. Returns a dict with the jobs and their count.
    """
    state_string = 'waiting+running+completed' if 'all' in state else '+'.join(state)
    params = {'state': state_string,
              'user': user,
              'start-ms': start_ms,
              'end-ms': end_ms,
              'name': name,
              'limit': limit}
    jobs = http.make_data_request(cluster, lambda: http.get(cluster, 'list', params=params))
    return {'jobs': jobs, 'count': len(jobs)}
def list_jobs_on_cluster(cluster, state, user, lookback_hours, name, limit):
    """Queries cluster for jobs with the given state / user / time / name.

    The query window starts lookback_hours before now; states are joined with
    '+' for the 'list' endpoint ('all' expands to every state).
    """
    # Window start in epoch millis: now minus the lookback period.
    start_ms = int(round(time.time() * 1000)) - int(lookback_hours * MILLIS_PER_HOUR)
    state_string = 'waiting+running+completed' if 'all' in state else '+'.join(state)
    params = {'state': state_string,
              'user': user,
              'start-ms': start_ms,
              'name': name,
              'limit': limit}
    jobs = http.make_data_request(lambda: http.get(cluster, 'list', params=params))
    return {'jobs': jobs, 'count': len(jobs)}
def satisfy_pred():
    """Re-queries the cluster and applies pred to the response.

    NOTE(review): closure fragment — pred, cluster, make_request_fn, and uuids
    come from the enclosing scope, which is not visible here.
    """
    response = http.make_data_request(cluster, lambda: make_request_fn(cluster, uuids))
    return pred(response)
def copy_limits(args, config_path):
    """Copies limits (share and quota) for a particular user from one cluster to another cluster

    Requires either a cluster name or URL for both source (--from / --from-url)
    and destination (--to / --to-url); exactly one matching cluster is expected
    on each side. For every pool that exists on the source and is active on the
    destination, shows the current limits and interactively asks whether to
    copy the share and quota over.
    """
    user = args.get('user')
    from_cluster = args.get('from')
    from_url = args.get('from_url')
    # Must identify the source cluster one way or the other.
    if not from_cluster and not from_url:
        copy_limits_parser.print_help()
        print()
        raise Exception(f'You must provide either a from-cluster name (--from) or URL (--from-url).')
    to_cluster = args.get('to')
    to_url = args.get('to_url')
    # Must identify the destination cluster one way or the other.
    if not to_cluster and not to_url:
        copy_limits_parser.print_help()
        print()
        raise Exception(f'You must provide either a to-cluster name (--to) or URL (--to-url).')
    _, config_map = configuration.load_config_with_defaults(config_path)
    from_clusters = load_target_clusters(config_map, from_url, from_cluster)
    to_clusters = load_target_clusters(config_map, to_url, to_cluster)
    # The name/URL may match multiple configured clusters; only 1:1 copy is supported.
    assert len(from_clusters) == 1, 'Only a single from-cluster is supported.'
    assert len(to_clusters) == 1, 'Only a single to-cluster is supported.'
    from_cluster = from_clusters[0]
    to_cluster = to_clusters[0]
    from_cluster_name = from_cluster['name']
    to_cluster_name = to_cluster['name']
    print(f'Copying limits for {terminal.bold(user)} user '
          f'from {terminal.bold(from_cluster_name)} '
          f'to {terminal.bold(to_cluster_name)}:')
    # Fetch the pool listings on both sides so we can copy pool-by-pool.
    from_pools = http.make_data_request(from_cluster, lambda: http.get(from_cluster, 'pools', params={}))
    to_pools = http.make_data_request(to_cluster, lambda: http.get(to_cluster, 'pools', params={}))
    from_pools_dict = {pool['name']: pool for pool in from_pools}
    to_pools_dict = {pool['name']: pool for pool in to_pools}
    for pool_name, from_pool in from_pools_dict.items():
        # Only copy into pools that exist on the destination and are not inactive.
        if pool_name in to_pools_dict and to_pools_dict[pool_name]['state'] != 'inactive':
            print(f'\n=== Pool: {pool_name} ===')
            # Show the user's current limits on both clusters for this pool
            # before asking for confirmation.
            query_result = query([from_cluster, to_cluster], user)
            query_result = filter_query_result_by_pools(query_result, [pool_name])
            print_formatted(query_result)
            answer = input(f'Copy limits for {terminal.bold(pool_name)} pool '
                           f'from {terminal.bold(from_cluster_name)} '
                           f'to {terminal.bold(to_cluster_name)}? ')
            should_copy = str2bool(answer)
            if should_copy:
                from_dict = query_result['clusters'][from_cluster_name]['pools'][pool_name]
                reason = f'Copying limits for {user} user from {from_cluster_name} to {to_cluster_name}'
                # Copy the share; a failed POST is reported but does not abort
                # (the quota copy below is still attempted).
                from_share = from_dict['share']
                resp = http.post(to_cluster, 'share', {'pool': pool_name, 'user': user, 'reason': reason, 'share': from_share})
                if resp.status_code != 201:
                    print_error(f'Setting share for {pool_name} on {to_cluster_name} '
                                f'failed with status code {resp.status_code}: {resp.text}')
                else:
                    print(terminal.success(f'Copied share for {pool_name} pool '
                                           f'from {from_cluster_name} '
                                           f'to {to_cluster_name}.'))
                # Copy the quota with the same best-effort error handling.
                from_quota = from_dict['quota']
                resp = http.post(to_cluster, 'quota', {'pool': pool_name, 'user': user, 'reason': reason, 'quota': from_quota})
                if resp.status_code != 201:
                    print_error(f'Setting quota for {pool_name} on {to_cluster_name} '
                                f'failed with status code {resp.status_code}: {resp.text}')
                else:
                    print(terminal.success(f'Copied quota for {pool_name} pool '
                                           f'from {from_cluster_name} '
                                           f'to {to_cluster_name}.'))
def get_job_data(cluster, usage_map):
    """Gets data for jobs in the usage map, if it has any.

    Collects the running-job uuids from the 'ungrouped' section and from each
    group in the 'grouped' section of usage_map, fetches the jobs, and
    aggregates cpus / mem / gpus usage per application and per group within
    each application.

    Returns a dict with the running-job 'count' and the nested 'applications'
    aggregation (keys may be None for jobs without an application or group).
    """
    # Start with the ungrouped running jobs (copied so extend doesn't mutate
    # the caller's usage_map) and add each group's running jobs.
    job_uuids_to_retrieve = usage_map['ungrouped']['running_jobs'][:]
    group_uuid_to_name = {}
    for group_usage in usage_map['grouped']:
        group = group_usage['group']
        job_uuids_to_retrieve.extend(group['running_jobs'])
        group_uuid_to_name[group['uuid']] = group['name']

    applications = {}
    num_running_jobs = len(job_uuids_to_retrieve)
    if num_running_jobs > 0:
        jobs = http.make_data_request(
            cluster, lambda: make_job_request(cluster, job_uuids_to_retrieve))
        for job in jobs:
            # .get chains replace the original LBYL 'in' checks.
            application = job.get('application', {}).get('name')
            group_uuids = job.get('groups')
            # Display label is "<group name> (<group uuid>)" for the job's first group.
            group = (f'{group_uuid_to_name[group_uuids[0]]} ({group_uuids[0]})'
                     if group_uuids else None)
            # setdefault replaces the original "if key not in dict: init" blocks.
            app_entry = applications.setdefault(
                application, {'usage': {'cpus': 0, 'mem': 0, 'gpus': 0}, 'groups': {}})
            group_entry = app_entry['groups'].setdefault(
                group, {'usage': {'cpus': 0, 'mem': 0, 'gpus': 0}, 'jobs': []})
            # Accumulate each resource at both the application and group level.
            for resource in ('cpus', 'mem', 'gpus'):
                app_entry['usage'][resource] += job[resource]
                group_entry['usage'][resource] += job[resource]
            group_entry['jobs'].append(job['uuid'])
    return {'count': num_running_jobs, 'applications': applications}
def get_usage_on_cluster(cluster, user):
    """Queries cluster for usage information for the given user

    Fetches the user's usage and share maps, validates that both agree on
    whether the cluster is pool-aware (and on the exact pool names), then
    builds either a per-pool result (with each pool's state) or a single
    flat result. Any fetch or validation failure is reported via
    print_error and yields {'count': 0}.
    """
    params = {'user': user, 'group_breakdown': 'true'}
    usage_map = http.make_data_request(
        cluster, lambda: http.get(cluster, 'usage', params=params))
    if not usage_map:
        print_error(
            f'Unable to retrieve usage information on {cluster["name"]} ({cluster["url"]}).'
        )
        return {'count': 0}

    # Pool-aware servers nest usage/share under a 'pools' key.
    using_pools = 'pools' in usage_map
    pool_names = usage_map['pools'].keys() if using_pools else []

    share_map = http.make_data_request(
        cluster, lambda: http.get(cluster, 'share', params={'user': user}))
    if not share_map:
        print_error(
            f'Unable to retrieve share information on {cluster["name"]} ({cluster["url"]}).'
        )
        return {'count': 0}

    # Usage and share must agree on pool-awareness...
    if using_pools != ('pools' in share_map):
        print_error(
            f'Share information on {cluster["name"]} ({cluster["url"]}) is invalid. '
            f'Usage information is{"" if using_pools else " not"} per pool, but share '
            f'is{"" if not using_pools else " not"}')
        return {'count': 0}
    # ...and on the exact set of pool names.
    if pool_names != (share_map['pools'].keys() if using_pools else []):
        print_error(
            f'Share information on {cluster["name"]} ({cluster["url"]}) is invalid. '
            f'Usage information has pools: {pool_names}, but share '
            f'has pools: {share_map["pools"].keys()}')
        return {'count': 0}

    def make_query_result(using_pools, usage_map, share_map, pool_data=None):
        # Assembles one result entry: totals + share + per-job breakdown,
        # optionally merged with extra pool metadata (e.g. pool state).
        query_result = {
            'using_pools': using_pools,
            'usage': usage_map['total_usage'],
            'share': share_map
        }
        query_result.update(get_job_data(cluster, usage_map))
        if pool_data:
            query_result.update(pool_data)
        return query_result

    if using_pools:
        # Need the pool listing to attach each pool's state to its entry.
        pools = http.make_data_request(
            cluster, lambda: http.get(cluster, 'pools', params={}))
        pools_dict = {pool['name']: pool for pool in pools}
        # Every pool reported in usage must exist and carry a state.
        for pool_name in pool_names:
            if pool_name not in pools_dict or 'state' not in pools_dict[pool_name]:
                print_error(
                    f'Pool information on {cluster["name"]} ({cluster["url"]}) is invalid. '
                    f'Can\'t determine the state of pool {pool_name}')
                return {'count': 0}
        query_result = {
            'using_pools': using_pools,
            'pools': {
                pool_name: make_query_result(using_pools,
                                             usage_map['pools'][pool_name],
                                             share_map['pools'][pool_name],
                                             {'state': pools_dict[pool_name]['state']})
                for pool_name in pool_names
            }
        }
        return query_result
    else:
        return make_query_result(using_pools, usage_map, share_map)