Beispiel #1
0
def submit_federated(clusters, jobs, group, pool):
    """
    Submit ``jobs`` to each cluster in ``clusters`` in turn, stopping at the
    first one that answers with HTTP 201 (created).

    The request body always carries ``jobs``; ``group`` is added as a
    single-element ``groups`` list and ``pool`` as ``pool`` when truthy.

    Returns 0 as soon as a cluster reports 201. Returns 1 immediately if a
    request hits a read timeout, because the submission may already have gone
    through and retrying elsewhere could duplicate it. Any other IOError is
    logged and its failure message collected before moving on to the next
    cluster. A non-201 response simply moves on to the next cluster.

    Raises Exception once every cluster has been tried without success, after
    printing the collected failure messages.
    """
    failure_notes = []
    for target in clusters:
        name, url = target['name'], target['url']
        try:
            print_info('Attempting to submit on %s cluster...' % terminal.bold(name))

            payload = {'jobs': jobs}
            if group:
                payload['groups'] = [group]
            if pool:
                payload['pool'] = pool

            response = http.post(target, 'jobs', payload)
            print_submit_result(target, response)
            if response.status_code != 201:
                # Not created here; fall through to the next cluster.
                continue
            metrics.inc('command.submit.jobs', len(jobs))
            return 0
        except requests.exceptions.ReadTimeout as timeout_error:
            # Handled ahead of the broader IOError clause below.
            logging.exception(timeout_error)
            warning = terminal.failed(
                f'Encountered read timeout with {name} ({url}). Your submission may have completed.')
            print_info(warning)
            return 1
        except IOError as io_error:
            logging.exception(io_error)
            failure_notes.append(
                submit_failed_message(name, f'Cannot connect to {name} ({url})'))
    print_error(''.join(failure_notes))
    raise Exception(terminal.failed('Job submission failed on all of your configured clusters.'))
Beispiel #2
0
def print_formatted_cluster_or_pool_usage(cluster_or_pool,
                                          cluster_or_pool_usage):
    """
    Print the usage query result for one cluster or pool.

    Output consists of the bolded name, a plain-format table with the quota,
    share and current usage rows, and then a bulleted breakdown per
    application and per job group within each application.

    ``cluster_or_pool_usage`` is read through its 'usage', 'share', 'quota'
    and 'applications' keys.
    """
    current = cluster_or_pool_usage['usage']
    share = cluster_or_pool_usage['share']
    quota = cluster_or_pool_usage['quota']
    print_info(terminal.bold(cluster_or_pool))

    def render_limit(limit, formatter=None):
        # The largest float is the marker for "no limit configured".
        if limit == sys.float_info.max:
            return 'Unlimited'
        return limit if formatter is None else formatter(limit)

    # The job-count quota uses the largest signed 32-bit value as its
    # "unlimited" marker instead of the float maximum.
    unlimited_count = 2**31 - 1

    quota_row = [
        'Max Quota',
        render_limit(quota['cpus']),
        render_limit(quota['mem'], format_memory_amount),
        render_limit(quota['gpus']),
        'Unlimited' if quota['count'] == unlimited_count else quota['count'],
    ]
    share_row = [
        'Non-preemptible Share',
        render_limit(share['cpus']),
        render_limit(share['mem'], format_memory_amount),
        render_limit(share['gpus']),
        'N/A',
    ]
    usage_row = [
        'Current Usage',
        current['cpus'],
        format_job_memory(current),
        current['gpus'],
        current['jobs'],
    ]
    table = tabulate([quota_row, share_row, usage_row],
                     headers=['', 'CPUs', 'Memory', 'GPUs', 'Jobs'],
                     tablefmt='plain')
    print_info(table)

    applications = cluster_or_pool_usage['applications']
    if applications:
        print_info('Applications:')
    for app_name, app_usage in applications.items():
        app_totals = app_usage['usage']
        app_label = app_name or "[no application defined]"
        print_info(f'- {terminal.running(app_label)}')
        print_info(f'  {format_usage(app_totals)}')
        print_info('  Job Groups:')
        for group_name, group_usage in app_usage['groups'].items():
            group_totals = group_usage['usage']
            group_jobs = group_usage['jobs']
            group_label = group_name or "[ungrouped]"
            print_info(f'\t- {terminal.bold(group_label)}')
            print_info(f'\t  {format_usage(group_totals)}')
            print_info(f'\t  Jobs: {len(group_jobs)}')
            print_info('')
    print_info('')
Beispiel #3
0
def __query_cluster(cluster, uuids, pred, timeout, interval, make_request_fn,
                    entity_type):
    """
    Fetch the entities identified by ``uuids`` from ``cluster`` and, when a
    predicate is supplied, wait until that predicate is satisfied.

    The data is requested once up front. If ``pred`` is falsy or nothing came
    back, that first result is returned unchanged. Otherwise a progress line
    is registered and, unless ``pred`` already accepts the first result, the
    request is repeated through ``wait_until`` using ``timeout`` and
    ``interval``.

    Returns the initially fetched entities, or, if waiting was required, the
    value handed back by ``wait_until``.
    NOTE(review): that value originates from ``pred``; presumably ``pred``
    returns the entities themselves when satisfied -- confirm against callers.

    Raises Exception for an unrecognised ``entity_type`` and TimeoutError when
    ``wait_until`` comes back with a falsy result.
    """
    def fetch():
        return http.make_data_request(cluster,
                                      lambda: make_request_fn(cluster, uuids))

    def fetch_and_check():
        return pred(fetch())

    entities = fetch()
    count = len(entities)
    if not pred or count == 0:
        # Nothing to wait on.
        return entities

    plural = 's' if count > 1 else ''
    bold_count = terminal.bold(str(count))
    if entity_type == Types.JOB:
        subject = f'Waiting for {bold_count} job{plural}'
    elif entity_type == Types.INSTANCE:
        subject = f'Waiting for instances of {bold_count} job{plural}'
    elif entity_type == Types.GROUP:
        subject = f'Waiting for {bold_count} job group{plural}'
    else:
        raise Exception(f'Invalid entity type {entity_type}.')

    progress_index = progress.add(
        f'{subject} on {terminal.bold(cluster["name"])}')
    if not pred(entities):
        entities = wait_until(fetch_and_check, timeout, interval)
        if not entities:
            raise TimeoutError('Timeout waiting for response.')
    progress.update(progress_index, terminal.bold('Done'))
    return entities