def submit_federated(clusters, jobs, group, pool):
    """
    Submits the provided jobs to the given clusters one at a time, stopping at
    the first cluster whose response has status code 201 ("created").

    Returns 0 as soon as a cluster answers 201. Returns 1 if a read timeout
    occurs, since the submission may have gone through and trying further
    clusters could duplicate it. Raises Exception when every cluster has been
    tried and none answered 201.
    """
    failure_messages = []
    for cluster in clusters:
        cluster_name = cluster['name']
        cluster_url = cluster['url']
        try:
            print_info('Attempting to submit on %s cluster...' % terminal.bold(cluster_name))

            # Group and pool are optional; they are only sent when provided
            payload = {'jobs': jobs}
            if group:
                payload['groups'] = [group]
            if pool:
                payload['pool'] = pool

            response = http.post(cluster, 'jobs', payload)
            print_submit_result(cluster, response)
            if response.status_code != 201:
                # Not created on this cluster, so move on to the next one
                continue

            metrics.inc('command.submit.jobs', len(jobs))
            return 0
        except requests.exceptions.ReadTimeout as timeout_error:
            # Must be handled before IOError: stop here rather than risk
            # submitting the same jobs a second time on another cluster
            logging.exception(timeout_error)
            print_info(terminal.failed(
                f'Encountered read timeout with {cluster_name} ({cluster_url}). Your submission may have completed.'))
            return 1
        except IOError as io_error:
            logging.exception(io_error)
            reason = f'Cannot connect to {cluster_name} ({cluster_url})'
            failure_messages.append(submit_failed_message(cluster_name, reason))

    # Every cluster was tried without success
    print_error(''.join(failure_messages))
    raise Exception(terminal.failed('Job submission failed on all of your configured clusters.'))
def print_no_data(clusters, states, user):
    """
    Prints a message indicating that no matching jobs were found in the given clusters.

    clusters: iterable of cluster dicts; only each cluster's 'name' is read.
    states: collection of state names that was searched. It is never modified.
        If it contains 'all', the message lists waiting / running / completed;
        otherwise, if it contains 'success', that entry is shown as 'successful'.
    user: the user whose jobs were searched; interpolated into the message.
    """
    clusters_text = ' / '.join(c['name'] for c in clusters)
    if 'all' in states:
        displayed_states = ['waiting', 'running', 'completed']
    elif 'success' in states:
        # Work on a copy: renaming in place would leak this display-only change
        # into the caller's list (and would fail outright for a tuple)
        displayed_states = list(states)
        displayed_states.remove('success')
        displayed_states.append('successful')
    else:
        displayed_states = states
    states_text = ' / '.join(displayed_states)
    print(
        terminal.failed(
            f'No matching {states_text} jobs for {user} found in {clusters_text}.'
        ))
def format_state(state):
    """
    Returns the given state capitalized and, for Running / Waiting / Failed /
    Success, passed through the matching terminal helper. Any other state is
    returned capitalized and otherwise untouched.
    """
    capitalized = state.capitalize()
    if capitalized == 'Running':
        return terminal.running(capitalized)
    if capitalized == 'Waiting':
        return terminal.waiting(capitalized)
    if capitalized == 'Failed':
        return terminal.failed(capitalized)
    if capitalized == 'Success':
        return terminal.success(capitalized)
    return capitalized
def kill_entities(query_result, clusters):
    """
    Attempts to kill the jobs / instances / groups listed per cluster in
    query_result['clusters'], printing one line per entity plus a summary.

    Returns the number of entities that failed to be killed.
    """
    kill_batch_size = 100
    succeeded = []
    failed = []
    clusters_by_name = {cluster['name']: cluster for cluster in clusters}

    def kill_in_batches(cluster, uuids, kill_fn, entity_type):
        # Nothing to kill for this entity type
        if not uuids:
            return
        for uuid_batch in partition(uuids, kill_batch_size):
            # A batch succeeds or fails as a whole, so every uuid in it
            # is recorded under the same outcome
            outcome = succeeded if kill_fn(cluster, uuid_batch) else failed
            outcome.extend({
                'cluster': cluster,
                'type': entity_type,
                'uuid': uuid
            } for uuid in uuid_batch)

    for cluster_name, entities in query_result['clusters'].items():
        cluster = clusters_by_name[cluster_name]

        # Gather every identifier for this cluster before killing anything
        job_uuids = [job['uuid'] for job in entities.get('jobs', [])]
        instance_uuids = [instance['task_id'] for instance, _ in entities.get('instances', [])]
        group_uuids = [group['uuid'] for group in entities.get('groups', [])]

        kill_in_batches(cluster, job_uuids, kill_jobs, 'job')
        kill_in_batches(cluster, instance_uuids, kill_instances, 'job instance')
        kill_in_batches(cluster, group_uuids, kill_groups, 'job group')

    for item in succeeded:
        print_info(
            f'Killed {item["type"]} {terminal.bold(item["uuid"])} on {terminal.bold(item["cluster"]["name"])}.'
        )

    for item in failed:
        print(
            terminal.failed(
                f'Failed to kill {item["type"]} {item["uuid"]} on {item["cluster"]["name"]}.'
            ))

    num_succeeded, num_failed = len(succeeded), len(failed)
    print_info(f'Successful: {num_succeeded}, Failed: {num_failed}')
    return num_failed
def no_data_message(clusters):
    """
    Builds the message shown when no data was found in the given clusters:
    a failure line naming the clusters, followed by a hint about configuration.
    """
    clusters_text = ' / '.join(cluster['name'] for cluster in clusters)
    message = terminal.failed(f'No matching data found in {clusters_text}.')
    return f'{message}\nDo you need to add another cluster to your configuration?'
def submit_failed_message(cluster_name, reason):
    """
    Builds the failed-submission message for the given cluster: a header line
    naming the cluster, followed by the reason on its own line.
    """
    failed_text = terminal.failed('failed')
    reason_text = terminal.reason(reason)
    return 'Job submission %s on %s:\n%s' % (failed_text, cluster_name, reason_text)
def print_error(text):
    """Writes text to stderr after passing it through terminal.failed"""
    failure_text = terminal.failed(text)
    print(failure_text, file=sys.stderr)