def cancel(state: State, name: str, match: str, purge: bool, pod_ids: str, pod_status: str,
           listed_runs_kinds: List[RunKinds] = None):
    """
    Cancel - or, with purge, remove - runs selected by a name or by a regular expression.

    If pod_ids or pod_status is given, the request is passed to cancel_pods_mode and the command
    exits. Otherwise the matching runs are listed, the user is asked for a confirmation and the
    runs are processed experiment by experiment. The command exits with code 1 if any run could
    not be processed.

    :param state: not used by this function
    :param name: name of an experiment; if no experiment of a listed kind has this name, it is
                 matched exactly against run names. Mutually exclusive with match
    :param match: regular expression passed as a run name filter. Mutually exclusive with name
    :param purge: if True all matching runs are purged, otherwise only queued and running runs
                  are cancelled
    :param pod_ids: comma separated names of pods - switches the command to the pods mode
    :param pod_status: status of pods - switches the command to the pods mode
    :param listed_runs_kinds: kinds of runs that are taken into account, training and jupyter
                              runs if not given
    """

    def _operation_word(delete_form, cancel_form):
        # Pods are "deleted", anything else is "cancelled".
        if experiment_name_plural == 'pods':
            return Texts.DELETE_OPERATION[delete_form]
        return Texts.CANCEL_OPERATION[cancel_form]

    def _message(template, delete_form="deleted", cancel_form="cancelled"):
        # Fills in the two placeholders shared by all list headers and prompts.
        return template.format(experiment_name_plural=experiment_name_plural,
                               operation_word=_operation_word(delete_form, cancel_form))

    def _echo_run_names(runs):
        for listed_run in runs:
            click.echo(f" - {listed_run.name}")

    listed_runs_kinds = listed_runs_kinds or [RunKinds.TRAINING, RunKinds.JUPYTER]

    # exactly one of name / match has to be given
    if name and match:
        handle_error(user_msg=Texts.NAME_M_BOTH_GIVEN_ERROR_MSG)
        exit(1)

    if not (name or match):
        handle_error(user_msg=Texts.NAME_M_NONE_GIVEN_ERROR_MSG)
        exit(1)

    current_namespace = get_current_namespace()

    if pod_ids or pod_status:
        if not name:
            name = match

        cancel_pods_mode(namespace=current_namespace, run_name=name,
                         pod_ids=pod_ids, pod_status=pod_status)
        exit(0)

    # check whether name points at an experiment of one of the listed kinds
    exp_to_be_cancelled = None
    if name:
        found_experiment = Experiment.get(namespace=current_namespace, name=name)
        found_kind = None
        if found_experiment:
            found_kind = RunKinds(found_experiment.metadata['labels'].get('runKind'))
        if found_kind in listed_runs_kinds:
            exp_to_be_cancelled = found_experiment

    if exp_to_be_cancelled:
        # runs are looked up by the experiment they belong to
        runs_query = {'exp_name_filter': [name]}
    elif name:
        # no such experiment - the name has to match a run name exactly
        runs_query = {'name_filter': f"^{name}$"}
    else:
        runs_query = {'name_filter': match}

    applicable_states = [RunStatus.QUEUED, RunStatus.RUNNING]
    if purge:
        applicable_states += [RunStatus.FAILED, RunStatus.COMPLETE, RunStatus.CANCELLED]

    all_runs = None
    try:
        all_runs = Run.list(namespace=current_namespace,
                            run_kinds_filter=listed_runs_kinds,
                            **runs_query)
    except Exception:
        listing_error = Texts.LIST_RUNS_ERROR_MSG.format(
            experiment_name_plural=experiment_name_plural)
        handle_error(logger, listing_error, listing_error)
        exit(1)

    # Handle cancellation of experiments with no associated Runs
    if exp_to_be_cancelled and not all_runs:
        cancel_uninitialized_experiment(experiment=exp_to_be_cancelled,
                                        namespace=current_namespace,
                                        purge=purge)

    if not all_runs:
        handle_error(user_msg=Texts.LACK_OF_EXPERIMENTS_ERROR_MSG.format(
            experiment_name_plural=experiment_name_plural,
            experiment_name=experiment_name))
        exit(1)
    elif not purge:
        active_runs = [
            run for run in all_runs
            if run.state in [RunStatus.QUEUED, RunStatus.RUNNING]
        ]
        if not active_runs:
            handle_error(
                user_msg=Texts.LACK_OF_EXP_TO_BE_CANCELLED_ERROR_MSG.format(
                    experiment_name_plural=experiment_name_plural))
            exit(1)

    # split the runs into those that will be processed and those that are skipped
    runs_to_delete = []
    skipped_run_names = []

    if purge:
        runs_to_delete = all_runs
        click.echo(_message(Texts.WILL_BE_PURGED_LIST_HEADER))
        _echo_run_names(runs_to_delete)
    else:
        for run in all_runs:
            if run.state in applicable_states:
                runs_to_delete.append(run)
            else:
                skipped_run_names.append(run.name)

        if not runs_to_delete:
            handle_error(user_msg=_message(Texts.EXPERIMENTS_ALREADY_CANCELLED_ERROR_MSG))
            exit(1)
        elif len(runs_to_delete) != len(all_runs):
            click.echo(_message(Texts.ALREADY_CANCELLED_LIST_HEADER))
            for skipped_name in skipped_run_names:
                click.echo(f" - {skipped_name}")
            click.echo(_message(Texts.CAN_BE_CANCELLED_LIST_HEADER))
            _echo_run_names(runs_to_delete)
        else:
            click.echo(_message(Texts.WILL_BE_CANCELLED_LIST_HEADER))
            _echo_run_names(runs_to_delete)

    confirmed = click.confirm(
        _message(Texts.CONFIRM_CANCEL_MSG, "deletion", "cancellation"))
    if not confirmed:
        handle_error(user_msg=_message(Texts.CANCELLATION_ABORTED_MSG,
                                       "deletion", "cancellation"))
        exit(0)

    # group runs by experiments
    exp_with_runs = defaultdict(list)
    for run in runs_to_delete:
        exp_with_runs[run.experiment_name].append(run)

    deleted_runs = []
    not_deleted_runs = []

    def _process_experiments(operation):
        # Runs of an experiment whose processing raised are all reported as not deleted.
        for exp_name, exp_runs in exp_with_runs.items():
            try:
                succeeded, failed = operation(exp_name, exp_runs)
                deleted_runs.extend(succeeded)
                not_deleted_runs.extend(failed)
            except Exception:
                handle_error(logger, Texts.OTHER_CANCELLING_ERROR_MSG)
                not_deleted_runs.extend(exp_runs)

    if purge:
        # Connect to elasticsearch in order to purge run logs
        try:
            with K8sProxy(NAUTAAppNames.ELASTICSEARCH) as proxy:
                es_client = K8sElasticSearchClient(
                    host="127.0.0.1",
                    port=proxy.tunnel_port,
                    verify_certs=False,
                    use_ssl=False,
                    with_admin_privledges=is_current_user_administrator())
                _process_experiments(
                    lambda exp_name, exp_runs: purge_experiment(
                        exp_name=exp_name,
                        runs_to_purge=exp_runs,
                        namespace=current_namespace,
                        k8s_es_client=es_client))
        except K8sProxyCloseError:
            handle_error(logger, Texts.PROXY_CLOSING_ERROR_LOG_MSG,
                         Texts.PROXY_CLOSING_ERROR_USER_MSG)
            exit(1)
        except LocalPortOccupiedError as port_error:
            handle_error(
                logger, Texts.PORT_OCCUPIED_ERROR_LOG_MSG,
                Texts.PORT_OCCUPIED_ERROR_USER_MSG.format(
                    exception_message=port_error.message))
            exit(1)
        except K8sProxyOpenError:
            handle_error(logger, Texts.PROXY_OPEN_ERROR_MSG,
                         Texts.PROXY_OPEN_ERROR_MSG)
            exit(1)
    else:
        _process_experiments(
            lambda exp_name, exp_runs: cancel_experiment(
                exp_name=exp_name,
                runs_to_cancel=exp_runs,
                namespace=current_namespace))

    if deleted_runs:
        click.echo(_message(Texts.SUCCESSFULLY_CANCELLED_LIST_HEADER))
        _echo_run_names(deleted_runs)

    if not_deleted_runs:
        click.echo(_message(Texts.FAILED_TO_CANCEL_LIST_HEADER))
        _echo_run_names(not_deleted_runs)
        sys.exit(1)
def list_runs_in_cli(verbosity_lvl: int, all_users: bool, name: str,
                     listed_runs_kinds: List[RunKinds],
                     runs_list_headers: List[str], with_metrics: bool,
                     status: RunStatus = None, count: int = None,
                     brief: bool = False):
    """
    Print a table with the selected runs in the cli.

    :param verbosity_lvl: verbosity level; a hint about verbosity is added to error messages
                          when it equals 0
    :param all_users: if True runs of all users are listed, otherwise only runs from the
                      namespace of the current kubectl context
    :param name: regular expression that names of the listed runs have to match
    :param listed_runs_kinds: kinds of runs that are listed
    :param runs_list_headers: headers of the printed table
    :param with_metrics: if True (and brief is False) full run representations are printed
    :param status: only runs with this status are listed
    :param count: number of last rows to print; all rows are printed if not given
    :param brief: if True only name, submission date, submitter and status are printed
    """
    try:
        if all_users:
            namespace = None
        else:
            namespace = get_kubectl_current_context_namespace()

        # Run resources - not Experiment resources - are listed here. The only exception are
        # runs that are still being initialized: for them data of an experiment is displayed.
        matching_runs = Run.list(namespace=namespace,
                                 state_list=[status],
                                 name_filter=name,
                                 run_kinds_filter=listed_runs_kinds)
        runs = replace_initializing_runs(matching_runs)
        representations = [run.cli_representation for run in runs]

        if brief:
            rows = [(rep.name, rep.submission_date, rep.submitter, rep.status)
                    for rep in representations]
        elif with_metrics:
            rows = representations
        else:
            rows = [
                (
                    rep.name,
                    rep.parameters,  # type: ignore
                    rep.submission_date,
                    rep.start_date,
                    rep.duration,
                    rep.submitter,
                    rep.status,
                    rep.template_name,
                    rep.template_version)
                for rep in representations
            ]

        if count:
            # only the last `count` rows are shown
            rows = rows[-count:]

        click.echo(tabulate(rows,
                            headers=runs_list_headers,
                            tablefmt=TBLT_TABLE_FORMAT))
    except InvalidRegularExpressionError:
        handle_error(logger,
                     Texts.INVALID_REGEX_ERROR_MSG,
                     Texts.INVALID_REGEX_ERROR_MSG,
                     add_verbosity_msg=verbosity_lvl == 0)
        exit(1)
    except Exception:
        handle_error(logger,
                     Texts.OTHER_ERROR_MSG,
                     Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=verbosity_lvl == 0)
        exit(1)
def purge_experiment(exp_name: str, runs_to_purge: List[Run],
                     k8s_es_client: K8sElasticSearchClient,
                     namespace: str) -> Tuple[List[Run], List[Run]]:
    """
    Purge the given runs of an experiment.

    Runs are cancelled first; for every cancelled run its helm release and its run object are
    removed and - if the current user is an administrator - its logs are cleared. If the list
    of runs covers all runs of the experiment, the experiment is marked as being cancelled,
    its workflows and its git repo manager tag are removed and - provided that every run was
    purged - the experiment object is deleted as well.

    Errors raised while purging are logged and the results gathered so far are returned.

    :param exp_name: name of the experiment that the runs belong to
    :param runs_to_purge: runs that should be purged, they have to belong to exp_name experiment
    :param k8s_es_client: Kubernetes ElasticSearch client used to clear logs of runs
    :param namespace: namespace where the experiment is located
    :return: two lists - runs that were purged and runs that weren't
    :raises RuntimeError: if the experiment cannot be found
    """
    logger.debug(f"Purging {exp_name} experiment ...")

    experiment = Experiment.get(name=exp_name, namespace=namespace)
    if not experiment:
        raise RuntimeError(Texts.GET_EXPERIMENT_ERROR_MSG)

    # the experiment as a whole is purged only if no other runs of it are left
    all_experiment_runs = Run.list(namespace=namespace, exp_name_filter=[exp_name])
    purge_whole_experiment = len(all_experiment_runs) == len(runs_to_purge)
    if purge_whole_experiment:
        experiment.state = ExperimentStatus.CANCELLING
        experiment.update()

    purged_runs = []
    not_purged_runs = []

    try:
        cancelled_runs, not_purged_runs = cancel_experiment_runs(
            runs_to_cancel=runs_to_purge, namespace=namespace)

        if purge_whole_experiment:
            # Delete associated workflows
            workflows_of_experiment = [
                workflow for workflow in ArgoWorkflow.list(namespace=namespace)
                if workflow.labels.get('experimentName') == experiment.name
            ]
            for workflow in workflows_of_experiment:
                workflow.delete()

            # Remove tags from git repo manager
            try:
                delete_exp_tag_from_git_repo_manager(
                    experiment_name=experiment.name,
                    username=namespace,
                    experiments_workdir=get_run_environment_path(''))
            except Exception:
                handle_error(logger, Texts.GIT_REPO_MANAGER_ERROR_MSG,
                             Texts.GIT_REPO_MANAGER_ERROR_MSG)
                raise

        for run in cancelled_runs:
            logger.debug(f"Purging {run.name} run ...")
            click.echo(Texts.PURGING_START_MSG.format(run_name=run.name))

            try:
                progress_text = Texts.PURGING_PROGRESS_MSG.format(run_name=run.name)
                with spinner(text=progress_text):
                    # purge helm release
                    delete_helm_release(run.name, namespace=namespace, purge=True)
                    # delete run
                    kubectl.delete_k8s_object("run", run.name)
                    purged_runs.append(run)
            except Exception as purge_error:
                not_purged_runs.append(run)
                logger.exception("Error during purging runs.")
                # a NotFound error may mean that the run has been removed earlier,
                # any other error stops purging of the experiment
                if "NotFound" not in str(purge_error):
                    click.echo(
                        Texts.INCOMPLETE_PURGE_ERROR_MSG.format(
                            experiment_name=experiment_name))
                    raise purge_error

            try:
                # clear run logs
                if is_current_user_administrator():
                    logger.debug(f"Clearing logs for {run.name} run.")
                    logs_text = Texts.PURGING_LOGS_PROGRESS_MSG.format(run_name=run.name)
                    with spinner(text=logs_text):
                        k8s_es_client.delete_logs_for_run(run=run.name,
                                                          namespace=namespace)
            except Exception:
                logger.exception("Error during clearing run logs.")

            # NOTE: images are not removed from the docker registry here
            # (CAN-1099 - docker garbage collector has errors that prevent
            # from correct removal of images).

        if purge_whole_experiment and not not_purged_runs:
            try:
                kubectl.delete_k8s_object("experiment", exp_name)
            except Exception:
                # problems during deleting experiments are hidden as if runs were
                # cancelled user doesn't have a possibility to remove them
                logger.exception("Error during purging experiment.")

    except Exception:
        # whatever has been gathered so far is still reported to the caller
        logger.exception("Error during purging experiment.")

    return purged_runs, not_purged_runs
def cancel_pods_mode(namespace: str, run_name: str = None, pod_ids: str = None,
                     pod_status: str = None):
    """
    Delete pods of runs from a namespace after a confirmation given by the user.

    Only pods with a 'runName' label are considered. The optional filters are applied one
    after another. The command exits with code 1 if no pod is left after filtering or if
    deletion of any pod failed, and with code 0 if the user doesn't confirm the operation.

    :param namespace: namespace whose pods are listed
    :param run_name: regular expression matched (with re.match) against the 'runName' label
    :param pod_ids: comma separated names of pods
    :param pod_status: name of a pod status, converted to upper case before it is parsed
    """
    pods_of_runs = [pod for pod in k8s_pods.list_pods(namespace=namespace)
                    if 'runName' in pod.labels]

    selected_pods = pods_of_runs

    if run_name:
        selected_pods = [pod for pod in pods_of_runs
                         if re.match(run_name, pod.labels['runName'])]

    if pod_ids:
        requested_names = pod_ids.split(',')
        selected_pods = [pod for pod in selected_pods
                         if pod.name in requested_names]

    if pod_status:
        try:
            requested_status = PodStatus(pod_status.upper())
        except ValueError:
            handle_error(user_msg=Texts.BAD_POD_STATUS_PASSED.format(
                status_passed=pod_status,
                available_statuses=PodStatus.all_members()))
            exit(1)
            return

        selected_pods = [pod for pod in selected_pods
                         if pod.status == requested_status]

    if not selected_pods:
        handle_error(user_msg=Texts.LACK_OF_PODS_ERROR_MSG)
        exit(1)

    def _echo_pod_names(pods):
        for listed_pod in pods:
            click.echo(f" - {listed_pod.name}")

    click.echo(
        Texts.WILL_BE_PURGED_LIST_HEADER.format(
            experiment_name_plural='pods',
            operation_word=Texts.DELETE_OPERATION["deleted"]))
    _echo_pod_names(selected_pods)

    confirmed = click.confirm(
        Texts.CONFIRM_CANCEL_MSG.format(
            experiment_name_plural='pods',
            operation_word=Texts.DELETE_OPERATION["deletion"]))
    if not confirmed:
        handle_error(user_msg=Texts.CANCELLATION_ABORTED_MSG.format(
            experiment_name_plural='pods',
            operation_word=Texts.DELETE_OPERATION["deletion"]))
        exit(0)

    deleted_pods = []
    not_deleted_pods = []

    for pod in selected_pods:
        click.echo(Texts.CANCELING_PODS_MSG.format(pod_name=pod.name))
        try:
            pod.delete()
            deleted_pods.append(pod)
        except Exception:
            handle_error(logger, Texts.OTHER_POD_CANCELLING_ERROR_MSG)
            not_deleted_pods.append(pod)

    if deleted_pods:
        click.echo(
            Texts.SUCCESSFULLY_CANCELLED_LIST_HEADER.format(
                experiment_name_plural='pods',
                operation_word=Texts.DELETE_OPERATION["deleted"]))
        _echo_pod_names(deleted_pods)

    if not_deleted_pods:
        click.echo(
            Texts.FAILED_TO_CANCEL_LIST_HEADER.format(
                experiment_name_plural='pods',
                operation_word=Texts.DELETE_OPERATION["deleted"]))
        _echo_pod_names(not_deleted_pods)
        sys.exit(1)
def submit(ctx: click.Context, script_location: str,
           script_folder_location: str, template: str, name: str,
           pack_param: List[Tuple[str, str]],
           parameter_range: List[Tuple[str, str]],
           parameter_set: Tuple[str, ...], env: List[str],
           script_parameters: Tuple[str, ...], requirements: Optional[str]):
    """
    Submit a training experiment and print a table with the status of its runs.

    The command exits with code 1 if submission fails or if at least one of the submitted
    runs is in the FAILED state.

    :param ctx: click context, not used by this function
    :param script_location: path to a script or to a directory; for a directory the default
                            script location (and, if requirements is not given, the default
                            requirements location) inside of it is used
    :param script_folder_location: passed to submit_experiment
    :param template: name of a pack, validated with validate_pack
    :param name: name of the experiment, passed to submit_experiment
    :param pack_param: pack parameters, validated with validate_pack_params
    :param parameter_range: passed to submit_experiment
    :param parameter_set: passed to submit_experiment
    :param env: environment variables, passed to submit_experiment as env_variables
    :param script_parameters: passed to submit_experiment
    :param requirements: location of a requirements file, passed to submit_experiment
    """
    logger.debug(Texts.SUBMIT_START_LOG_MSG)
    validate_script_location(script_location)
    validate_pack_params(pack_param)
    validate_pack(template)

    if os.path.isdir(script_location):
        script_directory = script_location
        if not requirements:
            requirements = get_default_requirements_location(
                script_directory=script_directory)
        script_location = get_default_script_location(
            script_directory=script_directory)

    click.echo(Texts.SUBMIT_START_USER_MSG)

    runs_list = None
    # noinspection PyBroadException
    try:
        runs_list, runs_errors, _ = submit_experiment(
            run_kind=RunKinds.TRAINING,
            script_location=script_location,
            script_folder_location=script_folder_location,
            template=template,
            name=name,
            pack_params=pack_param,
            parameter_range=parameter_range,
            parameter_set=parameter_set,
            script_parameters=script_parameters,
            env_variables=env,
            requirements_file=requirements)
    except K8sProxyCloseError as proxy_error:
        handle_error(user_msg=proxy_error.message)
        click.echo(proxy_error.message)
        if not runs_list:
            exit(1)
    except SubmitExperimentError as submit_error:
        handle_error(user_msg=Texts.SUBMIT_ERROR_MSG.format(
            exception_message=submit_error.message))
        exit(1)
    except Exception:
        handle_error(user_msg=Texts.SUBMIT_OTHER_ERROR_MSG)
        exit(1)

    # display information about status of a training
    status_rows = [
        (run.cli_representation.name,
         run.cli_representation.parameters,
         run.cli_representation.status,
         format_run_message(runs_errors.get(run.name, "")))
        for run in runs_list
    ]
    click.echo(
        tabulate(status_rows,
                 headers=[RUN_NAME, RUN_PARAMETERS, RUN_STATUS, RUN_MESSAGE],
                 tablefmt=TBLT_TABLE_FORMAT))

    # if there is at least one FAILED experiment - application has to return exit code != 0
    has_failed_run = any(run.state == RunStatus.FAILED for run in runs_list)
    if has_failed_run:
        handle_error(logger, Texts.FAILED_RUNS_LOG_MSG)
        exit(1)
def validate_script_location(script_location: str):
    """
    Check that script_location points at an existing file or directory.

    If it doesn't, an error message is reported and the command exits with code 2.

    :param script_location: path given by the user
    """
    location_exists = os.path.isfile(script_location) or os.path.isdir(script_location)
    if location_exists:
        return

    handle_error(user_msg=Texts.SCRIPT_NOT_FOUND_ERROR_MSG.format(
        script_location=script_location))
    exit(2)
def config(state: State, cpu: str, memory: str):
    """
    Store new cpu and memory settings of a node in the node configuration file.

    The given values are validated, compared with the minimal values read from
    NAUTAConfigMap (if those are set) and passed - together with the values currently stored
    in the configuration file - to override_values_in_packs. Only if that call succeeds, the
    configuration file is rewritten with the new values. Every failure is reported with
    handle_error and ends the command with exit code 1.

    :param state: not used by this function
    :param cpu: new cpu setting
    :param memory: new memory setting
    """
    if not cpu or not memory:
        handle_error(logger, Texts.MISSING_ARGUMENTS, Texts.MISSING_ARGUMENTS)
        sys.exit(1)

    if not validate_cpu_settings(cpu):
        handle_error(logger, Texts.CPU_WRONG_FORMAT, Texts.CPU_WRONG_FORMAT)
        sys.exit(1)

    if not validate_memory_settings(memory):
        handle_error(logger, Texts.MEMORY_WRONG_FORMAT,
                     Texts.MEMORY_WRONG_FORMAT)
        sys.exit(1)

    configuration = NAUTAConfigMap()

    if configuration.minimal_node_memory_amount and \
            convert_k8s_memory_resource(configuration.minimal_node_memory_amount) > \
            convert_k8s_memory_resource(memory):
        error_message = Texts.MEMORY_SETTINGS_TOO_LOW.format(
            memory_value=configuration.minimal_node_memory_amount)
        handle_error(logger, error_message, error_message)
        sys.exit(1)

    if configuration.minimal_node_cpu_number and \
            convert_k8s_cpu_resource(configuration.minimal_node_cpu_number) > \
            convert_k8s_cpu_resource(cpu):
        error_message = Texts.CPU_SETTINGS_TOO_LOW.format(
            cpu_value=configuration.minimal_node_cpu_number)
        handle_error(logger, error_message, error_message)
        sys.exit(1)

    config_file_location = os.path.join(Config().config_path,
                                        NODE_CONFIG_FILENAME)

    if not os.path.isfile(config_file_location):
        handle_error(logger, Texts.MISSING_CONFIG_FILE,
                     Texts.MISSING_CONFIG_FILE)
        sys.exit(1)

    with open(config_file_location, 'r+', encoding='utf-8') as config_file, \
            spinner(text=Texts.CONFIG_UPDATE):
        # safe_load is used on purpose: yaml.load() without an explicit Loader is rejected
        # by PyYAML >= 6 and constructs arbitrary objects in old versions.
        config_file_content = yaml.safe_load(config_file)

        # an empty file is loaded as None and a malformed one may be a list or a scalar -
        # neither can be queried for the fields below
        if not isinstance(config_file_content, dict):
            handle_error(logger, Texts.CONFIG_FILE_INCORRECT,
                         Texts.CONFIG_FILE_INCORRECT)
            sys.exit(1)

        cpu_number = str(config_file_content.get(CPU_NUMBER_FIELDNAME))
        memory_amount = str(config_file_content.get(MEMORY_AMOUNT_FIELDNAME))
        cpu_system_required = str(
            config_file_content.get(CPU_SYSTEM_REQUIRED_FIELDNAME))
        memory_system_required = str(
            config_file_content.get(MEMORY_SYSTEM_REQUIRED_FIELDNAME))

        # str() turns a missing field into "None"
        if not cpu_number or cpu_number == "None" or not memory_amount or memory_amount == "None":
            handle_error(logger, Texts.CONFIG_FILE_INCORRECT,
                         Texts.CONFIG_FILE_INCORRECT)
            sys.exit(1)

        try:
            override_values_in_packs(
                new_cpu_number=cpu,
                new_memory_amount=memory,
                current_cpu_number=cpu_number,
                current_mem_amount=memory_amount,
                cpu_system_required=cpu_system_required,
                mem_system_required=memory_system_required)
        except Exception:
            logger.exception(Texts.ERROR_DURING_UPDATE)
            handle_error(logger, Texts.ERROR_DURING_UPDATE,
                         Texts.ERROR_DURING_UPDATE)
            sys.exit(1)

        # the file was opened in 'r+' mode - rewind and drop the old content before
        # the updated one is written
        config_file.seek(0)
        config_file.truncate()
        config_file_content[CPU_NUMBER_FIELDNAME] = cpu
        config_file_content[MEMORY_AMOUNT_FIELDNAME] = memory

        yaml.dump(config_file_content,
                  config_file,
                  default_flow_style=False,
                  explicit_start=True)

    click.echo(Texts.SUCCESS_MESSAGE)
def launch(state: State, name: str, model_location: str,
           local_model_location: str, model_name: str,
           pack_param: List[Tuple[str, str]], requirements: str):
    """
    Starts a new prediction instance that can be used for performing prediction, classification and
    regression tasks on trained model.

    After the instance is started, a table with its name, model location and status is printed,
    followed by the url of the instance and an authorization header. The command exits with
    code 1 if no model location is given, if the instance cannot be started or if its url
    cannot be obtained.

    :param state: state of the command; its verbosity decides whether a hint about verbosity
                  is added to error messages
    :param name: name of the instance; generated from the model name if not given
    :param model_location: location of a model; takes precedence over local_model_location
                           when the model name is derived
    :param local_model_location: local directory with a model, validated if given
    :param model_name: name of the model; the last component of the model path if not given
    :param pack_param: pack parameters, passed to start_inference_instance
    :param requirements: passed to start_inference_instance
    """
    if not (model_location or local_model_location):
        handle_error(user_msg=Texts.MISSING_MODEL_LOCATION_ERROR_MSG.format(
            local_model_location=local_model_location))
        exit(1)

    if local_model_location:
        validate_local_model_location(local_model_location)

    click.echo('Submitting prediction instance.')
    try:
        model_path = (model_location or local_model_location).rstrip('/')
        model_name = model_name or os.path.basename(model_path)
        name = name or generate_name(name=model_name,
                                     prefix=INFERENCE_INSTANCE_PREFIX)
        inference_instance = start_inference_instance(
            name=name,
            model_location=model_location,
            model_name=model_name,
            local_model_location=local_model_location,
            requirements=requirements,
            pack_params=pack_param)
        if inference_instance.state == RunStatus.FAILED:
            raise RuntimeError('Inference instance submission failed.')
    except Exception:
        handle_error(logger,
                     Texts.INSTANCE_START_ERROR_MSG,
                     Texts.INSTANCE_START_ERROR_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)

    instance_row = [
        inference_instance.cli_representation.name,
        model_location,
        inference_instance.cli_representation.status,
    ]
    click.echo(
        tabulate([instance_row],
                 headers=Texts.TABLE_HEADERS,
                 tablefmt=TBLT_TABLE_FORMAT))

    try:
        namespace = get_kubectl_current_context_namespace()
        authorization_header = get_authorization_header(
            service_account_name=name, namespace=namespace)
        inference_instance_url = get_inference_instance_url(
            inference_instance=inference_instance, model_name=model_name)
        click.echo(
            Texts.INSTANCE_INFO_MSG.format(
                inference_instance_url=inference_instance_url,
                authorization_header=authorization_header))
    except Exception:
        handle_error(logger,
                     Texts.INSTANCE_URL_ERROR_MSG,
                     Texts.INSTANCE_URL_ERROR_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)
def validate_local_model_location(local_model_location: str):
    """
    Check that local_model_location points at an existing directory.

    If it doesn't, an error message is reported and the command exits with code 2.

    :param local_model_location: path given by the user
    """
    if os.path.isdir(local_model_location):
        return

    handle_error(user_msg=Texts.MODEL_DIR_NOT_FOUND_ERROR_MSG.format(
        local_model_location=local_model_location))
    exit(2)
def cancel(ctx: click.Context, name: str, match: str, purge: bool, pod_ids: str, pod_status: str,
           listed_runs_kinds: List[RunKinds] = None):
    """
    Cancel - or, with purge, remove - runs selected by a name or by a regular expression.

    If pod_ids or pod_status is given, the request is passed to cancel_pods_mode and the command
    exits. Otherwise the matching runs are listed, the user is asked for a confirmation (unless
    ctx.obj.force is set) and the runs are processed experiment by experiment. The command exits
    with code 1 if any run could not be processed.

    :param ctx: click context; ctx.obj.force set to a true value skips the confirmation prompt
    :param name: name of an experiment; if no experiment of a listed kind has this name, it is
                 matched exactly against run names. Mutually exclusive with match
    :param match: regular expression passed as a run name filter. Mutually exclusive with name
    :param purge: if True all matching runs are purged, otherwise only queued and running runs
                  are cancelled
    :param pod_ids: comma separated names of pods - switches the command to the pods mode
    :param pod_status: status of pods - switches the command to the pods mode
    :param listed_runs_kinds: kinds of runs that are taken into account, training and jupyter
                              runs if not given
    """

    def _operation_word(delete_form, cancel_form):
        # Pods are "deleted", anything else is "cancelled".
        if experiment_name_plural == 'pods':
            return Texts.DELETE_OPERATION[delete_form]
        return Texts.CANCEL_OPERATION[cancel_form]

    def _message(template, delete_form="deleted", cancel_form="cancelled"):
        # Fills in the two placeholders shared by all list headers and prompts.
        return template.format(experiment_name_plural=experiment_name_plural,
                               operation_word=_operation_word(delete_form, cancel_form))

    def _echo_run_names(runs):
        for listed_run in runs:
            click.echo(f" - {listed_run.name}")

    listed_runs_kinds = listed_runs_kinds or [RunKinds.TRAINING, RunKinds.JUPYTER]

    # exactly one of name / match has to be given
    if name and match:
        handle_error(user_msg=Texts.NAME_M_BOTH_GIVEN_ERROR_MSG)
        exit(1)

    if not (name or match):
        handle_error(user_msg=Texts.NAME_M_NONE_GIVEN_ERROR_MSG)
        exit(1)

    current_namespace = get_current_namespace()

    if pod_ids or pod_status:
        if not name:
            name = match

        cancel_pods_mode(namespace=current_namespace, run_name=name,
                         pod_ids=pod_ids, pod_status=pod_status)
        exit(0)

    # check whether name points at an experiment of one of the listed kinds
    exp_to_be_cancelled = None
    if name:
        found_experiment = Experiment.get(namespace=current_namespace, name=name)
        found_kind = None
        if found_experiment:
            found_kind = RunKinds(found_experiment.metadata['labels'].get('runKind'))
        if found_kind in listed_runs_kinds:
            exp_to_be_cancelled = found_experiment

    if exp_to_be_cancelled:
        # runs are looked up by the experiment they belong to
        runs_query = {'exp_name_filter': [name]}
    elif name:
        # no such experiment - the name has to match a run name exactly
        runs_query = {'name_filter': f"^{name}$"}
    else:
        runs_query = {'name_filter': match}

    applicable_states = [RunStatus.QUEUED, RunStatus.RUNNING]
    if purge:
        applicable_states += [RunStatus.FAILED, RunStatus.COMPLETE, RunStatus.CANCELLED]

    all_runs = None
    try:
        all_runs = Run.list(namespace=current_namespace,
                            run_kinds_filter=listed_runs_kinds,
                            **runs_query)
    except Exception:
        listing_error = Texts.LIST_RUNS_ERROR_MSG.format(
            experiment_name_plural=experiment_name_plural)
        handle_error(logger, listing_error, listing_error)
        exit(1)

    # Handle cancellation of experiments with no associated Runs
    if exp_to_be_cancelled and not all_runs:
        cancel_uninitialized_experiment(experiment=exp_to_be_cancelled,
                                        purge=purge)

    if not all_runs:
        handle_error(user_msg=Texts.LACK_OF_EXPERIMENTS_ERROR_MSG.format(
            experiment_name_plural=experiment_name_plural,
            experiment_name=experiment_name))
        exit(1)
    elif not purge:
        active_runs = [
            run for run in all_runs
            if run.state in [RunStatus.QUEUED, RunStatus.RUNNING]
        ]
        if not active_runs:
            handle_error(
                user_msg=Texts.LACK_OF_EXP_TO_BE_CANCELLED_ERROR_MSG.format(
                    experiment_name_plural=experiment_name_plural))
            exit(1)

    # split the runs into those that will be processed and those that are skipped
    runs_to_delete: List[Run] = []
    skipped_run_names: List[str] = []

    if purge:
        runs_to_delete = all_runs
        click.echo(_message(Texts.WILL_BE_PURGED_LIST_HEADER))
        _echo_run_names(runs_to_delete)
    else:
        for run in all_runs:
            if run.state in applicable_states:
                runs_to_delete.append(run)
            else:
                skipped_run_names.append(run.name)

        if not runs_to_delete:
            handle_error(user_msg=_message(Texts.EXPERIMENTS_ALREADY_CANCELLED_ERROR_MSG))
            exit(1)
        elif len(runs_to_delete) != len(all_runs):
            click.echo(_message(Texts.ALREADY_CANCELLED_LIST_HEADER))
            for skipped_name in skipped_run_names:
                click.echo(f" - {skipped_name}")
            click.echo(_message(Texts.CAN_BE_CANCELLED_LIST_HEADER))
            _echo_run_names(runs_to_delete)
        else:
            click.echo(_message(Texts.WILL_BE_CANCELLED_LIST_HEADER))
            _echo_run_names(runs_to_delete)

    # the prompt is shown only if the operation is not forced
    if not (ctx.obj.force or click.confirm(
            _message(Texts.CONFIRM_CANCEL_MSG, "deletion", "cancellation"))):
        handle_error(user_msg=_message(Texts.CANCELLATION_ABORTED_MSG,
                                       "deletion", "cancellation"))
        exit(0)

    # group runs by experiments
    exp_with_runs: defaultdict = defaultdict(list)
    for run in runs_to_delete:
        exp_with_runs[run.experiment_name].append(run)

    deleted_runs = []
    not_deleted_runs = []

    def _process_experiments(operation):
        # Runs of an experiment whose processing raised are all reported as not deleted.
        for exp_name, exp_runs in exp_with_runs.items():
            try:
                succeeded, failed = operation(exp_name, exp_runs)
                deleted_runs.extend(succeeded)
                not_deleted_runs.extend(failed)
            except Exception:
                handle_error(logger, Texts.OTHER_CANCELLING_ERROR_MSG)
                not_deleted_runs.extend(exp_runs)

    if purge:
        # Connect to elasticsearch in order to purge run logs
        es_host = (f'{get_kubectl_host(with_port=True)}'
                   f'/api/v1/namespaces/nauta/services/nauta-elasticsearch:nauta/proxy')
        es_client = K8sElasticSearchClient(
            host=es_host,
            verify_certs=False,
            use_ssl=True,
            headers={'Authorization': get_api_key()})
        _process_experiments(
            lambda exp_name, exp_runs: purge_experiment(
                exp_name=exp_name,
                runs_to_purge=exp_runs,
                namespace=current_namespace,
                k8s_es_client=es_client))
    else:
        _process_experiments(
            lambda exp_name, exp_runs: cancel_experiment(
                exp_name=exp_name,
                runs_to_cancel=exp_runs,
                namespace=current_namespace))

    if deleted_runs:
        click.echo(_message(Texts.SUCCESSFULLY_CANCELLED_LIST_HEADER))
        _echo_run_names(deleted_runs)

    if not_deleted_runs:
        click.echo(_message(Texts.FAILED_TO_CANCEL_LIST_HEADER))
        _echo_run_names(not_deleted_runs)
        sys.exit(1)
def list_unitialized_experiments_in_cli(verbosity_lvl: int, all_users: bool,
                                        name: str, headers: List[str],
                                        listed_runs_kinds: List[RunKinds] = None,
                                        count: int = None, brief: bool = False):
    """
    Print a table with experiments in the CREATING state that have no associated runs.

    :param verbosity_lvl: verbosity level; a hint about verbosity is added to error messages
                          when it equals 0
    :param all_users: if True experiments of all users are listed, otherwise only those from
                      the namespace of the current kubectl context
    :param name: regular expression used as a name filter for both experiments and runs
    :param headers: headers of the printed table
    :param listed_runs_kinds: kinds of runs that are taken into account, training and jupyter
                              runs if not given
    :param count: number of last rows to print; all rows are printed if not given
    :param brief: not used by this function
    """
    if not listed_runs_kinds:
        listed_runs_kinds = [RunKinds.TRAINING, RunKinds.JUPYTER]

    try:
        if all_users:
            namespace = None
        else:
            namespace = get_kubectl_current_context_namespace()

        creating_experiments = Experiment.list(
            namespace=namespace,
            state=ExperimentStatus.CREATING,
            run_kinds_filter=listed_runs_kinds,
            name_filter=name)
        runs = Run.list(namespace=namespace,
                        name_filter=name,
                        run_kinds_filter=listed_runs_kinds)

        # Get Experiments without associated Runs
        experiments_with_runs = {run.experiment_name for run in runs}
        uninitialized_experiments = [
            experiment for experiment in creating_experiments
            if experiment.name not in experiments_with_runs
        ]

        rows_limit = count or len(uninitialized_experiments)
        all_rows = [
            uninitialized_experiment_cli_representation(experiment)
            for experiment in uninitialized_experiments
        ]
        click.echo(
            tabulate(all_rows[-rows_limit:],
                     headers=headers,
                     tablefmt="orgtbl"))
    except InvalidRegularExpressionError:
        handle_error(logger,
                     Texts.INVALID_REGEX_ERROR_MSG,
                     Texts.INVALID_REGEX_ERROR_MSG,
                     add_verbosity_msg=verbosity_lvl == 0)
        exit(1)
    except Exception:
        handle_error(logger,
                     Texts.OTHER_ERROR_MSG,
                     Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=verbosity_lvl == 0)
        exit(1)