def purge_user(username: str):
    """
    Removes all system's artifacts that belong to a removed user.
    K8s objects are removed during removal of a namespace.

    :param username: name of a user for which artifacts should be removed
    It throws exception in case of any problems detected during removal
    of a user
    """
    try:
        # remove data from elasticsearch - ES is reached through the
        # kubectl API-server proxy endpoint, authenticated with the API key
        with spinner(text=TextsDel.DELETION_DELETING_USERS_EXPERIMENTS):
            es_client = K8sElasticSearchClient(
                host=f'{get_kubectl_host(with_port=True)}'
                     f'/api/v1/namespaces/nauta/services/nauta-elasticsearch:nauta/proxy',
                verify_certs=False, use_ssl=True,
                headers={'Authorization': get_api_key()})
            es_client.delete_logs_for_namespace(username)
        # remove data from git repo manager - reached through a local
        # tunnel created by K8sProxy
        with k8s_proxy_context_manager.K8sProxy(NAUTAAppNames.GIT_REPO_MANAGER) as proxy,\
                spinner(text=TextsDel.DELETION_DELETING_USERS_REPOSITORY):
            grm_client = GitRepoManagerClient(host='127.0.0.1', port=proxy.tunnel_port)
            grm_client.delete_nauta_user(username=username)
    except K8sProxyCloseError as exe:
        # proxy teardown failure is reported separately from data-removal errors
        logger.exception("Error during closing of a proxy.")
        raise exe
    except Exception as exe:
        logger.exception(f"Error during removal of {username} user data")
        raise exe
def delete_user(username: str):
    """
    Removes a user with all his/her objects.

    :param username: name of a user to be deleted
    Throws an exception in case of any errors.
    """
    # Each deletion step gets its own progress spinner: first the user's
    # namespace (which takes the K8s objects with it), then the helm release.
    deletion_steps = (
        (TextsDel.DELETION_DELETING_NAMESPACE, delete_namespace, {}),
        (TextsDel.DELETION_DELETING_USERS_OBJECTS, delete_helm_release, {'purge': True}),
    )
    for progress_text, remove, extra_kwargs in deletion_steps:
        with spinner(text=progress_text):
            remove(username, **extra_kwargs)
def tensorboard(state: State, no_launch: bool,
                tensorboard_service_client_port: Optional[int],
                port_number: Optional[int], experiment_name: List[str]):
    """
    Subcommand for launching tensorboard with credentials.

    Creates a tensorboard instance through the tensorboard service (reached
    via a K8s proxy tunnel), then polls until it is RUNNING and opens it
    through a browser proxy, or exits with an error on timeout.
    """
    current_namespace = get_kubectl_current_context_namespace()

    with spinner(Texts.TB_WAITING_MSG) as proxy_spinner, \
            K8sProxy(nauta_app_name=NAUTAAppNames.TENSORBOARD_SERVICE,
                     app_name='tensorboard-service',
                     namespace=current_namespace,
                     port=tensorboard_service_client_port) as proxy:
        tensorboard_service_client = TensorboardServiceClient(
            address=f'http://127.0.0.1:{proxy.tunnel_port}')

        requested_runs = build_tensorboard_run_list(
            exp_list=experiment_name, current_namespace=current_namespace)

        # noinspection PyBroadException
        try:
            tb = tensorboard_service_client.create_tensorboard(requested_runs)
            if tb.invalid_runs:
                list_of_invalid_runs = ', '.join([
                    f'{item.get("owner")}/{item.get("name")}'
                    for item in tb.invalid_runs
                ])
                click.echo(
                    Texts.TB_INVALID_RUNS_MSG.format(
                        invalid_runs=list_of_invalid_runs))
        except Exception as exe:
            err_message = Texts.TB_CREATE_ERROR_MSG
            if hasattr(
                    exe, 'error_code'
            ) and exe.error_code == HTTPStatus.UNPROCESSABLE_ENTITY:  # type: ignore
                err_message = str(exe)
            handle_error(logger,
                         err_message,
                         err_message,
                         add_verbosity_msg=state.verbosity == 0)
            sys.exit(1)

        for _ in range(TENSORBOARD_TRIES_COUNT):
            # BUG FIX: previously the polled value was assigned straight to
            # `tb`; when the service returned nothing, `tb` became None and
            # the next iteration crashed on `tb.id`. Keep the last good
            # handle and only replace it on a successful poll.
            # noinspection PyTypeChecker
            # tb.id is str
            polled_tb = tensorboard_service_client.get_tensorboard(tb.id)
            if not polled_tb:
                continue
            tb = polled_tb
            if tb.status == TensorboardStatus.RUNNING:
                proxy_spinner.hide()
                launch_app_with_proxy(k8s_app_name=NAUTAAppNames.TENSORBOARD,
                                      no_launch=no_launch,
                                      namespace=current_namespace,
                                      port=port_number,
                                      app_name=f"tensorboard-{tb.id}")
                return
            logger.warning(
                Texts.TB_WAITING_FOR_TB_MSG.format(
                    tb_id=tb.id, tb_status_value=tb.status.value))
            sleep(TENSORBOARD_CHECK_BACKOFF_SECONDS)

        click.echo(Texts.TB_TIMEOUT_ERROR_MSG)
        sys.exit(2)
def save_logs_to_file(logs_generator: Generator[LogEntry, None, None],
                      instance_name: str, instance_type: str):
    """
    Write the log entries produced by *logs_generator* to <instance_name>.log,
    asking the user for confirmation first (unless --force was given).
    Blank-only entries are skipped. Exits with code 1 on write errors.
    """
    target_file = instance_name + ".log"
    if os.path.isfile(target_file):
        prompt = Texts.LOGS_STORING_CONF_FILE_EXISTS.format(
            filename=target_file,
            instance_name=instance_name,
            instance_type=instance_type)
    else:
        prompt = Texts.LOGS_STORING_CONF.format(
            filename=target_file,
            instance_name=instance_name,
            instance_type=instance_type)

    forced = click.get_current_context().obj.force
    if not (forced or click.confirm(prompt, default=True)):
        click.echo(Texts.LOGS_STORING_CANCEL_MESSAGE)
        return

    try:
        with open(target_file, 'w') as output, spinner(
                spinner=NctlSpinner,
                text=Texts.SAVING_LOGS_TO_FILE_PROGRESS_MSG,
                color=SPINNER_COLOR):
            for entry in logs_generator:
                if entry.content.isspace():
                    continue
                stamp = format_log_date(entry.date)
                output.write(f'{stamp} {entry.pod_name} {entry.content}')
        click.echo(Texts.LOGS_STORING_FINAL_MESSAGE)
    except Exception:
        handle_error(logger, Texts.LOGS_STORING_ERROR, Texts.LOGS_STORING_ERROR)
        exit(1)
def status(ctx: click.Context, username: str):
    """
    Returns status of a model.

    :param username: if given - searches for models of that user, otherwise
        the current kubectl context namespace is used
    """
    try:
        namespace = username if username else get_kubectl_current_context_namespace()

        with spinner(text=Texts.LOAD_DATA_MSG):
            # workflows labelled type=build-workflow only build images for
            # training jobs, so they are filtered out of the listing
            model_workflows: List[ArgoWorkflow.ArgoWorkflowCliModel] = [
                wf.cli_representation
                for wf in ArgoWorkflow.list(namespace=namespace,
                                            label_selector="type!=build-workflow")
            ]

        click.echo(tabulate(model_workflows, headers=MODEL_HEADERS,
                            tablefmt=TBLT_TABLE_FORMAT))
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=True)
        exit(1)
def save_logs_to_file(run: Run, run_logs_generator: Generator[LogEntry, None, None],
                      instance_type: str):
    """
    Write the log entries of the given run to a <run name>.log file, asking
    the user for confirmation first. Blank-only entries are skipped.
    Exits with code 1 on write errors.
    """
    filename = run.name + '.log'
    confirmation_message = Texts.LOGS_STORING_CONFIRMATION.format(filename=filename,
                                                                  experiment_name=run.name,
                                                                  instance_type=instance_type)
    if os.path.isfile(filename):
        confirmation_message = Texts.LOGS_STORING_CONFIRMATION_FILE_EXISTS.format(
            filename=filename, experiment_name=run.name, instance_type=instance_type)

    if click.confirm(confirmation_message, default=True):
        try:
            with open(filename, 'w') as file, spinner(spinner=NctlSpinner,
                                                      text=Texts.SAVING_LOGS_TO_FILE_PROGRESS_MSG,
                                                      color=SPINNER_COLOR):
                for log_entry in run_logs_generator:
                    if not log_entry.content.isspace():
                        formatted_date = format_log_date(log_entry.date)
                        file.write(f'{formatted_date} {log_entry.pod_name} {log_entry.content}')
            click.echo(Texts.LOGS_STORING_FINAL_MESSAGE)
        except Exception as exe:
            # BUG FIX: generic exceptions have no `message` attribute in
            # Python 3 - `exe.message` raised AttributeError and masked the
            # real error. Use str(exe) instead.
            handle_error(logger,
                         Texts.LOGS_STORING_ERROR.format(exception_message=str(exe)),
                         Texts.LOGS_STORING_ERROR.format(exception_message=str(exe)))
            exit(1)
    else:
        click.echo(Texts.LOGS_STORING_CANCEL_MESSAGE)
def upgrade(ctx: click.Context):
    """
    Upgrade users after Nauta upgrade.

    Ensures every Nauta user has an account in the git repo manager,
    creating any that are missing.
    """
    with spinner(text=Texts.UPGRADE_IN_PROGRESS):
        # noinspection PyBroadException
        try:
            # noinspection PyTypeChecker
            existing_users: List[User] = User.list()
            with K8sProxy(NAUTAAppNames.GIT_REPO_MANAGER,
                          number_of_retries_wait_for_readiness=60) as proxy:
                grm_client = GitRepoManagerClient(host='127.0.0.1',
                                                  port=proxy.tunnel_port)
                # create GRM accounts only for users that don't have one yet
                for nauta_user in existing_users:
                    if not grm_client.get_user(nauta_user.name):
                        grm_client.add_nauta_user(nauta_user.name)
        except Exception:
            handle_error(logger, Texts.UPGRADE_FAILED, Texts.UPGRADE_FAILED,
                         add_verbosity_msg=ctx.obj.verbosity == 0)
            sys.exit(1)

    click.echo(Texts.UPGRADE_SUCCEEDED)
def status(state: State, model_name: str, status: PodPhase, username: str):
    """
    Returns status of a model.

    :param model_name: name of a model data of which should be displayed
    :param status: status of a model step that should be displayed
    :param username: if checked - searches for model for a certain user
    """
    try:
        # without an explicit username fall back to the caller's namespace
        if not username:
            namespace = get_kubectl_current_context_namespace()
        else:
            namespace = username

        with spinner(text=Texts.LOAD_DATA_MSG):
            workflow: ArgoWorkflow = ArgoWorkflow.get(namespace=namespace, name=model_name)

        if not workflow:
            click.echo(Texts.MODEL_NOT_FOUND.format(model_name=model_name))
            exit(0)

        click.echo('\nOperation details:\n')
        click.echo(tabulate([workflow.cli_representation], headers=MODEL_HEADERS,
                            tablefmt=TBLT_TABLE_FORMAT))
        click.echo('\nOperation steps:\n')
        if workflow.steps:
            # show only the steps matching the requested phase
            # (all steps when no status filter was given)
            click.echo(tabulate([step.cli_representation
                                 for step in workflow.steps
                                 if status is None or status == step.phase],
                                headers=STEP_HEADERS, tablefmt=TBLT_TABLE_FORMAT))
        else:
            click.echo(Texts.LACK_OF_STEPS)
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=True)
        exit(1)
def stream(state: State, name: str, data: str, method_verb: InferenceVerb):
    """
    Perform stream inference task on launched prediction instance.

    :param name: name of the inference instance (Run) to call
    :param data: path to a JSON file with the request payload
    :param method_verb: inference verb appended to the instance URL
    """
    method_verb = InferenceVerb(method_verb)
    try:
        namespace = get_kubectl_current_context_namespace()
        # TODO: check if kind field of inference instance Run is correct
        inference_instance = Run.get(name=name, namespace=namespace)
        if not inference_instance:
            handle_error(user_msg=Texts.INSTANCE_NOT_EXISTS_ERROR_MSG.format(
                name=name))
            exit(1)
        # only a RUNNING instance can accept inference requests
        if not inference_instance.state == RunStatus.RUNNING:
            handle_error(user_msg=Texts.INSTANCE_NOT_RUNNING_ERROR_MSG.format(
                name=name, running_code=RunStatus.RUNNING.value))
            exit(1)
        inference_instance_url = get_inference_instance_url(
            inference_instance=inference_instance)
        stream_url = f'{inference_instance_url}:{method_verb.value}'
    except Exception:
        handle_error(logger,
                     Texts.INSTANCE_GET_FAIL_ERROR_MSG.format(name=name),
                     Texts.INSTANCE_GET_FAIL_ERROR_MSG.format(name=name),
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)

    # load the request payload from the user-provided JSON file
    try:
        with open(data, 'r', encoding='utf-8') as data_file:
            stream_data = json.load(data_file)
    except (json.JSONDecodeError, IOError):
        handle_error(logger, Texts.JSON_LOAD_ERROR_MSG.format(data=data),
                     Texts.JSON_LOAD_ERROR_MSG.format(data=data))
        exit(1)

    try:
        api_key = get_api_key()
        headers = {
            'Authorization': api_key,
            'Accept': 'application/json',
            'Content-Type': 'application/json'
        }
        with spinner(text=Texts.WAITING_FOR_RESPONSE_MSG):
            stream_response = requests.post(
                stream_url,
                data=json.dumps(stream_data),
                # nosec - request to k8s cluster
                verify=False,
                headers=headers)
            stream_response.raise_for_status()
        click.echo(stream_response.text)
    except Exception as e:
        error_msg = Texts.INFERENCE_OTHER_ERROR_MSG.format(exception=e)
        if hasattr(e, 'response'):
            # requests exceptions carry the server response - append it
            error_msg += Texts.INFERENCE_ERROR_RESPONSE_MSG.format(
                response_text=e.response.text)  # type: ignore
        handle_error(logger, error_msg, error_msg)
        exit(1)
def install(state: State, template_name: str):
    """
    Install (or re-install) a template pack with the given name from the
    remote template repository into the local packs directory.
    """
    chart_file_location = os.path.join(Config.get_config_path(), "packs",
                                       template_name)

    with spinner(text=Texts.GETTING_LIST_OF_TEMPLATES_MSG):
        repository_name, access_token = get_repository_configuration()
        try:
            remote_templates = get_remote_templates(repository_name, access_token)
        except ExceptionWithMessage as e:
            click.echo(e.message)
            sys.exit(1)

    remote_template_counterpart = remote_templates.get(template_name)
    if not remote_template_counterpart:
        click.echo(Texts.REMOTE_TEMPLATE_NOT_FOUND.format(
            template_name=template_name))
        sys.exit(1)

    local_templates = get_local_templates()
    local_template_counterpart = local_templates.get(template_name)
    if local_template_counterpart:
        # template already installed locally - ask before overwriting
        # (abort=True exits the command on a negative answer)
        click.confirm(Texts.LOCAL_VERSION_ALREADY_INSTALLED.format(
            local_version=local_template_counterpart.local_version,
            template_name=local_template_counterpart.name,
            remote_version=remote_template_counterpart.remote_version),
            abort=True)

    # best-effort removal of the old copy - a missing directory is fine
    # noinspection PyBroadException
    try:
        shutil.rmtree(chart_file_location)
    except Exception:
        logger.exception("failed to remove local copy of template!")

    with spinner(text=Texts.DOWNLOADING_TEMPLATE):
        # NOTE(review): repository configuration is re-read here although it
        # was already fetched above - presumably redundant; verify before
        # removing the duplicate call.
        repository_name, access_token = get_repository_configuration()
        g = Github(repository_name, access_token)
        g.download_whole_directory(template_name, chart_file_location)

    click.echo("successfully installed!")
def update_resources_in_packs(cpu: str = None, memory: str = None):
    """
    Update CPU/memory resource settings in all installed packs and persist
    the effective values in the node configuration file.

    :param cpu: new cpu number; when None the current value from the node
        config file is kept
    :param memory: new memory amount; when None the current value from the
        node config file is kept
    Exits the process on a missing/incorrect config file or update errors.
    """
    config_file_location = os.path.join(Config().config_path, NODE_CONFIG_FILENAME)
    if not os.path.isfile(config_file_location):
        handle_error(logger, Texts.MISSING_CONFIG_FILE, Texts.MISSING_CONFIG_FILE)
        sys.exit(1)

    with open(config_file_location, 'r+', encoding='utf-8') as config_file, \
            spinner(text=Texts.CONFIG_UPDATE):
        config_file_content = yaml.safe_load(config_file)
        cpu_number = str(config_file_content.get(CPU_NUMBER_FIELDNAME))
        memory_amount = str(config_file_content.get(MEMORY_AMOUNT_FIELDNAME))
        cpu_system_required_min = str(
            config_file_content.get(CPU_SYSTEM_REQUIRED_MIN_FIELDNAME))
        cpu_system_required_percent = str(
            config_file_content.get(CPU_SYSTEM_REQUIRED_PERCENT_FIELDNAME))
        memory_system_required_min = str(
            config_file_content.get(MEMORY_SYSTEM_REQUIRED_MIN_FIELDNAME))
        memory_system_required_percent = str(
            config_file_content.get(MEMORY_SYSTEM_REQUIRED_PERCENT_FIELDNAME))

        if not cpu_number or cpu_number == "None" or not memory_amount or memory_amount == "None":
            handle_error(logger, Texts.CONFIG_FILE_INCORRECT, Texts.CONFIG_FILE_INCORRECT)
            sys.exit(1)

        # fall back to the currently configured values when no override given
        new_cpu = cpu if cpu else cpu_number
        new_memory = memory if memory else memory_amount

        try:
            override_values_in_packs(
                new_cpu_number=new_cpu,
                new_memory_amount=new_memory,
                current_cpu_number=cpu_number,
                current_mem_amount=memory_amount,
                cpu_system_required_min=cpu_system_required_min,
                cpu_system_required_percent=cpu_system_required_percent,
                mem_system_required_min=memory_system_required_min,
                mem_system_required_percent=memory_system_required_percent)
        except Exception:
            logger.exception(Texts.ERROR_DURING_UPDATE)
            handle_error(logger, Texts.ERROR_DURING_UPDATE, Texts.ERROR_DURING_UPDATE)
            sys.exit(1)

        # BUG FIX: the file was rewritten only when BOTH values changed
        # (`and`), and the raw `cpu`/`memory` arguments (possibly None) were
        # stored. Rewrite whenever EITHER value changed and persist the
        # effective values, keeping the file consistent with the packs.
        if new_cpu != cpu_number or new_memory != memory_amount:
            config_file.seek(0)
            config_file.truncate()
            config_file_content[CPU_NUMBER_FIELDNAME] = new_cpu
            config_file_content[MEMORY_AMOUNT_FIELDNAME] = new_memory
            yaml.safe_dump(config_file_content, config_file,
                           default_flow_style=False, explicit_start=True)
def launch_app(k8s_app_name: NAUTAAppNames, no_launch: bool = False,
               port: int = None, app_name: str = None,
               number_of_retries: int = 0, url_end: str = "",
               namespace: str = None):
    """
    Create a proxy tunnel to a web application running in the cluster,
    optionally open it in the local browser, and block until Ctrl-C.

    :param k8s_app_name: application to create the tunnel for
    :param no_launch: if True - only create the tunnel, don't open a browser
    :param port: preferred local port for the tunnel (may not be honoured)
    :param app_name: name of the k8s application
    :param number_of_retries: retries used while creating the proxy
    :param url_end: suffix appended to the forwarded URL
    :param namespace: namespace the application lives in
    Raises LaunchError or ProxyClosingError in case of problems.
    """
    try:
        with spinner(text=Texts.LAUNCHING_APP_MSG) as proxy_spinner, \
                K8sProxy(nauta_app_name=k8s_app_name, port=port,
                         app_name=app_name, number_of_retries=number_of_retries,
                         namespace=namespace) as proxy:
            url = FORWARDED_URL.format(proxy.tunnel_port, url_end)

            if k8s_app_name == NAUTAAppNames.INGRESS:
                # ingress expects the user's bearer token as a URL parameter
                config.load_kube_config()
                user_token = configuration.Configuration().api_key.get(
                    'authorization')
                prepared_user_token = user_token.replace('Bearer ', '')
                url = f'{url}?token={prepared_user_token}'

            if not no_launch:
                if is_gui_browser_available():
                    # wait until the tunnel actually serves before opening
                    wait_for_connection(url)
                    webbrowser.open_new(url)
                    proxy_spinner.stop()
                else:
                    click.echo(Texts.NO_WEB_BROWSER_ERROR_MSG)

            if port and port != proxy.tunnel_port:
                # requested local port was taken - inform about the real one
                click.echo(
                    Texts.CANNOT_USE_PORT.format(
                        required_port=port, random_port=proxy.tunnel_port))

            proxy_spinner.stop()
            click.echo(Texts.GO_TO_MSG.format(url=url))
            click.echo(Texts.PROXY_CREATED_MSG)
            # keep the tunnel open until the user interrupts
            wait_for_ctrl_c()
    except K8sProxyCloseError:
        err_message = Texts.PROXY_CLOSE_ERROR_MSG.format(app_name=k8s_app_name)
        raise ProxyClosingError(err_message)
    except LocalPortOccupiedError as exe:
        err_message = Texts.PROXY_CREATED_EXTENDED_ERROR_MSG.format(
            app_name=k8s_app_name, reason=exe.message)
        raise LaunchError(err_message)
    except K8sProxyOpenError:
        error_msg = Texts.PROXY_CREATED_ERROR_MSG.format(app_name=k8s_app_name)
        logger.exception(error_msg)
        raise LaunchError(error_msg)
    except LaunchError as e:
        raise e
    except Exception:
        err_message = Texts.WEB_APP_LAUCH_FAIL_MSG
        logger.exception(err_message)
        raise LaunchError(err_message)
def list_templates(state: State):
    """
    Display a table with all available experiment templates, preceded by
    any per-template error messages gathered while listing them.
    """
    with spinner(text=Texts.GETTING_LIST_OF_TEMPLATES_MSG):
        templates, problems = prepare_list_of_templates()

    # surface template-loading problems before printing the table
    for problem in problems:
        click.echo(problem)

    rendered_table = tabulate.tabulate(templates,
                                       headers=TEMPLATE_LIST_HEADERS,
                                       tablefmt="orgtbl")
    click.echo(rendered_table)
def update_configuration(run_folder: str, script_location: str,
                         script_parameters: Tuple[str, ...],
                         experiment_name: str, run_name: str,
                         local_registry_port: int, cluster_registry_port: int,
                         pack_type: str,
                         pack_params: List[Tuple[str, str]] = None,
                         script_folder_location: str = None,
                         env_variables: List[str] = None):
    """
    Updates configuration of a tf-training pack based on parameters given by a user.
    The following files are modified:
    - Dockerfile - name of a training script is replaced with the one given
      by a user; all additional files from experiment_folder are copied into
      an image (excluding files generated by draft)
    - charts/templates/job.yaml - list of arguments is replaced with those
      given by a user

    :return: in case of any errors it throws an exception with a description
        of a problem
    """
    log.debug("Update configuration - start")
    try:
        modify_values_yaml(run_folder, script_location, script_parameters,
                           pack_params=pack_params,
                           experiment_name=experiment_name,
                           run_name=run_name, pack_type=pack_type,
                           cluster_registry_port=cluster_registry_port,
                           env_variables=env_variables)
        with spinner(text=Texts.PREPARING_IMAGES_MSG.format(
                run_name=experiment_name)):
            modify_dockerfile(run_folder, script_location,
                              local_registry_port=local_registry_port,
                              script_folder_location=script_folder_location)
            # point draft at the locally tunnelled registry
            modify_draft_toml(run_folder,
                              registry=f'127.0.0.1:{local_registry_port}')
    except Exception as exe:
        log.exception("Update configuration - i/o error : {}".format(exe))
        raise RuntimeError(Texts.CONFIG_NOT_UPDATED) from exe

    log.debug("Update configuration - end")
def cancel(state: State, workflow_name: str):
    """
    Cancel (delete) the Argo workflow with the given name in the current
    context namespace. Exits with 0 when the workflow does not exist and
    with 1 on any error.
    """
    try:
        current_namespace = get_kubectl_current_context_namespace()
        target: ArgoWorkflow = ArgoWorkflow.get(name=workflow_name,
                                                namespace=current_namespace)
        if not target:
            click.echo(Texts.NOT_FOUND_MSG.format(workflow_name=workflow_name))
            exit(0)

        with spinner(text=Texts.PROGRESS_MSG.format(workflow_name=workflow_name)):
            target.delete()

        click.echo(Texts.SUCCESS_MSG.format(workflow_name=workflow_name))
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=True)
        exit(1)
def cancel_experiment_runs(runs_to_cancel: List[Run],
                           namespace: str) -> Tuple[List[Run], List[Run]]:
    """
    Cancel given list of Runs belonging to a single namespace.

    :param runs_to_cancel: Runs to be cancelled
    :param namespace: namespace where Run instances reside
    :return: tuple of list containing successfully cancelled Runs and list
        containing Runs that were not cancelled
    """
    deleted_runs = []
    not_deleted_runs = []
    try:
        for run in runs_to_cancel:
            logger.debug(f"Cancelling {run.name} run ...")
            # BUG FIX: the messages referenced an undefined local
            # `experiment_name` (NameError on first iteration) - take it from
            # the Run object instead (assumes Run exposes experiment_name -
            # TODO confirm against the Run model).
            click.echo(
                Texts.CANCELING_RUNS_START_MSG.format(
                    run_name=run.name, experiment_name=run.experiment_name))
            try:
                # if run status is cancelled - omit the following steps
                if run.state != RunStatus.CANCELLED:
                    with spinner(text=Texts.CANCEL_SETTING_STATUS_MSG.format(
                            run_name=run.name)):
                        delete_helm_release(release_name=run.name,
                                            namespace=namespace, purge=False)
                        # change a run state to CANCELLED
                        run.state = RunStatus.CANCELLED
                        run.end_timestamp = datetime.utcnow().strftime(
                            "%Y-%m-%dT%H:%M:%SZ")
                        run.update()
                deleted_runs.append(run)
            except Exception:
                logger.exception(
                    Texts.INCOMPLETE_CANCEL_ERROR_MSG.format(
                        run_name=run.name,
                        experiment_name=run.experiment_name))
                click.echo(
                    Texts.INCOMPLETE_CANCEL_ERROR_MSG.format(
                        run_name=run.name,
                        experiment_name=run.experiment_name))
                not_deleted_runs.append(run)
    except Exception:
        logger.exception("Error during cancelling experiments")
        return deleted_runs, not_deleted_runs
    return deleted_runs, not_deleted_runs
def purge_user(username: str):
    """
    Removes all system's artifacts that belong to a removed user.
    K8s objects are removed during removal of a namespace.

    :param username: name of a user for which artifacts should be removed
    It throws exception in case of any problems detected during removal
    of a user
    """
    try:
        # drop the user's experiment logs kept in elasticsearch,
        # reached through a local K8s proxy tunnel
        with k8s_proxy_context_manager.K8sProxy(NAUTAAppNames.ELASTICSEARCH) as es_proxy, \
                spinner(text=TextsDel.DELETION_DELETING_USERS_EXPERIMENTS):
            elasticsearch = K8sElasticSearchClient(
                host="127.0.0.1",
                port=es_proxy.tunnel_port,
                verify_certs=False,
                use_ssl=False)
            elasticsearch.delete_logs_for_namespace(username)
    except K8sProxyCloseError as exe:
        logger.exception("Error during closing of a proxy for elasticsearch.")
        raise exe
    except Exception as exe:
        logger.exception("Error during removal of data from elasticsearch")
        raise exe
def ctrl_c_handler_for_submit(sig, frame):
    """
    SIGINT/SIGTERM handler installed for the duration of experiment
    submission: best-effort rollback of a partially submitted experiment
    (runs, helm releases and the experiment object), then hard-kills all
    child processes and exits with code 1.
    """
    log.debug("ctrl-c pressed while submitting")
    try:
        with spinner(text=Texts.CTRL_C_PURGING_PROGRESS_MSG):
            if submitted_runs:
                for run in submitted_runs:
                    # per-run cleanup is best-effort - a failure for one run
                    # must not stop cleanup of the remaining ones
                    try:
                        # delete run
                        delete_k8s_object("run", run.name)
                        # purge helm release
                        delete_helm_release(run.name,
                                            namespace=submitted_namespace,
                                            purge=True)
                    except Exception:
                        log.exception(Texts.ERROR_WHILE_REMOVING_RUNS)
            delete_k8s_object("experiment", submitted_experiment)
    except Exception:
        log.exception(Texts.ERROR_WHILE_REMOVING_EXPERIMENT)
    # kill the whole process tree (e.g. port-forward/socat children)
    for proc in psutil.Process(os.getpid()).children(recursive=True):
        proc.send_signal(signal.SIGKILL)
    exit(1)
def submit(state: State, workflow_path: str):
    """
    Submit an Argo workflow described by a yaml file to the current context
    namespace and print its summary table.
    """
    try:
        parsed_workflow: ArgoWorkflow = ArgoWorkflow.from_yaml(workflow_path)
        current_namespace = get_kubectl_current_context_namespace()

        with spinner(text=Texts.PROGRESS_MSG):
            parsed_workflow.create(namespace=current_namespace)

        # Set namespace, to properly display owner in CLI
        parsed_workflow.namespace = current_namespace
        summary = tabulate([parsed_workflow.cli_representation],
                           headers=HEADERS, tablefmt=TBLT_TABLE_FORMAT)
        click.echo(summary)
    except IOError as load_error:
        load_error_msg = Texts.LOAD_SPEC_ERROR_MSG.format(msg=str(load_error))
        handle_error(logger, load_error_msg, load_error_msg)
        exit(1)
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=True)
        exit(1)
def delete(state: State, username: str, purge: bool):
    """
    Deletes a user with a name given as a parameter.

    :param username: name of a user that should be deleted
    :param purge: if set - command removes also all artifacts associated
        with a user
    """
    try:
        click.echo(Texts.DELETION_CHECK_PRESENCE)
        user_state = check_users_presence(username)

        if user_state == UserState.NOT_EXISTS:
            handle_error(user_msg=Texts.USER_NOT_EXISTS_ERROR_MSG.format(
                username=username))
            exit(1)

        if user_state == UserState.TERMINATING:
            handle_error(user_msg=Texts.USER_BEING_REMOVED_ERROR_MSG)
            exit(1)
    except Exception:
        handle_error(logger, Texts.USER_PRESENCE_VERIFICATION_ERROR_MSG,
                     Texts.USER_PRESENCE_VERIFICATION_ERROR_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)

    click.echo()
    if not click.confirm(Texts.DELETE_CONFIRM_MSG.format(username=username)):
        click.echo(Texts.DELETE_ABORT_MSG)
        exit(0)
    click.echo()

    try:
        click.echo(Texts.DELETION_START_DELETING)
        delete_user(username)
        # mark the user in the deletion config map so other components
        # know removal is in progress
        patch_config_map_data(name=USER_DEL_CM, namespace=NAUTA_NAMESPACE,
                              key=username, value="1")
        if purge:
            try:
                click.echo(Texts.DELETION_START_PURGING)
                # failure during purging a user doesn't mean that user wasn't deleted
                purge_user(username)
            except Exception:
                handle_error(logger, Texts.PURGE_ERROR_MSG, Texts.PURGE_ERROR_MSG)

        # CAN-616 - wait until user has been really deleted
        with spinner(text=Texts.DELETION_VERIFICATION_OF_DELETING) as user_del_spinner:
            # poll for up to 60 seconds; the for/else falls through to the
            # "still in progress" path when the user never disappears
            for i in range(60):
                user_state = check_users_presence(username)
                user_del_cm_content = get_config_map_data(
                    name=USER_DEL_CM, namespace=NAUTA_NAMESPACE,
                    request_timeout=1)
                if (not user_state or user_state == UserState.NOT_EXISTS) and \
                        (not user_del_cm_content or not user_del_cm_content.get(username)):
                    break
                time.sleep(1)
            else:
                user_del_spinner.stop()
                click.echo()
                click.echo(Texts.DELETE_IN_PROGRESS_MSG)
                exit(0)

        click.echo()
        click.echo(Texts.DELETE_SUCCESS_MSG.format(username=username))
    except K8sProxyCloseError:
        handle_error(logger, Texts.PROXY_ERROR_LOG_MSG,
                     Texts.PROXY_ERROR_USER_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_LOG_MSG,
                     Texts.OTHER_ERROR_USER_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)
def prepare_experiment_environment(experiment_name: str, run_name: str,
                                   local_script_location: str,
                                   script_parameters: Tuple[str, ...],
                                   pack_type: str, local_registry_port: int,
                                   cluster_registry_port: int,
                                   script_folder_location: str = None,
                                   pack_params: List[Tuple[str, str]] = None,
                                   env_variables: List[str] = None,
                                   requirements_file: str = None) -> PrepareExperimentResult:
    """
    Prepares draft's environment for a certain run based on provided parameters
    :param experiment_name: name of an experiment
    :param run_name: name of an experiment run
    :param local_script_location: location of a script used for training purposes on local machine
    :param script_folder_location: location of an additional folder used in training
    :param script_parameters: parameters passed to a script
    :param pack_type: type of a pack used to start training job
    :param local_registry_port: port on which docker registry is accessible locally
    :param cluster_registry_port: port on which docker registry is accessible within nauta cluster
    :param pack_params: additional pack params
    :param env_variables: environmental variables to be passed to training
    :param requirements_file: path to a file with experiment requirements
    :return: name of folder with an environment created for this run, a name of script used for
             training purposes and count of Pods
    In case of any problems - an exception with a description of a problem is thrown
    """
    log.debug(f'Prepare run {run_name} environment - start')
    run_folder = get_run_environment_path(run_name)
    try:
        # check environment directory
        check_run_environment(run_folder)
        with spinner(text=Texts.CREATING_ENVIRONMENT_MSG.format(run_name=run_name)):
            # create an environment
            create_environment(run_name, local_script_location, script_folder_location)
            # generate draft's data
            output, exit_code, log_output = cmd.create(working_directory=run_folder,
                                                       pack_type=pack_type)
            # copy requirements file if it was provided, create empty requirements file otherwise
            dest_requirements_file = os.path.join(run_folder, 'requirements.txt')
            if requirements_file:
                shutil.copyfile(requirements_file, dest_requirements_file)
            else:
                Path(dest_requirements_file).touch()
        if exit_code:
            raise SubmitExperimentError(
                Texts.DRAFT_TEMPLATES_NOT_GENERATED_ERROR_MSG.format(reason=log_output))

        # Script location on experiment container
        remote_script_location = Path(local_script_location).name \
            if local_script_location else ''

        if pack_type in JUPYTER_NOTEBOOK_TEMPLATES_NAMES and remote_script_location.endswith(".py"):
            # for interact (jupyter notebooks) try to convert .py file into .ipynb
            py_script_location = os.path.join(run_folder, FOLDER_DIR_NAME,
                                              remote_script_location)
            ipynb_file_name = convert_py_to_ipynb(py_script_location,
                                                  os.path.join(run_folder, FOLDER_DIR_NAME))
            local_script_location = ipynb_file_name

        # reconfigure draft's templates
        update_configuration(run_folder=run_folder,
                             script_location=remote_script_location,
                             script_parameters=script_parameters,
                             experiment_name=experiment_name,
                             run_name=run_name,
                             local_registry_port=local_registry_port,
                             cluster_registry_port=cluster_registry_port,
                             pack_type=pack_type,
                             pack_params=pack_params,
                             script_folder_location=script_folder_location,
                             env_variables=env_variables)

        pod_count = get_pod_count(run_folder=run_folder, pack_type=pack_type)
    except Exception as exe:
        # leave no partially prepared environment behind
        delete_environment(run_folder)
        raise SubmitExperimentError('Problems during creation of environments.') from exe
    log.debug(f'Prepare run {run_name} environment - finish')
    return PrepareExperimentResult(folder_name=run_folder,
                                   script_name=local_script_location,
                                   pod_count=pod_count)
def submit_experiment(template: str, name: str = None,
                      run_kind: RunKinds = RunKinds.TRAINING,
                      script_location: str = None,
                      script_parameters: Tuple[str, ...] = None,
                      pack_params: List[Tuple[str, str]] = None,
                      parameter_range: List[Tuple[str, str]] = None,
                      parameter_set: Tuple[str, ...] = None,
                      script_folder_location: str = None,
                      env_variables: List[str] = None,
                      requirements_file: str = None) -> (List[Run], Dict[str, str], str):
    """
    Submit an experiment: prepare environments for all of its runs, create
    the Experiment object and submit each Run through draft.

    :return: list of runs, a dict mapping failed run names to error messages,
        and the (possibly converted) script location
    Raises SubmitExperimentError/K8sProxyCloseError on failures.
    """
    script_parameters = script_parameters if script_parameters else ()
    parameter_set = parameter_set if parameter_set else ()
    parameter_range = parameter_range if parameter_range else []

    log.debug("Submit experiment - start")
    try:
        namespace = get_kubectl_current_context_namespace()
        # remembered for the Ctrl-C rollback handler
        global submitted_namespace
        submitted_namespace = namespace
    except Exception:
        message = Texts.GET_NAMESPACE_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    try:
        with spinner(text=Texts.PREPARING_RESOURCE_DEFINITIONS_MSG):
            experiment_name, labels = generate_exp_name_and_labels(
                script_name=script_location, namespace=namespace, name=name,
                run_kind=run_kind)
            runs_list = prepare_list_of_runs(experiment_name=experiment_name,
                                             parameter_range=parameter_range,
                                             parameter_set=parameter_set,
                                             template_name=template)
    except SubmitExperimentError as exe:
        log.exception(str(exe))
        raise exe
    except Exception:
        message = Texts.SUBMIT_PREPARATION_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    global submitted_experiment
    submitted_experiment = experiment_name

    # Ctrl-C handling
    signal.signal(signal.SIGINT, ctrl_c_handler_for_submit)
    signal.signal(signal.SIGTERM, ctrl_c_handler_for_submit)

    try:
        config = Config()

        # start port forwarding
        # noinspection PyBroadException
        with K8sProxy(NAUTAAppNames.DOCKER_REGISTRY,
                      port=config.local_registry_port) as proxy:
            # Save port that was actually used in configuration
            if proxy.tunnel_port != config.local_registry_port:
                config.local_registry_port = proxy.tunnel_port

            experiment_run_folders = []  # List of local directories used by experiment's runs
            try:
                # run socat if on Windows or Mac OS
                if get_current_os() in (OS.WINDOWS, OS.MACOS):
                    # noinspection PyBroadException
                    try:
                        with spinner(text=Texts.CLUSTER_CONNECTION_MSG):
                            socat.start(proxy.tunnel_port)
                    except Exception:
                        error_msg = Texts.LOCAL_DOCKER_TUNNEL_ERROR_MSG
                        log.exception(error_msg)
                        raise SubmitExperimentError(error_msg)

                cluster_registry_port = get_app_service_node_port(
                    nauta_app_name=NAUTAAppNames.DOCKER_REGISTRY)

                # prepare environments for all experiment's runs
                for experiment_run in runs_list:
                    if script_parameters and experiment_run.parameters:
                        current_script_parameters = script_parameters + experiment_run.parameters
                    elif script_parameters:
                        current_script_parameters = script_parameters
                    elif experiment_run.parameters:
                        current_script_parameters = experiment_run.parameters
                    else:
                        current_script_parameters = ""

                    run_folder, script_location, pod_count = \
                        prepare_experiment_environment(experiment_name=experiment_name,
                                                       run_name=experiment_run.name,
                                                       local_script_location=script_location,
                                                       script_folder_location=script_folder_location,  # noqa: E501
                                                       script_parameters=current_script_parameters,
                                                       pack_type=template,
                                                       pack_params=pack_params,
                                                       local_registry_port=proxy.tunnel_port,
                                                       cluster_registry_port=cluster_registry_port,
                                                       env_variables=env_variables,
                                                       requirements_file=requirements_file)

                    # Set correct pod count
                    if not pod_count or pod_count < 1:
                        raise SubmitExperimentError(
                            'Unable to determine pod count: make sure that values.yaml '
                            'file in your pack has podCount field with positive integer value.')
                    experiment_run.pod_count = pod_count

                    experiment_run_folders.append(run_folder)
                    script_name = None
                    if script_location is not None:
                        script_name = os.path.basename(script_location)

                    # Prepend script_name parameter to run description only for display purposes.
                    experiment_run.parameters = script_parameters if not experiment_run.parameters \
                        else experiment_run.parameters + script_parameters
                    if experiment_run.parameters and script_name:
                        experiment_run.parameters = (script_name, ) + experiment_run.parameters
                    elif script_name:
                        experiment_run.parameters = (script_name, )
            except SubmitExperimentError as e:
                log.exception(Texts.ENV_CREATION_ERROR_MSG)
                e.message += f' {Texts.ENV_CREATION_ERROR_MSG}'
                raise
            except Exception:
                # any error in this step breaks execution of this command
                message = Texts.ENV_CREATION_ERROR_MSG
                log.exception(message)
                # just in case - remove folders that were created with a success
                for experiment_run_folder in experiment_run_folders:
                    delete_environment(experiment_run_folder)
                # BUG FIX: the error was swallowed here and submission
                # continued with missing/partial run environments - abort
                # instead.
                raise SubmitExperimentError(message)

            # if ps or pr option is used - first ask whether experiment(s) should be submitted
            if parameter_range or parameter_set:
                click.echo(Texts.CONFIRM_SUBMIT_MSG)
                click.echo(tabulate({RUN_NAME: [run.name for run in runs_list],
                                     RUN_PARAMETERS: ["\n".join(run.parameters)
                                                      if run.parameters else ""
                                                      for run in runs_list]},
                                    headers=[RUN_NAME, RUN_PARAMETERS],
                                    tablefmt="orgtbl"))
                if not click.confirm(Texts.CONFIRM_SUBMIT_QUESTION_MSG, default=True):
                    for experiment_run_folder in experiment_run_folders:
                        delete_environment(experiment_run_folder)
                    exit()

            # create Experiment model
            # TODO template_name & template_namespace should be filled after Template implementation
            parameter_range_spec = [f'-pr {param_name} {param_value}'
                                    for param_name, param_value in parameter_range]
            parameter_set_spec = [f'-ps {ps_spec}' for ps_spec in parameter_set]
            experiment_parameters_spec = list(script_parameters) + parameter_range_spec + parameter_set_spec
            experiment = experiments_model.Experiment(
                name=experiment_name, template_name=template,
                parameters_spec=experiment_parameters_spec,
                template_namespace="template-namespace")
            experiment.create(namespace=namespace, labels=labels)

            # submit runs
            run_errors = {}
            for run, run_folder in zip(runs_list, experiment_run_folders):
                try:
                    run.state = RunStatus.QUEUED
                    with spinner(text=Texts.CREATING_RESOURCES_MSG.format(run_name=run.name)):
                        # Add Run object with runKind label and pack params as annotations
                        run.create(namespace=namespace,
                                   labels={'runKind': run_kind.value},
                                   annotations={pack_param_name: pack_param_value
                                                for pack_param_name, pack_param_value
                                                in pack_params})
                        submitted_runs.append(run)
                        submit_draft_pack(run_folder, namespace=namespace)
                except Exception as exe:
                    delete_environment(run_folder)
                    try:
                        run.state = RunStatus.FAILED
                        run_errors[run.name] = str(exe)
                        run.update()
                    except Exception as rexe:
                        # update of non-existing run may fail
                        log.debug(Texts.ERROR_DURING_PATCHING_RUN.format(str(rexe)))

            # Delete experiment if no Runs were submitted
            if not submitted_runs:
                click.echo(Texts.SUBMISSION_FAIL_ERROR_MSG)
                delete_k8s_object("experiment", experiment_name)

            # Change experiment status to submitted
            experiment.state = experiments_model.ExperimentStatus.SUBMITTED
            experiment.update()
    except LocalPortOccupiedError as exe:
        click.echo(exe.message)
        raise SubmitExperimentError(exe.message)
    except K8sProxyCloseError:
        log.exception('Error during closing of a proxy for a {}'.format(
            NAUTAAppNames.DOCKER_REGISTRY))
        raise K8sProxyCloseError(Texts.PROXY_CLOSE_ERROR_MSG)
    except K8sProxyOpenError:
        error_msg = Texts.PROXY_OPEN_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg)
    except SubmitExperimentError:
        raise
    except Exception as exe:
        error_msg = Texts.SUBMIT_OTHER_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg) from exe
    finally:
        with spinner(text=Texts.CLUSTER_CONNECTION_CLOSING_MSG):
            # noinspection PyBroadException
            try:
                socat.stop()
            except Exception:
                log.exception("Error during closing of a proxy for a local docker-host tunnel")
                raise K8sProxyCloseError(Texts.DOCKER_TUNNEL_CLOSE_ERROR_MSG)
        # remove semaphores from all exp folders
        remove_sempahore(experiment_name)

    log.debug("Submit - finish")
    return runs_list, run_errors, script_location
def install(ctx: click.Context, template_name: str):
    """
    Install a template pack from the remote template repository into the
    local packs directory, replacing any locally installed copy.

    :param ctx: click context; ctx.obj carries the verbosity and force flags
    :param template_name: name of the template pack to install
    Exits the process with code 1 on any failure, 0 when the user declines
    to overwrite an already installed template.
    """
    # Local destination: <config>/packs/<template_name>
    packs_location = os.path.join(Config.get_config_path(), "packs")
    chart_file_location = os.path.join(packs_location, template_name)

    repository_address = get_repository_address()

    # Fetch the remote template's metadata; stop the spinner before printing errors.
    with spinner(
            text=Texts.GETTING_LIST_OF_TEMPLATES_MSG) as templates_spinner:
        try:
            remote_template = load_remote_template(
                template_name, repository_address=repository_address)
        except Exception:
            templates_spinner.stop()
            handle_error(logger,
                         user_msg=Texts.FAILED_TO_LOAD_TEMPLATE.format(
                             template_name=template_name),
                         log_msg=Texts.FAILED_TO_LOAD_TEMPLATE.format(
                             template_name=template_name),
                         add_verbosity_msg=ctx.obj.verbosity == 0)
            sys.exit(1)
        if not remote_template:
            templates_spinner.stop()
            handle_error(logger,
                         user_msg=Texts.REMOTE_TEMPLATE_NOT_FOUND.format(
                             template_name=template_name),
                         log_msg=Texts.REMOTE_TEMPLATE_NOT_FOUND.format(
                             template_name=template_name),
                         add_verbosity_msg=ctx.obj.verbosity == 0)
            sys.exit(1)

    local_templates = get_local_templates()
    local_template_counterpart = local_templates.get(template_name)

    if local_template_counterpart:
        # Template already installed locally - ask for confirmation,
        # unless the force flag was given.
        if (not click.get_current_context().obj.force) and (not click.confirm(
                Texts.LOCAL_VERSION_ALREADY_INSTALLED.format(
                    local_version=local_template_counterpart.local_version,
                    template_name=local_template_counterpart.name,
                    remote_version=remote_template.remote_version))):
            sys.exit(0)

        # Remove the old local copy first. A failure here is only logged -
        # the download below writes into the same location anyway.
        # noinspection PyBroadException
        try:
            shutil.rmtree(chart_file_location)
        except Exception:
            logger.exception("failed to remove local copy of template!")

    with spinner(text=Texts.DOWNLOADING_TEMPLATE) as download_spinner:
        try:
            download_remote_template(template=remote_template,
                                     repository_address=repository_address,
                                     output_dir_path=packs_location)
        except Exception:
            download_spinner.stop()
            handle_error(logger,
                         user_msg=Texts.FAILED_TO_INSTALL_TEMPLATE.format(
                             template_name=template_name,
                             repository_name=repository_address),
                         log_msg=Texts.FAILED_TO_INSTALL_TEMPLATE.format(
                             template_name=template_name,
                             repository_name=repository_address),
                         add_verbosity_msg=ctx.obj.verbosity == 0)
            sys.exit(1)

    # Presumably re-applies the cluster's node resource settings to the
    # freshly downloaded pack - TODO confirm against the config command.
    update_resources_in_packs()

    click.echo("successfully installed!")
def verify(state: State):
    """
    Verify the local environment: operating system support, required
    dependencies (kubectl first, then the remaining ones) and connectivity
    to the cluster. On full success the detected dependency versions are
    saved to a file; any fatal check failure exits the process with code 1.

    :param state: CLI state; state.verbosity controls verbosity hints in errors
    """
    # 1. Operating system check.
    try:
        with spinner(text=Texts.CHECKING_OS_MSG):
            check_os()
        click.echo(Texts.OS_SUPPORTED_MSG)
    except InvalidOsError as exception:
        handle_error(logger,
                     str(exception),
                     str(exception),
                     add_verbosity_msg=True)
        exit(1)

    dependencies = get_dependency_map()

    # 2. kubectl is verified first and separately - the cluster checks and
    #    the namespace lookup below depend on a working kubectl.
    kubectl_dependency_name = 'kubectl'
    kubectl_dependency_spec = dependencies[kubectl_dependency_name]

    with spinner(text=Texts.VERIFYING_DEPENDENCY_MSG.format(
            dependency_name=kubectl_dependency_name)):
        valid, installed_version = check_dependency(
            dependency_name=kubectl_dependency_name,
            dependency_spec=kubectl_dependency_spec)

    supported_versions_sign = '>='
    logger.info(
        Texts.VERSION_CHECKING_MSG.format(
            dependency_name=kubectl_dependency_name,
            installed_version=installed_version,
            supported_versions_sign=supported_versions_sign,
            expected_version=kubectl_dependency_spec.expected_version))
    if valid:
        click.echo(
            Texts.DEPENDENCY_VERIFICATION_SUCCESS_MSG.format(
                dependency_name=kubectl_dependency_name))
    else:
        # A kubectl version mismatch is fatal, unlike the soft warnings
        # emitted for the remaining dependencies further below.
        handle_error(
            logger,
            Texts.KUBECTL_INVALID_VERSION_ERROR_MSG.format(
                installed_version=installed_version,
                supported_versions_sign=supported_versions_sign,
                expected_version=  # noqa
                kubectl_dependency_spec.expected_version),
            Texts.KUBECTL_INVALID_VERSION_ERROR_MSG,
            add_verbosity_msg=state.verbosity == 0)
        exit(1)

    # kubectl is done - do not verify it again in the loop below.
    del dependencies[kubectl_dependency_name]

    # 3. Cluster connectivity checks.
    try:
        with spinner(text=Texts.CHECKING_CONNECTION_TO_CLUSTER_MSG):
            check_connection_to_cluster()
        with spinner(text=Texts.CHECKING_PORT_FORWARDING_FROM_CLUSTER_MSG):
            check_port_forwarding()
    except KubectlConnectionError as e:
        handle_error(logger,
                     str(e),
                     str(e),
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)
    except FileNotFoundError:
        # The kubectl binary could not be executed at all.
        handle_error(logger,
                     Texts.KUBECTL_NOT_INSTALLED_ERROR_MSG,
                     Texts.KUBECTL_NOT_INSTALLED_ERROR_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)

    # 4. Namespace used for the remaining checks: administrators verify
    #    against kube-system, regular users against their own namespace.
    try:
        namespace = 'kube-system' if is_current_user_administrator(
        ) else get_kubectl_current_context_namespace()
    except Exception:
        handle_error(logger,
                     Texts.GET_K8S_NAMESPACE_ERROR_MSG,
                     Texts.GET_K8S_NAMESPACE_ERROR_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)

    # 5. Verify the remaining dependencies, collecting installed versions.
    dependency_versions = {}
    for dependency_name, dependency_spec in dependencies.items():
        try:
            supported_versions_sign = '==' if dependency_spec.match_exact_version else '>='
            with spinner(text=Texts.VERIFYING_DEPENDENCY_MSG.format(
                    dependency_name=dependency_name)):
                valid, installed_version = check_dependency(
                    dependency_name=dependency_name,
                    dependency_spec=dependency_spec,
                    namespace=namespace)
            dependency_versions[dependency_name] = installed_version
            logger.info(
                Texts.VERSION_CHECKING_MSG.format(
                    dependency_name=dependency_name,
                    installed_version=installed_version,
                    supported_versions_sign=supported_versions_sign,
                    expected_version=dependency_spec.expected_version))
            if valid:
                click.echo(
                    Texts.DEPENDENCY_VERIFICATION_SUCCESS_MSG.format(
                        dependency_name=dependency_name))
            else:
                # A version mismatch here is only a warning - keep verifying.
                click.echo(
                    Texts.INVALID_VERSION_WARNING_MSG.format(
                        dependency_name=dependency_name,
                        installed_version=installed_version,
                        supported_versions_sign=supported_versions_sign,
                        expected_version=dependency_spec.expected_version))
        except FileNotFoundError:
            handle_error(logger,
                         Texts.DEPENDENCY_NOT_INSTALLED_ERROR_MSG.format(
                             dependency_name=dependency_name),
                         Texts.DEPENDENCY_NOT_INSTALLED_ERROR_MSG.format(
                             dependency_name=dependency_name),
                         add_verbosity_msg="client" not in dependency_name)
            exit(1)
        except (RuntimeError, ValueError, TypeError):
            handle_error(logger,
                         Texts.DEPENDENCY_VERSION_CHECK_ERROR_MSG.format(
                             dependency_name=dependency_name),
                         Texts.DEPENDENCY_VERSION_CHECK_ERROR_MSG.format(
                             dependency_name=dependency_name),
                         add_verbosity_msg=state.verbosity == 0)
            exit(1)
        except Exception:
            handle_error(logger,
                         Texts.DEPENDENCY_VERIFICATION_OTHER_ERROR_MSG.format(
                             dependency_name=dependency_name),
                         Texts.DEPENDENCY_VERIFICATION_OTHER_ERROR_MSG.format(
                             dependency_name=dependency_name),
                         add_verbosity_msg=state.verbosity == 0)
            exit(1)
    else:
        # for/else: entered only when the loop completed without exiting,
        # i.e. all dependencies were validated successfully.
        # Save dependency versions in a file.
        save_dependency_versions(dependency_versions)
def submit_experiment(
        template: str,
        name: str = None,
        run_kind: RunKinds = RunKinds.TRAINING,
        script_location: str = None,
        script_parameters: Tuple[str, ...] = None,
        pack_params: List[Tuple[str, str]] = None,
        parameter_range: List[Tuple[str, str]] = None,
        parameter_set: Tuple[str, ...] = None,
        script_folder_location: str = None,
        env_variables: List[str] = None,
        requirements_file: str = None
) -> Tuple[List[Run], Dict[str, str], Optional[str]]:
    """
    Create an experiment from the given template and submit its runs:
    prepares run environments, creates the Experiment object, uploads the
    experiment to the git repo manager, builds the experiment image via an
    Argo workflow, then creates and submits each Run.

    :param template: name of the template pack to use
    :param name: optional custom experiment name
    :param run_kind: kind of the created runs (TRAINING by default)
    :param script_location: path to the training script, may be None
    :param script_parameters: parameters passed to the script
    :param pack_params: pack parameters, stored as annotations on each Run
    :param parameter_range: parameter range specs (-pr) - expand into runs
    :param parameter_set: parameter set specs (-ps) - expand into runs
    :param script_folder_location: folder submitted together with the script
    :param env_variables: environment variables for the runs
    :param requirements_file: pip requirements file used when building the image
    :return: (list of runs, mapping run name -> submission error, script location)
    :raises SubmitExperimentError: on any submission failure
    :raises K8sProxyCloseError: when closing the docker-registry proxy fails
    """
    # Normalize optional collection arguments to empty containers.
    script_parameters: Union[Tuple[str, ...], Tuple[(
    )]] = script_parameters if script_parameters else ()
    parameter_set: Union[Tuple[str, ...],
                         Tuple[()]] = parameter_set if parameter_set else ()
    parameter_range = parameter_range if parameter_range else []
    pack_params = pack_params if pack_params else []

    log.debug("Submit experiment - start")
    try:
        namespace = get_kubectl_current_context_namespace()
        # Stored in a module-level global so the Ctrl-C handler can clean up.
        global submitted_namespace
        submitted_namespace = namespace
    except Exception:
        message = Texts.GET_NAMESPACE_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    try:
        with spinner(text=Texts.PREPARING_RESOURCE_DEFINITIONS_MSG):
            experiment_name, labels = generate_exp_name_and_labels(
                script_name=script_location,
                namespace=namespace,
                name=name,
                run_kind=run_kind)
            runs_list = prepare_list_of_runs(experiment_name=experiment_name,
                                             parameter_range=parameter_range,
                                             parameter_set=parameter_set,
                                             template_name=template)
    except SubmitExperimentError as exe:
        log.exception(str(exe))
        raise exe
    except Exception:
        message = Texts.SUBMIT_PREPARATION_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    # Also exposed to the Ctrl-C handler.
    global submitted_experiment
    submitted_experiment = experiment_name

    # Ctrl-C handling
    signal.signal(signal.SIGINT, ctrl_c_handler_for_submit)
    signal.signal(signal.SIGTERM, ctrl_c_handler_for_submit)

    try:
        experiment_run_folders = [
        ]  # List of local directories used by experiment's runs
        try:
            cluster_registry_port = get_app_service_node_port(
                nauta_app_name=NAUTAAppNames.DOCKER_REGISTRY)

            # prepare environments for all experiment's runs
            for experiment_run in runs_list:
                # Combine global script parameters with per-run parameters.
                if script_parameters and experiment_run.parameters:
                    current_script_parameters = script_parameters + experiment_run.parameters
                elif script_parameters:
                    current_script_parameters = script_parameters
                elif experiment_run.parameters:
                    current_script_parameters = experiment_run.parameters
                else:
                    current_script_parameters = None
                # NOTE: script_location is intentionally rebound here - the
                # prepared (possibly relocated) script path is also returned
                # to the caller at the end of this function.
                run_folder, script_location, pod_count = \
                    prepare_experiment_environment(experiment_name=experiment_name,
                                                   run_name=experiment_run.name,
                                                   local_script_location=script_location,
                                                   script_folder_location=script_folder_location,  # noqa: E501
                                                   script_parameters=current_script_parameters,
                                                   pack_type=template,
                                                   pack_params=pack_params,
                                                   cluster_registry_port=cluster_registry_port,
                                                   env_variables=env_variables,
                                                   requirements_file=requirements_file,
                                                   username=namespace,
                                                   run_kind=run_kind)

                # Set correct pod count
                if not pod_count or pod_count < 1:
                    raise SubmitExperimentError(
                        'Unable to determine pod count: make sure that values.yaml '
                        'file in your pack has podCount field with positive integer value.'
                    )
                experiment_run.pod_count = pod_count

                experiment_run_folders.append(run_folder)
                script_name = None
                if script_location is not None:
                    script_name = os.path.basename(script_location)
                # Prepend script_name parameter to run description only for display purposes.
                experiment_run.parameters = script_parameters if not experiment_run.parameters \
                    else experiment_run.parameters + script_parameters
                if experiment_run.parameters and script_name:
                    experiment_run.parameters = (
                        script_name, ) + experiment_run.parameters
                elif script_name:
                    experiment_run.parameters = (script_name, )
        except SubmitExperimentError as e:
            log.exception(Texts.ENV_CREATION_ERROR_MSG)
            e.message += f' {Texts.ENV_CREATION_ERROR_MSG}'
            raise
        except Exception:
            # any error in this step breaks execution of this command
            message = Texts.ENV_CREATION_ERROR_MSG
            log.exception(message)
            # just in case - remove folders that were created with a success
            for experiment_run_folder in experiment_run_folders:
                delete_environment(experiment_run_folder)
            # NOTE(review): this handler does not re-raise, so execution falls
            # through to the submission steps below with possibly incomplete
            # run folders - confirm whether a raise SubmitExperimentError was
            # intended here.

        # if ps or pr option is used - first ask whether experiment(s) should be submitted
        if parameter_range or parameter_set:
            click.echo(Texts.CONFIRM_SUBMIT_MSG)
            click.echo(
                tabulate(
                    {
                        RUN_NAME: [run.name for run in runs_list],
                        RUN_PARAMETERS: [
                            "\n".join(run.parameters) if run.parameters else ""
                            for run in runs_list
                        ]
                    },
                    headers=[RUN_NAME, RUN_PARAMETERS],
                    tablefmt=TBLT_TABLE_FORMAT))

            if not click.confirm(Texts.CONFIRM_SUBMIT_QUESTION_MSG,
                                 default=True):
                # User declined - clean up all prepared run folders and quit.
                for experiment_run_folder in experiment_run_folders:
                    delete_environment(experiment_run_folder)
                exit()

        # create Experiment model
        # TODO template_name & template_namespace should be filled after Template implementation
        parameter_range_spec = [
            f'-pr {param_name} {param_value}'
            for param_name, param_value in parameter_range
        ]
        parameter_set_spec = [f'-ps {ps_spec}' for ps_spec in parameter_set]
        experiment_parameters_spec = list(
            script_parameters) + parameter_range_spec + parameter_set_spec
        template_version = get_template_version(template)
        experiment = experiments_model.Experiment(
            name=experiment_name,
            template_name=template,
            parameters_spec=experiment_parameters_spec,
            template_namespace="template-namespace",
            template_version=template_version)

        experiment.create(namespace=namespace, labels=labels)

        # Upload the prepared experiment to the git repo manager; on failure
        # mark the experiment FAILED (best effort) and abort the submission.
        with spinner('Uploading experiment...'):
            try:
                upload_experiment_to_git_repo_manager(
                    experiments_workdir=get_run_environment_path(''),
                    experiment_name=experiment_name,
                    run_name=runs_list[0].name,
                    username=namespace)
            except Exception:
                log.exception('Failed to upload experiment.')
                try:
                    experiment.state = experiments_model.ExperimentStatus.FAILED
                    experiment.update()
                except Exception:
                    log.exception(
                        f'Failed to set state of {experiment.name} experiment '
                        f'to {experiments_model.ExperimentStatus.FAILED}')
                raise SubmitExperimentError('Failed to upload experiment.')

        # Build the experiment image through an Argo workflow and wait for it.
        with spinner('Building experiment image...'):
            try:
                image_build_workflow: ExperimentImageBuildWorkflow = ExperimentImageBuildWorkflow.from_yaml(
                    yaml_template_path=
                    f'{Config().config_path}/workflows/{EXP_IMAGE_BUILD_WORKFLOW_SPEC}',
                    username=namespace,
                    experiment_name=experiment_name)
                image_build_workflow.create(namespace=namespace)
                image_build_workflow.wait_for_completion()
            except Exception:
                error_msg = 'Failed to build experiment image.'
                log.exception(error_msg)
                # Try to get workflow logs
                _debug_workflow_logs(workflow=image_build_workflow,
                                     namespace=namespace)
                if image_build_workflow.name:
                    error_msg += f' Run nctl workflow logs {image_build_workflow.name} command for more details.'
                # Best-effort: mark the experiment FAILED before aborting.
                try:
                    experiment.state = experiments_model.ExperimentStatus.FAILED
                    experiment.update()
                except Exception:
                    log.exception(
                        f'Failed to set state of {experiment.name} experiment '
                        f'to {experiments_model.ExperimentStatus.FAILED}')
                raise SubmitExperimentError(error_msg)

        # submit runs
        run_errors: Dict[str, str] = {}
        for run, run_folder in zip(runs_list, experiment_run_folders):
            try:
                run.state = RunStatus.QUEUED
                with spinner(text=Texts.CREATING_RESOURCES_MSG.format(
                        run_name=run.name)):
                    # Add Run object with runKind label and pack params as annotations
                    run.create(namespace=namespace,
                               labels={'runKind': run_kind.value},
                               annotations={
                                   pack_param_name: pack_param_value
                                   for pack_param_name, pack_param_value in
                                   pack_params
                               })
                    submitted_runs.append(run)
                    submit_draft_pack(run_name=run.name,
                                      run_folder=run_folder,
                                      namespace=namespace)
            except Exception as exe:
                # A single failed run does not abort the others; record the
                # error and (best effort) mark this run FAILED.
                delete_environment(run_folder)
                try:
                    run.state = RunStatus.FAILED
                    run_errors[run.name] = str(exe)
                    run.update()
                except Exception as rexe:
                    # update of non-existing run may fail
                    log.debug(Texts.ERROR_DURING_PATCHING_RUN.format(
                        str(rexe)))

        # Delete experiment if no Runs were submitted
        if not submitted_runs:
            click.echo(Texts.SUBMISSION_FAIL_ERROR_MSG)
            delete_k8s_object("experiment", experiment_name)

        # Change experiment status to submitted
        experiment.state = experiments_model.ExperimentStatus.SUBMITTED
        experiment.update()
    except LocalPortOccupiedError as exe:
        click.echo(exe.message)
        raise SubmitExperimentError(exe.message)
    except K8sProxyCloseError:
        log.exception('Error during closing of a proxy for a {}'.format(
            NAUTAAppNames.DOCKER_REGISTRY))
        raise K8sProxyCloseError(Texts.PROXY_CLOSE_ERROR_MSG)
    except K8sProxyOpenError:
        error_msg = Texts.PROXY_OPEN_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg)
    except SubmitExperimentError:
        raise
    except Exception as exe:
        error_msg = Texts.SUBMIT_OTHER_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg) from exe
    finally:
        # remove semaphores from all exp folders
        remove_sempahore(experiment_name)

    log.debug("Submit - finish")
    return runs_list, run_errors, script_location
def purge_experiment(exp_name: str, runs_to_purge: List[Run],
                     k8s_es_client: K8sElasticSearchClient,
                     namespace: str) -> Tuple[List[Run], List[Run]]:
    """
    Purge experiment with a given name by cancelling runs given as a parameter.
    If given experiment contains more runs than is in the list of runs - experiment's
    state remains intact.

    :param exp_name: name of an experiment to which belong runs passed in run_list parameter
    :param runs_to_purge: list of runs that should be purged, they have to belong
           to exp_name experiment
    :param k8s_es_client: Kubernetes ElasticSearch client
    :param namespace: namespace where experiment is located
    :return: two lists - first contains runs that were purged successfully,
             second - those which weren't
    :raises RuntimeError: when the experiment cannot be fetched
    """
    logger.debug(f"Purging {exp_name} experiment ...")

    purged_runs: List[Run] = []
    not_purged_runs: List[Run] = []

    experiment = Experiment.get(name=exp_name, namespace=namespace)
    if not experiment:
        raise RuntimeError(Texts.GET_EXPERIMENT_ERROR_MSG)

    experiment_runs = Run.list(namespace=namespace, exp_name_filter=[exp_name])
    # check whether experiment has more runs that should be cancelled
    cancel_whole_experiment = (len(experiment_runs) == len(runs_to_purge))
    if cancel_whole_experiment:
        experiment.state = ExperimentStatus.CANCELLING
        experiment.update()

    try:
        cancelled_runs, not_cancelled_runs = cancel_experiment_runs(
            runs_to_cancel=runs_to_purge, namespace=namespace)
        not_purged_runs = not_cancelled_runs

        if cancel_whole_experiment:
            # Delete associated workflows
            experiment_associated_workflows = [
                wf for wf in ArgoWorkflow.list(namespace=namespace)
                if wf.labels.get('experimentName') == experiment.name
            ]
            for wf in experiment_associated_workflows:
                wf.delete()

            # Remove tags from git repo manager
            try:
                delete_exp_tag_from_git_repo_manager(
                    experiment_name=experiment.name,
                    username=namespace,
                    experiments_workdir=get_run_environment_path(''))
            except Exception:
                handle_error(logger, Texts.GIT_REPO_MANAGER_ERROR_MSG,
                             Texts.GIT_REPO_MANAGER_ERROR_MSG)
                raise

        for run in cancelled_runs:
            logger.debug(f"Purging {run.name} run ...")
            click.echo(Texts.PURGING_START_MSG.format(run_name=run.name))
            try:
                with spinner(text=Texts.PURGING_PROGRESS_MSG.format(
                        run_name=run.name)):
                    # purge helm release
                    delete_helm_release(run.name,
                                        namespace=namespace,
                                        purge=True)
                    # delete run
                    kubectl.delete_k8s_object("run", run.name)
                    purged_runs.append(run)
            except Exception as exe:
                not_purged_runs.append(run)
                logger.exception("Error during purging runs.")
                # occurrence of NotFound error may mean, that run has been removed earlier
                if "NotFound" not in str(exe):
                    # BUGFIX: the message was formatted with an undefined name
                    # `experiment_name`, which raised NameError on this error
                    # path - the function's parameter is `exp_name`.
                    click.echo(
                        Texts.INCOMPLETE_PURGE_ERROR_MSG.format(
                            experiment_name=exp_name))
                    raise exe
            try:
                # clear run logs - only administrators may do this
                if is_current_user_administrator():
                    logger.debug(f"Clearing logs for {run.name} run.")
                    with spinner(text=Texts.PURGING_LOGS_PROGRESS_MSG.format(
                            run_name=run.name)):
                        k8s_es_client.delete_logs_for_run(run=run.name,
                                                          namespace=namespace)
            except Exception:
                # log clearing is best effort - never fails the purge
                logger.exception("Error during clearing run logs.")

            # CAN-1099 - docker garbage collector has errors that prevent from correct removal of images
            # try:
            #     try to remove images from docker registry
            #     delete_images_for_experiment(exp_name=run.name)
            # except Exception:
            #     logger.exception("Error during removing images.")

        if cancel_whole_experiment and not not_purged_runs:
            try:
                kubectl.delete_k8s_object("experiment", exp_name)
            except Exception:
                # problems during deleting experiments are hidden as if runs were
                # cancelled user doesn't have a possibility to remove them
                logger.exception("Error during purging experiment.")
    except Exception:
        logger.exception("Error during purging experiment.")
        return purged_runs, not_purged_runs

    return purged_runs, not_purged_runs
def create(state: State, username: str, list_only: bool, filename: str):
    """
    Adds a new user with a name given as a parameter and produces a
    kubeconfig file for that user.

    :param state: CLI state; state.verbosity controls verbosity hints in errors
    :param username: name of a new user
    :param list_only: when True, print the generated kubeconfig instead of
           saving it to a file (mutually exclusive with filename)
    :param filename: target file for the generated kubeconfig
    Exits the process with a non-zero code on any fatal error.
    """
    # -f and -l options are mutually exclusive.
    if list_only and filename:
        handle_error(user_msg=Texts.F_L_OPTIONS_EXCLUSION_ERROR_MSG)
        exit(1)

    # Pre-flight verification: name validity, admin rights, user absence.
    try:
        try:
            validate_user_name(username)
        except ValueError as exe:
            handle_error(
                logger,
                Texts.NAME_VALIDATION_ERROR_MSG.format(username=username),
                str(exe),
                add_verbosity_msg=state.verbosity == 0)
            exit(1)

        if not is_current_user_administrator():
            handle_error(logger, Texts.USER_NOT_ADMIN_ERROR_MSG,
                         Texts.USER_NOT_ADMIN_ERROR_MSG)
            exit(1)

        user_state = check_users_presence(username)
        if user_state == UserState.ACTIVE:
            handle_error(
                logger,
                Texts.USER_ALREADY_EXISTS_ERROR_MSG.format(username=username),
                Texts.USER_ALREADY_EXISTS_ERROR_MSG.format(username=username))
            exit(1)
        if user_state == UserState.TERMINATING:
            handle_error(
                logger,
                Texts.USER_BEING_REMOVED_ERROR_MSG.format(username=username),
                Texts.USER_BEING_REMOVED_ERROR_MSG.format(username=username))
            exit(1)
    except Exception:
        handle_error(
            logger,
            Texts.USER_VERIFICATION_ERROR_MSG.format(username=username),
            Texts.USER_VERIFICATION_ERROR_MSG.format(username=username),
            add_verbosity_msg=state.verbosity == 0)
        exit(1)

    # Create the user via a helm chart; on failure try to roll back by
    # deleting the partially created user.
    try:
        with spinner(text=Texts.CREATING_USER_PROGRESS_MSG.format(
                username=username)):
            chart_location = os.path.join(Config().config_path,
                                          ADD_USER_CHART_NAME)
            nauta_config_map = NAUTAConfigMap()
            tiller_location = nauta_config_map.image_tiller
            tensorboard_service_location = nauta_config_map.image_tensorboard_service
            add_user_command = [
                "helm", "install", "--wait", "--namespace", username, "--name",
                username, chart_location, "--set", "global.nauta=nauta",
                "--set", f"username={username}", "--set",
                "TillerImage={}".format(tiller_location), "--set",
                f"TensorboardServiceImage={tensorboard_service_location}"
            ]
            env = os.environ.copy()
            # Make bundled binaries (helm etc.) visible on PATH first.
            env['PATH'] = Config().config_path + os.pathsep + env['PATH']
            _, err_code, log_output = execute_system_command(
                ' '.join(add_user_command), env=env, shell=True)

        if err_code:
            handle_error(logger,
                         log_output,
                         Texts.USER_ADD_ERROR_MSG,
                         add_verbosity_msg=state.verbosity == 0)
            if not delete_user(username):
                handle_error(user_msg=Texts.REMOVE_USER_ERROR_MSG.format(
                    username=username))
            sys.exit(1)

        # Credentials are gathered best-effort - failures produce an empty
        # value and a warning, but do not abort user creation.
        try:
            users_password = get_users_token(username)
        except Exception:
            handle_error(logger,
                         Texts.PASSWORD_GATHER_ERROR_MSG,
                         Texts.PASSWORD_GATHER_ERROR_MSG,
                         add_verbosity_msg=state.verbosity == 0)
            users_password = ""

        try:
            cert = get_certificate(username)
        except Exception:
            handle_error(logger,
                         Texts.CERT_GATHER_ERROR_MSG,
                         Texts.CERT_GATHER_ERROR_MSG,
                         add_verbosity_msg=state.verbosity == 0)
            cert = ""
    except Exception:
        handle_error(logger,
                     Texts.USER_ADD_ERROR_MSG.format(username=username),
                     Texts.USER_ADD_ERROR_MSG.format(username=username),
                     add_verbosity_msg=state.verbosity == 0)
        if not delete_user(username):
            handle_error(user_msg=Texts.REMOVE_USER_ERROR_MSG.format(
                username=username))
        sys.exit(1)

    if is_user_created(username, 90):
        click.echo(Texts.USER_CREATION_SUCCESS_MSG.format(username=username))
    else:
        # if during 90 seconds a user hasn't been created - app displays information about it
        # but doesn't stop processing the command - the config file generated here may be
        # useful later, when the user has been created
        click.echo(Texts.USER_NOT_READY_ERROR_MSG.format(username=username))

    # Build the user's kubeconfig from the gathered credentials.
    try:
        kubeconfig = generate_kubeconfig(username, username,
                                         get_kubectl_host(), users_password,
                                         cert)
    except Exception:
        handle_error(logger,
                     Texts.CONFIG_CREATION_ERROR_MSG,
                     Texts.CONFIG_CREATION_ERROR_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)

    if list_only:
        click.echo(Texts.LIST_ONLY_HEADER)
        click.echo(kubeconfig)
    else:
        if not filename:
            filename = DEFAULT_FILENAME.format(username)
        try:
            with open(filename, "w") as file:
                file.write(kubeconfig)

            click.echo(Texts.CONFIG_SAVE_SUCCESS_MSG.format(filename=filename))
        except Exception:
            # Saving failed - print the kubeconfig so it is not lost.
            handle_error(logger,
                         Texts.CONFIG_SAVE_FAIL_MSG,
                         Texts.CONFIG_SAVE_FAIL_MSG,
                         add_verbosity_msg=state.verbosity == 0)
            click.echo(Texts.CONFIG_SAVE_FAIL_INSTRUCTIONS_MSG)
            click.echo(kubeconfig)
            sys.exit(1)
def config(state: State, cpu: str, memory: str):
    """
    Update the cluster node resource configuration (CPU number and memory
    amount): validate the requested values against the cluster's minimums,
    propagate them into the installed packs and persist them in the node
    config file.

    :param state: CLI state object
    :param cpu: requested CPU value (k8s resource format)
    :param memory: requested memory value (k8s resource format)
    Exits the process with code 1 on any validation or update failure.
    """
    def _abort(message: str):
        # Report the problem to the user and the log, then quit with an error.
        handle_error(logger, message, message)
        sys.exit(1)

    # Basic argument validation - both values must be present and well-formed.
    if not cpu or not memory:
        _abort(Texts.MISSING_ARGUMENTS)
    if not validate_cpu_settings(cpu):
        _abort(Texts.CPU_WRONG_FORMAT)
    if not validate_memory_settings(memory):
        _abort(Texts.MEMORY_WRONG_FORMAT)

    # Requested values must not fall below the cluster-defined minimums.
    cluster_cfg = NAUTAConfigMap()

    min_memory = cluster_cfg.minimal_node_memory_amount
    if min_memory and convert_k8s_memory_resource(min_memory) > convert_k8s_memory_resource(memory):
        _abort(Texts.MEMORY_SETTINGS_TOO_LOW.format(memory_value=min_memory))

    min_cpu = cluster_cfg.minimal_node_cpu_number
    if min_cpu and convert_k8s_cpu_resource(min_cpu) > convert_k8s_cpu_resource(cpu):
        _abort(Texts.CPU_SETTINGS_TOO_LOW.format(cpu_value=min_cpu))

    cfg_path = os.path.join(Config().config_path, NODE_CONFIG_FILENAME)
    if not os.path.isfile(cfg_path):
        _abort(Texts.MISSING_CONFIG_FILE)

    with open(cfg_path, 'r+', encoding='utf-8') as node_cfg_file, \
            spinner(text=Texts.CONFIG_UPDATE):
        content = yaml.safe_load(node_cfg_file)

        def _field(field_name: str) -> str:
            # Read a config value as a string ("None" when the key is absent).
            return str(content.get(field_name))

        cpu_number = _field(CPU_NUMBER_FIELDNAME)
        memory_amount = _field(MEMORY_AMOUNT_FIELDNAME)
        cpu_required_min = _field(CPU_SYSTEM_REQUIRED_MIN_FIELDNAME)
        cpu_required_percent = _field(CPU_SYSTEM_REQUIRED_PERCENT_FIELDNAME)
        mem_required_min = _field(MEMORY_SYSTEM_REQUIRED_MIN_FIELDNAME)
        mem_required_percent = _field(MEMORY_SYSTEM_REQUIRED_PERCENT_FIELDNAME)

        # The current cpu/memory values must exist in the config file.
        if cpu_number in ("", "None") or memory_amount in ("", "None"):
            _abort(Texts.CONFIG_FILE_INCORRECT)

        try:
            override_values_in_packs(
                new_cpu_number=cpu,
                new_memory_amount=memory,
                current_cpu_number=cpu_number,
                current_mem_amount=memory_amount,
                cpu_system_required_min=cpu_required_min,
                cpu_system_required_percent=cpu_required_percent,
                mem_system_required_min=mem_required_min,
                mem_system_required_percent=mem_required_percent)
        except Exception:
            logger.exception(Texts.ERROR_DURING_UPDATE)
            _abort(Texts.ERROR_DURING_UPDATE)

        # Rewrite the node config file in place with the new values.
        node_cfg_file.seek(0)
        node_cfg_file.truncate()
        content[CPU_NUMBER_FIELDNAME] = cpu
        content[MEMORY_AMOUNT_FIELDNAME] = memory
        yaml.dump(content,
                  node_cfg_file,
                  default_flow_style=False,
                  explicit_start=True)

    click.echo(Texts.SUCCESS_MESSAGE)