def status(ctx: click.Context, username: str):
    """
    Display the status of all model workflows visible in a namespace.

    :param username: if given - searches for models of that user
        (the username doubles as the namespace), otherwise the current
        kubectl context's namespace is used
    """
    try:
        # The username doubles as the namespace to search in.
        namespace = username if username else get_kubectl_current_context_namespace()

        with spinner(text=Texts.LOAD_DATA_MSG):
            # Filter out workflows used to build images with training jobs.
            workflows: List[ArgoWorkflow.ArgoWorkflowCliModel] = [
                workflow.cli_representation
                for workflow in ArgoWorkflow.list(namespace=namespace,
                                                  label_selector="type!=build-workflow")
            ]

        click.echo(tabulate(workflows, headers=MODEL_HEADERS,
                            tablefmt=TBLT_TABLE_FORMAT))
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=True)
        exit(1)
def tensorboard(state: State, no_launch: bool, tensorboard_service_client_port: Optional[int],
                port_number: Optional[int], experiment_name: List[str]):
    """ Subcommand for launching tensorboard with credentials """
    current_namespace = get_kubectl_current_context_namespace()

    with spinner(Texts.TB_WAITING_MSG) as proxy_spinner, \
            K8sProxy(nauta_app_name=NAUTAAppNames.TENSORBOARD_SERVICE,
                     app_name='tensorboard-service',
                     namespace=current_namespace,
                     port=tensorboard_service_client_port) as proxy:
        tb_client = TensorboardServiceClient(address=f'http://127.0.0.1:{proxy.tunnel_port}')

        run_list = build_tensorboard_run_list(exp_list=experiment_name,
                                              current_namespace=current_namespace)

        # Request creation of a tensorboard instance for the selected runs.
        # noinspection PyBroadException
        try:
            tb = tb_client.create_tensorboard(run_list)
            if tb.invalid_runs:
                invalid_runs_text = ', '.join(f'{item.get("owner")}/{item.get("name")}'
                                              for item in tb.invalid_runs)
                click.echo(Texts.TB_INVALID_RUNS_MSG.format(invalid_runs=invalid_runs_text))
        except Exception as error:
            if hasattr(error, 'error_code') \
                    and error.error_code == HTTPStatus.UNPROCESSABLE_ENTITY:  # type: ignore
                err_message = str(error)
            else:
                err_message = Texts.TB_CREATE_ERROR_MSG
            handle_error(logger, err_message, err_message,
                         add_verbosity_msg=state.verbosity == 0)
            sys.exit(1)

        # Poll until the instance reaches RUNNING state or we run out of tries.
        for _ in range(TENSORBOARD_TRIES_COUNT):
            # noinspection PyTypeChecker
            # tb.id is str
            tb = tb_client.get_tensorboard(tb.id)
            if not tb:
                continue
            if tb.status == TensorboardStatus.RUNNING:
                proxy_spinner.hide()
                launch_app_with_proxy(k8s_app_name=NAUTAAppNames.TENSORBOARD,
                                      no_launch=no_launch,
                                      namespace=current_namespace,
                                      port=port_number,
                                      app_name=f"tensorboard-{tb.id}")
                return
            logger.warning(Texts.TB_WAITING_FOR_TB_MSG.format(tb_id=tb.id,
                                                              tb_status_value=tb.status.value))
            sleep(TENSORBOARD_CHECK_BACKOFF_SECONDS)

        click.echo(Texts.TB_TIMEOUT_ERROR_MSG)
        sys.exit(2)
def logs(state: State, workflow_name: str):
    """Print Elasticsearch logs of one Argo workflow through a local proxy."""
    try:
        namespace = get_kubectl_current_context_namespace()
        workflow: ArgoWorkflow = ArgoWorkflow.get(namespace=namespace, name=workflow_name)
        if not workflow:
            click.echo(Texts.NOT_FOUND_MSG.format(workflow_name=workflow_name))
            exit(0)

        with K8sProxy(NAUTAAppNames.ELASTICSEARCH) as proxy:
            search_client = K8sElasticSearchClient(host="127.0.0.1", port=proxy.tunnel_port,
                                                   verify_certs=False, use_ssl=False)
            entries = search_client.get_argo_workflow_logs_generator(
                workflow=workflow, namespace=namespace, start_date=workflow.started_at)
            for entry in entries:
                # Skip whitespace-only log lines.
                if entry.content.isspace():
                    continue
                click.echo(f'{entry.date} {entry.pod_name} {entry.content}')
    except K8sProxyCloseError:
        handle_error(logger, Texts.PROXY_CLOSE_LOG_ERROR_MSG, Texts.PROXY_CLOSE_USER_ERROR_MSG)
        exit(1)
    except LocalPortOccupiedError as exe:
        occupied_msg = Texts.LOCAL_PORT_OCCUPIED_ERROR_MSG.format(exception_message=exe.message)
        handle_error(logger, occupied_msg, occupied_msg)
        exit(1)
    except K8sProxyOpenError:
        handle_error(logger, Texts.PROXY_CREATION_ERROR_MSG, Texts.PROXY_CREATION_ERROR_MSG)
        exit(1)
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=True)
        exit(1)
def logs(ctx: click.Context, workflow_name: str):
    """Print Elasticsearch logs of one Argo workflow via the k8s API-server proxy."""
    try:
        namespace = get_kubectl_current_context_namespace()
        workflow: ArgoWorkflow = ArgoWorkflow.get(namespace=namespace, name=workflow_name)
        if not workflow:
            click.echo(Texts.NOT_FOUND_MSG.format(workflow_name=workflow_name))
            exit(0)

        search_client = K8sElasticSearchClient(
            host=f'{get_kubectl_host(with_port=True)}'
            f'/api/v1/namespaces/nauta/services/nauta-elasticsearch:nauta/proxy',
            verify_certs=False, use_ssl=True,
            headers={'Authorization': get_api_key()})

        entries = search_client.get_argo_workflow_logs_generator(
            workflow=workflow, namespace=namespace, start_date=workflow.started_at)
        for entry in entries:
            # Skip whitespace-only log lines.
            if entry.content.isspace():
                continue
            click.echo(f'{entry.date} {entry.pod_name} {entry.content}')
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=True)
        exit(1)
def status(state: State, model_name: str, status: PodPhase, username: str):
    """
    Returns status of a model

    :param model_name: name of a model data of which should be displayed
    :param status: status of a model step that should be displayed
    :param username: if given - searches for model for a certain user
        (the username doubles as the namespace), otherwise the current
        kubectl context's namespace is used
    """
    try:
        # The username doubles as the namespace to search in.
        namespace = username if username else get_kubectl_current_context_namespace()

        with spinner(text=Texts.LOAD_DATA_MSG):
            workflow: ArgoWorkflow = ArgoWorkflow.get(namespace=namespace, name=model_name)

        if not workflow:
            click.echo(Texts.MODEL_NOT_FOUND.format(model_name=model_name))
            exit(0)

        click.echo('\nOperation details:\n')
        click.echo(tabulate([workflow.cli_representation], headers=MODEL_HEADERS,
                            tablefmt=TBLT_TABLE_FORMAT))
        click.echo('\nOperation steps:\n')
        if workflow.steps:
            # Show all steps, or only those matching the requested phase.
            click.echo(tabulate([step.cli_representation for step in workflow.steps
                                 if status is None or status == step.phase],
                                headers=STEP_HEADERS, tablefmt=TBLT_TABLE_FORMAT))
        else:
            click.echo(Texts.LACK_OF_STEPS)
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=True)
        exit(1)
def export(path: str, format: str, operation_options: Tuple[str, ...]):
    """
    Create an Argo export workflow for a saved model.

    :param path: path to the saved model that should be exported
    :param format: name of the export format; must match one of the
        yaml workflow templates shipped in the config directory
    :param operation_options: extra parameters forwarded to the workflow
    """
    additional_params_str = " ".join(operation_options)
    format = format.lower()
    workflow_exports_files = os.listdir(f'{Config().config_path}/workflows/exports')
    # BUG FIX: str.rstrip('.yaml') strips a trailing *character set* ('.', 'y',
    # 'a', 'm', 'l'), not the '.yaml' suffix, mangling format names that end in
    # any of those characters. Use os.path.splitext to drop the extension -
    # this also matches the sibling export implementation in this file.
    formats = [os.path.splitext(file)[0] for file in workflow_exports_files
               if file.endswith('.yaml')]
    if format not in formats:
        click.echo(f'Format: {format} does not exist. Choose from: {formats}')
        sys.exit(2)
    try:
        current_namespace = get_kubectl_current_context_namespace()
        export_workflow = ArgoWorkflow.from_yaml(
            f'{Config().config_path}/workflows/exports/{format}.yaml')
        export_workflow.parameters = {
            'cluster-registry-address': NAUTAConfigMap().registry,
            'saved-model-dir-path': path,
            'additional-params': additional_params_str
        }
        export_workflow.create(namespace=current_namespace)
    except Exception:
        error_msg = 'Failed to create export workflow.'
        click.echo(error_msg)
        logger.exception(error_msg)
        sys.exit(1)
    click.echo(f'Successfully created export workflow: {export_workflow.name}')
def stream(state: State, name: str, data: str, method_verb: InferenceVerb):
    """ Perform stream inference task on launched prediction instance. """
    method_verb = InferenceVerb(method_verb)

    # Resolve the prediction instance and build the stream endpoint URL.
    try:
        namespace = get_kubectl_current_context_namespace()
        # TODO: check if kind field of inference instance Run is correct
        inference_instance = Run.get(name=name, namespace=namespace)
        if not inference_instance:
            handle_error(user_msg=Texts.INSTANCE_NOT_EXISTS_ERROR_MSG.format(name=name))
            exit(1)
        if not inference_instance.state == RunStatus.RUNNING:
            handle_error(user_msg=Texts.INSTANCE_NOT_RUNNING_ERROR_MSG.format(
                name=name, running_code=RunStatus.RUNNING.value))
            exit(1)
        instance_url = get_inference_instance_url(inference_instance=inference_instance)
        stream_url = f'{instance_url}:{method_verb.value}'
    except Exception:
        get_fail_msg = Texts.INSTANCE_GET_FAIL_ERROR_MSG.format(name=name)
        handle_error(logger, get_fail_msg, get_fail_msg,
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)

    # Load the request payload from the given JSON file.
    try:
        with open(data, 'r', encoding='utf-8') as data_file:
            payload = json.load(data_file)
    except (json.JSONDecodeError, IOError):
        load_msg = Texts.JSON_LOAD_ERROR_MSG.format(data=data)
        handle_error(logger, load_msg, load_msg)
        exit(1)

    # Post the payload to the instance and echo the raw response body.
    try:
        api_key = get_api_key()
        headers = {'Authorization': api_key,
                   'Accept': 'application/json',
                   'Content-Type': 'application/json'}
        with spinner(text=Texts.WAITING_FOR_RESPONSE_MSG):
            stream_response = requests.post(stream_url, data=json.dumps(payload),
                                            verify=False,  # nosec - request to k8s cluster
                                            headers=headers)
            stream_response.raise_for_status()
        click.echo(stream_response.text)
    except Exception as e:
        error_msg = Texts.INFERENCE_OTHER_ERROR_MSG.format(exception=e)
        if hasattr(e, 'response'):
            error_msg += Texts.INFERENCE_ERROR_RESPONSE_MSG.format(
                response_text=e.response.text)  # type: ignore
        handle_error(logger, error_msg, error_msg)
        exit(1)
def export(path: str, format: str, operation_options: Tuple[str, ...]):
    """
    Create an Argo export workflow for a saved model and display its status.

    :param path: path to the saved model; the special FORMATS_OPTION value
        lists available export formats instead of exporting
    :param format: name of the export format; must match one of the yaml
        workflow templates in the config directory
    :param operation_options: extra parameters forwarded to the workflow
    """
    # Special mode: only list the available export workflow formats.
    if path == FORMATS_OPTION:
        try:
            list_of_workflows = get_list_of_workflows(EXPORT_WORKFLOWS_LOCATION)
        except Exception:
            handle_error(logger, Texts.EXPORT_LIST_ERROR_MSG, Texts.EXPORT_LIST_ERROR_MSG)
            sys.exit(1)
        click.echo(tabulate(list_of_workflows, headers=EXPORT_LIST_HEADERS,
                            tablefmt=TBLT_TABLE_FORMAT))
        sys.exit(0)

    config_path = Config().config_path
    formats: List[str] = []
    if os.path.isdir(config_path):
        workflow_exports_files = os.listdir(f'{config_path}/workflows/exports')
        formats = [os.path.splitext(file)[0] for file in workflow_exports_files
                   if file.endswith('.yaml')]

    if not format:
        click.echo(Texts.MISSING_EXPORT_FORMAT.format(formats=formats))
        sys.exit(2)

    format = format.lower()
    if format not in formats:
        click.echo(Texts.WRONG_EXPORT_FORMAT.format(format=format, formats=formats))
        sys.exit(2)

    additional_params_str = " ".join(operation_options)
    try:
        current_namespace = get_kubectl_current_context_namespace()
        # Reuse the config_path computed above instead of constructing Config() again.
        export_workflow = ArgoWorkflow.from_yaml(f'{config_path}/workflows/exports/{format}.yaml')
        export_workflow.parameters = {
            'cluster-registry-address': NAUTAConfigMap().registry,
            'saved-model-dir-path': path,
            'additional-params': additional_params_str
        }
        export_workflow.create(namespace=current_namespace)
        workflow: ArgoWorkflow = ArgoWorkflow.get(namespace=current_namespace,
                                                  name=export_workflow.name)
    except Exception:
        error_msg = 'Failed to create export workflow.'
        click.echo(error_msg)
        logger.exception(error_msg)
        sys.exit(1)

    click.echo(tabulate([workflow.cli_representation], headers=MODEL_HEADERS,
                        tablefmt=TBLT_TABLE_FORMAT))
    # F541 fix: the final message has no placeholders, so no f-string prefix.
    click.echo('\nSuccessfully created export workflow')
def view(state: State, workflow_name: str):
    """Display details and the raw status block of a single Argo workflow."""
    try:
        namespace = get_kubectl_current_context_namespace()
        workflow: ArgoWorkflow = ArgoWorkflow.get(namespace=namespace, name=workflow_name)
        if not workflow:
            click.echo(Texts.NOT_FOUND_MSG.format(workflow_name=workflow_name))
            exit(0)

        summary_table = tabulate([workflow.cli_representation],
                                 headers=HEADERS, tablefmt="orgtbl")
        click.echo(summary_table)
        click.echo('\nWorkflow status:\n')
        click.echo(yaml.dump(workflow.status))
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=True)
        exit(1)
def get_inference_instance_url(inference_instance: Run, model_name: str = None) -> str:
    """ Get URL to inference instance. """
    service_name = inference_instance.name
    # Fall back to the model name stored in the instance's annotations.
    if not model_name:
        model_name = inference_instance.metadata['annotations']['modelName']

    host = get_kubectl_host(replace_https=False)
    namespace = get_kubectl_current_context_namespace()

    return (f'{host}/api/v1/namespaces/{namespace}/'
            f'services/{service_name}:rest-port/proxy/v1/models/{model_name}')
def workflow_list(state: State):
    """List all Argo workflows in the current namespace as a table."""
    try:
        namespace = get_kubectl_current_context_namespace()
        workflows = ArgoWorkflow.list(namespace=namespace)
        rows = [workflow.cli_representation for workflow in workflows]
        click.echo(tabulate(rows, headers=HEADERS, tablefmt="orgtbl"))
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=True)
        exit(1)
def verify_cli_dependencies():
    """Verify OS support and binary dependencies required by the CLI."""
    try:
        if is_current_user_administrator(request_timeout=VERIFY_REQUEST_TIMEOUT):
            namespace = 'kube-system'
        else:
            namespace = get_kubectl_current_context_namespace()
    except Exception:
        error_msg = Texts.KUBECTL_NAMESPACE_ERROR_MSG
        handle_error(logger, error_msg, error_msg, add_verbosity_msg=True)
        sys.exit(1)

    try:
        check_os()
        check_all_binary_dependencies(namespace=namespace)
    except (InvalidDependencyError, InvalidOsError):
        error_msg = Texts.INVALID_DEPENDENCY_ERROR_MSG
        # NOTE(review): unlike the namespace failure above, this path does not
        # call sys.exit - presumably dependency problems are non-fatal; confirm.
        handle_error(logger, error_msg, error_msg, add_verbosity_msg=True)
def list_runs_in_cli(verbosity_lvl: int, all_users: bool, name: str, status: RunStatus,
                     listed_runs_kinds: List[RunKinds], runs_list_headers: List[str],
                     with_metrics: bool, count: int = None, brief: bool = False):
    """
    Display a list of selected runs in the cli.

    :param verbosity_lvl: level at which error messages should be logged or displayed
    :param all_users: whether to display runs regardless of their owner or not
    :param name: regular expression to which names of the shown runs have to match
    :param status: display runs with this status
    :param listed_runs_kinds: list of kinds of runs that will be listed out
    :param runs_list_headers: headers which will be displayed on top of a table shown in the cli
    :param with_metrics: whether to show metrics column or not
    :param count: number of rows displayed on a list. If not given - content of a list is not limited
    :param brief: when true only experiment name, submission date, owner and state will be print
    """
    try:
        namespace = None if all_users else get_kubectl_current_context_namespace()
        status = RunStatus[status] if status else None

        # List experiments command is actually listing Run resources instead of
        # Experiment resources with one exception - if run is initialized -
        # nctl displays data of an experiment instead of data of a run
        runs = replace_initializing_runs(
            Run.list(namespace=namespace, state_list=[status], name_filter=name,
                     run_kinds_filter=listed_runs_kinds))
        representations = [run.cli_representation for run in runs]

        if brief:
            table_rows = [(rep.name, rep.submission_date, rep.submitter, rep.status)
                          for rep in representations]
        elif with_metrics:
            table_rows = representations
        else:
            table_rows = [(rep.name, rep.parameters, rep.submission_date, rep.start_date,
                           rep.duration, rep.submitter, rep.status, rep.template_name)
                          for rep in representations]

        # Keep only the newest `count` rows when a limit was requested.
        if count:
            table_rows = table_rows[-count:]
        click.echo(tabulate(table_rows, headers=runs_list_headers, tablefmt="orgtbl"))
    except InvalidRegularExpressionError:
        handle_error(logger, Texts.INVALID_REGEX_ERROR_MSG, Texts.INVALID_REGEX_ERROR_MSG,
                     add_verbosity_msg=verbosity_lvl == 0)
        exit(1)
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=verbosity_lvl == 0)
        exit(1)
def workflow_list(ctx: click.Context):
    """List all Argo workflows in the current namespace as a table."""
    try:
        namespace = get_kubectl_current_context_namespace()
        workflows: List[ArgoWorkflow] = ArgoWorkflow.list(namespace=namespace)
        rows = [workflow.cli_representation for workflow in workflows]
        click.echo(tabulate(rows, headers=HEADERS, tablefmt=TBLT_TABLE_FORMAT))
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=True)
        exit(1)
def cancel(state: State, workflow_name: str):
    """Delete a single Argo workflow selected by name."""
    try:
        namespace = get_kubectl_current_context_namespace()
        workflow: ArgoWorkflow = ArgoWorkflow.get(name=workflow_name, namespace=namespace)
        if not workflow:
            click.echo(Texts.NOT_FOUND_MSG.format(workflow_name=workflow_name))
            exit(0)

        with spinner(text=Texts.PROGRESS_MSG.format(workflow_name=workflow_name)):
            workflow.delete()
        click.echo(Texts.SUCCESS_MSG.format(workflow_name=workflow_name))
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=True)
        exit(1)
def list_unitialized_experiments_in_cli(verbosity_lvl: int, all_users: bool, name: str,
                                        headers: List[str],
                                        listed_runs_kinds: List[RunKinds] = None,
                                        count: int = None, brief: bool = False):
    """
    Display a list of experiments that are still being created (have no runs yet) in the cli.

    :param verbosity_lvl: level at which error messages should be logged or displayed
    :param all_users: whether to display experiments regardless of their owner or not
    :param name: regular expression to which names of the shown experiments have to match
    :param headers: headers which will be displayed on top of a table shown in the cli
    :param listed_runs_kinds: kinds of runs taken into account; defaults to training and jupyter
    :param count: number of rows displayed on a list. If not given - content of a list is not limited
    :param brief: currently unused - kept for interface compatibility with list_runs_in_cli
    """
    if not listed_runs_kinds:
        listed_runs_kinds = [RunKinds.TRAINING, RunKinds.JUPYTER]
    try:
        namespace = None if all_users else get_kubectl_current_context_namespace()

        creating_experiments = Experiment.list(namespace=namespace,
                                               state=ExperimentStatus.CREATING,
                                               run_kinds_filter=listed_runs_kinds,
                                               name_filter=name)
        runs = Run.list(namespace=namespace, name_filter=name,
                        run_kinds_filter=listed_runs_kinds)

        # Get Experiments without associated Runs (set comprehension instead
        # of a manual add-in-a-loop).
        names_of_experiment_with_runs = {run.experiment_name for run in runs}

        uninitialized_experiments = [experiment for experiment in creating_experiments
                                     if experiment.name not in names_of_experiment_with_runs]

        displayed_items_count = count if count else len(uninitialized_experiments)
        click.echo(tabulate([uninitialized_experiment_cli_representation(experiment)
                             for experiment in uninitialized_experiments][-displayed_items_count:],
                            headers=headers, tablefmt="orgtbl"))
    except InvalidRegularExpressionError:
        handle_error(logger, Texts.INVALID_REGEX_ERROR_MSG, Texts.INVALID_REGEX_ERROR_MSG,
                     add_verbosity_msg=verbosity_lvl == 0)
        exit(1)
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=verbosity_lvl == 0)
        exit(1)
def modify_draft_toml(experiment_folder: str, registry: str):
    """Point an experiment's draft.toml at the current namespace and registry."""
    log.debug("Modify draft.toml - start")
    toml_path = os.path.join(experiment_folder, "draft.toml")
    temp_path = os.path.join(experiment_folder, "draft_temp.toml")
    namespace = k8s_info.get_kubectl_current_context_namespace()

    with open(toml_path, "r") as source:
        config = toml.load(source)

    log.debug(config["environments"])
    development = config["environments"]["development"]
    development["namespace"] = namespace
    development["registry"] = registry
    development["wait"] = False

    # Write to a temp file first, then move it over the original.
    with open(temp_path, "w") as target:
        toml.dump(config, target)
    shutil.move(temp_path, toml_path)
    log.debug("Modify draft.toml - end")
def submit(state: State, workflow_path: str):
    """Create an Argo workflow from a local yaml spec and show its summary."""
    try:
        workflow: ArgoWorkflow = ArgoWorkflow.from_yaml(workflow_path)
        namespace = get_kubectl_current_context_namespace()
        with spinner(text=Texts.PROGRESS_MSG):
            workflow.create(namespace=namespace)
        # Set namespace, to properly display owner in CLI
        workflow.namespace = namespace
        click.echo(tabulate([workflow.cli_representation],
                            headers=HEADERS, tablefmt=TBLT_TABLE_FORMAT))
    except IOError as load_err:
        load_msg = Texts.LOAD_SPEC_ERROR_MSG.format(msg=str(load_err))
        handle_error(logger, load_msg, load_msg)
        exit(1)
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=True)
        exit(1)
def process(path: str, kind: str, options: Tuple[str, ...]):
    """
    Create an Argo process workflow of the given kind for a saved model.

    :param kind: name of the process kind; must match one of the yaml
        workflow templates in the config directory
    :param path: path to the saved model directory
    :param options: extra parameters forwarded to the workflow
    """
    additional_params_str = " ".join(options)
    kind = kind.lower()
    config_path = Config().config_path
    process_path = f'{config_path}/workflows/processes'
    kinds: List[str] = []
    if os.path.isdir(process_path):
        # Reuse process_path instead of re-building the same path string.
        process_kinds = os.listdir(process_path)
        kinds = [os.path.splitext(file)[0] for file in process_kinds
                 if file.endswith('.yaml')]

    if kind not in kinds:
        click.echo(Texts.WRONG_PROCESS_KIND.format(process=kind, kinds=kinds))
        sys.exit(2)

    try:
        current_namespace = get_kubectl_current_context_namespace()
        process_workflow = ArgoWorkflow.from_yaml(f'{process_path}/{kind}.yaml')
        process_workflow.parameters = {
            'cluster-registry-address': NAUTAConfigMap().registry,
            'saved-model-dir-path': path,
            'additional-params': additional_params_str
        }
        process_workflow.create(namespace=current_namespace)
    except Exception:
        # BUG FIX: the message previously said "export workflow" - a copy-paste
        # from the export command; this command creates a process workflow.
        error_msg = 'Failed to create process workflow.'
        click.echo(error_msg)
        logger.exception(error_msg)
        sys.exit(1)

    click.echo(f'Successfully created process workflow: {process_workflow.name}')
def get_logs(experiment_name: str, min_severity: SeverityLevel, start_date: str,
             end_date: str, pod_ids: str, pod_status: PodStatus, match: str,
             output: bool, pager: bool, follow: bool, runs_kinds: List[RunKinds],
             instance_type: str):
    """ Show logs for a given experiment. """
    # Exactly one of experiment_name / match has to be provided.
    if experiment_name and match:
        handle_error(user_msg=Texts.NAME_M_BOTH_GIVEN_ERROR_MSG.format(
            instance_type=instance_type))
        exit(1)
    elif not experiment_name and not match:
        handle_error(user_msg=Texts.NAME_M_NONE_GIVEN_ERROR_MSG.format(
            instance_type=instance_type))
        exit(1)

    try:
        search_client = K8sElasticSearchClient(
            host=f'{get_kubectl_host(with_port=True)}'
            f'/api/v1/namespaces/nauta/services/nauta-elasticsearch:nauta/proxy',
            verify_certs=False, use_ssl=True,
            headers={'Authorization': get_api_key()})
        namespace = get_kubectl_current_context_namespace()

        if match:
            experiment_name = match
            name_filter = match
        else:
            name_filter = f'^{experiment_name}$'

        runs = Run.list(namespace=namespace, name_filter=name_filter,
                        run_kinds_filter=runs_kinds)
        if not runs:
            raise ValueError(
                f'Run with given name: {experiment_name} does not exists in namespace {namespace}.')

        pod_ids = pod_ids.split(',') if pod_ids else None  # type: ignore
        # Following only makes sense when not redirecting logs to a file.
        follow_logs = bool(follow and not output)

        if output and len(runs) > 1:
            click.echo(Texts.MORE_EXP_LOGS_MESSAGE)

        for run in runs:
            if not start_date:
                start_date = run.creation_timestamp

            entries = search_client.get_experiment_logs_generator(
                run=run, namespace=namespace, min_severity=min_severity,
                start_date=start_date, end_date=end_date, pod_ids=pod_ids,
                pod_status=pod_status, follow=follow_logs)

            if output:
                save_logs_to_file(logs_generator=entries, instance_name=run.name,
                                  instance_type=instance_type)
            else:
                if len(runs) > 1:
                    click.echo(f'Experiment : {run.name}')
                print_logs(run_logs_generator=entries, pager=pager)
    except ValueError:
        not_exists_msg = Texts.EXPERIMENT_NOT_EXISTS_ERROR_MSG.format(
            experiment_name=experiment_name, instance_type=instance_type.capitalize())
        handle_error(logger, not_exists_msg, not_exists_msg)
        exit(1)
    except Exception:
        other_msg = Texts.LOGS_GET_OTHER_ERROR_MSG.format(instance_type=instance_type)
        handle_error(logger, other_msg, other_msg)
        exit(1)
def submit_experiment(template: str, name: str = None, run_kind: RunKinds = RunKinds.TRAINING,
                      script_location: str = None, script_parameters: Tuple[str, ...] = None,
                      pack_params: List[Tuple[str, str]] = None,
                      parameter_range: List[Tuple[str, str]] = None,
                      parameter_set: Tuple[str, ...] = None,
                      script_folder_location: str = None,
                      env_variables: List[str] = None,
                      requirements_file: str = None) -> Tuple[List[Run], Dict[str, str], str]:
    """
    Submit a new experiment: prepare run environments, create the Experiment
    resource and its Run resources, and submit the packs to the cluster.

    :param template: name of the pack template used for the experiment
    :param name: optional experiment name; used when generating the final name
    :param run_kind: kind of the runs created (training by default)
    :param script_location: local path of the script to run (may be updated
        by environment preparation; the final value is returned)
    :param script_parameters: parameters passed to the script
    :param pack_params: pack parameters, stored as annotations on each Run
    :param parameter_range: -pr specs producing multiple runs
    :param parameter_set: -ps specs producing multiple runs
    :param script_folder_location: local folder uploaded with the script
    :param env_variables: environment variables passed to the pack
    :param requirements_file: optional requirements file for the pack
    :return: (list of prepared runs, per-run error messages, final script location)
    :raises SubmitExperimentError: when any preparation or submission step fails
    :raises K8sProxyCloseError: when the registry proxy or socat tunnel cannot be closed
    """
    # Normalize optional collection arguments to empty containers.
    script_parameters = script_parameters if script_parameters else ()
    parameter_set = parameter_set if parameter_set else ()
    parameter_range = parameter_range if parameter_range else []

    log.debug("Submit experiment - start")
    try:
        namespace = get_kubectl_current_context_namespace()
        # Remember the namespace in a module-level global (presumably read by
        # the Ctrl-C handler registered below - confirm).
        global submitted_namespace
        submitted_namespace = namespace
    except Exception:
        message = Texts.GET_NAMESPACE_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    try:
        with spinner(text=Texts.PREPARING_RESOURCE_DEFINITIONS_MSG):
            # Generate the experiment name/labels and expand -pr/-ps specs
            # into the list of runs to create.
            experiment_name, labels = generate_exp_name_and_labels(script_name=script_location,
                                                                   namespace=namespace, name=name,
                                                                   run_kind=run_kind)
            runs_list = prepare_list_of_runs(experiment_name=experiment_name,
                                             parameter_range=parameter_range,
                                             parameter_set=parameter_set,
                                             template_name=template)
    except SubmitExperimentError as exe:
        log.exception(str(exe))
        raise exe
    except Exception:
        message = Texts.SUBMIT_PREPARATION_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    # Module-level global, like submitted_namespace above.
    global submitted_experiment
    submitted_experiment = experiment_name

    # Ctrl-C handling
    signal.signal(signal.SIGINT, ctrl_c_handler_for_submit)
    signal.signal(signal.SIGTERM, ctrl_c_handler_for_submit)

    try:
        config = Config()

        # start port forwarding to the in-cluster docker registry
        # noinspection PyBroadException
        with K8sProxy(NAUTAAppNames.DOCKER_REGISTRY, port=config.local_registry_port) as proxy:
            # Save port that was actually used in configuration
            if proxy.tunnel_port != config.local_registry_port:
                config.local_registry_port = proxy.tunnel_port

            experiment_run_folders = []  # List of local directories used by experiment's runs
            try:
                # run socat if on Windows or Mac OS
                if get_current_os() in (OS.WINDOWS, OS.MACOS):
                    # noinspection PyBroadException
                    try:
                        with spinner(text=Texts.CLUSTER_CONNECTION_MSG):
                            socat.start(proxy.tunnel_port)
                    except Exception:
                        error_msg = Texts.LOCAL_DOCKER_TUNNEL_ERROR_MSG
                        log.exception(error_msg)
                        raise SubmitExperimentError(error_msg)

                cluster_registry_port = get_app_service_node_port(
                    nauta_app_name=NAUTAAppNames.DOCKER_REGISTRY)

                # prepare environments for all experiment's runs
                for experiment_run in runs_list:
                    # Combine global script parameters with this run's own
                    # parameters (either may be absent).
                    if script_parameters and experiment_run.parameters:
                        current_script_parameters = script_parameters + experiment_run.parameters
                    elif script_parameters:
                        current_script_parameters = script_parameters
                    elif experiment_run.parameters:
                        current_script_parameters = experiment_run.parameters
                    else:
                        current_script_parameters = ""

                    run_folder, script_location, pod_count = \
                        prepare_experiment_environment(experiment_name=experiment_name,
                                                       run_name=experiment_run.name,
                                                       local_script_location=script_location,
                                                       script_folder_location=script_folder_location,  # noqa: E501
                                                       script_parameters=current_script_parameters,
                                                       pack_type=template, pack_params=pack_params,
                                                       local_registry_port=proxy.tunnel_port,
                                                       cluster_registry_port=cluster_registry_port,
                                                       env_variables=env_variables,
                                                       requirements_file=requirements_file)

                    # Set correct pod count
                    if not pod_count or pod_count < 1:
                        raise SubmitExperimentError('Unable to determine pod count: make sure that values.yaml '
                                                    'file in your pack has podCount field with positive integer value.')
                    experiment_run.pod_count = pod_count

                    experiment_run_folders.append(run_folder)
                    script_name = None
                    if script_location is not None:
                        script_name = os.path.basename(script_location)

                    # Prepend script_name parameter to run description only for display purposes.
                    experiment_run.parameters = script_parameters if not experiment_run.parameters \
                        else experiment_run.parameters + script_parameters
                    if experiment_run.parameters and script_name:
                        experiment_run.parameters = (script_name, ) + experiment_run.parameters
                    elif script_name:
                        experiment_run.parameters = (script_name, )
            except SubmitExperimentError as e:
                log.exception(Texts.ENV_CREATION_ERROR_MSG)
                e.message += f' {Texts.ENV_CREATION_ERROR_MSG}'
                raise
            except Exception:
                # any error in this step breaks execution of this command
                message = Texts.ENV_CREATION_ERROR_MSG
                log.exception(message)
                # just in case - remove folders that were created with a success
                for experiment_run_folder in experiment_run_folders:
                    delete_environment(experiment_run_folder)
                # NOTE(review): despite the comment above, this branch does not
                # re-raise, so execution continues into the submission steps
                # below - confirm whether a `raise SubmitExperimentError(message)`
                # is missing here.

            # if ps or pr option is used - first ask whether experiment(s) should be submitted
            if parameter_range or parameter_set:
                click.echo(Texts.CONFIRM_SUBMIT_MSG)
                click.echo(tabulate({RUN_NAME: [run.name for run in runs_list],
                                     RUN_PARAMETERS: ["\n".join(run.parameters) if run.parameters else ""
                                                      for run in runs_list]},
                                    headers=[RUN_NAME, RUN_PARAMETERS], tablefmt="orgtbl"))

                if not click.confirm(Texts.CONFIRM_SUBMIT_QUESTION_MSG, default=True):
                    # User declined - clean up all prepared run folders and stop.
                    for experiment_run_folder in experiment_run_folders:
                        delete_environment(experiment_run_folder)
                    exit()

            # create Experiment model
            # TODO template_name & template_namespace should be filled after Template implementation
            parameter_range_spec = [f'-pr {param_name} {param_value}'
                                    for param_name, param_value in parameter_range]
            parameter_set_spec = [f'-ps {ps_spec}' for ps_spec in parameter_set]
            experiment_parameters_spec = list(script_parameters) + parameter_range_spec + parameter_set_spec
            experiment = experiments_model.Experiment(name=experiment_name, template_name=template,
                                                      parameters_spec=experiment_parameters_spec,
                                                      template_namespace="template-namespace")
            experiment.create(namespace=namespace, labels=labels)

            # submit runs
            run_errors = {}
            for run, run_folder in zip(runs_list, experiment_run_folders):
                try:
                    run.state = RunStatus.QUEUED
                    with spinner(text=Texts.CREATING_RESOURCES_MSG.format(run_name=run.name)):
                        # Add Run object with runKind label and pack params as annotations
                        run.create(namespace=namespace, labels={'runKind': run_kind.value},
                                   annotations={pack_param_name: pack_param_value
                                                for pack_param_name, pack_param_value in pack_params})
                        submitted_runs.append(run)
                        submit_draft_pack(run_folder, namespace=namespace)
                except Exception as exe:
                    # This run failed - remove its folder and try to mark the
                    # Run resource as FAILED; other runs continue.
                    delete_environment(run_folder)
                    try:
                        run.state = RunStatus.FAILED
                        run_errors[run.name] = str(exe)
                        run.update()
                    except Exception as rexe:
                        # update of non-existing run may fail
                        log.debug(Texts.ERROR_DURING_PATCHING_RUN.format(str(rexe)))

            # Delete experiment if no Runs were submitted
            if not submitted_runs:
                click.echo(Texts.SUBMISSION_FAIL_ERROR_MSG)
                delete_k8s_object("experiment", experiment_name)

            # Change experiment status to submitted
            experiment.state = experiments_model.ExperimentStatus.SUBMITTED
            experiment.update()
    except LocalPortOccupiedError as exe:
        click.echo(exe.message)
        raise SubmitExperimentError(exe.message)
    except K8sProxyCloseError:
        log.exception('Error during closing of a proxy for a {}'.format(NAUTAAppNames.DOCKER_REGISTRY))
        raise K8sProxyCloseError(Texts.PROXY_CLOSE_ERROR_MSG)
    except K8sProxyOpenError:
        error_msg = Texts.PROXY_OPEN_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg)
    except SubmitExperimentError:
        raise
    except Exception as exe:
        error_msg = Texts.SUBMIT_OTHER_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg) from exe
    finally:
        # Always attempt to tear down the socat tunnel, even on failure.
        with spinner(text=Texts.CLUSTER_CONNECTION_CLOSING_MSG):
            # noinspection PyBroadException
            try:
                socat.stop()
            except Exception:
                log.exception("Error during closing of a proxy for a local docker-host tunnel")
                raise K8sProxyCloseError(Texts.DOCKER_TUNNEL_CLOSE_ERROR_MSG)
        # remove semaphores from all exp folders
        remove_sempahore(experiment_name)

    log.debug("Submit - finish")
    return runs_list, run_errors, script_location
def interact(ctx: click.Context, name: str, filename: str,
             pack_param: List[Tuple[str, str]], no_launch: bool,
             port_number: int, env: List[str], template: str):
    """
    Starts an interactive session with Jupyter Notebook.

    Either re-attaches to an existing Jupyter experiment (when ``name`` points
    at one) or submits a brand-new one, then waits for its pods and opens the
    notebook through a local proxy.

    :param ctx: click context of the current invocation
    :param name: name of an existing or to-be-created Jupyter experiment
    :param filename: script/notebook file to open inside the session
    :param pack_param: additional pack parameters passed to the submission
    :param no_launch: when True, only the proxy is created - no browser is opened
    :param port_number: local port to use for the proxy tunnel
    :param env: environment variables passed to the experiment
    :param template: name of the pack template used for a new notebook
    """
    current_namespace = get_kubectl_current_context_namespace()

    # Guard against flooding the cluster with notebooks - ask for confirmation
    # once the user already runs more than the accepted number of them.
    jupyters_number = calculate_number_of_running_jupyters(current_namespace)
    if jupyters_number > ACCEPTED_NUMBER_OF_NOTEBOOKS:
        if not click.confirm(
                Texts.TOO_MANY_JUPYTERS.format(
                    jupyter_number=str(jupyters_number))):
            click.echo(Texts.INTERACT_ABORT_MSG)
            sys.exit(0)

    create_new_notebook = True
    jupyter_experiment = None

    if name:
        try:
            jupyter_experiment = Experiment.get(name=name,
                                                namespace=current_namespace)

            # Re-attaching with an explicit file is ambiguous - reject it.
            if jupyter_experiment and filename:
                handle_error(user_msg=Texts.FILENAME_BUT_SESSION_EXISTS)
                sys.exit(1)

            if jupyter_experiment:
                # Recover the script name the session was originally started
                # with (stored as an experiment label).
                metadata = jupyter_experiment.metadata
                if metadata and metadata.get("labels") and metadata.get(
                        "labels").get("script_name"):
                    filename = metadata.get("labels").get("script_name")
        except Exception:
            handle_error(logger, Texts.EXPERIMENT_GET_ERROR_MSG,
                         Texts.EXPERIMENT_GET_ERROR_MSG)
            sys.exit(1)

        # if experiment exists and is not based on jupyter image - we need to ask a user to choose another name
        if jupyter_experiment and jupyter_experiment.template_name not in JUPYTER_NOTEBOOK_TEMPLATES_NAMES:
            handle_error(user_msg=Texts.NAME_ALREADY_USED.format(name=name))
            sys.exit(1)

        # if experiment exists but its state is different than RUNNING - display info about a need of purging of
        # this experiment
        if jupyter_experiment and jupyter_experiment.state not in \
                [ExperimentStatus.SUBMITTED, ExperimentStatus.CREATING]:
            handle_error(
                user_msg=Texts.EXP_WITH_THE_SAME_NAME_MUST_BE_PURGED.format(
                    name=name))
            sys.exit(1)

        # A new experiment will be created - confirm unless --force was given.
        if not jupyter_experiment and (
                not click.get_current_context().obj.force
                and not click.confirm(Texts.CONFIRM_EXPERIMENT_CREATION)):
            sys.exit(0)

        if jupyter_experiment:
            create_new_notebook = False
        else:
            try:
                check_experiment_name(value=name)
            except click.BadParameter as exe:
                handle_error(user_msg=str(exe))
                sys.exit(1)

    # Retries are only needed for a freshly-submitted notebook, whose app may
    # not be reachable immediately after submission.
    number_of_retries = 0
    if create_new_notebook:
        number_of_retries = 5
        try:
            exp_name = name
            if not name and not filename:
                exp_name = generate_name("jup")

            click.echo(Texts.SUBMITTING_EXPERIMENT_USER_MSG)
            runs, runs_errors, filename = submit_experiment(
                run_kind=RunKinds.JUPYTER, script_location=filename,
                script_folder_location=None, template=template,
                name=exp_name, parameter_range=[], parameter_set=(),
                script_parameters=(), pack_params=pack_param,
                env_variables=env)

            click.echo(
                tabulate(
                    {
                        RUN_NAME:
                            [run.cli_representation.name for run in runs],
                        RUN_PARAMETERS:
                            [run.cli_representation.parameters for run in runs],
                        RUN_STATUS:
                            [run.cli_representation.status for run in runs],
                        RUN_MESSAGE:
                            [runs_errors.get(run.name, "") for run in runs]
                    },
                    headers=[
                        RUN_NAME, RUN_PARAMETERS, RUN_STATUS, RUN_MESSAGE
                    ],
                    tablefmt=TBLT_TABLE_FORMAT))
            if runs:
                name = runs[0].name
            else:
                # run wasn't created - error
                raise RuntimeError("Run wasn't created")
        except K8sProxyCloseError as exe:
            handle_error(user_msg=exe.message)
            sys.exit(1)
        except SubmitExperimentError as exe:
            handle_error(
                logger,
                Texts.SUBMIT_ERROR_MSG.format(exception_message=exe.message),
                Texts.SUBMIT_ERROR_MSG.format(exception_message=exe.message))
            sys.exit(1)
        except Exception:
            handle_error(logger, Texts.SUBMIT_OTHER_ERROR_MSG,
                         Texts.SUBMIT_OTHER_ERROR_MSG)
            sys.exit(1)
    else:
        # if jupyter service exists - the system only connects to it
        click.echo(Texts.SESSION_EXISTS_MSG)

    url_end = ""
    if filename:
        # only Jupyter notebooks are opened directly, other files are opened in edit mode
        url_end = f"/notebooks/output/experiment/"
        if jupyter_experiment and filename.endswith(".py"):
            # A .py submitted earlier was converted to a notebook; point the
            # URL at the .ipynb counterpart.
            filename = filename[:filename.index(".py", -3)] + ".ipynb"
        if not filename.endswith(".ipynb"):
            url_end = "/edit/"
        url_end = url_end + Path(filename).name

    # wait until all jupyter pods are ready; the for/else fires only when the
    # pods never reached RUNNING within the allowed number of tries
    for i in range(JUPYTER_CHECK_POD_READY_TRIES):
        try:
            if check_pods_status(run_name=name,
                                 namespace=current_namespace,
                                 status=PodStatus.RUNNING):
                break
        except Exception:
            handle_error(logger, Texts.NOTEBOOK_STATE_CHECK_ERROR_MSG)
            sys.exit(1)
        time.sleep(1)
    else:
        handle_error(user_msg=Texts.NOTEBOOK_NOT_READY_ERROR_MSG)
        sys.exit(1)

    try:
        launch_app(k8s_app_name=NAUTAAppNames.JUPYTER,
                   app_name=name,
                   no_launch=no_launch,
                   number_of_retries=number_of_retries,
                   url_end=url_end,
                   port=port_number)
    except LaunchError as exe:
        handle_error(logger, exe.message, exe.message)
        sys.exit(1)
    except ProxyClosingError:
        handle_error(user_msg=Texts.PROXY_CLOSING_ERROR_MSG)
        sys.exit(1)
    except Exception:
        handle_error(logger, Texts.SESSION_LAUNCH_OTHER_ERROR_MSG,
                     Texts.SESSION_LAUNCH_OTHER_ERROR_MSG)
        sys.exit(1)
def verify(state: State):
    """
    Verifies the local environment: OS support, required external tools
    (kubectl first, then the remaining dependencies), connectivity and port
    forwarding to the cluster. Exits with code 1 on the first hard failure;
    on success, installed dependency versions are persisted to a file.

    :param state: CLI state object; state.verbosity controls how much detail
                  error messages carry
    """
    try:
        with spinner(text=Texts.CHECKING_OS_MSG):
            check_os()
        click.echo(Texts.OS_SUPPORTED_MSG)
    except InvalidOsError as exception:
        handle_error(logger, str(exception), str(exception),
                     add_verbosity_msg=True)
        exit(1)

    dependencies = get_dependency_map()

    # kubectl is verified first and separately - every later check needs it.
    kubectl_dependency_name = 'kubectl'
    kubectl_dependency_spec = dependencies[kubectl_dependency_name]

    with spinner(text=Texts.VERIFYING_DEPENDENCY_MSG.format(
            dependency_name=kubectl_dependency_name)):
        valid, installed_version = check_dependency(
            dependency_name=kubectl_dependency_name,
            dependency_spec=kubectl_dependency_spec)

    supported_versions_sign = '>='
    logger.info(
        Texts.VERSION_CHECKING_MSG.format(
            dependency_name=kubectl_dependency_name,
            installed_version=installed_version,
            supported_versions_sign=supported_versions_sign,
            expected_version=kubectl_dependency_spec.expected_version))

    if valid:
        click.echo(
            Texts.DEPENDENCY_VERIFICATION_SUCCESS_MSG.format(
                dependency_name=kubectl_dependency_name))
    else:
        handle_error(
            logger,
            Texts.KUBECTL_INVALID_VERSION_ERROR_MSG.format(
                installed_version=installed_version,
                supported_versions_sign=supported_versions_sign,
                expected_version=kubectl_dependency_spec.expected_version),  # noqa
            Texts.KUBECTL_INVALID_VERSION_ERROR_MSG,
            add_verbosity_msg=state.verbosity == 0)
        exit(1)

    # kubectl already handled - remove it from the generic loop below.
    del dependencies[kubectl_dependency_name]

    try:
        with spinner(text=Texts.CHECKING_CONNECTION_TO_CLUSTER_MSG):
            check_connection_to_cluster()
        with spinner(text=Texts.CHECKING_PORT_FORWARDING_FROM_CLUSTER_MSG):
            check_port_forwarding()
    except KubectlConnectionError as e:
        handle_error(logger, str(e), str(e),
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)
    except FileNotFoundError:
        handle_error(logger, Texts.KUBECTL_NOT_INSTALLED_ERROR_MSG,
                     Texts.KUBECTL_NOT_INSTALLED_ERROR_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)

    # Admins check cluster-level components, regular users their own namespace.
    try:
        namespace = 'kube-system' if is_current_user_administrator(
        ) else get_kubectl_current_context_namespace()
    except Exception:
        handle_error(logger,
                     Texts.GET_K8S_NAMESPACE_ERROR_MSG,
                     Texts.GET_K8S_NAMESPACE_ERROR_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)

    dependency_versions = {}
    for dependency_name, dependency_spec in dependencies.items():
        try:
            supported_versions_sign = '==' if dependency_spec.match_exact_version else '>='
            with spinner(text=Texts.VERIFYING_DEPENDENCY_MSG.format(
                    dependency_name=dependency_name)):
                valid, installed_version = check_dependency(
                    dependency_name=dependency_name,
                    dependency_spec=dependency_spec,
                    namespace=namespace)
            dependency_versions[dependency_name] = installed_version
            logger.info(
                Texts.VERSION_CHECKING_MSG.format(
                    dependency_name=dependency_name,
                    installed_version=installed_version,
                    supported_versions_sign=supported_versions_sign,
                    expected_version=dependency_spec.expected_version))
            if valid:
                click.echo(
                    Texts.DEPENDENCY_VERIFICATION_SUCCESS_MSG.format(
                        dependency_name=dependency_name))
            else:
                # A wrong version is only a warning here, not a hard failure.
                click.echo(
                    Texts.INVALID_VERSION_WARNING_MSG.format(
                        dependency_name=dependency_name,
                        installed_version=installed_version,
                        supported_versions_sign=supported_versions_sign,
                        expected_version=dependency_spec.expected_version))
        except FileNotFoundError:
            handle_error(logger,
                         Texts.DEPENDENCY_NOT_INSTALLED_ERROR_MSG.format(
                             dependency_name=dependency_name),
                         Texts.DEPENDENCY_NOT_INSTALLED_ERROR_MSG.format(
                             dependency_name=dependency_name),
                         add_verbosity_msg="client" not in dependency_name)
            exit(1)
        except (RuntimeError, ValueError, TypeError):
            handle_error(logger,
                         Texts.DEPENDENCY_VERSION_CHECK_ERROR_MSG.format(
                             dependency_name=dependency_name),
                         Texts.DEPENDENCY_VERSION_CHECK_ERROR_MSG.format(
                             dependency_name=dependency_name),
                         add_verbosity_msg=state.verbosity == 0)
            exit(1)
        except Exception:
            handle_error(logger,
                         Texts.DEPENDENCY_VERIFICATION_OTHER_ERROR_MSG.format(
                             dependency_name=dependency_name),
                         Texts.DEPENDENCY_VERIFICATION_OTHER_ERROR_MSG.format(
                             dependency_name=dependency_name),
                         add_verbosity_msg=state.verbosity == 0)
            exit(1)
    else:
        # This block is entered if all dependencies were validated successfully
        # (any failure above exits the process, so the loop never breaks).
        # Save dependency versions in a file
        save_dependency_versions(dependency_versions)
def launch(state: State, name: str, model_location: str,
           local_model_location: str, model_name: str,
           pack_param: List[Tuple[str, str]], requirements: str):
    """
    Starts a new prediction instance that can be used for performing
    prediction, classification and regression tasks on a trained model.

    :param state: CLI state; state.verbosity controls error verbosity
    :param name: optional name for the instance (generated when empty)
    :param model_location: remote model location
    :param local_model_location: local model location (validated when given)
    :param model_name: optional model name (derived from the path when empty)
    :param pack_param: additional pack parameters
    :param requirements: path to a requirements file for the instance
    """
    # At least one model source must be provided.
    if not model_location and not local_model_location:
        handle_error(user_msg=Texts.MISSING_MODEL_LOCATION_ERROR_MSG.format(
            local_model_location=local_model_location))
        exit(1)

    if local_model_location:
        validate_local_model_location(local_model_location)

    click.echo('Submitting prediction instance.')

    try:
        # Fill in defaults: model name from the last path segment, instance
        # name from the model name plus the inference prefix.
        model_path = (model_location or local_model_location).rstrip('/')
        model_name = model_name or os.path.basename(model_path)
        name = name or generate_name(name=model_name,
                                     prefix=INFERENCE_INSTANCE_PREFIX)

        inference_instance = start_inference_instance(
            name=name,
            model_location=model_location,
            model_name=model_name,
            local_model_location=local_model_location,
            requirements=requirements,
            pack_params=pack_param)
        if inference_instance.state == RunStatus.FAILED:
            raise RuntimeError('Inference instance submission failed.')
    except Exception:
        handle_error(logger, Texts.INSTANCE_START_ERROR_MSG,
                     Texts.INSTANCE_START_ERROR_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)

    cli_repr = inference_instance.cli_representation
    click.echo(
        tabulate([[cli_repr.name, model_location, cli_repr.status]],
                 headers=Texts.TABLE_HEADERS,
                 tablefmt="orgtbl"))

    try:
        namespace = get_kubectl_current_context_namespace()
        auth_header = get_authorization_header(service_account_name=name,
                                               namespace=namespace)
        instance_url = get_inference_instance_url(
            inference_instance=inference_instance, model_name=model_name)
        click.echo(
            Texts.INSTANCE_INFO_MSG.format(
                inference_instance_url=instance_url,
                authorization_header=auth_header))
    except Exception:
        handle_error(logger, Texts.INSTANCE_URL_ERROR_MSG,
                     Texts.INSTANCE_URL_ERROR_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)
def get_logs(experiment_name: str, min_severity: SeverityLevel, start_date: str,
             end_date: str, pod_ids: str, pod_status: PodStatus, match: str,
             output: bool, pager: bool, follow: bool,
             runs_kinds: List[RunKinds], instance_type: str):
    """
    Show logs for a given experiment.

    Exactly one of ``experiment_name`` (exact name) or ``match`` (regex) must
    be provided. Logs are fetched from Elasticsearch through a local proxy,
    then printed (optionally paged/followed) or saved to a file.

    :param experiment_name: exact name of the experiment to show logs for
    :param min_severity: minimal severity of logs (name of a SeverityLevel)
    :param start_date: fetch logs produced after this date; defaults to each
                       run's own creation timestamp
    :param end_date: fetch logs produced before this date
    :param pod_ids: comma-separated list of pod IDs to filter by
    :param pod_status: fetch logs only for pods with this status
    :param match: regex selecting experiments by name (exclusive with name)
    :param output: save logs to a file instead of printing them
    :param pager: display logs in a pager
    :param follow: stream logs continuously (ignored when output is set)
    :param runs_kinds: kinds of runs taken into account when listing
    :param instance_type: label used in user-facing messages
    """
    # check whether we have runs with a given name
    if experiment_name and match:
        handle_error(user_msg=Texts.NAME_M_BOTH_GIVEN_ERROR_MSG.format(instance_type=instance_type))
        exit(1)
    elif not experiment_name and not match:
        handle_error(user_msg=Texts.NAME_M_NONE_GIVEN_ERROR_MSG.format(instance_type=instance_type))
        exit(1)

    try:
        with K8sProxy(NAUTAAppNames.ELASTICSEARCH) as proxy:
            es_client = K8sElasticSearchClient(host="127.0.0.1", port=proxy.tunnel_port,
                                               verify_certs=False, use_ssl=False)
            namespace = get_kubectl_current_context_namespace()
            if match:
                experiment_name = match
                name_filter = match
            else:
                name_filter = f'^{experiment_name}$'
            runs = Run.list(namespace=namespace, name_filter=name_filter, run_kinds_filter=runs_kinds)
            if not runs:
                raise ValueError(f'Run with given name: {experiment_name} does not exists in namespace {namespace}.')

            pod_ids = pod_ids.split(',') if pod_ids else None
            min_severity = SeverityLevel[min_severity] if min_severity else None
            pod_status = PodStatus[pod_status] if pod_status else None
            follow_logs = True if follow and not output else False
            if output and len(runs) > 1:
                click.echo(Texts.MORE_EXP_LOGS_MESSAGE)
            for run in runs:
                # BUGFIX: previously `start_date` itself was overwritten with the
                # first run's creation timestamp, so every subsequent run reused
                # that first timestamp instead of its own creation time. Use a
                # per-run local variable instead of mutating the parameter.
                run_start_date = start_date if start_date else run.creation_timestamp
                run_logs_generator = es_client.get_experiment_logs_generator(run=run, namespace=namespace,
                                                                             min_severity=min_severity,
                                                                             start_date=run_start_date,
                                                                             end_date=end_date,
                                                                             pod_ids=pod_ids,
                                                                             pod_status=pod_status,
                                                                             follow=follow_logs)
                if output:
                    save_logs_to_file(run=run, run_logs_generator=run_logs_generator,
                                      instance_type=instance_type)
                else:
                    if len(runs) > 1:
                        click.echo(f'Experiment : {run.name}')
                    print_logs(run_logs_generator=run_logs_generator, pager=pager)
    except K8sProxyCloseError:
        handle_error(logger, Texts.PROXY_CLOSE_LOG_ERROR_MSG, Texts.PROXY_CLOSE_USER_ERROR_MSG)
        exit(1)
    except LocalPortOccupiedError as exe:
        handle_error(logger, Texts.LOCAL_PORT_OCCUPIED_ERROR_MSG.format(exception_message=exe.message),
                     Texts.LOCAL_PORT_OCCUPIED_ERROR_MSG.format(exception_message=exe.message))
        exit(1)
    except K8sProxyOpenError:
        handle_error(logger, Texts.PROXY_CREATION_ERROR_MSG, Texts.PROXY_CREATION_ERROR_MSG)
        exit(1)
    except ValueError:
        handle_error(logger, Texts.EXPERIMENT_NOT_EXISTS_ERROR_MSG.format(experiment_name=experiment_name,
                                                                          instance_type=instance_type.capitalize()),
                     Texts.EXPERIMENT_NOT_EXISTS_ERROR_MSG.format(experiment_name=experiment_name,
                                                                  instance_type=instance_type.capitalize()))
        exit(1)
    except Exception:
        handle_error(logger, Texts.LOGS_GET_OTHER_ERROR_MSG.format(instance_type=instance_type),
                     Texts.LOGS_GET_OTHER_ERROR_MSG.format(instance_type=instance_type))
        exit(1)
def submit_experiment(
        template: str,
        name: str = None,
        run_kind: RunKinds = RunKinds.TRAINING,
        script_location: str = None,
        script_parameters: Tuple[str, ...] = None,
        pack_params: List[Tuple[str, str]] = None,
        parameter_range: List[Tuple[str, str]] = None,
        parameter_set: Tuple[str, ...] = None,
        script_folder_location: str = None,
        env_variables: List[str] = None,
        requirements_file: str = None
) -> Tuple[List[Run], Dict[str, str], Optional[str]]:
    """
    Submit an experiment: prepare local run environments, create the
    Experiment object, upload sources to the git repo manager, build the
    experiment image, and create/submit all Run objects.

    :param template: name of the pack template to use
    :param name: optional experiment name (generated when not given)
    :param run_kind: kind of the runs created by this experiment
    :param script_location: path to the training script
    :param script_parameters: parameters passed to the script
    :param pack_params: pack parameters (stored as Run annotations)
    :param parameter_range: ``-pr`` style (name, range) parameter definitions
    :param parameter_set: ``-ps`` style parameter-set definitions
    :param script_folder_location: folder submitted alongside the script
    :param env_variables: environment variables for the experiment
    :param requirements_file: pip requirements installed in the image
    :return: (list of runs, map of run name -> submission error message,
              effective script location)
    :raises SubmitExperimentError: on any failure during submission
    :raises K8sProxyCloseError: when the registry proxy cannot be closed
    """
    # Normalize optional collection parameters to empty containers.
    script_parameters: Union[Tuple[str, ...], Tuple[(
    )]] = script_parameters if script_parameters else ()
    parameter_set: Union[Tuple[str, ...],
                         Tuple[()]] = parameter_set if parameter_set else ()
    parameter_range = parameter_range if parameter_range else []
    pack_params = pack_params if pack_params else []

    log.debug("Submit experiment - start")
    try:
        namespace = get_kubectl_current_context_namespace()
        # Recorded globally so the Ctrl-C handler can clean up this namespace.
        global submitted_namespace
        submitted_namespace = namespace
    except Exception:
        message = Texts.GET_NAMESPACE_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    try:
        with spinner(text=Texts.PREPARING_RESOURCE_DEFINITIONS_MSG):
            experiment_name, labels = generate_exp_name_and_labels(
                script_name=script_location,
                namespace=namespace,
                name=name,
                run_kind=run_kind)
            runs_list = prepare_list_of_runs(experiment_name=experiment_name,
                                             parameter_range=parameter_range,
                                             parameter_set=parameter_set,
                                             template_name=template)
    except SubmitExperimentError as exe:
        log.exception(str(exe))
        raise exe
    except Exception:
        message = Texts.SUBMIT_PREPARATION_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    # Recorded globally for the Ctrl-C handler as well.
    global submitted_experiment
    submitted_experiment = experiment_name

    # Ctrl-C handling
    signal.signal(signal.SIGINT, ctrl_c_handler_for_submit)
    signal.signal(signal.SIGTERM, ctrl_c_handler_for_submit)

    try:
        experiment_run_folders = [
        ]  # List of local directories used by experiment's runs
        try:
            cluster_registry_port = get_app_service_node_port(
                nauta_app_name=NAUTAAppNames.DOCKER_REGISTRY)

            # prepare environments for all experiment's runs
            for experiment_run in runs_list:
                # Combine script-level and run-level parameters (either may
                # be absent).
                if script_parameters and experiment_run.parameters:
                    current_script_parameters = script_parameters + experiment_run.parameters
                elif script_parameters:
                    current_script_parameters = script_parameters
                elif experiment_run.parameters:
                    current_script_parameters = experiment_run.parameters
                else:
                    current_script_parameters = None
                run_folder, script_location, pod_count = \
                    prepare_experiment_environment(experiment_name=experiment_name,
                                                   run_name=experiment_run.name,
                                                   local_script_location=script_location,
                                                   script_folder_location=script_folder_location,  # noqa: E501
                                                   script_parameters=current_script_parameters,
                                                   pack_type=template,
                                                   pack_params=pack_params,
                                                   cluster_registry_port=cluster_registry_port,
                                                   env_variables=env_variables,
                                                   requirements_file=requirements_file,
                                                   username=namespace,
                                                   run_kind=run_kind)

                # Set correct pod count
                if not pod_count or pod_count < 1:
                    raise SubmitExperimentError(
                        'Unable to determine pod count: make sure that values.yaml '
                        'file in your pack has podCount field with positive integer value.'
                    )
                experiment_run.pod_count = pod_count

                experiment_run_folders.append(run_folder)
                script_name = None
                if script_location is not None:
                    script_name = os.path.basename(script_location)

                # Prepend script_name parameter to run description only for display purposes.
                experiment_run.parameters = script_parameters if not experiment_run.parameters \
                    else experiment_run.parameters + script_parameters
                if experiment_run.parameters and script_name:
                    experiment_run.parameters = (
                        script_name, ) + experiment_run.parameters
                elif script_name:
                    experiment_run.parameters = (script_name, )
        except SubmitExperimentError as e:
            log.exception(Texts.ENV_CREATION_ERROR_MSG)
            e.message += f' {Texts.ENV_CREATION_ERROR_MSG}'
            raise
        except Exception:
            # any error in this step breaks execution of this command
            message = Texts.ENV_CREATION_ERROR_MSG
            log.exception(message)
            # just in case - remove folders that were created with a success
            for experiment_run_folder in experiment_run_folders:
                delete_environment(experiment_run_folder)

        # if ps or pr option is used - first ask whether experiment(s) should be submitted
        if parameter_range or parameter_set:
            click.echo(Texts.CONFIRM_SUBMIT_MSG)
            click.echo(
                tabulate(
                    {
                        RUN_NAME: [run.name for run in runs_list],
                        RUN_PARAMETERS: [
                            "\n".join(run.parameters) if run.parameters else ""
                            for run in runs_list
                        ]
                    },
                    headers=[RUN_NAME, RUN_PARAMETERS],
                    tablefmt=TBLT_TABLE_FORMAT))
            if not click.confirm(Texts.CONFIRM_SUBMIT_QUESTION_MSG,
                                 default=True):
                for experiment_run_folder in experiment_run_folders:
                    delete_environment(experiment_run_folder)
                exit()

        # create Experiment model
        # TODO template_name & template_namespace should be filled after Template implementation
        parameter_range_spec = [
            f'-pr {param_name} {param_value}'
            for param_name, param_value in parameter_range
        ]
        parameter_set_spec = [f'-ps {ps_spec}' for ps_spec in parameter_set]
        experiment_parameters_spec = list(
            script_parameters) + parameter_range_spec + parameter_set_spec
        template_version = get_template_version(template)
        experiment = experiments_model.Experiment(
            name=experiment_name,
            template_name=template,
            parameters_spec=experiment_parameters_spec,
            template_namespace="template-namespace",
            template_version=template_version)

        experiment.create(namespace=namespace, labels=labels)

        # Upload experiment sources; on failure mark the experiment FAILED
        # (best effort) and abort the submission.
        with spinner('Uploading experiment...'):
            try:
                upload_experiment_to_git_repo_manager(
                    experiments_workdir=get_run_environment_path(''),
                    experiment_name=experiment_name,
                    run_name=runs_list[0].name,
                    username=namespace)
            except Exception:
                log.exception('Failed to upload experiment.')
                try:
                    experiment.state = experiments_model.ExperimentStatus.FAILED
                    experiment.update()
                except Exception:
                    log.exception(
                        f'Failed to set state of {experiment.name} experiment '
                        f'to {experiments_model.ExperimentStatus.FAILED}')
                raise SubmitExperimentError('Failed to upload experiment.')

        # Build the experiment image through an Argo workflow; on failure try
        # to surface the workflow logs, mark the experiment FAILED and abort.
        with spinner('Building experiment image...'):
            try:
                image_build_workflow: ExperimentImageBuildWorkflow = ExperimentImageBuildWorkflow.from_yaml(
                    yaml_template_path=
                    f'{Config().config_path}/workflows/{EXP_IMAGE_BUILD_WORKFLOW_SPEC}',
                    username=namespace,
                    experiment_name=experiment_name)
                image_build_workflow.create(namespace=namespace)
                image_build_workflow.wait_for_completion()
            except Exception:
                error_msg = 'Failed to build experiment image.'
                log.exception(error_msg)
                # Try to get workflow logs
                _debug_workflow_logs(workflow=image_build_workflow,
                                     namespace=namespace)
                if image_build_workflow.name:
                    error_msg += f' Run nctl workflow logs {image_build_workflow.name} command for more details.'
                try:
                    experiment.state = experiments_model.ExperimentStatus.FAILED
                    experiment.update()
                except Exception:
                    log.exception(
                        f'Failed to set state of {experiment.name} experiment '
                        f'to {experiments_model.ExperimentStatus.FAILED}')
                raise SubmitExperimentError(error_msg)

        # submit runs
        run_errors: Dict[str, str] = {}
        for run, run_folder in zip(runs_list, experiment_run_folders):
            try:
                run.state = RunStatus.QUEUED
                with spinner(text=Texts.CREATING_RESOURCES_MSG.format(
                        run_name=run.name)):
                    # Add Run object with runKind label and pack params as annotations
                    run.create(namespace=namespace,
                               labels={'runKind': run_kind.value},
                               annotations={
                                   pack_param_name: pack_param_value
                                   for pack_param_name, pack_param_value in
                                   pack_params
                               })
                    submitted_runs.append(run)
                    submit_draft_pack(run_name=run.name,
                                      run_folder=run_folder,
                                      namespace=namespace)
            except Exception as exe:
                delete_environment(run_folder)
                try:
                    run.state = RunStatus.FAILED
                    run_errors[run.name] = str(exe)
                    run.update()
                except Exception as rexe:
                    # update of non-existing run may fail
                    log.debug(Texts.ERROR_DURING_PATCHING_RUN.format(
                        str(rexe)))

        # Delete experiment if no Runs were submitted
        if not submitted_runs:
            click.echo(Texts.SUBMISSION_FAIL_ERROR_MSG)
            delete_k8s_object("experiment", experiment_name)

        # Change experiment status to submitted
        # NOTE(review): this update runs even when the experiment was just
        # deleted above (no runs submitted) - confirm whether that is intended.
        experiment.state = experiments_model.ExperimentStatus.SUBMITTED
        experiment.update()
    except LocalPortOccupiedError as exe:
        click.echo(exe.message)
        raise SubmitExperimentError(exe.message)
    except K8sProxyCloseError:
        log.exception('Error during closing of a proxy for a {}'.format(
            NAUTAAppNames.DOCKER_REGISTRY))
        raise K8sProxyCloseError(Texts.PROXY_CLOSE_ERROR_MSG)
    except K8sProxyOpenError:
        error_msg = Texts.PROXY_OPEN_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg)
    except SubmitExperimentError:
        raise
    except Exception as exe:
        error_msg = Texts.SUBMIT_OTHER_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg) from exe
    finally:
        # remove semaphores from all exp folders
        remove_sempahore(experiment_name)

    log.debug("Submit - finish")
    return runs_list, run_errors, script_location
def view(context, state: State, experiment_name: str, tensorboard: bool,
         username: str):
    """
    Displays details of an experiment: its run, participating pods with their
    conditions/containers, summed resource requests/limits, and - for pending
    pods - hints about insufficient cluster resources.

    :param context: click context (used to invoke the tensorboard command)
    :param state: CLI state object
    :param experiment_name: name of the experiment to display
    :param tensorboard: when True, also launch tensorboard for the experiment
    :param username: view the experiment of another user (admin use)
    """
    try:
        if username:
            namespace = username
        else:
            namespace = get_kubectl_current_context_namespace()

        run = Run.get(name=experiment_name, namespace=namespace)
        if not run:
            handle_error(user_msg=Texts.EXPERIMENT_NOT_FOUND_ERROR_MSG.format(
                experiment_name=experiment_name))
            exit(2)

        click.echo(
            tabulate([run.cli_representation],
                     headers=EXPERIMENTS_LIST_HEADERS,
                     tablefmt="orgtbl"))

        click.echo(Texts.PODS_PARTICIPATING_LIST_HEADER)

        pods = get_namespaced_pods(label_selector="runName=" + experiment_name,
                                   namespace=namespace)

        tabular_output = []
        containers_resources = []
        pending_pods = []

        for pod in pods:
            # Build a human-readable status from pod conditions; fall back to
            # pod events when the pod has no conditions yet.
            status_string = ""
            if pod.status.conditions:
                for cond in pod.status.conditions:
                    msg = "\n" if not cond.reason else "\n reason: " + \
                        wrap_text(cond.reason, width=POD_CONDITIONS_MAX_WIDTH)
                    msg = msg + ", \n message: " + wrap_text(cond.message, width=POD_CONDITIONS_MAX_WIDTH) \
                        if cond.message else msg
                    status_string += wrap_text(
                        cond.type + ": " + cond.status,
                        width=POD_CONDITIONS_MAX_WIDTH) + msg + "\n"
            else:
                pod_events = get_pod_events(namespace=namespace,
                                            name=pod.metadata.name)
                for event in pod_events:
                    msg = "\n" if not event.reason else "\n reason: " + \
                        wrap_text(event.reason, width=POD_CONDITIONS_MAX_WIDTH)
                    msg = msg + ", \n message: " + wrap_text(event.message, width=POD_CONDITIONS_MAX_WIDTH) \
                        if event.message else msg
                    status_string += msg + "\n"

            if pod.status.phase.upper() == PodStatus.PENDING.value:
                pending_pods.append(pod.metadata.name)

            # Map container name -> state; defaultdict yields None for
            # containers that have no reported status yet.
            container_statuses = defaultdict(lambda: None)
            if pod.status.container_statuses:
                for container_status in pod.status.container_statuses:
                    container_statuses[
                        container_status.name] = container_status.state

            container_details = []
            for container in pod.spec.containers:
                container_description = Texts.CONTAINER_DETAILS_MSG.format(
                    name=container.name,
                    status=container_status_to_msg(
                        container_statuses[container.name]),
                    volumes=container_volume_mounts_to_msg(
                        container.volume_mounts, spaces=2),
                    resources=container_resources_to_msg(container.resources,
                                                         spaces=4))
                container_details.append(container_description)
                containers_resources.append(container.resources)

            container_details = ''.join(container_details)

            tabular_output.append([
                pod.metadata.name,
                wrap_text(pod.metadata.uid, width=UID_MAX_WIDTH, spaces=0),
                status_string, container_details
            ])
        click.echo(
            tabulate(tabular_output,
                     Texts.PODS_TABLE_HEADERS,
                     tablefmt="orgtbl"))

        # Sum up resource requests/limits across all containers.
        try:
            cpu_requests_sum = sum_cpu_resources([
                container_resource.requests["cpu"]
                for container_resource in containers_resources
                if container_resource.requests
                and container_resource.requests.get("cpu")
            ])
            mem_requests_sum = sum_mem_resources([
                container_resource.requests["memory"]
                for container_resource in containers_resources
                if container_resource.requests
                and container_resource.requests.get("memory")
            ])
            cpu_limits_sum = sum_cpu_resources([
                container_resource.limits["cpu"]
                for container_resource in containers_resources
                if container_resource.limits
                and container_resource.limits.get("cpu")
            ])
            mem_limits_sum = sum_mem_resources([
                container_resource.limits["memory"]
                for container_resource in containers_resources
                if container_resource.limits
                and container_resource.limits.get("memory")
            ])
        except ValueError as exception:
            # NOTE(review): this handler does not exit, so the sums below are
            # undefined after a parsing error; the resulting NameError is
            # swallowed by the outer `except Exception` - confirm intended.
            handle_error(
                logger,
                Texts.RESOURCES_SUM_PARSING_ERROR_MSG.format(
                    error_msg=str(exception)),
                Texts.RESOURCES_SUM_PARSING_ERROR_MSG.format(
                    error_msg=str(exception)))

        click.echo(Texts.RESOURCES_SUM_LIST_HEADER)
        click.echo(
            tabulate(list(
                zip(Texts.RESOURCES_SUM_TABLE_ROWS_HEADERS, [
                    cpu_requests_sum, mem_requests_sum, cpu_limits_sum,
                    mem_limits_sum
                ])),
                Texts.RESOURCES_SUM_TABLE_HEADERS,
                tablefmt="orgtbl"))

        if tensorboard:
            click.echo()
            context.invoke(tensorboard_command,
                           experiment_name=[experiment_name])

        if pending_pods:
            click.echo()
            try:
                # Scan pending pods' events for resource-shortage messages.
                cpu = False
                memory = False
                for pod in pending_pods:
                    events_list = get_pod_events(namespace=namespace, name=pod)
                    for event in events_list:
                        if "insufficient cpu" in event.message.lower():
                            cpu = True
                        elif "insufficient memory" in event.message.lower():
                            memory = True
                        if cpu and memory:
                            break
                    if cpu and memory:
                        break

                if not cpu and not memory:
                    exit(0)

                if cpu and memory:
                    resources = "number of cpus and amount of memory"
                elif cpu:
                    resources = "number of cpus"
                else:
                    resources = "amount of memory"

                click.echo(
                    Texts.INSUFFICIENT_RESOURCES_MESSAGE.format(
                        resources=resources))
                click.echo()
                # Show the (up to three) heaviest resource consumers.
                top_cpu_users, top_mem_users = get_highest_usage()
                click.echo(
                    Texts.TOP_CPU_CONSUMERS.format(consumers=", ".join([
                        res.user_name
                        for res in top_cpu_users[0:3 if len(top_cpu_users
                                                            ) > 2 else len(top_cpu_users)]
                    ])))
                click.echo(
                    Texts.TOP_MEMORY_CONSUMERS.format(consumers=", ".join([
                        res.user_name
                        for res in top_mem_users[0:3 if len(top_mem_users
                                                            ) > 2 else len(top_mem_users)]
                    ])))
            except Exception:
                click.echo(Texts.PROBLEMS_WHILE_GATHERING_USAGE_DATA)
                logger.exception(
                    Texts.PROBLEMS_WHILE_GATHERING_USAGE_DATA_LOGS)
    except Exception:
        handle_error(logger, Texts.VIEW_OTHER_ERROR_MSG,
                     Texts.VIEW_OTHER_ERROR_MSG)
        exit(1)
def get_logs(operation_name: str, start_date: str, end_date: str, match: str,
             output: bool, pager: bool, follow: bool):
    """
    Show logs for a given model export operation.

    Exactly one of ``operation_name`` (exact name) or ``match`` (regex) must
    be provided. Logs are fetched from Elasticsearch through a local proxy,
    then printed (optionally paged/followed) or saved to a file.

    :param operation_name: exact name of the operation to show logs for
    :param start_date: fetch logs produced after this date; defaults to each
                       workflow's own start time
    :param end_date: fetch logs produced before this date
    :param match: regex selecting operations by name (exclusive with name)
    :param output: save logs to a file instead of printing them
    :param pager: display logs in a pager
    :param follow: stream logs continuously (ignored when output is set)
    """
    # check whether we have operations with a given name
    if operation_name and match:
        handle_error(user_msg=Texts.NAME_M_BOTH_GIVEN_ERROR_MSG)
        exit(1)
    elif not operation_name and not match:
        handle_error(user_msg=Texts.NAME_M_NONE_GIVEN_ERROR_MSG)
        exit(1)

    try:
        with K8sProxy(NAUTAAppNames.ELASTICSEARCH) as proxy:
            es_client = K8sElasticSearchClient(host="127.0.0.1",
                                               port=proxy.tunnel_port,
                                               verify_certs=False,
                                               use_ssl=False)
            namespace = get_kubectl_current_context_namespace()
            if match:
                operation_name = match
                name_filter = match
            else:
                name_filter = f'^{operation_name}$'
            workflows = ArgoWorkflow.list(namespace=namespace,
                                          name_filter=name_filter)
            if not workflows:
                raise ValueError(
                    f'Operation with given name: {operation_name} does not '
                    f'exists in namespace {namespace}.')

            follow_logs = True if follow and not output else False

            if output and len(workflows) > 1:
                click.echo(Texts.MORE_EXP_LOGS_MESSAGE)

            for workflow in workflows:
                # BUGFIX: previously `start_date` itself was overwritten with
                # the first workflow's start time, so every subsequent workflow
                # reused that value instead of its own start time. Use a
                # per-workflow local variable instead of mutating the parameter.
                workflow_start_date = start_date if start_date else workflow.started_at

                ops_logs_generator = es_client.get_argo_workflow_logs_generator(
                    workflow=workflow,
                    namespace=namespace,
                    start_date=workflow_start_date,
                    end_date=end_date,
                    follow=follow_logs)

                if output:
                    save_logs_to_file(logs_generator=ops_logs_generator,
                                      instance_name=workflow.name,
                                      instance_type="operation")
                else:
                    if len(workflows) > 1:
                        click.echo(f'Operation : {workflow.name}')
                    print_logs(run_logs_generator=ops_logs_generator,
                               pager=pager)
    except K8sProxyCloseError:
        handle_error(logger, Texts.PROXY_CLOSE_LOG_ERROR_MSG,
                     Texts.PROXY_CLOSE_LOG_ERROR_MSG)
        exit(1)
    except LocalPortOccupiedError as exe:
        handle_error(
            logger,
            Texts.LOCAL_PORT_OCCUPIED_ERROR_MSG.format(
                exception_message=exe.message),
            Texts.LOCAL_PORT_OCCUPIED_ERROR_MSG.format(
                exception_message=exe.message))
        exit(1)
    except K8sProxyOpenError:
        handle_error(logger, Texts.PROXY_CREATION_ERROR_MSG,
                     Texts.PROXY_CREATION_ERROR_MSG)
        exit(1)
    except ValueError:
        # BUGFIX: the second format() call used the wrong keyword
        # (experiment_name) for OPERATION_NOT_EXISTS_ERROR_MSG - if the
        # template expects {operation_name}, formatting the user message
        # would itself raise KeyError inside this error handler.
        handle_error(
            logger,
            Texts.OPERATION_NOT_EXISTS_ERROR_MSG.format(
                operation_name=operation_name),
            Texts.OPERATION_NOT_EXISTS_ERROR_MSG.format(
                operation_name=operation_name))
        exit(1)
    except Exception:
        handle_error(logger, Texts.LOGS_GET_OTHER_ERROR_MSG,
                     Texts.LOGS_GET_OTHER_ERROR_MSG)
        exit(1)
def launch(ctx: click.Context, name: str, model_location: str,
           local_model_location: str, model_name: str,
           pack_param: List[Tuple[str, str]], requirements: str,
           runtime: InferenceRuntime):
    """
    Starts a new prediction instance that can be used for performing
    prediction, classification and regression tasks on trained model.

    :param ctx: click context; ctx.obj.verbosity controls error verbosity
    :param name: optional name for the instance (generated when empty)
    :param model_location: remote model location
    :param local_model_location: local model location (validated when given)
    :param model_name: optional model name (derived from the path when empty)
    :param pack_param: additional pack parameters
    :param requirements: path to a requirements file for the instance
    :param runtime: inference runtime selecting the serving template
                    (OVMS vs TF Serving)
    """
    # At least one model source must be provided.
    if not model_location and not local_model_location:
        handle_error(user_msg=Texts.MISSING_MODEL_LOCATION_ERROR_MSG.format(
            local_model_location=local_model_location))
        exit(1)

    if local_model_location:
        validate_local_model_location(local_model_location)

    click.echo('Submitting prediction instance.')
    try:
        # Pick the serving template that matches the requested runtime.
        template = INFERENCE_TEMPLATE_OVMS if InferenceRuntime(runtime) == InferenceRuntime.OVMS else \
            INFERENCE_TEMPLATE_TFSERVING
        # Fill in defaults: model name from the last path segment, instance
        # name from the model name plus the inference prefix.
        model_path = model_location.rstrip(
            '/') if model_location else local_model_location.rstrip('/')
        model_name = model_name if model_name else os.path.basename(model_path)
        name = name if name else generate_name(
            name=model_name, prefix=INFERENCE_INSTANCE_PREFIX)
        inference_instance = start_inference_instance(
            name=name,
            model_location=model_location,
            model_name=model_name,
            local_model_location=local_model_location,
            template=template,
            requirements=requirements,
            pack_params=pack_param)
        if inference_instance.state == RunStatus.FAILED:
            raise RuntimeError('Inference instance submission failed.')
    except Exception:
        handle_error(logger, Texts.INSTANCE_START_ERROR_MSG,
                     Texts.INSTANCE_START_ERROR_MSG,
                     add_verbosity_msg=ctx.obj.verbosity == 0)
        exit(1)

    click.echo(
        tabulate([[
            inference_instance.cli_representation.name, model_location,
            inference_instance.cli_representation.status
        ]],
            headers=Texts.TABLE_HEADERS,
            tablefmt=TBLT_TABLE_FORMAT))

    try:
        namespace = get_kubectl_current_context_namespace()
        authorization_header = get_authorization_header(
            service_account_name=name, namespace=namespace)
        inference_instance_url = get_inference_instance_url(
            inference_instance=inference_instance, model_name=model_name)
        click.echo(
            Texts.INSTANCE_INFO_MSG.format(
                inference_instance_url=inference_instance_url,
                authorization_header=authorization_header))
    except Exception:
        handle_error(logger, Texts.INSTANCE_URL_ERROR_MSG,
                     Texts.INSTANCE_URL_ERROR_MSG,
                     add_verbosity_msg=ctx.obj.verbosity == 0)
        exit(1)

    # wait till pod is ready - no more than 40 seconds
    # The for/else fires only when the pods never became ready in time.
    for _ in range(40):
        pods = get_namespaced_pods(label_selector=f'runName={name}',
                                   namespace=namespace)
        # Ready: all pods Running and every container reports ready.
        if pods and all(pod.status.phase == 'Running' for pod in pods) \
                and all(container.ready for pod in pods
                        for container in pod.status.container_statuses):
            break
        if pods and any(pod.status.phase == 'Failed' for pod in pods):
            handle_error(logger, Texts.INSTANCE_START_ERROR_MSG,
                         Texts.INSTANCE_START_ERROR_MSG,
                         add_verbosity_msg=ctx.obj.verbosity == 0)
            exit(1)
        time.sleep(1)
    else:
        # NOTE(review): exits with code 0 despite reporting not-ready - the
        # instance was submitted, just not confirmed ready; confirm intended.
        handle_error(logger,
                     Texts.PREDICTION_INSTANCE_NOT_READY.format(name=name),
                     Texts.PREDICTION_INSTANCE_NOT_READY.format(name=name),
                     add_verbosity_msg=ctx.obj.verbosity == 0)
        exit(0)