Example #1
0
def cancel(state: State,
           name: str,
           match: str,
           purge: bool,
           pod_ids: str,
           pod_status: str,
           listed_runs_kinds: List[RunKinds] = None):
    """
    Cancels (or, with purge, removes) runs selected by an exact name or by a pattern.

    Exactly one of ``name`` and ``match`` has to be given. When ``pod_ids`` or ``pod_status``
    is set, the work is delegated to cancel_pods_mode() and the process exits with code 0
    afterwards. Otherwise the matching runs are listed, the user is asked for a confirmation
    and the runs are cancelled/purged experiment by experiment.

    Problems are reported through handle_error(). The process exits with code 1 when both or
    none of name/match are given, when listing runs fails, when there is nothing to cancel and
    when at least one run could not be cancelled/purged. Declining the confirmation exits
    with code 0.

    :param state: not used in the body of this function
    :param name: name of an experiment or of a run; if no experiment of a listed kind has this
                 name, it is wrapped into ``^name$`` and used as a run name filter
    :param match: value used as the run name filter when ``name`` is not given
    :param purge: when true all listed runs are selected regardless of their state and
                  purge_experiment() is used instead of cancel_experiment()
    :param pod_ids: passed through to cancel_pods_mode()
    :param pod_status: passed through to cancel_pods_mode()
    :param listed_runs_kinds: kinds of runs taken into account; when empty or None, TRAINING
                              and JUPYTER are used
    """
    if not listed_runs_kinds:
        listed_runs_kinds = [RunKinds.TRAINING, RunKinds.JUPYTER]

    # name and match are mutually exclusive ...
    if name and match:
        handle_error(user_msg=Texts.NAME_M_BOTH_GIVEN_ERROR_MSG)
        exit(1)

    # ... and one of them is required
    if not name and not match:
        handle_error(user_msg=Texts.NAME_M_NONE_GIVEN_ERROR_MSG)
        exit(1)

    current_namespace = get_current_namespace()

    # pod mode: any pod filter switches the command to deleting pods instead of runs
    if pod_ids or pod_status:
        if not name:
            name = match

        cancel_pods_mode(namespace=current_namespace,
                         run_name=name,
                         pod_ids=pod_ids,
                         pod_status=pod_status)
        exit(0)

    # True when `name` points at an existing experiment of one of the listed kinds; runs are
    # then looked up by experiment name instead of by run name
    search_for_experiment = False
    exp_to_be_cancelled = None

    if name:
        exp_to_be_cancelled = Experiment.get(namespace=current_namespace,
                                             name=name)
        exp_to_be_cancelled_kind = RunKinds(exp_to_be_cancelled.metadata['labels'].get('runKind')) \
            if exp_to_be_cancelled else None
        # experiments of a kind that is not listed are treated as if they did not exist
        exp_to_be_cancelled = exp_to_be_cancelled if exp_to_be_cancelled_kind in listed_runs_kinds else None

        if exp_to_be_cancelled:
            search_for_experiment = True
        else:
            # no such experiment - anchor the name so that the run name filter matches it exactly
            name = f"^{name}$"
    else:
        name = match

    list_of_all_runs = None

    list_of_applicable_states = [RunStatus.QUEUED, RunStatus.RUNNING]

    # NOTE(review): list_of_applicable_states is only read in the `if not purge` branch below,
    # so this extension currently has no effect - confirm whether it can be dropped
    if purge:
        list_of_applicable_states.extend(
            [RunStatus.FAILED, RunStatus.COMPLETE, RunStatus.CANCELLED])

    try:
        if search_for_experiment:
            list_of_all_runs = Run.list(namespace=current_namespace,
                                        exp_name_filter=[name],
                                        run_kinds_filter=listed_runs_kinds)
        else:
            list_of_all_runs = Run.list(namespace=current_namespace,
                                        name_filter=name,
                                        run_kinds_filter=listed_runs_kinds)
    except Exception:
        # experiment_name_plural (and experiment_name used further down) are not defined in
        # this function - presumably module-level names; verify against the rest of the file
        handle_error(
            logger,
            Texts.LIST_RUNS_ERROR_MSG.format(
                experiment_name_plural=experiment_name_plural),
            Texts.LIST_RUNS_ERROR_MSG.format(
                experiment_name_plural=experiment_name_plural))
        exit(1)

    # Handle cancellation of experiments with no associated Runs
    # NOTE(review): unless cancel_uninitialized_experiment() terminates the process, execution
    # falls through to the "lack of experiments" error right below - confirm
    if exp_to_be_cancelled and not list_of_all_runs:
        cancel_uninitialized_experiment(experiment=exp_to_be_cancelled,
                                        namespace=current_namespace,
                                        purge=purge)

    if not list_of_all_runs:
        handle_error(user_msg=Texts.LACK_OF_EXPERIMENTS_ERROR_MSG.format(
            experiment_name_plural=experiment_name_plural,
            experiment_name=experiment_name))
        exit(1)
    elif not purge and not [
            run for run in list_of_all_runs
            if run.state in [RunStatus.QUEUED, RunStatus.RUNNING]
    ]:
        # without purge only QUEUED and RUNNING runs can be cancelled
        handle_error(
            user_msg=Texts.LACK_OF_EXP_TO_BE_CANCELLED_ERROR_MSG.format(
                experiment_name_plural=experiment_name_plural))
        exit(1)

    # runs that are going to be cancelled/purged and names of runs that are skipped
    list_of_runs_to_be_deleted = []
    names_of_cancelled_runs = []

    if not purge:
        # split runs into those in an applicable state (QUEUED/RUNNING here) and the rest
        for run in list_of_all_runs:
            if run.state in list_of_applicable_states:
                list_of_runs_to_be_deleted.append(run)
            else:
                names_of_cancelled_runs.append(run.name)

        # NOTE(review): this branch looks unreachable when the guard above exits the process,
        # because that guard tests the same QUEUED/RUNNING condition
        if not list_of_runs_to_be_deleted:
            handle_error(
                user_msg=Texts.EXPERIMENTS_ALREADY_CANCELLED_ERROR_MSG.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=Texts.
                    DELETE_OPERATION["deleted"] if experiment_name_plural ==
                    'pods' else Texts.CANCEL_OPERATION["cancelled"]))
            exit(1)
        elif len(list_of_runs_to_be_deleted) != len(list_of_all_runs):
            # only a part of the runs can be cancelled - show both groups
            click.echo(
                Texts.ALREADY_CANCELLED_LIST_HEADER.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=Texts.DELETE_OPERATION["deleted"]
                    if experiment_name_plural == 'pods' else
                    Texts.CANCEL_OPERATION["cancelled"]))
            # from here on the loop variable `name` shadows the `name` parameter, which is
            # not read again below
            for name in names_of_cancelled_runs:
                click.echo(f"     - {name}")
            click.echo(
                Texts.CAN_BE_CANCELLED_LIST_HEADER.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=Texts.DELETE_OPERATION["deleted"]
                    if experiment_name_plural == 'pods' else
                    Texts.CANCEL_OPERATION["cancelled"]))
            for name in list_of_runs_to_be_deleted:
                click.echo(f"     - {name.name}")
        else:
            # every listed run can be cancelled
            click.echo(
                Texts.WILL_BE_CANCELLED_LIST_HEADER.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=Texts.DELETE_OPERATION["deleted"]
                    if experiment_name_plural == 'pods' else
                    Texts.CANCEL_OPERATION["cancelled"]))
            for name in list_of_runs_to_be_deleted:
                click.echo(f"     - {name.name}")
    else:
        # purge takes every listed run, whatever its state is
        list_of_runs_to_be_deleted = list_of_all_runs
        click.echo(
            Texts.WILL_BE_PURGED_LIST_HEADER.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=Texts.DELETE_OPERATION["deleted"]
                if experiment_name_plural == 'pods' else
                Texts.CANCEL_OPERATION["cancelled"]))
        for name in list_of_runs_to_be_deleted:
            click.echo(f"     - {name.name}")

    # nothing is changed without an explicit confirmation; declining exits with code 0
    if not click.confirm(
            Texts.CONFIRM_CANCEL_MSG.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=Texts.DELETE_OPERATION["deletion"]
                if experiment_name_plural == 'pods' else
                Texts.CANCEL_OPERATION["cancellation"])):
        handle_error(user_msg=Texts.CANCELLATION_ABORTED_MSG.format(
            experiment_name_plural=experiment_name_plural,
            operation_word=Texts.
            DELETE_OPERATION["deletion"] if experiment_name_plural ==
            'pods' else Texts.CANCEL_OPERATION["cancellation"]))
        exit(0)

    # group runs by experiments
    exp_with_runs = defaultdict(list)

    for run in list_of_runs_to_be_deleted:
        exp_with_runs[run.experiment_name].append(run)

    deleted_runs = []
    not_deleted_runs = []

    if purge:
        # Connect to elasticsearch in order to purge run logs
        try:
            with K8sProxy(NAUTAAppNames.ELASTICSEARCH) as proxy:
                es_client = K8sElasticSearchClient(
                    host="127.0.0.1",
                    port=proxy.tunnel_port,
                    verify_certs=False,
                    use_ssl=False,
                    with_admin_privledges=is_current_user_administrator())
                for exp_name, run_list in exp_with_runs.items():
                    try:
                        exp_del_runs, exp_not_del_runs = purge_experiment(
                            exp_name=exp_name,
                            runs_to_purge=run_list,
                            namespace=current_namespace,
                            k8s_es_client=es_client)
                        deleted_runs.extend(exp_del_runs)
                        not_deleted_runs.extend(exp_not_del_runs)
                    except Exception:
                        # a failure for one experiment marks all of its runs as not deleted
                        # and the loop goes on with the next experiment
                        handle_error(logger, Texts.OTHER_CANCELLING_ERROR_MSG)
                        not_deleted_runs.extend(run_list)
        except K8sProxyCloseError:
            handle_error(logger, Texts.PROXY_CLOSING_ERROR_LOG_MSG,
                         Texts.PROXY_CLOSING_ERROR_USER_MSG)
            exit(1)
        except LocalPortOccupiedError as exe:
            handle_error(
                logger, Texts.PORT_OCCUPIED_ERROR_LOG_MSG,
                Texts.PORT_OCCUPIED_ERROR_USER_MSG.format(
                    exception_message=exe.message))
            exit(1)
        except K8sProxyOpenError:
            handle_error(logger, Texts.PROXY_OPEN_ERROR_MSG,
                         Texts.PROXY_OPEN_ERROR_MSG)
            exit(1)
    else:
        for exp_name, run_list in exp_with_runs.items():
            try:
                exp_del_runs, exp_not_del_runs = cancel_experiment(
                    exp_name=exp_name,
                    runs_to_cancel=run_list,
                    namespace=current_namespace)
                deleted_runs.extend(exp_del_runs)
                not_deleted_runs.extend(exp_not_del_runs)
            except Exception:
                # a failure for one experiment marks all of its runs as not deleted
                # and the loop goes on with the next experiment
                handle_error(logger, Texts.OTHER_CANCELLING_ERROR_MSG)
                not_deleted_runs.extend(run_list)

    # summary of the operation
    if deleted_runs:
        click.echo(
            Texts.SUCCESSFULLY_CANCELLED_LIST_HEADER.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=Texts.DELETE_OPERATION["deleted"]
                if experiment_name_plural == 'pods' else
                Texts.CANCEL_OPERATION["cancelled"]))
        for run in deleted_runs:
            click.echo(f"     - {run.name}")

    if not_deleted_runs:
        click.echo(
            Texts.FAILED_TO_CANCEL_LIST_HEADER.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=Texts.DELETE_OPERATION["deleted"]
                if experiment_name_plural == 'pods' else
                Texts.CANCEL_OPERATION["cancelled"]))
        for run in not_deleted_runs:
            click.echo(f"     - {run.name}")
        # at least one failure makes the whole command fail
        sys.exit(1)
Example #2
0
def list_runs_in_cli(verbosity_lvl: int,
                     all_users: bool,
                     name: str,
                     listed_runs_kinds: List[RunKinds],
                     runs_list_headers: List[str],
                     with_metrics: bool,
                     status: RunStatus = None,
                     count: int = None,
                     brief: bool = False):
    """
    Print a table with runs that match the given criteria.

    :param verbosity_lvl: verbosity level; a hint about verbosity is added to error messages
                          only when it equals 0
    :param all_users: when true runs from all namespaces are listed, otherwise only those from
                      the namespace of the current kubectl context
    :param name: regular expression that names of listed runs have to match
    :param listed_runs_kinds: kinds of runs that are listed
    :param runs_list_headers: headers of the printed table
    :param with_metrics: when true (and brief is false) complete run representations are shown
    :param status: status that listed runs have to be in
    :param count: when given, only the last `count` rows are printed
    :param brief: when true only name, submission date, submitter and status are printed
    """
    try:
        if all_users:
            namespace = None
        else:
            namespace = get_kubectl_current_context_namespace()

        # The command lists Run resources; runs that are still being initialized are replaced
        # by replace_initializing_runs() before they are displayed.
        listed_runs = Run.list(namespace=namespace,
                               state_list=[status],
                               name_filter=name,
                               run_kinds_filter=listed_runs_kinds)
        runs = replace_initializing_runs(listed_runs)

        table_rows = []
        for run in runs:
            representation = run.cli_representation
            if brief:
                row = (representation.name,
                       representation.submission_date,
                       representation.submitter,
                       representation.status)
            elif with_metrics:
                row = representation
            else:
                row = (
                    representation.name,
                    representation.parameters,  # type: ignore
                    representation.submission_date,
                    representation.start_date,
                    representation.duration,
                    representation.submitter,
                    representation.status,
                    representation.template_name,
                    representation.template_version)
            table_rows.append(row)

        # keep only the tail of the table when a limit was requested
        if count:
            table_rows = table_rows[-count:]

        click.echo(
            tabulate(table_rows,
                     headers=runs_list_headers,
                     tablefmt=TBLT_TABLE_FORMAT))
    except InvalidRegularExpressionError:
        handle_error(logger,
                     Texts.INVALID_REGEX_ERROR_MSG,
                     Texts.INVALID_REGEX_ERROR_MSG,
                     add_verbosity_msg=verbosity_lvl == 0)
        exit(1)
    except Exception:
        handle_error(logger,
                     Texts.OTHER_ERROR_MSG,
                     Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=verbosity_lvl == 0)
        exit(1)
Example #3
0
def purge_experiment(exp_name: str, runs_to_purge: List[Run],
                     k8s_es_client: K8sElasticSearchClient,
                     namespace: str) -> Tuple[List[Run], List[Run]]:
    """
       Purge experiment with a given name by cancelling and removing runs given as a parameter.
       If given experiment contains more runs than is in the list of runs - experiment's state
       remains intact.

       :param exp_name: name of an experiment to which belong runs passed in runs_to_purge parameter
       :param runs_to_purge: list of runs that should be purged, they have to belong to exp_name experiment
       :param k8s_es_client: Kubernetes ElasticSearch client used for removing logs of purged runs
       :param namespace: namespace where experiment is located
       :return: two lists - first contains runs that were purged successfully, second - those which weren't
       :raises RuntimeError: when the experiment cannot be found; errors raised later, inside
                             the main try block, are logged and the lists collected so far
                             are returned instead
       """
    logger.debug(f"Purging {exp_name} experiment ...")

    purged_runs = []
    not_purged_runs = []

    # these lookups and the state update happen before the try block below, so their
    # exceptions propagate to the caller
    experiment = Experiment.get(name=exp_name, namespace=namespace)
    if not experiment:
        raise RuntimeError(Texts.GET_EXPERIMENT_ERROR_MSG)

    experiment_runs = Run.list(namespace=namespace, exp_name_filter=[exp_name])
    # the whole experiment is purged only when the number of its runs equals the number of
    # runs that should be purged
    cancel_whole_experiment = (len(experiment_runs) == len(runs_to_purge))
    if cancel_whole_experiment:
        experiment.state = ExperimentStatus.CANCELLING
        experiment.update()

    try:
        cancelled_runs, not_cancelled_runs = cancel_experiment_runs(
            runs_to_cancel=runs_to_purge, namespace=namespace)
        # not_purged_runs refers to the same list object as not_cancelled_runs from now on,
        # later appends extend that list
        not_purged_runs = not_cancelled_runs

        if cancel_whole_experiment:
            # Delete associated workflows
            experiment_associated_workflows = [
                wf for wf in ArgoWorkflow.list(namespace=namespace)
                if wf.labels.get('experimentName') == experiment.name
            ]
            for wf in experiment_associated_workflows:
                wf.delete()

            # Remove tags from git repo manager
            try:
                delete_exp_tag_from_git_repo_manager(
                    experiment_name=experiment.name,
                    username=namespace,
                    experiments_workdir=get_run_environment_path(''))
            except Exception:
                # report the problem and re-raise - the outer handler ends the purge and
                # returns the lists collected so far
                handle_error(logger, Texts.GIT_REPO_MANAGER_ERROR_MSG,
                             Texts.GIT_REPO_MANAGER_ERROR_MSG)
                raise

        # only runs that were cancelled successfully are purged
        for run in cancelled_runs:
            logger.debug(f"Purging {run.name} run ...")
            click.echo(Texts.PURGING_START_MSG.format(run_name=run.name))
            try:
                with spinner(text=Texts.PURGING_PROGRESS_MSG.format(
                        run_name=run.name)):
                    # purge helm release
                    delete_helm_release(run.name,
                                        namespace=namespace,
                                        purge=True)
                    # delete run
                    kubectl.delete_k8s_object("run", run.name)
                    purged_runs.append(run)
            except Exception as exe:
                not_purged_runs.append(run)
                logger.exception("Error during purging runs.")
                # occurrence of NotFound error may mean, that run has been removed earlier;
                # any other error stops the purge (it is caught by the outer handler)
                if "NotFound" not in str(exe):
                    # experiment_name is not defined in this function - presumably a
                    # module-level name; verify against the rest of the file
                    click.echo(
                        Texts.INCOMPLETE_PURGE_ERROR_MSG.format(
                            experiment_name=experiment_name))
                    raise exe
            try:
                # clear run logs - done only for administrators, failures are just logged
                if is_current_user_administrator():
                    logger.debug(f"Clearing logs for {run.name} run.")
                    with spinner(text=Texts.PURGING_LOGS_PROGRESS_MSG.format(
                            run_name=run.name)):
                        k8s_es_client.delete_logs_for_run(run=run.name,
                                                          namespace=namespace)
            except Exception:
                logger.exception("Error during clearing run logs.")

            # CAN-1099 - docker garbage collector has errors that prevent from correct removal of images
            # try:
            # try to remove images from docker registry
            #    delete_images_for_experiment(exp_name=run.name)
            # except Exception:
            #    logger.exception("Error during removing images.")

        # the experiment object is removed only when all of its runs were purged
        if cancel_whole_experiment and not not_purged_runs:
            try:
                kubectl.delete_k8s_object("experiment", exp_name)
            except Exception:
                # problems during deleting experiments are hidden as if runs were
                # cancelled user doesn't have a possibility to remove them
                logger.exception("Error during purging experiment.")

    except Exception:
        # any error raised above ends the purge; the caller gets what was collected so far
        logger.exception("Error during purging experiment.")
        return purged_runs, not_purged_runs

    return purged_runs, not_purged_runs
Example #4
0
def cancel_pods_mode(namespace: str,
                     run_name: str = None,
                     pod_ids: str = None,
                     pod_status: str = None):
    """
    Delete pods of runs from the given namespace after a confirmation from the user.

    Only pods with a 'runName' label are taken into account. Every filter that is given
    narrows down the selection made by the previous ones.

    :param namespace: namespace in which pods are looked for
    :param run_name: pattern matched with re.match() against the 'runName' label of a pod
    :param pod_ids: comma separated list of names of pods
    :param pod_status: name of a pod status (case-insensitive); a value that is not a valid
                       PodStatus is reported as an error and ends with exit code 1
    """

    def echo_pod_list(header_template, pods):
        # prints a formatted header followed by one line per pod
        click.echo(
            header_template.format(
                experiment_name_plural='pods',
                operation_word=Texts.DELETE_OPERATION["deleted"]))
        for listed_pod in pods:
            click.echo(f"     - {listed_pod.name}")

    all_pods = k8s_pods.list_pods(namespace=namespace)
    selected_pods = [pod for pod in all_pods if 'runName' in pod.labels]

    if run_name:
        selected_pods = [
            pod for pod in selected_pods
            if re.match(run_name, pod.labels['runName'])
        ]

    if pod_ids:
        requested_ids = pod_ids.split(',')
        selected_pods = [
            pod for pod in selected_pods if pod.name in requested_ids
        ]

    if pod_status:
        try:
            requested_status = PodStatus(pod_status.upper())
        except ValueError:
            handle_error(user_msg=Texts.BAD_POD_STATUS_PASSED.format(
                status_passed=pod_status,
                available_statuses=PodStatus.all_members()))
            exit(1)
            return

        selected_pods = [
            pod for pod in selected_pods if pod.status == requested_status
        ]

    if not selected_pods:
        handle_error(user_msg=Texts.LACK_OF_PODS_ERROR_MSG)
        exit(1)

    echo_pod_list(Texts.WILL_BE_PURGED_LIST_HEADER, selected_pods)

    confirmed = click.confirm(
        Texts.CONFIRM_CANCEL_MSG.format(
            experiment_name_plural='pods',
            operation_word=Texts.DELETE_OPERATION["deletion"]))
    if not confirmed:
        handle_error(user_msg=Texts.CANCELLATION_ABORTED_MSG.format(
            experiment_name_plural='pods',
            operation_word=Texts.DELETE_OPERATION["deletion"]))
        exit(0)

    removed_pods = []
    failed_pods = []

    for pod in selected_pods:
        click.echo(Texts.CANCELING_PODS_MSG.format(pod_name=pod.name))
        try:
            pod.delete()
            removed_pods.append(pod)
        except Exception:
            # a failure for one pod is reported and the remaining pods are still processed
            handle_error(logger, Texts.OTHER_POD_CANCELLING_ERROR_MSG)
            failed_pods.append(pod)

    if removed_pods:
        echo_pod_list(Texts.SUCCESSFULLY_CANCELLED_LIST_HEADER, removed_pods)

    if failed_pods:
        echo_pod_list(Texts.FAILED_TO_CANCEL_LIST_HEADER, failed_pods)
        # at least one pod that could not be deleted makes the command fail
        sys.exit(1)
Example #5
0
def submit(ctx: click.Context, script_location: str,
           script_folder_location: str, template: str, name: str,
           pack_param: List[Tuple[str,
                                  str]], parameter_range: List[Tuple[str,
                                                                     str]],
           parameter_set: Tuple[str, ...], env: List[str],
           script_parameters: Tuple[str, ...], requirements: Optional[str]):
    """
    Submit a training experiment and print a table with the state of its runs.

    The script location, pack parameters and template are validated first. When
    script_location is a directory, the default script (and, unless requirements were given,
    the default requirements file) from that directory is used. The process exits with code 1
    when the submission fails or when at least one of the submitted runs is in FAILED state.
    """
    logger.debug(Texts.SUBMIT_START_LOG_MSG)
    validate_script_location(script_location)
    validate_pack_params(pack_param)
    validate_pack(template)

    if os.path.isdir(script_location):
        script_directory = script_location
        if not requirements:
            requirements = get_default_requirements_location(
                script_directory=script_directory)
        script_location = get_default_script_location(
            script_directory=script_directory)

    click.echo(Texts.SUBMIT_START_USER_MSG)

    runs_list = None
    # noinspection PyBroadException
    try:
        runs_list, runs_errors, _ = submit_experiment(
            run_kind=RunKinds.TRAINING,
            script_location=script_location,
            script_folder_location=script_folder_location,
            template=template,
            name=name,
            pack_params=pack_param,
            parameter_range=parameter_range,
            parameter_set=parameter_set,
            script_parameters=script_parameters,
            env_variables=env,
            requirements_file=requirements)
    except K8sProxyCloseError as proxy_error:
        handle_error(user_msg=proxy_error.message)
        click.echo(proxy_error.message)
        # NOTE(review): runs_list is assigned only when submit_experiment() returns, so it
        # is still None whenever this handler runs - confirm whether the check is needed
        if not runs_list:
            exit(1)
    except SubmitExperimentError as submit_error:
        handle_error(user_msg=Texts.SUBMIT_ERROR_MSG.format(
            exception_message=submit_error.message))
        exit(1)
    except Exception:
        handle_error(user_msg=Texts.SUBMIT_OTHER_ERROR_MSG)
        exit(1)

    # display information about status of a training
    status_rows = []
    for run in runs_list:
        status_rows.append(
            (run.cli_representation.name, run.cli_representation.parameters,
             run.cli_representation.status,
             format_run_message(runs_errors.get(run.name, ""))))
    click.echo(
        tabulate(status_rows,
                 headers=[RUN_NAME, RUN_PARAMETERS, RUN_STATUS, RUN_MESSAGE],
                 tablefmt=TBLT_TABLE_FORMAT))

    # if there is at least one FAILED experiment - application has to return exit code != 0
    failed_run_found = False
    for run in runs_list:
        if run.state == RunStatus.FAILED:
            failed_run_found = True
            break

    if failed_run_found:
        handle_error(logger, Texts.FAILED_RUNS_LOG_MSG)
        exit(1)
Example #6
0
def validate_script_location(script_location: str):
    """
    Make sure that script_location points at an existing file or directory.

    When it does not, the problem is reported through handle_error() and the process exits
    with code 2.
    """
    if os.path.isfile(script_location) or os.path.isdir(script_location):
        return

    not_found_msg = Texts.SCRIPT_NOT_FOUND_ERROR_MSG.format(
        script_location=script_location)
    handle_error(user_msg=not_found_msg)
    exit(2)
Example #7
0
def config(state: State, cpu: str, memory: str):
    """
    Update the node configuration file and the packs with new cpu and memory settings.

    Both values are validated (presence, format and the minimal values kept in the
    NAUTAConfigMap) before anything is changed. Then the packs are updated through
    override_values_in_packs() and the new values are written back to the configuration file.
    Every validation or update failure is reported through handle_error() and ends the
    process with exit code 1.

    :param state: not used in the body of this function
    :param cpu: new number of cpus, in a format accepted by validate_cpu_settings()
    :param memory: new amount of memory, in a format accepted by validate_memory_settings()
    """

    if not cpu or not memory:
        handle_error(logger, Texts.MISSING_ARGUMENTS, Texts.MISSING_ARGUMENTS)
        sys.exit(1)

    if not validate_cpu_settings(cpu):
        handle_error(logger, Texts.CPU_WRONG_FORMAT, Texts.CPU_WRONG_FORMAT)
        sys.exit(1)

    if not validate_memory_settings(memory):
        handle_error(logger, Texts.MEMORY_WRONG_FORMAT,
                     Texts.MEMORY_WRONG_FORMAT)
        sys.exit(1)

    configuration = NAUTAConfigMap()

    # new values must not be lower than the minimal ones, if those are defined
    if configuration.minimal_node_memory_amount and \
       convert_k8s_memory_resource(configuration.minimal_node_memory_amount) > convert_k8s_memory_resource(memory):
        error_message = Texts.MEMORY_SETTINGS_TOO_LOW.format(
            memory_value=configuration.minimal_node_memory_amount)
        handle_error(logger, error_message, error_message)
        sys.exit(1)

    if configuration.minimal_node_cpu_number and \
       convert_k8s_cpu_resource(configuration.minimal_node_cpu_number) > convert_k8s_cpu_resource(cpu):
        error_message = Texts.CPU_SETTINGS_TOO_LOW.format(
            cpu_value=configuration.minimal_node_cpu_number)
        handle_error(logger, error_message, error_message)
        sys.exit(1)

    config_file_location = os.path.join(Config().config_path,
                                        NODE_CONFIG_FILENAME)

    if not os.path.isfile(config_file_location):
        handle_error(logger, Texts.MISSING_CONFIG_FILE,
                     Texts.MISSING_CONFIG_FILE)
        sys.exit(1)

    # the file is opened for reading and writing as it is rewritten in place at the end
    with open(config_file_location, 'r+', encoding='utf-8') as config_file, \
            spinner(text=Texts.CONFIG_UPDATE):
        # safe_load() instead of load(): load() without an explicit Loader is rejected by
        # PyYAML 6 and is able to construct arbitrary Python objects in older versions
        config_file_content = yaml.safe_load(config_file)

        # an empty file is parsed to None and other non-mapping documents have no fields to
        # read - both are reported as an incorrect configuration file
        if not isinstance(config_file_content, dict):
            handle_error(logger, Texts.CONFIG_FILE_INCORRECT,
                         Texts.CONFIG_FILE_INCORRECT)
            sys.exit(1)

        cpu_number = str(config_file_content.get(CPU_NUMBER_FIELDNAME))
        memory_amount = str(config_file_content.get(MEMORY_AMOUNT_FIELDNAME))
        cpu_system_required = str(
            config_file_content.get(CPU_SYSTEM_REQUIRED_FIELDNAME))
        memory_system_required = str(
            config_file_content.get(MEMORY_SYSTEM_REQUIRED_FIELDNAME))

        # a missing field becomes the "None" string after the conversion above
        if not cpu_number or cpu_number == "None" or not memory_amount or memory_amount == "None":
            handle_error(logger, Texts.CONFIG_FILE_INCORRECT,
                         Texts.CONFIG_FILE_INCORRECT)
            sys.exit(1)

        try:
            override_values_in_packs(
                new_cpu_number=cpu,
                new_memory_amount=memory,
                current_cpu_number=cpu_number,
                current_mem_amount=memory_amount,
                cpu_system_required=cpu_system_required,
                mem_system_required=memory_system_required)
        except Exception:
            logger.exception(Texts.ERROR_DURING_UPDATE)
            handle_error(logger, Texts.ERROR_DURING_UPDATE,
                         Texts.ERROR_DURING_UPDATE)
            sys.exit(1)

        # the configuration file is rewritten only after the packs were updated successfully
        config_file.seek(0)
        config_file.truncate()
        config_file_content[CPU_NUMBER_FIELDNAME] = cpu
        config_file_content[MEMORY_AMOUNT_FIELDNAME] = memory
        yaml.dump(config_file_content,
                  config_file,
                  default_flow_style=False,
                  explicit_start=True)

    click.echo(Texts.SUCCESS_MESSAGE)
Example #8
0
def launch(state: State, name: str, model_location: str,
           local_model_location: str, model_name: str,
           pack_param: List[Tuple[str, str]], requirements: str):
    """
    Start a new prediction instance for a trained model and print how to reach it.

    One of model_location and local_model_location has to be given. When model_name or name
    is not given, the model name is taken from the last element of the model path and the
    instance name is generated from the model name. The process exits with code 1 when the
    location is missing, the instance cannot be started or its url cannot be obtained.
    """
    if not (model_location or local_model_location):
        handle_error(user_msg=Texts.MISSING_MODEL_LOCATION_ERROR_MSG.format(
            local_model_location=local_model_location))
        exit(1)

    if local_model_location:
        validate_local_model_location(local_model_location)

    click.echo('Submitting prediction instance.')
    try:
        if model_location:
            model_path = model_location.rstrip('/')
        else:
            model_path = local_model_location.rstrip('/')

        if not model_name:
            model_name = os.path.basename(model_path)

        if not name:
            name = generate_name(name=model_name,
                                 prefix=INFERENCE_INSTANCE_PREFIX)

        inference_instance = start_inference_instance(
            name=name,
            model_location=model_location,
            model_name=model_name,
            local_model_location=local_model_location,
            requirements=requirements,
            pack_params=pack_param)
        # a failed submission is reported in the same way as any other start problem
        if inference_instance.state == RunStatus.FAILED:
            raise RuntimeError('Inference instance submission failed.')
    except Exception:
        handle_error(logger,
                     Texts.INSTANCE_START_ERROR_MSG,
                     Texts.INSTANCE_START_ERROR_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)

    instance_row = [
        inference_instance.cli_representation.name, model_location,
        inference_instance.cli_representation.status
    ]
    click.echo(
        tabulate([instance_row],
                 headers=Texts.TABLE_HEADERS,
                 tablefmt=TBLT_TABLE_FORMAT))

    try:
        namespace = get_kubectl_current_context_namespace()
        authorization_header = get_authorization_header(
            service_account_name=name, namespace=namespace)
        inference_instance_url = get_inference_instance_url(
            inference_instance=inference_instance, model_name=model_name)
        instance_info = Texts.INSTANCE_INFO_MSG.format(
            inference_instance_url=inference_instance_url,
            authorization_header=authorization_header)
        click.echo(instance_info)
    except Exception:
        handle_error(logger,
                     Texts.INSTANCE_URL_ERROR_MSG,
                     Texts.INSTANCE_URL_ERROR_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)
Example #9
0
def validate_local_model_location(local_model_location: str):
    """
    Make sure that local_model_location points at an existing directory.

    When it does not, the problem is reported through handle_error() and the process exits
    with code 2.
    """
    if os.path.isdir(local_model_location):
        return

    not_found_msg = Texts.MODEL_DIR_NOT_FOUND_ERROR_MSG.format(
        local_model_location=local_model_location)
    handle_error(user_msg=not_found_msg)
    exit(2)
Example #10
0
def cancel(ctx: click.Context,
           name: str,
           match: str,
           purge: bool,
           pod_ids: str,
           pod_status: str,
           listed_runs_kinds: List[RunKinds] = None):
    """
    Cancels chosen experiments based on a name provided as a parameter.

    Exactly one of ``name`` and ``match`` has to be given. Matching runs are
    listed, the user is asked for a confirmation (unless ``ctx.obj.force`` is
    set) and the runs are then cancelled - or purged, when ``purge`` is set -
    experiment by experiment.

    :param ctx: click context; ``ctx.obj.force`` is read to decide whether the
        confirmation prompt is skipped
    :param name: name of an experiment or of a run to cancel; mutually
        exclusive with ``match``
    :param match: filter handed over to ``Run.list`` as ``name_filter``;
        mutually exclusive with ``name``
    :param purge: if set, runs in FAILED, COMPLETE and CANCELLED states are
        selected as well and ``purge_experiment`` is used instead of
        ``cancel_experiment``
    :param pod_ids: if this or ``pod_status`` is given, the whole operation is
        delegated to ``cancel_pods_mode``
    :param pod_status: see ``pod_ids``
    :param listed_runs_kinds: kinds of runs taken into account; TRAINING and
        JUPYTER are used if not given

    The process is terminated with exit code 1 in case of wrong arguments, an
    error while listing runs, lack of runs to work on or when at least one run
    could not be cancelled. It is terminated with exit code 0 after the pod
    mode has finished and when the user declines the confirmation.
    """
    if not listed_runs_kinds:
        listed_runs_kinds = [RunKinds.TRAINING, RunKinds.JUPYTER]

    # exactly one of name / match has to be provided
    if name and match:
        handle_error(user_msg=Texts.NAME_M_BOTH_GIVEN_ERROR_MSG)
        sys.exit(1)

    if not name and not match:
        handle_error(user_msg=Texts.NAME_M_NONE_GIVEN_ERROR_MSG)
        sys.exit(1)

    current_namespace = get_current_namespace()

    # pod mode - hand the request over and finish
    if pod_ids or pod_status:
        if not name:
            name = match

        cancel_pods_mode(namespace=current_namespace,
                         run_name=name,
                         pod_ids=pod_ids,
                         pod_status=pod_status)
        sys.exit(0)

    search_for_experiment = False
    exp_to_be_cancelled = None

    if name:
        exp_to_be_cancelled = Experiment.get(namespace=current_namespace,
                                             name=name)
        exp_to_be_cancelled_kind = RunKinds(exp_to_be_cancelled.metadata['labels'].get('runKind')) \
            if exp_to_be_cancelled else None
        # an experiment of a kind that was not requested is treated as not found
        exp_to_be_cancelled = exp_to_be_cancelled if exp_to_be_cancelled_kind in listed_runs_kinds else None

        if exp_to_be_cancelled:
            search_for_experiment = True
        else:
            # no such experiment - look for runs with exactly this name
            # (assumes name_filter is a regular expression - TODO confirm)
            name = f"^{name}$"
    else:
        name = match

    list_of_all_runs = None

    list_of_applicable_states = [RunStatus.QUEUED, RunStatus.RUNNING]

    if purge:
        list_of_applicable_states.extend(
            [RunStatus.FAILED, RunStatus.COMPLETE, RunStatus.CANCELLED])

    try:
        if search_for_experiment:
            list_of_all_runs = Run.list(namespace=current_namespace,
                                        exp_name_filter=[name],
                                        run_kinds_filter=listed_runs_kinds)
        else:
            list_of_all_runs = Run.list(namespace=current_namespace,
                                        name_filter=name,
                                        run_kinds_filter=listed_runs_kinds)
    except Exception:
        list_runs_error_msg = Texts.LIST_RUNS_ERROR_MSG.format(
            experiment_name_plural=experiment_name_plural)
        handle_error(logger, list_runs_error_msg, list_runs_error_msg)
        sys.exit(1)

    # Handle cancellation of experiments with no associated Runs
    # NOTE(review): presumably cancel_uninitialized_experiment terminates the
    # process on its own; otherwise the check below reports an error - confirm
    if exp_to_be_cancelled and not list_of_all_runs:
        cancel_uninitialized_experiment(experiment=exp_to_be_cancelled,
                                        purge=purge)

    if not list_of_all_runs:
        handle_error(user_msg=Texts.LACK_OF_EXPERIMENTS_ERROR_MSG.format(
            experiment_name_plural=experiment_name_plural,
            experiment_name=experiment_name))
        sys.exit(1)
    elif not purge and not any(
            run.state in (RunStatus.QUEUED, RunStatus.RUNNING)
            for run in list_of_all_runs):
        handle_error(
            user_msg=Texts.LACK_OF_EXP_TO_BE_CANCELLED_ERROR_MSG.format(
                experiment_name_plural=experiment_name_plural))
        sys.exit(1)

    # wording of all the messages below depends on whether pods or experiments are handled
    is_pods = experiment_name_plural == 'pods'
    operation_word = Texts.DELETE_OPERATION["deleted"] if is_pods \
        else Texts.CANCEL_OPERATION["cancelled"]

    list_of_runs_to_be_deleted: List[Run] = []
    names_of_cancelled_runs: List[str] = []

    if not purge:
        # split runs into those that can still be cancelled and the remaining ones
        for run in list_of_all_runs:
            if run.state in list_of_applicable_states:
                list_of_runs_to_be_deleted.append(run)
            else:
                names_of_cancelled_runs.append(run.name)

        if not list_of_runs_to_be_deleted:
            # defensive - the check above already exits when no run is QUEUED or RUNNING
            handle_error(
                user_msg=Texts.EXPERIMENTS_ALREADY_CANCELLED_ERROR_MSG.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=operation_word))
            sys.exit(1)
        elif len(list_of_runs_to_be_deleted) != len(list_of_all_runs):
            # only a part of the runs can be cancelled - show both groups
            click.echo(
                Texts.ALREADY_CANCELLED_LIST_HEADER.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=operation_word))
            for cancelled_run_name in names_of_cancelled_runs:
                click.echo(f"     - {cancelled_run_name}")
            click.echo(
                Texts.CAN_BE_CANCELLED_LIST_HEADER.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=operation_word))
            for run in list_of_runs_to_be_deleted:
                click.echo(f"     - {run.name}")
        else:
            click.echo(
                Texts.WILL_BE_CANCELLED_LIST_HEADER.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=operation_word))
            for run in list_of_runs_to_be_deleted:
                click.echo(f"     - {run.name}")
    else:
        # purge works on every listed run, regardless of its state
        list_of_runs_to_be_deleted = list_of_all_runs
        click.echo(
            Texts.WILL_BE_PURGED_LIST_HEADER.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=operation_word))
        for run in list_of_runs_to_be_deleted:
            click.echo(f"     - {run.name}")

    if not ctx.obj.force:
        operation_noun = Texts.DELETE_OPERATION["deletion"] if is_pods \
            else Texts.CANCEL_OPERATION["cancellation"]
        if not click.confirm(
                Texts.CONFIRM_CANCEL_MSG.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=operation_noun)):
            handle_error(user_msg=Texts.CANCELLATION_ABORTED_MSG.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=operation_noun))
            sys.exit(0)

    # group runs by experiments
    exp_with_runs: defaultdict = defaultdict(list)

    for run in list_of_runs_to_be_deleted:
        exp_with_runs[run.experiment_name].append(run)

    deleted_runs = []
    not_deleted_runs = []

    if purge:
        # Connect to elasticsearch in order to purge run logs
        # NOTE(review): certificate verification is switched off for this
        # connection - confirm that it is intended
        es_client = K8sElasticSearchClient(
            host=f'{get_kubectl_host(with_port=True)}'
            f'/api/v1/namespaces/nauta/services/nauta-elasticsearch:nauta/proxy',
            verify_certs=False,
            use_ssl=True,
            headers={'Authorization': get_api_key()})
        for exp_name, run_list in exp_with_runs.items():
            try:
                exp_del_runs, exp_not_del_runs = purge_experiment(
                    exp_name=exp_name,
                    runs_to_purge=run_list,
                    namespace=current_namespace,
                    k8s_es_client=es_client)
                deleted_runs.extend(exp_del_runs)
                not_deleted_runs.extend(exp_not_del_runs)
            except Exception:
                # a failure of one experiment must not stop processing of the others
                handle_error(logger, Texts.OTHER_CANCELLING_ERROR_MSG)
                not_deleted_runs.extend(run_list)
    else:
        for exp_name, run_list in exp_with_runs.items():
            try:
                exp_del_runs, exp_not_del_runs = cancel_experiment(
                    exp_name=exp_name,
                    runs_to_cancel=run_list,
                    namespace=current_namespace)
                deleted_runs.extend(exp_del_runs)
                not_deleted_runs.extend(exp_not_del_runs)
            except Exception:
                # a failure of one experiment must not stop processing of the others
                handle_error(logger, Texts.OTHER_CANCELLING_ERROR_MSG)
                not_deleted_runs.extend(run_list)

    if deleted_runs:
        click.echo(
            Texts.SUCCESSFULLY_CANCELLED_LIST_HEADER.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=operation_word))
        for run in deleted_runs:
            click.echo(f"     - {run.name}")

    if not_deleted_runs:
        click.echo(
            Texts.FAILED_TO_CANCEL_LIST_HEADER.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=operation_word))
        for run in not_deleted_runs:
            click.echo(f"     - {run.name}")
        # signal a partial failure to the caller of the CLI
        sys.exit(1)
Example #11
0
def list_unitialized_experiments_in_cli(
        verbosity_lvl: int,
        all_users: bool,
        name: str,
        headers: List[str],
        listed_runs_kinds: List[RunKinds] = None,
        count: int = None,
        brief: bool = False):
    """
    Print a table of experiments in the CREATING state that have no runs yet.

    :param verbosity_lvl: verbosity level; a hint about verbosity is added to
        error messages when it is equal to 0
    :param all_users: if set, items are listed with namespace set to None
        instead of the namespace of the current kubectl context
    :param name: filter passed as name_filter when experiments and runs are listed
    :param headers: headers which will be displayed on top of the table
    :param listed_runs_kinds: kinds of runs taken into account; TRAINING and
        JUPYTER are used if not given
    :param count: number of rows to display, taken from the end of the list.
        If not given - the list is not limited
    :param brief: not used by this function
    """
    if not listed_runs_kinds:
        listed_runs_kinds = [RunKinds.TRAINING, RunKinds.JUPYTER]

    try:
        if all_users:
            namespace = None
        else:
            namespace = get_kubectl_current_context_namespace()

        experiments_being_created = Experiment.list(
            namespace=namespace,
            state=ExperimentStatus.CREATING,
            run_kinds_filter=listed_runs_kinds,
            name_filter=name)
        existing_runs = Run.list(namespace=namespace,
                                 name_filter=name,
                                 run_kinds_filter=listed_runs_kinds)

        # names of experiments that already own at least one run
        experiments_with_runs = {run.experiment_name for run in existing_runs}

        experiments_without_runs = [
            candidate for candidate in experiments_being_created
            if candidate.name not in experiments_with_runs
        ]

        # with no (or zero) count given every row is shown
        rows_limit = count or len(experiments_without_runs)
        table_rows = [
            uninitialized_experiment_cli_representation(candidate)
            for candidate in experiments_without_runs
        ]
        click.echo(
            tabulate(table_rows[-rows_limit:],
                     headers=headers,
                     tablefmt="orgtbl"))
    except InvalidRegularExpressionError:
        handle_error(logger,
                     Texts.INVALID_REGEX_ERROR_MSG,
                     Texts.INVALID_REGEX_ERROR_MSG,
                     add_verbosity_msg=verbosity_lvl == 0)
        exit(1)
    except Exception:
        handle_error(logger,
                     Texts.OTHER_ERROR_MSG,
                     Texts.OTHER_ERROR_MSG,
                     add_verbosity_msg=verbosity_lvl == 0)
        exit(1)