コード例 #1
0
ファイル: user.py プロジェクト: zhcf/nauta
    def list(cls,
             namespace: str = None,
             custom_objects_api: CustomObjectsApi = None):
        """
        Return every user resource in the cluster with its runs attached.

        :param namespace: accepted by the signature but not read by this method
        :param custom_objects_api: custom objects client to query with; when falsy,
            a client is obtained from PlatformResourceApiClient.get()
        :return: List of User objects; each run returned by Run.list is appended to the
            experiment_runs of the user whose name equals the run's namespace
        """
        logger.debug('Listing users.')
        api_client = custom_objects_api or PlatformResourceApiClient.get()

        response = api_client.list_cluster_custom_object(
            group=cls.api_group_name,
            plural=cls.crd_plural_name,
            version=cls.crd_version)

        users = []
        for item in response['items']:
            users.append(User.from_k8s_response_dict(item))

        # Attach experiment runs to their owners.
        # TODO: CHANGE IMPLEMENTATION TO USE AGGREGATED USER DATA AFTER CAN-366
        all_runs = Run.list(custom_objects_api=api_client)
        users_by_name = {}
        for user in users:
            users_by_name[user.name] = user

        for run in all_runs:
            owner = users_by_name.get(run.namespace)
            if not owner:
                # A run whose namespace matches no listed user is only reported.
                logger.error(
                    f"Run exists for nonexisting user {run.namespace}")
                continue
            owner.experiment_runs.append(run)

        return users
コード例 #2
0
def generate_name_for_existing_exps(script_name: str, namespace: str,
                                    run_kind: RunKinds = RunKinds.TRAINING) -> Tuple[Optional[str], Dict[str, str]]:
    """
    Derive the next experiment name for a script that already has experiments.

    Experiments are looked up by the `script_name` label (and the presence of a
    `name_origin` label). The `name_origin` of the most recently created one is reused
    and suffixed with a counter equal to the number of runs belonging to all experiments
    sharing that origin, plus one.

    :param script_name: value of the `script_name` label to search for
    :param namespace: namespace in which experiments and runs are looked up
    :param run_kind: run kind forwarded to prepare_label
    :return: tuple (calculated name, labels from prepare_label), or (None, {}) when no
        matching experiment exists
    """
    exp_list = list_k8s_experiments_by_label(namespace=namespace,
                                             label_selector=f"script_name={script_name},name_origin")
    if not exp_list:
        return None, {}

    # 1. Find newest experiment name (on equal timestamps max() keeps the first one,
    #    which matches a strict "greater than" scan)
    newest_exp = max(exp_list, key=lambda exp: exp.metadata.creation_timestamp)
    name_origin = newest_exp.metadata.labels['name_origin']

    names_of_experiments_with_the_same_origin = [
        exp.metadata.name for exp in exp_list
        if exp.metadata.labels['name_origin'] == name_origin
    ]

    # 2. Count experiments(runs) matching the same origin name of an experiment
    runs_of_exp_list = Run.list(namespace=namespace, exp_name_filter=names_of_experiments_with_the_same_origin)

    counter = len(runs_of_exp_list) + 1 if runs_of_exp_list else 1

    calculated_name = f"{name_origin}-{counter}"
    return calculated_name, prepare_label(script_name, calculated_name, name_origin, run_kind=run_kind)
コード例 #3
0
def test_list_runs_from_namespace(mock_k8s_api_client: CustomObjectsApi):
    """Run.list with a namespace returns the runs from the namespaced API response."""
    # Build a response holding only the first raw run, leaving the shared fixture untouched.
    first_raw_run = LIST_RUNS_RESPONSE_RAW['items'][0]
    single_run_response = {**LIST_RUNS_RESPONSE_RAW, 'items': [first_raw_run]}
    mock_k8s_api_client.list_namespaced_custom_object.return_value = single_run_response

    listed_runs = Run.list(namespace='namespace-1')

    assert [TEST_RUNS[0]] == listed_runs
コード例 #4
0
def list_runs_in_cli(verbosity_lvl: int, all_users: bool, name: str, status: RunStatus,
                     listed_runs_kinds: List[RunKinds], runs_list_headers: List[str], with_metrics: bool,
                     count: int = None, brief: bool = False):
    """
    Print a table of the selected runs to the cli.

    :param verbosity_lvl: verbosity level; at 0 the error handler is asked to add a verbosity hint
    :param all_users: when true runs from all namespaces are listed, otherwise only runs
        from the namespace of the current kubectl context
    :param name: regular expression which names of the listed runs have to match
    :param status: name of a RunStatus member to filter by; falsy means no status filter
    :param listed_runs_kinds: kinds of runs that will be listed
    :param runs_list_headers: headers displayed on top of the table
    :param with_metrics: when true (and brief is false) full run representations are shown
    :param count: number of trailing rows to display; when not given the list is not limited
    :param brief: when true only name, submission date, submitter and status are printed
    """

    try:
        if all_users:
            namespace = None
        else:
            namespace = get_kubectl_current_context_namespace()
        status = RunStatus[status] if status else None

        # The command lists Run resources rather than Experiment resources, with one
        # exception - for a run that is being initialized data of its experiment is shown.
        listed_runs = Run.list(namespace=namespace, state_list=[status], name_filter=name,
                               run_kinds_filter=listed_runs_kinds)
        runs = replace_initializing_runs(listed_runs)
        representations = [run.cli_representation for run in runs]

        if brief:
            rows = [(rep.name, rep.submission_date, rep.submitter, rep.status)
                    for rep in representations]
        elif with_metrics:
            rows = representations
        else:
            rows = [(rep.name, rep.parameters, rep.submission_date, rep.start_date, rep.duration,
                     rep.submitter, rep.status, rep.template_name)
                    for rep in representations]

        if count:
            # Keep only the last `count` rows.
            rows = rows[-count:]
        click.echo(tabulate(rows, headers=runs_list_headers, tablefmt="orgtbl"))
    except InvalidRegularExpressionError:
        handle_error(logger, Texts.INVALID_REGEX_ERROR_MSG, Texts.INVALID_REGEX_ERROR_MSG,
                     add_verbosity_msg=verbosity_lvl == 0)
        exit(1)
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG, add_verbosity_msg=verbosity_lvl == 0)
        exit(1)
コード例 #5
0
def cancel_experiment(exp_name: str, runs_to_cancel: List[Run],
                      namespace: str) -> Tuple[List[Run], List[Run]]:
    """
    Cancel the given runs of an experiment and, when they are all of its not yet
    cancelled runs, move the experiment itself through CANCELLING to CANCELLED.

    :param exp_name: name of the experiment the runs belong to
    :param runs_to_cancel: runs that should be cancelled; they have to belong to exp_name
    :param namespace: namespace where the experiment is located
    :return: two lists - runs cancelled successfully and runs that were not cancelled
    :raises RuntimeError: when the experiment cannot be found
    """
    logger.debug(f"Cancelling {exp_name} experiment ...")

    cancelled: List[Run] = []
    not_cancelled: List[Run] = []

    experiment = Experiment.get(name=exp_name, namespace=namespace)
    if not experiment:
        raise RuntimeError(Texts.GET_EXPERIMENT_ERROR_MSG)

    remaining_runs = Run.list(namespace=namespace,
                              exp_name_filter=[exp_name],
                              excl_state=RunStatus.CANCELLED)
    # The experiment's state changes only when the request covers all of its
    # runs that are not cancelled yet.
    covers_whole_experiment = len(remaining_runs) == len(runs_to_cancel)
    if covers_whole_experiment:
        experiment.state = ExperimentStatus.CANCELLING
        experiment.update()

    try:
        cancelled, not_cancelled = cancel_experiment_runs(
            runs_to_cancel=runs_to_cancel, namespace=namespace)

        if covers_whole_experiment and not not_cancelled:
            try:
                experiment.state = ExperimentStatus.CANCELLED
                experiment.update()
            except Exception:
                # Failures here are only logged - the runs are already cancelled
                # and the user has no way of removing them.
                logger.exception(
                    "Error during cancelling Experiment resource.")
    except Exception:
        # Fall through and report whatever was collected so far.
        logger.exception("Error during cancelling experiment.")

    return cancelled, not_cancelled
コード例 #6
0
def list_unitialized_experiments_in_cli(verbosity_lvl: int, all_users: bool,
                                        name: str, headers: List[str], listed_runs_kinds: List[RunKinds] = None,
                                        count: int = None, brief: bool = False):
    """
    Print a table of experiments in CREATING state that have no associated run.

    :param verbosity_lvl: verbosity level; at 0 the error handler is asked to add a verbosity hint
    :param all_users: when true experiments from all namespaces are listed, otherwise only
        those from the namespace of the current kubectl context
    :param name: regular expression which names of the listed items have to match
    :param headers: headers displayed on top of the table
    :param listed_runs_kinds: kinds of runs taken into account; TRAINING and JUPYTER when not given
    :param count: number of trailing rows to display; when not given the list is not limited
    :param brief: not read by this function
    """

    listed_runs_kinds = listed_runs_kinds or [RunKinds.TRAINING, RunKinds.JUPYTER]

    try:
        if all_users:
            namespace = None
        else:
            namespace = get_kubectl_current_context_namespace()

        experiments_being_created = Experiment.list(namespace=namespace,
                                                    state=ExperimentStatus.CREATING,
                                                    run_kinds_filter=listed_runs_kinds,
                                                    name_filter=name)
        existing_runs = Run.list(namespace=namespace, name_filter=name, run_kinds_filter=listed_runs_kinds)

        # An experiment counts as uninitialized when no run refers to it.
        experiment_names_with_runs = {run.experiment_name for run in existing_runs}
        uninitialized = [exp for exp in experiments_being_created
                         if exp.name not in experiment_names_with_runs]

        rows_limit = count or len(uninitialized)
        rows = [uninitialized_experiment_cli_representation(exp) for exp in uninitialized]
        click.echo(tabulate(rows[-rows_limit:], headers=headers, tablefmt="orgtbl"))
    except InvalidRegularExpressionError:
        handle_error(logger, Texts.INVALID_REGEX_ERROR_MSG, Texts.INVALID_REGEX_ERROR_MSG,
                     add_verbosity_msg=verbosity_lvl == 0)
        exit(1)
    except Exception:
        handle_error(logger, Texts.OTHER_ERROR_MSG, Texts.OTHER_ERROR_MSG, add_verbosity_msg=verbosity_lvl == 0)
        exit(1)
コード例 #7
0
def get_logs(experiment_name: str, min_severity: SeverityLevel, start_date: str,
             end_date: str, pod_ids: str, pod_status: PodStatus, match: str, output: bool, pager: bool, follow: bool,
             runs_kinds: List[RunKinds], instance_type: str):
    """
    Show logs for a given experiment.

    Exactly one of `experiment_name` and `match` has to be given. Logs are read from
    Elasticsearch through a K8sProxy tunnel, for every run in the current kubectl
    context namespace that matches the name filter.

    :param experiment_name: exact name of the run; wrapped in ^...$ to build the name filter
    :param min_severity: name of a SeverityLevel member, or a falsy value for no minimum
    :param start_date: start of the log period; when falsy each run's own creation
        timestamp is used
    :param end_date: end of the log period, passed to the Elasticsearch client as is
    :param pod_ids: comma separated pod ids, or a falsy value for no pod filter
    :param pod_status: name of a PodStatus member, or a falsy value for no status filter
    :param match: regular expression used as the name filter
    :param output: when true logs are saved to files instead of being printed
    :param pager: passed to print_logs
    :param follow: follow the logs; honoured only when `output` is false
    :param runs_kinds: kinds of runs taken into account
    :param instance_type: name of the instance type used in user messages

    Every handled error is reported with handle_error and ends the process with exit(1).
    """
    # check whether we have runs with a given name
    if experiment_name and match:
        handle_error(user_msg=Texts.NAME_M_BOTH_GIVEN_ERROR_MSG.format(instance_type=instance_type))
        exit(1)
    elif not experiment_name and not match:
        handle_error(user_msg=Texts.NAME_M_NONE_GIVEN_ERROR_MSG.format(instance_type=instance_type))
        exit(1)

    try:
        with K8sProxy(NAUTAAppNames.ELASTICSEARCH) as proxy:
            es_client = K8sElasticSearchClient(host="127.0.0.1", port=proxy.tunnel_port,
                                               verify_certs=False, use_ssl=False)
            namespace = get_kubectl_current_context_namespace()
            if match:
                # experiment_name is reused below in the "not exists" error message
                experiment_name = match
                name_filter = match
            else:
                name_filter = f'^{experiment_name}$'
            runs = Run.list(namespace=namespace, name_filter=name_filter, run_kinds_filter=runs_kinds)
            if not runs:
                raise ValueError(f'Run with given name: {experiment_name} does not exists in namespace {namespace}.')

            pod_ids = pod_ids.split(',') if pod_ids else None
            min_severity = SeverityLevel[min_severity] if min_severity else None
            pod_status = PodStatus[pod_status] if pod_status else None
            follow_logs = bool(follow and not output)

            if output and len(runs) > 1:
                click.echo(Texts.MORE_EXP_LOGS_MESSAGE)

            for run in runs:
                # Resolve the default per run. Overwriting `start_date` here would make
                # the first run's creation timestamp the start date of all later runs.
                run_start_date = start_date if start_date else run.creation_timestamp

                run_logs_generator = es_client.get_experiment_logs_generator(run=run, namespace=namespace,
                                                                             min_severity=min_severity,
                                                                             start_date=run_start_date,
                                                                             end_date=end_date,
                                                                             pod_ids=pod_ids, pod_status=pod_status,
                                                                             follow=follow_logs)

                if output:
                    save_logs_to_file(run=run, run_logs_generator=run_logs_generator, instance_type=instance_type)
                else:
                    if len(runs) > 1:
                        click.echo(f'Experiment : {run.name}')
                    print_logs(run_logs_generator=run_logs_generator, pager=pager)

    except K8sProxyCloseError:
        handle_error(logger, Texts.PROXY_CLOSE_LOG_ERROR_MSG, Texts.PROXY_CLOSE_USER_ERROR_MSG)
        exit(1)
    except LocalPortOccupiedError as exe:
        handle_error(logger, Texts.LOCAL_PORT_OCCUPIED_ERROR_MSG.format(exception_message=exe.message),
                     Texts.LOCAL_PORT_OCCUPIED_ERROR_MSG.format(exception_message=exe.message))
        exit(1)
    except K8sProxyOpenError:
        handle_error(logger, Texts.PROXY_CREATION_ERROR_MSG, Texts.PROXY_CREATION_ERROR_MSG)
        exit(1)
    except ValueError:
        # Raised above when no run matches the filter.
        handle_error(logger, Texts.EXPERIMENT_NOT_EXISTS_ERROR_MSG.format(experiment_name=experiment_name,
                                                                          instance_type=instance_type.capitalize()),
                     Texts.EXPERIMENT_NOT_EXISTS_ERROR_MSG.format(experiment_name=experiment_name,
                                                                  instance_type=instance_type.capitalize()))
        exit(1)
    except Exception:
        handle_error(logger, Texts.LOGS_GET_OTHER_ERROR_MSG.format(instance_type=instance_type),
                     Texts.LOGS_GET_OTHER_ERROR_MSG.format(instance_type=instance_type))
        exit(1)
コード例 #8
0
def cancel(state: State,
           name: str,
           match: str,
           purge: bool,
           pod_ids: str,
           pod_status: str,
           listed_runs_kinds: List[RunKinds] = None):
    """
    Cancel (or, with `purge`, purge) runs selected by an exact name or by a regular expression.

    Exactly one of `name` and `match` has to be given. The user is shown the affected
    runs and asked for confirmation before anything is changed. Most error paths end
    the process through exit()/sys.exit().

    :param state: not read by this function
    :param name: exact name of an experiment or run
    :param match: regular expression matched against run names
    :param purge: when true runs in every state are selected and purged (including their
        logs) instead of being cancelled
    :param pod_ids: passed to cancel_pods_mode; when this or `pod_status` is given only
        pods are handled and the process exits with code 0
    :param pod_status: passed to cancel_pods_mode, see `pod_ids`
    :param listed_runs_kinds: kinds of runs taken into account; TRAINING and JUPYTER when not given

    NOTE(review): `experiment_name` and `experiment_name_plural` are not defined inside
    this function - presumably module-level names; confirm against the full file.
    """
    if not listed_runs_kinds:
        listed_runs_kinds = [RunKinds.TRAINING, RunKinds.JUPYTER]

    # check whether we have runs with a given name
    if name and match:
        handle_error(user_msg=Texts.NAME_M_BOTH_GIVEN_ERROR_MSG)
        exit(1)

    if not name and not match:
        handle_error(user_msg=Texts.NAME_M_NONE_GIVEN_ERROR_MSG)
        exit(1)

    current_namespace = get_current_namespace()

    # Pod mode: only pods of a single run are handled, then the process ends.
    if pod_ids or pod_status:
        if not name:
            name = match

        cancel_pods_mode(namespace=current_namespace,
                         run_name=name,
                         pod_ids=pod_ids,
                         pod_status=pod_status)
        exit(0)

    search_for_experiment = False
    exp_to_be_cancelled = None

    if name:
        # Try to treat `name` as an experiment name first; the experiment is used only
        # when its 'runKind' label is one of the listed kinds.
        exp_to_be_cancelled = Experiment.get(namespace=current_namespace,
                                             name=name)
        exp_to_be_cancelled_kind = RunKinds(exp_to_be_cancelled.metadata['labels'].get('runKind')) \
            if exp_to_be_cancelled else None
        exp_to_be_cancelled = exp_to_be_cancelled if exp_to_be_cancelled_kind in listed_runs_kinds else None

        if exp_to_be_cancelled:
            search_for_experiment = True
        else:
            # No usable experiment - fall back to an exact-match regex on run names.
            name = f"^{name}$"
    else:
        name = match

    list_of_all_runs = None

    list_of_applicable_states = [RunStatus.QUEUED, RunStatus.RUNNING]

    if purge:
        list_of_applicable_states.extend(
            [RunStatus.FAILED, RunStatus.COMPLETE, RunStatus.CANCELLED])

    try:
        if search_for_experiment:
            list_of_all_runs = Run.list(namespace=current_namespace,
                                        exp_name_filter=[name],
                                        run_kinds_filter=listed_runs_kinds)
        else:
            list_of_all_runs = Run.list(namespace=current_namespace,
                                        name_filter=name,
                                        run_kinds_filter=listed_runs_kinds)
    except Exception:
        handle_error(
            logger,
            Texts.LIST_RUNS_ERROR_MSG.format(
                experiment_name_plural=experiment_name_plural),
            Texts.LIST_RUNS_ERROR_MSG.format(
                experiment_name_plural=experiment_name_plural))
        exit(1)

    # Handle cancellation of experiments with no associated Runs
    # NOTE(review): unless cancel_uninitialized_experiment ends the process itself,
    # execution continues into the "lack of experiments" error below - confirm.
    if exp_to_be_cancelled and not list_of_all_runs:
        cancel_uninitialized_experiment(experiment=exp_to_be_cancelled,
                                        namespace=current_namespace,
                                        purge=purge)

    if not list_of_all_runs:
        handle_error(user_msg=Texts.LACK_OF_EXPERIMENTS_ERROR_MSG.format(
            experiment_name_plural=experiment_name_plural,
            experiment_name=experiment_name))
        exit(1)
    elif not purge and not [
            run for run in list_of_all_runs
            if run.state in [RunStatus.QUEUED, RunStatus.RUNNING]
    ]:
        # Without purge there must be at least one QUEUED or RUNNING run to cancel.
        handle_error(
            user_msg=Texts.LACK_OF_EXP_TO_BE_CANCELLED_ERROR_MSG.format(
                experiment_name_plural=experiment_name_plural))
        exit(1)

    # Runs selected for the operation and names of runs skipped because of their state.
    list_of_runs_to_be_deleted: List[Run] = []
    names_of_cancelled_runs: List[str] = []

    if not purge:
        # Split runs into those in an applicable state and those that are skipped.
        for run in list_of_all_runs:
            if run.state in list_of_applicable_states:
                list_of_runs_to_be_deleted.append(run)
            else:
                names_of_cancelled_runs.append(run.name)

        # NOTE(review): the QUEUED/RUNNING check above seems to make this empty case
        # unreachable as long as run states do not change in between - confirm.
        if not list_of_runs_to_be_deleted:
            handle_error(
                user_msg=Texts.EXPERIMENTS_ALREADY_CANCELLED_ERROR_MSG.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=Texts.
                    DELETE_OPERATION["deleted"] if experiment_name_plural ==
                    'pods' else Texts.CANCEL_OPERATION["cancelled"]))
            exit(1)
        elif len(list_of_runs_to_be_deleted) != len(list_of_all_runs):
            # Only some runs can be cancelled - show both groups.
            click.echo(
                Texts.ALREADY_CANCELLED_LIST_HEADER.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=Texts.DELETE_OPERATION["deleted"]
                    if experiment_name_plural == 'pods' else
                    Texts.CANCEL_OPERATION["cancelled"]))
            # The loop variable rebinds the `name` parameter; `name` is not read afterwards.
            for name in names_of_cancelled_runs:
                click.echo(f"     - {name}")
            click.echo(
                Texts.CAN_BE_CANCELLED_LIST_HEADER.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=Texts.DELETE_OPERATION["deleted"]
                    if experiment_name_plural == 'pods' else
                    Texts.CANCEL_OPERATION["cancelled"]))
            for run in list_of_runs_to_be_deleted:
                click.echo(f"     - {run.name}")
        else:
            click.echo(
                Texts.WILL_BE_CANCELLED_LIST_HEADER.format(
                    experiment_name_plural=experiment_name_plural,
                    operation_word=Texts.DELETE_OPERATION["deleted"]
                    if experiment_name_plural == 'pods' else
                    Texts.CANCEL_OPERATION["cancelled"]))
            for run in list_of_runs_to_be_deleted:
                click.echo(f"     - {run.name}")
    else:
        # Purge covers every listed run regardless of its state.
        list_of_runs_to_be_deleted = list_of_all_runs
        click.echo(
            Texts.WILL_BE_PURGED_LIST_HEADER.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=Texts.DELETE_OPERATION["deleted"]
                if experiment_name_plural == 'pods' else
                Texts.CANCEL_OPERATION["cancelled"]))
        for run in list_of_runs_to_be_deleted:
            click.echo(f"     - {run.name}")

    # Ask for confirmation; declining ends the process with exit code 0.
    if not click.confirm(
            Texts.CONFIRM_CANCEL_MSG.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=Texts.DELETE_OPERATION["deletion"]
                if experiment_name_plural == 'pods' else
                Texts.CANCEL_OPERATION["cancellation"])):
        handle_error(user_msg=Texts.CANCELLATION_ABORTED_MSG.format(
            experiment_name_plural=experiment_name_plural,
            operation_word=Texts.
            DELETE_OPERATION["deletion"] if experiment_name_plural ==
            'pods' else Texts.CANCEL_OPERATION["cancellation"]))
        exit(0)

    # group runs by experiments
    exp_with_runs: defaultdict = defaultdict(list)

    for run in list_of_runs_to_be_deleted:
        exp_with_runs[run.experiment_name].append(run)

    deleted_runs = []
    not_deleted_runs = []

    if purge:
        # Connect to elasticsearch in order to purge run logs
        try:
            with K8sProxy(NAUTAAppNames.ELASTICSEARCH) as proxy:
                es_client = K8sElasticSearchClient(
                    host="127.0.0.1",
                    port=proxy.tunnel_port,
                    verify_certs=False,
                    use_ssl=False,
                    with_admin_privledges=is_current_user_administrator())
                for exp_name, run_list in exp_with_runs.items():
                    try:
                        exp_del_runs, exp_not_del_runs = purge_experiment(
                            exp_name=exp_name,
                            runs_to_purge=run_list,
                            namespace=current_namespace,
                            k8s_es_client=es_client)
                        deleted_runs.extend(exp_del_runs)
                        not_deleted_runs.extend(exp_not_del_runs)
                    except Exception:
                        # A failure for one experiment marks all of its runs as not
                        # deleted; remaining experiments are still processed.
                        handle_error(logger, Texts.OTHER_CANCELLING_ERROR_MSG)
                        not_deleted_runs.extend(run_list)
        except K8sProxyCloseError:
            handle_error(logger, Texts.PROXY_CLOSING_ERROR_LOG_MSG,
                         Texts.PROXY_CLOSING_ERROR_USER_MSG)
            exit(1)
        except LocalPortOccupiedError as exe:
            handle_error(
                logger, Texts.PORT_OCCUPIED_ERROR_LOG_MSG,
                Texts.PORT_OCCUPIED_ERROR_USER_MSG.format(
                    exception_message=exe.message))
            exit(1)
        except K8sProxyOpenError:
            handle_error(logger, Texts.PROXY_OPEN_ERROR_MSG,
                         Texts.PROXY_OPEN_ERROR_MSG)
            exit(1)
    else:
        for exp_name, run_list in exp_with_runs.items():
            try:
                exp_del_runs, exp_not_del_runs = cancel_experiment(
                    exp_name=exp_name,
                    runs_to_cancel=run_list,
                    namespace=current_namespace)
                deleted_runs.extend(exp_del_runs)
                not_deleted_runs.extend(exp_not_del_runs)
            except Exception:
                # Same policy as in the purge branch: report and continue.
                handle_error(logger, Texts.OTHER_CANCELLING_ERROR_MSG)
                not_deleted_runs.extend(run_list)

    # Summary: successes first, then failures (which end the process with code 1).
    if deleted_runs:
        click.echo(
            Texts.SUCCESSFULLY_CANCELLED_LIST_HEADER.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=Texts.DELETE_OPERATION["deleted"]
                if experiment_name_plural == 'pods' else
                Texts.CANCEL_OPERATION["cancelled"]))
        for run in deleted_runs:
            click.echo(f"     - {run.name}")

    if not_deleted_runs:
        click.echo(
            Texts.FAILED_TO_CANCEL_LIST_HEADER.format(
                experiment_name_plural=experiment_name_plural,
                operation_word=Texts.DELETE_OPERATION["deleted"]
                if experiment_name_plural == 'pods' else
                Texts.CANCEL_OPERATION["cancelled"]))
        for run in not_deleted_runs:
            click.echo(f"     - {run.name}")
        sys.exit(1)
コード例 #9
0
def purge_experiment(exp_name: str, runs_to_purge: List[Run],
                     k8s_es_client: K8sElasticSearchClient,
                     namespace: str) -> Tuple[List[Run], List[Run]]:
    """
    Purge the given runs of an experiment: cancel them, delete their helm releases and
    run resources and (for administrators) their logs. When the given runs are all runs
    of the experiment, its Argo workflows, its git tag and finally the experiment
    resource itself are removed as well; otherwise the experiment is left intact.

    :param exp_name: name of the experiment the runs belong to
    :param runs_to_purge: list of runs that should be purged, they have to belong to exp_name experiment
    :param k8s_es_client: Kubernetes ElasticSearch client used to delete run logs
    :param namespace: namespace where experiment is located
    :return: two lists - first contains runs that were purged successfully, second - those which weren't
    :raises RuntimeError: when the experiment cannot be found

    Exceptions raised after the experiment lookup and the initial state update are
    logged and the lists collected so far are returned.
    """
    logger.debug(f"Purging {exp_name} experiment ...")

    purged_runs: List[Run] = []
    not_purged_runs: List[Run] = []

    experiment = Experiment.get(name=exp_name, namespace=namespace)
    if not experiment:
        raise RuntimeError(Texts.GET_EXPERIMENT_ERROR_MSG)

    experiment_runs = Run.list(namespace=namespace, exp_name_filter=[exp_name])
    # The whole experiment is purged only when the request covers all of its runs.
    cancel_whole_experiment = (len(experiment_runs) == len(runs_to_purge))
    if cancel_whole_experiment:
        experiment.state = ExperimentStatus.CANCELLING
        experiment.update()

    try:
        cancelled_runs, not_cancelled_runs = cancel_experiment_runs(
            runs_to_cancel=runs_to_purge, namespace=namespace)
        # Same list object as not_cancelled_runs - runs that fail to purge below
        # are appended to it.
        not_purged_runs = not_cancelled_runs

        if cancel_whole_experiment:
            # Delete associated workflows
            experiment_associated_workflows = [
                wf for wf in ArgoWorkflow.list(namespace=namespace)
                if wf.labels.get('experimentName') == experiment.name
            ]
            for wf in experiment_associated_workflows:
                wf.delete()

            # Remove tags from git repo manager
            try:
                delete_exp_tag_from_git_repo_manager(
                    experiment_name=experiment.name,
                    username=namespace,
                    experiments_workdir=get_run_environment_path(''))
            except Exception:
                # Report to the user, then abort the purge through the outer handler.
                handle_error(logger, Texts.GIT_REPO_MANAGER_ERROR_MSG,
                             Texts.GIT_REPO_MANAGER_ERROR_MSG)
                raise

        # Only runs that were cancelled successfully are purged.
        for run in cancelled_runs:
            logger.debug(f"Purging {run.name} run ...")
            click.echo(Texts.PURGING_START_MSG.format(run_name=run.name))
            try:
                with spinner(text=Texts.PURGING_PROGRESS_MSG.format(
                        run_name=run.name)):
                    # purge helm release
                    delete_helm_release(run.name,
                                        namespace=namespace,
                                        purge=True)
                    # delete run
                    kubectl.delete_k8s_object("run", run.name)
                    purged_runs.append(run)
            except Exception as exe:
                not_purged_runs.append(run)
                logger.exception("Error during purging runs.")
                # occurrence of NotFound error may mean, that run has been removed earlier
                if "NotFound" not in str(exe):
                    # NOTE(review): `experiment_name` is not defined in this function -
                    # presumably a module-level name; confirm against the full file.
                    click.echo(
                        Texts.INCOMPLETE_PURGE_ERROR_MSG.format(
                            experiment_name=experiment_name))
                    raise exe
            try:
                # clear run logs (only when the current user is an administrator)
                if is_current_user_administrator():
                    logger.debug(f"Clearing logs for {run.name} run.")
                    with spinner(text=Texts.PURGING_LOGS_PROGRESS_MSG.format(
                            run_name=run.name)):
                        k8s_es_client.delete_logs_for_run(run=run.name,
                                                          namespace=namespace)
            except Exception:
                # Log removal is best effort - a failure does not stop the purge.
                logger.exception("Error during clearing run logs.")

            # CAN-1099 - docker garbage collector has errors that prevent from correct removal of images
            # try:
            # try to remove images from docker registry
            #    delete_images_for_experiment(exp_name=run.name)
            # except Exception:
            #    logger.exception("Error during removing images.")

        # Remove the experiment resource only when every run was purged.
        if cancel_whole_experiment and not not_purged_runs:
            try:
                kubectl.delete_k8s_object("experiment", exp_name)
            except Exception:
                # problems during deleting experiments are hidden as if runs were
                # cancelled user doesn't have a possibility to remove them
                logger.exception("Error during purging experiment.")

    except Exception:
        logger.exception("Error during purging experiment.")
        return purged_runs, not_purged_runs

    return purged_runs, not_purged_runs
コード例 #10
0
ファイル: logs_utils.py プロジェクト: pnijhara/nauta
def get_logs(experiment_name: str, min_severity: SeverityLevel,
             start_date: str, end_date: str, pod_ids: str,
             pod_status: PodStatus, match: str, output: bool, pager: bool,
             follow: bool, runs_kinds: List[RunKinds], instance_type: str):
    """
    Show logs for a given experiment.

    Exactly one of `experiment_name` and `match` has to be given. Logs are read from
    Elasticsearch through the Kubernetes API service proxy, for every run in the
    current kubectl context namespace that matches the name filter.

    :param experiment_name: exact name of the run; wrapped in ^...$ to build the name filter
    :param min_severity: minimal severity, passed to the Elasticsearch client as is
    :param start_date: start of the log period; when falsy each run's own creation
        timestamp is used
    :param end_date: end of the log period, passed to the Elasticsearch client as is
    :param pod_ids: comma separated pod ids, or a falsy value for no pod filter
    :param pod_status: pod status filter, passed to the Elasticsearch client as is
    :param match: regular expression used as the name filter
    :param output: when true logs are saved to files instead of being printed
    :param pager: passed to print_logs
    :param follow: follow the logs; honoured only when `output` is false
    :param runs_kinds: kinds of runs taken into account
    :param instance_type: name of the instance type used in user messages

    Every handled error is reported with handle_error and ends the process with exit(1).
    """
    # check whether we have runs with a given name
    if experiment_name and match:
        handle_error(user_msg=Texts.NAME_M_BOTH_GIVEN_ERROR_MSG.format(
            instance_type=instance_type))
        exit(1)
    elif not experiment_name and not match:
        handle_error(user_msg=Texts.NAME_M_NONE_GIVEN_ERROR_MSG.format(
            instance_type=instance_type))
        exit(1)

    try:
        es_client = K8sElasticSearchClient(
            host=f'{get_kubectl_host(with_port=True)}'
            f'/api/v1/namespaces/nauta/services/nauta-elasticsearch:nauta/proxy',
            verify_certs=False,
            use_ssl=True,
            headers={'Authorization': get_api_key()})
        namespace = get_kubectl_current_context_namespace()
        if match:
            # experiment_name is reused below in the "not exists" error message
            experiment_name = match
            name_filter = match
        else:
            name_filter = f'^{experiment_name}$'
        runs = Run.list(namespace=namespace,
                        name_filter=name_filter,
                        run_kinds_filter=runs_kinds)
        if not runs:
            raise ValueError(
                f'Run with given name: {experiment_name} does not exists in namespace {namespace}.'
            )
        pod_ids = pod_ids.split(',') if pod_ids else None  # type: ignore
        follow_logs = bool(follow and not output)
        if output and len(runs) > 1:
            click.echo(Texts.MORE_EXP_LOGS_MESSAGE)
        for run in runs:
            # Resolve the default per run. Overwriting `start_date` here would make
            # the first run's creation timestamp the start date of all later runs.
            run_start_date = start_date if start_date else run.creation_timestamp
            run_logs_generator = es_client.get_experiment_logs_generator(
                run=run,
                namespace=namespace,
                min_severity=min_severity,
                start_date=run_start_date,
                end_date=end_date,
                pod_ids=pod_ids,
                pod_status=pod_status,
                follow=follow_logs)
            if output:
                save_logs_to_file(logs_generator=run_logs_generator,
                                  instance_name=run.name,
                                  instance_type=instance_type)
            else:
                if len(runs) > 1:
                    click.echo(f'Experiment : {run.name}')
                print_logs(run_logs_generator=run_logs_generator, pager=pager)
    except ValueError:
        # Raised above when no run matches the filter.
        handle_error(
            logger,
            Texts.EXPERIMENT_NOT_EXISTS_ERROR_MSG.format(
                experiment_name=experiment_name,
                instance_type=instance_type.capitalize()),
            Texts.EXPERIMENT_NOT_EXISTS_ERROR_MSG.format(
                experiment_name=experiment_name,
                instance_type=instance_type.capitalize()))
        exit(1)
    except Exception:
        handle_error(
            logger,
            Texts.LOGS_GET_OTHER_ERROR_MSG.format(instance_type=instance_type),
            Texts.LOGS_GET_OTHER_ERROR_MSG.format(instance_type=instance_type))
        exit(1)
コード例 #11
0
def test_list_runs_name_filter(mock_k8s_api_client: CustomObjectsApi):
    """Run.list with a name filter returns only the run with the matching name."""
    expected_run = TEST_RUNS[1]
    mock_k8s_api_client.list_cluster_custom_object.return_value = LIST_RUNS_RESPONSE_RAW

    filtered_runs = Run.list(name_filter=expected_run.name)

    assert [expected_run] == filtered_runs
コード例 #12
0
def test_list_runs_filter_status(mock_k8s_api_client: CustomObjectsApi):
    """Run.list with a state list returns only the runs in the requested state."""
    mock_k8s_api_client.list_cluster_custom_object.return_value = LIST_RUNS_RESPONSE_RAW
    requested_states = [RunStatus.QUEUED]

    queued_runs = Run.list(state_list=requested_states)

    assert [TEST_RUNS[0]] == queued_runs
コード例 #13
0
def test_list_runs(mock_k8s_api_client):
    """Run.list without filters returns every run from the cluster-wide API response."""
    cluster_listing = mock_k8s_api_client.list_cluster_custom_object
    cluster_listing.return_value = LIST_RUNS_RESPONSE_RAW

    all_runs = Run.list()

    assert all_runs == TEST_RUNS
コード例 #14
0
def test_list_runs_invalid_name_filter(mock_k8s_api_client: CustomObjectsApi):
    """Run.list raises InvalidRegularExpressionError for the name filter '*'."""
    invalid_pattern = '*'
    mock_k8s_api_client.list_cluster_custom_object.return_value = LIST_RUNS_RESPONSE_RAW

    with pytest.raises(InvalidRegularExpressionError):
        Run.list(name_filter=invalid_pattern)
コード例 #15
0
ファイル: experiment.py プロジェクト: zhcf/nauta
 def get_runs(self) -> List[Run]:
     """Return the runs listed for this experiment's name in the namespace from its metadata."""
     own_namespace = self.metadata['namespace']
     name_filter = [self.name]
     return Run.list(namespace=own_namespace, exp_name_filter=name_filter)