Ejemplo n.º 1
0
def test_upload_experiment_to_git_repo_manager(mocker, tmpdir, git_client_mock):
    """Check collaborator and git call counts for one successful upload.

    The experiments directory contains only the experiment folder when the
    upload starts, and no git call is configured to fail.
    """
    env_hash = 'a12b34c'
    hash_patch = mocker.patch('git_repo_manager.utils.compute_hash_of_k8s_env_address', return_value=env_hash)
    config_patch = mocker.patch('git_repo_manager.utils.Config')
    proxy_patch = mocker.patch('git_repo_manager.utils.TcpK8sProxy')
    key_path_patch = mocker.patch('git_repo_manager.utils.get_git_private_key_path',
                                  return_value='/fake-config/.fake-user-ssh-key')

    exp_name = 'fake-experiment'
    workdir = tmpdir.mkdir('experiments')
    workdir.mkdir(exp_name)

    upload_experiment_to_git_repo_manager(experiments_workdir=workdir, experiment_name=exp_name,
                                          run_name=exp_name, username='******')

    # Every patched collaborator is expected to be used exactly once.
    for collaborator in (hash_patch, config_patch, key_path_patch, proxy_patch):
        assert collaborator.call_count == 1

    # Expected number of calls per git client method for this flow.
    expected_git_calls = {
        'remote': 1,
        'clone': 1,
        'config': 3,
        'checkout': 1,
        'pull': 0,
        'add': 1,
        'commit': 1,
        'tag': 1,
        'push': 2,
    }
    for method_name, expected_count in expected_git_calls.items():
        assert getattr(git_client_mock, method_name).call_count == expected_count, method_name
Ejemplo n.º 2
0
def test_upload_experiment_to_git_repo_manager_error(mocker, tmpdir, git_client_mock):
    """Check that a failing push surfaces as RuntimeError after repeated attempts.

    The git client's ``push`` is configured to raise, and a local repo
    directory for the fake user already exists in the experiments directory.
    """
    # The upload function is retried this many times before giving up.
    expected_attempts = 5

    env_hash = 'a12b34c'
    git_client_mock.push.side_effect = RuntimeError
    hash_patch = mocker.patch('git_repo_manager.utils.compute_hash_of_k8s_env_address', return_value=env_hash)
    config_patch = mocker.patch('git_repo_manager.utils.Config')
    proxy_patch = mocker.patch('git_repo_manager.utils.TcpK8sProxy')
    key_path_patch = mocker.patch('git_repo_manager.utils.get_git_private_key_path',
                                  return_value='/fake-config/.fake-user-ssh-key')

    exp_name = 'fake-experiment'
    workdir = tmpdir.mkdir('experiments')
    workdir.mkdir(f'.nauta-git-fake-user-{env_hash}')
    workdir.mkdir(exp_name)

    with pytest.raises(RuntimeError):
        upload_experiment_to_git_repo_manager(experiments_workdir=workdir, run_name=exp_name,
                                              experiment_name=exp_name, username='******')

    # Each attempt goes through every patched collaborator once.
    for collaborator in (hash_patch, config_patch, key_path_patch, proxy_patch):
        assert collaborator.call_count == expected_attempts

    # A rollback (git reset) is expected on every failed attempt.
    assert git_client_mock.reset.call_count == expected_attempts
Ejemplo n.º 3
0
def submit_experiment(
    template: str,
    name: str = None,
    run_kind: RunKinds = RunKinds.TRAINING,
    script_location: str = None,
    script_parameters: Tuple[str, ...] = None,
    pack_params: List[Tuple[str, str]] = None,
    parameter_range: List[Tuple[str, str]] = None,
    parameter_set: Tuple[str, ...] = None,
    script_folder_location: str = None,
    env_variables: List[str] = None,
    requirements_file: str = None
) -> Tuple[List[Run], Dict[str, str], Optional[str]]:
    """Prepare, upload, build and submit an experiment together with its runs.

    The function performs the following steps in order:

    1. reads the namespace of the current kubectl context,
    2. generates the experiment name/labels and the list of runs,
    3. prepares a local environment (run folder) for every run,
    4. asks the user for confirmation when ``parameter_range`` or
       ``parameter_set`` is given,
    5. creates the Experiment object,
    6. uploads the experiment to the git repo manager,
    7. builds the experiment image through an image build workflow,
    8. creates every Run object and submits its draft pack,
    9. sets the experiment state to SUBMITTED.

    Side effects: sets the module globals ``submitted_namespace`` and
    ``submitted_experiment``, installs SIGINT/SIGTERM handlers, and appends
    every created run to the module-level ``submitted_runs`` list.

    :param template: name of the template; used as the pack type of each run
        and as the template name of the experiment
    :param name: passed to the experiment name generator
    :param run_kind: kind of the runs; its value is stored in the ``runKind``
        label of every Run
    :param script_location: location of the script to be run
    :param script_parameters: parameters passed to the script
    :param pack_params: (name, value) pairs passed to the environment
        preparation and stored as annotations of every Run
    :param parameter_range: (name, value) pairs used to generate the list of runs
    :param parameter_set: parameter sets used to generate the list of runs
    :param script_folder_location: passed to the environment preparation
    :param env_variables: passed to the environment preparation
    :param requirements_file: passed to the environment preparation
    :return: tuple of (list of runs, dictionary mapping a run name to the text
        of the error that made its submission fail, script location as last
        returned by the environment preparation - or the value passed in when
        no run environment was prepared)
    :raises SubmitExperimentError: when any step other than the submission of
        a single run fails
    :raises K8sProxyCloseError: when closing of a proxy fails
    :raises SystemExit: when the user declines the confirmation prompt
    """
    global submitted_namespace, submitted_experiment

    # Normalise optional collection arguments so that the code below can rely
    # on them being (possibly empty) containers.
    script_parameters: Union[Tuple[str, ...], Tuple[()]] = script_parameters if script_parameters else ()
    parameter_set: Union[Tuple[str, ...], Tuple[()]] = parameter_set if parameter_set else ()
    parameter_range = parameter_range if parameter_range else []
    pack_params = pack_params if pack_params else []

    log.debug("Submit experiment - start")
    try:
        namespace = get_kubectl_current_context_namespace()
        submitted_namespace = namespace
    except Exception as exe:
        message = Texts.GET_NAMESPACE_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message) from exe

    try:
        with spinner(text=Texts.PREPARING_RESOURCE_DEFINITIONS_MSG):
            experiment_name, labels = generate_exp_name_and_labels(
                script_name=script_location,
                namespace=namespace,
                name=name,
                run_kind=run_kind)
            runs_list = prepare_list_of_runs(experiment_name=experiment_name,
                                             parameter_range=parameter_range,
                                             parameter_set=parameter_set,
                                             template_name=template)
    except SubmitExperimentError as exe:
        log.exception(str(exe))
        raise exe
    except Exception as exe:
        message = Texts.SUBMIT_PREPARATION_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message) from exe

    submitted_experiment = experiment_name

    # Ctrl-C handling
    signal.signal(signal.SIGINT, ctrl_c_handler_for_submit)
    signal.signal(signal.SIGTERM, ctrl_c_handler_for_submit)

    try:
        # List of local directories used by experiment's runs
        experiment_run_folders = []
        try:
            cluster_registry_port = get_app_service_node_port(
                nauta_app_name=NAUTAAppNames.DOCKER_REGISTRY)
            # prepare environments for all experiment's runs
            for experiment_run in runs_list:
                if script_parameters and experiment_run.parameters:
                    current_script_parameters = script_parameters + experiment_run.parameters
                elif script_parameters:
                    current_script_parameters = script_parameters
                elif experiment_run.parameters:
                    current_script_parameters = experiment_run.parameters
                else:
                    current_script_parameters = None
                # NOTE(review): script_location is rebound here, so the next
                # iteration passes the returned value as local_script_location
                # - confirm that this is intended.
                run_folder, script_location, pod_count = \
                    prepare_experiment_environment(experiment_name=experiment_name,
                                                   run_name=experiment_run.name,
                                                   local_script_location=script_location,
                                                   script_folder_location=script_folder_location,  # noqa: E501
                                                   script_parameters=current_script_parameters,
                                                   pack_type=template, pack_params=pack_params,
                                                   cluster_registry_port=cluster_registry_port,
                                                   env_variables=env_variables,
                                                   requirements_file=requirements_file,
                                                   username=namespace,
                                                   run_kind=run_kind)
                # Set correct pod count
                if not pod_count or pod_count < 1:
                    raise SubmitExperimentError(
                        'Unable to determine pod count: make sure that values.yaml '
                        'file in your pack has podCount field with positive integer value.'
                    )

                experiment_run.pod_count = pod_count
                experiment_run_folders.append(run_folder)
                script_name = None
                if script_location is not None:
                    script_name = os.path.basename(script_location)
                # Prepend script_name parameter to run description only for display purposes.
                experiment_run.parameters = script_parameters if not experiment_run.parameters \
                    else experiment_run.parameters + script_parameters
                if experiment_run.parameters and script_name:
                    experiment_run.parameters = (
                        script_name, ) + experiment_run.parameters
                elif script_name:
                    experiment_run.parameters = (script_name, )
        except SubmitExperimentError as e:
            log.exception(Texts.ENV_CREATION_ERROR_MSG)
            e.message += f' {Texts.ENV_CREATION_ERROR_MSG}'
            raise
        except Exception as exe:
            # any error in this step breaks execution of this command
            message = Texts.ENV_CREATION_ERROR_MSG
            log.exception(message)
            # just in case - remove folders that were created with a success
            for experiment_run_folder in experiment_run_folders:
                delete_environment(experiment_run_folder)
            # Stop here: continuing would operate on the folders removed above.
            raise SubmitExperimentError(message) from exe
        # if ps or pr option is used - first ask whether experiment(s) should be submitted
        if parameter_range or parameter_set:
            click.echo(Texts.CONFIRM_SUBMIT_MSG)
            click.echo(
                tabulate(
                    {
                        RUN_NAME: [run.name for run in runs_list],
                        RUN_PARAMETERS: [
                            "\n".join(run.parameters) if run.parameters else ""
                            for run in runs_list
                        ]
                    },
                    headers=[RUN_NAME, RUN_PARAMETERS],
                    tablefmt=TBLT_TABLE_FORMAT))
            if not click.confirm(Texts.CONFIRM_SUBMIT_QUESTION_MSG,
                                 default=True):
                for experiment_run_folder in experiment_run_folders:
                    delete_environment(experiment_run_folder)
                # Raised directly instead of calling the exit() helper, which
                # is injected by the site module and may be missing.
                raise SystemExit
        # create Experiment model
        # TODO template_name & template_namespace should be filled after Template implementation
        parameter_range_spec = [
            f'-pr {param_name} {param_value}'
            for param_name, param_value in parameter_range
        ]
        parameter_set_spec = [f'-ps {ps_spec}' for ps_spec in parameter_set]
        experiment_parameters_spec = list(
            script_parameters) + parameter_range_spec + parameter_set_spec
        template_version = get_template_version(template)
        experiment = experiments_model.Experiment(
            name=experiment_name,
            template_name=template,
            parameters_spec=experiment_parameters_spec,
            template_namespace="template-namespace",
            template_version=template_version)
        experiment.create(namespace=namespace, labels=labels)

        with spinner('Uploading experiment...'):
            try:
                upload_experiment_to_git_repo_manager(
                    experiments_workdir=get_run_environment_path(''),
                    experiment_name=experiment_name,
                    run_name=runs_list[0].name,
                    username=namespace)
            except Exception:
                log.exception('Failed to upload experiment.')
                try:
                    experiment.state = experiments_model.ExperimentStatus.FAILED
                    experiment.update()
                except Exception:
                    log.exception(
                        f'Failed to set state of {experiment.name} experiment '
                        f'to {experiments_model.ExperimentStatus.FAILED}')
                raise SubmitExperimentError('Failed to upload experiment.')

        with spinner('Building experiment image...'):
            # Bound before the try block so that the error handler can tell
            # whether the workflow object was created at all.
            image_build_workflow: Optional[ExperimentImageBuildWorkflow] = None
            try:
                image_build_workflow = ExperimentImageBuildWorkflow.from_yaml(
                    yaml_template_path=
                    f'{Config().config_path}/workflows/{EXP_IMAGE_BUILD_WORKFLOW_SPEC}',
                    username=namespace,
                    experiment_name=experiment_name)
                image_build_workflow.create(namespace=namespace)
                image_build_workflow.wait_for_completion()
            except Exception:
                error_msg = 'Failed to build experiment image.'
                log.exception(error_msg)
                if image_build_workflow is not None:
                    # Try to get workflow logs
                    _debug_workflow_logs(workflow=image_build_workflow,
                                         namespace=namespace)

                    if image_build_workflow.name:
                        error_msg += f' Run nctl workflow logs {image_build_workflow.name} command for more details.'

                try:
                    experiment.state = experiments_model.ExperimentStatus.FAILED
                    experiment.update()
                except Exception:
                    log.exception(
                        f'Failed to set state of {experiment.name} experiment '
                        f'to {experiments_model.ExperimentStatus.FAILED}')
                raise SubmitExperimentError(error_msg)
        # submit runs
        run_errors: Dict[str, str] = {}
        for run, run_folder in zip(runs_list, experiment_run_folders):
            try:
                run.state = RunStatus.QUEUED
                with spinner(text=Texts.CREATING_RESOURCES_MSG.format(
                        run_name=run.name)):
                    # Add Run object with runKind label and pack params as annotations
                    run.create(namespace=namespace,
                               labels={'runKind': run_kind.value},
                               annotations={
                                   pack_param_name: pack_param_value
                                   for pack_param_name, pack_param_value in
                                   pack_params
                               })
                    submitted_runs.append(run)
                    submit_draft_pack(run_name=run.name,
                                      run_folder=run_folder,
                                      namespace=namespace)
            except Exception as exe:
                # A failure of a single run does not stop the remaining runs;
                # the error text is reported to the caller through run_errors.
                delete_environment(run_folder)
                try:
                    run.state = RunStatus.FAILED
                    run_errors[run.name] = str(exe)
                    run.update()
                except Exception as rexe:
                    # update of non-existing run may fail
                    log.debug(Texts.ERROR_DURING_PATCHING_RUN.format(
                        str(rexe)))
        # Delete experiment if no Runs were submitted
        if not submitted_runs:
            click.echo(Texts.SUBMISSION_FAIL_ERROR_MSG)
            delete_k8s_object("experiment", experiment_name)
        # NOTE(review): the state update below also runs right after the
        # experiment object was deleted above - confirm that this is intended.
        # Change experiment status to submitted
        experiment.state = experiments_model.ExperimentStatus.SUBMITTED
        experiment.update()
    except LocalPortOccupiedError as exe:
        click.echo(exe.message)
        raise SubmitExperimentError(exe.message)
    except K8sProxyCloseError:
        log.exception('Error during closing of a proxy for a {}'.format(
            NAUTAAppNames.DOCKER_REGISTRY))
        raise K8sProxyCloseError(Texts.PROXY_CLOSE_ERROR_MSG)
    except K8sProxyOpenError:
        error_msg = Texts.PROXY_OPEN_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg)
    except SubmitExperimentError:
        raise
    except Exception as exe:
        error_msg = Texts.SUBMIT_OTHER_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg) from exe
    finally:
        # remove semaphores from all exp folders
        remove_sempahore(experiment_name)

    log.debug("Submit - finish")
    return runs_list, run_errors, script_location