Beispiel #1
0
def generate_exp_name_and_labels(script_name: str, namespace: str, name: str = None,
                                 run_kind: RunKinds = RunKinds.TRAINING) -> Tuple[str, Dict[str, str]]:
    """
    Chooses the name of an experiment and builds the labels that go with it.

    :param script_name: path of the training script; only its last path component is used
    :param namespace: namespace in which existing experiments and pods are looked up
    :param name: experiment name requested by the user (optional)
    :param run_kind: kind of a run, forwarded to the helpers that build labels
    :return: (experiment name, labels)
    Raises SubmitExperimentError when the requested name belongs to an existing experiment
    (with or without runs) or when pods labelled with that run name are still present.
    """
    if script_name:
        script_name = Path(script_name).name

    if not name:
        # No name given: first try to derive one from experiments that already use this script.
        reused_name, reused_labels = generate_name_for_existing_exps(script_name, namespace, run_kind=run_kind)
        if reused_name:
            return reused_name, reused_labels

        # Nothing to reuse: generate a new name and, when experiments matching it are found,
        # append their count as a suffix.
        fresh_name = generate_name(script_name)
        matching = Experiment.list(namespace=namespace, name_filter=fresh_name)
        if matching:
            fresh_name = f'{fresh_name}-{len(matching)}'
        return fresh_name, prepare_label(script_name, fresh_name, run_kind=run_kind)

    # Name given explicitly: it may be used only if nothing with that name exists.
    existing = Experiment.get(namespace=namespace, name=name)
    if existing:
        if existing.get_runs():
            raise SubmitExperimentError(Texts.EXPERIMENT_ALREADY_EXISTS_ERROR_MSG.format(name=name))
        # an experiment without any associated runs
        raise SubmitExperimentError(Texts.EXPERIMENT_INVALID_STATE_MSG.format(name=name))

    # pods of a previous experiment with the same name may still be around
    if list_pods(namespace=namespace, label_selector=f'runName={name}'):
        raise SubmitExperimentError(Texts.EXPERIMENT_PREV_EXP_STILL_TERMINATING)

    return name, prepare_label(script_name, name, name, run_kind=run_kind)
Beispiel #2
0
def create_environment(experiment_name: str, file_location: str, folder_location: str) -> str:
    """
    Builds the local folder structure used to execute a training with draft.

    :param experiment_name: experiment name from which the environment path is derived
    :param file_location: location of a training script to copy (skipped when empty)
    :param folder_location: location of a folder whose content is copied (skipped when empty)
    :return: path of the run environment folder
    Raises SubmitExperimentError when the experiment folder cannot be created or
    when copying of the script or of the folder fails.
    """
    log.debug("Create environment - start")
    error_template = Texts.CREATE_ENV_MSG_PREFIX

    # folder that holds the experiment's content
    env_root = get_run_environment_path(experiment_name)
    content_dir = os.path.join(env_root, FOLDER_DIR_NAME)

    try:
        if not os.path.exists(content_dir):
            os.makedirs(content_dir)
    except Exception:
        log.exception("Create environment - creating experiment folder error.")
        raise SubmitExperimentError(error_template.format(reason=Texts.EXP_DIR_CANT_BE_CREATED))

    # semaphore file marking that the experiment is under submission
    semaphore_file = Path(os.path.join(env_root, EXP_SUB_SEMAPHORE_FILENAME))
    semaphore_file.touch()

    # training script
    # NOTE(review): the script is copied before the folder content below, so a file with the
    # same name coming from folder_location presumably replaces it - confirm which of the two
    # is expected to win.
    if file_location:
        try:
            shutil.copy2(file_location, content_dir)
            if get_current_os() == OS.WINDOWS:
                copied_script = os.path.join(content_dir, os.path.basename(file_location))
                os.chmod(copied_script, 0o666)
        except Exception:
            log.exception("Create environment - copying training script error.")
            raise SubmitExperimentError(error_template.format(reason=Texts.TRAINING_SCRIPT_CANT_BE_CREATED))

    # additional folder content
    if folder_location:
        try:
            copy_tree(folder_location, content_dir)
        except Exception:
            log.exception("Create environment - copying training folder error.")
            raise SubmitExperimentError(error_template.format(reason=Texts.DIR_CANT_BE_COPIED_ERROR_TEXT))

    log.debug("Create environment - end")

    return env_root
Beispiel #3
0
def test_submit_start_depl_fail(prepare_mocks: SubmitExperimentMocks):
    """When submit_one raises SubmitExperimentError, the first returned run is expected to be FAILED."""
    prepare_mocks.submit_one.side_effect = SubmitExperimentError()

    runs, _errors, _script = submit_experiment(template=None, name=None, run_kind=RunKinds.TRAINING,
                                               script_location=SCRIPT_LOCATION, script_parameters=(),
                                               pack_params=[], parameter_range=[], parameter_set=(),
                                               script_folder_location=None)

    first_run = runs[0]
    assert first_run.state == RunStatus.FAILED
    check_asserts(prepare_mocks, del_env_count=1, update_run_count=1)
Beispiel #4
0
def test_submit_experiment_failure(prepare_mocks: SubmitMocks):
    """A SubmitExperimentError from submit_experiment must show up in the CLI output with exit code 1."""
    failure_text = "error message"
    prepare_mocks.submit_experiment.side_effect = SubmitExperimentError(failure_text)

    cli_result = CliRunner().invoke(submit, [SCRIPT_LOCATION])

    expected_output = Texts.SUBMIT_ERROR_MSG.format(exception_message=failure_text)
    assert expected_output in cli_result.output
    assert cli_result.exit_code == 1
Beispiel #5
0
def submit_draft_pack(run_folder: str, run_name: str, namespace: str = None):
    """
    Deploys a single run from the draft environment stored in a given folder.

    :param run_folder: location of a folder with a description of an environment
    :param run_name: run's name
    :param namespace: namespace where tiller used during deployment is located
    Raises SubmitExperimentError when the deployment fails; the environment kept in
    run_folder is deleted before the exception is raised.
    """
    log.debug(f'Submit one run: {run_folder} - start')

    try:
        cmd.up(run_name=run_name, working_directory=run_folder, namespace=namespace)
    except Exception:
        # deployment failed - drop the local environment before reporting the problem
        delete_environment(run_folder)
        raise SubmitExperimentError(Texts.JOB_NOT_DEPLOYED_ERROR_MSG)
    else:
        log.debug(f'Submit one run {run_folder} - finish')
Beispiel #6
0
def submit_draft_pack(run_folder: str, namespace: str = None):
    """
    Deploys a single run from the draft environment stored in a given folder.

    :param run_folder: location of a folder with a description of an environment
    :param namespace: namespace where tiller used during deployment is located
    Raises SubmitExperimentError when the deployment command reports a non-zero exit code;
    the command's log output is logged and the environment in run_folder is deleted first.
    """
    log.debug(f'Submit one run: {run_folder} - start')

    _, exit_code, log_output = cmd.up(working_directory=run_folder, namespace=namespace)

    if not exit_code:
        log.debug(f'Submit one run {run_folder} - finish')
        return

    failure_message = Texts.JOB_NOT_DEPLOYED_ERROR_MSG

    # point the user to a log file when its name can be extracted from the command's log output
    log_filename = get_log_filename(str(log_output))
    if log_filename:
        failure_message += Texts.JOB_NOT_DEPLOYED_ERROR_MSG_LOGFILE.format(log_filename=log_filename)

    log.error(log_output)
    delete_environment(run_folder)
    raise SubmitExperimentError(failure_message)
Beispiel #7
0
def prepare_experiment_environment(experiment_name: str, run_name: str, local_script_location: str,
                                   script_parameters: Tuple[str, ...],
                                   pack_type: str, local_registry_port: int, cluster_registry_port: int,
                                   script_folder_location: str = None,
                                   pack_params: List[Tuple[str, str]] = None,
                                   env_variables: List[str] = None,
                                   requirements_file: str = None) -> PrepareExperimentResult:
    """
    Creates and configures draft's environment of a single run.

    :param experiment_name: name of an experiment
    :param run_name: name of an experiment run; the environment folder is derived from it
    :param local_script_location: location of a training script on local machine
    :param script_parameters: parameters passed to a script
    :param pack_type: type of a pack used to start training job
    :param local_registry_port: port on which docker registry is accessible locally
    :param cluster_registry_port: port on which docker registry is accessible within nauta cluster
    :param script_folder_location: location of an additional folder used in training
    :param pack_params: additional pack params
    :param env_variables: environmental variables to be passed to training
    :param requirements_file: path to a file with experiment requirements
    :return: PrepareExperimentResult with the environment folder, the script name and the pod count
    Any failure removes the environment folder and is reported as SubmitExperimentError
    chained to the original exception.
    """
    log.debug(f'Prepare run {run_name} environment - start')
    run_folder = get_run_environment_path(run_name)
    try:
        check_run_environment(run_folder)

        with spinner(text=Texts.CREATING_ENVIRONMENT_MSG.format(run_name=run_name)):
            create_environment(run_name, local_script_location, script_folder_location)
            # let draft generate its data for the chosen pack
            _, exit_code, log_output = cmd.create(working_directory=run_folder, pack_type=pack_type)

            # requirements.txt always exists in the run folder: a copy of the given file or an empty one
            requirements_target = os.path.join(run_folder, 'requirements.txt')
            if not requirements_file:
                Path(requirements_target).touch()
            else:
                shutil.copyfile(requirements_file, requirements_target)

        if exit_code:
            raise SubmitExperimentError(Texts.DRAFT_TEMPLATES_NOT_GENERATED_ERROR_MSG.format(reason=log_output))

        # script location inside of the experiment container - only the file name is kept
        remote_script_location = ''
        if local_script_location:
            remote_script_location = Path(local_script_location).name

        is_notebook_pack = pack_type in JUPYTER_NOTEBOOK_TEMPLATES_NAMES
        if is_notebook_pack and remote_script_location.endswith(".py"):
            # for jupyter notebooks the .py file is converted into .ipynb
            content_dir = os.path.join(run_folder, FOLDER_DIR_NAME)
            local_script_location = convert_py_to_ipynb(os.path.join(content_dir, remote_script_location),
                                                        content_dir)

        # adjust draft's templates to this run
        update_configuration(run_folder=run_folder, script_location=remote_script_location,
                             script_parameters=script_parameters,
                             experiment_name=experiment_name, run_name=run_name,
                             local_registry_port=local_registry_port,
                             cluster_registry_port=cluster_registry_port,
                             pack_type=pack_type, pack_params=pack_params,
                             script_folder_location=script_folder_location,
                             env_variables=env_variables)

        pod_count = get_pod_count(run_folder=run_folder, pack_type=pack_type)
    except Exception as exe:
        delete_environment(run_folder)
        raise SubmitExperimentError('Problems during creation of environments.') from exe

    log.debug(f'Prepare run {run_name} environment - finish')
    return PrepareExperimentResult(folder_name=run_folder, script_name=local_script_location,
                                   pod_count=pod_count)
Beispiel #8
0
def submit_experiment(template: str, name: str = None, run_kind: RunKinds = RunKinds.TRAINING,
                      script_location: str = None, script_parameters: Tuple[str, ...] = None,
                      pack_params: List[Tuple[str, str]] = None, parameter_range: List[Tuple[str, str]] = None,
                      parameter_set: Tuple[str, ...] = None,
                      script_folder_location: str = None,
                      env_variables: List[str] = None,
                      requirements_file: str = None) -> (List[Run], Dict[str, str], str):
    """
    Prepares environments for all runs of an experiment, creates the Experiment and Run
    objects and submits every run.

    The namespace and the experiment name are stored in the module-level variables
    submitted_namespace and submitted_experiment, and SIGINT/SIGTERM handlers are
    installed before any environment is created.

    :param template: name of a pack/template used for the experiment
    :param name: experiment name requested by the user (optional)
    :param run_kind: kind of the created runs
    :param script_location: location of a training script
    :param script_parameters: parameters passed to a script
    :param pack_params: additional pack params, also stored as annotations of every run;
                        None is treated as an empty list when the annotations are built
    :param parameter_range: (name, value) pairs given with the -pr option
    :param parameter_set: values given with the -ps option
    :param script_folder_location: location of an additional folder used in training
    :param env_variables: environmental variables to be passed to training
    :param requirements_file: path to a file with experiment requirements
    :return: (list of runs, dictionary run name -> error message for runs that failed,
             script location as returned while preparing the last run's environment)
    Raises SubmitExperimentError for problems detected during preparation and submission,
    K8sProxyCloseError when closing of a proxy or of the docker tunnel fails.
    """
    script_parameters = script_parameters if script_parameters else ()
    parameter_set = parameter_set if parameter_set else ()
    parameter_range = parameter_range if parameter_range else []

    log.debug("Submit experiment - start")
    try:
        namespace = get_kubectl_current_context_namespace()
        global submitted_namespace
        submitted_namespace = namespace
    except Exception:
        message = Texts.GET_NAMESPACE_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    try:
        with spinner(text=Texts.PREPARING_RESOURCE_DEFINITIONS_MSG):
            experiment_name, labels = generate_exp_name_and_labels(script_name=script_location,
                                                                   namespace=namespace, name=name,
                                                                   run_kind=run_kind)
            runs_list = prepare_list_of_runs(experiment_name=experiment_name, parameter_range=parameter_range,
                                             parameter_set=parameter_set, template_name=template)
    except SubmitExperimentError as exe:
        log.exception(str(exe))
        raise exe
    except Exception:
        message = Texts.SUBMIT_PREPARATION_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    global submitted_experiment
    submitted_experiment = experiment_name

    # Ctrl-C handling
    signal.signal(signal.SIGINT, ctrl_c_handler_for_submit)
    signal.signal(signal.SIGTERM, ctrl_c_handler_for_submit)

    try:
        config = Config()

        # start port forwarding
        # noinspection PyBroadException
        with K8sProxy(NAUTAAppNames.DOCKER_REGISTRY, port=config.local_registry_port) as proxy:
            # Save port that was actually used in configuration
            if proxy.tunnel_port != config.local_registry_port:
                config.local_registry_port = proxy.tunnel_port

            experiment_run_folders = []  # List of local directories used by experiment's runs
            try:
                # run socat if on Windows or Mac OS
                if get_current_os() in (OS.WINDOWS, OS.MACOS):
                    # noinspection PyBroadException
                    try:
                        with spinner(text=Texts.CLUSTER_CONNECTION_MSG):
                            socat.start(proxy.tunnel_port)
                    except Exception:
                        error_msg = Texts.LOCAL_DOCKER_TUNNEL_ERROR_MSG
                        log.exception(error_msg)
                        raise SubmitExperimentError(error_msg)

                cluster_registry_port = get_app_service_node_port(nauta_app_name=NAUTAAppNames.DOCKER_REGISTRY)

                # prepare environments for all experiment's runs
                for experiment_run in runs_list:
                    # script parameters come first, run-specific parameters are appended to them
                    if script_parameters and experiment_run.parameters:
                        current_script_parameters = script_parameters + experiment_run.parameters
                    elif script_parameters:
                        current_script_parameters = script_parameters
                    elif experiment_run.parameters:
                        current_script_parameters = experiment_run.parameters
                    else:
                        current_script_parameters = ""

                    run_folder, script_location, pod_count = \
                        prepare_experiment_environment(experiment_name=experiment_name,
                                                       run_name=experiment_run.name,
                                                       local_script_location=script_location,
                                                       script_folder_location=script_folder_location,  # noqa: E501
                                                       script_parameters=current_script_parameters,
                                                       pack_type=template, pack_params=pack_params,
                                                       local_registry_port=proxy.tunnel_port,
                                                       cluster_registry_port=cluster_registry_port,
                                                       env_variables=env_variables,
                                                       requirements_file=requirements_file)
                    # Set correct pod count
                    if not pod_count or pod_count < 1:
                        raise SubmitExperimentError('Unable to determine pod count: make sure that values.yaml '
                                                    'file in your pack has podCount field with positive integer value.')
                    experiment_run.pod_count = pod_count

                    experiment_run_folders.append(run_folder)
                    script_name = None
                    if script_location is not None:
                        script_name = os.path.basename(script_location)

                    # Prepend script_name parameter to run description only for display purposes.
                    experiment_run.parameters = script_parameters if not experiment_run.parameters \
                        else experiment_run.parameters + script_parameters
                    if experiment_run.parameters and script_name:
                        experiment_run.parameters = (script_name, ) + experiment_run.parameters
                    elif script_name:
                        experiment_run.parameters = (script_name, )
            except SubmitExperimentError as e:
                log.exception(Texts.ENV_CREATION_ERROR_MSG)
                e.message += f' {Texts.ENV_CREATION_ERROR_MSG}'
                raise
            except Exception:
                # any error in this step breaks execution of this command
                # NOTE(review): this handler logs and cleans up but does not re-raise, so
                # execution continues below with the already deleted folders - confirm intent.
                message = Texts.ENV_CREATION_ERROR_MSG
                log.exception(message)
                # just in case - remove folders that were created with a success
                for experiment_run_folder in experiment_run_folders:
                    delete_environment(experiment_run_folder)

            # if ps or pr option is used - first ask whether experiment(s) should be submitted
            if parameter_range or parameter_set:
                click.echo(Texts.CONFIRM_SUBMIT_MSG)
                click.echo(tabulate({RUN_NAME: [run.name for run in runs_list],
                                     RUN_PARAMETERS: ["\n".join(run.parameters) if run.parameters
                                                      else "" for run in runs_list]},
                                    headers=[RUN_NAME, RUN_PARAMETERS], tablefmt="orgtbl"))

                if not click.confirm(Texts.CONFIRM_SUBMIT_QUESTION_MSG, default=True):
                    for experiment_run_folder in experiment_run_folders:
                        delete_environment(experiment_run_folder)
                    exit()

            # create Experiment model
            # TODO template_name & template_namespace should be filled after Template implementation
            parameter_range_spec = [f'-pr {param_name} {param_value}' for param_name, param_value in parameter_range]
            parameter_set_spec = [f'-ps {ps_spec}' for ps_spec in parameter_set]
            experiment_parameters_spec = list(script_parameters) + parameter_range_spec + parameter_set_spec
            experiment = experiments_model.Experiment(name=experiment_name, template_name=template,
                                                      parameters_spec=experiment_parameters_spec,
                                                      template_namespace="template-namespace")

            experiment.create(namespace=namespace, labels=labels)

            # submit runs
            run_errors = {}
            for run, run_folder in zip(runs_list, experiment_run_folders):
                try:
                    run.state = RunStatus.QUEUED
                    with spinner(text=Texts.CREATING_RESOURCES_MSG.format(run_name=run.name)):
                        # Add Run object with runKind label and pack params as annotations.
                        # pack_params defaults to None; iterating None here would raise a
                        # TypeError that the handler below turns into a FAILED run, so an
                        # omitted pack_params is treated as "no annotations".
                        run.create(namespace=namespace, labels={'runKind': run_kind.value},
                                   annotations={pack_param_name: pack_param_value
                                                for pack_param_name, pack_param_value in (pack_params or [])})
                        submitted_runs.append(run)
                        submit_draft_pack(run_folder, namespace=namespace)
                except Exception as exe:
                    # a failure of one run does not stop submission of the remaining ones
                    delete_environment(run_folder)
                    try:
                        run.state = RunStatus.FAILED
                        run_errors[run.name] = str(exe)
                        run.update()
                    except Exception as rexe:
                        # update of non-existing run may fail
                        log.debug(Texts.ERROR_DURING_PATCHING_RUN.format(str(rexe)))

            # Delete experiment if no Runs were submitted
            if not submitted_runs:
                click.echo(Texts.SUBMISSION_FAIL_ERROR_MSG)
                delete_k8s_object("experiment", experiment_name)

            # Change experiment status to submitted
            experiment.state = experiments_model.ExperimentStatus.SUBMITTED
            experiment.update()
    except LocalPortOccupiedError as exe:
        click.echo(exe.message)
        raise SubmitExperimentError(exe.message)
    except K8sProxyCloseError:
        log.exception('Error during closing of a proxy for a {}'.format(NAUTAAppNames.DOCKER_REGISTRY))
        raise K8sProxyCloseError(Texts.PROXY_CLOSE_ERROR_MSG)
    except K8sProxyOpenError:
        error_msg = Texts.PROXY_OPEN_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg)
    except SubmitExperimentError:
        raise
    except Exception as exe:
        error_msg = Texts.SUBMIT_OTHER_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg) from exe
    finally:
        with spinner(text=Texts.CLUSTER_CONNECTION_CLOSING_MSG):
            # noinspection PyBroadException
            try:
                socat.stop()
            except Exception:
                log.exception("Error during closing of a proxy for a local docker-host tunnel")
                raise K8sProxyCloseError(Texts.DOCKER_TUNNEL_CLOSE_ERROR_MSG)
        # remove semaphores from all exp folders
        remove_sempahore(experiment_name)

    log.debug("Submit - finish")
    return runs_list, run_errors, script_location
Beispiel #9
0
def submit_experiment(
    template: str,
    name: str = None,
    run_kind: RunKinds = RunKinds.TRAINING,
    script_location: str = None,
    script_parameters: Tuple[str, ...] = None,
    pack_params: List[Tuple[str, str]] = None,
    parameter_range: List[Tuple[str, str]] = None,
    parameter_set: Tuple[str, ...] = None,
    script_folder_location: str = None,
    env_variables: List[str] = None,
    requirements_file: str = None
) -> Tuple[List[Run], Dict[str, str], Optional[str]]:
    """
    Prepares environments for all runs of an experiment, creates the Experiment object,
    uploads the experiment, builds its image and submits every run.

    The namespace and the experiment name are stored in the module-level variables
    submitted_namespace and submitted_experiment, and SIGINT/SIGTERM handlers are
    installed before any environment is created.

    :param template: name of a pack/template used for the experiment
    :param name: experiment name requested by the user (optional)
    :param run_kind: kind of the created runs
    :param script_location: location of a training script
    :param script_parameters: parameters passed to a script
    :param pack_params: additional pack params, also stored as annotations of every run
    :param parameter_range: (name, value) pairs given with the -pr option
    :param parameter_set: values given with the -ps option
    :param script_folder_location: location of an additional folder used in training
    :param env_variables: environmental variables to be passed to training
    :param requirements_file: path to a file with experiment requirements
    :return: (list of runs, dictionary run name -> error message for runs that failed,
             script location as returned while preparing the last run's environment)
    Raises SubmitExperimentError for problems detected during preparation, upload,
    image build and submission.
    """

    script_parameters: Union[Tuple[str, ...], Tuple[(
    )]] = script_parameters if script_parameters else ()
    parameter_set: Union[Tuple[str, ...],
                         Tuple[()]] = parameter_set if parameter_set else ()
    parameter_range = parameter_range if parameter_range else []
    pack_params = pack_params if pack_params else []

    log.debug("Submit experiment - start")
    try:
        namespace = get_kubectl_current_context_namespace()
        global submitted_namespace
        submitted_namespace = namespace
    except Exception:
        message = Texts.GET_NAMESPACE_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    try:
        with spinner(text=Texts.PREPARING_RESOURCE_DEFINITIONS_MSG):
            experiment_name, labels = generate_exp_name_and_labels(
                script_name=script_location,
                namespace=namespace,
                name=name,
                run_kind=run_kind)
            runs_list = prepare_list_of_runs(experiment_name=experiment_name,
                                             parameter_range=parameter_range,
                                             parameter_set=parameter_set,
                                             template_name=template)
    except SubmitExperimentError as exe:
        log.exception(str(exe))
        raise exe
    except Exception:
        message = Texts.SUBMIT_PREPARATION_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    global submitted_experiment
    submitted_experiment = experiment_name

    # Ctrl-C handling
    signal.signal(signal.SIGINT, ctrl_c_handler_for_submit)
    signal.signal(signal.SIGTERM, ctrl_c_handler_for_submit)

    try:
        experiment_run_folders = [
        ]  # List of local directories used by experiment's runs
        try:
            cluster_registry_port = get_app_service_node_port(
                nauta_app_name=NAUTAAppNames.DOCKER_REGISTRY)
            # prepare environments for all experiment's runs
            for experiment_run in runs_list:
                # script parameters come first, run-specific parameters are appended to them
                if script_parameters and experiment_run.parameters:
                    current_script_parameters = script_parameters + experiment_run.parameters
                elif script_parameters:
                    current_script_parameters = script_parameters
                elif experiment_run.parameters:
                    current_script_parameters = experiment_run.parameters
                else:
                    current_script_parameters = None
                run_folder, script_location, pod_count = \
                    prepare_experiment_environment(experiment_name=experiment_name,
                                                   run_name=experiment_run.name,
                                                   local_script_location=script_location,
                                                   script_folder_location=script_folder_location,  # noqa: E501
                                                   script_parameters=current_script_parameters,
                                                   pack_type=template, pack_params=pack_params,
                                                   cluster_registry_port=cluster_registry_port,
                                                   env_variables=env_variables,
                                                   requirements_file=requirements_file,
                                                   username=namespace,
                                                   run_kind=run_kind)
                # Set correct pod count
                if not pod_count or pod_count < 1:
                    raise SubmitExperimentError(
                        'Unable to determine pod count: make sure that values.yaml '
                        'file in your pack has podCount field with positive integer value.'
                    )

                experiment_run.pod_count = pod_count
                experiment_run_folders.append(run_folder)
                script_name = None
                if script_location is not None:
                    script_name = os.path.basename(script_location)
                # Prepend script_name parameter to run description only for display purposes.
                experiment_run.parameters = script_parameters if not experiment_run.parameters \
                    else experiment_run.parameters + script_parameters
                if experiment_run.parameters and script_name:
                    experiment_run.parameters = (
                        script_name, ) + experiment_run.parameters
                elif script_name:
                    experiment_run.parameters = (script_name, )
        except SubmitExperimentError as e:
            log.exception(Texts.ENV_CREATION_ERROR_MSG)
            e.message += f' {Texts.ENV_CREATION_ERROR_MSG}'
            raise
        except Exception:
            # any error in this step breaks execution of this command
            # NOTE(review): this handler logs and cleans up but does not re-raise, so
            # execution continues below with the already deleted folders - confirm intent.
            message = Texts.ENV_CREATION_ERROR_MSG
            log.exception(message)
            # just in case - remove folders that were created with a success
            for experiment_run_folder in experiment_run_folders:
                delete_environment(experiment_run_folder)
        # if ps or pr option is used - first ask whether experiment(s) should be submitted
        if parameter_range or parameter_set:
            click.echo(Texts.CONFIRM_SUBMIT_MSG)
            click.echo(
                tabulate(
                    {
                        RUN_NAME: [run.name for run in runs_list],
                        RUN_PARAMETERS: [
                            "\n".join(run.parameters) if run.parameters else ""
                            for run in runs_list
                        ]
                    },
                    headers=[RUN_NAME, RUN_PARAMETERS],
                    tablefmt=TBLT_TABLE_FORMAT))
            if not click.confirm(Texts.CONFIRM_SUBMIT_QUESTION_MSG,
                                 default=True):
                for experiment_run_folder in experiment_run_folders:
                    delete_environment(experiment_run_folder)
                exit()
        # create Experiment model
        # TODO template_name & template_namespace should be filled after Template implementation
        parameter_range_spec = [
            f'-pr {param_name} {param_value}'
            for param_name, param_value in parameter_range
        ]
        parameter_set_spec = [f'-ps {ps_spec}' for ps_spec in parameter_set]
        experiment_parameters_spec = list(
            script_parameters) + parameter_range_spec + parameter_set_spec
        template_version = get_template_version(template)
        experiment = experiments_model.Experiment(
            name=experiment_name,
            template_name=template,
            parameters_spec=experiment_parameters_spec,
            template_namespace="template-namespace",
            template_version=template_version)
        experiment.create(namespace=namespace, labels=labels)

        with spinner('Uploading experiment...'):
            try:
                upload_experiment_to_git_repo_manager(
                    experiments_workdir=get_run_environment_path(''),
                    experiment_name=experiment_name,
                    run_name=runs_list[0].name,
                    username=namespace)
            except Exception:
                log.exception('Failed to upload experiment.')
                try:
                    experiment.state = experiments_model.ExperimentStatus.FAILED
                    experiment.update()
                except Exception:
                    log.exception(
                        f'Failed to set state of {experiment.name} experiment '
                        f'to {experiments_model.ExperimentStatus.FAILED}')
                raise SubmitExperimentError('Failed to upload experiment.')

        # Bound before the try block: when from_yaml() itself raises, the handler below
        # must still be able to mark the experiment as failed instead of failing on an
        # unbound local variable.
        image_build_workflow: Optional[ExperimentImageBuildWorkflow] = None
        with spinner('Building experiment image...'):
            try:
                image_build_workflow = ExperimentImageBuildWorkflow.from_yaml(
                    yaml_template_path=
                    f'{Config().config_path}/workflows/{EXP_IMAGE_BUILD_WORKFLOW_SPEC}',
                    username=namespace,
                    experiment_name=experiment_name)
                image_build_workflow.create(namespace=namespace)
                image_build_workflow.wait_for_completion()
            except Exception:
                error_msg = 'Failed to build experiment image.'
                log.exception(error_msg)

                if image_build_workflow is not None:
                    # Try to get workflow logs
                    _debug_workflow_logs(workflow=image_build_workflow,
                                         namespace=namespace)

                    if image_build_workflow.name:
                        error_msg += f' Run nctl workflow logs {image_build_workflow.name} command for more details.'

                try:
                    experiment.state = experiments_model.ExperimentStatus.FAILED
                    experiment.update()
                except Exception:
                    log.exception(
                        f'Failed to set state of {experiment.name} experiment '
                        f'to {experiments_model.ExperimentStatus.FAILED}')
                raise SubmitExperimentError(error_msg)
        # submit runs
        run_errors: Dict[str, str] = {}
        for run, run_folder in zip(runs_list, experiment_run_folders):
            try:
                run.state = RunStatus.QUEUED
                with spinner(text=Texts.CREATING_RESOURCES_MSG.format(
                        run_name=run.name)):
                    # Add Run object with runKind label and pack params as annotations
                    run.create(namespace=namespace,
                               labels={'runKind': run_kind.value},
                               annotations={
                                   pack_param_name: pack_param_value
                                   for pack_param_name, pack_param_value in
                                   pack_params
                               })
                    submitted_runs.append(run)
                    submit_draft_pack(run_name=run.name,
                                      run_folder=run_folder,
                                      namespace=namespace)
            except Exception as exe:
                # a failure of one run does not stop submission of the remaining ones
                delete_environment(run_folder)
                try:
                    run.state = RunStatus.FAILED
                    run_errors[run.name] = str(exe)
                    run.update()
                except Exception as rexe:
                    # update of non-existing run may fail
                    log.debug(Texts.ERROR_DURING_PATCHING_RUN.format(
                        str(rexe)))
        # Delete experiment if no Runs were submitted
        if not submitted_runs:
            click.echo(Texts.SUBMISSION_FAIL_ERROR_MSG)
            delete_k8s_object("experiment", experiment_name)
        # Change experiment status to submitted
        experiment.state = experiments_model.ExperimentStatus.SUBMITTED
        experiment.update()
    except LocalPortOccupiedError as exe:
        click.echo(exe.message)
        raise SubmitExperimentError(exe.message)
    except K8sProxyCloseError:
        log.exception('Error during closing of a proxy for a {}'.format(
            NAUTAAppNames.DOCKER_REGISTRY))
        raise K8sProxyCloseError(Texts.PROXY_CLOSE_ERROR_MSG)
    except K8sProxyOpenError:
        error_msg = Texts.PROXY_OPEN_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg)
    except SubmitExperimentError:
        raise
    except Exception as exe:
        error_msg = Texts.SUBMIT_OTHER_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg) from exe
    finally:
        # remove semaphores from all exp folders
        remove_sempahore(experiment_name)

    log.debug("Submit - finish")
    return runs_list, run_errors, script_location
Beispiel #10
0
def create_environment(experiment_name: str,
                       file_location: str = None,
                       folder_location: str = None,
                       show_folder_size_warning: bool = True,
                       max_folder_size_in_bytes: int = 1024 * 1024,
                       spinner_to_hide=None) -> str:
    """
    Creates a complete environment for executing a training using draft.

    :param experiment_name: name of an experiment used to create a folder
                            with content of an experiment
    :param file_location: location of a training script
    :param folder_location: location of a folder with additional data
    :param show_folder_size_warning: if True, a confirmation prompt is shown
                            when the script folder size reaches or exceeds
                            the value of max_folder_size_in_bytes
    :param max_folder_size_in_bytes: maximum script folder size (in bytes)
                            accepted without asking for a confirmation
    :param spinner_to_hide: optional spinner object; it is hidden before the
                            folder size prompt and shown again afterwards
    :return: path of the run environment folder created for the experiment
             (the folder holding experiment's artifacts)
    :raises SubmitExperimentError: if the experiment folder cannot be created
             or the training script / folder content cannot be copied
    :raises SystemExit: with exit code 2 if the user rejects the folder size
             confirmation prompt
    """
    log.debug("Create environment - start")
    message_prefix = Texts.CREATE_ENV_MSG_PREFIX

    # create a folder for experiment's purposes
    run_environment_path = get_run_environment_path(experiment_name)
    folder_path = os.path.join(run_environment_path, FOLDER_DIR_NAME)

    try:
        # exist_ok avoids the check-then-create race and reports an error
        # when the path is already taken by something that is not a directory
        os.makedirs(folder_path, exist_ok=True)
    except Exception as exe:
        log.exception("Create environment - creating experiment folder error.")
        raise SubmitExperimentError(
            message_prefix.format(
                reason=Texts.EXP_DIR_CANT_BE_CREATED)) from exe

    # create a semaphore saying that experiment is under submission
    Path(os.path.join(run_environment_path,
                      EXP_SUB_SEMAPHORE_FILENAME)).touch()

    # copy training script
    # NOTE(review): the folder content is copied after this step, so a file
    # with the same name coming from folder_location presumably replaces this
    # copy - confirm which of the two is expected to take precedence.
    if file_location:
        try:
            shutil.copy2(file_location, folder_path)
            if get_current_os() == OS.WINDOWS:
                # on Windows set read/write permission bits on the copied script
                os.chmod(
                    os.path.join(folder_path, os.path.basename(file_location)),
                    0o666)  # nosec
        except Exception as exe:
            log.exception(
                "Create environment - copying training script error.")
            raise SubmitExperimentError(
                message_prefix.format(
                    reason=Texts.TRAINING_SCRIPT_CANT_BE_CREATED)) from exe

    # copy folder content
    if folder_location:
        folder_size = get_total_directory_size_in_bytes(folder_location)
        if show_folder_size_warning and folder_size >= max_folder_size_in_bytes:
            # the spinner has to be hidden while the prompt waits for an answer
            if spinner_to_hide:
                spinner_to_hide.hide()
            if not click.confirm(
                    f'Experiment\'s script folder location size ({folder_size/1024/1024:.2f} MB) '
                    f'exceeds {max_folder_size_in_bytes/1024/1024:.2f} MB. '
                    f'It is highly recommended to use input/output shares for large amounts of data '
                    f'instead of submitting them along with experiment. Do you want to continue?'
            ):
                # the builtin exit() is injected by the site module and is not
                # guaranteed to exist (python -S, frozen executables), so
                # SystemExit is raised directly with the same exit code
                raise SystemExit(2)
            if spinner_to_hide:
                spinner_to_hide.show()
        try:
            copy_tree(folder_location, folder_path)
        except Exception as exe:
            log.exception(
                "Create environment - copying training folder error.")
            raise SubmitExperimentError(
                message_prefix.format(
                    reason=Texts.DIR_CANT_BE_COPIED_ERROR_TEXT)) from exe

    log.debug("Create environment - end")

    return run_environment_path