Example #1
0
def submit_experiment(template: str, name: str = None, run_kind: RunKinds = RunKinds.TRAINING,
                      script_location: str = None, script_parameters: Tuple[str, ...] = None,
                      pack_params: List[Tuple[str, str]] = None, parameter_range: List[Tuple[str, str]] = None,
                      parameter_set: Tuple[str, ...] = None,
                      script_folder_location: str = None,
                      env_variables: List[str] = None,
                      requirements_file: str = None) -> (List[Run], Dict[str, str], str):

    script_parameters = script_parameters if script_parameters else ()
    parameter_set = parameter_set if parameter_set else ()
    parameter_range = parameter_range if parameter_range else []

    log.debug("Submit experiment - start")
    try:
        namespace = get_kubectl_current_context_namespace()
        global submitted_namespace
        submitted_namespace = namespace
    except Exception:
        message = Texts.GET_NAMESPACE_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    try:
        with spinner(text=Texts.PREPARING_RESOURCE_DEFINITIONS_MSG):
            experiment_name, labels = generate_exp_name_and_labels(script_name=script_location,
                                                                   namespace=namespace, name=name,
                                                                   run_kind=run_kind)
            runs_list = prepare_list_of_runs(experiment_name=experiment_name, parameter_range=parameter_range,
                                             parameter_set=parameter_set, template_name=template)
    except SubmitExperimentError as exe:
        log.exception(str(exe))
        raise exe
    except Exception:
        message = Texts.SUBMIT_PREPARATION_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    global submitted_experiment
    submitted_experiment = experiment_name

    # Ctrl-C handling
    signal.signal(signal.SIGINT, ctrl_c_handler_for_submit)
    signal.signal(signal.SIGTERM, ctrl_c_handler_for_submit)

    try:
        config = Config()

        # start port forwarding
        # noinspection PyBroadException
        with K8sProxy(NAUTAAppNames.DOCKER_REGISTRY, port=config.local_registry_port) as proxy:
            # Save port that was actually used in configuration
            if proxy.tunnel_port != config.local_registry_port:
                config.local_registry_port = proxy.tunnel_port

            experiment_run_folders = []  # List of local directories used by experiment's runs
            try:
                # run socat if on Windows or Mac OS
                if get_current_os() in (OS.WINDOWS, OS.MACOS):
                    # noinspection PyBroadException
                    try:
                        with spinner(text=Texts.CLUSTER_CONNECTION_MSG):
                            socat.start(proxy.tunnel_port)
                    except Exception:
                        error_msg = Texts.LOCAL_DOCKER_TUNNEL_ERROR_MSG
                        log.exception(error_msg)
                        raise SubmitExperimentError(error_msg)

                cluster_registry_port = get_app_service_node_port(nauta_app_name=NAUTAAppNames.DOCKER_REGISTRY)

                # prepare environments for all experiment's runs
                for experiment_run in runs_list:
                    if script_parameters and experiment_run.parameters:
                        current_script_parameters = script_parameters + experiment_run.parameters
                    elif script_parameters:
                        current_script_parameters = script_parameters
                    elif experiment_run.parameters:
                        current_script_parameters = experiment_run.parameters
                    else:
                        current_script_parameters = ""

                    run_folder, script_location, pod_count = \
                        prepare_experiment_environment(experiment_name=experiment_name,
                                                       run_name=experiment_run.name,
                                                       local_script_location=script_location,
                                                       script_folder_location=script_folder_location,  # noqa: E501
                                                       script_parameters=current_script_parameters,
                                                       pack_type=template, pack_params=pack_params,
                                                       local_registry_port=proxy.tunnel_port,
                                                       cluster_registry_port=cluster_registry_port,
                                                       env_variables=env_variables,
                                                       requirements_file=requirements_file)
                    # Set correct pod count
                    if not pod_count or pod_count < 1:
                        raise SubmitExperimentError('Unable to determine pod count: make sure that values.yaml '
                                                    'file in your pack has podCount field with positive integer value.')
                    experiment_run.pod_count = pod_count

                    experiment_run_folders.append(run_folder)
                    script_name = None
                    if script_location is not None:
                        script_name = os.path.basename(script_location)

                    # Prepend script_name parameter to run description only for display purposes.
                    experiment_run.parameters = script_parameters if not experiment_run.parameters \
                        else experiment_run.parameters + script_parameters
                    if experiment_run.parameters and script_name:
                        experiment_run.parameters = (script_name, ) + experiment_run.parameters
                    elif script_name:
                        experiment_run.parameters = (script_name, )
            except SubmitExperimentError as e:
                log.exception(Texts.ENV_CREATION_ERROR_MSG)
                e.message += f' {Texts.ENV_CREATION_ERROR_MSG}'
                raise
            except Exception:
                # any error in this step breaks execution of this command
                message = Texts.ENV_CREATION_ERROR_MSG
                log.exception(message)
                # just in case - remove folders that were created with a success
                for experiment_run_folder in experiment_run_folders:
                    delete_environment(experiment_run_folder)

            # if ps or pr option is used - first ask whether experiment(s) should be submitted
            if parameter_range or parameter_set:
                click.echo(Texts.CONFIRM_SUBMIT_MSG)
                click.echo(tabulate({RUN_NAME: [run.name for run in runs_list],
                                     RUN_PARAMETERS: ["\n".join(run.parameters) if run.parameters
                                                      else "" for run in runs_list]},
                                    headers=[RUN_NAME, RUN_PARAMETERS], tablefmt="orgtbl"))

                if not click.confirm(Texts.CONFIRM_SUBMIT_QUESTION_MSG, default=True):
                    for experiment_run_folder in experiment_run_folders:
                        delete_environment(experiment_run_folder)
                    exit()

            # create Experiment model
            # TODO template_name & template_namespace should be filled after Template implementation
            parameter_range_spec = [f'-pr {param_name} {param_value}' for param_name, param_value in parameter_range]
            parameter_set_spec = [f'-ps {ps_spec}' for ps_spec in parameter_set]
            experiment_parameters_spec = list(script_parameters) + parameter_range_spec + parameter_set_spec
            experiment = experiments_model.Experiment(name=experiment_name, template_name=template,
                                                      parameters_spec=experiment_parameters_spec,
                                                      template_namespace="template-namespace")

            experiment.create(namespace=namespace, labels=labels)

            # submit runs
            run_errors = {}
            for run, run_folder in zip(runs_list, experiment_run_folders):
                try:
                    run.state = RunStatus.QUEUED
                    with spinner(text=Texts.CREATING_RESOURCES_MSG.format(run_name=run.name)):
                        # Add Run object with runKind label and pack params as annotations
                        run.create(namespace=namespace, labels={'runKind': run_kind.value},
                                   annotations={pack_param_name: pack_param_value
                                                for pack_param_name, pack_param_value in pack_params})
                        submitted_runs.append(run)
                        submit_draft_pack(run_folder, namespace=namespace)
                except Exception as exe:
                    delete_environment(run_folder)
                    try:
                        run.state = RunStatus.FAILED
                        run_errors[run.name] = str(exe)
                        run.update()
                    except Exception as rexe:
                        # update of non-existing run may fail
                        log.debug(Texts.ERROR_DURING_PATCHING_RUN.format(str(rexe)))

            # Delete experiment if no Runs were submitted
            if not submitted_runs:
                click.echo(Texts.SUBMISSION_FAIL_ERROR_MSG)
                delete_k8s_object("experiment", experiment_name)

            # Change experiment status to submitted
            experiment.state = experiments_model.ExperimentStatus.SUBMITTED
            experiment.update()
    except LocalPortOccupiedError as exe:
        click.echo(exe.message)
        raise SubmitExperimentError(exe.message)
    except K8sProxyCloseError:
        log.exception('Error during closing of a proxy for a {}'.format(NAUTAAppNames.DOCKER_REGISTRY))
        raise K8sProxyCloseError(Texts.PROXY_CLOSE_ERROR_MSG)
    except K8sProxyOpenError:
        error_msg = Texts.PROXY_OPEN_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg)
    except SubmitExperimentError:
        raise
    except Exception as exe:
        error_msg = Texts.SUBMIT_OTHER_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg) from exe
    finally:
        with spinner(text=Texts.CLUSTER_CONNECTION_CLOSING_MSG):
            # noinspection PyBroadException
            try:
                socat.stop()
            except Exception:
                log.exception("Error during closing of a proxy for a local docker-host tunnel")
                raise K8sProxyCloseError(Texts.DOCKER_TUNNEL_CLOSE_ERROR_MSG)
        # remove semaphores from all exp folders
        remove_sempahore(experiment_name)

    log.debug("Submit - finish")
    return runs_list, run_errors, script_location