def generate_exp_name_and_labels(script_name: str, namespace: str, name: str = None,
                                 run_kind: RunKinds = RunKinds.TRAINING) -> Tuple[str, Dict[str, str]]:
    """
    Chooses a name for an experiment that is about to be submitted and builds its labels.

    :param script_name: path to the training script; only its last path component is used
    :param namespace: namespace in which existing experiments and pods are looked up
    :param name: name requested explicitly by the user (optional)
    :param run_kind: kind of run, forwarded to the label/name helpers
    :return: tuple (experiment name, labels)
    Raises SubmitExperimentError when an explicitly requested name is already taken by an experiment
    or when pods labelled with that run name still exist.
    """
    if script_name:
        script_name = Path(script_name).name

    if not name:
        # CASE 2: no name given, but experiments created from the same script already exist
        # --> reuse that experiment name with the next index appended
        indexed_name, indexed_labels = generate_name_for_existing_exps(script_name, namespace,
                                                                       run_kind=run_kind)
        if indexed_name:
            return indexed_name, indexed_labels

        # CASE 3: no name given and no experiment matches the script name
        # --> generate a brand new name
        fresh_name = generate_name(script_name)
        clashing = Experiment.list(namespace=namespace, name_filter=fresh_name)
        if clashing and len(clashing) > 0:
            # the number of experiments matching the filter becomes the suffix
            fresh_name = f'{fresh_name}-{len(clashing)}'
        return fresh_name, prepare_label(script_name, fresh_name, run_kind=run_kind)

    # CASE 1: the user passed a name explicitly - use it, unless an experiment with this name exists
    experiment = Experiment.get(namespace=namespace, name=name)
    if experiment:
        experiment_runs: List[Run] = experiment.get_runs()
        if experiment_runs:
            raise SubmitExperimentError(Texts.EXPERIMENT_ALREADY_EXISTS_ERROR_MSG.format(name=name))
        # subcase when experiment has no associated runs.
        raise SubmitExperimentError(Texts.EXPERIMENT_INVALID_STATE_MSG.format(name=name))

    # if there are still artifacts from previous experiment with the same name
    if list_pods(namespace=namespace, label_selector=f'runName={name}'):
        raise SubmitExperimentError(Texts.EXPERIMENT_PREV_EXP_STILL_TERMINATING)

    return name, prepare_label(script_name, name, name, run_kind=run_kind)
def create_environment(experiment_name: str, file_location: str, folder_location: str) -> str:
    """
    Creates a complete environment for executing a training using draft.

    :param experiment_name: name of an experiment used to create a folder with content of an experiment
    :param file_location: location of a training script
    :param folder_location: location of a folder with additional data
    :return: (experiment_folder)
    experiment_folder - folder with experiment's artifacts
    In case of any problems during creation of an environment it throws an
    exception with a description of a problem
    """
    log.debug("Create environment - start")
    message_prefix = Texts.CREATE_ENV_MSG_PREFIX

    # create a folder for experiment's purposes
    run_environment_path = get_run_environment_path(experiment_name)
    folder_path = os.path.join(run_environment_path, FOLDER_DIR_NAME)

    try:
        # exist_ok avoids the check-then-create race and also reports a path
        # that exists but is not a directory right here instead of later on
        os.makedirs(folder_path, exist_ok=True)
    except Exception as exe:
        log.exception("Create environment - creating experiment folder error.")
        raise SubmitExperimentError(message_prefix.format(reason=Texts.EXP_DIR_CANT_BE_CREATED)) from exe

    # create a semaphore saying that experiment is under submission
    Path(os.path.join(run_environment_path, EXP_SUB_SEMAPHORE_FILENAME)).touch()

    # copy training script - it overwrites the file taken from a folder_location
    if file_location:
        try:
            shutil.copy2(file_location, folder_path)
            if get_current_os() == OS.WINDOWS:
                os.chmod(os.path.join(folder_path, os.path.basename(file_location)), 0o666)
        except Exception as exe:
            log.exception("Create environment - copying training script error.")
            raise SubmitExperimentError(
                message_prefix.format(reason=Texts.TRAINING_SCRIPT_CANT_BE_CREATED)) from exe

    # copy folder content
    if folder_location:
        try:
            copy_tree(folder_location, folder_path)
        except Exception as exe:
            log.exception("Create environment - copying training folder error.")
            raise SubmitExperimentError(
                message_prefix.format(reason=Texts.DIR_CANT_BE_COPIED_ERROR_TEXT)) from exe

    log.debug("Create environment - end")

    return run_environment_path
def test_submit_start_depl_fail(prepare_mocks: SubmitExperimentMocks):
    """A run whose deployment raises SubmitExperimentError must end up in the FAILED state."""
    prepare_mocks.submit_one.side_effect = SubmitExperimentError()

    submit_kwargs = dict(script_location=SCRIPT_LOCATION,
                         script_folder_location=None,
                         pack_params=[],
                         template=None,
                         name=None,
                         parameter_range=[],
                         parameter_set=(),
                         script_parameters=(),
                         run_kind=RunKinds.TRAINING)
    submitted, _, _ = submit_experiment(**submit_kwargs)

    first_run = submitted[0]
    assert first_run.state == RunStatus.FAILED
    # the environment is removed once and the run is updated once
    check_asserts(prepare_mocks, del_env_count=1, update_run_count=1)
def test_submit_experiment_failure(prepare_mocks: SubmitMocks):
    """The submit command prints the error message and exits with code 1 when submission fails."""
    exe_message = "error message"
    prepare_mocks.submit_experiment.side_effect = SubmitExperimentError(exe_message)

    runner = CliRunner()
    result = runner.invoke(submit, [SCRIPT_LOCATION])

    expected_output = Texts.SUBMIT_ERROR_MSG.format(exception_message=exe_message)
    assert expected_output in result.output
    assert result.exit_code == 1
def submit_draft_pack(run_folder: str, run_name: str, namespace: str = None):
    """
    Submits one run using draft's environment located in a folder given as a parameter.

    :param run_folder: location of a folder with a description of an environment
    :param run_name: run's name
    :param namespace: namespace where tiller used during deployment is located
    In case of any problems it throws an exception with a description of a problem;
    the run's environment folder is removed before the exception is raised.
    """
    log.debug(f'Submit one run: {run_folder} - start')

    # run training
    try:
        cmd.up(run_name=run_name, working_directory=run_folder, namespace=namespace)
    except Exception as exe:
        # keep the real reason of the failure in the log and in the exception chain -
        # the error raised below carries only a generic message
        log.exception(Texts.JOB_NOT_DEPLOYED_ERROR_MSG)
        delete_environment(run_folder)
        raise SubmitExperimentError(Texts.JOB_NOT_DEPLOYED_ERROR_MSG) from exe

    log.debug(f'Submit one run {run_folder} - finish')
def submit_draft_pack(run_folder: str, namespace: str = None):
    """
    Deploys a single run from the draft environment stored in run_folder.

    :param run_folder: location of a folder with a description of an environment
    :param namespace: namespace where tiller used during deployment is located
    When deployment returns a non-zero exit code, the deployment log output is logged,
    the environment folder is deleted and SubmitExperimentError is raised.
    """
    log.debug(f'Submit one run: {run_folder} - start')

    # run training
    _, exit_code, log_output = cmd.up(working_directory=run_folder, namespace=namespace)

    if not exit_code:
        log.debug(f'Submit one run {run_folder} - finish')
        return

    # deployment failed - build the message, pointing at the log file when one is found
    failure_parts = [Texts.JOB_NOT_DEPLOYED_ERROR_MSG]
    log_filename = get_log_filename(str(log_output))
    if log_filename:
        failure_parts.append(Texts.JOB_NOT_DEPLOYED_ERROR_MSG_LOGFILE.format(log_filename=log_filename))

    log.error(log_output)
    delete_environment(run_folder)
    raise SubmitExperimentError(failure_parts[0] + failure_parts[1] if len(failure_parts) > 1
                                else failure_parts[0])
def prepare_experiment_environment(experiment_name: str, run_name: str, local_script_location: str,
                                   script_parameters: Tuple[str, ...],
                                   pack_type: str, local_registry_port: int, cluster_registry_port: int,
                                   script_folder_location: str = None,
                                   pack_params: List[Tuple[str, str]] = None,
                                   env_variables: List[str] = None,
                                   requirements_file: str = None) -> PrepareExperimentResult:
    """
    Builds the draft environment of a single run.

    :param experiment_name: name of an experiment
    :param run_name: name of an experiment run
    :param local_script_location: location of a script used for training purposes on local machine
    :param script_folder_location: location of an additional folder used in training
    :param script_parameters: parameters passed to a script
    :param pack_type: type of a pack used to start training job
    :param local_registry_port: port on which docker registry is accessible locally
    :param cluster_registry_port: port on which docker registry is accessible within nauta cluster
    :param pack_params: additional pack params
    :param env_variables: environmental variables to be passed to training
    :param requirements_file: path to a file with experiment requirements
    :return: PrepareExperimentResult holding the run's environment folder, the script name reported
    for the run and the pod count
    Any failure removes the run's environment folder and is re-raised as SubmitExperimentError.
    """
    log.debug(f'Prepare run {run_name} environment - start')
    run_folder = get_run_environment_path(run_name)
    # script name reported in the result; replaced by the notebook file when a conversion happens
    reported_script = local_script_location

    try:
        # check environment directory
        check_run_environment(run_folder)

        with spinner(text=Texts.CREATING_ENVIRONMENT_MSG.format(run_name=run_name)):
            # create an environment
            create_environment(run_name, local_script_location, script_folder_location)

            # generate draft's data
            _, exit_code, log_output = cmd.create(working_directory=run_folder, pack_type=pack_type)

            # copy requirements file if it was provided, create empty requirements file otherwise
            requirements_target = os.path.join(run_folder, 'requirements.txt')
            if not requirements_file:
                Path(requirements_target).touch()
            else:
                shutil.copyfile(requirements_file, requirements_target)

            if exit_code:
                raise SubmitExperimentError(
                    Texts.DRAFT_TEMPLATES_NOT_GENERATED_ERROR_MSG.format(reason=log_output))

            # Script location on experiment container
            remote_script_location = ''
            if local_script_location:
                remote_script_location = Path(local_script_location).name

            is_notebook_pack = pack_type in JUPYTER_NOTEBOOK_TEMPLATES_NAMES
            if is_notebook_pack and remote_script_location.endswith(".py"):
                # for interact (jupyter notebooks) try to convert .py file into .ipynb
                code_dir = os.path.join(run_folder, FOLDER_DIR_NAME)
                reported_script = convert_py_to_ipynb(os.path.join(code_dir, remote_script_location),
                                                      code_dir)

            # reconfigure draft's templates
            update_configuration(run_folder=run_folder,
                                 script_location=remote_script_location,
                                 script_parameters=script_parameters,
                                 experiment_name=experiment_name,
                                 run_name=run_name,
                                 local_registry_port=local_registry_port,
                                 cluster_registry_port=cluster_registry_port,
                                 pack_type=pack_type,
                                 pack_params=pack_params,
                                 script_folder_location=script_folder_location,
                                 env_variables=env_variables)

            pod_count = get_pod_count(run_folder=run_folder, pack_type=pack_type)
    except Exception as exe:
        delete_environment(run_folder)
        raise SubmitExperimentError('Problems during creation of environments.') from exe

    log.debug(f'Prepare run {run_name} environment - finish')
    return PrepareExperimentResult(folder_name=run_folder,
                                   script_name=reported_script,
                                   pod_count=pod_count)
def submit_experiment(template: str, name: str = None, run_kind: RunKinds = RunKinds.TRAINING,
                      script_location: str = None, script_parameters: Tuple[str, ...] = None,
                      pack_params: List[Tuple[str, str]] = None,
                      parameter_range: List[Tuple[str, str]] = None,
                      parameter_set: Tuple[str, ...] = None,
                      script_folder_location: str = None,
                      env_variables: List[str] = None,
                      requirements_file: str = None) -> Tuple[List[Run], Dict[str, str], str]:
    """
    Prepares environments for all runs of an experiment, creates the Experiment and Run objects
    and deploys every run.

    :param template: name of a pack (template) used to start the runs
    :param name: name of the experiment requested by the user (optional)
    :param run_kind: kind of the submitted runs, stored in the runKind label
    :param script_location: location of the training script
    :param script_parameters: parameters passed to the script
    :param pack_params: additional pack params, stored as annotations of every Run
    :param parameter_range: (name, value) pairs of the -pr option
    :param parameter_set: values of the -ps option
    :param script_folder_location: location of an additional folder used in training
    :param env_variables: environmental variables to be passed to training
    :param requirements_file: path to a file with experiment requirements
    :return: tuple (list of runs, dictionary run name -> error text for runs that failed,
    script location as returned while preparing the last run's environment)
    Raises SubmitExperimentError (or K8sProxyCloseError when closing of a proxy/tunnel fails).
    """
    # function-scope import: 'exit' is a helper injected by the site module and is
    # not guaranteed to exist outside of an interactive interpreter
    import sys

    global submitted_namespace
    global submitted_experiment

    script_parameters = script_parameters if script_parameters else ()
    parameter_set = parameter_set if parameter_set else ()
    parameter_range = parameter_range if parameter_range else []
    # pack_params is iterated when Run annotations are built - None would raise a TypeError there
    pack_params = pack_params if pack_params else []

    log.debug("Submit experiment - start")
    try:
        namespace = get_kubectl_current_context_namespace()
        submitted_namespace = namespace
    except Exception:
        message = Texts.GET_NAMESPACE_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    try:
        with spinner(text=Texts.PREPARING_RESOURCE_DEFINITIONS_MSG):
            experiment_name, labels = generate_exp_name_and_labels(script_name=script_location,
                                                                   namespace=namespace, name=name,
                                                                   run_kind=run_kind)
            runs_list = prepare_list_of_runs(experiment_name=experiment_name,
                                             parameter_range=parameter_range,
                                             parameter_set=parameter_set,
                                             template_name=template)
    except SubmitExperimentError as exe:
        log.exception(str(exe))
        raise exe
    except Exception:
        message = Texts.SUBMIT_PREPARATION_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    submitted_experiment = experiment_name

    # Ctrl-C handling
    signal.signal(signal.SIGINT, ctrl_c_handler_for_submit)
    signal.signal(signal.SIGTERM, ctrl_c_handler_for_submit)

    try:
        config = Config()

        # start port forwarding
        # noinspection PyBroadException
        with K8sProxy(NAUTAAppNames.DOCKER_REGISTRY, port=config.local_registry_port) as proxy:
            # Save port that was actually used in configuration
            if proxy.tunnel_port != config.local_registry_port:
                config.local_registry_port = proxy.tunnel_port

            experiment_run_folders = []  # List of local directories used by experiment's runs
            try:
                # run socat if on Windows or Mac OS
                if get_current_os() in (OS.WINDOWS, OS.MACOS):
                    # noinspection PyBroadException
                    try:
                        with spinner(text=Texts.CLUSTER_CONNECTION_MSG):
                            socat.start(proxy.tunnel_port)
                    except Exception:
                        error_msg = Texts.LOCAL_DOCKER_TUNNEL_ERROR_MSG
                        log.exception(error_msg)
                        raise SubmitExperimentError(error_msg)

                cluster_registry_port = get_app_service_node_port(
                    nauta_app_name=NAUTAAppNames.DOCKER_REGISTRY)

                # prepare environments for all experiment's runs
                for experiment_run in runs_list:
                    if script_parameters and experiment_run.parameters:
                        current_script_parameters = script_parameters + experiment_run.parameters
                    elif script_parameters:
                        current_script_parameters = script_parameters
                    elif experiment_run.parameters:
                        current_script_parameters = experiment_run.parameters
                    else:
                        current_script_parameters = ""

                    run_folder, script_location, pod_count = \
                        prepare_experiment_environment(experiment_name=experiment_name,
                                                       run_name=experiment_run.name,
                                                       local_script_location=script_location,
                                                       script_folder_location=script_folder_location,
                                                       script_parameters=current_script_parameters,
                                                       pack_type=template, pack_params=pack_params,
                                                       local_registry_port=proxy.tunnel_port,
                                                       cluster_registry_port=cluster_registry_port,
                                                       env_variables=env_variables,
                                                       requirements_file=requirements_file)

                    # Set correct pod count
                    if not pod_count or pod_count < 1:
                        raise SubmitExperimentError('Unable to determine pod count: make sure that values.yaml '
                                                    'file in your pack has podCount field with positive integer value.')
                    experiment_run.pod_count = pod_count

                    experiment_run_folders.append(run_folder)
                    script_name = None
                    if script_location is not None:
                        script_name = os.path.basename(script_location)

                    # Prepend script_name parameter to run description only for display purposes.
                    experiment_run.parameters = script_parameters if not experiment_run.parameters \
                        else experiment_run.parameters + script_parameters
                    if experiment_run.parameters and script_name:
                        experiment_run.parameters = (script_name, ) + experiment_run.parameters
                    elif script_name:
                        experiment_run.parameters = (script_name, )
            except SubmitExperimentError as e:
                log.exception(Texts.ENV_CREATION_ERROR_MSG)
                e.message += f' {Texts.ENV_CREATION_ERROR_MSG}'
                raise
            except Exception as exe:
                # any error in this step breaks execution of this command
                message = Texts.ENV_CREATION_ERROR_MSG
                log.exception(message)
                # just in case - remove folders that were created with a success
                for experiment_run_folder in experiment_run_folders:
                    delete_environment(experiment_run_folder)
                # stop here - the environments were just removed, so nothing can be submitted
                raise SubmitExperimentError(message) from exe

            # if ps or pr option is used - first ask whether experiment(s) should be submitted
            if parameter_range or parameter_set:
                click.echo(Texts.CONFIRM_SUBMIT_MSG)
                click.echo(tabulate({RUN_NAME: [run.name for run in runs_list],
                                     RUN_PARAMETERS: ["\n".join(run.parameters) if run.parameters else ""
                                                      for run in runs_list]},
                                    headers=[RUN_NAME, RUN_PARAMETERS], tablefmt="orgtbl"))

                if not click.confirm(Texts.CONFIRM_SUBMIT_QUESTION_MSG, default=True):
                    for experiment_run_folder in experiment_run_folders:
                        delete_environment(experiment_run_folder)
                    sys.exit()

            # create Experiment model
            # TODO template_name & template_namespace should be filled after Template implementation
            parameter_range_spec = [f'-pr {param_name} {param_value}'
                                    for param_name, param_value in parameter_range]
            parameter_set_spec = [f'-ps {ps_spec}' for ps_spec in parameter_set]
            experiment_parameters_spec = list(script_parameters) + parameter_range_spec + parameter_set_spec
            experiment = experiments_model.Experiment(name=experiment_name, template_name=template,
                                                      parameters_spec=experiment_parameters_spec,
                                                      template_namespace="template-namespace")

            experiment.create(namespace=namespace, labels=labels)

            # submit runs
            run_errors = {}
            for run, run_folder in zip(runs_list, experiment_run_folders):
                try:
                    run.state = RunStatus.QUEUED
                    with spinner(text=Texts.CREATING_RESOURCES_MSG.format(run_name=run.name)):
                        # Add Run object with runKind label and pack params as annotations
                        run.create(namespace=namespace, labels={'runKind': run_kind.value},
                                   annotations={pack_param_name: pack_param_value
                                                for pack_param_name, pack_param_value in pack_params})
                        submitted_runs.append(run)
                        submit_draft_pack(run_folder, namespace=namespace)
                except Exception as exe:
                    # a failure of one run does not stop the others - it is recorded in run_errors
                    delete_environment(run_folder)
                    try:
                        run.state = RunStatus.FAILED
                        run_errors[run.name] = str(exe)
                        run.update()
                    except Exception as rexe:
                        # update of non-existing run may fail
                        log.debug(Texts.ERROR_DURING_PATCHING_RUN.format(str(rexe)))

            # Delete experiment if no Runs were submitted
            if not submitted_runs:
                click.echo(Texts.SUBMISSION_FAIL_ERROR_MSG)
                delete_k8s_object("experiment", experiment_name)

            # Change experiment status to submitted
            experiment.state = experiments_model.ExperimentStatus.SUBMITTED
            experiment.update()
    except LocalPortOccupiedError as exe:
        click.echo(exe.message)
        raise SubmitExperimentError(exe.message)
    except K8sProxyCloseError:
        log.exception('Error during closing of a proxy for a {}'.format(NAUTAAppNames.DOCKER_REGISTRY))
        raise K8sProxyCloseError(Texts.PROXY_CLOSE_ERROR_MSG)
    except K8sProxyOpenError:
        error_msg = Texts.PROXY_OPEN_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg)
    except SubmitExperimentError:
        raise
    except Exception as exe:
        error_msg = Texts.SUBMIT_OTHER_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg) from exe
    finally:
        with spinner(text=Texts.CLUSTER_CONNECTION_CLOSING_MSG):
            # noinspection PyBroadException
            try:
                socat.stop()
            except Exception:
                log.exception("Error during closing of a proxy for a local docker-host tunnel")
                raise K8sProxyCloseError(Texts.DOCKER_TUNNEL_CLOSE_ERROR_MSG)
        # remove semaphores from all exp folders
        remove_sempahore(experiment_name)

    log.debug("Submit - finish")
    return runs_list, run_errors, script_location
def submit_experiment(
        template: str,
        name: str = None,
        run_kind: RunKinds = RunKinds.TRAINING,
        script_location: str = None,
        script_parameters: Tuple[str, ...] = None,
        pack_params: List[Tuple[str, str]] = None,
        parameter_range: List[Tuple[str, str]] = None,
        parameter_set: Tuple[str, ...] = None,
        script_folder_location: str = None,
        env_variables: List[str] = None,
        requirements_file: str = None
) -> Tuple[List[Run], Dict[str, str], Optional[str]]:
    """
    Prepares environments for all runs of an experiment, creates the Experiment object, uploads
    the experiment, builds its image and finally creates and deploys every run.

    :param template: name of a pack (template) used to start the runs
    :param name: name of the experiment requested by the user (optional)
    :param run_kind: kind of the submitted runs, stored in the runKind label
    :param script_location: location of the training script
    :param script_parameters: parameters passed to the script
    :param pack_params: additional pack params, stored as annotations of every Run
    :param parameter_range: (name, value) pairs of the -pr option
    :param parameter_set: values of the -ps option
    :param script_folder_location: location of an additional folder used in training
    :param env_variables: environmental variables to be passed to training
    :param requirements_file: path to a file with experiment requirements
    :return: tuple (list of runs, dictionary run name -> error text for runs that failed,
    script location as returned while preparing the last run's environment)
    Raises SubmitExperimentError when any of the steps preceding submission of runs fails.
    """
    # function-scope import: 'exit' is a helper injected by the site module and is
    # not guaranteed to exist outside of an interactive interpreter
    import sys

    global submitted_namespace
    global submitted_experiment

    script_parameters: Union[Tuple[str, ...], Tuple[()]] = script_parameters if script_parameters else ()
    parameter_set: Union[Tuple[str, ...], Tuple[()]] = parameter_set if parameter_set else ()
    parameter_range = parameter_range if parameter_range else []
    pack_params = pack_params if pack_params else []

    log.debug("Submit experiment - start")
    try:
        namespace = get_kubectl_current_context_namespace()
        submitted_namespace = namespace
    except Exception:
        message = Texts.GET_NAMESPACE_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    try:
        with spinner(text=Texts.PREPARING_RESOURCE_DEFINITIONS_MSG):
            experiment_name, labels = generate_exp_name_and_labels(
                script_name=script_location,
                namespace=namespace,
                name=name,
                run_kind=run_kind)
            runs_list = prepare_list_of_runs(experiment_name=experiment_name,
                                             parameter_range=parameter_range,
                                             parameter_set=parameter_set,
                                             template_name=template)
    except SubmitExperimentError as exe:
        log.exception(str(exe))
        raise exe
    except Exception:
        message = Texts.SUBMIT_PREPARATION_ERROR_MSG
        log.exception(message)
        raise SubmitExperimentError(message)

    submitted_experiment = experiment_name

    # Ctrl-C handling
    signal.signal(signal.SIGINT, ctrl_c_handler_for_submit)
    signal.signal(signal.SIGTERM, ctrl_c_handler_for_submit)

    try:
        experiment_run_folders = []  # List of local directories used by experiment's runs
        try:
            cluster_registry_port = get_app_service_node_port(
                nauta_app_name=NAUTAAppNames.DOCKER_REGISTRY)
            # prepare environments for all experiment's runs
            for experiment_run in runs_list:
                if script_parameters and experiment_run.parameters:
                    current_script_parameters = script_parameters + experiment_run.parameters
                elif script_parameters:
                    current_script_parameters = script_parameters
                elif experiment_run.parameters:
                    current_script_parameters = experiment_run.parameters
                else:
                    current_script_parameters = None

                run_folder, script_location, pod_count = \
                    prepare_experiment_environment(experiment_name=experiment_name,
                                                   run_name=experiment_run.name,
                                                   local_script_location=script_location,
                                                   script_folder_location=script_folder_location,
                                                   script_parameters=current_script_parameters,
                                                   pack_type=template, pack_params=pack_params,
                                                   cluster_registry_port=cluster_registry_port,
                                                   env_variables=env_variables,
                                                   requirements_file=requirements_file,
                                                   username=namespace,
                                                   run_kind=run_kind)

                # Set correct pod count
                if not pod_count or pod_count < 1:
                    raise SubmitExperimentError(
                        'Unable to determine pod count: make sure that values.yaml '
                        'file in your pack has podCount field with positive integer value.'
                    )
                experiment_run.pod_count = pod_count

                experiment_run_folders.append(run_folder)
                script_name = None
                if script_location is not None:
                    script_name = os.path.basename(script_location)

                # Prepend script_name parameter to run description only for display purposes.
                experiment_run.parameters = script_parameters if not experiment_run.parameters \
                    else experiment_run.parameters + script_parameters
                if experiment_run.parameters and script_name:
                    experiment_run.parameters = (script_name, ) + experiment_run.parameters
                elif script_name:
                    experiment_run.parameters = (script_name, )
        except SubmitExperimentError as e:
            log.exception(Texts.ENV_CREATION_ERROR_MSG)
            e.message += f' {Texts.ENV_CREATION_ERROR_MSG}'
            raise
        except Exception as exe:
            # any error in this step breaks execution of this command
            message = Texts.ENV_CREATION_ERROR_MSG
            log.exception(message)
            # just in case - remove folders that were created with a success
            for experiment_run_folder in experiment_run_folders:
                delete_environment(experiment_run_folder)
            # stop here - the environments were just removed, so nothing can be submitted
            raise SubmitExperimentError(message) from exe

        # if ps or pr option is used - first ask whether experiment(s) should be submitted
        if parameter_range or parameter_set:
            click.echo(Texts.CONFIRM_SUBMIT_MSG)
            click.echo(
                tabulate(
                    {
                        RUN_NAME: [run.name for run in runs_list],
                        RUN_PARAMETERS: [
                            "\n".join(run.parameters) if run.parameters else ""
                            for run in runs_list
                        ]
                    },
                    headers=[RUN_NAME, RUN_PARAMETERS],
                    tablefmt=TBLT_TABLE_FORMAT))

            if not click.confirm(Texts.CONFIRM_SUBMIT_QUESTION_MSG, default=True):
                for experiment_run_folder in experiment_run_folders:
                    delete_environment(experiment_run_folder)
                sys.exit()

        # create Experiment model
        # TODO template_name & template_namespace should be filled after Template implementation
        parameter_range_spec = [
            f'-pr {param_name} {param_value}'
            for param_name, param_value in parameter_range
        ]
        parameter_set_spec = [f'-ps {ps_spec}' for ps_spec in parameter_set]
        experiment_parameters_spec = list(script_parameters) + parameter_range_spec + parameter_set_spec
        template_version = get_template_version(template)
        experiment = experiments_model.Experiment(
            name=experiment_name,
            template_name=template,
            parameters_spec=experiment_parameters_spec,
            template_namespace="template-namespace",
            template_version=template_version)

        experiment.create(namespace=namespace, labels=labels)

        with spinner('Uploading experiment...'):
            try:
                upload_experiment_to_git_repo_manager(
                    experiments_workdir=get_run_environment_path(''),
                    experiment_name=experiment_name,
                    run_name=runs_list[0].name,
                    username=namespace)
            except Exception:
                log.exception('Failed to upload experiment.')
                try:
                    experiment.state = experiments_model.ExperimentStatus.FAILED
                    experiment.update()
                except Exception:
                    log.exception(
                        f'Failed to set state of {experiment.name} experiment '
                        f'to {experiments_model.ExperimentStatus.FAILED}')
                raise SubmitExperimentError('Failed to upload experiment.')

        with spinner('Building experiment image...'):
            # bound before the try block, so the handler below can tell whether
            # the workflow object was created at all
            image_build_workflow: Optional[ExperimentImageBuildWorkflow] = None
            try:
                image_build_workflow = ExperimentImageBuildWorkflow.from_yaml(
                    yaml_template_path=f'{Config().config_path}/workflows/{EXP_IMAGE_BUILD_WORKFLOW_SPEC}',
                    username=namespace,
                    experiment_name=experiment_name)
                image_build_workflow.create(namespace=namespace)
                image_build_workflow.wait_for_completion()
            except Exception:
                error_msg = 'Failed to build experiment image.'
                log.exception(error_msg)
                if image_build_workflow is not None:
                    # Try to get workflow logs
                    _debug_workflow_logs(workflow=image_build_workflow, namespace=namespace)

                    if image_build_workflow.name:
                        error_msg += f' Run nctl workflow logs {image_build_workflow.name} command for more details.'

                try:
                    experiment.state = experiments_model.ExperimentStatus.FAILED
                    experiment.update()
                except Exception:
                    log.exception(
                        f'Failed to set state of {experiment.name} experiment '
                        f'to {experiments_model.ExperimentStatus.FAILED}')

                raise SubmitExperimentError(error_msg)

        # submit runs
        run_errors: Dict[str, str] = {}
        for run, run_folder in zip(runs_list, experiment_run_folders):
            try:
                run.state = RunStatus.QUEUED
                with spinner(text=Texts.CREATING_RESOURCES_MSG.format(run_name=run.name)):
                    # Add Run object with runKind label and pack params as annotations
                    run.create(namespace=namespace,
                               labels={'runKind': run_kind.value},
                               annotations={
                                   pack_param_name: pack_param_value
                                   for pack_param_name, pack_param_value in pack_params
                               })
                    submitted_runs.append(run)
                    submit_draft_pack(run_name=run.name,
                                      run_folder=run_folder,
                                      namespace=namespace)
            except Exception as exe:
                # a failure of one run does not stop the others - it is recorded in run_errors
                delete_environment(run_folder)
                try:
                    run.state = RunStatus.FAILED
                    run_errors[run.name] = str(exe)
                    run.update()
                except Exception as rexe:
                    # update of non-existing run may fail
                    log.debug(Texts.ERROR_DURING_PATCHING_RUN.format(str(rexe)))

        # Delete experiment if no Runs were submitted
        if not submitted_runs:
            click.echo(Texts.SUBMISSION_FAIL_ERROR_MSG)
            delete_k8s_object("experiment", experiment_name)

        # Change experiment status to submitted
        experiment.state = experiments_model.ExperimentStatus.SUBMITTED
        experiment.update()
    except LocalPortOccupiedError as exe:
        click.echo(exe.message)
        raise SubmitExperimentError(exe.message)
    except K8sProxyCloseError:
        log.exception('Error during closing of a proxy for a {}'.format(
            NAUTAAppNames.DOCKER_REGISTRY))
        raise K8sProxyCloseError(Texts.PROXY_CLOSE_ERROR_MSG)
    except K8sProxyOpenError:
        error_msg = Texts.PROXY_OPEN_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg)
    except SubmitExperimentError:
        raise
    except Exception as exe:
        error_msg = Texts.SUBMIT_OTHER_ERROR_MSG
        log.exception(error_msg)
        raise SubmitExperimentError(error_msg) from exe
    finally:
        # remove semaphores from all exp folders
        remove_sempahore(experiment_name)

    log.debug("Submit - finish")
    return runs_list, run_errors, script_location
def create_environment(experiment_name: str,
                       file_location: str = None,
                       folder_location: str = None,
                       show_folder_size_warning=True,
                       max_folder_size_in_bytes=1024 * 1024,
                       spinner_to_hide=None) -> str:
    """
    Creates a complete environment for executing a training using draft.

    :param experiment_name: name of an experiment used to create a folder with content of an experiment
    :param file_location: location of a training script
    :param folder_location: location of a folder with additional data
    :param show_folder_size_warning: if True, a warning will be shown if script folder size exceeds
     value in max_folder_size_in_bytes param
    :param max_folder_size_in_bytes: maximum script folder size,
    :param spinner_to_hide: provide spinner, if it should be hidden before folder size warning
    :return: (experiment_folder)
    experiment_folder - folder with experiment's artifacts
    In case of any problems during creation of an environment it throws an
    exception with a description of a problem. If the user does not confirm the
    folder size warning, the process exits with code 2.
    """
    # function-scope import: 'exit' is a helper injected by the site module and is
    # not guaranteed to exist outside of an interactive interpreter
    import sys

    log.debug("Create environment - start")
    message_prefix = Texts.CREATE_ENV_MSG_PREFIX

    # create a folder for experiment's purposes
    run_environment_path = get_run_environment_path(experiment_name)
    folder_path = os.path.join(run_environment_path, FOLDER_DIR_NAME)

    try:
        # exist_ok avoids the check-then-create race and also reports a path
        # that exists but is not a directory right here instead of later on
        os.makedirs(folder_path, exist_ok=True)
    except Exception as exe:
        log.exception("Create environment - creating experiment folder error.")
        raise SubmitExperimentError(
            message_prefix.format(reason=Texts.EXP_DIR_CANT_BE_CREATED)) from exe

    # create a semaphore saying that experiment is under submission
    Path(os.path.join(run_environment_path, EXP_SUB_SEMAPHORE_FILENAME)).touch()

    # copy training script - it overwrites the file taken from a folder_location
    if file_location:
        try:
            shutil.copy2(file_location, folder_path)
            if get_current_os() == OS.WINDOWS:
                os.chmod(
                    os.path.join(folder_path, os.path.basename(file_location)),
                    0o666)  # nosec
        except Exception as exe:
            log.exception("Create environment - copying training script error.")
            raise SubmitExperimentError(
                message_prefix.format(reason=Texts.TRAINING_SCRIPT_CANT_BE_CREATED)) from exe

    # copy folder content
    if folder_location:
        folder_size = get_total_directory_size_in_bytes(folder_location)
        if show_folder_size_warning and folder_size >= max_folder_size_in_bytes:
            # the spinner is hidden while the question is displayed
            if spinner_to_hide:
                spinner_to_hide.hide()
            if not click.confirm(
                    f'Experiment\'s script folder location size ({folder_size/1024/1024:.2f} MB) '
                    f'exceeds {max_folder_size_in_bytes/1024/1024:.2f} MB. '
                    f'It is highly recommended to use input/output shares for large amounts of data '
                    f'instead of submitting them along with experiment. Do you want to continue?'
            ):
                sys.exit(2)
            if spinner_to_hide:
                spinner_to_hide.show()
        try:
            copy_tree(folder_location, folder_path)
        except Exception as exe:
            log.exception("Create environment - copying training folder error.")
            raise SubmitExperimentError(
                message_prefix.format(reason=Texts.DIR_CANT_BE_COPIED_ERROR_TEXT)) from exe

    log.debug("Create environment - end")

    return run_environment_path