def modify_dockerfile(experiment_folder: str, experiment_name: str, username: str,
                      script_location: str = None, script_folder_location: str = None):
    """
    Rewrite the experiment's Dockerfile in place so it builds against the NAUTA platform registry.

    - 'ADD training.py' template lines become a COPY of the whole experiment folder
      (or are dropped when neither a script nor a script folder was given).
    - 'FROM nauta/...' base images are remapped to the platform registry, choosing the
      py2/py3 variant based on the original tag.
    - Experiment name and username are appended as ENV entries.

    :param experiment_folder: folder containing the Dockerfile to modify
    :param experiment_name: experiment name, stored in the NAUTA_EXPERIMENT_NAME env var
    :param username: experiment owner, stored in the NAUTA_USERNAME env var
    :param script_location: location of a training script, if any
    :param script_folder_location: location of a folder with training scripts, if any
    """
    log.debug("Modify dockerfile - start")
    dockerfile_name = os.path.join(experiment_folder, "Dockerfile")
    dockerfile_temp_name = os.path.join(experiment_folder, "Dockerfile_Temp")
    dockerfile_temp_content = ""
    with open(dockerfile_name, "r") as dockerfile:
        for line in dockerfile:
            if line.startswith("ADD training.py"):
                if script_location or script_folder_location:
                    # Trailing newline keeps the next Dockerfile line separate;
                    # the previous code merged the following line into this instruction.
                    dockerfile_temp_content += f"COPY {FOLDER_DIR_NAME} .\n"
            elif line.startswith("FROM nauta/tensorflow-py"):
                nauta_config_map = NAUTAConfigMap()
                if '-py2' in line:
                    tf_image_name = nauta_config_map.py2_image_name
                else:
                    tf_image_name = nauta_config_map.py3_image_name
                tf_image_repository = f'{NAUTA_REGISTRY_ADDRESS}/{tf_image_name}'
                dockerfile_temp_content += f'FROM {tf_image_repository}\n'
            elif line.startswith("FROM nauta/horovod"):
                nauta_config_map = NAUTAConfigMap()
                if '-py2' in line:
                    horovod_image_name = nauta_config_map.py2_horovod_image_name
                else:
                    horovod_image_name = nauta_config_map.py3_horovod_image_name
                image_repository = f'{NAUTA_REGISTRY_ADDRESS}/{horovod_image_name}'
                dockerfile_temp_content += f'FROM {image_repository}\n'
            elif line.startswith("FROM nauta/pytorch"):
                nauta_config_map = NAUTAConfigMap()
                pytorch_image_name = nauta_config_map.pytorch_image_name
                image_repository = f'{NAUTA_REGISTRY_ADDRESS}/{pytorch_image_name}'
                dockerfile_temp_content += f'FROM {image_repository}\n'
            else:
                dockerfile_temp_content += line
    # Append experiment metadata to Dockerfile - besides enabling access to experiment/user name in experiment's
    # container, it will also make image manifest digest unique, in order to avoid issues with race conditions when
    # image manifest is pushed to docker registry
    dockerfile_temp_content += f'\nENV NAUTA_EXPERIMENT_NAME {experiment_name}\n'
    dockerfile_temp_content += f'\nENV NAUTA_USERNAME {username}\n'
    with open(dockerfile_temp_name, "w") as dockerfile_temp:
        dockerfile_temp.write(dockerfile_temp_content)
    # Replace the original Dockerfile with the rewritten one.
    shutil.move(dockerfile_temp_name, dockerfile_name)
    log.debug("Modify dockerfile - end")
def __init__(self, username: str = None, experiment_name: str = None, name: str = None, namespace: str = None,
             started_at: str = None, finished_at: str = None, status: dict = None, phase: str = None,
             body: dict = None, k8s_custom_object_api: CustomObjectsApi = None, failure_message: str = None,
             steps: List[ArgoWorkflowStep] = None):
    """
    Build an image-build workflow wrapper for the given experiment.

    Generic Argo workflow fields are delegated to the parent constructor; run
    metadata and the parameters handed to the build workflow are stored here.
    """
    super().__init__(k8s_custom_object_api=k8s_custom_object_api, name=name, namespace=namespace,
                     body=body, steps=steps)
    # Run metadata reported back by the workflow.
    self.started_at = started_at
    self.finished_at = finished_at
    self.status = status
    self.phase = phase
    self.failure_message = failure_message
    # Parameters consumed by the image-build workflow template.
    build_parameters = {
        'git-address': self.GIT_REPO_MANAGER_SERVICE,
        'docker-registry-address': self.DOCKER_REGISTRY_SERVICE,
        'buildkitd-address': self.BUILDKITD_SERVICE,
        'cluster-registry-address': NAUTAConfigMap().registry,
        'user-name': username,
        'experiment-name': experiment_name,
    }
    self.parameters = build_parameters
    self.generate_name = f'{experiment_name}-image-build-'
    self.experiment_name = experiment_name
def version(ctx: click.Context):
    """ Returns the version of the installed nctl application. """
    platform_version: Optional[str] = Texts.INITIAL_PLATFORM_VERSION
    error_msg = ""
    platform_version_fail = False
    try:
        platform_version = NAUTAConfigMap(
            config_map_request_timeout=PLATFORM_VERSION_REQUEST_TIMEOUT).platform_version
        if not platform_version:
            # Previously this raised ValueError(Texts.KUBECTL_INT_ERROR_MSG), which fell into
            # the generic `except Exception` below and reported OTHER_ERROR_MSG instead of the
            # intended message. Set the flag and message directly.
            platform_version_fail = True
            error_msg = Texts.KUBECTL_INT_ERROR_MSG
    except KubernetesError:
        error_msg = Texts.KUBECTL_INT_ERROR_MSG
        platform_version_fail = True
    except Exception:
        error_msg = Texts.OTHER_ERROR_MSG
        platform_version_fail = True
    # Always print the table - app version is known even when the platform query failed.
    version_table: List[list] = [[Texts.TABLE_APP_ROW_NAME, VERSION],
                                 [Texts.TABLE_PLATFORM_ROW_NAME, platform_version]]
    click.echo(tabulate(version_table, headers=Texts.TABLE_HEADERS, tablefmt=TBLT_TABLE_FORMAT))
    if platform_version_fail:
        handle_error(logger, error_msg, error_msg, add_verbosity_msg=ctx.obj.verbosity == 0)
def config(ctx: click.Context, cpu: str, memory: str):
    """Validate new CPU/memory settings and propagate them to all packs."""
    def _abort(message: str):
        # Report the problem and terminate with exit code 1.
        handle_error(logger, message, message)
        sys.exit(1)

    # Both values are mandatory and must be well-formed k8s resource strings.
    if not cpu or not memory:
        _abort(Texts.MISSING_ARGUMENTS)
    if not validate_cpu_settings(cpu):
        _abort(Texts.CPU_WRONG_FORMAT)
    if not validate_memory_settings(memory):
        _abort(Texts.MEMORY_WRONG_FORMAT)

    configuration = NAUTAConfigMap()
    # Reject values below the platform-defined minimums, when such minimums are configured.
    minimal_memory = configuration.minimal_node_memory_amount
    if minimal_memory and convert_k8s_memory_resource(minimal_memory) > convert_k8s_memory_resource(memory):
        _abort(Texts.MEMORY_SETTINGS_TOO_LOW.format(memory_value=minimal_memory))
    minimal_cpu = configuration.minimal_node_cpu_number
    if minimal_cpu and convert_k8s_cpu_resource(minimal_cpu) > convert_k8s_cpu_resource(cpu):
        _abort(Texts.CPU_SETTINGS_TOO_LOW.format(cpu_value=minimal_cpu))

    update_resources_in_packs(cpu, memory)
    click.echo(Texts.SUCCESS_MESSAGE)
def version(state: State):
    """ Returns the version of the installed nctl application. """
    platform_version = Texts.INITIAL_PLATFORM_VERSION
    platform_version_fail, error_msg = False, ""
    try:
        config_map = NAUTAConfigMap(config_map_request_timeout=PLATFORM_VERSION_REQUEST_TIMEOUT)
        platform_version = config_map.platform_version
    except KubernetesError:
        platform_version_fail, error_msg = True, Texts.KUBECTL_INT_ERROR_MSG
    except Exception:
        platform_version_fail, error_msg = True, Texts.OTHER_ERROR_MSG
    # The table is printed even on failure - the app version is always known.
    rows = [[Texts.TABLE_APP_ROW_NAME, VERSION],
            [Texts.TABLE_PLATFORM_ROW_NAME, platform_version]]
    click.echo(tabulate(rows, headers=Texts.TABLE_HEADERS, tablefmt="orgtbl"))
    if platform_version_fail:
        handle_error(logger, error_msg, error_msg, add_verbosity_msg=state.verbosity == 0)
def export(path: str, format: str, operation_options: Tuple[str, ...]):
    """
    Create an export workflow for a saved model.

    :param path: path to the saved model directory
    :param format: export format; must match one of the export workflow YAML templates
    :param operation_options: extra options forwarded to the workflow as a single string
    """
    additional_params_str = " ".join(operation_options)
    format = format.lower()
    workflow_exports_files = os.listdir(f'{Config().config_path}/workflows/exports')
    # NOTE: str.rstrip('.yaml') strips any trailing '.', 'y', 'a', 'm', 'l' characters
    # (e.g. 'array.yaml' -> 'arr'); splitext removes exactly the extension.
    formats = [os.path.splitext(file)[0] for file in workflow_exports_files if file.endswith('.yaml')]
    if format not in formats:
        click.echo(f'Format: {format} does not exist. Choose from: {formats}')
        sys.exit(2)
    try:
        current_namespace = get_kubectl_current_context_namespace()
        export_workflow = ArgoWorkflow.from_yaml(
            f'{Config().config_path}/workflows/exports/{format}.yaml')
        export_workflow.parameters = {
            'cluster-registry-address': NAUTAConfigMap().registry,
            'saved-model-dir-path': path,
            'additional-params': additional_params_str
        }
        export_workflow.create(namespace=current_namespace)
    except Exception:
        error_msg = 'Failed to create export workflow.'
        click.echo(error_msg)
        logger.exception(error_msg)
        sys.exit(1)
    click.echo(f'Successfully created export workflow: {export_workflow.name}')
def modify_dockerfile(experiment_folder: str, script_location: str,
                      local_registry_port: int, script_folder_location: str = None):
    """
    Rewrite the experiment's Dockerfile in place so it builds against a local registry tunnel.

    - 'ADD training.py' template lines become a COPY of the whole experiment folder
      (or are dropped when neither a script nor a script folder was given).
    - 'FROM nauta/...' base images are remapped to 127.0.0.1:<local_registry_port>,
      choosing the py2/py3 variant based on the original tag; the remapped image is
      pulled from the platform registry right away.

    :param experiment_folder: folder containing the Dockerfile to modify
    :param script_location: location of a training script, if any
    :param local_registry_port: local port tunnelled to the platform's docker registry
    :param script_folder_location: location of a folder with training scripts, if any
    """
    log.debug("Modify dockerfile - start")
    dockerfile_name = os.path.join(experiment_folder, "Dockerfile")
    dockerfile_temp_name = os.path.join(experiment_folder, "Dockerfile_Temp")
    dockerfile_temp_content = ""
    with open(dockerfile_name, "r") as dockerfile:
        for line in dockerfile:
            if line.startswith("ADD training.py"):
                if script_location or script_folder_location:
                    # Trailing newline keeps the next Dockerfile line separate;
                    # the previous code merged the following line into this instruction.
                    dockerfile_temp_content += f"COPY {FOLDER_DIR_NAME} .\n"
            elif line.startswith("FROM nauta/tensorflow-py"):
                nauta_config_map = NAUTAConfigMap()
                if '-py2' in line:
                    tf_image_name = nauta_config_map.py2_image_name
                else:
                    tf_image_name = nauta_config_map.py3_image_name
                tf_image_repository = f'127.0.0.1:{local_registry_port}/{tf_image_name}'
                dockerfile_temp_content += f'FROM {tf_image_repository}\n'
                # pull image from platform's registry
                pull_tf_image(tf_image_repository=tf_image_repository)
            elif line.startswith("FROM nauta/horovod"):
                nauta_config_map = NAUTAConfigMap()
                if '-py2' in line:
                    image_name = nauta_config_map.py2_horovod_image_name
                else:
                    image_name = nauta_config_map.py3_horovod_image_name
                image_repository = f'127.0.0.1:{local_registry_port}/{image_name}'
                dockerfile_temp_content += f'FROM {image_repository}\n'
                # pull image from platform's registry
                pull_tf_image(tf_image_repository=image_repository)
            else:
                dockerfile_temp_content += line
    with open(dockerfile_temp_name, "w") as dockerfile_temp:
        dockerfile_temp.write(dockerfile_temp_content)
    # Replace the original Dockerfile with the rewritten one.
    shutil.move(dockerfile_temp_name, dockerfile_name)
    log.debug("Modify dockerfile - end")
def modify_dockerfile(experiment_folder: str, script_location: str = None,
                      script_folder_location: str = None):
    """
    Rewrite the experiment's Dockerfile in place so it builds against the NAUTA platform registry.

    - 'ADD training.py' template lines become a COPY of the whole experiment folder
      (or are dropped when neither a script nor a script folder was given).
    - 'FROM nauta/...' base images are remapped to the platform registry, choosing the
      py2/py3 variant based on the original tag.

    :param experiment_folder: folder containing the Dockerfile to modify
    :param script_location: location of a training script, if any
    :param script_folder_location: location of a folder with training scripts, if any
    """
    log.debug("Modify dockerfile - start")
    dockerfile_name = os.path.join(experiment_folder, "Dockerfile")
    dockerfile_temp_name = os.path.join(experiment_folder, "Dockerfile_Temp")
    dockerfile_temp_content = ""
    with open(dockerfile_name, "r") as dockerfile:
        for line in dockerfile:
            if line.startswith("ADD training.py"):
                if script_location or script_folder_location:
                    # Trailing newline keeps the next Dockerfile line separate;
                    # the previous code merged the following line into this instruction.
                    dockerfile_temp_content += f"COPY {FOLDER_DIR_NAME} .\n"
            elif line.startswith("FROM nauta/tensorflow-py"):
                nauta_config_map = NAUTAConfigMap()
                if '-py2' in line:
                    tf_image_name = nauta_config_map.py2_image_name
                else:
                    tf_image_name = nauta_config_map.py3_image_name
                tf_image_repository = f'{NAUTA_REGISTRY_ADDRESS}/{tf_image_name}'
                dockerfile_temp_content += f'FROM {tf_image_repository}\n'
            elif line.startswith("FROM nauta/horovod"):
                nauta_config_map = NAUTAConfigMap()
                if '-py2' in line:
                    horovod_image_name = nauta_config_map.py2_horovod_image_name
                else:
                    horovod_image_name = nauta_config_map.py3_horovod_image_name
                image_repository = f'{NAUTA_REGISTRY_ADDRESS}/{horovod_image_name}'
                dockerfile_temp_content += f'FROM {image_repository}\n'
            elif line.startswith("FROM nauta/pytorch"):
                nauta_config_map = NAUTAConfigMap()
                pytorch_image_name = nauta_config_map.pytorch_image_name
                image_repository = f'{NAUTA_REGISTRY_ADDRESS}/{pytorch_image_name}'
                dockerfile_temp_content += f'FROM {image_repository}\n'
            else:
                dockerfile_temp_content += line
    with open(dockerfile_temp_name, "w") as dockerfile_temp:
        dockerfile_temp.write(dockerfile_temp_content)
    # Replace the original Dockerfile with the rewritten one.
    shutil.move(dockerfile_temp_name, dockerfile_name)
    log.debug("Modify dockerfile - end")
def export(path: str, format: str, operation_options: Tuple[str, ...]):
    """Create an export workflow for a saved model, or list available export formats."""
    if path == FORMATS_OPTION:
        # Special pseudo-path: print the available export workflows and quit.
        try:
            list_of_workflows = get_list_of_workflows(EXPORT_WORKFLOWS_LOCATION)
        except Exception:
            handle_error(logger, Texts.EXPORT_LIST_ERROR_MSG, Texts.EXPORT_LIST_ERROR_MSG)
            sys.exit(1)
        click.echo(tabulate(list_of_workflows, headers=EXPORT_LIST_HEADERS,
                            tablefmt=TBLT_TABLE_FORMAT))
        sys.exit(0)

    # Discover available formats from the export workflow templates on disk.
    config_path = Config().config_path
    formats: List[str] = []
    if os.path.isdir(config_path):
        workflow_exports_files = os.listdir(f'{config_path}/workflows/exports')
        formats = [os.path.splitext(file)[0]
                   for file in workflow_exports_files if file.endswith('.yaml')]

    if not format:
        click.echo(Texts.MISSING_EXPORT_FORMAT.format(formats=formats))
        sys.exit(2)
    format = format.lower()
    if format not in formats:
        click.echo(Texts.WRONG_EXPORT_FORMAT.format(format=format, formats=formats))
        sys.exit(2)

    additional_params_str = " ".join(operation_options)
    try:
        current_namespace = get_kubectl_current_context_namespace()
        export_workflow = ArgoWorkflow.from_yaml(f'{Config().config_path}/workflows/exports/{format}.yaml')
        export_workflow.parameters = {
            'cluster-registry-address': NAUTAConfigMap().registry,
            'saved-model-dir-path': path,
            'additional-params': additional_params_str
        }
        export_workflow.create(namespace=current_namespace)
        # Re-fetch the created workflow so its CLI representation can be displayed.
        workflow: ArgoWorkflow = ArgoWorkflow.get(namespace=current_namespace, name=export_workflow.name)
    except Exception:
        error_msg = 'Failed to create export workflow.'
        click.echo(error_msg)
        logger.exception(error_msg)
        sys.exit(1)

    click.echo(tabulate([workflow.cli_representation], headers=MODEL_HEADERS, tablefmt=TBLT_TABLE_FORMAT))
    click.echo('\nSuccessfully created export workflow')
def process(path: str, kind: str, options: Tuple[str, ...]):
    """
    Create a process workflow of the given kind for a saved model.

    :param path: path to the saved model directory
    :param kind: process kind; must match one of the process workflow YAML templates
    :param options: extra options forwarded to the workflow as a single string
    """
    additional_params_str = " ".join(options)
    kind = kind.lower()
    config_path = Config().config_path
    process_path = f'{config_path}/workflows/processes'
    # Discover available kinds from the process workflow templates on disk.
    kinds: List[str] = []
    if os.path.isdir(process_path):
        process_kinds = os.listdir(process_path)
        kinds = [os.path.splitext(file)[0] for file in process_kinds if file.endswith('.yaml')]
    if kind not in kinds:
        click.echo(Texts.WRONG_PROCESS_KIND.format(process=kind, kinds=kinds))
        sys.exit(2)
    try:
        current_namespace = get_kubectl_current_context_namespace()
        process_workflow = ArgoWorkflow.from_yaml(f'{process_path}/{kind}.yaml')
        process_workflow.parameters = {
            'cluster-registry-address': NAUTAConfigMap().registry,
            'saved-model-dir-path': path,
            'additional-params': additional_params_str
        }
        process_workflow.create(namespace=current_namespace)
    except Exception:
        # Fixed copy-paste from the export command: this is the process workflow.
        error_msg = 'Failed to create process workflow.'
        click.echo(error_msg)
        logger.exception(error_msg)
        sys.exit(1)
    click.echo(f'Successfully created process workflow: {process_workflow.name}')
def create(state: State, username: str, list_only: bool, filename: str):
    """
    Adds a new user with a name given as a parameter.

    :param username: name of a new user
    """
    # --list-only and --filename are mutually exclusive output modes.
    if list_only and filename:
        handle_error(user_msg=Texts.F_L_OPTIONS_EXCLUSION_ERROR_MSG)
        exit(1)
    # Phase 1: verification - name validity, admin rights, and the user's current state.
    try:
        try:
            validate_user_name(username)
        except ValueError as exe:
            handle_error(logger, Texts.NAME_VALIDATION_ERROR_MSG.format(username=username), str(exe),
                         add_verbosity_msg=state.verbosity == 0)
            exit(1)
        # Only administrators may create users.
        if not is_current_user_administrator():
            handle_error(logger, Texts.USER_NOT_ADMIN_ERROR_MSG, Texts.USER_NOT_ADMIN_ERROR_MSG)
            exit(1)
        user_state = check_users_presence(username)
        if user_state == UserState.ACTIVE:
            handle_error(logger, Texts.USER_ALREADY_EXISTS_ERROR_MSG.format(username=username),
                         Texts.USER_ALREADY_EXISTS_ERROR_MSG.format(username=username))
            exit(1)
        # A user mid-deletion cannot be re-created yet.
        if user_state == UserState.TERMINATING:
            handle_error(logger, Texts.USER_BEING_REMOVED_ERROR_MSG.format(username=username),
                         Texts.USER_BEING_REMOVED_ERROR_MSG.format(username=username))
            exit(1)
    except Exception:
        handle_error(logger, Texts.USER_VERIFICATION_ERROR_MSG.format(username=username),
                     Texts.USER_VERIFICATION_ERROR_MSG.format(username=username),
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)
    # Phase 2: create the user via a helm chart install; on failure roll back with delete_user.
    try:
        with spinner(text=Texts.CREATING_USER_PROGRESS_MSG.format(username=username)):
            chart_location = os.path.join(Config().config_path, ADD_USER_CHART_NAME)
            nauta_config_map = NAUTAConfigMap()
            tiller_location = nauta_config_map.image_tiller
            tensorboard_service_location = nauta_config_map.image_tensorboard_service
            add_user_command = ["helm", "install", "--wait", "--namespace", username, "--name", username,
                                chart_location, "--set", "global.nauta=nauta", "--set", f"username={username}",
                                "--set", "TillerImage={}".format(tiller_location),
                                "--set", f"TensorboardServiceImage={tensorboard_service_location}"]
            env = os.environ.copy()
            # Prepend the nctl config path so the bundled helm binary is found first.
            env['PATH'] = Config().config_path + os.pathsep + env['PATH']
            _, err_code, log_output = execute_system_command(' '.join(add_user_command), env=env, shell=True)
        if err_code:
            handle_error(logger, log_output, Texts.USER_ADD_ERROR_MSG,
                         add_verbosity_msg=state.verbosity == 0)
            # Roll back the partially-created user before exiting.
            if not delete_user(username):
                handle_error(user_msg=Texts.REMOVE_USER_ERROR_MSG.format(username=username))
            sys.exit(1)
        # Token and certificate fetch failures are non-fatal: fall back to empty values
        # so the kubeconfig can still be generated (it will just be incomplete).
        try:
            users_password = get_users_token(username)
        except Exception:
            handle_error(logger, Texts.PASSWORD_GATHER_ERROR_MSG, Texts.PASSWORD_GATHER_ERROR_MSG,
                         add_verbosity_msg=state.verbosity == 0)
            users_password = ""
        try:
            cert = get_certificate(username)
        except Exception:
            handle_error(logger, Texts.CERT_GATHER_ERROR_MSG, Texts.CERT_GATHER_ERROR_MSG,
                         add_verbosity_msg=state.verbosity == 0)
            cert = ""
    except Exception:
        handle_error(logger, Texts.USER_ADD_ERROR_MSG.format(username=username),
                     Texts.USER_ADD_ERROR_MSG.format(username=username),
                     add_verbosity_msg=state.verbosity == 0)
        # Roll back the partially-created user before exiting.
        if not delete_user(username):
            handle_error(user_msg=Texts.REMOVE_USER_ERROR_MSG.format(username=username))
        sys.exit(1)
    # Wait up to 90 seconds for the user to become fully created.
    if is_user_created(username, 90):
        click.echo(Texts.USER_CREATION_SUCCESS_MSG.format(username=username))
    else:
        # if during 90 seconds a user hasn't been created - app displays information about it
        # but don't step processing the command - config file generated here my be useful later
        # when user has been created
        click.echo(Texts.USER_NOT_READY_ERROR_MSG.format(username=username))
    # Phase 3: generate the kubeconfig and either print it or save it to a file.
    try:
        kubeconfig = generate_kubeconfig(username, username, get_kubectl_host(), users_password, cert)
    except Exception:
        handle_error(logger, Texts.CONFIG_CREATION_ERROR_MSG, Texts.CONFIG_CREATION_ERROR_MSG,
                     add_verbosity_msg=state.verbosity == 0)
        exit(1)
    if list_only:
        click.echo(Texts.LIST_ONLY_HEADER)
        click.echo(kubeconfig)
    else:
        if not filename:
            filename = DEFAULT_FILENAME.format(username)
        try:
            with open(filename, "w") as file:
                file.write(kubeconfig)
            click.echo(Texts.CONFIG_SAVE_SUCCESS_MSG.format(filename=filename))
        except Exception:
            # Saving failed - print the kubeconfig so the admin can save it manually.
            handle_error(logger, Texts.CONFIG_SAVE_FAIL_MSG, Texts.CONFIG_SAVE_FAIL_MSG,
                         add_verbosity_msg=state.verbosity == 0)
            click.echo(Texts.CONFIG_SAVE_FAIL_INSTRUCTIONS_MSG)
            click.echo(kubeconfig)
            sys.exit(1)
def config(state: State, cpu: str, memory: str):
    """
    Update the node CPU number and memory amount in the node config file and
    propagate the new values to all packs.

    :param cpu: new CPU value (k8s cpu resource format)
    :param memory: new memory amount (k8s memory resource format)
    """
    # Both values are mandatory and must be well-formed k8s resource strings.
    if not cpu or not memory:
        handle_error(logger, Texts.MISSING_ARGUMENTS, Texts.MISSING_ARGUMENTS)
        sys.exit(1)
    if not validate_cpu_settings(cpu):
        handle_error(logger, Texts.CPU_WRONG_FORMAT, Texts.CPU_WRONG_FORMAT)
        sys.exit(1)
    if not validate_memory_settings(memory):
        handle_error(logger, Texts.MEMORY_WRONG_FORMAT, Texts.MEMORY_WRONG_FORMAT)
        sys.exit(1)
    configuration = NAUTAConfigMap()
    # Reject values below the platform-defined minimums, when such minimums are configured.
    if configuration.minimal_node_memory_amount and \
            convert_k8s_memory_resource(configuration.minimal_node_memory_amount) > convert_k8s_memory_resource(memory):
        error_message = Texts.MEMORY_SETTINGS_TOO_LOW.format(
            memory_value=configuration.minimal_node_memory_amount)
        handle_error(logger, error_message, error_message)
        sys.exit(1)
    if configuration.minimal_node_cpu_number and \
            convert_k8s_cpu_resource(configuration.minimal_node_cpu_number) > convert_k8s_cpu_resource(cpu):
        error_message = Texts.CPU_SETTINGS_TOO_LOW.format(
            cpu_value=configuration.minimal_node_cpu_number)
        handle_error(logger, error_message, error_message)
        sys.exit(1)
    config_file_location = os.path.join(Config().config_path, NODE_CONFIG_FILENAME)
    if not os.path.isfile(config_file_location):
        handle_error(logger, Texts.MISSING_CONFIG_FILE, Texts.MISSING_CONFIG_FILE)
        sys.exit(1)
    # 'r+' so the file can be read first and rewritten in place afterwards.
    with open(config_file_location, 'r+', encoding='utf-8') as config_file, \
            spinner(text=Texts.CONFIG_UPDATE):
        config_file_content = yaml.safe_load(config_file)
        # Current values, stringified - a missing key becomes the string "None".
        cpu_number = str(config_file_content.get(CPU_NUMBER_FIELDNAME))
        memory_amount = str(config_file_content.get(MEMORY_AMOUNT_FIELDNAME))
        cpu_system_required_min = str(
            config_file_content.get(CPU_SYSTEM_REQUIRED_MIN_FIELDNAME))
        cpu_system_required_percent = str(
            config_file_content.get(CPU_SYSTEM_REQUIRED_PERCENT_FIELDNAME))
        memory_system_required_min = str(
            config_file_content.get(MEMORY_SYSTEM_REQUIRED_MIN_FIELDNAME))
        memory_system_required_percent = str(
            config_file_content.get(MEMORY_SYSTEM_REQUIRED_PERCENT_FIELDNAME))
        # The current cpu/memory values must be present to compute the override.
        if not cpu_number or cpu_number == "None" or not memory_amount or memory_amount == "None":
            handle_error(logger, Texts.CONFIG_FILE_INCORRECT, Texts.CONFIG_FILE_INCORRECT)
            sys.exit(1)
        try:
            # Rewrite resource values in all packs based on the old vs new settings.
            override_values_in_packs(
                new_cpu_number=cpu, new_memory_amount=memory,
                current_cpu_number=cpu_number,
                current_mem_amount=memory_amount,
                cpu_system_required_min=cpu_system_required_min,
                cpu_system_required_percent=cpu_system_required_percent,
                mem_system_required_min=memory_system_required_min,
                mem_system_required_percent=memory_system_required_percent)
        except Exception:
            logger.exception(Texts.ERROR_DURING_UPDATE)
            handle_error(logger, Texts.ERROR_DURING_UPDATE, Texts.ERROR_DURING_UPDATE)
            sys.exit(1)
        # Overwrite the node config file in place with the new values.
        config_file.seek(0)
        config_file.truncate()
        config_file_content[CPU_NUMBER_FIELDNAME] = cpu
        config_file_content[MEMORY_AMOUNT_FIELDNAME] = memory
        yaml.dump(config_file_content, config_file, default_flow_style=False, explicit_start=True)
    click.echo(Texts.SUCCESS_MESSAGE)