def __init__(self,
             build_job,
             repo_path,
             from_image,
             copy_code=True,
             build_steps=None,
             env_vars=None,
             dockerfile_name='Dockerfile'):
    self.build_job = build_job
    self.job_uuid = build_job.uuid.hex
    self.job_name = build_job.unique_name
    self.from_image = from_image
    self.image_name = get_image_name(self.build_job)
    self.image_tag = self.job_uuid
    self.folder_name = repo_path.split('/')[-1]
    self.repo_path = repo_path
    self.copy_code = copy_code
    self.build_path = '/'.join(self.repo_path.split('/')[:-1])
    self.build_steps = to_list(build_steps, check_none=True)
    self.env_vars = to_list(env_vars, check_none=True)
    self.dockerfile_path = os.path.join(self.build_path, dockerfile_name)
    self.polyaxon_requirements_path = self._get_requirements_path()
    self.polyaxon_setup_path = self._get_setup_path()
    self.docker = APIClient(version='auto')
    self.registry_host = None
    self.docker_url = None
    self.is_pushing = False
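# Every snippet in this collection funnels optional scalar-or-list arguments
# through `to_list`. Its implementation is not part of these excerpts; the
# following is a minimal sketch of the semantics implied by the call sites
# (with `check_none=True`, a None input maps to an empty list), not the
# library's actual code.
def to_list(value, check_none=False):
    if check_none and value is None:
        return []  # treat a missing value as an empty list
    if isinstance(value, (list, tuple, set)):
        return list(value)  # normalize container types to a plain list
    return [value]  # wrap scalars (including strings) in a single-item list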
def __init__(self,
             repo_path,
             from_image,
             image_name,
             image_tag,
             copy_code=True,
             in_tmp_repo=True,
             build_steps=None,
             env_vars=None,
             dockerfile_name='Dockerfile'):
    # This will help create a unique tmp folder for dockerizer in case of concurrent jobs
    self.uuid = uuid.uuid4().hex
    self.from_image = from_image
    self.image_name = image_name
    self.image_tag = image_tag
    self.repo_path = repo_path
    self.folder_name = repo_path.split('/')[-1]
    self.copy_code = copy_code
    self.in_tmp_repo = in_tmp_repo
    if in_tmp_repo and copy_code:
        self.build_repo_path = self.create_tmp_repo()
    else:
        self.build_repo_path = self.repo_path
    self.build_path = '/'.join(self.build_repo_path.split('/')[:-1])
    self.build_steps = to_list(build_steps, check_none=True)
    self.env_vars = to_list(env_vars, check_none=True)
    self.dockerfile_path = os.path.join(self.build_path, dockerfile_name)
    self.polyaxon_requirements_path = self._get_requirements_path()
    self.polyaxon_setup_path = self._get_setup_path()
    self.docker = APIClient(version='auto')
    self.registry_host = None
    self.docker_url = None
def get_init_container(self,
                       init_command,
                       init_args,
                       env_vars,
                       context_mounts,
                       persistence_outputs,
                       persistence_data):
    """Pod init container for setting outputs path."""
    env_vars = to_list(env_vars, check_none=True)
    outputs_path = stores.get_job_outputs_path(
        persistence=persistence_outputs,
        job_name=self.job_name)
    _, outputs_volume_mount = get_pod_outputs_volume(
        persistence_outputs=persistence_outputs)
    volume_mounts = outputs_volume_mount + to_list(context_mounts, check_none=True)
    init_command = init_command or ["/bin/sh", "-c"]
    init_args = init_args or to_list(
        get_output_args(command=InitCommands.CREATE, outputs_path=outputs_path))
    init_args += to_list(
        get_auth_context_args(entity='job', entity_name=self.job_name))
    return client.V1Container(
        name=self.init_container_name,
        image=self.init_docker_image,
        image_pull_policy=self.init_docker_image_pull_policy,
        command=init_command,
        args=[''.join(init_args)],
        env=env_vars,
        volume_mounts=volume_mounts)
def __init__(self,
             repo_path,
             from_image,
             copy_code=True,
             build_steps=None,
             env_vars=None,
             nvidia_bin=None,
             dockerfile_name=POLYAXON_DOCKERFILE_NAME,
             lang_env=None,
             uid=None,
             gid=None):
    self.from_image = from_image
    self.folder_name = repo_path.split('/')[-1]
    self.repo_path = repo_path
    self.copy_code = copy_code
    self.build_path = '/'.join(self.repo_path.split('/')[:-1])
    self.build_steps = to_list(build_steps, check_none=True)
    self.env_vars = to_list(env_vars, check_none=True)
    self.nvidia_bin = nvidia_bin
    self.dockerfile_path = os.path.join(self.build_path, dockerfile_name)
    self.polyaxon_requirements_path = self._get_requirements_path()
    self.polyaxon_conda_env_path = self._get_conda_env_path()
    self.polyaxon_setup_path = self._get_setup_path()
    self.lang_env = lang_env
    self.uid = uid
    self.gid = gid
    self.is_pushing = False
def get_task_pod_spec(self,
                      volume_mounts,
                      volumes,
                      resource_name,
                      persistence_outputs=None,
                      persistence_data=None,
                      outputs_refs_jobs=None,
                      outputs_refs_experiments=None,
                      env_vars=None,
                      command=None,
                      args=None,
                      resources=None,
                      ports=None,
                      secret_refs=None,
                      configmap_refs=None,
                      ephemeral_token=None,
                      node_selector=None,
                      affinity=None,
                      tolerations=None,
                      restart_policy='OnFailure'):
    """Pod spec to be used to create pods for tasks: master, worker, ps."""
    volume_mounts = to_list(volume_mounts, check_none=True)
    volumes = to_list(volumes, check_none=True)

    gpu_volume_mounts, gpu_volumes = get_gpu_volumes_def(resources)
    volume_mounts += gpu_volume_mounts
    volumes += gpu_volumes

    pod_container = self.get_pod_container(
        volume_mounts=volume_mounts,
        persistence_outputs=persistence_outputs,
        persistence_data=persistence_data,
        outputs_refs_jobs=outputs_refs_jobs,
        outputs_refs_experiments=outputs_refs_experiments,
        secret_refs=secret_refs,
        configmap_refs=configmap_refs,
        resources=resources,
        env_vars=env_vars,
        command=command,
        args=args,
        ports=ports,
        ephemeral_token=ephemeral_token)

    containers = [pod_container]
    if self.use_sidecar:
        sidecar_container = self.get_sidecar_container(resource_name=resource_name)
        containers.append(sidecar_container)

    node_selector = self._get_node_selector(node_selector=node_selector)
    affinity = self._get_affinity(affinity=affinity)
    tolerations = self._get_tolerations(tolerations=tolerations)
    service_account_name = self._get_service_account_name()
    return client.V1PodSpec(
        restart_policy=restart_policy,
        service_account_name=service_account_name,
        init_containers=to_list(self.get_init_container(persistence_outputs), check_none=True),
        containers=containers,
        volumes=volumes,
        node_selector=node_selector,
        tolerations=tolerations,
        affinity=affinity)
def get_init_container(self,
                       init_command,
                       init_args,
                       env_vars,
                       context_mounts,
                       persistence_outputs,
                       persistence_data):
    """Pod init container for setting outputs path."""
    env_vars = to_list(env_vars, check_none=True)
    _, outputs_volume_mount = get_pod_outputs_volume(
        persistence_outputs=persistence_outputs)
    volume_mounts = outputs_volume_mount + to_list(context_mounts, check_none=True)
    init_command = init_command or ["/bin/sh", "-c"]
    init_args = init_args or []
    init_args += to_list(
        self.get_init_path_args(persistence_outputs=persistence_outputs),
        check_none=True)
    init_args += to_list(
        get_auth_context_args(entity='experiment', entity_name=self.experiment_name))
    return [
        client.V1Container(
            name=self.init_container_name,
            image=self.init_docker_image,
            image_pull_policy=self.init_docker_image_pull_policy,
            command=init_command,
            args=[''.join(init_args)],
            env=env_vars,
            resources=get_init_resources(),
            volume_mounts=volume_mounts)
    ]
def get_is_default(self, obj):
    # Check the raw context value first: `to_list(..., check_none=True)` never
    # returns None, so normalizing before this check would make the fallback
    # below unreachable.
    defaults = self.context.get('defaults', None)
    if defaults is not None:
        return obj.id in to_list(defaults, check_none=True)

    # Fall back to the requesting user's configured defaults if set in the context
    request = self.context.get('request', None)
    if request and is_user(request.user):
        defaults = to_list(conf.get(self.default_option))
        return obj.id in defaults
    return False
def get_project_pod_spec(volume_mounts,
                         volumes,
                         image,
                         command,
                         args,
                         ports,
                         env_vars=None,
                         env_from=None,
                         container_name=None,
                         resources=None,
                         node_selector=None,
                         affinity=None,
                         tolerations=None,
                         image_pull_policy=None,
                         restart_policy=None,
                         service_account_name=None):
    """Pod spec to be used to create pods for project: tensorboard, notebooks."""
    env_vars = to_list(env_vars, check_none=True)
    volume_mounts = to_list(volume_mounts, check_none=True)
    volumes = to_list(volumes, check_none=True)

    gpu_volume_mounts, gpu_volumes = get_gpu_volumes_def(resources)
    volume_mounts += gpu_volume_mounts
    volumes += gpu_volumes

    ports = [client.V1ContainerPort(container_port=port) for port in ports]
    env_vars += get_resources_env_vars(resources=resources)

    containers = [
        client.V1Container(
            name=container_name,
            image=image,
            image_pull_policy=image_pull_policy,
            command=command,
            args=args,
            ports=ports,
            env=env_vars,
            env_from=env_from,
            resources=get_resources(resources),
            volume_mounts=volume_mounts)
    ]

    if service_account_name and not conf.get('K8S_RBAC_ENABLED'):
        service_account_name = None

    return client.V1PodSpec(
        restart_policy=restart_policy,
        service_account_name=service_account_name,
        containers=containers,
        volumes=volumes,
        node_selector=node_selector,
        affinity=affinity,
        tolerations=tolerations)
def render(self):
    docker_template = jinja2.Template(POLYAXON_DOCKER_TEMPLATE)
    return docker_template.render(
        image=self.build_context.image,
        copy=to_list(self.build_context.copy, check_none=True),
        run=to_list(self.build_context.run, check_none=True),
        env=to_list(self.build_context.env, check_none=True),
        workdir=self.build_context.workdir,
        path=to_list(self.build_context.path, check_none=True),
        workdir_path=self.build_context.workdir_path,
        lang_env=self.build_context.lang_env,
        uid=self.build_context.uid,
        gid=self.build_context.gid,
        shell=self.build_context.shell,
    )
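# A self-contained sketch of the Jinja2 rendering pattern used by `render`
# above. The template below is a simplified stand-in for
# POLYAXON_DOCKER_TEMPLATE (whose real contents are not shown in these
# excerpts), chosen only to illustrate how list variables expand in the loop.
import jinja2

template = jinja2.Template(
    "FROM {{ image }}\n"
    "{% for step in run %}RUN {{ step }}\n{% endfor %}"
    "WORKDIR {{ workdir }}\n"
)
print(template.render(image="python:3.6",
                      run=["pip install polyaxon-client"],
                      workdir="/code"))
# FROM python:3.6
# RUN pip install polyaxon-client
# WORKDIR /code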
def __init__(self, filepaths, params=None, debug_ttl=False):
    filepaths = to_list(filepaths)
    for filepath in filepaths:
        if not os.path.isfile(filepath):
            raise PolyaxonfileError("`{}` must be a valid file".format(filepath))
    self._filenames = [os.path.basename(filepath) for filepath in filepaths]

    if params:
        if not isinstance(params, Mapping):
            raise PolyaxonfileError("Params: `{}` must be a valid mapping".format(params))
        filepaths.append({'params': params})

    if debug_ttl:
        if not isinstance(debug_ttl, int):
            raise PolyaxonfileError("Debug TTL `{}` must be a valid integer".format(debug_ttl))
        filepaths.append({'run': {'cmd': 'sleep {}'.format(debug_ttl)}})

    data = rhea.read(filepaths)
    kind = BaseSpecification.get_kind(data=data)
    debug_cond = (debug_ttl and
                  not (BaseSpecification.check_kind_experiment(kind) or
                       BaseSpecification.check_kind_job(kind)))
    if debug_cond:
        raise PolyaxonfileError(
            'You can only trigger debug mode on a job or an experiment specification, '
            'received instead a `{}` specification'.format(kind))
    try:
        self.specification = SPECIFICATION_BY_KIND[kind](data)
    except PolyaxonConfigurationError as e:
        raise PolyaxonfileError(e)
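# A hypothetical call illustrating the debug override above: passing an
# integer debug_ttl appends a `run` override, so the resulting job sleeps for
# the given TTL instead of executing its command. The file name and params
# here are made up for illustration.
plx_file = PolyaxonFile('polyaxonfile.yml', params={'lr': 0.01}, debug_ttl=3600)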
def handle_logs(message):
    log_lines = to_list(message['log_lines'])
    status = message.get('status')
    if not status and log_lines:
        handle_log_lines(log_lines)
    else:
        handle_status(status, log_lines)
def check_polyaxonfile(file,  # pylint:disable=redefined-builtin
                       params=None,
                       debug_ttl=None,
                       log=True):
    file = to_list(file)
    exists = [os.path.isfile(f) for f in file]

    parsed_params = None
    if params:
        parsed_params = parse_params(params)

    if not any(exists):
        Printer.print_error('Polyaxonfile is not present, '
                            'please run {}'.format(constants.INIT_COMMAND))
        sys.exit(1)

    try:
        plx_file = PolyaxonFile(file, params=parsed_params, debug_ttl=debug_ttl)
        if log:
            Printer.print_success("Polyaxonfile valid")
        return plx_file
    except Exception as e:
        Printer.print_error("Polyaxonfile is not valid.")
        Printer.print_error('Error message `{}`.'.format(e))
        sys.exit(1)
def get_init_container(self, persistence_outputs):
    """Pod init container for setting outputs path."""
    if self.original_name is not None and self.cloning_strategy == CloningStrategy.RESUME:
        return []
    if self.original_name is not None and self.cloning_strategy == CloningStrategy.COPY:
        command = InitCommands.COPY
        original_outputs_path = stores.get_experiment_outputs_path(
            persistence=persistence_outputs,
            experiment_name=self.original_name)
    else:
        command = InitCommands.CREATE
        original_outputs_path = None

    outputs_path = stores.get_experiment_outputs_path(
        persistence=persistence_outputs,
        experiment_name=self.experiment_name)
    _, outputs_volume_mount = get_pod_outputs_volume(persistence_outputs=persistence_outputs)
    return [
        client.V1Container(
            name=self.init_container_name,
            image=self.init_docker_image,
            command=["/bin/sh", "-c"],
            args=to_list(
                get_output_args(command=command,
                                outputs_path=outputs_path,
                                original_outputs_path=original_outputs_path)),
            volume_mounts=outputs_volume_mount)
    ]
def handle_logs(message): log_lines = to_list(message["log_lines"]) status = message.get("status") if not status and log_lines: handle_log_lines(log_lines) else: handle_status(status, log_lines)
def get_pod_container(self,
                      volume_mounts,
                      env_vars=None,
                      command=None,
                      args=None,
                      persistence_outputs=None,
                      persistence_data=None,
                      outputs_refs_jobs=None,
                      outputs_refs_experiments=None,
                      secret_refs=None,
                      configmap_refs=None,
                      resources=None,
                      ephemeral_token=None):
    """Pod job container for task."""
    assert self.cluster_def is not None

    # Env vars preparations
    env_vars = to_list(env_vars, check_none=True)
    outputs_path = get_experiment_outputs_path(
        persistence_outputs=persistence_outputs,
        experiment_name=self.experiment_name,
        original_name=self.original_name,
        cloning_strategy=self.cloning_strategy)
    env_vars += get_job_env_vars(
        persistence_outputs=persistence_outputs,
        outputs_path=outputs_path,
        persistence_data=persistence_data,
        log_level=self.log_level,
        logs_path=get_experiment_logs_path(self.experiment_name, temp=False),
        outputs_refs_jobs=outputs_refs_jobs,
        outputs_refs_experiments=outputs_refs_experiments,
        ephemeral_token=ephemeral_token,
    )
    env_vars += [
        get_env_var(name=constants.CONFIG_MAP_CLUSTER_KEY_NAME,
                    value=json.dumps(self.cluster_def)),
        get_env_var(name=constants.CONFIG_MAP_DECLARATIONS_KEY_NAME,
                    value=self.declarations),
        get_env_var(name=constants.CONFIG_MAP_EXPERIMENT_INFO_KEY_NAME,
                    value=json.dumps(self.experiment_labels)),
    ]
    env_vars += get_resources_env_vars(resources=resources)

    # Env from configmap and secret refs
    env_from = get_pod_env_from(secret_refs=secret_refs, configmap_refs=configmap_refs)

    ports = [client.V1ContainerPort(container_port=port) for port in self.ports]
    return client.V1Container(
        name=self.job_container_name,
        image=self.job_docker_image,
        command=command,
        args=args,
        ports=ports,
        env=env_vars,
        env_from=env_from,
        resources=get_resources(resources),
        volume_mounts=volume_mounts)
def check_polyaxonfile(polyaxonfile,
                       params=None,
                       profile=None,
                       queue=None,
                       nocache=None,
                       log=True):
    if not polyaxonfile:
        polyaxonfile = PolyaxonFile.check_default_path(path=".")
    if not polyaxonfile:
        polyaxonfile = ""
    polyaxonfile = to_list(polyaxonfile)
    exists = [os.path.isfile(f) for f in polyaxonfile]

    parsed_params = None
    if params:
        parsed_params = parse_params(params)

    if not any(exists):
        Printer.print_error("Polyaxonfile is not present, "
                            "please run {}".format(constants.INIT_COMMAND))
        sys.exit(1)

    try:
        plx_file = PolyaxonFile(polyaxonfile)
        plx_file = plx_file.get_op_specification(params=parsed_params,
                                                 profile=profile,
                                                 queue=queue,
                                                 nocache=nocache)
        if log:
            Printer.print_success("Polyaxonfile valid")
        return plx_file
    except Exception as e:
        handle_cli_error(e, message="Polyaxonfile is not valid.")
        sys.exit(1)
def _set_persistence(instance, default_persistence_data=None, default_persistence_outputs=None):
    if instance.persistence:
        return

    data_refs = None
    artifact_refs = None
    cond = (instance.specification and
            instance.specification.environment and
            instance.specification.environment.data_refs)
    if cond:
        data_refs = instance.specification.environment.data_refs

    cond = (instance.specification and
            instance.specification.environment and
            instance.specification.environment.artifact_refs)
    if cond:
        # TODO: this is a temp workaround until the finalized Polyflow version
        artifact_refs = to_list(instance.specification.environment.artifact_refs)[0]

    if not data_refs and default_persistence_data:
        data_refs = default_persistence_data
    if not artifact_refs and default_persistence_outputs:
        artifact_refs = default_persistence_outputs

    persistence_data = validate_persistence_data(persistence_data=data_refs)
    persistence_outputs = validate_persistence_outputs(persistence_outputs=artifact_refs)
    persistence_config = PersistenceConfig(data=persistence_data, outputs=persistence_outputs)
    instance.persistence = persistence_config.to_dict()
def get_sidecar_container(job_container_name,
                          sidecar_container_name,
                          sidecar_docker_image,
                          sidecar_docker_image_pull_policy,
                          namespace,
                          sidecar_config,
                          sidecar_args,
                          internal_health_check_url,
                          internal_reconcile_url,
                          volume_mounts,
                          env_vars=None):
    """Return a pod sidecar container."""
    env_vars = to_list(env_vars) if env_vars else []
    env_vars += get_sidecar_env_vars(
        namespace=namespace,
        job_container_name=job_container_name,
        internal_health_check_url=internal_health_check_url,
        internal_reconcile_url=internal_reconcile_url)
    for k, v in sidecar_config.items():
        env_vars.append(get_env_var(name=k, value=v))
    return client.V1Container(
        name=sidecar_container_name,
        image=sidecar_docker_image,
        image_pull_policy=sidecar_docker_image_pull_policy,
        command=get_sidecar_command(),
        env=env_vars,
        volume_mounts=volume_mounts,
        resources=get_sidecar_resources(),
        args=sidecar_args)
def gpu_resources(cls, jobs_resources):
    jobs_resources = to_list(jobs_resources)
    click.clear()
    data = [['job_name',
             'name',
             'GPU Usage',
             'GPU Mem Usage / Total',
             'GPU Temperature',
             'Power Draw / Limit']]
    non_gpu_jobs = 0
    for job_resources in jobs_resources:
        job_resources = ContainerResourcesConfig.from_dict(job_resources)
        line = []
        if not job_resources.gpu_resources:
            non_gpu_jobs += 1
            continue
        for gpu_resources in job_resources.gpu_resources:
            line += [
                job_resources.job_name,
                gpu_resources.name,
                to_percentage(gpu_resources.utilization_gpu / 100),
                '{} / {}'.format(to_unit_memory(gpu_resources.memory_used),
                                 to_unit_memory(gpu_resources.memory_total)),
                gpu_resources.temperature_gpu,
                '{} / {}'.format(gpu_resources.power_draw, gpu_resources.power_limit),
            ]
        data.append(line)
    if non_gpu_jobs == len(jobs_resources):
        Printer.print_error(
            'No GPU job was found, please run the `resources` command '
            'without the `-g | --gpu` option.')
        sys.exit(1)
    click.echo(tabulate(data, headers="firstrow"))
    sys.stdout.flush()
def _get_valid_config(cls, config, *fields) -> ConfigType:
    config = to_list(config)
    web_hooks = []
    for web_hook in config:
        if not web_hook.get('url'):
            logger.warning("Settings contain an incompatible web hook: `%s`", web_hook)
            continue

        url = web_hook['url']
        if not validate_url(url):
            raise PolyaxonActionException('{} received invalid URL `{}`.'.format(cls.name, url))

        method = web_hook.get('method', 'POST')
        if not isinstance(method, str):
            raise PolyaxonActionException(
                '{} received invalid method `{}`.'.format(cls.name, method))

        _method = method.upper()
        if _method not in ['GET', 'POST']:
            raise PolyaxonActionException(
                '{} received incompatible method `{}`.'.format(cls.name, method))

        result_web_hook = {'url': url, 'method': _method}
        for field in fields:
            if field in web_hook:
                result_web_hook[field] = web_hook[field]
        web_hooks.append(result_web_hook)

    return web_hooks
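# A hypothetical settings value that would pass the validation above: the
# second entry is skipped with a warning (no url), methods are upper-cased,
# and any extra fields named in *fields (e.g. 'channel') are copied through.
web_hooks_config = [
    {'url': 'https://hooks.example.com/polyaxon', 'method': 'post', 'channel': '#ops'},
    {'method': 'GET'},  # dropped: missing url
]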
def __init__(self, filepaths):
    filepaths = to_list(filepaths)
    for filepath in filepaths:
        if not os.path.isfile(filepath):
            raise PolyaxonfileError("`{}` must be a valid file".format(filepath))
    self._filenames = [os.path.basename(filepath) for filepath in filepaths]
    self.specification = get_specification(data=reader.read(filepaths))
def get_task_pod(self,
                 task_type,
                 task_idx,
                 volume_mounts,
                 volumes,
                 labels,
                 env_vars=None,
                 init_env_vars=None,
                 command=None,
                 args=None,
                 ports=None,
                 persistence_outputs=None,
                 persistence_data=None,
                 outputs_refs_jobs=None,
                 outputs_refs_experiments=None,
                 secret_refs=None,
                 configmap_refs=None,
                 resources=None,
                 ephemeral_token=None,
                 node_selector=None,
                 affinity=None,
                 tolerations=None,
                 sidecar_context_mounts=None,
                 init_context_mounts=None,
                 restart_policy=None):
    resource_name = self.get_resource_name(task_type=task_type, task_idx=task_idx)
    env_vars = to_list(env_vars, check_none=True)
    env_vars.append(
        client.V1EnvVar(name=constants.CONFIG_MAP_TASK_INFO_KEY_NAME,
                        value=json.dumps({'type': task_type, 'index': task_idx})))
    return self.get_pod(resource_name=resource_name,
                        volume_mounts=volume_mounts,
                        volumes=volumes,
                        labels=labels,
                        env_vars=env_vars,
                        command=command,
                        args=args,
                        init_env_vars=init_env_vars,
                        ports=ports,
                        persistence_outputs=persistence_outputs,
                        persistence_data=persistence_data,
                        outputs_refs_jobs=outputs_refs_jobs,
                        outputs_refs_experiments=outputs_refs_experiments,
                        secret_refs=secret_refs,
                        configmap_refs=configmap_refs,
                        resources=resources,
                        ephemeral_token=ephemeral_token,
                        node_selector=node_selector,
                        affinity=affinity,
                        tolerations=tolerations,
                        sidecar_context_mounts=sidecar_context_mounts,
                        init_context_mounts=init_context_mounts,
                        restart_policy=restart_policy)
def validate_tags(tags):
    if not tags:
        return None
    if isinstance(tags, six.string_types):
        tags = [tag.strip() for tag in tags.split(",")]
    tags = to_list(tags)
    tags = [tag for tag in tags if (tag and isinstance(tag, six.string_types))]
    return tags
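# Doctest-style examples of the normalization above, assuming `validate_tags`
# is in scope; the inputs are hypothetical.
assert validate_tags("tf, gpu , ,") == ["tf", "gpu"]  # split on commas, strip, drop empties
assert validate_tags(["tf", "", 3]) == ["tf"]         # keep only non-empty strings
assert validate_tags(None) is None                    # falsy input passes through as None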
def upload(self,
           url,
           files,
           files_size,
           params=None,
           json_data=None,
           timeout=None,
           headers=None,
           session=None):
    if files_size > settings.WARN_UPLOAD_SIZE:
        logger.warning(
            "You are uploading %s, there's a hard limit of %s.\n"
            "If you have data files in the current directory, "
            "please make sure to add them to .polyaxonignore or "
            "add them directly to your data volume, or upload them "
            "separately using the `polyaxon data` command and remove them from here.\n",
            self.format_sizeof(files_size),
            self.format_sizeof(settings.MAX_UPLOAD_SIZE))
    if files_size > settings.MAX_UPLOAD_SIZE:
        raise PolyaxonShouldExitError(
            "Files too large to sync, please keep them under {}.\n"
            "If you have data files in the current directory, "
            "please add them directly to your data volume, or upload them "
            "separately using the `polyaxon data` command and remove them from here.\n".format(
                self.format_sizeof(settings.MAX_UPLOAD_SIZE)))

    files = to_list(files)
    if json_data:
        files.append(('json', json.dumps(json_data)))

    multipart_encoder = MultipartEncoder(fields=files)
    request_headers = headers or {}
    request_headers.update({"Content-Type": multipart_encoder.content_type})

    # Attach progress bar
    progress_callback, callback_bar = self.create_progress_callback(multipart_encoder)
    multipart_encoder_monitor = MultipartEncoderMonitor(multipart_encoder, progress_callback)

    timeout = timeout if timeout is not None else settings.LONG_REQUEST_TIMEOUT
    try:
        response = self.put(url=url,
                            params=params,
                            data=multipart_encoder_monitor,
                            headers=request_headers,
                            timeout=timeout,
                            session=session)
    finally:
        # Always make sure we clear the console progress bar
        callback_bar.done()
    return response
def get_init_container(self,
                       init_command,
                       init_args,
                       env_vars,
                       context_mounts,
                       persistence_outputs,
                       persistence_data):
    """Pod init container for setting outputs path."""
    env_vars = to_list(env_vars, check_none=True)
    volume_mounts = to_list(context_mounts, check_none=True)
    init_command = init_command or ["/bin/sh", "-c"]
    init_args = to_list(
        get_auth_context_args(entity='notebook', entity_name=self.job_name))
    return client.V1Container(
        name=self.init_container_name,
        image=self.init_docker_image,
        image_pull_policy=self.init_docker_image_pull_policy,
        command=init_command,
        args=init_args,
        env=env_vars,
        volume_mounts=volume_mounts)
def __init__(self,
             namespace,
             name,
             project_name,
             project_uuid,
             job_name,
             job_uuid,
             job_docker_image,
             job_container_name=None,
             sidecar_container_name=None,
             sidecar_docker_image=None,
             sidecar_docker_image_pull_policy=None,
             init_container_name=None,
             init_docker_image=None,
             role_label=None,
             type_label=None,
             ports=None,
             use_sidecar=False,
             sidecar_config=None,
             health_check_url=None,
             log_level=None):
    self.namespace = namespace
    self.name = name
    self.project_name = project_name
    self.project_uuid = project_uuid
    self.job_name = job_name
    self.job_uuid = job_uuid
    self.job_container_name = job_container_name or conf.get('CONTAINER_NAME_JOB')
    self.job_docker_image = job_docker_image
    self.sidecar_container_name = sidecar_container_name or conf.get('CONTAINER_NAME_SIDECAR')
    self.sidecar_docker_image = sidecar_docker_image or conf.get('JOB_SIDECAR_DOCKER_IMAGE')
    self.sidecar_docker_image_pull_policy = (
        sidecar_docker_image_pull_policy or conf.get('JOB_SIDECAR_DOCKER_IMAGE_PULL_POLICY'))
    self.init_container_name = init_container_name or conf.get('CONTAINER_NAME_INIT')
    self.init_docker_image = init_docker_image or conf.get('JOB_INIT_DOCKER_IMAGE')
    self.role_label = role_label or conf.get('ROLE_LABELS_WORKER')
    self.type_label = type_label or conf.get('TYPE_LABELS_RUNNER')
    self.app_label = conf.get('APP_LABELS_JOB')
    self.labels = self.get_labels()
    self.k8s_job_name = self.get_k8s_job_name()
    self.ports = to_list(ports) if ports else []
    self.use_sidecar = use_sidecar
    if use_sidecar and not sidecar_config:
        raise PolyaxonConfigurationError(
            'In order to use a sidecar, a `sidecar_config` is required. '
            'The `sidecar_config` must correspond to the sidecar docker image used.')
    self.sidecar_config = sidecar_config
    self.health_check_url = health_check_url
    self.log_level = log_level
def get_init_container(self,
                       init_command,
                       init_args,
                       env_vars,
                       context_mounts,
                       persistence_outputs,
                       persistence_data):
    """Pod init container for setting outputs path."""
    env_vars = to_list(env_vars, check_none=True)
    if self.original_name is not None and self.cloning_strategy == CloningStrategy.RESUME:
        return []
    if self.original_name is not None and self.cloning_strategy == CloningStrategy.COPY:
        command = InitCommands.COPY
        original_outputs_path = stores.get_experiment_outputs_path(
            persistence=persistence_outputs,
            experiment_name=self.original_name)
    else:
        command = InitCommands.CREATE
        original_outputs_path = None

    outputs_path = stores.get_experiment_outputs_path(
        persistence=persistence_outputs,
        experiment_name=self.experiment_name)
    _, outputs_volume_mount = get_pod_outputs_volume(persistence_outputs=persistence_outputs)
    volume_mounts = outputs_volume_mount + to_list(context_mounts, check_none=True)
    init_command = init_command or ["/bin/sh", "-c"]
    init_args = init_args or to_list(
        get_output_args(command=command,
                        outputs_path=outputs_path,
                        original_outputs_path=original_outputs_path))
    init_args += to_list(
        get_auth_context_args(entity='experiment', entity_name=self.experiment_name))
    return [
        client.V1Container(
            name=self.init_container_name,
            image=self.init_docker_image,
            image_pull_policy=self.init_docker_image_pull_policy,
            command=init_command,
            args=[''.join(init_args)],
            env=env_vars,
            resources=get_init_resources(),
            volume_mounts=volume_mounts)
    ]
def run(cls, task_bind, *args, **kwargs):
    retry_for = cls.retry_for or []
    retry_for = to_list(retry_for)
    if SoftTimeLimitExceeded not in retry_for:
        retry_for.append(SoftTimeLimitExceeded)
    try:
        return cls._run(task_bind, *args, **kwargs)
    except tuple(retry_for) as exc:  # pylint:disable=catching-non-exception
        if task_bind.request.retries < task_bind.max_retries:
            raise task_bind.retry(countdown=task_bind.countdown)
        else:
            raise exc  # pylint:disable=raising-non-exception
def __init__(self,
             repo_path: str,
             from_image: str,
             copy_code: bool = True,
             build_steps: Optional[List[str]] = None,
             env_vars: Optional[List[Tuple[str, str]]] = None,
             nvidia_bin: Optional[str] = None,
             dockerfile_name: str = 'Dockerfile') -> None:
    self.from_image = from_image
    self.folder_name = repo_path.split('/')[-1]
    self.repo_path = repo_path
    self.copy_code = copy_code
    self.build_path = '/'.join(self.repo_path.split('/')[:-1])
    self.build_steps = to_list(build_steps, check_none=True)
    self.env_vars = to_list(env_vars, check_none=True)
    self.nvidia_bin = nvidia_bin
    self.dockerfile_path = os.path.join(self.build_path, dockerfile_name)
    self.polyaxon_requirements_path = self._get_requirements_path()
    self.polyaxon_setup_path = self._get_setup_path()
    self.is_pushing = False
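# Worked example of the path derivations used by the builder constructors
# above, for a hypothetical POSIX repo_path.
import os

repo_path = '/tmp/repos/user/project'
assert repo_path.split('/')[-1] == 'project'                     # folder_name
assert '/'.join(repo_path.split('/')[:-1]) == '/tmp/repos/user'  # build_path
assert os.path.join('/tmp/repos/user', 'Dockerfile') == '/tmp/repos/user/Dockerfile'  # dockerfile_path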
def publish_job_log(self, log_lines, job_uuid, job_name, send_task=True):
    log_lines = to_list(log_lines)
    self._logger.info("Publishing log event for task: %s", job_uuid)
    if send_task:
        celery_app.send_task(
            LogsCeleryTasks.LOGS_HANDLE_JOB,
            kwargs={'job_uuid': job_uuid,
                    'job_name': job_name,
                    'log_lines': log_lines})
    self._stream_job_log(job_uuid=job_uuid,
                         log_lines=log_lines,
                         routing_key=RoutingKeys.STREAM_LOGS_SIDECARS_JOBS)