def get_init_container(self, persistence_outputs):
    """Build the pod init container that prepares the outputs path.

    Returns an empty list for resumed clones, since a resumed experiment
    keeps writing into the original experiment's outputs.
    """
    is_clone = self.original_name is not None
    if is_clone and self.cloning_strategy == CloningStrategy.RESUME:
        # Nothing to prepare: the original outputs path is reused as-is.
        return []

    if is_clone and self.cloning_strategy == CloningStrategy.COPY:
        command = InitCommands.COPY
        original_outputs_path = stores.get_experiment_outputs_path(
            persistence=persistence_outputs,
            experiment_name=self.original_name)
    else:
        command = InitCommands.CREATE
        original_outputs_path = None

    outputs_path = stores.get_experiment_outputs_path(
        persistence=persistence_outputs,
        experiment_name=self.experiment_name)
    _, outputs_volume_mount = get_pod_outputs_volume(
        persistence_outputs=persistence_outputs)
    init_args = to_list(get_output_args(
        command=command,
        outputs_path=outputs_path,
        original_outputs_path=original_outputs_path))
    container = client.V1Container(
        name=self.init_container_name,
        image=self.init_docker_image,
        command=["/bin/sh", "-c"],
        args=init_args,
        volume_mounts=outputs_volume_mount)
    return [container]
def get_init_path_args(self, persistence_outputs):
    """Compute init arguments (command + outputs paths) for this experiment.

    A resumed clone recreates the original experiment's outputs path, a
    copied clone copies from the original into its own fresh path, and a
    plain run simply creates its own outputs path.
    """
    def _outputs_path(name):
        # Resolve the outputs path for a given experiment name.
        return stores.get_experiment_outputs_path(
            persistence=persistence_outputs,
            experiment_name=name)

    is_clone = self.original_name is not None
    original_outputs_path = None
    if is_clone and self.cloning_strategy == CloningStrategy.RESUME:
        command = InitCommands.CREATE
        outputs_path = _outputs_path(self.original_name)
    elif is_clone and self.cloning_strategy == CloningStrategy.COPY:
        command = InitCommands.COPY
        outputs_path = _outputs_path(self.experiment_name)
        original_outputs_path = _outputs_path(self.original_name)
    else:
        command = InitCommands.CREATE
        outputs_path = _outputs_path(self.experiment_name)
    return get_output_args(command=command,
                           outputs_path=outputs_path,
                           original_outputs_path=original_outputs_path)
def get(self, request, *args, **kwargs):
    """Stream a single outputs file back to the client as an attachment.

    Expects a `path` query parameter; returns 404 when the archived
    file cannot be found on disk.
    """
    filepath = request.query_params.get('path')
    if not filepath:
        raise ValidationError('Files view expect a path to the file.')

    experiment_outputs_path = stores.get_experiment_outputs_path(
        persistence=self.experiment.persistence_outputs,
        experiment_name=self.experiment.unique_name,
        original_name=self.experiment.original_unique_name,
        cloning_strategy=self.experiment.cloning_strategy)
    download_filepath = archive_outputs_file(
        persistence_outputs=self.experiment.persistence_outputs,
        outputs_path=experiment_outputs_path,
        namepath=self.experiment.unique_name,
        filepath=filepath)

    chunk_size = 8192
    try:
        handle = open(download_filepath, 'rb')
        response = StreamingHttpResponse(
            FileWrapper(handle, chunk_size),
            content_type=mimetypes.guess_type(download_filepath)[0])
        response['Content-Length'] = os.path.getsize(download_filepath)
        response['Content-Disposition'] = "attachment; filename={}".format(
            os.path.basename(download_filepath))
        return response
    except FileNotFoundError:
        _logger.warning('Outputs file not found: log_path=%s', download_filepath)
        return Response(status=status.HTTP_404_NOT_FOUND,
                        data='Outputs file not found: log_path={}'.format(download_filepath))
def test_copying_an_experiment(self):
    """Copying a cloned experiment materializes the original's outputs files."""
    with patch('scheduler.tasks.experiments.experiments_build.apply_async') as _:  # noqa
        experiment1 = ExperimentFactory()

    # We create some outputs files for the experiment.
    path = stores.create_experiment_outputs_path(
        persistence=experiment1.persistence_outputs,
        experiment_name=experiment1.unique_name)
    # Fix: use a context manager so the handle from open(..., 'w+') is
    # closed instead of being leaked until garbage collection.
    with open(os.path.join(path, 'file'), 'w+'):
        pass

    # Create a new experiment that is a clone of the previous.
    with patch('scheduler.tasks.experiments.experiments_build.apply_async') as _:  # noqa
        experiment2 = ExperimentFactory(original_experiment=experiment1)

    # Check that outputs path for experiment2 does not exist yet.
    experiment2_outputs_path = stores.get_experiment_outputs_path(
        persistence=experiment2.persistence_outputs,
        experiment_name=experiment2.unique_name)
    assert os.path.exists(experiment2_outputs_path) is False

    # Handle restart should create the outputs and copy the content of experiment 1.
    copy_experiment(experiment2)
    assert os.path.exists(experiment2_outputs_path) is True
    assert os.path.exists(os.path.join(experiment2_outputs_path, 'file')) is True
def get_experiments_outputs_spec(self):
    """Build a mapping of experiment id -> OutputsRefsSpec for this object's experiments.

    Returns None when there are no experiments. The unique experiment name
    is reconstructed from project/group/experiment ids rather than loaded,
    presumably to avoid instantiating full model objects — TODO confirm.
    """
    import stores

    if not self.experiments.count():
        return None
    # Expose 'persistence.outputs' from the JSON field so it can be selected.
    annotation = {
        'persistence_outputs': KeyTransform('outputs', 'persistence')
    }
    query = self.experiments.annotate(**annotation)
    # NOTE: positional coupling below — 0=id, 1=experiment_group id,
    # 2=username, 3=project name, 4=persistence outputs.
    experiment_data = query.values_list('id',
                                        'experiment_group__id',
                                        'project__user__username',
                                        'project__name',
                                        'persistence_outputs')
    outputs_spec_data = {}
    for data in experiment_data:
        project_name = PROJECT_UNIQUE_NAME_FORMAT.format(user=data[2], project=data[3])
        if data[1]:
            # Grouped experiment: the unique name nests under the group name.
            group_name = GROUP_UNIQUE_NAME_FORMAT.format(
                project_name=project_name,
                id=data[1])
            experiment_name = EXPERIMENT_UNIQUE_NAME_FORMAT.format(
                parent_name=group_name,
                id=data[0])
        else:
            # Independent experiment: parent is the project itself.
            experiment_name = EXPERIMENT_UNIQUE_NAME_FORMAT.format(
                parent_name=project_name,
                id=data[0])
        outputs_path = stores.get_experiment_outputs_path(
            persistence=data[4],
            experiment_name=experiment_name)
        outputs_spec_data[data[0]] = OutputsRefsSpec(path=outputs_path, persistence=data[4])
    return outputs_spec_data
def get_config_map(namespace,
                   project_name,
                   experiment_group_name,
                   experiment_name,
                   project_uuid,
                   experiment_group_uuid,
                   experiment_uuid,
                   original_name,
                   cloning_strategy,
                   cluster_def,
                   persistence_outputs,
                   persistence_data,
                   params,
                   log_level):
    """Build the V1ConfigMap carrying runtime configuration for an experiment pod.

    The map name derives from the experiment uuid; its data holds the
    cluster definition, params, experiment info labels, log level,
    outputs/logs paths, data paths and API endpoints.
    """
    name = constants.CONFIG_MAP_NAME.format(uuid=experiment_uuid)
    labels = get_map_labels(project_name,
                            experiment_group_name,
                            experiment_name,
                            project_uuid,
                            experiment_group_uuid,
                            experiment_uuid)
    metadata = client.V1ObjectMeta(name=name, labels=labels, namespace=namespace)
    # Outputs path accounts for cloning: the original name and strategy are
    # forwarded so resumed/copied experiments resolve the right location.
    experiment_outputs_path = stores.get_experiment_outputs_path(
        persistence=persistence_outputs,
        experiment_name=experiment_name,
        original_name=original_name,
        cloning_strategy=cloning_strategy)
    experiment_logs_path = stores.get_experiment_logs_path(
        experiment_name=experiment_name,
        temp=False)
    data = {
        constants.CONFIG_MAP_CLUSTER_KEY_NAME: json.dumps(cluster_def),
        # NOTE(review): json.dumps(None) == 'null' (truthy), so the '{}'
        # fallback only fires for a falsy dumps result — confirm intent.
        constants.CONFIG_MAP_PARAMS_KEY_NAME: json.dumps(params) or '{}',
        constants.CONFIG_MAP_EXPERIMENT_INFO_KEY_NAME: json.dumps(labels),
        constants.CONFIG_MAP_LOG_LEVEL_KEY_NAME: log_level,
        constants.CONFIG_MAP_RUN_OUTPUTS_PATH_KEY_NAME: experiment_outputs_path,
        constants.CONFIG_MAP_RUN_LOGS_PATH_KEY_NAME: experiment_logs_path,
        constants.CONFIG_MAP_RUN_DATA_PATHS_KEY_NAME: persistence_data,
        API_HTTP_URL: get_settings_http_api_url(),
        API_WS_HOST: get_settings_ws_api_url(),
    }
    return client.V1ConfigMap(api_version=k8s_constants.K8S_API_VERSION_V1,
                              kind=k8s_constants.K8S_CONFIG_MAP_KIND,
                              metadata=metadata,
                              data=data)
def get(self, request, *args, **kwargs):
    """List the experiment's outputs (optionally under a `path` subdirectory)."""
    try:
        store_manager = stores.get_outputs_store(
            persistence_outputs=self.experiment.persistence_outputs)
    except (PolyaxonStoresException, VolumeNotFoundError) as e:
        raise ValidationError(e)

    experiment_outputs_path = stores.get_experiment_outputs_path(
        persistence=self.experiment.persistence_outputs,
        experiment_name=self.experiment.unique_name,
        original_name=self.experiment.original_unique_name,
        cloning_strategy=self.experiment.cloning_strategy)
    subpath = request.query_params.get('path')
    if subpath:
        experiment_outputs_path = os.path.join(experiment_outputs_path, subpath)

    try:
        data = store_manager.ls(experiment_outputs_path)
    except VolumeNotFoundError:
        raise ValidationError('Store manager could not load the volume requested,'
                              ' to get the outputs data.')
    except Exception:
        raise ValidationError('Experiment outputs path does not exists or bad configuration.')
    return Response(data=data, status=200)
def get_init_container(self,
                       init_command,
                       init_args,
                       env_vars,
                       context_mounts,
                       persistence_outputs,
                       persistence_data):
    """Pod init container for setting outputs path.

    Returns [] for resumed clones (the original outputs are reused),
    otherwise a single-element list with the configured init container.
    Caller-provided init_command/init_args take precedence over the
    default shell wrapper and generated output args.
    """
    env_vars = to_list(env_vars, check_none=True)
    if self.original_name is not None and self.cloning_strategy == CloningStrategy.RESUME:
        # Resumed experiments keep writing to the original outputs path.
        return []
    if self.original_name is not None and self.cloning_strategy == CloningStrategy.COPY:
        command = InitCommands.COPY
        original_outputs_path = stores.get_experiment_outputs_path(
            persistence=persistence_outputs,
            experiment_name=self.original_name)
    else:
        command = InitCommands.CREATE
        original_outputs_path = None
    outputs_path = stores.get_experiment_outputs_path(
        persistence=persistence_outputs,
        experiment_name=self.experiment_name)
    _, outputs_volume_mount = get_pod_outputs_volume(persistence_outputs=persistence_outputs)
    volume_mounts = outputs_volume_mount + to_list(context_mounts, check_none=True)
    # Fall back to the default shell wrapper / generated args when none given.
    init_command = init_command or ["/bin/sh", "-c"]
    init_args = init_args or to_list(
        get_output_args(command=command,
                        outputs_path=outputs_path,
                        original_outputs_path=original_outputs_path))
    init_args += to_list(get_auth_context_args(entity='experiment',
                                               entity_name=self.experiment_name))
    return [
        client.V1Container(
            name=self.init_container_name,
            image=self.init_docker_image,
            image_pull_policy=self.init_docker_image_pull_policy,
            command=init_command,
            # All args are concatenated into a single shell-string argument.
            args=[''.join(init_args)],
            env=env_vars,
            resources=get_init_resources(),
            volume_mounts=volume_mounts)
    ]
def get_env_vars(self, task_type, task_idx):
    """Return the TF_CONFIG env var describing this task's place in the cluster."""
    model_dir = stores.get_experiment_outputs_path(
        persistence=self.persistence_config.outputs,
        experiment_name=self.experiment_name,
        cloning_strategy=self.cloning_strategy)
    tf_config = {
        'cluster': self.get_cluster(),
        'task': {'type': task_type, 'index': task_idx},
        'model_dir': model_dir,
        'environment': 'cloud',
    }
    return get_env_var(name='TF_CONFIG', value=tf_config)
def test_experiment_outputs_path_creation_deletion(self):
    """Outputs path exists after creation and disappears after scheduled deletion."""
    outputs_path = stores.get_experiment_outputs_path(
        persistence=self.experiment.persistence_outputs,
        experiment_name=self.experiment.unique_name)
    assert os.path.exists(outputs_path) is False

    stores.create_experiment_outputs_path(
        persistence=self.experiment.persistence_outputs,
        experiment_name=self.experiment.unique_name)
    assert os.path.exists(outputs_path) is True

    stores_schedule_outputs_deletion(persistence=None, subpath=self.experiment.subpath)
    assert os.path.exists(outputs_path) is False
def get_named_experiment_outputs_path(experiment):
    """Return ([OutputsRefsSpec], 'unique_name:outputs_path') for the experiment."""
    persistence = experiment.persistence_outputs
    outputs_path = stores.get_experiment_outputs_path(
        persistence=persistence,
        experiment_name=experiment.unique_name,
        original_name=experiment.original_unique_name,
        cloning_strategy=experiment.cloning_strategy)
    specs = [OutputsRefsSpec(path=outputs_path, persistence=persistence)]
    return specs, '{}:{}'.format(experiment.unique_name, outputs_path)
def _get_experiment_outputs_path(self) -> Tuple[List, str]:
    """Return ([OutputsRefsSpec], outputs_path) for this view's experiment."""
    import stores
    from stores.validators import validate_persistence_outputs

    experiment = self.experiment
    persistence = validate_persistence_outputs(
        persistence_outputs=experiment.persistence_outputs)
    outputs_path = stores.get_experiment_outputs_path(
        persistence=persistence,
        experiment_name=experiment.unique_name,
        original_name=experiment.original_unique_name,
        cloning_strategy=experiment.cloning_strategy)
    spec = OutputsRefsSpec(path=outputs_path, persistence=persistence)
    return [spec], outputs_path
def get(self, request, *args, **kwargs):
    """Audit the download event, then redirect to an archive of the outputs."""
    user = self.request.user
    auditor.record(event_type=EXPERIMENT_OUTPUTS_DOWNLOADED,
                   instance=self.experiment,
                   actor_id=user.id,
                   actor_name=user.username)
    outputs_path = stores.get_experiment_outputs_path(
        persistence=self.experiment.persistence_outputs,
        experiment_name=self.experiment.unique_name,
        original_name=self.experiment.original_unique_name,
        cloning_strategy=self.experiment.cloning_strategy)
    archived_path, archive_name = archive_outputs(
        outputs_path=outputs_path,
        name=self.experiment.unique_name)
    return self.redirect(path='{}/{}'.format(archived_path, archive_name))
def _get_named_experiment_outputs_path(experiment, persistence) -> Tuple[List, str]:
    """Validate persistence and return ([OutputsRefsSpec], 'unique_name:path')."""
    import stores
    from stores.validators import validate_persistence_outputs

    persistence = validate_persistence_outputs(persistence_outputs=persistence)
    outputs_path = stores.get_experiment_outputs_path(
        persistence=persistence,
        experiment_name=experiment.unique_name,
        original_name=experiment.original_unique_name,
        cloning_strategy=experiment.cloning_strategy)
    named_path = '{}:{}'.format(experiment.unique_name, outputs_path)
    return [OutputsRefsSpec(path=outputs_path, persistence=persistence)], named_path
def get_absolute_outputs_paths(self) -> str:
    """Resolve an outputs path from the experiment, group, or project — in that order."""
    import stores

    experiment = self.experiment
    if experiment:
        return stores.get_experiment_outputs_path(
            persistence=experiment.persistence_outputs,
            experiment_name=experiment.unique_name,
            original_name=experiment.original_unique_name,
            cloning_strategy=experiment.cloning_strategy)

    group = self.experiment_group
    if group:
        return stores.get_experiment_group_outputs_path(
            persistence=group.persistence_outputs,
            experiment_group_name=group.unique_name)

    # Fall back to the project-level outputs path.
    return stores.get_project_outputs_path(
        persistence_outputs=None,
        project_name=self.project.unique_name)
def test_experiment_group_outputs_path_creation_deletion(self):
    """Deleting a group's outputs removes both the group and experiment paths."""
    experiment = ExperimentFactory(user=self.project.user,
                                   project=self.project,
                                   experiment_group=self.experiment_group)
    stores.create_experiment_outputs_path(
        persistence=experiment.persistence_outputs,
        experiment_name=experiment.unique_name)
    experiment_path = stores.get_experiment_outputs_path(
        persistence=experiment.persistence_outputs,
        experiment_name=experiment.unique_name)
    group_path = stores.get_experiment_group_outputs_path(
        persistence=self.experiment_group.persistence_outputs,
        experiment_group_name=self.experiment_group.unique_name)
    assert os.path.exists(experiment_path) is True
    assert os.path.exists(group_path) is True

    stores_schedule_outputs_deletion(persistence=None, subpath=self.experiment_group.subpath)
    assert os.path.exists(experiment_path) is False
    assert os.path.exists(group_path) is False
def test_project_outputs_path_creation_deletion(self):
    """Deleting a project's outputs removes both the project and experiment paths."""
    with patch('scheduler.tasks.experiments.experiments_build.apply_async') as _:  # noqa
        experiment = ExperimentFactory(user=self.project.user, project=self.project)
    stores.create_experiment_outputs_path(
        persistence=experiment.persistence_outputs,
        experiment_name=experiment.unique_name)
    experiment_path = stores.get_experiment_outputs_path(
        persistence=experiment.persistence_outputs,
        experiment_name=experiment.unique_name)
    project_path = stores.get_project_outputs_path(
        persistence=None,
        project_name=self.project.unique_name)
    assert os.path.exists(experiment_path) is True
    assert os.path.exists(project_path) is True

    stores_schedule_outputs_deletion(persistence='outputs', subpath=self.project.subpath)
    assert os.path.exists(experiment_path) is False
    assert os.path.exists(project_path) is False
def get(self, request, *args, **kwargs):
    """Stream one outputs file; return 404 when it cannot be archived/located."""
    filepath = request.query_params.get('path')
    if not filepath:
        raise ValidationError('Files view expect a path to the file.')

    outputs_path = stores.get_experiment_outputs_path(
        persistence=self.experiment.persistence_outputs,
        experiment_name=self.experiment.unique_name,
        original_name=self.experiment.original_unique_name,
        cloning_strategy=self.experiment.cloning_strategy)
    download_filepath = archive_outputs_file(
        persistence_outputs=self.experiment.persistence_outputs,
        outputs_path=outputs_path,
        namepath=self.experiment.unique_name,
        filepath=filepath)
    if download_filepath:
        return stream_file(file_path=download_filepath, logger=_logger)
    return Response(status=status.HTTP_404_NOT_FOUND,
                    data='Outputs file not found: log_path={}'.format(download_filepath))
def _get_outputs_path(self, persistence_outputs):
    """Resolve this experiment's outputs path, honoring its cloning strategy."""
    resolve_kwargs = dict(persistence=persistence_outputs,
                          experiment_name=self.experiment_name,
                          original_name=self.original_name,
                          cloning_strategy=self.cloning_strategy)
    return stores.get_experiment_outputs_path(**resolve_kwargs)
def get_pod_container(self,
                      volume_mounts,
                      env_vars=None,
                      command=None,
                      args=None,
                      persistence_outputs=None,
                      persistence_data=None,
                      outputs_refs_jobs=None,
                      outputs_refs_experiments=None,
                      secret_refs=None,
                      configmap_refs=None,
                      resources=None,
                      ephemeral_token=None):
    """Pod job container for task.

    Assembles the main V1Container: job env vars (paths, refs, token),
    cluster/declarations/labels env vars, resource env vars, env_from for
    secret/configmap refs, container ports, and the given volume mounts.
    """
    # The cluster definition must already be resolved before building the pod.
    assert self.cluster_def is not None

    # Env vars preparations
    env_vars = to_list(env_vars, check_none=True)
    logs_path = stores.get_experiment_logs_path(
        experiment_name=self.experiment_name,
        temp=False)
    # Outputs path accounts for cloning (original name + strategy forwarded).
    outputs_path = stores.get_experiment_outputs_path(
        persistence=persistence_outputs,
        experiment_name=self.experiment_name,
        original_name=self.original_name,
        cloning_strategy=self.cloning_strategy)
    env_vars += get_job_env_vars(
        persistence_outputs=persistence_outputs,
        outputs_path=outputs_path,
        persistence_data=persistence_data,
        log_level=self.log_level,
        logs_path=logs_path,
        outputs_refs_jobs=outputs_refs_jobs,
        outputs_refs_experiments=outputs_refs_experiments,
        ephemeral_token=ephemeral_token,
    )
    env_vars += [
        get_env_var(name=constants.CONFIG_MAP_CLUSTER_KEY_NAME,
                    value=json.dumps(self.cluster_def)),
        get_env_var(name=constants.CONFIG_MAP_DECLARATIONS_KEY_NAME,
                    value=self.declarations),
        get_env_var(name=constants.CONFIG_MAP_EXPERIMENT_INFO_KEY_NAME,
                    value=json.dumps(self.experiment_labels)),
    ]
    env_vars += get_resources_env_vars(resources=resources)

    # Env from configmap and secret refs
    env_from = get_pod_env_from(secret_refs=secret_refs, configmap_refs=configmap_refs)

    # One container port per configured port.
    ports = [
        client.V1ContainerPort(container_port=port)
        for port in self.ports
    ]
    return client.V1Container(name=self.job_container_name,
                              image=self.job_docker_image,
                              command=command,
                              args=args,
                              ports=ports,
                              env=env_vars,
                              env_from=env_from,
                              resources=get_resources(resources),
                              volume_mounts=volume_mounts)