Beispiel #1
0
    def get_init_container(self):
        """Return the init containers that prepare this experiment's outputs path.

        Returns an empty list when the experiment has an original and the
        cloning strategy is RESUME. Otherwise returns a single-element list
        holding a container that runs the COPY command (original set and
        strategy is COPY) or the CREATE command (every other case).
        """
        has_original = self.original_name is not None
        if has_original and self.cloning_strategy == CloningStrategy.RESUME:
            return []

        if has_original and self.cloning_strategy == CloningStrategy.COPY:
            init_command = InitCommands.COPY
            source_path = get_experiment_outputs_path(experiment_name=self.original_name)
        else:
            init_command = InitCommands.CREATE
            # No source path is passed along with the CREATE command.
            source_path = None

        target_path = get_experiment_outputs_path(experiment_name=self.experiment_name)
        data_mount = get_volume_mount(volume=constants.DATA_VOLUME,
                                      volume_mount=settings.DATA_ROOT)
        shell_args = to_list(get_output_args(command=init_command,
                                             outputs_path=target_path,
                                             original_outputs_path=source_path))
        init_container = client.V1Container(name=self.init_container_name,
                                            image=self.init_docker_image,
                                            command=["/bin/sh", "-c"],
                                            args=shell_args,
                                            volume_mounts=[data_mount])
        return [init_container]
Beispiel #2
0
    def get_init_container(self):
        """Build the list of init containers that set up the outputs path.

        A resumed clone (original set, strategy RESUME) needs no init
        container, so an empty list is returned. A copied clone (original
        set, strategy COPY) gets a container running the COPY command with
        the original's outputs path; anything else gets the CREATE command.
        """
        is_clone = self.original_name is not None
        if is_clone and self.cloning_strategy == CloningStrategy.RESUME:
            return []

        should_copy = is_clone and self.cloning_strategy == CloningStrategy.COPY
        command = InitCommands.COPY if should_copy else InitCommands.CREATE
        original_path = (
            get_experiment_outputs_path(experiment_name=self.original_name)
            if should_copy else None
        )

        new_path = get_experiment_outputs_path(experiment_name=self.experiment_name)
        mount = get_volume_mount(volume=constants.DATA_VOLUME, volume_mount=settings.DATA_ROOT)
        output_args = get_output_args(command=command,
                                      outputs_path=new_path,
                                      original_outputs_path=original_path)
        return [
            client.V1Container(name=self.init_container_name,
                               image=self.init_docker_image,
                               command=["/bin/sh", "-c"],
                               args=to_list(output_args),
                               volume_mounts=[mount])
        ]
Beispiel #3
0
    def get_init_container(self, persistence_outputs):
        """Return the init containers that prepare the outputs path.

        Args:
            persistence_outputs: forwarded to the outputs path and outputs
                volume helpers.

        Returns:
            An empty list for a resumed clone (original set, strategy
            RESUME); otherwise a one-element list with the init container,
            running COPY for a copied clone and CREATE in every other case.
        """
        if self.original_name is not None:
            if self.cloning_strategy == CloningStrategy.RESUME:
                return []
            copy_from_original = self.cloning_strategy == CloningStrategy.COPY
        else:
            copy_from_original = False

        if copy_from_original:
            init_command = InitCommands.COPY
            source_outputs = get_experiment_outputs_path(
                persistence_outputs=persistence_outputs,
                experiment_name=self.original_name)
        else:
            init_command = InitCommands.CREATE
            source_outputs = None

        target_outputs = get_experiment_outputs_path(
            persistence_outputs=persistence_outputs,
            experiment_name=self.experiment_name)
        # Only the volume mounts (second item) are used by the init container.
        _volumes, mounts = get_pod_outputs_volume(persistence_outputs=persistence_outputs)
        container_args = to_list(
            get_output_args(command=init_command,
                            outputs_path=target_outputs,
                            original_outputs_path=source_outputs))
        container = client.V1Container(name=self.init_container_name,
                                       image=self.init_docker_image,
                                       command=["/bin/sh", "-c"],
                                       args=container_args,
                                       volume_mounts=mounts)
        return [container]
Beispiel #4
0
    def get(self, request, *args, **kwargs):
        """Stream one file from the experiment's outputs as an attachment.

        Reads the ``path`` query parameter, has ``archive_outputs_file``
        produce the file to download and streams it back in 8192-byte chunks.

        Raises:
            ValidationError: when no ``path`` query parameter is given.

        Returns:
            A streaming attachment response, or a 404 response when the
            file to download cannot be found.
        """
        requested_path = request.query_params.get('path')
        if not requested_path:
            raise ValidationError('Files view expect a path to the file.')

        experiment = self.experiment
        outputs_root = get_experiment_outputs_path(
            persistence_outputs=experiment.persistence_outputs,
            experiment_name=experiment.unique_name,
            original_name=experiment.original_unique_name,
            cloning_strategy=experiment.cloning_strategy)
        download_filepath = archive_outputs_file(
            persistence_outputs=experiment.persistence_outputs,
            outputs_path=outputs_root,
            namepath=experiment.unique_name,
            filepath=requested_path)

        attachment_name = os.path.basename(download_filepath)
        chunk_size = 8192
        try:
            file_handle = open(download_filepath, 'rb')
            content_type = mimetypes.guess_type(download_filepath)[0]
            response = StreamingHttpResponse(FileWrapper(file_handle, chunk_size),
                                             content_type=content_type)
            response['Content-Length'] = os.path.getsize(download_filepath)
            response['Content-Disposition'] = "attachment; filename={}".format(attachment_name)
        except FileNotFoundError:
            _logger.warning('Log file not found: log_path=%s', download_filepath)
            not_found_message = 'Log file not found: log_path={}'.format(download_filepath)
            return Response(status=status.HTTP_404_NOT_FOUND, data=not_found_message)
        return response
Beispiel #5
0
def get_config_map(namespace, project_name, experiment_group_name,
                   experiment_name, project_uuid, experiment_group_uuid,
                   experiment_uuid, original_name, cloning_strategy,
                   cluster_def, declarations, log_level):
    """Build the Kubernetes ConfigMap describing an experiment.

    Args:
        namespace: namespace set on the config map metadata.
        project_name, experiment_group_name, experiment_name, project_uuid,
        experiment_group_uuid, experiment_uuid: identifiers used to build
            the labels; ``experiment_uuid`` also names the config map.
        original_name, cloning_strategy: forwarded to the outputs path
            lookup together with ``experiment_name``.
        cluster_def: cluster definition, stored JSON-encoded.
        declarations: experiment declarations, stored JSON-encoded; ``None``
            is stored as ``'{}'``.
        log_level: stored as-is.

    Returns:
        A ``client.V1ConfigMap`` holding the cluster definition,
        declarations, experiment info (the labels), log level, API url and
        the outputs/logs/data paths.
    """
    name = constants.CONFIG_MAP_NAME.format(experiment_uuid=experiment_uuid)
    labels = get_map_labels(project_name, experiment_group_name,
                            experiment_name, project_uuid,
                            experiment_group_uuid, experiment_uuid)
    metadata = client.V1ObjectMeta(name=name,
                                   labels=labels,
                                   namespace=namespace)
    experiment_outputs_path = get_experiment_outputs_path(
        experiment_name=experiment_name,
        original_name=original_name,
        cloning_strategy=cloning_strategy)
    experiment_logs_path = get_experiment_logs_path(experiment_name)
    experiment_data_path = get_project_data_path(project_name)
    # json.dumps never returns an empty string (None becomes 'null'), so the
    # former `json.dumps(declarations) or '{}'` fallback could never trigger.
    # Apply the '{}' default explicitly when no declarations are given.
    declarations_json = '{}' if declarations is None else json.dumps(declarations)
    data = {
        constants.CONFIG_MAP_CLUSTER_KEY_NAME: json.dumps(cluster_def),
        constants.CONFIG_MAP_DECLARATIONS_KEY_NAME: declarations_json,
        constants.CONFIG_MAP_EXPERIMENT_INFO_KEY_NAME: json.dumps(labels),
        constants.CONFIG_MAP_LOG_LEVEL_KEY_NAME: log_level,
        API_KEY_NAME: get_settings_api_url(),
        constants.CONFIG_MAP_EXPERIMENT_OUTPUTS_PATH_KEY_NAME:
        experiment_outputs_path,
        constants.CONFIG_MAP_EXPERIMENT_LOGS_PATH_KEY_NAME:
        experiment_logs_path,
        constants.CONFIG_MAP_EXPERIMENT_DATA_PATH_KEY_NAME:
        experiment_data_path,
    }
    return client.V1ConfigMap(api_version=k8s_constants.K8S_API_VERSION_V1,
                              kind=k8s_constants.K8S_CONFIG_MAP_KIND,
                              metadata=metadata,
                              data=data)
Beispiel #6
0
    def get_experiments_outputs_spec(self):
        """Map each related experiment id to its outputs reference spec.

        Returns:
            ``None`` when there are no related experiments; otherwise a dict
            keyed by experiment id whose values are ``OutputsRefsSpec``
            objects carrying the experiment's outputs path and persistence.
        """
        from libs.paths.experiments import get_experiment_outputs_path

        if not self.experiments.count():
            return None

        # Expose the 'persistence' key of the 'outputs' field as a plain column.
        annotated = self.experiments.annotate(
            persistence_outputs=KeyTransform('outputs', 'persistence'))
        rows = annotated.values_list('id',
                                     'experiment_group__id',
                                     'project__user__username',
                                     'project__name',
                                     'persistence_outputs')

        specs = {}
        for experiment_id, group_id, username, project, persistence in rows:
            project_name = PROJECT_UNIQUE_NAME_FORMAT.format(user=username, project=project)

            # Experiments in a group are named under the group, others
            # directly under the project.
            if group_id:
                parent_name = GROUP_UNIQUE_NAME_FORMAT.format(project_name=project_name,
                                                              id=group_id)
            else:
                parent_name = project_name
            experiment_name = EXPERIMENT_UNIQUE_NAME_FORMAT.format(parent_name=parent_name,
                                                                   id=experiment_id)

            path = get_experiment_outputs_path(persistence_outputs=persistence,
                                               experiment_name=experiment_name)
            specs[experiment_id] = OutputsRefsSpec(path=path, persistence=persistence)

        return specs
Beispiel #7
0
    def get_pod_container(self,
                          volume_mounts,
                          env_vars=None,
                          command=None,
                          args=None,
                          persistence_outputs=None,
                          persistence_data=None,
                          outputs_refs_jobs=None,
                          outputs_refs_experiments=None,
                          secret_refs=None,
                          configmap_refs=None,
                          resources=None,
                          ephemeral_token=None):
        """Assemble the job container for a task pod.

        The container environment is built, in order, from the caller's
        ``env_vars``, the job env vars (paths, log level, outputs refs and
        ephemeral token), the cluster/declarations/experiment-info vars and
        the resources env vars. ``secret_refs`` and ``configmap_refs`` become
        the container's ``env_from``.

        ``self.cluster_def`` must be set; this is enforced with an assert.
        """
        assert self.cluster_def is not None

        # Env vars preparations
        container_env = to_list(env_vars, check_none=True)
        experiment_outputs = get_experiment_outputs_path(
            persistence_outputs=persistence_outputs,
            experiment_name=self.experiment_name,
            original_name=self.original_name,
            cloning_strategy=self.cloning_strategy)
        experiment_logs = get_experiment_logs_path(self.experiment_name, temp=False)
        container_env += get_job_env_vars(
            persistence_outputs=persistence_outputs,
            outputs_path=experiment_outputs,
            persistence_data=persistence_data,
            log_level=self.log_level,
            logs_path=experiment_logs,
            outputs_refs_jobs=outputs_refs_jobs,
            outputs_refs_experiments=outputs_refs_experiments,
            ephemeral_token=ephemeral_token,
        )

        cluster_var = get_env_var(name=constants.CONFIG_MAP_CLUSTER_KEY_NAME,
                                  value=json.dumps(self.cluster_def))
        declarations_var = get_env_var(name=constants.CONFIG_MAP_DECLARATIONS_KEY_NAME,
                                       value=self.declarations)
        info_var = get_env_var(name=constants.CONFIG_MAP_EXPERIMENT_INFO_KEY_NAME,
                               value=json.dumps(self.experiment_labels))
        container_env += [cluster_var, declarations_var, info_var]
        container_env += get_resources_env_vars(resources=resources)

        # Env from configmap and secret refs
        env_sources = get_pod_env_from(secret_refs=secret_refs,
                                       configmap_refs=configmap_refs)

        container_ports = []
        for port in self.ports:
            container_ports.append(client.V1ContainerPort(container_port=port))

        return client.V1Container(name=self.job_container_name,
                                  image=self.job_docker_image,
                                  command=command,
                                  args=args,
                                  ports=container_ports,
                                  env=container_env,
                                  env_from=env_sources,
                                  resources=get_resources(resources),
                                  volume_mounts=volume_mounts)
 def test_experiment_outputs_path_creation_deletion(self):
     """The outputs path exists only between its creation and its deletion."""
     experiment_name = self.experiment.unique_name
     outputs_path = get_experiment_outputs_path(experiment_name)

     # Nothing on disk before the path is created.
     assert not os.path.exists(outputs_path)

     create_experiment_outputs_path(experiment_name)
     assert os.path.exists(outputs_path)

     delete_experiment_outputs(experiment_name)
     assert not os.path.exists(outputs_path)
Beispiel #9
0
    def test_copying_an_experiment(self):
        """Copying a cloned experiment creates its outputs path with the original's files."""
        with patch('scheduler.tasks.experiments.experiments_build.apply_async'
                   ) as _:  # noqa
            experiment1 = ExperimentFactory()

        # We create some outputs files for the experiment
        path = create_experiment_outputs_path(
            persistence_outputs=experiment1.persistence_outputs,
            experiment_name=experiment1.unique_name)
        # Create the empty file inside a context manager so the handle is
        # closed immediately instead of leaking until garbage collection.
        with open(os.path.join(path, 'file'), 'w+'):
            pass

        # Create a new experiment that is a clone of the previous
        with patch('scheduler.tasks.experiments.experiments_build.apply_async'
                   ) as _:  # noqa
            experiment2 = ExperimentFactory(original_experiment=experiment1)

        # Check that outputs path for experiment2 does not exist yet
        experiment2_outputs_path = get_experiment_outputs_path(
            persistence_outputs=experiment2.persistence_outputs,
            experiment_name=experiment2.unique_name)
        assert os.path.exists(experiment2_outputs_path) is False

        # Handle restart should create the outputs and copy the content of experiment 1
        copy_experiment(experiment2)

        assert os.path.exists(experiment2_outputs_path) is True
        assert os.path.exists(os.path.join(experiment2_outputs_path,
                                           'file')) is True
 def test_experiment_outputs_path_creation_deletion(self):
     """Creating then deleting the experiment outputs toggles the path's existence."""
     unique_name = self.experiment.unique_name
     path = get_experiment_outputs_path(unique_name)

     def path_exists():
         return os.path.exists(path)

     assert not path_exists()
     create_experiment_outputs_path(unique_name)
     assert path_exists()
     delete_experiment_outputs(unique_name)
     assert not path_exists()
Beispiel #11
0
def archive_experiment_outputs(persistence_outputs, experiment_name):
    """Archive the files under an experiment's outputs path into a tar file.

    The archive is written under ``settings.OUTPUTS_ARCHIVE_ROOT`` and named
    after the experiment, with dots replaced by underscores and a
    ``.tar.gz`` suffix.

    Returns:
        A ``(archive_root, tar_name)`` tuple.
    """
    archive_root = settings.OUTPUTS_ARCHIVE_ROOT
    check_archive_path(archive_root)

    outputs_path = get_experiment_outputs_path(persistence_outputs=persistence_outputs,
                                               experiment_name=experiment_name)
    files_to_archive = get_files_in_path(outputs_path)

    safe_name = experiment_name.replace('.', '_')
    tar_name = "{}.tar.gz".format(safe_name)
    tar_path = os.path.join(archive_root, tar_name)
    create_tarfile(files=files_to_archive, tar_path=tar_path)
    return archive_root, tar_name
Beispiel #12
0
 def get_env_vars(self, task_type, task_idx):
     """Return the ``TF_CONFIG`` env var for the given task type and index."""
     cluster = self.get_cluster()
     task = {'type': task_type, 'index': task_idx}
     model_dir = get_experiment_outputs_path(
         persistence_outputs=self.persistence_config.outputs,
         experiment_name=self.experiment_name,
         cloning_strategy=self.cloning_strategy)

     tf_config = dict(cluster=cluster, task=task, model_dir=model_dir, environment='cloud')
     return get_env_var(name='TF_CONFIG', value=tf_config)
Beispiel #13
0
 def get_named_experiment_outputs_path(experiment):
     """Return the experiment's outputs reference and a ``name:path`` string.

     Returns:
         A tuple of a one-element list holding the ``OutputsRefsSpec`` for
         the experiment's outputs path, and a string of the form
         ``<unique_name>:<outputs_path>``.
     """
     persistence = experiment.persistence_outputs
     path = get_experiment_outputs_path(
         persistence_outputs=experiment.persistence_outputs,
         experiment_name=experiment.unique_name,
         original_name=experiment.original_unique_name,
         cloning_strategy=experiment.cloning_strategy)
     named_path = '{}:{}'.format(experiment.unique_name, path)
     outputs_refs = [OutputsRefsSpec(path=path, persistence=persistence)]
     return outputs_refs, named_path
 def test_project_outputs_path_creation_deletion(self):
     """Deleting the project outputs removes the experiment and project outputs paths."""
     # Patch out the build task while the factory creates the experiment.
     with patch('scheduler.tasks.experiments.experiments_build.apply_async'):
         experiment = ExperimentFactory(user=self.project.user, project=self.project)

     create_experiment_outputs_path(experiment.unique_name)
     create_experiment_logs_path(experiment.unique_name)
     paths = (
         get_experiment_outputs_path(experiment.unique_name),
         get_project_outputs_path(self.project.unique_name),
     )
     for path in paths:
         assert os.path.exists(path)

     delete_project_outputs(self.project.unique_name)
     for path in paths:
         assert not os.path.exists(path)
 def test_experiment_group_outputs_path_creation_deletion(self):
     """Deleting the group outputs removes the experiment and group outputs paths."""
     experiment = ExperimentFactory(user=self.project.user,
                                    project=self.project,
                                    experiment_group=self.experiment_group)
     create_experiment_outputs_path(experiment.unique_name)

     experiment_path = get_experiment_outputs_path(experiment.unique_name)
     group_path = get_experiment_group_outputs_path(self.experiment_group.unique_name)
     tracked_paths = (experiment_path, group_path)
     for tracked in tracked_paths:
         assert os.path.exists(tracked)

     delete_experiment_group_outputs(self.experiment_group.unique_name)
     for tracked in tracked_paths:
         assert not os.path.exists(tracked)
Beispiel #16
0
 def get(self, request, *args, **kwargs):
     """Archive the experiment's outputs and redirect to the archive.

     Records an ``EXPERIMENT_OUTPUTS_DOWNLOADED`` audit event for the
     requesting user before building the archive.
     """
     experiment = self.experiment
     user = self.request.user
     auditor.record(event_type=EXPERIMENT_OUTPUTS_DOWNLOADED,
                    instance=experiment,
                    actor_id=user.id,
                    actor_name=user.username)

     outputs_path = get_experiment_outputs_path(
         persistence_outputs=experiment.persistence_outputs,
         experiment_name=experiment.unique_name,
         original_name=experiment.original_unique_name,
         cloning_strategy=experiment.cloning_strategy)
     archived_path, archive_name = archive_outputs(outputs_path=outputs_path,
                                                   name=experiment.unique_name)
     redirect_path = '{}/{}'.format(archived_path, archive_name)
     return self.redirect(path=redirect_path)
Beispiel #17
0
    def outputs_path(self):
        """Return the outputs path of the most specific entity that is set.

        The experiment is checked first, then the experiment group; the
        project's outputs path is the fallback.
        """
        experiment = self.experiment
        if experiment:
            from libs.paths.experiments import get_experiment_outputs_path
            path = get_experiment_outputs_path(
                experiment_name=experiment.unique_name,
                original_name=experiment.original_unique_name,
                cloning_strategy=experiment.cloning_strategy)
            return path

        group = self.experiment_group
        if group:
            from libs.paths.experiment_groups import get_experiment_group_outputs_path
            path = get_experiment_group_outputs_path(experiment_group_name=group.unique_name)
            return path

        from libs.paths.projects import get_project_outputs_path
        path = get_project_outputs_path(project_name=self.project.unique_name)
        return path
Beispiel #18
0
 def test_experiment_group_outputs_path_creation_deletion(self):
     """Group outputs deletion removes both the group's and its experiment's paths."""
     group_name = self.experiment_group.unique_name
     experiment = ExperimentFactory(user=self.project.user,
                                    project=self.project,
                                    experiment_group=self.experiment_group)
     create_experiment_outputs_path(experiment.unique_name)
     experiment_outputs = get_experiment_outputs_path(experiment.unique_name)
     group_outputs = get_experiment_group_outputs_path(group_name)

     # Both paths are present once the experiment outputs path is created.
     assert os.path.exists(experiment_outputs)
     assert os.path.exists(group_outputs)

     delete_experiment_group_outputs(group_name)

     assert not os.path.exists(experiment_outputs)
     assert not os.path.exists(group_outputs)
Beispiel #19
0
    def get_pod_container(self,
                          volume_mounts,
                          env_vars=None,
                          command=None,
                          args=None,
                          persistence_outputs=None,
                          persistence_data=None,
                          outputs_refs_jobs=None,
                          outputs_refs_experiments=None,
                          resources=None):
        """Assemble the job container for a task pod.

        The container environment is built, in order, from the caller's
        ``env_vars``, the job env vars (log level, outputs/data/logs paths
        and outputs refs), the cluster/declarations/experiment-info vars and
        the resources env vars.

        ``self.cluster_def`` must be set; this is enforced with an assert.
        """
        assert self.cluster_def is not None

        pod_env = get_list(env_vars)
        experiment_outputs = get_experiment_outputs_path(
            persistence_outputs=persistence_outputs,
            experiment_name=self.experiment_name,
            original_name=self.original_name,
            cloning_strategy=self.cloning_strategy)
        data_paths = get_data_paths(persistence_data)
        experiment_logs = get_experiment_logs_path(self.experiment_name)
        pod_env += get_job_env_vars(log_level=self.log_level,
                                    outputs_path=experiment_outputs,
                                    data_paths=data_paths,
                                    logs_path=experiment_logs,
                                    outputs_refs_jobs=outputs_refs_jobs,
                                    outputs_refs_experiments=outputs_refs_experiments)

        cluster_var = get_env_var(name=constants.CONFIG_MAP_CLUSTER_KEY_NAME,
                                  value=json.dumps(self.cluster_def))
        declarations_var = get_env_var(name=constants.CONFIG_MAP_DECLARATIONS_KEY_NAME,
                                       value=self.declarations)
        info_var = get_env_var(name=constants.CONFIG_MAP_EXPERIMENT_INFO_KEY_NAME,
                               value=json.dumps(self.experiment_labels))
        pod_env += [cluster_var, declarations_var, info_var]

        pod_env += get_resources_env_vars(resources=resources)

        exposed_ports = []
        for port in self.ports:
            exposed_ports.append(client.V1ContainerPort(container_port=port))

        return client.V1Container(name=self.job_container_name,
                                  image=self.job_docker_image,
                                  command=command,
                                  args=args,
                                  ports=exposed_ports,
                                  env=pod_env,
                                  resources=get_resources(resources),
                                  volume_mounts=volume_mounts)
Beispiel #20
0
    def get_pod_container(self,
                          volume_mounts,
                          env_vars=None,
                          command=None,
                          args=None,
                          resources=None):
        """Assemble the job container for a task pod.

        The container environment is built, in order, from the caller's
        ``env_vars``, the job env vars (log level and the outputs, logs,
        data and project data paths), the cluster/declarations/experiment-info
        vars and, only when ``resources`` is truthy, the resources env vars.

        ``self.cluster_def`` must be set; this is enforced with an assert.
        """
        assert self.cluster_def is not None

        pod_env = get_list(env_vars)
        outputs = get_experiment_outputs_path(experiment_name=self.experiment_name,
                                              original_name=self.original_name,
                                              cloning_strategy=self.cloning_strategy)
        logs = get_experiment_logs_path(self.experiment_name)
        data = get_experiment_data_path(self.experiment_name)
        project_data = get_project_data_path(project_name=self.project_name)
        pod_env += get_job_env_vars(log_level=self.log_level,
                                    outputs_path=outputs,
                                    logs_path=logs,
                                    data_path=data,
                                    project_data_path=project_data)

        cluster_var = get_env_var(name=constants.CONFIG_MAP_CLUSTER_KEY_NAME,
                                  value=json.dumps(self.cluster_def))
        declarations_var = get_env_var(name=constants.CONFIG_MAP_DECLARATIONS_KEY_NAME,
                                       value=self.declarations)
        info_var = get_env_var(name=constants.CONFIG_MAP_EXPERIMENT_INFO_KEY_NAME,
                               value=json.dumps(self.experiment_labels))
        pod_env += [cluster_var, declarations_var, info_var]

        if resources:
            pod_env += get_resources_env_vars(resources=resources)

        exposed_ports = []
        for port in self.ports:
            exposed_ports.append(client.V1ContainerPort(container_port=port))

        return client.V1Container(name=self.job_container_name,
                                  image=self.job_docker_image,
                                  command=command,
                                  args=args,
                                  ports=exposed_ports,
                                  env=pod_env,
                                  resources=get_resources(resources),
                                  volume_mounts=volume_mounts)
Beispiel #21
0
def get_config_map(namespace,
                   project_name,
                   experiment_group_name,
                   experiment_name,
                   project_uuid,
                   experiment_group_uuid,
                   experiment_uuid,
                   original_name,
                   cloning_strategy,
                   cluster_def,
                   declarations,
                   log_level):
    """Build the Kubernetes ConfigMap describing an experiment run.

    Args:
        namespace: namespace set on the config map metadata.
        project_name, experiment_group_name, experiment_name, project_uuid,
        experiment_group_uuid, experiment_uuid: identifiers used to build
            the labels; ``experiment_uuid`` also names the config map.
        original_name, cloning_strategy: forwarded to the outputs path
            lookup together with ``experiment_name``.
        cluster_def: cluster definition, stored JSON-encoded.
        declarations: experiment declarations, stored JSON-encoded; ``None``
            is stored as ``'{}'``.
        log_level: stored as-is.

    Returns:
        A ``client.V1ConfigMap`` holding the cluster definition,
        declarations, experiment info (the labels), log level, API url and
        the run's outputs/logs/data paths.
    """
    name = constants.CONFIG_MAP_NAME.format(uuid=experiment_uuid)
    labels = get_map_labels(project_name,
                            experiment_group_name,
                            experiment_name,
                            project_uuid,
                            experiment_group_uuid,
                            experiment_uuid)
    metadata = client.V1ObjectMeta(name=name, labels=labels, namespace=namespace)
    experiment_outputs_path = get_experiment_outputs_path(experiment_name=experiment_name,
                                                          original_name=original_name,
                                                          cloning_strategy=cloning_strategy)
    experiment_logs_path = get_experiment_logs_path(experiment_name)
    experiment_data_path = get_project_data_path(project_name)
    # json.dumps never returns an empty string (None becomes 'null'), so the
    # former `json.dumps(declarations) or '{}'` fallback could never trigger.
    # Apply the '{}' default explicitly when no declarations are given.
    declarations_json = '{}' if declarations is None else json.dumps(declarations)
    data = {
        constants.CONFIG_MAP_CLUSTER_KEY_NAME: json.dumps(cluster_def),
        constants.CONFIG_MAP_DECLARATIONS_KEY_NAME: declarations_json,
        constants.CONFIG_MAP_EXPERIMENT_INFO_KEY_NAME: json.dumps(labels),
        constants.CONFIG_MAP_LOG_LEVEL_KEY_NAME: log_level,
        API_KEY_NAME: get_settings_api_url(),
        constants.CONFIG_MAP_RUN_OUTPUTS_PATH_KEY_NAME: experiment_outputs_path,
        constants.CONFIG_MAP_RUN_LOGS_PATH_KEY_NAME: experiment_logs_path,
        constants.CONFIG_MAP_RUN_DATA_PATH_KEY_NAME: experiment_data_path,
    }
    return client.V1ConfigMap(api_version=k8s_constants.K8S_API_VERSION_V1,
                              kind=k8s_constants.K8S_CONFIG_MAP_KIND,
                              metadata=metadata,
                              data=data)
Beispiel #22
0
 def get(self, request, *args, **kwargs):
     """List the contents of the experiment's outputs path.

     An optional ``path`` query parameter is joined onto the outputs path to
     list a sub-path instead.

     Raises:
         ValidationError: when the store manager raises while listing,
             with a dedicated message for ``VolumeNotFoundError``.
     """
     experiment = self.experiment
     store_manager = get_outputs_store(persistence_outputs=experiment.persistence_outputs)
     target_path = get_experiment_outputs_path(
         persistence_outputs=experiment.persistence_outputs,
         experiment_name=experiment.unique_name,
         original_name=experiment.original_unique_name,
         cloning_strategy=experiment.cloning_strategy)

     sub_path = request.query_params.get('path')
     if sub_path:
         # NOTE(review): the user-supplied path is joined without checks; an
         # absolute value replaces the outputs path entirely -- confirm the
         # store manager restricts what can be listed.
         target_path = os.path.join(target_path, sub_path)

     try:
         listing = store_manager.ls(target_path)
     except VolumeNotFoundError:
         raise ValidationError(
             'Store manager could not load the volume requested,'
             ' to get the outputs data.')
     except Exception:
         raise ValidationError(
             'Experiment outputs path does not exists or bad configuration.'
         )
     else:
         return Response(data=listing, status=200)