Example #1
 def handle(self,
            start=2000,
            stop=1000,
            dataset_aging=1.0,
            log_aging=1.0,
            sandbox_aging=1.0,
            synch=False,
            wait=timedelta(seconds=0),
            batch_size=100,
            **kwargs):
     # noinspection PyBroadException
     try:
         if synch:
             logger.debug('Starting purge synchronization.')
             self.synch_model(Container, 'file', wait, batch_size)
             self.synch_model(ContainerRun, 'sandbox_path', wait, batch_size)
             self.synch_model(ContainerLog, 'long_text', wait, batch_size)
             self.synch_model(Dataset, 'dataset_file', wait, batch_size)
             Dataset.external_file_check(batch_size=batch_size)
             logger.debug('Finished purge synchronization.')
         else:
             self.purge(start,
                        stop,
                        dataset_aging,
                        log_aging,
                        sandbox_aging,
                        batch_size)
     except Exception:
         logger.error('Purge failed.', exc_info=True)
Example #2
 def handle(self,
            start=2000,
            stop=1000,
            dataset_aging=1.0,
            log_aging=1.0,
            sandbox_aging=1.0,
            synch=False,
            wait=timedelta(seconds=0),
            batch_size=100,
            **kwargs):
     # noinspection PyBroadException
     try:
         if synch:
             logger.debug('Starting purge synchronization.')
             self.synch_model(Container, 'file', wait, batch_size)
             self.synch_model(ContainerRun, 'sandbox_path', wait,
                              batch_size)
             self.synch_model(ContainerLog, 'long_text', wait, batch_size)
             self.synch_model(Dataset, 'dataset_file', wait, batch_size)
             Dataset.external_file_check(batch_size=batch_size)
             logger.debug('Finished purge synchronization.')
         else:
             self.purge(start, stop, dataset_aging, log_aging,
                        sandbox_aging, batch_size)
     except Exception:
         logger.error('Purge failed.', exc_info=True)
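The two variants above are the entry point of a Django management command: handle() either reconciles stored file sizes with what is on disk (synch=True) or purges storage until it drops below stop. A hedged sketch of invoking it programmatically, assuming the command is registered under the name purge and that its add_arguments() declares the same options as the handle() signature:

# Sketch only: the command name "purge" is an assumption; the options must match
# whatever the command's add_arguments() actually registers.
from django.core.management import call_command

call_command('purge')              # free storage until it falls below the stop threshold
call_command('purge', synch=True)  # reconcile file-size records with the files on disk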
Example #3
    def create(self, validated_data):
        """
        Create a Dataset object from deserialized and validated data.
        """
        # The default behaviour for keep_file depends on the mode of creation.
        keep_file = True
        file_path = validated_data.get("external_path", "")
        efd = validated_data.get("externalfiledirectory", None)
        # Both or neither are specified (this is enforced in serializer validation).
        if file_path:
            file_path = os.path.join(efd.path, file_path)
            keep_file = False  # don't retain a copy by default

        # Override the default if specified.
        keep_file = validated_data.get("save_in_db", keep_file)

        dataset = Dataset.create_dataset(
            is_uploaded=True,  # Assume serializer is only used for uploads.
            file_path=file_path,
            user=self.context["request"].user,
            users_allowed=validated_data["users_allowed"],
            groups_allowed=validated_data["groups_allowed"],
            keep_file=keep_file,
            name=validated_data["name"],
            description=validated_data.get("description"),
            file_source=None,
            check=True,
            file_handle=validated_data.get(
                "dataset_file",
                None),  # should be freshly opened so cursor is at start
            externalfiledirectory=efd)
        return dataset
Example #4
    def save_outputs(self, run):
        output_path = os.path.join(run.full_sandbox_path, 'output')
        upload_path = os.path.join(run.full_sandbox_path, 'upload')
        os.mkdir(upload_path)
        for argument in run.app.arguments.filter(type=ContainerArgument.OUTPUT):
            argument_path = os.path.join(output_path, argument.name)
            dataset_name = self.build_dataset_name(run, argument.name)
            new_argument_path = os.path.join(upload_path, dataset_name)
            try:
                os.rename(argument_path, new_argument_path)
                dataset = Dataset.create_dataset(new_argument_path,
                                                 name=dataset_name,
                                                 user=run.user)
                dataset.copy_permissions(run)
                run.datasets.create(dataset=dataset,
                                    argument=argument)
            except (OSError, IOError) as ex:
                if ex.errno != errno.ENOENT:
                    raise
        logs_path = os.path.join(run.full_sandbox_path, 'logs')
        for file_name, log_type in (('stdout.txt', ContainerLog.STDOUT),
                                    ('stderr.txt', ContainerLog.STDERR)):
            run.load_log(os.path.join(logs_path, file_name), log_type)

        run.set_md5()
        run.state = (ContainerRun.COMPLETE
                     if run.return_code == 0
                     else ContainerRun.FAILED)
        run.end_time = timezone.now()
Example #5
    def save_outputs(self, run):
        output_path = os.path.join(run.full_sandbox_path, 'output')
        upload_path = os.path.join(run.full_sandbox_path, 'upload')
        os.mkdir(upload_path)
        for argument in run.app.arguments.filter(
                type=ContainerArgument.OUTPUT):
            argument_path = os.path.join(output_path, argument.name)
            dataset_name = self.build_dataset_name(run, argument.name)
            new_argument_path = os.path.join(upload_path, dataset_name)
            try:
                os.rename(argument_path, new_argument_path)
                dataset = Dataset.create_dataset(new_argument_path,
                                                 name=dataset_name,
                                                 user=run.user)
                dataset.copy_permissions(run)
                run.datasets.create(dataset=dataset, argument=argument)
            except (OSError, IOError) as ex:
                if ex.errno != errno.ENOENT:
                    raise
        logs_path = os.path.join(run.full_sandbox_path, 'logs')
        for file_name, log_type in (('stdout.txt', ContainerLog.STDOUT),
                                    ('stderr.txt', ContainerLog.STDERR)):
            run.load_log(os.path.join(logs_path, file_name), log_type)

        run.set_md5()
        run.state = (ContainerRun.COMPLETE
                     if run.return_code == 0 else ContainerRun.FAILED)
        run.end_time = timezone.now()
Example #6
 def _save_output_directory_argument(cls, run: ContainerRun,
                                     argument: ContainerArgument,
                                     output_path: str,
                                     upload_path: str) -> None:
     output_path = pathlib.Path(output_path).absolute()
     dirarg_path = output_path / argument.name
     for dirpath, _, filenames in os.walk(dirarg_path):
         dirpath = pathlib.Path(dirpath)
         for filename in filenames:
             datafile_path: pathlib.Path = (dirpath / filename).absolute()
             dataset_filename = cls._build_directory_file_name(
                 run.id, output_path, datafile_path)
             destination_path = os.path.join(upload_path, dataset_filename)
             dataset_name = cls._build_directory_dataset_name(
                 run.id, output_path, datafile_path)
             try:
                 os.rename(datafile_path, destination_path)
                 dataset = Dataset.create_dataset(
                     destination_path,
                     name=dataset_name,
                     user=run.user,
                 )
                 dataset.copy_permissions(run)
                 run.datasets.create(dataset=dataset, argument=argument)
             except (OSError, IOError) as ex:
                 if ex.errno != errno.ENOENT:
                     raise
Example #7
    def create_datasets(self, user):
        """
        Creates the Datasets and the corresponding SymbolicDatasets in the same order as
        cleaned_data["dataset_files"].
        Successful Datasets are still saved to the database even if some of the Datasets fail to create.

        :return:  None and a list of the created Dataset objects in the same order
            as cleaned_data["dataset_files"].
            If a particular Dataset failed to create, the corresponding list element is a dict
            that can be used to inform the user about the file.
        """
        results = []
        for file_size, uploaded_file in self.cleaned_data['dataset_file']:
            # Note that uploaded_file should be seek'd to the beginning.  It was presumably
            # just opened so that should be OK but if this ever changes we will have to fix this.
            dataset = error_str = auto_name = None
            try:
                # TODO:  use correct unique constraints
                name_prefix = ""
                if self.cleaned_data["name_prefix"]:
                    name_prefix = self.cleaned_data["name_prefix"] + "_"
                auto_name = (name_prefix + uploaded_file.name + "_" +
                             datetime.now().strftime('%Y%m%d%H%M%S%f'))

                if self.cleaned_data["description"]:
                    auto_description = self.cleaned_data["description"]
                else:
                    auto_description = "Bulk Uploaded File " + uploaded_file.name

                dataset = Dataset.create_dataset(is_uploaded=True,
                                                 file_path=None,
                                                 user=user,
                                                 keep_file=True,
                                                 name=auto_name,
                                                 description=auto_description,
                                                 file_source=None,
                                                 check=True,
                                                 file_handle=uploaded_file)
                dataset.grant_from_json(self.cleaned_data["permissions"])

            except Exception as e:
                error_str = str(e)
                LOGGER.exception(
                    "Error while creating Dataset for file with original file name="
                    + str(uploaded_file.name) +
                    " and autogenerated Dataset name = " + str(auto_name))

            if dataset and error_str is None:
                results.append(dataset)
            elif error_str and dataset is None:
                results.append({
                    "name": uploaded_file.name,
                    "errstr": error_str,
                    "size": file_size
                })
            else:
                raise ValueError(
                    "Invalid situation: must have either a dataset or an error, not both or neither."
                )

        return None, results
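Per the docstring, the results list mixes Dataset objects (successes) with plain dicts describing failures, so callers have to branch on the element type. A minimal, hedged sketch of consuming the return value (the form and request names are assumptions, not part of the snippet):

# Hedged usage sketch: `form` is assumed to be a validated form instance exposing the
# create_datasets() method shown above, and `request` a Django request object.
_, results = form.create_datasets(request.user)

created = [r for r in results if not isinstance(r, dict)]
failures = [r for r in results if isinstance(r, dict)]
for failure in failures:
    # Each failure dict carries the original file name, the error text and the file size.
    print("Could not create a dataset for %s (%s bytes): %s"
          % (failure["name"], failure["size"], failure["errstr"]))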
Example #8
    def test_removal_skips_inputs(self):
        run = ContainerRun(id=42, state=ContainerRun.COMPLETE)
        dataset = Dataset(id=43)
        argument = ContainerArgument(type=ContainerArgument.INPUT)
        run.datasets.create(dataset=dataset, argument=argument)
        expected_plan = {'ContainerRuns': {run}}

        plan = run.build_removal_plan()

        self.assertEqual(expected_plan, strip_removal_plan(plan))
Example #9
 def test_create_next_month_upload_dir03(self):
     """ Test the creation of a monthly dir, where the dir is already present."""
     dataset_dir = os.path.join(settings.MEDIA_ROOT, Dataset.UPLOAD_DIR)
     date_str = (date.today() + timedelta(days=30)).strftime('%Y_%m')
     next_dirname = os.path.join(dataset_dir, date_str)
     # make the directory iff it doesn't exist
     if not os.path.exists(next_dirname):
         os.makedirs(next_dirname)
     gg = Dataset.idle_create_next_month_upload_dir()
     self.man._add_idletask(gg)
     time_limit = time.time() + 1000.0
     self.man._do_idle_tasks(time_limit)
     self.assertTrue(os.path.exists(next_dirname), "directory was not made")
Example #10
 def test_create_next_month_upload_dir02(self):
     """ Test the creation of a monthly directory where Dataset may be present."""
     dataset_dir = os.path.join(settings.MEDIA_ROOT, Dataset.UPLOAD_DIR)
     date_str = (date.today() + timedelta(days=30)).strftime('%Y_%m')
     next_dirname = os.path.join(dataset_dir, date_str)
     # delete the dir iff it exists.
     try:
         shutil.rmtree(next_dirname)
     except os.error as e:
         if e.errno != errno.ENOENT:
             raise
     gg = Dataset.idle_create_next_month_upload_dir()
     self.man._add_idletask(gg)
     time_limit = time.time() + 1000.0
     self.man._do_idle_tasks(time_limit)
     self.assertTrue(os.path.exists(next_dirname), "directory was not made")
Example #11
    def setUp(self):
        super(RawTests, self).setUp()

        self.addTypeEqualityFunc(str, self.assertMultiLineEqual)
        self.pipeline_raw = tools.make_first_pipeline(
            "raw noop", "a pipeline to do nothing to raw data",
            self.user_bob)
        tools.create_linear_pipeline(self.pipeline_raw, [self.method_noop_raw], "raw_in", "raw_out")
        self.pipeline_raw.create_outputs()

        self.dataset_raw = Dataset.create_dataset(
            "/usr/share/dict/words",
            user=self.user_bob,
            cdt=None,
            keep_file=True,
            name="raw",
            description="some raw data"
        )
Example #12
    def build(self):
        user = User.objects.first()
        assert user is not None
        input_path = os.path.abspath(
            os.path.join(
                __file__,
                '../../../../../samplecode/singularity/host_input/example_names.csv'
            ))
        family = ContainerFamily.objects.create(name='fixture family',
                                                user=user)
        container_path = os.path.abspath(
            os.path.join(
                __file__,
                '../../../../../samplecode/singularity/python2-alpine-trimmed.simg'
            ))
        with open(container_path, "rb") as f:
            container_md5 = compute_md5(f)
        container = family.containers.create(
            tag='vFixture',
            user=user,
            file='Containers/kive-default.simg',
            md5=container_md5)
        app = container.apps.create()
        arg1 = app.arguments.create(type=ContainerArgument.INPUT,
                                    name='names_csv',
                                    position=1)
        app.arguments.create(type=ContainerArgument.OUTPUT,
                             name='greetings_csv',
                             position=2)
        dataset = Dataset.create_dataset(input_path,
                                         name='names.csv',
                                         user=user)
        run = app.runs.create(name='fixture run', user=user)
        run.sandbox_path = ""  # blank this out as it won't be accessible in testing anyway
        run.slurm_job_id = None  # this also would cause tests to fail on a fresh system
        run.save(schedule=False)  # scheduling would overwrite sandbox_path
        run.datasets.create(argument=arg1, dataset=dataset)

        upload_path = os.path.join(settings.MEDIA_ROOT, Container.UPLOAD_DIR)
        readme_path = os.path.join(upload_path, 'README.md')
        os.makedirs(upload_path)
        with open(readme_path, 'w') as f:
            f.write('Just a placeholder to create the folder for containers.')
Example #13
 def _save_output_argument(
     self,
     run: ContainerRun,
     argument: ContainerArgument,
     output_path: str,
     upload_path: str,
 ):
     argument_path = os.path.join(output_path, argument.name)
     dataset_name = self.build_dataset_name(run, argument.name)
     new_argument_path = os.path.join(upload_path, dataset_name)
     try:
         os.rename(argument_path, new_argument_path)
         dataset = Dataset.create_dataset(new_argument_path,
                                          name=dataset_name,
                                          user=run.user)
         dataset.copy_permissions(run)
         run.datasets.create(dataset=dataset, argument=argument)
     except (OSError, IOError) as ex:
         if ex.errno != errno.ENOENT:
             raise
Example #14
    def setUp(self):
        self.ds_owner = User.objects.create_user("Noonian",
                                                 "*****@*****.**",
                                                 "feeeeeeelings")
        self.ds_owner.save()
        self.ds_owner.groups.add(everyone_group())

        self.lore = User.objects.create_user("Lore", "*****@*****.**",
                                             "Asimov's Three Laws")
        self.lore.save()
        self.lore.groups.add(everyone_group())

        self.developers_group = Group.objects.get(pk=groups.DEVELOPERS_PK)

        self.dataset = Dataset.create_empty(user=self.ds_owner)
        self.dataset.name = "Test"
        self.dataset.description = "Test dataset"
        self.dataset.save()

        self.users_to_intersect = User.objects.filter(
            pk__in=[self.ds_owner.pk, self.lore.pk])
        self.groups_to_intersect = Group.objects.filter(
            pk__in=[self.developers_group.pk,
                    everyone_group().pk])
Example #15
    def create(self, validated_data):
        """
        Create a Dataset object from deserialized and validated data.
        """
        cdt = None
        if "structure" in validated_data:
            cdt = validated_data["structure"].get("compounddatatype", None)

        # The default behaviour for keep_file depends on the mode of creation.
        keep_file = True
        file_path = validated_data.get("external_path", "")
        efd = validated_data.get("externalfiledirectory", None)
        # Both or neither are specified (this is enforced in serializer validation).
        if file_path:
            file_path = os.path.join(efd.path, file_path)
            keep_file = False  # don't retain a copy by default

        # Override the default if specified.
        keep_file = validated_data.get("save_in_db", keep_file)

        dataset = Dataset.create_dataset(
            is_uploaded=True,  # Assume serializer is only used for uploads.
            file_path=file_path,
            user=self.context["request"].user,
            users_allowed=validated_data["users_allowed"],
            groups_allowed=validated_data["groups_allowed"],
            cdt=cdt,
            keep_file=keep_file,
            name=validated_data["name"],
            description=validated_data.get("description"),
            file_source=None,
            check=True,
            file_handle=validated_data.get("dataset_file", None),  # should be freshly opened so cursor is at start
            externalfiledirectory=efd
        )
        return dataset
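In Django REST Framework, create() is not called directly: serializer.save() invokes it with validated_data once is_valid() has passed, and self.context["request"] must be supplied by the caller. A minimal sketch of driving the method above, assuming a serializer class named DatasetSerializer (the class name is not shown in the snippet):

# Sketch only: "DatasetSerializer" is an assumed name for the serializer class
# whose create() method is shown above.
serializer = DatasetSerializer(data=request.data,
                               context={"request": request})
serializer.is_valid(raise_exception=True)
dataset = serializer.save()  # DRF routes this to create(validated_data)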
Example #16
def dataset_view(request, dataset_id):
    """
    Display the file associated with the dataset in the browser, or update its name/description.
    """
    return_to_run = request.GET.get('run_id', None)
    is_view_results = "view_results" in request.GET
    is_view_run = "view_run" in request.GET
    return_url = reverse("datasets")
    if return_to_run is not None:
        if is_view_run:
            return_url = reverse('view_run', kwargs={'run_id': return_to_run})
        elif is_view_results:
            return_url = reverse('view_results', kwargs={'run_id': return_to_run})

    try:
        if admin_check(request.user):
            accessible_datasets = Dataset.objects
        else:
            accessible_datasets = Dataset.filter_by_user(request.user)
        dataset = accessible_datasets.prefetch_related(
            'structure',
            'structure__compounddatatype',
            'structure__compounddatatype__members',
            'structure__compounddatatype__members__datatype',
            'structure__compounddatatype__members__datatype__basic_constraints'
        ).get(pk=dataset_id)

    except ObjectDoesNotExist:
        raise Http404("ID {} cannot be accessed".format(dataset_id))

    # Figure out which users and groups could be given access to this Dataset.
    # If the Dataset is uploaded, it's anyone who doesn't already have access;
    # if it was generated, it's anyone who had access to the generating run.
    addable_users, addable_groups = dataset.other_users_groups()

    if dataset.file_source is None:
        generating_run = None
    else:
        generating_run = dataset.file_source.top_level_run
    container_dataset = dataset.containers.filter(argument__type='O').first()  # Output from which runs?
    if container_dataset is None:
        container_run = None
    else:
        container_run = container_dataset.run
    inputs_count = dataset.containers.filter(
        argument__type='I').values('run_id').distinct().count()

    if request.method == "POST":
        # We are going to try and update this Dataset.
        dataset_form = DatasetDetailsForm(
            request.POST,
            access_limits=dataset.get_access_limits(),
            instance=dataset
        )
        try:
            if dataset_form.is_valid():
                dataset.name = dataset_form.cleaned_data["name"]
                dataset.description = dataset_form.cleaned_data["description"]
                dataset.clean()
                dataset.save()
                with transaction.atomic():
                    dataset.grant_from_json(dataset_form.cleaned_data["permissions"])
                    dataset.validate_restrict_access(dataset.get_access_limits())

                return HttpResponseRedirect(return_url)
        except (AttributeError, ValidationError, ValueError) as e:
            LOGGER.exception(e.message)
            dataset_form.add_error(None, e)

    else:
        # A DatasetForm which we can use to make submission and editing easier.
        dataset_form = DatasetDetailsForm(
            access_limits=dataset.get_access_limits(),
            initial={"name": dataset.name, "description": dataset.description}
        )

    c = {
        "is_admin": admin_check(request.user),
        "is_owner": dataset.user == request.user,
        "dataset": dataset,
        "return": return_url,
        "dataset_form": dataset_form,
        "generating_run": generating_run,
        "inputs_count": inputs_count,
        "container_run": container_run
    }

    if not dataset.has_data():
        t = loader.get_template("librarian/missing_dataset_view.html")
        if dataset.external_path:
            c["missing_data_message"] = "This dataset's external file is missing or has "\
                                        "been modified (MD5 mismatch).  " \
                                        "Please consult your system administrator if this is unexpected."
        elif dataset.is_redacted():
            c["missing_data_message"] = "Data has been redacted."
        else:
            c["missing_data_message"] = "Data was not retained or has been purged."
        rendered_response = t.render(c, request)

    elif dataset.is_raw():
        t = loader.get_template("librarian/raw_dataset_view.html")

        # Test whether this is a binary file or not.
        # Read 1000 characters.
        data_handle = dataset.get_open_file_handle('r')
        if data_handle is None:
            c["missing_data_message"] = "Data has been removed or renamed."
        else:
            with data_handle:
                sample_content = data_handle.read(1000)
            c.update({"sample_content": sample_content})
        c["is_binary"] = False
        try:
            rendered_response = t.render(c, request)
        except DjangoUnicodeDecodeError as e:
            c["is_binary"] = True
            del c["sample_content"]
            rendered_response = t.render(c, request)
    else:
        extra_errors = []
        # If we have a mismatched output, we do an alignment
        # over the columns.
        if dataset.content_matches_header:
            col_matching, processed_rows = None, dataset.rows(
                True,
                limit=settings.DATASET_DISPLAY_MAX,
                extra_errors=extra_errors)
        else:
            col_matching, insert = dataset.column_alignment()
            processed_rows = dataset.rows(data_check=True,
                                          insert_at=insert,
                                          limit=settings.DATASET_DISPLAY_MAX,
                                          extra_errors=extra_errors)
        t = loader.get_template("librarian/csv_dataset_view.html")
        processed_rows = list(processed_rows)
        c.update(
            {
                'column_matching': col_matching,
                'processed_rows': processed_rows,
                'extra_errors': extra_errors,
                "are_rows_truncated": len(processed_rows) >= settings.DATASET_DISPLAY_MAX
            }
        )
        rendered_response = t.render(c, request)
    return HttpResponse(rendered_response)
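dataset_view takes the dataset_id from the URL and optional run_id / view_run / view_results query parameters for the return link. A hypothetical URLconf entry for wiring it up (the pattern, URL name and module path are assumptions, not taken from the project):

# Hypothetical urls.py entry; only the view function itself comes from the code above.
from django.urls import path

from librarian.views import dataset_view

urlpatterns = [
    path("datasets/<int:dataset_id>/", dataset_view, name="dataset_view"),
]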
Example #17
    def create_datasets(self, user):
        """
        Creates the Datasets and the corresponding SymbolicDatasets in the same order as
        cleaned_data["dataset_files"].
        Successful Datasets are still saved to the database even if some of the Datasets fail to create.

        :return:  CDT object and a list of the created Dataset objects in the same order
            as cleaned_data["dataset_files"].
            If a particular Dataset failed to create, the corresponding list element is a dict
            that can be used to inform the user about the file.
        """
        compound_datatype_obj = None
        if self.cleaned_data['compound_datatype'] != CompoundDatatype.RAW_ID:
            compound_datatype_obj = CompoundDatatype.objects.get(pk=self.cleaned_data['compound_datatype'])

        results = []
        for file_size, uploaded_file in self.cleaned_data['dataset_file']:
            # Note that uploaded_file should be seek'd to the beginning.  It was presumably
            # just opened so that should be OK but if this ever changes we will have to fix this.
            dataset = error_str = auto_name = None
            try:
                # TODO:  use correct unique constraints
                name_prefix = ""
                if self.cleaned_data["name_prefix"]:
                    name_prefix = self.cleaned_data["name_prefix"] + "_"
                auto_name = name_prefix + uploaded_file.name + "_" + datetime.now().strftime('%Y%m%d%H%M%S%f')

                if self.cleaned_data["description"]:
                    auto_description = self.cleaned_data["description"]
                else:
                    auto_description = "Bulk Uploaded File " + uploaded_file.name

                dataset = Dataset.create_dataset(
                    is_uploaded=True,
                    file_path=None,
                    user=user,
                    cdt=compound_datatype_obj,
                    keep_file=True,
                    name=auto_name,
                    description=auto_description,
                    file_source=None,
                    check=True,
                    file_handle=uploaded_file
                )
                dataset.grant_from_json(self.cleaned_data["permissions"])

            except Exception as e:
                error_str = str(e)
                LOGGER.exception("Error while creating Dataset for file with original file name=" +
                                 str(uploaded_file.name) +
                                 " and autogenerated Dataset name = " +
                                 str(auto_name))

            if dataset and error_str is None:
                results.append(dataset)
            elif error_str and dataset is None:
                results.append({"name": uploaded_file.name,
                                "errstr": error_str,
                                "size": file_size})
            else:
                raise ValueError("Invalid situation.  Must either have a dataset or error.  Can not have both or none.")

        return compound_datatype_obj, results
Example #18
    def purge(self, start, stop, dataset_aging, log_aging, sandbox_aging,
              batch_size):
        logger.debug('Starting purge.')
        container_total = self.set_file_sizes(Container, 'file', 'file_size',
                                              'created')
        sandbox_total = self.set_file_sizes(ContainerRun, 'sandbox_path',
                                            'sandbox_size', 'end_time')
        log_total = self.set_file_sizes(ContainerLog, 'long_text', 'log_size',
                                        'run__end_time')
        dataset_total = self.set_file_sizes(Dataset, 'dataset_file',
                                            'dataset_size', 'date_created')

        total_storage = remaining_storage = (container_total + sandbox_total +
                                             log_total + dataset_total)
        if total_storage <= start:
            storage_text = self.summarize_storage(container_total,
                                                  dataset_total, sandbox_total,
                                                  log_total)
            logger.debug(u"No purge needed for %s: %s.",
                         filesizeformat(total_storage), storage_text)
            return

        sandbox_ages = ContainerRun.find_unneeded().annotate(
            entry_type=Value('r', models.CharField()),
            age=ExpressionWrapper(sandbox_aging * (Now() - F('end_time')),
                                  output_field=DurationField())).values_list(
                                      'entry_type', 'id', 'age').order_by()

        log_ages = ContainerLog.find_unneeded().annotate(
            entry_type=Value('l', models.CharField()),
            age=ExpressionWrapper(log_aging * (Now() - F('run__end_time')),
                                  output_field=DurationField())).values_list(
                                      'entry_type', 'id', 'age').order_by()

        dataset_ages = Dataset.find_unneeded().annotate(
            entry_type=Value('d', models.CharField()),
            age=ExpressionWrapper(dataset_aging * (Now() - F('date_created')),
                                  output_field=FloatField())).values_list(
                                      'entry_type', 'id', 'age').order_by()

        purge_counts = Counter()
        max_purge_dates = {}
        min_purge_dates = {}
        purge_entries = sandbox_ages.union(log_ages, dataset_ages,
                                           all=True).order_by('-age')
        while remaining_storage > stop:
            entry_count = 0
            for entry_type, entry_id, age in purge_entries[:batch_size]:
                entry_count += 1
                if entry_type == 'r':
                    run = ContainerRun.objects.get(id=entry_id)
                    entry_size = run.sandbox_size
                    entry_date = run.end_time
                    logger.debug("Purged container run %d containing %s.",
                                 run.pk, filesizeformat(entry_size))
                    try:
                        run.delete_sandbox()
                    except OSError:
                        logger.error(
                            u"Failed to purge container run %d at %r.",
                            run.id,
                            run.sandbox_path,
                            exc_info=True)
                        run.sandbox_path = ''
                    run.save()
                elif entry_type == 'l':
                    log = ContainerLog.objects.get(id=entry_id)
                    entry_size = log.log_size
                    entry_date = log.run.end_time
                    logger.debug("Purged container log %d containing %s.",
                                 log.id, filesizeformat(entry_size))
                    log.long_text.delete()
                else:
                    assert entry_type == 'd'
                    dataset = Dataset.objects.get(id=entry_id)
                    entry_size = dataset.dataset_size
                    dataset_total -= dataset.dataset_size
                    entry_date = dataset.date_created
                    logger.debug("Purged dataset %d containing %s.",
                                 dataset.pk, filesizeformat(entry_size))
                    dataset.dataset_file.delete()
                purge_counts[entry_type] += 1
                purge_counts[entry_type + ' bytes'] += entry_size
                # PyCharm false positives...
                # noinspection PyUnresolvedReferences
                min_purge_dates[entry_type] = min(
                    entry_date, min_purge_dates.get(entry_type, entry_date))
                # noinspection PyUnresolvedReferences
                max_purge_dates[entry_type] = max(
                    entry_date, max_purge_dates.get(entry_type, entry_date))
                remaining_storage -= entry_size
                if remaining_storage <= stop:
                    break
            if entry_count == 0:
                break
        for entry_type, entry_name in (('r', 'container run'),
                                       ('l', 'container log'), ('d',
                                                                'dataset')):
            purged_count = purge_counts[entry_type]
            if not purged_count:
                continue
            min_purge_date = min_purge_dates[entry_type]
            max_purge_date = max_purge_dates[entry_type]
            collective = entry_name + pluralize(purged_count)
            bytes_removed = purge_counts[entry_type + ' bytes']
            start_text = naturaltime(min_purge_date)
            end_text = naturaltime(max_purge_date)
            date_range = (start_text if start_text == end_text else
                          start_text + ' to ' + end_text)
            logger.info("Purged %d %s containing %s from %s.", purged_count,
                        collective, filesizeformat(bytes_removed), date_range)
        if remaining_storage > stop:
            storage_text = self.summarize_storage(container_total,
                                                  dataset_total)
            logger.error('Cannot reduce storage to %s: %s.',
                         filesizeformat(stop), storage_text)
Example #19
# A dummy Datatype with a prototype.
with tempfile.TemporaryFile() as f:
    f.write("""example,valid
True,True
true,False
y,False
n,False
False,False
false,false""")
    f.seek(0)
    proto_SD = Dataset.create_dataset(
        file_path=None,
        user=kive_user(),
        cdt=CompoundDatatype.objects.get(pk=CDTs.PROTOTYPE_PK),
        name="AlwaysTruePrototype",
        description="Prototype for dummy Datatype",
        file_handle=f
    )

always_true = Datatype(
    user=kive_user(),
    name="Python True",
    description="True in python",
    proto_SD=proto_SD
)
always_true.save()
always_true.restricts.add(Datatype.objects.get(pk=datatypes.BOOL_PK))

always_true.basic_constraints.create(
    ruletype=BasicConstraint.REGEXP,
    rule="True")
Example #20
def datasets_add_archive(request):
    """
    Add datasets in bulk to db from an archive file (zip or tarfile).
    Redirect to /datasets_bulk view so user can examine upload status of each dataset.
    """
    c = {}
    # If we got posted to, try to create DB entries
    if request.method == 'POST':
        try:
            archive_add_dataset_form = ArchiveAddDatasetForm(
                data=request.POST, files=request.FILES)
            # Try to retrieve new datasets. If this fails, we return to our current page
            is_ok = archive_add_dataset_form.is_valid()
            if is_ok:
                CDT_obj, add_results = archive_add_dataset_form.create_datasets(
                    request.user)
                is_ok = len(add_results) > 0
            if not is_ok:
                # give up and let user try again
                t = loader.get_template('librarian/datasets_add_archive.html')
                c = {'archiveAddDatasetForm': archive_add_dataset_form}
                return HttpResponse(t.render(c, request))
            # have some files in the archive, let's display them
            # NOTE: at this point, we have a list of files in the archive.
            # some files might be legit, others not.
            # we have to cobble together information from add_results and the form cleaned data
            # for display.
            uploaded_files = archive_add_dataset_form.cleaned_data[
                "dataset_file"]

            if len(uploaded_files) != len(add_results):
                raise RuntimeError("List length mismatch")
            t = loader.get_template('librarian/datasets_bulk.html')
            # Now have add_results, a list of elements e, where e is either
            # a dataset if the dataset was successfully created
            # or
            # a dict if a dataset was not successfully created
            # Generate a response
            archive_display_results = []
            # Fill in default values for the form fields
            for add_result, upload_info in zip(add_results, uploaded_files):
                display_result = {}
                if isinstance(add_result, dict):
                    # the dataset is invalid
                    display_result["name"] = add_result["name"]
                    display_result["description"] = ""
                    display_result["orig_filename"] = add_result["name"]
                    display_result["filesize"] = add_result["size"]
                    display_result["md5"] = ""
                    display_result["id"] = ""
                    display_result["is_valid"] = False
                else:
                    display_result["name"] = add_result.name
                    display_result["description"] = add_result.description
                    # This is the original filename as uploaded by the client, not the filename as stored
                    # on the file server.
                    display_result["orig_filename"] = upload_info[1].name
                    display_result["filesize"] = add_result.get_formatted_filesize()
                    display_result["md5"] = add_result.compute_md5()
                    display_result["id"] = add_result.id
                    display_result["is_valid"] = True
                archive_display_results.append(display_result)

            # now create forms from the display results.
            BulkDatasetUpdateFormSet = formset_factory(
                form=BulkDatasetUpdateForm,
                max_num=len(archive_display_results))
            bulk_dataset_update_formset = BulkDatasetUpdateFormSet(
                initial=archive_display_results)

            # Fill in the attributes that are not fields in the form
            # These are not set by the BulkDatasetUpdateFormSet(initial=...) parameter,
            # so we have to tweak the forms after they have been created
            for dataset_form, display_result, add_result in zip(
                    bulk_dataset_update_formset, archive_display_results,
                    add_results):
                if display_result["is_valid"]:
                    dataset_form.dataset = add_result
                    dataset_form.status = BulkDatasetDisplay.STATUS_SUCCESS
                else:
                    dataset_form.dataset = Dataset()
                    dataset_form.non_field_errors = add_result["errstr"]
                    dataset_form.status = BulkDatasetDisplay.STATUS_FAIL

            # finally, add some other pertinent information which the template will display
            num_files_added = sum(
                [a["is_valid"] for a in archive_display_results])
            c["bulk_dataset_formset"] = bulk_dataset_update_formset
            c["num_files_selected"] = len(add_results)
            c["num_files_added"] = num_files_added
            c["cdt_typestr"] = "Unstructured" if CDT_obj is None else CDT_obj
        except ValidationError as e:
            LOGGER.exception(e.message)
            archive_add_dataset_form.add_error(None, e)
            t = loader.get_template('librarian/datasets_add_archive.html')
            c.update({'archiveAddDatasetForm': archive_add_dataset_form})

    else:  # return an empty form for the user to fill in
        t = loader.get_template('librarian/datasets_add_archive.html')
        c['archiveAddDatasetForm'] = ArchiveAddDatasetForm()

    return HttpResponse(t.render(c, request))
Example #21
from django.core.files import File
from django.contrib.auth.models import User

import metadata.models
from librarian.models import Dataset
import method.models
import kive.testing_utils as tools

# This comes from the initial_user fixture.
kive_user = User.objects.get(pk=1)

test_fasta = Dataset.create_dataset(
    file_path="../samplecode/step_0_raw.fasta",
    user=kive_user,
    cdt=None,
    keep_file=True,
    name="TestFASTA",
    description="Toy FASTA file for testing pipelines")

# Set up a test Pipeline.
resource = method.models.CodeResource(name="Fasta2CSV",
                                      description="FASTA converter script",
                                      filename="Fasta2CSV.py")
resource.clean()
resource.save()
with open("../samplecode/fasta2csv.py", "rb") as f:
    revision = method.models.CodeResourceRevision(
        coderesource=resource,
        revision_name="v1",
        revision_desc="First version",
        content_file=File(f))
    revision.clean()
Example #22
    def purge(self,
              start,
              stop,
              dataset_aging,
              log_aging,
              sandbox_aging,
              batch_size):
        logger.debug('Starting purge.')
        container_total = self.set_file_sizes(Container,
                                              'file',
                                              'file_size',
                                              'created')
        sandbox_total = self.set_file_sizes(ContainerRun,
                                            'sandbox_path',
                                            'sandbox_size',
                                            'end_time')
        log_total = self.set_file_sizes(ContainerLog,
                                        'long_text',
                                        'log_size',
                                        'run__end_time')
        dataset_total = self.set_file_sizes(Dataset,
                                            'dataset_file',
                                            'dataset_size',
                                            'date_created')

        total_storage = remaining_storage = (
                container_total + sandbox_total + log_total + dataset_total)
        if total_storage <= start:
            storage_text = self.summarize_storage(container_total,
                                                  dataset_total,
                                                  sandbox_total,
                                                  log_total)
            logger.debug(u"No purge needed for %s: %s.",
                         filesizeformat(total_storage),
                         storage_text)
            return

        sandbox_ages = ContainerRun.find_unneeded().annotate(
            entry_type=Value('r', models.CharField()),
            age=sandbox_aging * (Now() - F('end_time'))).values_list(
            'entry_type',
            'id',
            'age').order_by()

        log_ages = ContainerLog.find_unneeded().annotate(
            entry_type=Value('l', models.CharField()),
            age=log_aging * (Now() - F('run__end_time'))).values_list(
            'entry_type',
            'id',
            'age').order_by()

        dataset_ages = Dataset.find_unneeded().annotate(
            entry_type=Value('d', models.CharField()),
            age=dataset_aging * (Now() - F('date_created'))).values_list(
            'entry_type',
            'id',
            'age').order_by()

        purge_counts = Counter()
        max_purge_dates = {}
        min_purge_dates = {}
        purge_entries = sandbox_ages.union(log_ages,
                                           dataset_ages,
                                           all=True).order_by('-age')
        while remaining_storage > stop:
            entry_count = 0
            for entry_type, entry_id, age in purge_entries[:batch_size]:
                entry_count += 1
                if entry_type == 'r':
                    run = ContainerRun.objects.get(id=entry_id)
                    entry_size = run.sandbox_size
                    entry_date = run.end_time
                    logger.debug("Purged container run %d containing %s.",
                                 run.pk,
                                 filesizeformat(entry_size))
                    try:
                        run.delete_sandbox()
                    except OSError:
                        logger.error(u"Failed to purge container run %d at %r.",
                                     run.id,
                                     run.sandbox_path,
                                     exc_info=True)
                        run.sandbox_path = ''
                    run.save()
                elif entry_type == 'l':
                    log = ContainerLog.objects.get(id=entry_id)
                    entry_size = log.log_size
                    entry_date = log.run.end_time
                    logger.debug("Purged container log %d containing %s.",
                                 log.id,
                                 filesizeformat(entry_size))
                    log.long_text.delete()
                else:
                    assert entry_type == 'd'
                    dataset = Dataset.objects.get(id=entry_id)
                    entry_size = dataset.dataset_size
                    dataset_total -= dataset.dataset_size
                    entry_date = dataset.date_created
                    logger.debug("Purged dataset %d containing %s.",
                                 dataset.pk,
                                 filesizeformat(entry_size))
                    dataset.dataset_file.delete()
                purge_counts[entry_type] += 1
                purge_counts[entry_type + ' bytes'] += entry_size
                # PyCharm false positives...
                # noinspection PyUnresolvedReferences
                min_purge_dates[entry_type] = min(entry_date,
                                                  min_purge_dates.get(entry_type, entry_date))
                # noinspection PyUnresolvedReferences
                max_purge_dates[entry_type] = max(entry_date,
                                                  max_purge_dates.get(entry_type, entry_date))
                remaining_storage -= entry_size
                if remaining_storage <= stop:
                    break
            if entry_count == 0:
                break
        for entry_type, entry_name in (('r', 'container run'),
                                       ('l', 'container log'),
                                       ('d', 'dataset')):
            purged_count = purge_counts[entry_type]
            if not purged_count:
                continue
            min_purge_date = min_purge_dates[entry_type]
            max_purge_date = max_purge_dates[entry_type]
            collective = entry_name + pluralize(purged_count)
            bytes_removed = purge_counts[entry_type + ' bytes']
            start_text = naturaltime(min_purge_date)
            end_text = naturaltime(max_purge_date)
            date_range = (start_text
                          if start_text == end_text
                          else start_text + ' to ' + end_text)
            logger.info("Purged %d %s containing %s from %s.",
                        purged_count,
                        collective,
                        filesizeformat(bytes_removed),
                        date_range)
        if remaining_storage > stop:
            storage_text = self.summarize_storage(container_total,
                                                  dataset_total)
            logger.error('Cannot reduce storage to %s: %s.',
                         filesizeformat(stop),
                         storage_text)
Example #23
prototype_CDT = CompoundDatatype.objects.get(pk=CDTs.PROTOTYPE_PK)

# A dummy Datatype with a prototype.
with tempfile.TemporaryFile() as f:
    f.write("""example,valid
True,True
true,False
y,False
n,False
False,False
false,false""")
    f.seek(0)
    proto_SD = Dataset.create_dataset(
        file_path=None,
        user=kive_user(),
        cdt=CompoundDatatype.objects.get(pk=CDTs.PROTOTYPE_PK),
        name="AlwaysTruePrototype",
        description="Prototype for dummy Datatype",
        file_handle=f)

always_true = Datatype(user=kive_user(),
                       name="Python True",
                       description="True in python",
                       proto_SD=proto_SD)
always_true.save()
always_true.restricts.add(Datatype.objects.get(pk=datatypes.BOOL_PK))

always_true.basic_constraints.create(ruletype=BasicConstraint.REGEXP,
                                     rule="True")
Example #24
def datasets_add_bulk(request):
    """
    Add datasets in bulk to db.  Redirect to /datasets_bulk view so user can examine upload
    status of each dataset.
    """
    # Redirect to page to allow user to view status of added datasets.
    c = {}
    if request.method == 'POST':
        try:
            # Add new datasets.
            bulk_add_dataset_form = BulkAddDatasetForm(data=request.POST,
                                                       files=request.FILES)
            isok = bulk_add_dataset_form.is_valid()
            if isok:
                CDT_obj, add_results = bulk_add_dataset_form.create_datasets(
                    request.user)
                isok = len(add_results) > 0
            if not isok:
                # give up and let user try again
                t = loader.get_template('librarian/datasets_add_bulk.html')
                c = {'bulkAddDatasetForm': bulk_add_dataset_form}
                return HttpResponse(t.render(c, request))

            # Generate response.
            uploaded_files = bulk_add_dataset_form.cleaned_data[
                "dataset_files"]
            if len(uploaded_files) != len(add_results):
                raise RuntimeError("List length mismatch")

            t = loader.get_template('librarian/datasets_bulk.html')
            bulk_display_results = []
            # Fill in default values for the form fields
            for add_result, upload_info in zip(add_results, uploaded_files):
                display_result = {}
                if isinstance(add_result, dict):
                    # dataset is invalid
                    display_result["name"] = add_result["name"]
                    display_result["description"] = ""
                    display_result["orig_filename"] = add_result["name"]
                    display_result["filesize"] = add_result["size"]
                    display_result["md5"] = ""
                    display_result["id"] = ""
                    display_result["is_valid"] = False
                else:
                    display_result["name"] = add_result.name
                    display_result["description"] = add_result.description
                    # This is the original filename as uploaded by the client, not the filename as stored
                    # on the file server.
                    display_result["orig_filename"] = upload_info[1].name
                    display_result["filesize"] = add_result.get_formatted_filesize()
                    display_result["md5"] = add_result.compute_md5()
                    display_result["id"] = add_result.id
                    display_result["is_valid"] = True
                bulk_display_results.append(display_result)

            BulkDatasetUpdateFormSet = formset_factory(
                form=BulkDatasetUpdateForm, max_num=len(bulk_display_results))
            bulk_dataset_update_formset = BulkDatasetUpdateFormSet(
                initial=bulk_display_results)

            # Fill in the attributes that are not fields in the form
            # These are not set by the BulkDatasetUpdateFormSet(initial=...) parameter
            for dataset_form, display_result, add_result in zip(
                    bulk_dataset_update_formset, bulk_display_results,
                    add_results):
                if display_result["is_valid"]:
                    dataset_form.dataset = add_result
                    dataset_form.status = BulkDatasetDisplay.STATUS_SUCCESS
                else:
                    dataset_form.dataset = Dataset()
                    dataset_form.non_field_errors = add_result["errstr"]
                    dataset_form.status = BulkDatasetDisplay.STATUS_FAIL

            # finally, add some other pertinent information which the template will display
            num_files_added = sum(
                [a["is_valid"] for a in bulk_display_results])
            c["bulk_dataset_formset"] = bulk_dataset_update_formset
            c["num_files_selected"] = len(add_results)
            c["num_files_added"] = num_files_added
            c["cdt_typestr"] = "Unstructured" if CDT_obj is None else CDT_obj

        except ValidationError as e:
            LOGGER.exception(e.message)
            bulk_add_dataset_form.add_error(None, e)
            t = loader.get_template('librarian/datasets_add_bulk.html')
            c.update({'bulkAddDatasetForm': bulk_add_dataset_form})

    else:  # return an empty form for the user to fill in
        t = loader.get_template('librarian/datasets_add_bulk.html')
        c.update({'bulkAddDatasetForm': BulkAddDatasetForm()})

    return HttpResponse(t.render(c, request))
Example #25
from django.core.files import File
from django.contrib.auth.models import User

import metadata.models
from librarian.models import Dataset
import method.models
import kive.testing_utils as tools

# This comes from the initial_user fixture.
kive_user = User.objects.get(pk=1)

test_fasta = Dataset.create_dataset(
    file_path="../samplecode/step_0_raw.fasta",
    user=kive_user,
    cdt=None,
    keep_file=True,
    name="TestFASTA",
    description="Toy FASTA file for testing pipelines"
)

# Set up a test Pipeline.
resource = method.models.CodeResource(name="Fasta2CSV", description="FASTA converter script", filename="Fasta2CSV.py")
resource.clean()
resource.save()
with open("../samplecode/fasta2csv.py", "rb") as f:
    revision = method.models.CodeResourceRevision(
        coderesource=resource,
        revision_name="v1",
        revision_desc="First version",
        content_file=File(f))
    revision.clean()
Example #26
 def filter_granted(self, queryset):
     """ Filter a queryset to only include records explicitly granted.
     """
     return Dataset.filter_by_user(self.request.user)
Example #27
 def __init__(self, *args, **kwargs):
     super(BulkDatasetUpdateForm, self).__init__(*args, **kwargs)
     self.dataset = Dataset()
     self.status = 0
Example #28
 def filter_granted(self, queryset):
     """ Filter a queryset to only include records explicitly granted.
     """
     return Dataset.filter_by_user(self.request.user)
Example #29
def dataset_view(request, dataset_id):
    """
    Display the file associated with the dataset in the browser, or update its name/description.
    """
    return_to_run = request.GET.get('run_id', None)
    is_view_results = "view_results" in request.GET
    is_view_run = "view_run" in request.GET
    return_url = reverse("datasets")
    if return_to_run is not None:
        if is_view_run:
            return_url = reverse('view_run', kwargs={'run_id': return_to_run})
        elif is_view_results:
            return_url = reverse('view_results',
                                 kwargs={'run_id': return_to_run})

    try:
        if admin_check(request.user):
            accessible_datasets = Dataset.objects
        else:
            accessible_datasets = Dataset.filter_by_user(request.user)
        dataset = accessible_datasets.prefetch_related(
            'structure', 'structure__compounddatatype',
            'structure__compounddatatype__members',
            'structure__compounddatatype__members__datatype',
            'structure__compounddatatype__members__datatype__basic_constraints'
        ).get(pk=dataset_id)

    except ObjectDoesNotExist:
        raise Http404("ID {} cannot be accessed".format(dataset_id))

    # Figure out which users and groups could be given access to this Dataset.
    # If the Dataset is uploaded, it's anyone who doesn't already have access;
    # if it was generated, it's anyone who had access to the generating run.
    addable_users, addable_groups = dataset.other_users_groups()

    if dataset.file_source is None:
        generating_run = None
    else:
        generating_run = dataset.file_source.top_level_run
    container_dataset = dataset.containers.filter(
        argument__type='O').first()  # Output from which runs?
    if container_dataset is None:
        container_run = None
    else:
        container_run = container_dataset.run
    inputs_count = dataset.containers.filter(
        argument__type='I').values('run_id').distinct().count()

    if request.method == "POST":
        # We are going to try and update this Dataset.
        dataset_form = DatasetDetailsForm(
            request.POST,
            access_limits=dataset.get_access_limits(),
            instance=dataset)
        try:
            if dataset_form.is_valid():
                dataset.name = dataset_form.cleaned_data["name"]
                dataset.description = dataset_form.cleaned_data["description"]
                dataset.clean()
                dataset.save()
                with transaction.atomic():
                    dataset.grant_from_json(
                        dataset_form.cleaned_data["permissions"])
                    dataset.validate_restrict_access(
                        dataset.get_access_limits())

                return HttpResponseRedirect(return_url)
        except (AttributeError, ValidationError, ValueError) as e:
            LOGGER.exception(str(e))
            dataset_form.add_error(None, e)

    else:
        # A DatasetForm which we can use to make submission and editing easier.
        dataset_form = DatasetDetailsForm(
            access_limits=dataset.get_access_limits(),
            initial={
                "name": dataset.name,
                "description": dataset.description
            })

    c = {
        "is_admin": admin_check(request.user),
        "is_owner": dataset.user == request.user,
        "dataset": dataset,
        "return": return_url,
        "dataset_form": dataset_form,
        "generating_run": generating_run,
        "inputs_count": inputs_count,
        "container_run": container_run
    }

    if not dataset.has_data():
        t = loader.get_template("librarian/missing_dataset_view.html")
        if dataset.external_path:
            c["missing_data_message"] = "This dataset's external file is missing or has "\
                                        "been modified (MD5 mismatch).  " \
                                        "Please consult your system administrator if this is unexpected."
        elif dataset.is_redacted():
            c["missing_data_message"] = "Data has been redacted."
        else:
            c["missing_data_message"] = "Data was not retained or has been purged."
        rendered_response = t.render(c, request)

    elif dataset.is_raw():
        t = loader.get_template("librarian/raw_dataset_view.html")

        # Test whether this is a binary file or not.
        # Read 1000 characters.
        data_handle = dataset.get_open_file_handle('r')
        if data_handle is None:
            c["missing_data_message"] = "Data has been removed or renamed."
        else:
            with data_handle:
                sample_content = data_handle.read(1000)
            c.update({"sample_content": sample_content})
        c["is_binary"] = False
        try:
            rendered_response = t.render(c, request)
        except DjangoUnicodeDecodeError:
            c["is_binary"] = True
            del c["sample_content"]
            rendered_response = t.render(c, request)
    else:
        extra_errors = []
        # If we have a mismatched output, we do an alignment
        # over the columns.
        if dataset.content_matches_header:
            col_matching = None
            processed_rows = dataset.rows(data_check=True,
                                          limit=settings.DATASET_DISPLAY_MAX,
                                          extra_errors=extra_errors)
        else:
            col_matching, insert = dataset.column_alignment()
            processed_rows = dataset.rows(data_check=True,
                                          insert_at=insert,
                                          limit=settings.DATASET_DISPLAY_MAX,
                                          extra_errors=extra_errors)
        t = loader.get_template("librarian/csv_dataset_view.html")
        processed_rows = list(processed_rows)
        c.update({
            'column_matching': col_matching,
            'processed_rows': processed_rows,
            'extra_errors': extra_errors,
            'are_rows_truncated':
                len(processed_rows) >= settings.DATASET_DISPLAY_MAX
        })
        rendered_response = t.render(c, request)
    return HttpResponse(rendered_response)
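
Before the next variant, a minimal, hypothetical URL configuration for this view; the route pattern, the module path librarian.views, and the URL name 'dataset_view' are illustrative assumptions, not taken from the source:

from django.urls import re_path

from librarian.views import dataset_view  # assumed module path

urlpatterns = [
    # Assumed route: /dataset_view/<dataset_id>/
    re_path(r'^dataset_view/(?P<dataset_id>\d+)/$',
            dataset_view,
            name='dataset_view'),
]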
Exemple #30
0
def dataset_view(request, dataset_id):
    """
    Display the file associated with the dataset in the browser, or update its name/description.
    """
    return_url = reverse("datasets")

    try:
        if admin_check(request.user):
            accessible_datasets = Dataset.objects
        else:
            accessible_datasets = Dataset.filter_by_user(request.user)
        dataset = accessible_datasets.get(pk=dataset_id)

    except ObjectDoesNotExist:
        raise Http404("ID {} cannot be accessed".format(dataset_id))

    # Figure out which users and groups could be given access to this Dataset.
    # If the Dataset is uploaded, it's anyone who doesn't already have access;
    # if it was generated, it's anyone who had access to the generating run.
    addable_users, addable_groups = dataset.other_users_groups()

    generating_run = None
    container_dataset = dataset.containers.filter(
        argument__type='O').first()  # Output from which runs?
    if container_dataset is None:
        container_run = None
    else:
        container_run = container_dataset.run
    inputs_count = dataset.containers.filter(
        argument__type='I').values('run_id').distinct().count()

    if request.method == "POST":
        # We are going to try and update this Dataset.
        dataset_form = DatasetDetailsForm(
            request.POST,
            access_limits=dataset.get_access_limits(),
            instance=dataset)
        try:
            if dataset_form.is_valid():
                dataset.name = dataset_form.cleaned_data["name"]
                dataset.description = dataset_form.cleaned_data["description"]
                dataset.clean()
                dataset.save()
                with transaction.atomic():
                    dataset.grant_from_json(
                        dataset_form.cleaned_data["permissions"])
                    dataset.validate_restrict_access(
                        dataset.get_access_limits())

                return HttpResponseRedirect(return_url)
        except (AttributeError, ValidationError, ValueError) as e:
            LOGGER.exception(str(e))
            dataset_form.add_error(None, e)

    else:
        # A DatasetForm which we can use to make submission and editing easier.
        dataset_form = DatasetDetailsForm(
            access_limits=dataset.get_access_limits(),
            initial={
                "name": dataset.name,
                "description": dataset.description
            })

    c = {
        "is_admin": admin_check(request.user),
        "is_owner": dataset.user == request.user,
        "dataset": dataset,
        "return": return_url,
        "dataset_form": dataset_form,
        "generating_run": generating_run,
        "inputs_count": inputs_count,
        "container_run": container_run
    }

    if not dataset.has_data():
        t = loader.get_template("librarian/missing_dataset_view.html")
        if dataset.external_path:
            c["missing_data_message"] = "This dataset's external file is missing or has "\
                                        "been modified (MD5 mismatch).  " \
                                        "Please consult your system administrator if this is unexpected."
        elif dataset.is_redacted():
            c["missing_data_message"] = "Data has been redacted."
        else:
            c["missing_data_message"] = "Data was not retained or has been purged."
        rendered_response = t.render(c, request)

    else:
        t = loader.get_template("librarian/raw_dataset_view.html")

        # Test whether this is a binary file or not.
        # Read 1000 characters.
        data_handle = dataset.get_open_file_handle('r')
        if data_handle is None:
            c["missing_data_message"] = "Data has been removed or renamed."
        else:
            with data_handle:
                sample_content = data_handle.read(1000)
            c.update({"sample_content": sample_content})
        c["is_binary"] = False
        try:
            rendered_response = t.render(c, request)
        except DjangoUnicodeDecodeError:
            c["is_binary"] = True
            del c["sample_content"]
            rendered_response = t.render(c, request)
    return HttpResponse(rendered_response)
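
Finally, a hedged testing sketch (not part of the scraped examples) showing how either dataset_view variant can be exercised directly with Django's RequestFactory; the primary key 1 and the reuse of kive_user from the fixture snippet above are assumptions:

from django.test import RequestFactory

factory = RequestFactory()
request = factory.get('/dataset_view/1/')  # path is illustrative; the view only uses dataset_id
request.user = kive_user                   # user loaded earlier from the initial_user fixture
response = dataset_view(request, dataset_id=1)  # assumes a Dataset with pk=1 is accessible
assert response.status_code == 200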