Example #1
    def _instantiate_datafile(self, file):
        """Instantiate a datafile from multiple input formats.

        :param str|dict|octue.resources.datafile.Datafile file: the file as a path, in serialised form, or as an existing datafile instance
        :return octue.resources.datafile.Datafile: the instantiated datafile
        """
        if isinstance(file, Datafile):
            return file

        if isinstance(file, str):
            return Datafile(path=file, hypothetical=self._hypothetical)

        return Datafile.deserialise(file)
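Taken together, the branches normalise any supported input into a single `Datafile` instance. A minimal usage sketch, assuming a `Dataset`-like object `dataset` exposing this method (the paths and the serialised form shown are illustrative):

    existing = Datafile(path="data/file_0.csv", hypothetical=True)

    # A Datafile instance is returned unchanged.
    assert dataset._instantiate_datafile(existing) is existing

    # A string is treated as a (local or cloud) path.
    from_path = dataset._instantiate_datafile("gs://my-bucket/file_0.csv")

    # Anything else is passed to Datafile.deserialise, e.g. a dictionary.
    from_dict = dataset._instantiate_datafile({"path": "data/file_0.csv"})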
Example #2
    def _make_serialised_input_manifest_with_correct_dataset_file_tags(
            self, dataset_path):
        """Make a serialised input manifest and create one dataset and its metadata on the filesystem so that, when
        the manifest is loaded, the dataset and its metadata are also loaded. The tags on the dataset's files are
        correct for the `TWINE_WITH_INPUT_MANIFEST_STRAND_WITH_TAG_TEMPLATE` twine (see the test class variable).

        :param str dataset_path: the path to make the dataset at
        :return dict: the input manifest in serialisable (JSON-compatible) form
        """
        input_manifest = {
            "id": "8ead7669-8162-4f64-8cd5-4abe92509e17",
            "datasets": {
                "met_mast_data": dataset_path
            }
        }

        # Make two datafiles with the correct tags for `TWINE_WITH_INPUT_MANIFEST_STRAND_WITH_TAG_TEMPLATE`
        for filename in ("file_1.csv", "file_2.csv"):
            with Datafile(
                    path=os.path.join(dataset_path, filename),
                    tags={
                        "manufacturer": "vestas",
                        "height": 500,
                        "is_recycled": True,
                        "number_of_blades": 3,
                    },
                    mode="w",
            ) as (datafile, f):
                f.write("hello")

        return input_manifest
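A sketch of how this helper might be called from a test, assuming the same `Runner` setup as the tests below; the point is the passing run, since the tags written above satisfy the twine's file tags template:

    with tempfile.TemporaryDirectory() as temporary_directory:
        dataset_path = os.path.join(temporary_directory, "met_mast_data")

        input_manifest = self._make_serialised_input_manifest_with_correct_dataset_file_tags(
            dataset_path)

        runner = Runner(
            app_src=app,
            twine=self.TWINE_WITH_INPUT_MANIFEST_STRAND_WITH_TAG_TEMPLATE)

        # Validation should succeed because the file tags match the template.
        runner.run(input_manifest=input_manifest)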
Example #3
    def test_error_raised_when_required_tags_missing_for_validate_input_manifest(
            self):
        """Test that an error is raised when required tags from the file tags template for a dataset are missing when
        validating the input manifest.
        """
        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset_path = os.path.join(temporary_directory, "met_mast_data")

            # Make a datafile with no tags.
            with Datafile(os.path.join(dataset_path, "my_file_0.txt"),
                          mode="w") as (datafile, f):
                f.write("hello")

            input_manifest = {
                "id": "8ead7669-8162-4f64-8cd5-4abe92509e17",
                "datasets": {
                    "met_mast_data": dataset_path,
                },
            }

            runner = Runner(
                app_src=app,
                twine=self.TWINE_WITH_INPUT_MANIFEST_STRAND_WITH_TAG_TEMPLATE)

            with self.assertRaises(twined.exceptions.InvalidManifestContents):
                runner.run(input_manifest=input_manifest)
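The `TWINE_WITH_INPUT_MANIFEST_STRAND_WITH_TAG_TEMPLATE` class variable isn't shown in these examples. Judging from the tags used in examples #2 and #4, it plausibly resembles the following (an illustrative reconstruction modelled on example #8, not the actual definition):

    TWINE_WITH_INPUT_MANIFEST_STRAND_WITH_TAG_TEMPLATE = {
        "input_manifest": {
            "datasets": {
                "met_mast_data": {
                    "file_tags_template": {
                        "type": "object",
                        "properties": {
                            "manufacturer": {"type": "string"},
                            "height": {"type": "number"},
                            "is_recycled": {"type": "boolean"},
                            "number_of_blades": {"type": "number"},
                        },
                        "required": [
                            "manufacturer",
                            "height",
                            "is_recycled",
                            "number_of_blades",
                        ],
                    }
                }
            }
        },
        "output_values_schema": {},
    }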
Example #4
    def test_validate_input_manifest_raises_error_if_required_tags_are_not_of_required_type(
            self):
        """Test that an error is raised if the required tags from the file tags template for a dataset are present but
        are not of the required type when validating an input manifest.
        """
        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset_path = os.path.join(temporary_directory, "met_mast_data")

            input_manifest = {
                "id": "8ead7669-8162-4f64-8cd5-4abe92509e17",
                "datasets": {
                    "met_mast_data": dataset_path,
                },
            }

            runner = Runner(
                app_src=app,
                twine=self.TWINE_WITH_INPUT_MANIFEST_STRAND_WITH_TAG_TEMPLATE)

            for tags in (
                {
                    "manufacturer": "Vestas",
                    "height": 350,
                    "is_recycled": False,
                    "number_of_blades": "3"
                },
                {
                    "manufacturer": "Vestas",
                    "height": 350,
                    "is_recycled": "no",
                    "number_of_blades": 3
                },
                {
                    "manufacturer": False,
                    "height": 350,
                    "is_recycled": "false",
                    "number_of_blades": 3
                },
            ):
                with self.subTest(tags=tags):

                    # Make a datafile with the given tags.
                    with Datafile(
                            path=os.path.join(dataset_path, "my_file_0.txt"),
                            tags=tags,
                            mode="w",
                    ) as (datafile, f):
                        f.write("hello")

                    with self.assertRaises(
                            twined.exceptions.InvalidManifestContents):
                        runner.run(input_manifest=input_manifest)
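Every tag set above contains at least one wrongly-typed value: `"3"`, `"no"` and `"false"` are strings where a number or boolean is expected, and `False` is a boolean where a string is expected. Since a file tags template is a JSON Schema object, the underlying check is equivalent to something like this sketch using the `jsonschema` package directly (the template shown is a fragment, not the test's full template):

    import jsonschema

    template = {"type": "object", "properties": {"number_of_blades": {"type": "number"}}}

    # Raises jsonschema.exceptions.ValidationError: '3' is not of type 'number'.
    jsonschema.validate({"number_of_blades": "3"}, template)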
Example #5
    def _set_metadata(self, metadata):
        """Set the dataset's metadata.

        :param dict metadata:
        :return None:
        """
        if "files" in metadata:
            # There's no need to provide the hypothetical parameter here as this method is only used for
            # non-hypothetical datasets.
            self.files = FilterSet(
                Datafile(path=path) for path in metadata["files"])

        for attribute in self._METADATA_ATTRIBUTES:
            if attribute not in metadata:
                continue

            if attribute == "id":
                self._set_id(metadata["id"])
                continue

            setattr(self, attribute, metadata[attribute])
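A sketch of the kind of metadata dictionary this method consumes. The contents of `_METADATA_ATTRIBUTES` aren't shown in these examples, so the non-`files` keys below are illustrative assumptions:

    metadata = {
        "id": "8ead7669-8162-4f64-8cd5-4abe92509e17",
        "name": "met_mast_data",     # assumed metadata attribute
        "labels": ["mast", "wind"],  # assumed metadata attribute
        "files": ["data/file_1.csv", "data/file_2.csv"],
    }

    # `dataset` is assumed to be a non-hypothetical Dataset instance.
    dataset._set_metadata(metadata)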
Example #6
    def _instantiate_from_local_directory(self, path):
        """Instantiate the dataset from a local directory.

        :param str path: the path to a local directory
        :return None:
        """
        self.files = FilterSet()

        for level, (directory_path, _, filenames) in enumerate(os.walk(path)):
            # `os.walk` yields the top-level directory first, so any iteration
            # after the first is inside a subdirectory; in non-recursive mode,
            # stop before collecting files from subdirectories.
            if not self._recursive and level > 0:
                break

            for filename in filenames:
                if filename == METADATA_FILENAME:
                    continue

                self.files.add(
                    Datafile(
                        path=os.path.join(directory_path, filename),
                        hypothetical=self._hypothetical,
                    )
                )

        if not self._hypothetical:
            self._use_local_metadata()
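As an illustration, given the layout below (the filenames and the metadata filename are illustrative), non-recursive instantiation collects only the top-level file, while recursive instantiation also collects the nested one; the metadata file is skipped either way:

    # met_mast_data/
    # ├── file_0.csv        <- always added
    # ├── .octue            <- METADATA_FILENAME, always skipped (name assumed)
    # └── nested/
    #     └── file_1.csv    <- added only when self._recursive is True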
Example #7
    def _instantiate_from_cloud(self, path):
        """Instantiate the dataset from a cloud directory.

        :param str path: the cloud path to a directory in cloud storage
        :return None:
        """
        if not self._hypothetical:
            self._use_cloud_metadata()

        if not self.files:
            bucket_name = storage.path.split_bucket_name_from_cloud_path(path)[0]

            self.files = FilterSet(
                Datafile(
                    path=storage.path.generate_gs_path(bucket_name, blob.name),
                    hypothetical=self._hypothetical,
                )
                for blob in GoogleCloudStorageClient().scandir(
                    path,
                    recursive=self._recursive,
                    filter=lambda blob: (
                        not blob.name.endswith(METADATA_FILENAME)
                        and SIGNED_METADATA_DIRECTORY not in blob.name
                    ),
                )
            )
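The filter drops the dataset's metadata file and anything in the signed-metadata directory; every remaining blob name is joined back onto the bucket to form a `gs://` path. A sketch of that path round-trip, assuming the storage helpers behave as their names suggest (bucket and blob names are illustrative):

    bucket_name, path_in_bucket = storage.path.split_bucket_name_from_cloud_path(
        "gs://my-bucket/datasets/met_mast_data")
    # bucket_name == "my-bucket", path_in_bucket == "datasets/met_mast_data"

    gs_path = storage.path.generate_gs_path(bucket_name, "datasets/met_mast_data/file_0.csv")
    # gs_path == "gs://my-bucket/datasets/met_mast_data/file_0.csv"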
Example #8
    def test_validate_input_manifest_with_required_tags_in_several_datasets(
            self):
        """Test that required tags for different datasets' file tags templates are validated separately and correctly
        for each dataset.
        """
        twine_with_input_manifest_with_required_tags_for_multiple_datasets = {
            "input_manifest": {
                "datasets": {
                    "first_dataset": {
                        "purpose":
                        "A dataset containing meteorological mast data",
                        "file_tags_template": {
                            "type": "object",
                            "properties": {
                                "manufacturer": {
                                    "type": "string"
                                },
                                "height": {
                                    "type": "number"
                                }
                            },
                        },
                    },
                    "second_dataset": {
                        "file_tags_template": {
                            "type": "object",
                            "properties": {
                                "is_recycled": {
                                    "type": "boolean"
                                },
                                "number_of_blades": {
                                    "type": "number"
                                }
                            },
                        }
                    },
                }
            },
            "output_values_schema": {},
        }

        with tempfile.TemporaryDirectory() as temporary_directory:

            dataset_paths = (
                os.path.join(temporary_directory, "first_dataset"),
                os.path.join(temporary_directory, "second_dataset"),
            )

            input_manifest = {
                "id": "8ead7669-8162-4f64-8cd5-4abe92509e17",
                "datasets": {
                    "first_dataset": dataset_paths[0],
                    "second_dataset": dataset_paths[1],
                },
            }

            with Datafile(
                    path=os.path.join(dataset_paths[0], "file_0.csv"),
                    tags={
                        "manufacturer": "vestas",
                        "height": 503.7
                    },
                    mode="w",
            ) as (datafile, f):
                f.write("hello")

            with Datafile(
                    path=os.path.join(dataset_paths[1], "file_1.csv"),
                    tags={
                        "is_recycled": True,
                        "number_of_blades": 3
                    },
                    mode="w",
            ) as (datafile, f):
                f.write("hello")

            runner = Runner(
                app_src=app,
                twine=twine_with_input_manifest_with_required_tags_for_multiple_datasets,
            )
            runner.run(input_manifest=input_manifest)
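One subtlety: neither template above declares a `required` list, so under standard JSON Schema semantics (assuming twined applies the template as-is) a file missing one of these tags would still validate; only a wrongly-typed tag would fail. To also reject missing tags, as the single-dataset examples above do, each template would need a `required` entry, e.g.:

    "file_tags_template": {
        "type": "object",
        "properties": {
            "manufacturer": {"type": "string"},
            "height": {"type": "number"}
        },
        "required": ["manufacturer", "height"],
    }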