def _instantiate_datafile(self, file):
    """Instantiate a datafile from multiple input formats.

    :param str|dict|octue.resources.datafile.Datafile file: the file to instantiate as a datafile
    :return octue.resources.datafile.Datafile:
    """
    if isinstance(file, Datafile):
        return file

    if isinstance(file, str):
        return Datafile(path=file, hypothetical=self._hypothetical)

    return Datafile.deserialise(file)
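# Illustrative sketch (not part of the library) of the three accepted input forms; the path and
# dict below are hypothetical:
#
#     self._instantiate_datafile(Datafile(path="data/file.csv"))   # Returned unchanged.
#     self._instantiate_datafile("data/file.csv")                  # Wrapped in a new Datafile.
#     self._instantiate_datafile({"path": "data/file.csv"})        # Deserialised via `Datafile.deserialise`.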
def _make_serialised_input_manifest_with_correct_dataset_file_tags(self, dataset_path):
    """Make a serialised input manifest and create one dataset and its metadata on the filesystem so that, when
    the manifest is loaded, the dataset and its metadata are also loaded. The tags on the dataset's files are
    correct for the `TWINE_WITH_INPUT_MANIFEST_STRAND_WITH_TAG_TEMPLATE` twine (see the test class variable).

    :param str dataset_path: the path to make the dataset at
    :return dict: the serialised input manifest
    """
    input_manifest = {"id": "8ead7669-8162-4f64-8cd5-4abe92509e17", "datasets": {"met_mast_data": dataset_path}}

    # Make two datafiles with the correct tags for `TWINE_WITH_INPUT_MANIFEST_STRAND_WITH_TAG_TEMPLATE`.
    for filename in ("file_1.csv", "file_2.csv"):
        with Datafile(
            path=os.path.join(dataset_path, filename),
            tags={"manufacturer": "vestas", "height": 500, "is_recycled": True, "number_of_blades": 3},
            mode="w",
        ) as (datafile, f):
            f.write("hello")

    return input_manifest
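# Illustrative usage sketch (hypothetical test body, assuming a `Runner` constructed with
# `TWINE_WITH_INPUT_MANIFEST_STRAND_WITH_TAG_TEMPLATE`): the manifest produced above should pass
# validation because the file tags match the template.
#
#     with tempfile.TemporaryDirectory() as temporary_directory:
#         dataset_path = os.path.join(temporary_directory, "met_mast_data")
#         input_manifest = self._make_serialised_input_manifest_with_correct_dataset_file_tags(dataset_path)
#         runner.run(input_manifest=input_manifest)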
def test_error_raised_when_required_tags_missing_for_validate_input_manifest(self):
    """Test that an error is raised when required tags from the file tags template for a dataset are missing when
    validating the input manifest.
    """
    with tempfile.TemporaryDirectory() as temporary_directory:
        dataset_path = os.path.join(temporary_directory, "met_mast_data")

        # Make a datafile with no tags.
        with Datafile(os.path.join(dataset_path, "my_file_0.txt"), mode="w") as (datafile, f):
            f.write("hello")

        input_manifest = {
            "id": "8ead7669-8162-4f64-8cd5-4abe92509e17",
            "datasets": {
                "met_mast_data": dataset_path,
            },
        }

        runner = Runner(app_src=app, twine=self.TWINE_WITH_INPUT_MANIFEST_STRAND_WITH_TAG_TEMPLATE)

        with self.assertRaises(twined.exceptions.InvalidManifestContents):
            runner.run(input_manifest=input_manifest)
def test_validate_input_manifest_raises_error_if_required_tags_are_not_of_required_type(self):
    """Test that an error is raised if the required tags from the file tags template for a dataset are present but
    are not of the required type when validating an input manifest.
    """
    with tempfile.TemporaryDirectory() as temporary_directory:
        dataset_path = os.path.join(temporary_directory, "met_mast_data")

        input_manifest = {
            "id": "8ead7669-8162-4f64-8cd5-4abe92509e17",
            "datasets": {
                "met_mast_data": dataset_path,
            },
        }

        runner = Runner(app_src=app, twine=self.TWINE_WITH_INPUT_MANIFEST_STRAND_WITH_TAG_TEMPLATE)

        for tags in (
            {"manufacturer": "Vestas", "height": 350, "is_recycled": False, "number_of_blades": "3"},
            {"manufacturer": "Vestas", "height": 350, "is_recycled": "no", "number_of_blades": 3},
            {"manufacturer": False, "height": 350, "is_recycled": "false", "number_of_blades": 3},
        ):
            with self.subTest(tags=tags):
                # Make a datafile with the given tags (one tag per subtest has the wrong type).
                with Datafile(
                    path=os.path.join(dataset_path, "my_file_0.txt"),
                    tags=tags,
                    mode="w",
                ) as (datafile, f):
                    f.write("hello")

                with self.assertRaises(twined.exceptions.InvalidManifestContents):
                    runner.run(input_manifest=input_manifest)
def _set_metadata(self, metadata):
    """Set the dataset's metadata.

    :param dict metadata:
    :return None:
    """
    if "files" in metadata:
        # There's no need to provide the hypothetical parameter here as this method is only used for
        # non-hypothetical datasets.
        self.files = FilterSet(Datafile(path=path) for path in metadata["files"])

    for attribute in self._METADATA_ATTRIBUTES:
        if attribute not in metadata:
            continue

        if attribute == "id":
            self._set_id(metadata["id"])
            continue

        setattr(self, attribute, metadata[attribute])
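# Illustrative sketch of the metadata dictionary shape `_set_metadata` consumes. The exact keys
# are determined by `_METADATA_ATTRIBUTES`; the keys and values shown below are hypothetical:
#
#     {
#         "id": "8ead7669-8162-4f64-8cd5-4abe92509e17",
#         "name": "met_mast_data",
#         "tags": {"manufacturer": "vestas"},
#         "labels": ["meteorology"],
#         "files": ["path/to/file_1.csv", "path/to/file_2.csv"],
#     }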
def _instantiate_from_local_directory(self, path):
    """Instantiate the dataset from a local directory.

    :param str path: the path to a local directory
    :return None:
    """
    self.files = FilterSet()

    for level, (directory_path, _, filenames) in enumerate(os.walk(path)):
        # `os.walk` yields the top-level directory first (level 0), so stop there if not recursive.
        if not self._recursive and level > 0:
            break

        for filename in filenames:
            if filename == METADATA_FILENAME:
                continue

            self.files.add(Datafile(path=os.path.join(directory_path, filename), hypothetical=self._hypothetical))

    if not self._hypothetical:
        self._use_local_metadata()
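# Illustrative sketch (hypothetical directory; assumes `recursive` is a constructor parameter): a
# non-recursive dataset only picks up files in the top-level directory, and the dataset metadata
# file is always skipped.
#
#     dataset = Dataset(path="local/datasets/met_mast_data", recursive=False)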
def _instantiate_from_cloud(self, path):
    """Instantiate the dataset from a cloud directory.

    :param str path: the cloud path to a directory in cloud storage
    :return None:
    """
    if not self._hypothetical:
        self._use_cloud_metadata()

    if not self.files:
        bucket_name = storage.path.split_bucket_name_from_cloud_path(path)[0]

        self.files = FilterSet(
            Datafile(path=storage.path.generate_gs_path(bucket_name, blob.name), hypothetical=self._hypothetical)
            for blob in GoogleCloudStorageClient().scandir(
                path,
                recursive=self._recursive,
                filter=(
                    lambda blob: not blob.name.endswith(METADATA_FILENAME)
                    and SIGNED_METADATA_DIRECTORY not in blob.name
                ),
            )
        )
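# Illustrative sketch (hypothetical bucket and path): cloud instantiation first tries any stored
# cloud metadata; only if that yields no files does it fall back to scanning the bucket, excluding
# metadata and signed-metadata blobs.
#
#     dataset = Dataset(path="gs://my-bucket/datasets/met_mast_data")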
def test_validate_input_manifest_with_required_tags_in_several_datasets(self):
    """Test that required tags for different datasets' file tags templates are validated separately and correctly
    for each dataset.
    """
    twine_with_input_manifest_with_required_tags_for_multiple_datasets = {
        "input_manifest": {
            "datasets": {
                "first_dataset": {
                    "purpose": "A dataset containing meteorological mast data",
                    "file_tags_template": {
                        "type": "object",
                        "properties": {
                            "manufacturer": {"type": "string"},
                            "height": {"type": "number"},
                        },
                    },
                },
                "second_dataset": {
                    "file_tags_template": {
                        "type": "object",
                        "properties": {
                            "is_recycled": {"type": "boolean"},
                            "number_of_blades": {"type": "number"},
                        },
                    }
                },
            }
        },
        "output_values_schema": {},
    }

    with tempfile.TemporaryDirectory() as temporary_directory:
        dataset_paths = (
            os.path.join(temporary_directory, "first_dataset"),
            os.path.join(temporary_directory, "second_dataset"),
        )

        input_manifest = {
            "id": "8ead7669-8162-4f64-8cd5-4abe92509e17",
            "datasets": {
                "first_dataset": dataset_paths[0],
                "second_dataset": dataset_paths[1],
            },
        }

        with Datafile(
            path=os.path.join(dataset_paths[0], "file_0.csv"),
            tags={"manufacturer": "vestas", "height": 503.7},
            mode="w",
        ) as (datafile, f):
            f.write("hello")

        with Datafile(
            path=os.path.join(dataset_paths[1], "file_1.csv"),
            tags={"is_recycled": True, "number_of_blades": 3},
            mode="w",
        ) as (datafile, f):
            f.write("hello")

        runner = Runner(app_src=app, twine=twine_with_input_manifest_with_required_tags_for_multiple_datasets)
        runner.run(input_manifest=input_manifest)