def test_metadata_file_attributes(manifest_for_metadata):
    """
    Check that Manifest.metadata_file_attributes returns an object
    carrying the expected url, version_id, file_hash and local_path
    for the metadata files 'a.txt' and 'b.txt', and that asking for
    a metadata file that is not listed ('c.txt') raises a ValueError
    with the expected message
    """
    manifest = Manifest('/my/cache/dir/', manifest_for_metadata)

    # first metadata file
    attrs_a = manifest.metadata_file_attributes('a.txt')
    assert attrs_a.url == 'http://my.url.com/path/to/a.txt'
    assert attrs_a.version_id == '12345'
    assert attrs_a.file_hash == 'abcde'
    path_a = pathlib.Path(
        safe_system_path('/my/cache/dir/some-project-000/to/a.txt')
    ).resolve()
    assert attrs_a.local_path == path_a

    # second metadata file
    attrs_b = manifest.metadata_file_attributes('b.txt')
    assert attrs_b.url == 'http://my.other.url.com/different/path/to/b.txt'
    assert attrs_b.version_id == '67890'
    assert attrs_b.file_hash == 'fghijk'
    path_b = pathlib.Path(
        safe_system_path('/my/cache/dir/some-project-000/path/to/b.txt')
    ).resolve()
    assert attrs_b.local_path == path_b

    # a metadata file that does not exist must produce a ValueError
    # whose first argument contains the expected text
    with pytest.raises(ValueError) as excinfo:
        manifest.metadata_file_attributes('c.txt')
    expected_text = "c.txt\nis not in self.metadata_file_names"
    assert expected_text in excinfo.value.args[0]
def test_file_attribute_errors(meta_json_path):
    """
    Check that a Manifest built from meta_json_path raises a
    ValueError with the expected message when asked for the
    attributes of a metadata file ('some_file.txt') or a data file
    ('other_file.txt') that it reports as not listed
    """
    manifest = Manifest("/my/cache/dir", meta_json_path)

    # (lookup method, file name, regex the error message must match)
    failing_lookups = (
        (manifest.metadata_file_attributes,
         'some_file.txt',
         r".* not in self.metadata_file_names"),
        (manifest.data_file_attributes,
         'other_file.txt',
         r".* not a data file listed in manifest"),
    )

    for lookup, file_name, pattern in failing_lookups:
        with pytest.raises(ValueError, match=pattern):
            lookup(file_name)
class CloudCacheBase(ABC):
    """
    A class to handle the downloading and accessing of
    data served from a cloud storage system

    Subclasses supply the storage-specific pieces by implementing
    _list_all_manifests, _download_manifest and _download_file.

    Parameters
    ----------
    cache_dir: str or pathlib.Path
        Path to the directory where data will be stored on the local system.
        The directory is created if it does not already exist.

    project_name: str
        the name of the project this cache is supposed to access. This will
        be the root directory for all files stored in the bucket.
    """

    _bucket_name = None

    def __init__(self, cache_dir, project_name):
        os.makedirs(cache_dir, exist_ok=True)

        # No manifest is loaded until load_manifest() is called
        self._manifest = None
        self._cache_dir = cache_dir
        self._project_name = project_name
        self._manifest_file_names = self._list_all_manifests()

    def _loaded_manifest(self):
        """
        Return the currently loaded manifest

        Raises
        ------
        AttributeError
            If no manifest has been loaded yet. AttributeError is used
            because that is what dereferencing the unset manifest raised
            before this check existed, so existing handlers keep working.
        """
        if self._manifest is None:
            raise AttributeError("No manifest has been loaded; call "
                                 "load_manifest() or "
                                 "load_latest_manifest() first")
        return self._manifest

    def _path_summary(self, file_attributes: CacheFileAttributes) -> dict:
        """
        Build the dict returned by data_path and metadata_path
        for the file described by file_attributes
        """
        return {'local_path': file_attributes.local_path,
                'exists': self._file_exists(file_attributes),
                'file_attributes': file_attributes}

    @abstractmethod
    def _list_all_manifests(self) -> list:
        """
        Return a list of all of the file names of the manifests associated
        with this dataset
        """
        raise NotImplementedError()

    @property
    def latest_manifest_file(self) -> str:
        """parses available manifest files for semver string
        and returns the latest one
        self.manifest_file_names are assumed to be of the form
        '<anything>_v<semver_str>.json'

        Returns
        -------
        str
            the filename whose semver string is the latest one

        Raises
        ------
        ValueError
            If there are no manifest files to choose from
        """
        names = self.manifest_file_names
        if not names:
            raise ValueError("There are no manifest files for this "
                             "dataset; cannot determine the latest one")

        # '<anything>_v<semver_str>.json' -> '<semver_str>'
        vstrs = [s.split(".json")[0].split("_v")[-1] for s in names]
        versions = [semver.VersionInfo.parse(v) for v in vstrs]
        imax = versions.index(max(versions))
        return names[imax]

    def load_latest_manifest(self):
        """
        Load the manifest named by self.latest_manifest_file
        """
        self.load_manifest(self.latest_manifest_file)

    @abstractmethod
    def _download_manifest(self, manifest_name: str):
        """
        Download a manifest from the dataset

        Parameters
        ----------
        manifest_name: str
            The name of the manifest to load. Must be an element in
            self.manifest_file_names
        """
        raise NotImplementedError()

    @abstractmethod
    def _download_file(self, file_attributes: CacheFileAttributes) -> bool:
        """
        Check if a file exists and is in the expected state.

        If it is, return True.

        If it is not, download the file, creating the directory
        where the file is to be stored if necessary.

        If the download is successful, return True.

        If the download fails (file hash does not match expectation),
        return False.

        Parameters
        ----------
        file_attributes: CacheFileAttributes
            Describes the file to download

        Returns
        -------
        bool
            True if the file is present in the expected state after the
            call; False if the download failed

        Raises
        ------
        RuntimeError
            If the path to the directory where the file is to be saved
            points to something that is not a directory.

        RuntimeError
            If it is not able to successfully download the file after
            10 iterations
        """
        raise NotImplementedError()

    @property
    def project_name(self) -> str:
        """
        The name of the project that this cache is accessing
        """
        return self._project_name

    @property
    def manifest_prefix(self) -> str:
        """
        On-line prefix for manifest files
        """
        return f'{self.project_name}/manifests/'

    @property
    def file_id_column(self) -> str:
        """
        The column in the metadata files used to uniquely
        identify data files

        Raises AttributeError if no manifest has been loaded.
        """
        return self._loaded_manifest().file_id_column

    @property
    def version(self) -> str:
        """
        The version of the dataset currently loaded

        Raises AttributeError if no manifest has been loaded.
        """
        return self._loaded_manifest().version

    @property
    def metadata_file_names(self) -> list:
        """
        List of metadata file names associated with this dataset

        Raises AttributeError if no manifest has been loaded.
        """
        return self._loaded_manifest().metadata_file_names

    @property
    def manifest_file_names(self) -> list:
        """
        List of manifest file names associated with this dataset.

        This is a deep copy of the list obtained from
        _list_all_manifests() at construction, so callers may modify
        it without affecting the cache.
        """
        return copy.deepcopy(self._manifest_file_names)

    def load_manifest(self, manifest_name: str):
        """
        Load a manifest from this dataset.

        The manifest is downloaded with _download_manifest() only if it
        is not already present in the cache directory.

        Parameters
        ----------
        manifest_name: str
            The name of the manifest to load. Must be an element in
            self.manifest_file_names

        Raises
        ------
        ValueError
            If manifest_name is not in self.manifest_file_names
        """
        if manifest_name not in self.manifest_file_names:
            raise ValueError(f"manifest: {manifest_name}\n"
                             "is not one of the valid manifest names "
                             "for this dataset:\n"
                             f"{self.manifest_file_names}")

        filepath = os.path.join(self._cache_dir, manifest_name)
        if not os.path.exists(filepath):
            self._download_manifest(manifest_name)

        with open(filepath) as f:
            self._manifest = Manifest(cache_dir=self._cache_dir,
                                      json_input=f)

    def _file_exists(self, file_attributes: CacheFileAttributes) -> bool:
        """
        Given a CacheFileAttributes describing a file, assess whether or
        not that file exists locally and is valid (i.e. has the expected
        file hash)

        Parameters
        ----------
        file_attributes: CacheFileAttributes
            Description of the file to look for

        Returns
        -------
        bool
            True if the file exists and is valid; False otherwise

        Raises
        -----
        RuntimeError
            If file_attributes.local_path exists but is not a file.
            It would be unclear how the cache should proceed in this case.
        """
        if not file_attributes.local_path.exists():
            return False

        if not file_attributes.local_path.is_file():
            raise RuntimeError(f"{file_attributes.local_path}\n"
                               "exists, but is not a file;\n"
                               "unsure how to proceed")

        # The file is only considered valid if its hash matches the
        # hash recorded in file_attributes
        full_path = file_attributes.local_path.resolve()
        test_checksum = file_hash_from_path(full_path)
        return test_checksum == file_attributes.file_hash

    def data_path(self, file_id) -> dict:
        """
        Return the local path to a data file, and test for the
        file's existence/validity. This method does not download
        anything.

        Parameters
        ----------
        file_id:
            The unique identifier of the file to be accessed

        Returns
        -------
        dict

            'local_path' will be a pathlib.Path pointing to the file's
            location

            'exists' will be a boolean indicating if the file
            exists in a valid state

            'file_attributes' is a CacheFileAttributes describing the file
            in more detail

        Raises
        ------
        AttributeError
            If no manifest has been loaded

        RuntimeError
            If the local path exists but is not a file
        """
        manifest = self._loaded_manifest()
        return self._path_summary(manifest.data_file_attributes(file_id))

    def download_data(self, file_id) -> pathlib.Path:
        """
        Return the local path to a data file, downloading the file
        if necessary

        Parameters
        ----------
        file_id:
            The unique identifier of the file to be accessed

        Returns
        -------
        pathlib.Path
            The path indicating where the file is stored on the
            local system

        Raises
        ------
        RuntimeError
            If the file cannot be downloaded

        Notes
        -----
        The value returned by _download_file() is not inspected here;
        failures are reported only through exceptions it raises.
        """
        super_attributes = self.data_path(file_id)
        file_attributes = super_attributes['file_attributes']
        self._download_file(file_attributes)
        return file_attributes.local_path

    def metadata_path(self, fname: str) -> dict:
        """
        Return the local path to a metadata file, and test for the
        file's existence/validity. This method does not download
        anything.

        Parameters
        ----------
        fname: str
            The name of the metadata file to be accessed

        Returns
        -------
        dict

            'local_path' will be a pathlib.Path pointing to the file's
            location

            'exists' will be a boolean indicating if the file
            exists in a valid state

            'file_attributes' is a CacheFileAttributes describing the file
            in more detail

        Raises
        ------
        AttributeError
            If no manifest has been loaded

        RuntimeError
            If the local path exists but is not a file
        """
        manifest = self._loaded_manifest()
        return self._path_summary(manifest.metadata_file_attributes(fname))

    def download_metadata(self, fname: str) -> pathlib.Path:
        """
        Return the local path to a metadata file, downloading the
        file if necessary

        Parameters
        ----------
        fname: str
            The name of the metadata file to be accessed

        Returns
        -------
        pathlib.Path
            The path indicating where the file is stored on the
            local system

        Raises
        ------
        RuntimeError
            If the file cannot be downloaded

        Notes
        -----
        The value returned by _download_file() is not inspected here;
        failures are reported only through exceptions it raises.
        """
        super_attributes = self.metadata_path(fname)
        file_attributes = super_attributes['file_attributes']
        self._download_file(file_attributes)
        return file_attributes.local_path

    def get_metadata(self, fname: str) -> pd.DataFrame:
        """
        Return a pandas DataFrame of metadata

        Parameters
        ----------
        fname: str
            The name of the metadata file to load

        Returns
        -------
        pd.DataFrame

        Notes
        -----
        This method will check to see if the specified metadata file exists
        locally. If it does not, the method will download the file. Use
        self.metadata_path() to find where the file is stored. The file
        is read with pandas.read_csv.
        """
        local_path = self.download_metadata(fname)
        return pd.read_csv(local_path)