Example #1
0
def test_metadata_file_attributes(manifest_for_metadata):
    """
    Check Manifest.metadata_file_attributes: each metadata file listed
    below must come back with the expected url, version_id, file_hash
    and local_path, and asking for a file name that is not listed must
    raise a ValueError with the expected message
    """
    manifest = Manifest('/my/cache/dir/', manifest_for_metadata)

    # (file name, url, version_id, file_hash, local path before resolving)
    expected_records = (
        ('a.txt',
         'http://my.url.com/path/to/a.txt',
         '12345',
         'abcde',
         '/my/cache/dir/some-project-000/to/a.txt'),
        ('b.txt',
         'http://my.other.url.com/different/path/to/b.txt',
         '67890',
         'fghijk',
         '/my/cache/dir/some-project-000/path/to/b.txt'),
    )

    for fname, url, version_id, file_hash, raw_path in expected_records:
        attributes = manifest.metadata_file_attributes(fname)
        assert attributes.url == url
        assert attributes.version_id == version_id
        assert attributes.file_hash == file_hash
        local_path = pathlib.Path(safe_system_path(raw_path)).resolve()
        assert attributes.local_path == local_path

    # a file name the manifest does not know about must be rejected
    with pytest.raises(ValueError) as exc_info:
        manifest.metadata_file_attributes('c.txt')
    expected_fragment = "c.txt\nis not in self.metadata_file_names"
    assert expected_fragment in exc_info.value.args[0]
Example #2
0
def test_data_file_attributes(manifest_with_data):
    """
    Check Manifest.data_file_attributes: each data file id listed below
    must come back with the expected url, version_id, file_hash and
    local_path, and asking for a file id that is not listed must raise
    a ValueError with the expected message
    """
    manifest = Manifest('/my/cache/dir', manifest_with_data)

    # (file id, url, version_id, file_hash, local path before resolving)
    expected_records = (
        ('a',
         'http://my.url.com/myproject/path/to/a.nwb',
         '12345',
         'abcde',
         '/my/cache/dir/myproject-0/path/to/a.nwb'),
        ('b',
         'http://my.other.url.com/different/path/b.nwb',
         '67890',
         'fghijk',
         '/my/cache/dir/myproject-0/path/b.nwb'),
    )

    for file_id, url, version_id, file_hash, raw_path in expected_records:
        attributes = manifest.data_file_attributes(file_id)
        assert attributes.url == url
        assert attributes.version_id == version_id
        assert attributes.file_hash == file_hash
        system_path = safe_system_path(raw_path)
        assert attributes.local_path == pathlib.Path(system_path).resolve()

    # a file id the manifest does not know about must be rejected
    with pytest.raises(ValueError) as exc_info:
        manifest.data_file_attributes('c')
    expected_fragment = "file_id: c\nIs not a data file listed in manifest:"
    assert expected_fragment in exc_info.value.args[0]
Example #3
0
def test_constructor(meta_json_path):
    """
    Manifest.__init__ must run when cache_dir is given as a str or as a
    pathlib.Path, and must raise a ValueError for an unexpected
    cache_dir (here, a float)
    """
    accepted_cache_dirs = (
        'my/cache/dir',
        pathlib.Path('my/other/cache/dir'),
    )
    for cache_dir in accepted_cache_dirs:
        Manifest(cache_dir, meta_json_path)

    expected_error = r"cache_dir must be either a str.*"
    with pytest.raises(ValueError, match=expected_error):
        Manifest(1234.2, meta_json_path)
Example #4
0
    def _load_manifest(self, manifest_name: str) -> Manifest:
        """
        Read one of this dataset's manifests from the cache directory,
        downloading it first when no local copy exists, and write its
        name to the file at self._manifest_last_used.

        Parameters
        ----------
        manifest_name: str
            Name of the manifest to read; it has to be listed in
            self.manifest_file_names

        Returns
        -------
        Manifest
            The manifest built from the locally stored file

        Raises
        ------
        ValueError
            If manifest_name is not in self.manifest_file_names
        """
        name_is_known = manifest_name in self.manifest_file_names
        if not name_is_known:
            error_msg = (f"manifest: {manifest_name}\n"
                         "is not one of the valid manifest names "
                         "for this dataset:\n"
                         f"{self.manifest_file_names}")
            raise ValueError(error_msg)

        local_copy = os.path.join(self._cache_dir, manifest_name)
        if not os.path.exists(local_copy):
            # no local copy yet; presumably the download helper writes
            # the manifest to local_copy (helper not visible here)
            self._download_manifest(manifest_name)

        with open(local_copy) as json_handle:
            manifest = Manifest(cache_dir=self._cache_dir,
                                json_input=json_handle)

        # remember which manifest was loaded most recently
        with open(self._manifest_last_used, 'w') as tracker:
            tracker.write(manifest_name)

        return manifest
Example #5
0
def test_create_file_attributes(meta_json_path):
    """
    Check that Manifest._create_file_attributes passes its inputs
    through to the resulting CacheFileAttributes and builds the
    expected local_path (the local_path is the main thing under test)
    """
    remote_url = 'http://my.url.com/path/to/file.txt'
    version_id = '12345'
    file_hash = 'aaabbbcccddd'

    manifest = Manifest('/my/cache/dir', meta_json_path)
    attributes = manifest._create_file_attributes(remote_url,
                                                  version_id,
                                                  file_hash)

    assert isinstance(attributes, CacheFileAttributes)
    assert attributes.url == remote_url
    assert attributes.version_id == version_id
    assert attributes.file_hash == file_hash

    local_path = pathlib.Path('/my/cache/dir/X-Y/to/file.txt').resolve()
    assert attributes.local_path == local_path
Example #6
0
    def load_manifest(self, manifest_name: str):
        """
        Read one of this dataset's manifests from the cache directory,
        downloading it first when no local copy exists, and store the
        result in self._manifest.

        Parameters
        ----------
        manifest_name: str
            Name of the manifest to read; it has to be listed in
            self.manifest_file_names

        Raises
        ------
        ValueError
            If manifest_name is not in self.manifest_file_names
        """
        name_is_known = manifest_name in self.manifest_file_names
        if not name_is_known:
            error_msg = (f"manifest: {manifest_name}\n"
                         "is not one of the valid manifest names "
                         "for this dataset:\n"
                         f"{self.manifest_file_names}")
            raise ValueError(error_msg)

        local_copy = os.path.join(self._cache_dir, manifest_name)
        if not os.path.exists(local_copy):
            # no local copy yet; presumably the download helper writes
            # the manifest to local_copy (helper not visible here)
            self._download_manifest(manifest_name)

        with open(local_copy) as json_handle:
            self._manifest = Manifest(cache_dir=self._cache_dir,
                                      json_input=json_handle)
Example #7
0
    def _load_manifest(self,
                       manifest_name: str,
                       use_static_project_dir: bool = False) -> Manifest:
        """
        Read one of this dataset's manifests from disk and return it.

        Parameters
        ----------
        manifest_name: str
            Name of the manifest to read; it has to be listed in
            self.manifest_file_names
        use_static_project_dir: bool
            Normally the Manifest class places the local copy of a remote
            resource (data or metadata file) in a versioned project
            subdirectory of the user provided `cache_dir`
            (e.g. f"{cache_dir}/{project_name}-{manifest_version}"), so
            that several manifest (and data) versions can coexist. Some
            setups, such as using a project's s3 bucket directly as the
            cache_dir, need a project directory whose name does not change
            (e.g. f"{cache_dir}/{project_name}"). Passing True asks the
            Manifest class for that static layout, and makes this method
            read the manifest itself from
            f"{cache_dir}/{project_name}/manifests/". Defaults to False.

        Returns
        -------
        Manifest

        Raises
        ------
        ValueError
            If manifest_name is not in self.manifest_file_names
        """
        name_is_known = manifest_name in self.manifest_file_names
        if not name_is_known:
            error_msg = (
                f"Manifest to load ({manifest_name}) is not one of the "
                "valid manifest names for this dataset. Valid names include:\n"
                f"{self.manifest_file_names}")
            raise ValueError(error_msg)

        # static layout keeps manifests under <cache_dir>/<project>/manifests
        path_parts = [self._cache_dir]
        if use_static_project_dir:
            path_parts.extend([self.project_name, "manifests"])
        path_parts.append(manifest_name)
        manifest_location = os.path.join(*path_parts)

        with open(manifest_location, "r") as json_handle:
            return Manifest(
                cache_dir=self._cache_dir,
                json_input=json_handle,
                use_static_project_dir=use_static_project_dir)
Example #8
0
def test_file_attribute_errors(meta_json_path):
    """
    Check that Manifest raises a ValueError with the expected message
    when asked for the attributes of a metadata file or a data file
    that it does not list
    """
    manifest = Manifest("/my/cache/dir", meta_json_path)

    metadata_error = r".* not in self.metadata_file_names"
    with pytest.raises(ValueError, match=metadata_error):
        manifest.metadata_file_attributes('some_file.txt')

    data_error = r".* not a data file listed in manifest"
    with pytest.raises(ValueError, match=data_error):
        manifest.data_file_attributes('other_file.txt')
def test_windows_path_to_isilon(monkeypatch, tmpdir):
    """
    This test is just meant to verify on Windows CI instances
    that, if a path to the `/allen/` shared file store is used as
    cache_dir, the path to files will come out useful (i.e. without any
    spurious C:/ prepended as in AllenSDK issue #1964)
    """

    cache_dir = Path(tmpdir)

    manifest_1 = {
        'manifest_version': '1',
        'metadata_file_id_column_name': 'file_id',
        'data_pipeline': 'placeholder',
        'project_name': 'my-project',
        'metadata_files': {
            'a.csv': {
                'url': 'http://www.junk.com/path/to/a.csv',  # noqa: E501
                'version_id': '1111',
                'file_hash': 'abcde'
            },
            'b.csv': {
                'url': 'http://silly.com/path/to/b.csv',  # noqa: E501
                'version_id': '2222',
                'file_hash': 'fghijk'
            }
        },
        'data_files': {
            'data_1': {
                'url': 'http://www.junk.com/data/path/data.csv',  # noqa: E501
                'version_id': '1111',
                'file_hash': 'lmnopqrst'
            }
        }
    }
    manifest_path = tmpdir / "manifest.json"
    with open(manifest_path, "w") as f:
        json.dump(manifest_1, f)

    # stand-in for CloudCacheBase._file_exists so that no file on disk
    # is ever inspected or hashed
    def dummy_file_exists(self, m):
        return True

    # we do not want paths to `/allen` to be resolved to
    # a local drive on the user's machine
    # (raw string: `\:` is not a valid escape in a normal str literal)
    bad_windows_pattern = re.compile(r'^[A-Z]\:')

    # make sure pattern is correctly formulated
    # (raw string: `\w` and `\p` are not valid escapes in a normal str
    # literal; the resulting text is unchanged)
    m = bad_windows_pattern.search(r'C:\a\windows\path')
    assert m is not None

    with monkeypatch.context() as ctx:

        # minimal concrete subclass; the abstract hooks are never
        # expected to do real work in this test
        class TestCloudCache(CloudCacheBase):
            def _download_file(self, m, o):
                pass

            def _download_manifest(self, m, o):
                pass

            def _list_all_manifests(self):
                pass

        ctx.setattr(TestCloudCache, '_file_exists', dummy_file_exists)

        cache = TestCloudCache(cache_dir, 'proj')
        cache._manifest = Manifest(cache_dir, json_input=manifest_path)

        # NOTE(review): in the CloudCacheBase visible in this file,
        # metadata_path() and data_path() return a dict, so str() of the
        # result starts with "{" and these two assertions cannot fail.
        # Asserting on the 'local_path' entry instead is not a drop-in
        # fix: cache_dir is tmpdir, which presumably sits on a lettered
        # drive on Windows -- TODO confirm the intended check.
        m_path = cache.metadata_path('a.csv')
        assert bad_windows_pattern.match(str(m_path)) is None
        d_path = cache.data_path('data_1')
        assert bad_windows_pattern.match(str(d_path)) is None
Example #10
0
class CloudCacheBase(ABC):
    """
    A class to handle the downloading and accessing of data served from a cloud
    storage system

    Parameters
    ----------
    cache_dir: str or pathlib.Path
        Path to the directory where data will be stored on the local system.
        The directory is created if it does not already exist.

    project_name: str
        the name of the project this cache is supposed to access. This will
        be the root directory for all files stored in the bucket.

    Notes
    -----
    The constructor does not load a manifest. Call load_manifest() or
    load_latest_manifest() before using anything that reads the manifest
    (file_id_column, version, metadata_file_names, data_path,
    metadata_path, download_data, download_metadata, get_metadata).
    """

    _bucket_name = None

    def __init__(self, cache_dir, project_name):
        os.makedirs(cache_dir, exist_ok=True)

        # no manifest is loaded until load_manifest() is called
        self._manifest = None
        self._cache_dir = cache_dir
        self._project_name = project_name
        self._manifest_file_names = self._list_all_manifests()

    @abstractmethod
    def _list_all_manifests(self) -> list:
        """
        Return a list of all of the file names of the manifests associated
        with this dataset
        """
        raise NotImplementedError()

    @property
    def latest_manifest_file(self) -> str:
        """parses available manifest files for semver string
        and returns the latest one
        self.manifest_file_names are assumed to be of the form
        '<anything>_v<semver_str>.json'

        Returns
        -------
        str
            the filename whose semver string is the latest one

        Raises
        ------
        ValueError
            If there are no manifest files to choose from
        """
        # manifest_file_names hands back a fresh deep copy on every
        # access, so read it only once
        names = self.manifest_file_names
        if not names:
            raise ValueError(
                "No manifest files are available for project "
                f"'{self.project_name}'; cannot determine the latest one")

        def _version_of(fname):
            # '<anything>_v<semver_str>.json' -> parsed <semver_str>
            semver_str = fname.split(".json")[0].split("_v")[-1]
            return semver.VersionInfo.parse(semver_str)

        # max() returns the first of several equal maxima, i.e. the same
        # entry an index-of-max lookup would select
        return max(names, key=_version_of)

    def load_latest_manifest(self):
        """
        Load the manifest named by self.latest_manifest_file
        """
        self.load_manifest(self.latest_manifest_file)

    @abstractmethod
    def _download_manifest(self, manifest_name: str):
        """
        Download a manifest from the dataset

        Parameters
        ----------
        manifest_name: str
            The name of the manifest to load. Must be an element in
            self.manifest_file_names
        """
        raise NotImplementedError()

    @abstractmethod
    def _download_file(self, file_attributes: CacheFileAttributes) -> bool:
        """
        Check if a file exists and is in the expected state.

        If it is, return True.

        If it is not, download the file, creating the directory
        where the file is to be stored if necessary.

        If the download is successful, return True.

        If the download fails (file hash does not match expectation),
        return False.

        Parameters
        ----------
        file_attributes: CacheFileAttributes
            Describes the file to download

        Returns
        -------
        bool
            True if the file is present in the expected state after
            the call; False if the download failed

        Raises
        ------
        RuntimeError
            If the path to the directory where the file is to be saved
            points to something that is not a directory.

        RuntimeError
            If it is not able to successfully download the file after
            10 iterations
        """
        raise NotImplementedError()

    @property
    def project_name(self) -> str:
        """
        The name of the project that this cache is accessing
        """
        return self._project_name

    @property
    def manifest_prefix(self) -> str:
        """
        On-line prefix for manifest files
        """
        return f'{self.project_name}/manifests/'

    @property
    def file_id_column(self) -> str:
        """
        The column in the metadata files used to uniquely
        identify data files (requires a loaded manifest)
        """
        return self._manifest.file_id_column

    @property
    def version(self) -> str:
        """
        The version of the dataset currently loaded
        (requires a loaded manifest)
        """
        return self._manifest.version

    @property
    def metadata_file_names(self) -> list:
        """
        List of metadata file names associated with this dataset
        (requires a loaded manifest)
        """
        return self._manifest.metadata_file_names

    @property
    def manifest_file_names(self) -> list:
        """
        List of manifest file names associated with this dataset, as
        returned by self._list_all_manifests() at construction time.
        A deep copy is returned, so callers cannot modify the
        list held by the cache.
        """
        return copy.deepcopy(self._manifest_file_names)

    def load_manifest(self, manifest_name: str):
        """
        Load a manifest from this dataset.

        The manifest is read from the cache directory; if it is not
        there yet, self._download_manifest() is called first.

        Parameters
        ----------
        manifest_name: str
            The name of the manifest to load. Must be an element in
            self.manifest_file_names

        Raises
        ------
        ValueError
            If manifest_name is not in self.manifest_file_names
        """
        if manifest_name not in self.manifest_file_names:
            raise ValueError(f"manifest: {manifest_name}\n"
                             "is not one of the valid manifest names "
                             "for this dataset:\n"
                             f"{self.manifest_file_names}")

        filepath = os.path.join(self._cache_dir, manifest_name)
        if not os.path.exists(filepath):
            self._download_manifest(manifest_name)

        with open(filepath) as f:
            self._manifest = Manifest(cache_dir=self._cache_dir, json_input=f)

    def _file_exists(self, file_attributes: CacheFileAttributes) -> bool:
        """
        Given a CacheFileAttributes describing a file, assess whether or
        not that file exists locally and is valid (i.e. has the expected
        file hash)

        Parameters
        ----------
        file_attributes: CacheFileAttributes
            Description of the file to look for

        Returns
        -------
        bool
            True if the file exists and is valid; False otherwise

        Raises
        ------
        RuntimeError
            If file_attributes.local_path exists but is not a file.
            It would be unclear how the cache should proceed in this case.
        """
        local_path = file_attributes.local_path
        if not local_path.exists():
            return False
        if not local_path.is_file():
            raise RuntimeError(f"{file_attributes.local_path}\n"
                               "exists, but is not a file;\n"
                               "unsure how to proceed")

        # the file only counts as valid when its hash matches the
        # hash recorded for it
        test_checksum = file_hash_from_path(local_path.resolve())
        return test_checksum == file_attributes.file_hash

    def data_path(self, file_id) -> dict:
        """
        Return the local path to a data file, and test for the
        file's existence/validity. Nothing is downloaded.

        Parameters
        ----------
        file_id:
            The unique identifier of the file to be accessed

        Returns
        -------
        dict

            'local_path' will be a pathlib.Path pointing to the file's
            location

            'exists' will be a boolean indicating if the file
            exists in a valid state

            'file_attributes' is a CacheFileAttributes describing the file
            in more detail

        Raises
        ------
        RuntimeError
            If the local path exists but is not a file
        """
        file_attributes = self._manifest.data_file_attributes(file_id)
        exists = self._file_exists(file_attributes)
        local_path = file_attributes.local_path
        output = {
            'local_path': local_path,
            'exists': exists,
            'file_attributes': file_attributes
        }

        return output

    def download_data(self, file_id) -> pathlib.Path:
        """
        Return the local path to a data file, downloading the file
        if necessary

        Parameters
        ----------
        file_id:
            The unique identifier of the file to be accessed

        Returns
        -------
        pathlib.Path
            The path indicating where the file is stored on the
            local system

        Raises
        ------
        RuntimeError
            If the file cannot be downloaded
        """
        super_attributes = self.data_path(file_id)
        file_attributes = super_attributes['file_attributes']
        # the value returned by _download_file is not inspected here
        self._download_file(file_attributes)
        return file_attributes.local_path

    def metadata_path(self, fname: str) -> dict:
        """
        Return the local path to a metadata file, and test for the
        file's existence/validity. Nothing is downloaded.

        Parameters
        ----------
        fname: str
            The name of the metadata file to be accessed

        Returns
        -------
        dict

            'local_path' will be a pathlib.Path pointing to the file's
            location

            'exists' will be a boolean indicating if the file
            exists in a valid state

            'file_attributes' is a CacheFileAttributes describing the file
            in more detail

        Raises
        ------
        RuntimeError
            If the local path exists but is not a file
        """
        file_attributes = self._manifest.metadata_file_attributes(fname)
        exists = self._file_exists(file_attributes)
        local_path = file_attributes.local_path
        output = {
            'local_path': local_path,
            'exists': exists,
            'file_attributes': file_attributes
        }

        return output

    def download_metadata(self, fname: str) -> pathlib.Path:
        """
        Return the local path to a metadata file, downloading the
        file if necessary

        Parameters
        ----------
        fname: str
            The name of the metadata file to be accessed

        Returns
        -------
        pathlib.Path
            The path indicating where the file is stored on the
            local system

        Raises
        ------
        RuntimeError
            If the file cannot be downloaded
        """
        super_attributes = self.metadata_path(fname)
        file_attributes = super_attributes['file_attributes']
        # the value returned by _download_file is not inspected here
        self._download_file(file_attributes)
        return file_attributes.local_path

    def get_metadata(self, fname: str) -> pd.DataFrame:
        """
        Return a pandas DataFrame of metadata

        Parameters
        ----------
        fname: str
            The name of the metadata file to load

        Returns
        -------
        pd.DataFrame

        Notes
        -----
        This method will check to see if the specified metadata file exists
        locally. If it does not, the method will download the file. Use
        self.metadata_path() to find where the file is stored
        """
        local_path = self.download_metadata(fname)
        return pd.read_csv(local_path)