Code example #1
File: cloud_cache.py  Project: matchings/AllenSDK
    def construct_local_manifest(self) -> None:
        """
        Construct the dict that maps each file's absolute local path
        to its file_hash. Save it to self._downloaded_data_path
        """
        lookup = {}
        files_to_hash = set()
        c_dir = pathlib.Path(self._cache_dir)
        file_iterator = c_dir.glob('**/*')
        for file_name in file_iterator:
            if file_name.is_file():
                # skip JSON files (manifests/metadata) and the record of
                # the manifest last used; only data files get hashed
                if 'json' not in file_name.name:
                    if file_name != self._manifest_last_used:
                        files_to_hash.add(file_name.resolve())

        with tqdm.tqdm(files_to_hash,
                       total=len(files_to_hash),
                       unit='(files hashed)') as pbar:

            for local_path in pbar:
                hsh = file_hash_from_path(local_path)
                lookup[str(local_path.absolute())] = hsh

        with open(self._downloaded_data_path, 'w') as out_file:
            out_file.write(json.dumps(lookup, indent=2, sort_keys=True))
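
construct_local_manifest writes a JSON file mapping each cached file's absolute path to its hash. A hedged sketch of reading that manifest back to spot files that have changed on disk; the manifest filename below is a hypothetical stand-in for self._downloaded_data_path, and the import path is the module location as of AllenSDK 2.x:

import json

from allensdk.api.cloud_cache.utils import file_hash_from_path

# hypothetical name standing in for self._downloaded_data_path
with open('downloaded_data.json', 'r') as in_file:
    lookup = json.load(in_file)

for local_path, recorded_hash in lookup.items():
    # re-hash each file and compare with the value recorded at cache time
    if file_hash_from_path(local_path) != recorded_hash:
        print(f"{local_path} changed since it was hashed")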
Code example #2
File: test_utils.py  Project: rgerkin/AllenSDK
import hashlib

import numpy as np

# `utils` is the AllenSDK module that provides file_hash_from_path
# (allensdk.api.cloud_cache.utils as of AllenSDK 2.x)
from allensdk.api.cloud_cache import utils


def test_file_hash_from_path(tmpdir):

    rng = np.random.RandomState(881)
    alphabet = list('abcdefghijklmnopqrstuvwxyz')
    fname = tmpdir / 'hash_dummy.txt'
    with open(fname, 'w') as out_file:
        for ii in range(10):
            out_file.write(''.join(rng.choice(alphabet, size=10)))
            out_file.write('\n')

    hasher = hashlib.blake2b()
    with open(fname, 'rb') as in_file:
        chunk = in_file.read(7)
        while len(chunk) > 0:
            hasher.update(chunk)
            chunk = in_file.read(7)

    ans = utils.file_hash_from_path(fname)
    assert ans == hasher.hexdigest()
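
This test pins down the contract of file_hash_from_path: the result must equal the blake2b hex digest of the file's bytes, independent of the chunk size used to read them (the test deliberately reads in chunks of 7 bytes). A minimal implementation consistent with that contract, as a sketch rather than the actual AllenSDK source:

import hashlib
import pathlib
from typing import Union


def file_hash_from_path(file_path: Union[str, pathlib.Path]) -> str:
    """Return the blake2b hex digest of the file at file_path,
    reading it in fixed-size chunks so that large files are never
    loaded into memory all at once."""
    hasher = hashlib.blake2b()
    with open(file_path, 'rb') as in_file:
        chunk = in_file.read(1024 * 1024)
        while len(chunk) > 0:
            hasher.update(chunk)
            chunk = in_file.read(1024 * 1024)
    return hasher.hexdigest()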
Code example #3
File: cloud_cache.py  Project: kedoxey/AllenSDK
    def _file_exists(self, file_attributes: CacheFileAttributes) -> bool:
        """
        Given a CacheFileAttributes describing a file, assess whether or
        not that file exists locally and is valid (i.e. has the expected
        file hash)

        Parameters
        ----------
        file_attributes: CacheFileAttributes
            Description of the file to look for

        Returns
        -------
        bool
            True if the file exists and is valid; False otherwise

        Raises
        -----
        RuntimeError
            If file_attributes.local_path exists but is not a file.
            It would be unclear how the cache should proceed in this case.
        """

        if not file_attributes.local_path.exists():
            return False
        if not file_attributes.local_path.is_file():
            raise RuntimeError(f"{file_attributes.local_path}\n"
                               "exists, but is not a file;\n"
                               "unsure how to proceed")

        full_path = file_attributes.local_path.resolve()
        test_checksum = file_hash_from_path(full_path)
        if test_checksum != file_attributes.file_hash:
            return False

        return True
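
The snippets on this page only ever read four attributes off CacheFileAttributes: url, version_id, file_hash, and local_path. For experimenting with these methods outside AllenSDK, a stand-in with just those fields is enough; this is an illustrative assumption, not the real class definition:

import pathlib
from dataclasses import dataclass


@dataclass
class CacheFileAttributesStandIn:
    # Illustrative stand-in; the real AllenSDK CacheFileAttributes
    # may have a different constructor and extra behavior
    url: str
    version_id: str
    file_hash: str
    local_path: pathlib.Path

With such an object, _file_exists reduces to three checks: the path exists, it is a regular file, and re-hashing it reproduces the recorded file_hash.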
Code example #4
File: cloud_cache.py  Project: matchings/AllenSDK
    def _download_file(self, file_attributes: CacheFileAttributes) -> bool:
        """
        Check if a file exists locally. If it does not, download it
        and return True. Return False otherwise.

        Parameters
        ----------
        file_attributes: CacheFileAttributes
            Describes the file to download

        Returns
        -------
        bool
            True if the file was downloaded; False otherwise

        Raises
        ------
        RuntimeError
            If the path to the directory where the file is to be saved
            points to something that is not a directory.

        RuntimeError
            If it is not able to successfully download the file after
            10 iterations
        """
        was_downloaded = False

        local_path = file_attributes.local_path

        local_dir = pathlib.Path(safe_system_path(str(local_path.parents[0])))

        # make sure Windows references to Allen Institute
        # local networked file system get handled correctly
        local_path = pathlib.Path(safe_system_path(str(local_path)))

        # using os here rather than pathlib because safe_system_path
        # returns a str
        os.makedirs(local_dir, exist_ok=True)
        if not os.path.isdir(local_dir):
            raise RuntimeError(f"{local_dir}\n" "is not a directory")

        bucket_name = bucket_name_from_url(file_attributes.url)
        obj_key = relative_path_from_url(file_attributes.url)

        n_iter = 0
        max_iter = 10  # maximum number of times to try download

        version_id = file_attributes.version_id

        pbar = None
        if not self._file_exists(file_attributes):
            response = self.s3_client.list_object_versions(Bucket=bucket_name,
                                                           Prefix=str(obj_key))
            object_info = [
                i for i in response["Versions"] if i["VersionId"] == version_id
            ][0]
            pbar = tqdm.tqdm(desc=object_info["Key"].split("/")[-1],
                             total=object_info["Size"],
                             unit_scale=True,
                             unit_divisor=1000.,
                             unit="MB")

        while not self._file_exists(file_attributes):
            was_downloaded = True
            response = self.s3_client.get_object(Bucket=bucket_name,
                                                 Key=str(obj_key),
                                                 VersionId=version_id)

            if 'Body' in response:
                with open(local_path, 'wb') as out_file:
                    for chunk in response['Body'].iter_chunks():
                        out_file.write(chunk)
                        pbar.update(len(chunk))

            # Verify the hash of the downloaded file; if it does not
            # match, remove the corrupt download so the loop tries again
            full_path = file_attributes.local_path.resolve()
            test_checksum = file_hash_from_path(full_path)
            if test_checksum != file_attributes.file_hash:
                if file_attributes.local_path.exists():
                    file_attributes.local_path.unlink()

            n_iter += 1
            if n_iter > max_iter:
                pbar.close()
                raise RuntimeError("Could not download\n"
                                   f"{file_attributes}\n"
                                   "In {max_iter} iterations")
        if pbar is not None:
            pbar.close()

        return was_downloaded
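
_download_file leans on two URL helpers, bucket_name_from_url and relative_path_from_url. Assuming virtual-hosted-style S3 URLs of the form http://{bucket}.s3.amazonaws.com/{key}, which is the form the fixture in example #5 generates, hedged sketches of the two helpers could look like this; the real AllenSDK implementations may accept more URL shapes:

from urllib.parse import urlparse


def bucket_name_from_url(url: str) -> str:
    # "my-bucket.s3.amazonaws.com" -> "my-bucket"
    netloc = urlparse(url).netloc
    return netloc.split('.s3')[0]


def relative_path_from_url(url: str) -> str:
    # everything after the host, without the leading slash,
    # is the object key
    return urlparse(url).path.lstrip('/')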
Code example #5
@pytest.fixture
def mounted_s3_dataset_fixture(tmp_path, request) -> Tuple[Path, str, dict]:
    """A fixture which simulates a project s3 bucket that has been mounted
    as a local directory.
    """

    # Get fixture parameters
    project_name = request.param.get("project_name", "test_project_name_1")
    dataset_version = request.param.get("dataset_version", "0.3.0")
    metadata_file_id_column_name = request.param.get(
        "metadata_file_id_column_name", "file_id"
    )
    metadata_files_contents = request.param.get(
        "metadata_files_contents",
        # Each item in list is a tuple of:
        # (metadata_filename, metadata_contents)
        [
            ("metadata_1.csv", {"mouse": [1, 2, 3], "sex": ["F", "F", "M"]}),
            (
                "metadata_2.csv",
                {
                    "experiment": [4, 5, 6],
                    metadata_file_id_column_name: ["data1", "data2", "data3"]
                }
            )
        ]
    )
    data_files_contents = request.param.get(
        "data_files_contents",
        # Each item in list is a tuple of:
        # (data_filename, data_contents)
        [
            ("data_1.nwb", "123456"),
            ("data_2.nwb", "abcdef"),
            ("data_3.nwb", "ghijkl")
        ]
    )

    # Create mock mounted s3 directory structure
    mock_mounted_base_dir = tmp_path / "mounted_remote_data"
    mock_mounted_base_dir.mkdir()
    mock_project_dir = mock_mounted_base_dir / project_name
    mock_project_dir.mkdir()

    # Create metadata files and manifest entries
    mock_metadata_dir = mock_project_dir / "project_metadata"
    mock_metadata_dir.mkdir()

    manifest_meta_entries = dict()
    for meta_fname, meta_contents in metadata_files_contents:
        meta_save_path = mock_metadata_dir / meta_fname
        df_to_save = pd.DataFrame(meta_contents)
        df_to_save.to_csv(str(meta_save_path), index=False)

        # Path(...).stem drops the ".csv" suffix; str.rstrip strips a
        # character set, not a suffix, and can mangle some filenames
        manifest_meta_entries[Path(meta_fname).stem] = {
            "url": (
                f"http://{project_name}.s3.amazonaws.com/{project_name}"
                f"/project_metadata/{meta_fname}"
            ),
            "version_id": "test_placeholder",
            "file_hash": file_hash_from_path(meta_save_path)
        }

    # Create data files and manifest entries
    mock_data_dir = mock_project_dir / "project_data"
    mock_data_dir.mkdir()

    manifest_data_entries = dict()
    for file_fname, file_contents in data_files_contents:
        file_save_path = mock_data_dir / file_fname
        with file_save_path.open('w') as f:
            f.write(file_contents)

        # as above, use Path(...).stem rather than str.rstrip
        manifest_data_entries[Path(file_fname).stem] = {
            "url": (
                f"http://{project_name}.s3.amazonaws.com/{project_name}"
                f"/project_data/{file_fname}"
            ),
            "version_id": "test_placeholder",
            "file_hash": file_hash_from_path(file_save_path)
        }

    # Create manifest dir and manifest
    mock_manifests_dir = mock_project_dir / "manifests"
    mock_manifests_dir.mkdir()
    manifest_fname = f"test_manifest_v{dataset_version}.json"
    manifest_path = mock_manifests_dir / manifest_fname

    manifest_contents = {
        "project_name": project_name,
        "manifest_version": dataset_version,
        "data_pipeline": [
            {
                "name": "AllenSDK",
                "version": "2.11.0",
                "comment": "This is a test entry. NOT REAL."
            }
        ],
        "metadata_file_id_column_name": metadata_file_id_column_name,
        "metadata_files": manifest_meta_entries,
        "data_files": manifest_data_entries
    }

    with manifest_path.open('w') as f:
        json.dump(manifest_contents, f, indent=4)

    expected = {
        "expected_metadata": metadata_files_contents,
        "expected_data": data_files_contents
    }

    return mock_mounted_base_dir, project_name, expected
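
Because the fixture pulls its configuration from request.param, tests opt in through indirect parametrization. A hedged usage sketch; the test name and parameter values below are hypothetical:

import pytest


@pytest.mark.parametrize(
    "mounted_s3_dataset_fixture",
    [{"project_name": "demo_project", "dataset_version": "0.3.0"}],
    indirect=True,
)
def test_manifest_layout(mounted_s3_dataset_fixture):
    base_dir, project_name, expected = mounted_s3_dataset_fixture
    # the fixture names manifests "test_manifest_v{dataset_version}.json"
    manifest = (base_dir / project_name / "manifests"
                / "test_manifest_v0.3.0.json")
    assert manifest.exists()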