Esempio n. 1
0
 def _write_output_manifest(self):
     """
     Write an annotated copy of the manifest into the current directory.

     The copy is named after the base name of ``self.manifest``. A ``file_path`` column is appended to the
     header when it is not already present, and every row gets its ``file_path`` value computed from the row's
     ``file_sha256`` and ``self.download_dir``. If a file with the target name already exists in the current
     directory it is overwritten and a warning is logged.
     """
     target_name = os.path.basename(self.manifest)
     columns, rows = self._parse_manifest(self.manifest)
     if 'file_path' not in columns:
         columns.append('file_path')
     with atomic_write(target_name, overwrite=True, newline='') as handle:
         tsv_writer = tsv.DictWriter(handle, columns)
         tsv_writer.writeheader()
         for entry in rows:
             sha256 = entry['file_sha256']
             entry['file_path'] = self._file_path(sha256, self.download_dir)
             tsv_writer.writerow(entry)
         # Checked before leaving the context manager, once all rows have
         # been written successfully.
         target_exists = os.path.isfile(target_name)
         if target_exists:
             logger.warning('Overwriting manifest %s', target_name)
     logger.info(
         'Rewrote manifest %s with additional column containing path to downloaded files.',
         target_name)
Esempio n. 2
0
    def download_collection(self,
                            uuid,
                            replica,
                            version=None,
                            download_dir=''):
        """
        Download a collection and save it to the local filesystem as a directory.

        :param str uuid: The uuid of the collection to download
        :param str replica: the replica to download from. The supported
            replicas are: `aws` for Amazon Web Services, and `gcp` for
            Google Cloud Platform. [aws, gcp]
        :param str version: The version to download, else if not specified,
            download the latest. The version is a timestamp of collection
            creation in RFC3339
        :param str download_dir: The directory into which to download

        The collection is serialized into a temporary TSV manifest, which is
        then handed to `download_manifest` using the `bundle` layout. The
        temporary manifest is removed afterwards, whether or not the download
        succeeds.
        """
        # Function-scope import so this method does not depend on the
        # module-level import block.
        import os

        collection = self._serialize_col_to_manifest(uuid, replica, version)
        # Text mode `w` (the default is `w+b`) because the TSV writer emits
        # strings; `newline=''` lets the writer control line endings itself.
        # `delete=False` because the manifest is re-opened by name below, and
        # a NamedTemporaryFile that is still open cannot be opened a second
        # time on every platform (notably Windows). The file is therefore
        # closed first and removed explicitly in the `finally` clause.
        manifest = tempfile.NamedTemporaryFile(mode='w',
                                               newline='',
                                               delete=False)
        try:
            with manifest:
                writer = tsv.DictWriter(manifest,
                                        fieldnames=('bundle_uuid',
                                                    'bundle_version',
                                                    'file_name',
                                                    'file_sha256',
                                                    'file_uuid',
                                                    'file_version',
                                                    'file_size'))
                writer.writeheader()
                writer.writerows(collection)
            # Leaving the `with` block closed (and thereby flushed) the file,
            # so the complete manifest is on disk before it is read back.
            self.download_manifest(manifest=manifest.name,
                                   replica=replica,
                                   download_dir=download_dir,
                                   layout='bundle')
        finally:
            os.remove(manifest.name)
Esempio n. 3
0
    def test_python_manifest_download(self):
        """
        Upload the test bundle, then download it through a hand-written
        manifest and verify which files end up on disk.

        The manifest lists only the first non-indexed file of the bundle. The
        test runs twice as sub-tests: once with that single row, and once with
        an additional row that references a random bundle UUID. Only the
        second variant may raise ``RuntimeError``, and its message must report
        exactly one failed download task. In both variants the indexed files
        and the listed data file must match their uploaded originals, while
        the remaining files must not have been downloaded.
        """
        source_dir = os.path.join(TEST_DIR, "res", "bundle")
        expected_names = set(os.listdir(source_dir))

        upload_result = self.client.upload(src_dir=source_dir,
                                           replica="aws",
                                           staging_bucket=self.staging_bucket)
        file_entries = upload_result['files']
        uploaded_names = {entry['name'] for entry in file_entries}
        self.assertEqual(uploaded_names, expected_names)

        # Work around https://github.com/HumanCellAtlas/data-store/issues/1331
        for entry in file_entries:
            entry['indexed'] = entry['name'].endswith('.json')

        bundle_uuid = upload_result['bundle_uuid']
        bundle_version = upload_result['version']
        bundle_fqid = bundle_uuid + '.' + bundle_version
        data_files = tuple(entry['name'] for entry in file_entries
                           if not entry['indexed'])

        columns = ('bundle_uuid', 'bundle_version', 'file_name', 'file_sha256')
        sha256 = ('9b4c0dde8683f924975d0867903dc7a9'
                  '67f46bee5c0a025c451b9ba73e43f120')

        def manifest_row(row_bundle_uuid):
            # One manifest line for the first data file under the given bundle.
            return dict(bundle_uuid=row_bundle_uuid,
                        bundle_version=bundle_version,
                        file_name=data_files[0],
                        file_sha256=sha256)

        for bad_bundle in (False, True):
            with self.subTest(bad_bundle=bad_bundle), \
                    tempfile.TemporaryDirectory() as work_dir:
                original_cwd = os.getcwd()
                os.chdir(work_dir)
                try:
                    with open('manifest.tsv', 'w', newline='') as manifest_file:
                        writer = tsv.DictWriter(manifest_file,
                                                fieldnames=columns)
                        writer.writeheader()
                        writer.writerow(manifest_row(bundle_uuid))
                        if bad_bundle:
                            # A bundle UUID that was never uploaded.
                            writer.writerow(manifest_row(str(uuid.uuid4())))

                    dest_dir = os.path.join(work_dir, bundle_fqid)
                    try:
                        self.client.download_manifest('manifest.tsv',
                                                      replica="aws",
                                                      layout='bundle')
                    except RuntimeError as e:
                        self.assertTrue(
                            bad_bundle,
                            "Should only raise with a bad bundle in the manifest"
                        )
                        self.assertEqual('1 download task(s) failed.',
                                         e.args[0])
                    else:
                        self.assertFalse(bad_bundle)

                    for entry in file_entries:
                        name = entry['name']
                        uploaded_file = os.path.join(source_dir, name)
                        downloaded_file = os.path.join(dest_dir, name)
                        expect_download = (entry['indexed']
                                           or name == data_files[0])
                        if not expect_download:
                            # Files not in the manifest stay upload-only.
                            self.assertTrue(os.path.exists(uploaded_file))
                            self.assertFalse(os.path.exists(downloaded_file))
                            continue
                        self.assertTrue(
                            filecmp.cmp(uploaded_file, downloaded_file, False))
                finally:
                    # Restore the working directory before the temporary
                    # directory is cleaned up.
                    os.chdir(original_cwd)
Esempio n. 4
0
import json
import pprint
from get_bundle_api import fetch_bundle, save_bundle, BUNDLE_JSON

# Create the DSS client instance used by this script.
dss = DSSClient()

# Fetch and save the bundle only when BUNDLE_JSON does not already exist as a
# file. Presumably save_bundle() writes the fetched bundle to BUNDLE_JSON --
# verify against get_bundle_api.
if not os.path.isfile(BUNDLE_JSON):
    bundle = fetch_bundle()
    save_bundle(bundle)

with open("manifest.tsv", "w", newline='') as manifest:
    writer = tsv.DictWriter(manifest,
                            fieldnames=(
                                "bundle_uuid",
                                "bundle_version",
                                "file_name",
                                "file_uuid",
                                "file_version",
                                "file_sha256",
                                "file_size",
                            ))
    writer.writeheader()

    with open(BUNDLE_JSON, "w") as jsonfile:
        try:
            data = json.load(jsonfile)
            bundle_uuid, bundle_version = (
                data["bundle"]["uuid"],
                data["bundle"]["version"],
            )
            pprint.pprint(data)
            for content in data["bundle"]["files"]: