def _write_output_manifest(self):
    """
    Write a copy of the manifest, extended with a ``file_path`` column, to
    the current directory.

    The copy is named after the base name of ``self.manifest``. For every
    row, ``file_path`` is set to the result of ``self._file_path`` called
    with the row's ``file_sha256`` and ``self.download_dir``. If a file with
    that name already exists in the current directory (for example because
    the original manifest lives there), it is overwritten with a warning.
    """
    output = os.path.basename(self.manifest)
    fieldnames, source_manifest = self._parse_manifest(self.manifest)
    if 'file_path' not in fieldnames:
        fieldnames.append('file_path')

    # Decide whether we are overwriting *before* anything is written. Doing
    # the check after (or while) writing would make the warning depend on
    # when exactly `atomic_write` materializes the destination file.
    if os.path.isfile(output):
        logger.warning('Overwriting manifest %s', output)

    with atomic_write(output, overwrite=True, newline='') as f:
        writer = tsv.DictWriter(f, fieldnames)
        writer.writeheader()
        for row in source_manifest:
            row['file_path'] = self._file_path(row['file_sha256'], self.download_dir)
            writer.writerow(row)

    logger.info(
        'Rewrote manifest %s with additional column containing path to downloaded files.',
        output)
def download_collection(self, uuid, replica, version=None, download_dir=''):
    """
    Download a collection and save it to the local filesystem as a directory.

    The collection is first serialized into a temporary TSV manifest, which
    is then handed to ``download_manifest`` using the ``bundle`` layout.

    :param str uuid: The uuid of the collection to download
    :param str replica: the replica to download from. The supported replicas
        are: `aws` for Amazon Web Services, and `gcp` for Google Cloud
        Platform. [aws, gcp]
    :param str version: The version to download, else if not specified,
        download the latest. The version is a timestamp in RFC3339
    :param str download_dir: The directory into which to download
    """
    collection = self._serialize_col_to_manifest(uuid, replica, version)
    # Explicitly declare mode `w` (default `w+b`) for Python 3 string compat.
    # `newline=''` disables newline translation so that the writer alone
    # controls the row terminators, as recommended for csv-style writers.
    with tempfile.NamedTemporaryFile(mode='w', newline='') as manifest:
        writer = tsv.DictWriter(manifest,
                                fieldnames=('bundle_uuid',
                                            'bundle_version',
                                            'file_name',
                                            'file_sha256',
                                            'file_uuid',
                                            'file_version',
                                            'file_size'))
        writer.writeheader()
        writer.writerows(collection)
        # Flushing the I/O buffer here is preferable to closing the file
        # handle and deleting the temporary file later because within the
        # context manager there is a guarantee that the temporary file
        # will be deleted when we are done
        manifest.flush()
        self.download_manifest(manifest=manifest.name,
                               replica=replica,
                               download_dir=download_dir,
                               layout='bundle')
def test_python_manifest_download(self):
    """
    Upload the test bundle, then download it again through a hand-written
    manifest using the ``bundle`` layout.

    Two scenarios run as sub-tests. In the first, the manifest lists a single
    data file of the uploaded bundle and the download must succeed. In the
    second, an extra row with a random bundle UUID is appended and the
    download must raise ``RuntimeError`` reporting exactly one failed task.
    In both scenarios the downloaded bundle directory is compared against
    the uploaded files afterwards.
    """
    sha256 = ('9b4c0dde8683f924975d0867903dc7a9'
              '67f46bee5c0a025c451b9ba73e43f120')
    columns = ('bundle_uuid', 'bundle_version', 'file_name', 'file_sha256')

    bundle_path = os.path.join(TEST_DIR, "res", "bundle")
    expected_names = set(os.listdir(bundle_path))

    manifest = self.client.upload(src_dir=bundle_path,
                                  replica="aws",
                                  staging_bucket=self.staging_bucket)
    manifest_files = manifest['files']
    self.assertEqual({entry['name'] for entry in manifest_files}, expected_names)

    # Work around https://github.com/HumanCellAtlas/data-store/issues/1331
    for entry in manifest_files:
        entry['indexed'] = entry['name'].endswith('.json')

    bundle_uuid = manifest['bundle_uuid']
    bundle_version = manifest['version']
    bundle_fqid = '.'.join((bundle_uuid, bundle_version))
    data_files = tuple(entry['name'] for entry in manifest_files if not entry['indexed'])

    def manifest_row(row_bundle_uuid):
        # One manifest line referencing the first data file under the given bundle UUID.
        return {
            'bundle_uuid': row_bundle_uuid,
            'bundle_version': bundle_version,
            'file_name': data_files[0],
            'file_sha256': sha256,
        }

    for bad_bundle in (False, True):
        with self.subTest(bad_bundle=bad_bundle):
            with tempfile.TemporaryDirectory() as work_dir:
                # The manifest is written to, and downloaded relative to, the
                # working directory, so switch into the scratch directory
                # and always switch back afterwards.
                previous_dir = os.getcwd()
                os.chdir(work_dir)
                try:
                    with open('manifest.tsv', 'w', newline='') as manifest_file:
                        writer = tsv.DictWriter(manifest_file, fieldnames=columns)
                        writer.writeheader()
                        writer.writerow(manifest_row(bundle_uuid))
                        if bad_bundle:
                            writer.writerow(manifest_row(str(uuid.uuid4())))

                    dest_dir = os.path.join(work_dir, bundle_fqid)
                    try:
                        self.client.download_manifest('manifest.tsv',
                                                      replica="aws",
                                                      layout='bundle')
                    except RuntimeError as e:
                        self.assertTrue(
                            bad_bundle,
                            "Should only raise with a bad bundle in the manifest"
                        )
                        self.assertEqual('1 download task(s) failed.', e.args[0])
                    else:
                        self.assertFalse(bad_bundle)

                    for entry in manifest_files:
                        name = entry['name']
                        uploaded_file = os.path.join(bundle_path, name)
                        downloaded_file = os.path.join(dest_dir, name)
                        expect_download = entry['indexed'] or name == data_files[0]
                        if not expect_download:
                            # Data files not listed in the manifest must not be fetched.
                            self.assertTrue(os.path.exists(uploaded_file))
                            self.assertFalse(os.path.exists(downloaded_file))
                            continue
                        self.assertTrue(
                            filecmp.cmp(uploaded_file, downloaded_file, False))
                finally:
                    os.chdir(previous_dir)
import json import pprint from get_bundle_api import fetch_bundle, save_bundle, BUNDLE_JSON dss = DSSClient() if not os.path.isfile(BUNDLE_JSON): bundle = fetch_bundle() save_bundle(bundle) with open("manifest.tsv", "w", newline='') as manifest: writer = tsv.DictWriter(manifest, fieldnames=( "bundle_uuid", "bundle_version", "file_name", "file_uuid", "file_version", "file_sha256", "file_size", )) writer.writeheader() with open(BUNDLE_JSON, "w") as jsonfile: try: data = json.load(jsonfile) bundle_uuid, bundle_version = ( data["bundle"]["uuid"], data["bundle"]["version"], ) pprint.pprint(data) for content in data["bundle"]["files"]: