def test_generate_ro_manifest_update(self):
    """Verify that regenerating the RO manifest in update mode keeps bundledAs URIs.

    The test creates a bag, generates the RO manifest with ``overwrite=True``,
    records every aggregate by its ``uri``, adds one extra file entry, writes
    the manifest back, and regenerates it with ``overwrite=False``. Every
    aggregate that was present in the first snapshot must still carry the same
    ``bundledAs`` URI afterwards. Any exception (including a failed assertion)
    is reported through ``self.fail``.
    """
    logger.info(self.getTestHeader(
        'create bag with auto-generation of RO manifest in update mode'))
    try:
        fetch_manifest = ospj(self.test_config_dir, 'test-fetch-manifest.json')
        bdb.make_bag(self.test_data_dir,
                     algs=['md5', 'sha1', 'sha256', 'sha512'],
                     remote_file_manifest=fetch_manifest)

        # First pass: build the manifest from scratch and snapshot it by URI.
        bdb.generate_ro_manifest(self.test_data_dir, overwrite=True)
        manifest = bdbro.read_bag_ro_metadata(self.test_data_dir)
        baseline = {agg["uri"]: agg for agg in manifest.get("aggregates", [])}

        # Add an extra entry by hand and persist it before the update pass.
        bdbro.add_file_metadata(manifest,
                                local_path="../data/FAKE.txt",
                                bundled_as=bdbro.make_bundled_as())
        bdbro.write_bag_ro_metadata(manifest, self.test_data_dir)

        # Second pass: update the existing manifest instead of replacing it.
        bdb.generate_ro_manifest(self.test_data_dir, overwrite=False)
        manifest = bdbro.read_bag_ro_metadata(self.test_data_dir)
        for agg in manifest.get("aggregates", []):
            uri = agg["uri"]
            if uri not in baseline:
                continue
            self.assertTrue(
                agg["bundledAs"]["uri"] == baseline[uri]["bundledAs"]["uri"])
    except Exception as e:
        self.fail(get_typed_exception(e))
def downloadFiles(self, input_manifest):
    """Download every file listed in a JSON-stream download manifest.

    Each line of ``input_manifest`` is parsed as one JSON object. The entry
    must provide ``url``; ``filename`` is optional. When no filename is
    given, it is taken from the ``Content-Disposition`` header of a HEAD
    request (store-backed URLs only) or, failing that, from the last path
    component of the URL.

    :param input_manifest: path of the manifest file to read; the file is
        always removed when this method finishes, even on error.
    :return: list of output paths (``subdir/filename`` or just ``filename``)
        in manifest order.
    :raises RuntimeError: if an entry has no ``url`` or the size of the
        downloaded file differs from the ``Content-Length`` header.
    """
    logging.info("Retrieving file(s)...")
    try:
        with open(input_manifest, "r") as in_file:
            file_list = list()
            for line in in_file:
                entry = json.loads(line)
                url = entry.get('url')
                if not url:
                    raise RuntimeError(
                        "Missing required attribute \"url\" in download manifest entry %s" %
                        json.dumps(entry))
                store = self.getHatracStore(url)
                filename = entry.get('filename')
                # The sub-path template may reference any key of the entry.
                envvars = self.envars.copy()
                envvars.update(entry)
                subdir = self.sub_path.format(**envvars)
                if not filename:
                    if store:
                        head = store.head(url, headers=self.HEADERS)
                        content_disposition = \
                            head.headers.get("Content-Disposition") if head.ok else None
                        if content_disposition:
                            filename = parse_content_disposition(content_disposition)
                        else:
                            # Fall back to the URL itself (filename is empty here);
                            # anything after the first ":" in the last path
                            # component is dropped.
                            filename = os.path.basename(url).split(":")[0]
                    else:
                        filename = os.path.basename(url)
                file_path = os.path.abspath(os.path.join(
                    self.base_path, 'data' if self.is_bag else '', subdir, filename))
                output_dir = os.path.dirname(file_path)
                self.makeDirs(output_dir)
                if store:
                    resp = store.get_obj(url, self.HEADERS, file_path)
                    url = self.getExternalUrl(url)
                else:
                    url = self.getExternalUrl(url)
                    file_path, resp = self.getExternalFile(url, file_path, self.HEADERS)
                length = int(resp.headers.get('Content-Length'))
                content_type = resp.headers.get("Content-Type")
                file_bytes = os.path.getsize(file_path)
                if length != file_bytes:
                    # Actual on-disk size first, then the size the server announced.
                    raise RuntimeError(
                        "File size of %s does not match expected size of %s for file %s" %
                        (file_bytes, length, file_path))
                output_path = ''.join([subdir, "/", filename]) if subdir else filename
                if self.ro_manifest:
                    ro.add_file_metadata(self.ro_manifest,
                                         source_url=url,
                                         local_path=output_path,
                                         media_type=content_type,
                                         retrieved_on=ro.make_retrieved_on(),
                                         retrieved_by=ro.make_retrieved_by(
                                             self.ro_author_name, orcid=self.ro_author_orcid),
                                         bundled_as=ro.make_bundled_as())
                file_list.append(output_path)
            return file_list
    finally:
        os.remove(input_manifest)
def generate_ro_manifest(bag_path, overwrite=False, config_file=DEFAULT_CONFIG_FILE):
    """Create or update ``metadata/manifest.json`` for the bag at ``bag_path``.

    If the manifest already exists and ``overwrite`` is false, the existing
    manifest is read and updated; otherwise a fresh manifest is initialised
    from the bag's ``Contact-Name`` / ``Contact-Orcid`` info fields. One entry
    is then added (or updated) for every fetch entry and every payload file of
    the bag, and the manifest is written back.

    :param bag_path: path of the bag directory.
    :param overwrite: when true, replace an existing manifest instead of
        updating it.
    :param config_file: configuration file from which the identifier
        resolver list is read.
    """
    bag = bdbagit.BDBag(bag_path)
    bag_ro_metadata_path = os.path.abspath(
        os.path.join(bag_path, "metadata", "manifest.json"))
    exists = os.path.isfile(bag_ro_metadata_path)
    if exists and not overwrite:
        logger.info("Auto-generating RO manifest: update existing file.")
        ro_metadata = bdbro.read_bag_ro_metadata(bag_path)
    else:
        # The conditional must be parenthesised: "%" binds tighter than
        # "if/else", which previously dropped the message prefix.
        logger.info(
            "Auto-generating RO manifest: %s.",
            "creating new file" if not exists else "overwrite existing file")
        ro_metadata = bdbro.init_ro_manifest(
            author_name=bag.info.get("Contact-Name"),
            author_orcid=bag.info.get("Contact-Orcid"),
            creator_name=bdbro.BAG_CREATOR_NAME,
            creator_uri=bdbro.BAG_CREATOR_URI)

    config = read_config(config_file)
    resolvers = config.get(
        ID_RESOLVER_TAG, DEFAULT_ID_RESOLVERS) if config else DEFAULT_ID_RESOLVERS
    fetched = bag.fetch_entries()
    local = bag.payload_files()

    for url, _length, filename in fetched:
        if url.startswith(("minid:", "ark:")):
            # Turn identifier schemes into an HTTP URL via the first resolver.
            url = "".join(["http://", resolvers[0], "/", url])
        bdbro.add_file_metadata(ro_metadata,
                                source_url=url,
                                bundled_as=bdbro.make_bundled_as(
                                    folder=os.path.dirname(filename),
                                    filename=os.path.basename(filename)),
                                update_existing=True)
    for path in local:
        # Manifest paths always use forward slashes.
        bdbro.add_file_metadata(ro_metadata,
                                local_path=path.replace("\\", "/"),
                                bundled_as=bdbro.make_bundled_as(),
                                update_existing=True)

    bdbro.write_bag_ro_metadata(ro_metadata, bag_path)

    profile = bag.info.get(BAG_PROFILE_TAG)
    if profile == BDBAG_PROFILE_ID:
        bag.info.update({BAG_PROFILE_TAG: BDBAG_RO_PROFILE_ID})
    # NOTE(review): saved unconditionally so the bag reflects the manifest
    # written above, not only a profile change -- confirm against upstream.
    bag.save()
def process(self):
    """Run the catalog query and register its output.

    The query is issued with the processor's headers plus an ``accept``
    header set to ``self.content_type``. When both an RO manifest and file
    provenance are enabled, a metadata entry describing the output is added
    to the manifest. The return value of ``catalogQuery`` is not used here.

    :return: ``self.outputs`` after it has been updated with an entry for
        ``self.output_relpath``.
    """
    # Work on a copy so the 'accept' header does not leak into the shared
    # HEADERS mapping used by other requests.
    headers = self.HEADERS.copy()
    headers.update({'accept': self.content_type})
    self.catalogQuery(headers)

    if self.ro_manifest and self.ro_file_provenance:
        ro.add_file_metadata(self.ro_manifest,
                             source_url=self.url,
                             local_path=self.output_relpath,
                             media_type=self.content_type,
                             retrieved_on=ro.make_retrieved_on(),
                             retrieved_by=ro.make_retrieved_by(
                                 self.ro_author_name, orcid=self.ro_author_orcid),
                             bundled_as=ro.make_bundled_as())

    self.outputs.update({
        self.output_relpath: {
            LOCAL_PATH_KEY: self.output_abspath,
            SOURCE_URL_KEY: self.url,
        }
    })
    return self.outputs
def createRemoteFileManifest(self):
    """Convert the JSON-stream query output into remote-file-manifest lines.

    Every line of ``self.output_abspath`` is parsed as JSON, passed through
    ``self.createManifestEntry`` and appended as one JSON line to the file
    named by ``self.args["remote_file_manifest"]``. When an RO manifest is
    active, a metadata entry is added for each converted line as well. The
    input file is deleted once it has been consumed.

    :return: path of the remote file manifest relative to ``self.base_path``.
    """
    logging.info("Creating remote file manifest")
    source_path = self.output_abspath
    target_path = self.args.get("remote_file_manifest")

    with open(source_path, "r") as reader, open(target_path, "a") as writer:
        for raw in reader:
            # Derive the bdbag remote file manifest fields from this
            # json-stream record.
            entry = self.createManifestEntry(json.loads(raw))
            writer.write(json.dumps(entry) + "\n")
            if not self.ro_manifest:
                continue
            ro.add_file_metadata(
                self.ro_manifest,
                source_url=entry["url"],
                media_type=entry.get("content_type"),
                bundled_as=ro.make_bundled_as(
                    folder=os.path.dirname(entry["filename"]),
                    filename=os.path.basename(entry["filename"])))

    os.remove(source_path)
    return os.path.relpath(target_path, self.base_path)
def process(self):
    """Register the processor's output file and return the outputs mapping.

    When both an RO manifest and file provenance are enabled, a metadata
    entry for the output is added to the manifest, with its media type
    guessed from ``self.output_abspath``. If ``self.delete_input`` is set,
    the input is removed via ``self._delete_input()``.

    :return: ``self.outputs`` after it has been updated with an entry for
        ``self.output_relpath``.
    """
    wants_provenance = self.ro_manifest and self.ro_file_provenance
    if wants_provenance:
        ro.add_file_metadata(
            self.ro_manifest,
            source_url=self.url,
            local_path=self.output_relpath,
            media_type=guess_content_type(self.output_abspath),
            retrieved_on=ro.make_retrieved_on(),
            retrieved_by=ro.make_retrieved_by(
                self.ro_author_name, orcid=self.ro_author_orcid),
            bundled_as=ro.make_bundled_as())

    if self.delete_input:
        self._delete_input()

    output_record = {
        LOCAL_PATH_KEY: self.output_abspath,
        SOURCE_URL_KEY: self.url,
    }
    self.outputs.update({self.output_relpath: output_record})
    return self.outputs
def downloadFiles(self, input_manifest):
    """Download every file listed in a JSON-stream download manifest.

    Each line of ``input_manifest`` is parsed as one JSON object. Entries
    without a ``url`` are skipped with a warning. ``filename`` is optional;
    when it is missing, it is taken from the ``Content-Disposition`` header
    of a HEAD request (store-backed URLs only) or, failing that, from the
    last path component of the URL.

    :param input_manifest: path of the manifest file to read; the file is
        always removed when this method finishes, even on error.
    :return: dict mapping each output path (``subdir/filename`` or just
        ``filename``) to ``{LOCAL_PATH_KEY: <absolute file path>}``.
    :raises DerivaDownloadError: if a store HEAD/GET request fails with an
        HTTP error, or the size of the downloaded file differs from the
        ``Content-Length`` header.
    """
    logging.info(
        "Attempting to download file(s) based on the results of query: %s" % self.query)
    try:
        with open(input_manifest, "r") as in_file:
            file_list = dict()
            for line in in_file:
                entry = json.loads(line)
                url = entry.get('url')
                if not url:
                    logging.warning(
                        "Skipping download due to missing required attribute \"url\" in download manifest entry %s" %
                        json.dumps(entry))
                    continue
                store = self.getHatracStore(url)
                filename = entry.get('filename')
                # The sub-path template may reference any key of the entry.
                envvars = self.envars.copy()
                envvars.update(entry)
                subdir = self.sub_path.format(**envvars)
                if not filename:
                    if store:
                        try:
                            head = store.head(url, headers=self.HEADERS)
                        except requests.HTTPError as e:
                            raise DerivaDownloadError(
                                "HEAD request for [%s] failed: %s" % (url, e))
                        content_disposition = \
                            head.headers.get("Content-Disposition") if head.ok else None
                        if content_disposition:
                            filename = parse_content_disposition(content_disposition)
                        else:
                            # Fall back to the URL itself (filename is empty here);
                            # anything after the first ":" in the last path
                            # component is dropped.
                            filename = os.path.basename(url).split(":")[0]
                    else:
                        filename = os.path.basename(url)
                file_path = os.path.abspath(
                    os.path.join(self.base_path, 'data' if self.is_bag else '',
                                 subdir, filename))
                output_dir = os.path.dirname(file_path)
                make_dirs(output_dir)
                if store:
                    try:
                        resp = store.get_obj(url, self.HEADERS, file_path)
                    except requests.HTTPError as e:
                        raise DerivaDownloadError(
                            "File [%s] transfer failed: %s" % (file_path, e))
                    length = int(resp.headers.get('Content-Length'))
                    content_type = resp.headers.get("Content-Type")
                    url = self.getExternalUrl(url)
                else:
                    url = self.getExternalUrl(url)
                    file_path, resp = self.getExternalFile(
                        url, file_path, self.HEADERS)
                    length = int(resp.headers.get('Content-Length'))
                    content_type = resp.headers.get("Content-Type")
                file_bytes = os.path.getsize(file_path)
                if length != file_bytes:
                    # Actual on-disk size first, then the size the server announced.
                    raise DerivaDownloadError(
                        "File size of %s does not match expected size of %s for file %s" %
                        (file_bytes, length, file_path))
                output_path = ''.join([subdir, "/", filename]) if subdir else filename
                if self.ro_manifest:
                    ro.add_file_metadata(
                        self.ro_manifest,
                        source_url=url,
                        local_path=output_path,
                        media_type=content_type,
                        retrieved_on=ro.make_retrieved_on(),
                        retrieved_by=ro.make_retrieved_by(
                            self.ro_author_name, orcid=self.ro_author_orcid),
                        bundled_as=ro.make_bundled_as())
                file_list.update({output_path: {LOCAL_PATH_KEY: file_path}})
            return file_list
    finally:
        os.remove(input_manifest)