def test_generate_ro_manifest_update(self):
    logger.info(self.getTestHeader(
        'create bag with auto-generation of RO manifest in update mode'))
    try:
        bdb.make_bag(self.test_data_dir,
                     algs=['md5', 'sha1', 'sha256', 'sha512'],
                     remote_file_manifest=ospj(self.test_config_dir,
                                               'test-fetch-manifest.json'))
        bdb.generate_ro_manifest(self.test_data_dir, overwrite=True)
        ro = bdbro.read_bag_ro_metadata(self.test_data_dir)
        old_agg_dict = dict()
        for entry in ro.get("aggregates", []):
            old_agg_dict[entry["uri"]] = entry
        bdbro.add_file_metadata(ro,
                                local_path="../data/FAKE.txt",
                                bundled_as=bdbro.make_bundled_as())
        bdbro.write_bag_ro_metadata(ro, self.test_data_dir)
        bdb.generate_ro_manifest(self.test_data_dir, overwrite=False)
        ro = bdbro.read_bag_ro_metadata(self.test_data_dir)
        for entry in ro.get("aggregates", []):
            if entry["uri"] in old_agg_dict:
                self.assertEqual(
                    entry["bundledAs"]["uri"],
                    old_agg_dict[entry["uri"]]["bundledAs"]["uri"])
    except Exception as e:
        self.fail(get_typed_exception(e))
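
# Illustrative usage sketch (not from the test suite): the update-mode
# behavior asserted above, shown as a standalone script. Assumes bdbag is
# installed, that a bag already exists at the hypothetical path "./my-bag",
# and that the import aliases match those used in these tests.
from bdbag import bdbag_api as bdb
from bdbag import bdbag_ro as bdbro

bag_path = "./my-bag"  # hypothetical bag location

# Hand-edit the RO manifest, then regenerate with overwrite=False; the
# regeneration merges new entries and preserves the manual edits.
ro = bdbro.read_bag_ro_metadata(bag_path)
bdbro.add_file_metadata(ro, local_path="../data/EXTRA.txt",
                        bundled_as=bdbro.make_bundled_as())
bdbro.write_bag_ro_metadata(ro, bag_path)
bdb.generate_ro_manifest(bag_path, overwrite=False)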
def main(argv):
    parser = argparse.ArgumentParser(
        description='Program to create a BDBag containing a set of Minids for remote content')
    parser.add_argument('-m', '--minids', metavar='<minid file>', required=True,
                        help='File listing Minids for new bag')
    parser.add_argument('-b', '--bagname', metavar='<bag name>', required=True,
                        help='Name of directory for new bag.')
    parser.add_argument('-v', '--verify', action='store_true', required=False,
                        help='Validate bag after building it.')
    parser.add_argument('-q', '--quiet', action="store_true",
                        help="Suppress logging output.")
    parser.add_argument('-d', '--debug', action="store_true",
                        help="Enable debug logging output.")
    parser.add_argument('-n', '--author-name', metavar="<person or entity name>",
                        help="Optional name of the person or entity responsible for the creation "
                             "of this bag, for inclusion in the bag metadata.")
    parser.add_argument('-o', '--author-orcid', metavar="<orcid>",
                        help="Optional ORCID identifier of the bag creator, for inclusion in the "
                             "bag metadata.")
    args = parser.parse_args()

    bdb.configure_logging(level=logging.ERROR if args.quiet
                          else (logging.DEBUG if args.debug else logging.INFO))

    # Create the directory that will hold the new BDBag
    bdb.ensure_bag_path_exists(args.bagname)

    # For each supplied Minid, fetch the sub-bag to determine its properties
    minid_fields = extract_fields(args.minids)

    # Create 'README' file in the newly created bag directory
    # (moved to 'data' when the bag is created)
    write_readme(args.bagname, minid_fields)

    # Create the remote file manifest to be used by make_bag
    working_dir = tempfile.mkdtemp(prefix='encode2bag_')
    remote_file_manifest_file = osp.abspath(osp.join(working_dir, 'remote-file-manifest.json'))
    generate_remote_manifest_file(minid_fields, remote_file_manifest_file)

    # Create the new bag based on the supplied remote manifest file
    bag = bdb.make_bag(args.bagname,
                       algs=['md5', 'sha256'],
                       remote_file_manifest=remote_file_manifest_file)

    # Create metadata/manifest.json file with a Research Object JSON object
    ro_manifest = ro.init_ro_manifest(
        author_name=args.author_name,
        author_orcid=args.author_orcid,
        creator_name='bagofbags using BDBag version: %s (Bagit version: %s)' % (VERSION, BAGIT_VERSION),
        creator_uri='https://github.com/fair-research/bdbag/examples/bagofbags/')
    add_remote_file_manifest_to_ro(ro_manifest, minid_fields)
    ro.write_bag_ro_metadata(ro_manifest, args.bagname, 'manifest.json')

    # Run make_bag again to include manifest.json in the checksums, etc.
    bdb.make_bag(args.bagname, update=True)

    if args.verify:
        bdb.resolve_fetch(args.bagname, force=True)
        bdb.validate_bag(args.bagname, fast=False, callback=None)
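
# Example invocation (hypothetical file and bag names). The Minid file given
# to -m is assumed here to list one identifier per line, e.g.:
#
#   minid:b9j01d
#   minid:b9vx04
#
# A typical run that builds the bag, then resolves and validates it:
#
#   python bagofbags.py -m minids.txt -b my_bag_of_bags -v \
#       -n "Jane Researcher" -o 0000-0002-1825-0097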
def generate_ro_manifest(bag_path, overwrite=False, config_file=DEFAULT_CONFIG_FILE):
    bag = bdbagit.BDBag(bag_path)
    bag_ro_metadata_path = os.path.abspath(
        os.path.join(bag_path, "metadata", "manifest.json"))
    exists = os.path.isfile(bag_ro_metadata_path)
    if exists and not overwrite:
        logger.info("Auto-generating RO manifest: update existing file.")
        ro_metadata = bdbro.read_bag_ro_metadata(bag_path)
    else:
        # Note: the conditional expression must be parenthesized; "%" binds
        # tighter than "if/else", so without parentheses the whole log message
        # would collapse to the bare string "overwrite existing file".
        logger.info("Auto-generating RO manifest: %s." %
                    ("creating new file" if not exists else "overwriting existing file"))
        ro_metadata = bdbro.init_ro_manifest(
            author_name=bag.info.get("Contact-Name"),
            author_orcid=bag.info.get("Contact-Orcid"),
            creator_name=bdbro.BAG_CREATOR_NAME,
            creator_uri=bdbro.BAG_CREATOR_URI)

    config = read_config(config_file)
    resolvers = config.get(ID_RESOLVER_TAG, DEFAULT_ID_RESOLVERS) if config else DEFAULT_ID_RESOLVERS
    fetched = bag.fetch_entries()
    local = bag.payload_files()

    for url, length, filename in fetched:
        # Expand compact Minid/ARK identifiers into resolvable HTTP URLs
        if url.startswith("minid:") or url.startswith("ark:"):
            url = "".join(["http://", resolvers[0], "/", url])
        bdbro.add_file_metadata(ro_metadata,
                                source_url=url,
                                bundled_as=bdbro.make_bundled_as(
                                    folder=os.path.dirname(filename),
                                    filename=os.path.basename(filename)),
                                update_existing=True)
    for path in local:
        bdbro.add_file_metadata(ro_metadata,
                                local_path=path.replace("\\", "/"),
                                bundled_as=bdbro.make_bundled_as(),
                                update_existing=True)

    bdbro.write_bag_ro_metadata(ro_metadata, bag_path)
    # Upgrade the declared profile to the RO variant, if applicable
    profile = bag.info.get(BAG_PROFILE_TAG)
    if profile == BDBAG_PROFILE_ID:
        bag.info.update({BAG_PROFILE_TAG: BDBAG_RO_PROFILE_ID})
    bag.save()
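
# Caller's-eye sketch (assumes an existing bag directory; the path is
# hypothetical). The first call materializes metadata/manifest.json from the
# bag's fetch entries and payload files; a later call with overwrite=False
# merges into the existing manifest rather than regenerating it. As a side
# effect, a bag declaring the plain BDBag profile has its profile identifier
# upgraded to the RO profile.
#
#   generate_ro_manifest("/path/to/my-bag")                   # create
#   generate_ro_manifest("/path/to/my-bag", overwrite=False)  # update/merge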
def download(self, **kwargs):
    if not self.config:
        raise DerivaDownloadConfigurationError("No configuration specified!")

    if self.config.get("catalog") is None:
        raise DerivaDownloadConfigurationError("Catalog configuration error!")

    ro_manifest = None
    ro_author_name = None
    ro_author_orcid = None
    remote_file_manifest = os.path.abspath(
        ''.join([os.path.join(self.output_dir, 'remote-file-manifest_'),
                 str(uuid.uuid4()), ".json"]))

    catalog_config = self.config['catalog']
    self.envars.update(self.config.get('env', dict()))
    self.envars.update({"hostname": self.hostname})

    # 1. If we don't have a client identity, we need to authenticate
    identity = kwargs.get("identity")
    if not identity:
        try:
            if not self.credentials:
                self.set_credentials(get_credential(self.hostname))
            logging.info("Validating credentials for host: %s" % self.hostname)
            attributes = self.catalog.get_authn_session().json()
            identity = attributes["client"]
        except HTTPError as he:
            if he.response.status_code == 404:
                logging.info("No existing login session found for host: %s" % self.hostname)
        except Exception as e:
            raise DerivaDownloadAuthenticationError(
                "Unable to validate credentials: %s" % format_exception(e))
    wallet = kwargs.get("wallet", {})

    # 2. Check for bagging config and initialize bag related variables
    bag_path = None
    bag_archiver = None
    bag_algorithms = None
    bag_config = self.config.get('bag')
    create_bag = True if bag_config else False
    if create_bag:
        bag_name = bag_config.get(
            'bag_name',
            ''.join(["deriva_bag", '_', time.strftime("%Y-%m-%d_%H.%M.%S")])).format(**self.envars)
        bag_path = os.path.abspath(os.path.join(self.output_dir, bag_name))
        bag_archiver = bag_config.get('bag_archiver')
        bag_algorithms = bag_config.get('bag_algorithms', ['sha256'])
        bag_metadata = bag_config.get(
            'bag_metadata', {"Internal-Sender-Identifier": "deriva@%s" % self.server_url})
        bag_ro = create_bag and stob(bag_config.get('bag_ro', "True"))
    if create_bag:
        bdb.ensure_bag_path_exists(bag_path)
        bag = bdb.make_bag(bag_path, algs=bag_algorithms, metadata=bag_metadata)
        if bag_ro:
            ro_author_name = bag.info.get(
                "Contact-Name",
                None if not identity else identity.get(
                    'full_name', identity.get('display_name', identity.get('id', None))))
            ro_author_orcid = bag.info.get("Contact-Orcid")
            ro_manifest = ro.init_ro_manifest(
                author_name=ro_author_name, author_orcid=ro_author_orcid)
            bag_metadata.update({BAG_PROFILE_TAG: BDBAG_RO_PROFILE_ID})

    # 3. Process the set of queries by locating, instantiating, and invoking
    #    the specified processor(s)
    outputs = dict()
    base_path = bag_path if bag_path else self.output_dir
    for processor in catalog_config['query_processors']:
        processor_name = processor["processor"]
        processor_type = processor.get('processor_type')
        processor_params = processor.get('processor_params')
        try:
            query_processor = find_query_processor(processor_name, processor_type)
            processor = query_processor(
                self.envars,
                inputs=outputs,
                bag=create_bag,
                catalog=self.catalog,
                store=self.store,
                base_path=base_path,
                processor_params=processor_params,
                remote_file_manifest=remote_file_manifest,
                ro_manifest=ro_manifest,
                ro_author_name=ro_author_name,
                ro_author_orcid=ro_author_orcid,
                identity=identity,
                wallet=wallet)
            outputs = processor.process()
        except Exception as e:
            logging.error(format_exception(e))
            if create_bag:
                bdb.cleanup_bag(bag_path)
            raise
    # 4. Execute anything in the transform processing pipeline, if configured
    transform_processors = self.config.get('transform_processors', [])
    if transform_processors:
        for processor in transform_processors:
            processor_name = processor["processor"]
            processor_type = processor.get('processor_type')
            processor_params = processor.get('processor_params')
            try:
                transform_processor = find_transform_processor(processor_name, processor_type)
                processor = transform_processor(
                    self.envars,
                    inputs=outputs,
                    processor_params=processor_params,
                    base_path=base_path,
                    bag=create_bag,
                    ro_manifest=ro_manifest,
                    ro_author_name=ro_author_name,
                    ro_author_orcid=ro_author_orcid,
                    identity=identity,
                    wallet=wallet)
                outputs = processor.process()
            except Exception as e:
                logging.error(format_exception(e))
                raise

    # 5. Create the bag, and archive (serialize) if necessary
    if create_bag:
        try:
            if ro_manifest:
                ro.write_bag_ro_metadata(ro_manifest, bag_path)
            if not os.path.isfile(remote_file_manifest):
                remote_file_manifest = None
            bdb.make_bag(bag_path,
                         algs=bag_algorithms,
                         remote_file_manifest=remote_file_manifest if
                         (remote_file_manifest and os.path.getsize(remote_file_manifest) > 0) else None,
                         update=True)
        except Exception as e:
            logging.fatal("Exception while updating bag manifests: %s" % format_exception(e))
            bdb.cleanup_bag(bag_path)
            raise
        finally:
            if remote_file_manifest and os.path.isfile(remote_file_manifest):
                os.remove(remote_file_manifest)

        logging.info('Created bag: %s' % bag_path)

        if bag_archiver is not None:
            try:
                archive = bdb.archive_bag(bag_path, bag_archiver.lower())
                bdb.cleanup_bag(bag_path)
                outputs = {os.path.basename(archive): {LOCAL_PATH_KEY: archive}}
            except Exception as e:
                logging.error("Exception while creating data bag archive: %s" %
                              format_exception(e))
                raise
        else:
            outputs = {os.path.basename(bag_path): {LOCAL_PATH_KEY: bag_path}}

    # 6. Execute anything in the post processing pipeline, if configured
    post_processors = self.config.get('post_processors', [])
    if post_processors:
        for processor in post_processors:
            processor_name = processor["processor"]
            processor_type = processor.get('processor_type')
            processor_params = processor.get('processor_params')
            try:
                post_processor = find_post_processor(processor_name, processor_type)
                processor = post_processor(
                    self.envars,
                    inputs=outputs,
                    processor_params=processor_params,
                    identity=identity,
                    wallet=wallet)
                outputs = processor.process()
            except Exception as e:
                logging.error(format_exception(e))
                raise

    return outputs
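
# Hypothetical driver for the method above (the class name, constructor
# signature, and config file name are assumptions; only download() appears in
# the snippet). The returned dict maps each produced artifact -- a bag
# archive, bag directory, or plain file -- to metadata such as its local path.
#
#   downloader = DerivaDownload("www.example.org",
#                               output_dir="/tmp/downloads",
#                               config_file="download-config.json")
#   outputs = downloader.download()
#   for name, meta in outputs.items():
#       print(name, meta.get(LOCAL_PATH_KEY))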
def download(self, identity=None):
    if not self.config:
        raise RuntimeError("No configuration specified!")

    if self.config.get("catalog") is None:
        raise RuntimeError("Catalog configuration error!")

    if not identity:
        logging.info("Validating credentials")
        try:
            if not self.credentials:
                self.setCredentials(get_credential(self.hostname))
            attributes = self.catalog.get_authn_session().json()
            identity = attributes["client"]
        except Exception as e:
            raise RuntimeError("Unable to validate credentials: %s" % format_exception(e))

    ro_manifest = None
    ro_author_name = None
    ro_author_orcid = None
    remote_file_manifest = os.path.abspath(
        ''.join([os.path.join(self.output_dir, 'remote-file-manifest_'),
                 str(uuid.uuid4()), ".json"]))

    catalog_config = self.config['catalog']
    self.envars.update(self.config.get('env', dict()))

    bag_path = None
    bag_archiver = None
    bag_algorithms = None
    bag_config = self.config.get('bag')
    create_bag = True if bag_config else False
    if create_bag:
        bag_name = bag_config.get(
            'bag_name', ''.join(["deriva_bag", '_', time.strftime("%Y-%m-%d_%H.%M.%S")]))
        bag_path = os.path.abspath(os.path.join(self.output_dir, bag_name))
        bag_archiver = bag_config.get('bag_archiver')
        bag_algorithms = bag_config.get('bag_algorithms', ['sha256'])
        bag_metadata = bag_config.get(
            'bag_metadata', {"Internal-Sender-Identifier": "deriva@%s" % self.server_url})
        bag_ro = create_bag and stob(bag_config.get('bag_ro', "True"))
    if create_bag:
        bdb.ensure_bag_path_exists(bag_path)
        bag = bdb.make_bag(bag_path, algs=bag_algorithms, metadata=bag_metadata)
        if bag_ro:
            ro_author_name = bag.info.get(
                "Contact-Name",
                identity.get('full_name',
                             identity.get('display_name', identity.get('id', None))))
            ro_author_orcid = bag.info.get("Contact-Orcid")
            ro_manifest = ro.init_ro_manifest(
                author_name=ro_author_name, author_orcid=ro_author_orcid)
            bag_metadata.update({BAG_PROFILE_TAG: BDBAG_RO_PROFILE_ID})

    file_list = list()
    base_path = bag_path if bag_path else self.output_dir
    for query in catalog_config['queries']:
        query_path = query['query_path']
        output_format = query['output_format']
        output_processor = query.get("output_format_processor")
        format_args = query.get('output_format_params', None)
        output_path = query.get('output_path', '')
        try:
            download_processor = findProcessor(output_format, output_processor)
            processor = download_processor(
                self.envars,
                bag=create_bag,
                catalog=self.catalog,
                store=self.store,
                query=query_path,
                base_path=base_path,
                sub_path=output_path,
                format_args=format_args,
                remote_file_manifest=remote_file_manifest,
                ro_manifest=ro_manifest,
                ro_author_name=ro_author_name,
                ro_author_orcid=ro_author_orcid)
            file_list.extend(processor.process())
        except Exception as e:
            logging.error(format_exception(e))
            if create_bag:
                bdb.cleanup_bag(bag_path)
            raise

    if create_bag:
        try:
            if ro_manifest:
                ro.write_bag_ro_metadata(ro_manifest, bag_path)
            if not os.path.isfile(remote_file_manifest):
                remote_file_manifest = None
            bdb.make_bag(bag_path,
                         algs=bag_algorithms,
                         remote_file_manifest=remote_file_manifest,
                         update=True)
        except Exception as e:
            logging.fatal("Exception while updating bag manifests: %s", format_exception(e))
            bdb.cleanup_bag(bag_path)
            raise
        finally:
            if remote_file_manifest and os.path.isfile(remote_file_manifest):
                os.remove(remote_file_manifest)

        logging.info('Created bag: %s' % bag_path)

        if bag_archiver is not None:
            try:
                archive = bdb.archive_bag(bag_path, bag_archiver.lower())
                bdb.cleanup_bag(bag_path)
                return [archive]
            except Exception as e:
                logging.error("Exception while creating data bag archive: %s",
                              format_exception(e))
                raise
        else:
            return [bag_path]

    return file_list
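
# Return-shape sketch for this older variant (caller and class names are
# hypothetical): unlike the processor-based download() above, which returns a
# dict of outputs, this path returns a plain list -- [archive] when
# bag_archiver is configured, [bag_path] for an unarchived bag, or file_list
# when no bag was requested.
#
#   results = downloader.download()
#   for item in results:
#       print(item)  # archive path, bag path, or individual file paths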