def create_bag_from_metadata_file(metadata_file_path, remote_file_manifest=None, working_dir=None, output_name=None,
                                  output_path=None, archive_format=None, creator_name=None, creator_orcid=None,
                                  create_ro_manifest=False):
    """Create a bag from a TSV metadata file, optionally archiving it.

    Args:
        metadata_file_path: path to the TSV metadata file to convert; a copy is placed in the bag.
        remote_file_manifest: path for the generated remote-file-manifest JSON. If None, one is
            created under ``working_dir`` (a temp dir is made, and cleaned up, when that is None too).
        working_dir: directory for intermediate files when no manifest path is given.
        output_name / output_path: passed to ``get_target_bag_path`` to determine the bag location.
        archive_format: if set, the finished bag is serialized with ``bdb.archive_bag``.
        creator_name / creator_orcid: recorded as Contact-Name / Contact-Orcid bag metadata.
        create_ro_manifest: when True, also write an RO ``metadata/manifest.json`` and re-bag.

    Returns:
        Path to the created bag directory, or to the archive file when ``archive_format`` is set.
    """
    temp_path = None
    try:
        if remote_file_manifest is None:
            if working_dir is None:
                working_dir = temp_path = tempfile.mkdtemp(prefix="encode2bag_")
            remote_file_manifest = osp.abspath(osp.join(working_dir, "remote-file-manifest.json"))
        ro_manifest = None
        if create_ro_manifest:
            ro_manifest = init_ro_manifest(creator_name=creator_name, creator_orcid=creator_orcid)
        convert_tsv_metadata_to_remote_file_manifest(metadata_file_path, remote_file_manifest, ro_manifest)
        bag_path = get_target_bag_path(output_name=output_name, output_path=output_path)
        ensure_bag_path_exists(bag_path)
        # Keep a copy of the source metadata file inside the bag payload.
        shutil.copy(osp.abspath(metadata_file_path), bag_path)
        bag_metadata = dict()
        if creator_name:
            bag_metadata["Contact-Name"] = creator_name
        if creator_orcid:
            bag_metadata["Contact-Orcid"] = creator_orcid
        bdb.make_bag(bag_path, algs=["md5", "sha256"], metadata=bag_metadata,
                     remote_file_manifest=remote_file_manifest)
        if create_ro_manifest:
            # Write the RO manifest into the bag's tag directory, then update the bag
            # so the new tag file is covered by the tag manifests.
            bag_metadata_dir = osp.abspath(osp.join(bag_path, "metadata"))
            if not osp.exists(bag_metadata_dir):
                os.mkdir(bag_metadata_dir)
            ro_manifest_path = osp.join(bag_metadata_dir, "manifest.json")
            ro.write_ro_manifest(ro_manifest, ro_manifest_path)
            bag_metadata.update({'BagIt-Profile-Identifier':
                                 "http://raw.githubusercontent.com/ini-bdds/bdbag/master/profiles/bdbag-ro-profile.json"})
            bdb.make_bag(bag_path, update=True, metadata=bag_metadata)
        if archive_format:
            bag_path = bdb.archive_bag(bag_path, archive_format)
        return bag_path
    finally:
        # Always remove the temp working dir, even on failure (previously leaked on error).
        if temp_path:
            shutil.rmtree(temp_path, ignore_errors=True)
def test_create_bag(self):
    """Creating a bag from the test data dir yields a bagit.Bag instance."""
    logger.info(self.getTestHeader('create bag'))
    try:
        created = bdb.make_bag(self.test_data_dir)
        self.assertIsInstance(created, bagit.Bag)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_update_bag_remove_file(self):
    """Updating after deleting a payload file must drop it from the manifests."""
    logger.info(self.getTestHeader('update bag remove file'))
    try:
        removed = ospj(self.test_bag_dir, 'data', 'test1', 'test1.txt')
        os.remove(removed)
        updated = bdb.make_bag(self.test_bag_dir, update=True)
        log_text = self.stream.getvalue()
        self.assertIsInstance(updated, bagit.Bag)
        self.assertUnexpectedMessages(['test1.txt'], log_text)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_update_bag_remove_file(self):
    """Updating after deleting a payload file must drop it from the manifests."""
    logger.info(self.getTestHeader('update bag remove file'))
    try:
        victim = ospj(self.test_bag_dir, 'data', 'test1', 'test1.txt')
        os.remove(victim)
        updated = bdb.make_bag(self.test_bag_dir, update=True)
        log_text = self.stream.getvalue()
        self.assertIsInstance(updated, bdbagit.BDBag)
        self.assertUnexpectedMessages(['test1.txt'], log_text)
    except Exception as e:
        self.fail(get_typed_exception(e))
def test_update_bag_change_file(self):
    """Modifying a payload file and updating should re-checksum that file."""
    logger.info(self.getTestHeader('update bag change file'))
    try:
        readme = ospj(self.test_bag_dir, 'data', 'README.txt')
        with open(readme, 'a') as f:
            f.writelines('Additional data added via unit test.')
        updated = bdb.make_bag(self.test_bag_dir, update=True)
        log_text = self.stream.getvalue()
        self.assertIsInstance(updated, bagit.Bag)
        self.assertExpectedMessages(['README.txt'], log_text)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_update_bag_add_file(self):
    """Adding a new payload file and updating should pick it up in the manifests."""
    logger.info(self.getTestHeader('update bag add file'))
    try:
        new_file = ospj(self.test_bag_dir, 'data', 'NEWFILE.txt')
        with open(new_file, 'w') as nf:
            nf.write('Additional file added via unit test.')
        updated = bdb.make_bag(self.test_bag_dir, update=True)
        log_text = self.stream.getvalue()
        self.assertIsInstance(updated, bdbagit.BDBag)
        self.assertExpectedMessages(['NEWFILE.txt'], log_text)
    except Exception as e:
        self.fail(get_typed_exception(e))
def test_update_bag_change_metadata(self):
    """Metadata can be supplied both inline and from a metadata file on update."""
    logger.info(self.getTestHeader('update bag change metadata'))
    try:
        md_file = ospj(self.test_config_dir, 'test-metadata.json')
        updated = bdb.make_bag(self.test_bag_dir,
                               update=True,
                               metadata={"Contact-Name": "nobody"},
                               metadata_file=md_file)
        log_text = self.stream.getvalue()
        self.assertIsInstance(updated, bagit.Bag)
        self.assertExpectedMessages(['Reading bag metadata from file', 'test-metadata.json'], log_text)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_generate_ro_manifest_update(self):
    """Regenerating the RO manifest without overwrite preserves prior bundledAs URIs."""
    logger.info(self.getTestHeader('create bag with auto-generation of RO manifest in update mode'))
    try:
        bdb.make_bag(self.test_data_dir,
                     algs=['md5', 'sha1', 'sha256', 'sha512'],
                     remote_file_manifest=ospj(self.test_config_dir, 'test-fetch-manifest.json'))
        bdb.generate_ro_manifest(self.test_data_dir, overwrite=True)
        ro = bdbro.read_bag_ro_metadata(self.test_data_dir)
        # Snapshot the aggregates keyed by URI before adding a fake entry.
        previous = {agg["uri"]: agg for agg in ro.get("aggregates", [])}
        bdbro.add_file_metadata(ro, local_path="../data/FAKE.txt", bundled_as=bdbro.make_bundled_as())
        bdbro.write_bag_ro_metadata(ro, self.test_data_dir)
        bdb.generate_ro_manifest(self.test_data_dir, overwrite=False)
        ro = bdbro.read_bag_ro_metadata(self.test_data_dir)
        for agg in ro.get("aggregates", []):
            if agg["uri"] in previous:
                self.assertTrue(agg["bundledAs"]["uri"] == previous[agg["uri"]]["bundledAs"]["uri"])
    except Exception as e:
        self.fail(get_typed_exception(e))
def test_generate_ro_manifest_overwrite(self):
    """Overwrite-mode RO generation must reproduce the expected aggregate entries."""
    logger.info(self.getTestHeader('create bag with auto-generation of RO manifest in overwrite mode'))
    try:
        bdb.make_bag(self.test_data_dir,
                     algs=['md5', 'sha1', 'sha256', 'sha512'],
                     remote_file_manifest=ospj(self.test_config_dir, 'test-fetch-manifest.json'))
        bdb.generate_ro_manifest(self.test_data_dir, overwrite=True)
        ro = bdbro.read_bag_ro_metadata(self.test_data_dir)
        by_uri = {agg["uri"]: agg for agg in ro.get("aggregates", [])}
        for expected in self.ro_test_aggregates:
            self.assertTrue(expected["uri"] in by_uri)
            actual = by_uri[expected["uri"]]
            bundled_as = actual.get("bundledAs")
            if not bundled_as:
                continue
            if "filename" in bundled_as:
                self.assertTrue(expected["bundledAs"]["filename"] == bundled_as["filename"])
            if "folder" in bundled_as:
                self.assertTrue(expected["bundledAs"]["folder"] == bundled_as["folder"])
    except Exception as e:
        self.fail(get_typed_exception(e))
def test_update_bag_change_metadata_nested_dict(self):
    """Nested dict metadata is rejected for bag-info.txt with a warning message."""
    logger.info(self.getTestHeader('update bag change metadata with nested dict'))
    try:
        md_file = ospj(self.test_config_dir, 'test-ro-metadata.json')
        updated = bdb.make_bag(self.test_bag_dir,
                               update=True,
                               save_manifests=False,
                               metadata_file=md_file)
        log_text = self.stream.getvalue()
        self.assertIsInstance(updated, bdbagit.BDBag)
        self.assertExpectedMessages(['Reading bag metadata from file', 'test-ro-metadata.json'], log_text)
        self.assertExpectedMessages(["Nested dictionary content not supported in tag file: [bag-info.txt]"],
                                    log_text)
    except Exception as e:
        self.fail(get_typed_exception(e))
def test_update_bag_prune(self):
    """Pruning with only md5 requested must delete every other manifest variant."""
    logger.info(self.getTestHeader('update bag prune manifests'))
    try:
        updated = bdb.make_bag(self.test_bag_dir, algs=['md5'], update=True, prune_manifests=True)
        self.assertIsInstance(updated, bagit.Bag)
        for alg in ('sha1', 'sha256', 'sha512'):
            self.assertFalse(ospif(ospj(self.test_bag_dir, 'manifest-%s.txt' % alg)))
            self.assertFalse(ospif(ospj(self.test_bag_dir, 'tagmanifest-%s.txt' % alg)))
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_update_bag_prune(self):
    """Pruning with only md5 requested must delete every other manifest variant."""
    logger.info(self.getTestHeader('update bag prune manifests'))
    try:
        updated = bdb.make_bag(self.test_bag_dir, algs=['md5'], update=True, prune_manifests=True)
        self.assertIsInstance(updated, bdbagit.BDBag)
        for alg in ('sha1', 'sha256', 'sha512'):
            self.assertFalse(ospif(ospj(self.test_bag_dir, 'manifest-%s.txt' % alg)))
            self.assertFalse(ospif(ospj(self.test_bag_dir, 'tagmanifest-%s.txt' % alg)))
    except Exception as e:
        self.fail(get_typed_exception(e))
def _test_create_or_update_bag_with_metadata(self, update=False, override_file_metadata=False, no_file_metadata=False):
    """Shared driver for create/update-with-metadata scenarios.

    Exercises combinations of inline metadata overrides, metadata files, and
    RO metadata, then checks bag-info.txt and metadata/manifest.json contents.
    """
    try:
        # Inline overrides are only supplied when override_file_metadata is set.
        if override_file_metadata:
            md_override = {"Contact-Name": "nobody"}
            ro_override = {"manifest.json": {"@context": ["https://w3id.org/bundle/context"], "@id": "../"}}
        else:
            md_override = None
            ro_override = None
        target_dir = self.test_bag_dir if update else self.test_data_dir
        md_file = None if no_file_metadata else ospj(self.test_config_dir, 'test-metadata.json')
        ro_file = None if no_file_metadata else ospj(self.test_config_dir, 'test-ro-metadata.json')
        result = bdb.make_bag(target_dir,
                              update=update,
                              metadata=md_override,
                              metadata_file=md_file,
                              ro_metadata=ro_override,
                              ro_metadata_file=ro_file)
        log_text = self.stream.getvalue()
        self.assertIsInstance(result, bdbagit.BDBag)
        info_lines = self.slurp_text_file(ospj(target_dir, 'bag-info.txt')).splitlines()
        if override_file_metadata:
            self.assertIn('Contact-Name: nobody', info_lines)
        if not no_file_metadata:
            self.assertExpectedMessages(['Reading bag metadata from file', 'test-metadata.json'], log_text)
            self.assertExpectedMessages(['Reading bag metadata from file', 'test-ro-metadata.json'], log_text)
            self.assertIn('External-Description: Simple bdbag test', info_lines)
            ro_manifest_file = ospj(target_dir, 'metadata', 'manifest.json')
            self.assertTrue(os.path.isfile(ro_manifest_file))
            ro_manifest_txt = self.slurp_text_file(ro_manifest_file)
            ro_test_line = '"uri": "../data/test2/test2.txt"'
            # Inline RO overrides replace the file-derived aggregates entirely.
            if override_file_metadata:
                self.assertNotIn(ro_test_line, ro_manifest_txt)
            else:
                self.assertIn(ro_test_line, ro_manifest_txt)
    except Exception as e:
        self.fail(get_typed_exception(e))
def test_update_bag_change_metadata(self):
    """Metadata can be supplied both inline and from a metadata file on update."""
    logger.info(self.getTestHeader('update bag change metadata'))
    try:
        updated = bdb.make_bag(self.test_bag_dir,
                               update=True,
                               metadata={"Contact-Name": "nobody"},
                               metadata_file=ospj(self.test_config_dir, 'test-metadata.json'))
        log_text = self.stream.getvalue()
        self.assertIsInstance(updated, bagit.Bag)
        self.assertExpectedMessages(['Reading bag metadata from file', 'test-metadata.json'], log_text)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def create_bag_archive(metadata, bag_algorithms=('md5', 'sha256'), **bag_metadata):
    """Build a bag in the staging dir from a remote-file manifest and archive it.

    Args:
        metadata: input passed to ``_format_remote_file_manifest`` to produce the
            remote-file-manifest entries.
        bag_algorithms: checksum algorithms for the bag manifests.
        **bag_metadata: arbitrary key/value pairs stored as bag-info metadata.

    Returns:
        Path to the serialized bag archive file.
    """
    bag_name = join(settings.BAG_STAGING_DIR, str(uuid.uuid4()))
    remote_manifest_filename = join(settings.BAG_STAGING_DIR, str(uuid.uuid4()))
    remote_manifest_formatted = _format_remote_file_manifest(metadata, bag_algorithms)
    with open(remote_manifest_filename, 'w') as f:
        json.dump(remote_manifest_formatted, f)
    try:
        os.mkdir(bag_name)
        bdbag_api.make_bag(
            bag_name,
            algs=bag_algorithms,
            metadata=dict(bag_metadata),
            remote_file_manifest=remote_manifest_filename,
        )
        bdbag_api.archive_bag(bag_name, settings.BAG_ARCHIVE_FORMAT)
    finally:
        # Previously the temp manifest leaked if bagging/archiving raised;
        # always remove it.
        os.remove(remote_manifest_filename)
    return '{}.{}'.format(bag_name, settings.BAG_ARCHIVE_FORMAT)
def test_update_bag_change_metadata_only(self):
    """Metadata-only update must not rewrite any payload manifests."""
    logger.info(self.getTestHeader('update bag change metadata only - do not save manifests'))
    try:
        updated = bdb.make_bag(self.test_bag_dir,
                               update=True,
                               save_manifests=False,
                               metadata={"Contact-Name": "nobody"},
                               metadata_file=ospj(self.test_config_dir, 'test-metadata.json'))
        log_text = self.stream.getvalue()
        self.assertIsInstance(updated, bagit.Bag)
        self.assertExpectedMessages(['Reading bag metadata from file', 'test-metadata.json'], log_text)
        self.assertUnexpectedMessages(['updating manifest-sha1.txt',
                                       'updating manifest-sha256.txt',
                                       'updating manifest-sha512.txt',
                                       'updating manifest-md5.txt'], log_text)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_create_bag_with_config(self):
    """A config file can restrict checksum algorithms and set default metadata."""
    logger.info(self.getTestHeader('create bag with config'))
    try:
        created = bdb.make_bag(self.test_data_dir,
                               config_file=ospj(self.test_config_dir, 'test-config.json'))
        self.assertIsInstance(created, bagit.Bag)
        for alg in ('sha1', 'sha256', 'sha512'):
            self.assertFalse(ospif(ospj(self.test_data_dir, 'manifest-%s.txt' % alg)))
            self.assertFalse(ospif(ospj(self.test_data_dir, 'tagmanifest-%s.txt' % alg)))
        with open(ospj(self.test_data_dir, 'bag-info.txt')) as bi:
            baginfo_txt = bi.read()
        self.assertIn('Contact-Name: bdbag test', baginfo_txt)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_create_bag_with_config(self):
    """A config file can restrict checksum algorithms and set default metadata."""
    logger.info(self.getTestHeader('create bag with config'))
    try:
        created = bdb.make_bag(self.test_data_dir,
                               config_file=ospj(self.test_config_dir, 'test-config.json'))
        self.assertIsInstance(created, bdbagit.BDBag)
        for alg in ('sha1', 'sha256', 'sha512'):
            self.assertFalse(ospif(ospj(self.test_data_dir, 'manifest-%s.txt' % alg)))
            self.assertFalse(ospif(ospj(self.test_data_dir, 'tagmanifest-%s.txt' % alg)))
        with open(ospj(self.test_data_dir, 'bag-info.txt')) as bi:
            baginfo_txt = bi.read()
        self.assertIn('Contact-Name: bdbag test', baginfo_txt)
    except Exception as e:
        self.fail(get_typed_exception(e))
def test_update_bag_remote(self):
    """A remote-file manifest on update must produce the expected fetch.txt entries."""
    logger.info(self.getTestHeader('update bag add remote file manifest'))
    try:
        manifest = ospj(self.test_config_dir, 'test-fetch-manifest.json')
        updated = bdb.make_bag(self.test_bag_dir, update=True, remote_file_manifest=manifest)
        log_text = self.stream.getvalue()
        self.assertIsInstance(updated, bagit.Bag)
        self.assertExpectedMessages(['Generating remote file references from', 'test-fetch-manifest.json'],
                                    log_text)
        fetch_file = ospj(self.test_bag_dir, 'fetch.txt')
        self.assertTrue(ospif(fetch_file))
        with open(fetch_file) as ff:
            fetch_txt = ff.read()
        self.assertIn(
            'https://raw.githubusercontent.com/ini-bdds/bdbag/master/profiles/bdbag-profile.json'
            '\t723\tdata/bdbag-profile.json', fetch_txt)
        self.assertIn(
            'ark:/88120/r8059v\t632860\tdata/minid_v0.1_Nov_2015.pdf', fetch_txt)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_update_bag_change_metadata_only(self):
    """Metadata-only update must not rewrite any payload manifests."""
    logger.info(self.getTestHeader('update bag change metadata only - do not save manifests'))
    try:
        updated = bdb.make_bag(self.test_bag_dir,
                               update=True,
                               save_manifests=False,
                               metadata={"Contact-Name": "nobody"},
                               metadata_file=ospj(self.test_config_dir, 'test-metadata.json'))
        log_text = self.stream.getvalue()
        self.assertIsInstance(updated, bagit.Bag)
        self.assertExpectedMessages(['Reading bag metadata from file', 'test-metadata.json'], log_text)
        self.assertUnexpectedMessages(['updating manifest-sha1.txt',
                                       'updating manifest-sha256.txt',
                                       'updating manifest-sha512.txt',
                                       'updating manifest-md5.txt'], log_text)
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_update_bag_change_metadata_only(self):
    """Metadata + RO metadata update without saving manifests: bag-info and RO manifest written."""
    logger.info(self.getTestHeader('update bag change metadata only - do not save manifests'))
    try:
        updated = bdb.make_bag(self.test_bag_dir,
                               update=True,
                               save_manifests=False,
                               metadata={"Contact-Name": "nobody"},
                               metadata_file=ospj(self.test_config_dir, 'test-metadata.json'),
                               ro_metadata_file=ospj(self.test_config_dir, 'test-ro-metadata.json'))
        log_text = self.stream.getvalue()
        self.assertIsInstance(updated, bdbagit.BDBag)
        self.assertExpectedMessages(['Reading bag metadata from file', 'test-metadata.json'], log_text)
        info_lines = self.slurp_text_file(ospj(self.test_bag_dir, 'bag-info.txt')).splitlines()
        self.assertIn('Contact-Name: nobody', info_lines)
        self.assertIn('External-Description: Simple bdbag test', info_lines)
        self.assertTrue(os.path.isfile(ospj(self.test_bag_dir, 'metadata', 'manifest.json')))
        self.assertUnexpectedMessages(['updating manifest-sha1.txt',
                                       'updating manifest-sha256.txt',
                                       'updating manifest-sha512.txt',
                                       'updating manifest-md5.txt'], log_text)
    except Exception as e:
        self.fail(get_typed_exception(e))
def create_bag(output_dir, update):
    """Create/Update and archive a BDBag from the contents of a passed-in directory."""
    bdbag_api.make_bag(output_dir, update=update)
    archive_path = bdbag_api.archive_bag(output_dir, "zip")
    return archive_path
def download(self, **kwargs):
    """Execute the configured download pipeline.

    Flow: validate config -> authenticate (unless an identity is passed in) ->
    optionally initialize a bag -> run query processors -> run transform
    processors -> finalize/archive the bag -> run post processors.

    Keyword args used: ``identity`` (pre-validated client identity) and
    ``wallet`` (credential wallet passed through to processors).
    Returns the ``outputs`` dict produced by the last processing stage.
    """
    if not self.config:
        raise DerivaDownloadConfigurationError("No configuration specified!")
    if self.config.get("catalog") is None:
        raise DerivaDownloadConfigurationError("Catalog configuration error!")
    ro_manifest = None
    ro_author_name = None
    ro_author_orcid = None
    # Unique temp path for the remote-file manifest accumulated by processors.
    remote_file_manifest = os.path.abspath(
        ''.join([os.path.join(self.output_dir, 'remote-file-manifest_'), str(uuid.uuid4()), ".json"]))
    catalog_config = self.config['catalog']
    self.envars.update(self.config.get('env', dict()))
    self.envars.update({"hostname": self.hostname})

    # 1. If we don't have a client identity, we need to authenticate
    identity = kwargs.get("identity")
    if not identity:
        try:
            if not self.credentials:
                self.set_credentials(get_credential(self.hostname))
            logging.info("Validating credentials for host: %s" % self.hostname)
            attributes = self.catalog.get_authn_session().json()
            identity = attributes["client"]
        except HTTPError as he:
            # 404 just means no session; anonymous download may still proceed.
            if he.response.status_code == 404:
                logging.info("No existing login session found for host: %s" % self.hostname)
        except Exception as e:
            raise DerivaDownloadAuthenticationError(
                "Unable to validate credentials: %s" % format_exception(e))
    wallet = kwargs.get("wallet", {})

    # 2. Check for bagging config and initialize bag related variables
    bag_path = None
    bag_archiver = None
    bag_algorithms = None
    bag_config = self.config.get('bag')
    create_bag = True if bag_config else False
    if create_bag:
        bag_name = bag_config.get(
            'bag_name',
            ''.join(["deriva_bag", '_', time.strftime("%Y-%m-%d_%H.%M.%S")])).format(**self.envars)
        bag_path = os.path.abspath(os.path.join(self.output_dir, bag_name))
        bag_archiver = bag_config.get('bag_archiver')
        bag_algorithms = bag_config.get('bag_algorithms', ['sha256'])
        bag_metadata = bag_config.get(
            'bag_metadata', {"Internal-Sender-Identifier": "deriva@%s" % self.server_url})
    # NOTE(review): the `create_bag and` guard lets this evaluate safely when
    # bag_config is None (short-circuit before bag_config.get is called).
    bag_ro = create_bag and stob(bag_config.get('bag_ro', "True"))
    if create_bag:
        bdb.ensure_bag_path_exists(bag_path)
        bag = bdb.make_bag(bag_path, algs=bag_algorithms, metadata=bag_metadata)
        if bag_ro:
            # Derive RO authorship from bag metadata, falling back to the
            # authenticated identity's name fields.
            ro_author_name = bag.info.get(
                "Contact-Name",
                None if not identity else identity.get(
                    'full_name', identity.get('display_name', identity.get('id', None))))
            ro_author_orcid = bag.info.get("Contact-Orcid")
            ro_manifest = ro.init_ro_manifest(author_name=ro_author_name, author_orcid=ro_author_orcid)
            bag_metadata.update({BAG_PROFILE_TAG: BDBAG_RO_PROFILE_ID})

    # 3. Process the set of queries by locating, instantiating, and invoking the specified processor(s)
    outputs = dict()
    base_path = bag_path if bag_path else self.output_dir
    for processor in catalog_config['query_processors']:
        processor_name = processor["processor"]
        processor_type = processor.get('processor_type')
        processor_params = processor.get('processor_params')
        try:
            query_processor = find_query_processor(processor_name, processor_type)
            processor = query_processor(self.envars,
                                        inputs=outputs,
                                        bag=create_bag,
                                        catalog=self.catalog,
                                        store=self.store,
                                        base_path=base_path,
                                        processor_params=processor_params,
                                        remote_file_manifest=remote_file_manifest,
                                        ro_manifest=ro_manifest,
                                        ro_author_name=ro_author_name,
                                        ro_author_orcid=ro_author_orcid,
                                        identity=identity,
                                        wallet=wallet)
            # Each processor receives the previous stage's outputs as inputs.
            outputs = processor.process()
        except Exception as e:
            logging.error(format_exception(e))
            # Clean up the partially built bag before propagating the error.
            if create_bag:
                bdb.cleanup_bag(bag_path)
            raise

    # 4. Execute anything in the transform processing pipeline, if configured
    transform_processors = self.config.get('transform_processors', [])
    if transform_processors:
        for processor in transform_processors:
            processor_name = processor["processor"]
            processor_type = processor.get('processor_type')
            processor_params = processor.get('processor_params')
            try:
                transform_processor = find_transform_processor(processor_name, processor_type)
                processor = transform_processor(self.envars,
                                                inputs=outputs,
                                                processor_params=processor_params,
                                                base_path=base_path,
                                                bag=create_bag,
                                                ro_manifest=ro_manifest,
                                                ro_author_name=ro_author_name,
                                                ro_author_orcid=ro_author_orcid,
                                                identity=identity,
                                                wallet=wallet)
                outputs = processor.process()
            except Exception as e:
                logging.error(format_exception(e))
                raise

    # 5. Create the bag, and archive (serialize) if necessary
    if create_bag:
        try:
            if ro_manifest:
                ro.write_bag_ro_metadata(ro_manifest, bag_path)
            if not os.path.isfile(remote_file_manifest):
                remote_file_manifest = None
            # Re-bag in update mode so processor-written payload and the remote
            # manifest (if non-empty) are reflected in the bag manifests.
            bdb.make_bag(bag_path,
                         algs=bag_algorithms,
                         remote_file_manifest=remote_file_manifest if
                         (remote_file_manifest and os.path.getsize(remote_file_manifest) > 0) else None,
                         update=True)
        except Exception as e:
            logging.fatal("Exception while updating bag manifests: %s" % format_exception(e))
            bdb.cleanup_bag(bag_path)
            raise
        finally:
            # The remote-file manifest is a temp artifact; always remove it.
            if remote_file_manifest and os.path.isfile(remote_file_manifest):
                os.remove(remote_file_manifest)
        logging.info('Created bag: %s' % bag_path)
        if bag_archiver is not None:
            try:
                archive = bdb.archive_bag(bag_path, bag_archiver.lower())
                # The directory form is replaced by the serialized archive.
                bdb.cleanup_bag(bag_path)
                outputs = {os.path.basename(archive): {LOCAL_PATH_KEY: archive}}
            except Exception as e:
                logging.error("Exception while creating data bag archive: %s" % format_exception(e))
                raise
        else:
            outputs = {os.path.basename(bag_path): {LOCAL_PATH_KEY: bag_path}}

    # 6. Execute anything in the post processing pipeline, if configured
    post_processors = self.config.get('post_processors', [])
    if post_processors:
        for processor in post_processors:
            processor_name = processor["processor"]
            processor_type = processor.get('processor_type')
            processor_params = processor.get('processor_params')
            try:
                post_processor = find_post_processor(processor_name, processor_type)
                processor = post_processor(self.envars,
                                           inputs=outputs,
                                           processor_params=processor_params,
                                           identity=identity,
                                           wallet=wallet)
                outputs = processor.process()
            except Exception as e:
                logging.error(format_exception(e))
                raise
    return outputs
def main():
    """CLI entry point: create/update, extract, fetch, validate, and/or archive a bag.

    Returns 0 on success, 1 on any error (the error is written to stderr).
    """
    sys.stderr.write('\n')
    args, is_bag, is_file = parse_cli()
    path = os.path.abspath(args.bag_path)
    archive = None
    temp_path = None
    error = None
    result = 0
    try:
        if not is_file:
            # do not try to create or update the bag if the user just wants to
            # validate or complete an existing bag
            if not ((args.validate or args.validate_profile or args.resolve_fetch) and
                    not (args.update and bdb.is_bag(path))):
                if args.checksum and 'all' in args.checksum:
                    args.checksum = ['md5', 'sha1', 'sha256', 'sha512']
                # create or update the bag depending on the input arguments
                bdb.make_bag(path,
                             args.checksum,
                             args.update,
                             args.skip_manifests,
                             args.prune_manifests,
                             BAG_METADATA if BAG_METADATA else None,
                             args.metadata_file,
                             args.remote_file_manifest,
                             args.config_file)
            # otherwise just extract the bag if it is an archive and no other
            # conflicting options specified
            elif not (args.validate or args.validate_profile or args.resolve_fetch):
                bdb.extract_bag(path)
                sys.stderr.write('\n')
                return result

        if args.resolve_fetch:
            if args.validate == 'full':
                sys.stderr.write(ASYNC_TRANSFER_VALIDATION_WARNING)
            bdb.resolve_fetch(path, True if args.resolve_fetch == 'all' else False)

        if args.validate:
            # Serialized bags must be extracted to a temp dir before validation.
            if is_file:
                temp_path = bdb.extract_bag(path, temp=True)
            bdb.validate_bag(temp_path if temp_path else path,
                             True if args.validate == 'fast' else False,
                             args.config_file)

        if args.archiver:
            archive = bdb.archive_bag(path, args.archiver)
        if archive is None and is_file:
            archive = path

        if args.validate_profile:
            if is_file:
                if not temp_path:
                    temp_path = bdb.extract_bag(path, temp=True)
            profile = bdb.validate_bag_profile(temp_path if temp_path else path)
            bdb.validate_bag_serialization(archive if archive else path, profile)
    except Exception as e:
        result = 1
        error = "Error: %s" % bdbag.get_named_exception(e)
    finally:
        # Remove any temp extraction dir and report a failure, if one occurred.
        if temp_path:
            bdb.cleanup_bag(os.path.dirname(temp_path))
        if result != 0:
            sys.stderr.write("\n%s" % error)
        sys.stderr.write('\n')
    return result
def main(argv):
    """Build a 'bag of bags': a BDBag whose remote entries are the supplied Minids.

    Parses CLI options, writes a README and remote-file manifest, creates the
    bag, attaches an RO manifest, and optionally fetches and validates the
    result when --verify is given.
    """
    import shutil  # local import: needed for temp-dir cleanup below

    parser = argparse.ArgumentParser(
        description='Program to create a BDBag containing a set of Minids for remote content')
    parser.add_argument('-m', '--minids', metavar='<minid file>',
                        help='File listing Minids for new bag', required=True)
    parser.add_argument('-b', '--bagname', metavar='<bag name>',
                        help='Name of directory for new bag.', required=True)
    parser.add_argument('-v', '--verify', action='store_true',
                        help='Validate bag after building it.', required=False)
    parser.add_argument('-q', '--quiet', action="store_true", help="Suppress logging output.")
    parser.add_argument('-d', '--debug', action="store_true", help="Enable debug logging output.")
    parser.add_argument(
        '-n', '--author-name', metavar="<person or entity name>",
        help="Optional name of the person or entity responsible for the creation of this bag, "
             "for inclusion in the bag metadata.")
    parser.add_argument(
        '-o', '--author-orcid', metavar="<orcid>",
        help="Optional ORCID identifier of the bag creator, for inclusion in the bag metadata.")
    args = parser.parse_args()

    bdb.configure_logging(level=logging.ERROR if args.quiet else
                          (logging.DEBUG if args.debug else logging.INFO))

    # Create the directory that will hold the new BDBag
    bdb.ensure_bag_path_exists(args.bagname)

    # For each supplied minid, fetch sub-bag to determine its properties
    minid_fields = extract_fields(args.minids)

    # Create 'README' file in the newly created bag directory.
    # (moved to 'data' when bag is created)
    write_readme(args.bagname, minid_fields)

    # Create remote_file_manifest_file, to be used by make_bag
    working_dir = temp_path = tempfile.mkdtemp(prefix='encode2bag_')
    try:
        remote_file_manifest_file = osp.abspath(osp.join(working_dir, 'remote-file-manifest.json'))
        generate_remote_manifest_file(minid_fields, remote_file_manifest_file)

        # Create the new bag based on the supplied remote manifest file
        bdb.make_bag(args.bagname,
                     algs=['md5', 'sha256'],
                     remote_file_manifest=remote_file_manifest_file)

        # Create metadata/manifest.json file with Research Object JSON object
        ro_manifest = ro.init_ro_manifest(
            author_name=args.author_name,
            author_orcid=args.author_orcid,
            creator_name='bagofbags using BDBag version: %s (Bagit version: %s)' % (VERSION, BAGIT_VERSION),
            creator_uri='https://github.com/fair-research/bdbag/examples/bagofbags/')
        add_remote_file_manifest_to_ro(ro_manifest, minid_fields)
        ro.write_bag_ro_metadata(ro_manifest, args.bagname, 'manifest.json')

        # Run make_bag again to include manifest.json in the checksums etc.
        bdb.make_bag(args.bagname, update=True)

        if args.verify:
            bdb.resolve_fetch(args.bagname, force=True)
            bdb.validate_bag(args.bagname, fast=False, callback=None)
    finally:
        # Previously the mkdtemp working dir was never removed (leak).
        shutil.rmtree(temp_path, ignore_errors=True)
def validate_user_submission(data_path, schema, output_dir=None, delete_dir=False,
                             handle_git_repos=True, bdbag_kwargs=None):
    """Normalize a submission into a BDBag archive and validate its TableSchema.

    Arguments:
        data_path (str): The path to the data to ingest into DERIVA. The path can be:
                1) A directory to be formatted into a BDBag
                2) A Git repository to be copied into a BDBag
                3) A premade BDBag directory
                4) A premade BDBag in an archive file
        schema (str): The named schema or schema file link to validate data against.
                Default None, to only validate against the declared TableSchema.
        output_dir (str): The path to create an output directory in. The resulting
                BDBag archive will be named after this directory.
                If not set, the directory will be turned into a BDBag in-place.
                For Git repositories, this is automatically set, but can be overridden.
                If data_path is a file, this has no effect.
                This dir MUST NOT be in the `data_path` directory or any subdirectories.
                Default None.
        delete_dir (bool): Should the output_dir be deleted after submission?
                Has no effect if output_dir is not specified.
                For Git repositories, this is always True.
                Default False.
        handle_git_repos (bool): Should Git repositories be detected and handled?
                When this is False, Git repositories are handled as simple directories
                instead of Git repositories.
                Default True.
        bdbag_kwargs (dict): Extra args to pass to bdbag

    Returns:
        str: path to the validated BDBag archive file.

    Raises:
        FileNotFoundError, FileExistsError, ValueError, and the project's
        ValidationException when TableSchema validation fails.
    """
    bdbag_kwargs = bdbag_kwargs or {}
    data_path = os.path.abspath(data_path)
    if not os.path.exists(data_path):
        raise FileNotFoundError("Path '{}' does not exist".format(data_path))

    if handle_git_repos:
        logger.debug("Checking for a Git repository")
        # If Git repo, set output_dir appropriately
        try:
            repo = git.Repo(data_path, search_parent_directories=True)
        # Not Git repo
        except git.InvalidGitRepositoryError:
            logger.debug("Not a Git repo")
        # Path not found, turn into standard FileNotFoundError
        except git.NoSuchPathError:
            raise FileNotFoundError("Path '{}' does not exist".format(data_path))
        # Is Git repo
        else:
            logger.debug("Git repo found, collecting metadata")
            # Needs to not have slash at end - is known Git repo already, slash
            # interferes with os.path.basename/dirname
            if data_path.endswith("/"):
                data_path = data_path[:-1]
            # Set output_dir to new dir named with HEAD commit hash
            new_dir_name = "{}_{}".format(os.path.basename(data_path), str(repo.head.commit))
            output_dir = os.path.join(os.path.dirname(data_path), new_dir_name)
            # Delete temp dir after archival
            delete_dir = True

    # If dir and not already BDBag, make BDBag
    if os.path.isdir(data_path) and not bdbag_api.is_bag(data_path):
        logger.debug("Creating BDBag out of directory '{}'".format(data_path))
        # If output_dir specified, copy data to output dir first
        if output_dir:
            logger.debug("Copying data to '{}' before creating BDBag".format(output_dir))
            output_dir = os.path.abspath(output_dir)
            # If shutil.copytree is called when the destination dir is inside the source dir
            # by more than one layer, it will recurse infinitely.
            # (e.g. /source => /source/dir/dest)
            # Exactly one layer is technically okay (e.g. /source => /source/dest),
            # but it's easier to forbid all parent/child dir cases.
            # Check for this error condition by determining if output_dir is a child
            # of data_path.
            if os.path.commonpath([data_path]) == os.path.commonpath([data_path, output_dir]):
                raise ValueError("The output_dir ('{}') must not be in data_path ('{}')".format(
                    output_dir, data_path))
            try:
                shutil.copytree(data_path, output_dir)
            except FileExistsError:
                raise FileExistsError(
                    ("The output directory must not exist. "
                     "Delete '{}' to submit.\nYou can set delete_dir=True "
                     "to avoid this issue in the future.").format(output_dir))
            # Process new dir instead of old path
            data_path = output_dir
        # If output_dir not specified, never delete data dir
        else:
            delete_dir = False
        # Make bag
        bdbag_api.make_bag(data_path, **bdbag_kwargs)
        if not bdbag_api.is_bag(data_path):
            raise ValueError("Failed to create BDBag from {}".format(data_path))
        logger.debug("BDBag created at '{}'".format(data_path))

    # If dir (must be BDBag at this point), archive
    if os.path.isdir(data_path):
        logger.debug("Archiving BDBag at '{}' using '{}'".format(data_path, CONFIG["ARCHIVE_FORMAT"]))
        new_data_path = bdbag_api.archive_bag(data_path, CONFIG["ARCHIVE_FORMAT"])
        logger.debug("BDBag archived to file '{}'".format(new_data_path))
        # If requested (e.g. Git repo copied dir), delete data dir
        if delete_dir:
            logger.debug("Removing old directory '{}'".format(data_path))
            shutil.rmtree(data_path)
        # Overwrite data_path - don't care about dir for uploading
        data_path = new_data_path

    # Validate TableSchema in BDBag
    logger.debug("Validating TableSchema in BDBag '{}'".format(data_path))
    validation_res = ts_validate(data_path, schema=schema)
    if not validation_res["is_valid"]:
        raise exc.ValidationException(
            "TableSchema invalid due to the following errors: "
            "\n{}\n".format(validation_res["error"]))
    logger.debug("Validation successful")
    return data_path
def main():
    """Command-line entry point for bdbag operations.

    Drives bag creation/update, archive extraction, RO-manifest generation,
    fetch resolution, validation (structure/fast/full), archiving, profile
    validation, and reversion, based on the parsed CLI arguments.

    Returns:
        int: 0 on success, 1 if any operation raised (the error text is
        written to stderr before returning).
    """
    args, is_bag, is_file = parse_cli()
    path = os.path.abspath(args.path)
    archive = None
    temp_path = None
    error = None
    result = 0

    if not args.quiet:
        sys.stderr.write('\n')

    try:
        if not is_file:
            # do not try to create or update the bag if the user just wants to validate or complete an existing bag
            if not ((args.validate or args.validate_profile or args.resolve_fetch)
                    and not (args.update and bdb.is_bag(path))):
                # 'all' expands to every supported checksum algorithm
                if args.checksum and 'all' in args.checksum:
                    args.checksum = ['md5', 'sha1', 'sha256', 'sha512']
                # create or update the bag depending on the input arguments
                bdb.make_bag(path,
                             algs=args.checksum,
                             update=args.update,
                             save_manifests=not args.skip_manifests,
                             prune_manifests=args.prune_manifests,
                             metadata=BAG_METADATA if BAG_METADATA else None,
                             metadata_file=args.metadata_file,
                             remote_file_manifest=args.remote_file_manifest,
                             config_file=args.config_file,
                             ro_metadata_file=args.ro_metadata_file)

        # otherwise just extract the bag if it is an archive and no other conflicting options specified
        elif not (args.validate or args.validate_profile or args.resolve_fetch):
            bdb.extract_bag(path)
            if not args.quiet:
                sys.stderr.write('\n')
            return result

        if args.ro_manifest_generate:
            bdb.generate_ro_manifest(
                path, True if args.ro_manifest_generate == "overwrite" else False,
                config_file=args.config_file)

        if args.resolve_fetch:
            # full validation of asynchronously transferred files may be unreliable; warn the user
            if args.validate == 'full':
                sys.stderr.write(ASYNC_TRANSFER_VALIDATION_WARNING)
            bdb.resolve_fetch(
                path,
                force=True if args.resolve_fetch == 'all' else False,
                keychain_file=args.keychain_file,
                config_file=args.config_file,
                filter_expr=args.fetch_filter)

        if args.validate:
            # archived bags must be extracted to a temp dir before validation
            if is_file:
                temp_path = bdb.extract_bag(path, temp=True)
            if args.validate == 'structure':
                bdb.validate_bag_structure(temp_path if temp_path else path)
            else:
                bdb.validate_bag(
                    temp_path if temp_path else path,
                    fast=True if args.validate == 'fast' else False,
                    config_file=args.config_file)

        if args.archiver:
            archive = bdb.archive_bag(path, args.archiver)

        # when the input is already an archive file, use it for serialization validation
        if archive is None and is_file:
            archive = path

        if args.validate_profile:
            if is_file:
                if not temp_path:
                    temp_path = bdb.extract_bag(path, temp=True)
            profile = bdb.validate_bag_profile(
                temp_path if temp_path else path)
            bdb.validate_bag_serialization(archive if archive else path, profile)

        if args.revert:
            bdb.revert_bag(path)

    except Exception as e:
        result = 1
        error = "Error: %s" % get_typed_exception(e)

    finally:
        # remove any temp extraction dir; cleanup_bag takes the parent of the extracted bag
        if temp_path:
            bdb.cleanup_bag(os.path.dirname(temp_path))
        if result != 0:
            sys.stderr.write("\n%s" % error)
        if not args.quiet:
            sys.stderr.write('\n')

    return result
def download(self, identity=None):
    """Run the configured catalog queries and download the results.

    If a 'bag' section is present in the configuration, the downloaded files
    are assembled into a BDBag (optionally with an RO manifest and optionally
    archived); otherwise the raw downloaded files are left in the output dir.

    Args:
        identity: Pre-validated client identity attributes. If falsy, the
            catalog's authn session is queried to establish the identity.

    Returns:
        list: Either the list of downloaded file entries (no bag configured),
        a single-element list with the bag directory path, or a
        single-element list with the bag archive path.

    Raises:
        RuntimeError: If no configuration / catalog configuration is present,
            or credential validation fails.
        Exception: Any processor or bag-update failure is re-raised after the
            partially created bag is cleaned up.
    """
    if not self.config:
        raise RuntimeError("No configuration specified!")

    if self.config.get("catalog") is None:
        raise RuntimeError("Catalog configuration error!")

    if not identity:
        logging.info("Validating credentials")
        try:
            if not self.credentials:
                self.setCredentials(get_credential(self.hostname))
            attributes = self.catalog.get_authn_session().json()
            identity = attributes["client"]
        except Exception as e:
            raise RuntimeError("Unable to validate credentials: %s" % format_exception(e))

    ro_manifest = None
    ro_author_name = None
    ro_author_orcid = None
    # Unique scratch file for accumulating remote file references; removed in the
    # finally block below once the bag manifests have been updated.
    remote_file_manifest = os.path.abspath(
        ''.join([os.path.join(self.output_dir, 'remote-file-manifest_'), str(uuid.uuid4()), ".json"]))

    catalog_config = self.config['catalog']
    self.envars.update(self.config.get('env', dict()))

    bag_path = None
    bag_archiver = None
    bag_algorithms = None
    bag_config = self.config.get('bag')
    create_bag = True if bag_config else False
    if create_bag:
        bag_name = bag_config.get('bag_name',
                                  ''.join(["deriva_bag", '_', time.strftime("%Y-%m-%d_%H.%M.%S")]))
        bag_path = os.path.abspath(os.path.join(self.output_dir, bag_name))
        bag_archiver = bag_config.get('bag_archiver')
        bag_algorithms = bag_config.get('bag_algorithms', ['sha256'])
        bag_metadata = bag_config.get('bag_metadata',
                                      {"Internal-Sender-Identifier": "deriva@%s" % self.server_url})
        bag_ro = create_bag and stob(bag_config.get('bag_ro', "True"))
    if create_bag:
        bdb.ensure_bag_path_exists(bag_path)
        bag = bdb.make_bag(bag_path, algs=bag_algorithms, metadata=bag_metadata)
        if bag_ro:
            # Prefer explicit bag metadata; fall back through the identity attributes.
            ro_author_name = bag.info.get(
                "Contact-Name",
                identity.get('full_name',
                             identity.get('display_name',
                                          identity.get('id', None))))
            ro_author_orcid = bag.info.get("Contact-Orcid")
            ro_manifest = ro.init_ro_manifest(author_name=ro_author_name,
                                              author_orcid=ro_author_orcid)
            bag_metadata.update({BAG_PROFILE_TAG: BDBAG_RO_PROFILE_ID})

    file_list = list()
    base_path = bag_path if bag_path else self.output_dir
    for query in catalog_config['queries']:
        query_path = query['query_path']
        output_format = query['output_format']
        output_processor = query.get("output_format_processor")
        format_args = query.get('output_format_params', None)
        output_path = query.get('output_path', '')

        try:
            download_processor = findProcessor(output_format, output_processor)
            processor = download_processor(self.envars,
                                           bag=create_bag,
                                           catalog=self.catalog,
                                           store=self.store,
                                           query=query_path,
                                           base_path=base_path,
                                           sub_path=output_path,
                                           format_args=format_args,
                                           remote_file_manifest=remote_file_manifest,
                                           ro_manifest=ro_manifest,
                                           ro_author_name=ro_author_name,
                                           ro_author_orcid=ro_author_orcid)
            file_list.extend(processor.process())
        except Exception as e:
            logging.error(format_exception(e))
            # A partially populated bag is useless; discard it before re-raising.
            if create_bag:
                bdb.cleanup_bag(bag_path)
            raise

    if create_bag:
        try:
            if ro_manifest:
                ro.write_bag_ro_metadata(ro_manifest, bag_path)
            # No remote file references were produced; don't pass an empty manifest.
            if not os.path.isfile(remote_file_manifest):
                remote_file_manifest = None
            bdb.make_bag(bag_path, algs=bag_algorithms,
                         remote_file_manifest=remote_file_manifest, update=True)
        except Exception as e:
            logging.fatal("Exception while updating bag manifests: %s", format_exception(e))
            bdb.cleanup_bag(bag_path)
            raise
        finally:
            if remote_file_manifest and os.path.isfile(remote_file_manifest):
                os.remove(remote_file_manifest)

        logging.info('Created bag: %s' % bag_path)

        if bag_archiver is not None:
            try:
                archive = bdb.archive_bag(bag_path, bag_archiver.lower())
                bdb.cleanup_bag(bag_path)
                return [archive]
            except Exception as e:
                # BUGFIX: the message previously had no '%s' placeholder, so the
                # logging call itself failed and the exception text was lost.
                logging.error("Exception while creating data bag archive: %s", format_exception(e))
                raise
        else:
            return [bag_path]

    return file_list
def update_bag(outdir): bdbag_api.make_bag(outdir, update=True) return bdbag_api.archive_bag(outdir, "zip")
def start_deriva_flow(self, data_path, dcc_id, catalog_id=None,
                      schema=None, server=None, dataset_acls=None,
                      output_dir=None, delete_dir=False, handle_git_repos=True,
                      dry_run=False, test_sub=False, verbose=False, **kwargs):
    """Start the Globus Automate Flow to ingest CFDE data into DERIVA.

    Arguments:
        data_path (str): The path to the data to ingest into DERIVA. The path can be:
                1) A directory to be formatted into a BDBag
                2) A Git repository to be copied into a BDBag
                3) A premade BDBag directory
                4) A premade BDBag in an archive file
        dcc_id (str): The CFDE-recognized DCC ID for this submission.
        catalog_id (int or str): The ID of the DERIVA catalog to ingest into.
                Default None, to create a new catalog.
        schema (str): The named schema or schema file link to validate data against.
                Default None, to only validate against the declared TableSchema.
        server (str): The DERIVA server to ingest to.
                Default None, to use the Action Provider-set default.
        dataset_acls (dict): The DERIVA ACL(s) to set on the final dataset.
                Default None, to use the CFDE default ACLs.
        output_dir (str): The path to create an output directory in. The resulting
                BDBag archive will be named after this directory.
                If not set, the directory will be turned into a BDBag in-place.
                For Git repositories, this is automatically set, but can be overridden.
                If data_path is a file, this has no effect.
                This dir MUST NOT be in the `data_path` directory or any subdirectories.
                Default None.
        delete_dir (bool): Should the output_dir be deleted after submission?
                Has no effect if output_dir is not specified.
                For Git repositories, this is always True.
                Default False.
        handle_git_repos (bool): Should Git repositories be detected and handled?
                When this is False, Git repositories are handled as simple directories
                instead of Git repositories.
                Default True.
        dry_run (bool): Should the data be validated and bagged without starting the Flow?
                When True, does not ingest into DERIVA or start the Globus Automate
                Flow, and the return value will not have valid DERIVA Flow information.
                Default False.
        test_sub (bool): Should the submission be run in "test mode" where the
                submission will be ingested into DERIVA and immediately deleted?
                When True, the data will not remain in DERIVA to be viewed and the
                Flow will terminate before any curation step.
        verbose (bool): Should intermediate status messages be printed out?
                Default False.

    Keyword Arguments:
        force_http (bool): Should the data be sent using HTTP instead of Globus
                Transfer, even if Globus Transfer is available? Because Globus
                Transfer is more robust than HTTP, it is highly recommended to
                leave this False.
                Default False.

    Other keyword arguments are passed directly to the ``make_bag()`` function of
    the BDBag API (see https://github.com/fair-research/bdbag for details).

    Returns:
        dict: A result dict with at least a "success" key; on Flow start it also
        carries the Flow/instance IDs and destination links.
    """
    if verbose:
        print("Startup: Validating input")
    data_path = os.path.abspath(data_path)
    if not os.path.exists(data_path):
        raise FileNotFoundError(
            "Path '{}' does not exist".format(data_path))

    # Named catalogs carry their own schema; an explicit schema would conflict.
    if catalog_id in self.catalogs.keys():
        if schema:
            raise ValueError("You may not specify a schema ('{}') when ingesting to "
                             "a named catalog ('{}'). Retry without specifying "
                             "a schema.".format(schema, catalog_id))
        schema = self.catalogs[catalog_id]
    # Pull out known kwargs
    force_http = kwargs.pop("force_http", False)

    if handle_git_repos:
        if verbose:
            print("Checking for a Git repository")
        # If Git repo, set output_dir appropriately
        try:
            repo = git.Repo(data_path, search_parent_directories=True)
        # Not Git repo
        except git.InvalidGitRepositoryError:
            if verbose:
                print("Not a Git repo")
        # Path not found, turn into standard FileNotFoundError
        except git.NoSuchPathError:
            raise FileNotFoundError(
                "Path '{}' does not exist".format(data_path))
        # Is Git repo
        else:
            if verbose:
                print("Git repo found, collecting metadata")
            # Needs to not have slash at end - is known Git repo already, slash
            # interferes with os.path.basename/dirname
            if data_path.endswith("/"):
                data_path = data_path[:-1]
            # Set output_dir to new dir named with HEAD commit hash
            new_dir_name = "{}_{}".format(os.path.basename(data_path),
                                          str(repo.head.commit))
            output_dir = os.path.join(os.path.dirname(data_path), new_dir_name)
            # Delete temp dir after archival
            delete_dir = True

    # If dir and not already BDBag, make BDBag
    if os.path.isdir(data_path) and not bdbag_api.is_bag(data_path):
        if verbose:
            print("Creating BDBag out of directory '{}'".format(data_path))
        # If output_dir specified, copy data to output dir first
        if output_dir:
            if verbose:
                print("Copying data to '{}' before creating BDBag".format(
                    output_dir))
            output_dir = os.path.abspath(output_dir)
            # If shutil.copytree is called when the destination dir is inside the source dir
            # by more than one layer, it will recurse infinitely.
            # (e.g. /source => /source/dir/dest)
            # Exactly one layer is technically okay (e.g. /source => /source/dest),
            # but it's easier to forbid all parent/child dir cases.
            # Check for this error condition by determining if output_dir is a child
            # of data_path.
            if os.path.commonpath([data_path]) == os.path.commonpath(
                    [data_path, output_dir]):
                raise ValueError(
                    "The output_dir ('{}') must not be in data_path ('{}')"
                    .format(output_dir, data_path))
            try:
                shutil.copytree(data_path, output_dir)
            except FileExistsError:
                raise FileExistsError(
                    ("The output directory must not exist. "
                     "Delete '{}' to submit.\nYou can set delete_dir=True "
                     "to avoid this issue in the future."
                     ).format(output_dir))
            # Process new dir instead of old path
            data_path = output_dir
        # If output_dir not specified, never delete data dir
        else:
            delete_dir = False
        # Make bag
        bdbag_api.make_bag(data_path, **kwargs)
        if not bdbag_api.is_bag(data_path):
            raise ValueError(
                "Failed to create BDBag from {}".format(data_path))
        elif verbose:
            print("BDBag created at '{}'".format(data_path))

    # If dir (must be BDBag at this point), archive
    if os.path.isdir(data_path):
        if verbose:
            print("Archiving BDBag at '{}' using '{}'".format(
                data_path, CONFIG["ARCHIVE_FORMAT"]))
        new_data_path = bdbag_api.archive_bag(data_path,
                                              CONFIG["ARCHIVE_FORMAT"])
        if verbose:
            print("BDBag archived to file '{}'".format(new_data_path))
        # If requested (e.g. Git repo copied dir), delete data dir
        if delete_dir:
            if verbose:
                print("Removing old directory '{}'".format(data_path))
            shutil.rmtree(data_path)
        # Overwrite data_path - don't care about dir for uploading
        data_path = new_data_path

    # Validate TableSchema in BDBag
    if verbose:
        print("Validating TableSchema in BDBag '{}'".format(data_path))
    validation_res = ts_validate(data_path, schema=schema)
    if not validation_res["is_valid"]:
        return {
            "success": False,
            "error": ("TableSchema invalid due to the following errors: \n{}\n"
                      .format(validation_res["error"]))
        }
    elif verbose:
        print("Validation successful")

    # Now BDBag is archived file
    # Set path on destination
    dest_path = "{}{}".format(self.flow_info["cfde_ep_path"],
                              os.path.basename(data_path))

    # If doing dry run, stop here before making Flow input
    if dry_run:
        return {
            "success": True,
            "message": "Dry run validated successfully. No data was transferred."
        }

    # Set up Flow
    if verbose:
        print("Creating input for Flow")
    # If local EP exists (and not force_http), can use Transfer
    # Local EP fetched now in case GCP started after Client creation
    local_endpoint = globus_sdk.LocalGlobusConnectPersonal().endpoint_id
    if local_endpoint and not force_http:
        if verbose:
            print(
                "Using local Globus Connect Personal Endpoint '{}'".format(
                    local_endpoint))
        # Populate Transfer fields in Flow
        flow_id = self.flow_info["flow_id"]
        flow_input = {
            "source_endpoint_id": local_endpoint,
            "source_path": data_path,
            "cfde_ep_id": self.flow_info["cfde_ep_id"],
            "cfde_ep_path": dest_path,
            "cfde_ep_url": self.flow_info["cfde_ep_url"],
            "is_directory": False,
            "test_sub": test_sub,
            "dcc_id": dcc_id
        }
        if catalog_id:
            flow_input["catalog_id"] = str(catalog_id)
        if server:
            flow_input["server"] = server
    # Otherwise, we must PUT the BDBag on the server
    else:
        if verbose:
            print("No Globus Endpoint detected; using HTTP upload instead")
        headers = {}
        self.__https_authorizer.set_authorization_header(headers)
        data_url = "{}{}".format(self.flow_info["cfde_ep_url"], dest_path)

        # NOTE(review): the whole archive is read into memory before upload;
        # presumably submissions are small enough for this — confirm.
        with open(data_path, 'rb') as bag_file:
            bag_data = bag_file.read()

        put_res = requests.put(data_url, data=bag_data, headers=headers)

        # Regenerate headers on 401
        if put_res.status_code == 401:
            self.__https_authorizer.handle_missing_authorization()
            self.__https_authorizer.set_authorization_header(headers)
            put_res = requests.put(data_url, data=bag_data, headers=headers)

        # Error message on failed PUT or any unexpected response
        if put_res.status_code >= 300:
            return {
                "success": False,
                "error": ("Could not upload BDBag to server (error {}):\n{}"
                          .format(put_res.status_code, put_res.content))
            }
        elif put_res.status_code != 200:
            # 2xx other than 200 is tolerated with a warning only.
            print(
                "Warning: HTTP upload returned status code {}, which was unexpected."
                .format(put_res.status_code))

        if verbose:
            print("Upload successful to '{}': {} {}".format(
                data_url, put_res.status_code, put_res.content))

        flow_id = self.flow_info["flow_id"]
        flow_input = {
            "source_endpoint_id": False,
            "data_url": data_url,
            "test_sub": test_sub,
            "dcc_id": dcc_id
        }
        if catalog_id:
            flow_input["catalog_id"] = str(catalog_id)
        if server:
            flow_input["server"] = server

    if verbose:
        print("Flow input populated:\n{}".format(
            json.dumps(flow_input, indent=4, sort_keys=True)))
    # Get Flow scope
    flow_def = self.flow_client.get_flow(flow_id)
    flow_scope = flow_def["globus_auth_scope"]
    # Start Flow
    if verbose:
        print("Starting Flow - Submitting data")
    try:
        flow_res = self.flow_client.run_flow(flow_id, flow_scope, flow_input)
    except globus_sdk.GlobusAPIError as e:
        if e.http_status == 404:
            return {
                "success": False,
                "error": ("Could not access ingest Flow. Are you in the CFDE DERIVA "
                          "Demo Globus Group? Check your membership or apply for access "
                          "here: https://app.globus.org/groups/a437abe3-c9a4-11e9-b441-"
                          "0efb3ba9a670/about")
            }
        else:
            raise
    self.last_flow_run = {
        "flow_id": flow_id,
        "flow_instance_id": flow_res["action_id"]
    }
    if verbose:
        print("Flow started successfully.")

    return {
        "success": True,
        "message": ("Started DERIVA ingest Flow\nFlow ID: {}\nFlow Instance ID: {}"
                    .format(flow_id, flow_res["action_id"])),
        "flow_id": flow_id,
        "flow_instance_id": flow_res["action_id"],
        "cfde_dest_path": dest_path,
        "http_link": "{}{}".format(self.flow_info["cfde_ep_url"], dest_path),
        "globus_web_link": ("https://app.globus.org/file-manager?origin_id={}&origin_path={}"
                            .format(self.flow_info["cfde_ep_id"],
                                    os.path.dirname(dest_path)))
    }