def parse_cli():
    """Build the top-level bdbag-utils argument parser, parse the command
    line, and configure logging.

    Returns:
        (args, parser): the parsed namespace and the parser itself, so the
        caller can print usage on error.
    """
    parser = argparse.ArgumentParser(
        description='Utility routines for working with BDBags',
        epilog="For more information see: http://github.com/fair-research/bdbag")
    parser.add_argument('--quiet', action="store_true", help="Suppress logging output.")
    parser.add_argument('--debug', action="store_true", help="Enable debug logging output.")
    subparsers = parser.add_subparsers(dest="subparser", help="sub-command help")

    # Each helper registers one sub-command on the shared subparser collection.
    for register_subcommand in (create_crfm_fs_subparser,
                                create_crfm_file_subparser,
                                create_crfm_urls_subparser):
        register_subcommand(subparsers)

    args = parser.parse_args()

    # --quiet wins over --debug; with neither flag, default to INFO.
    if args.quiet:
        log_level = logging.ERROR
    elif args.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    bdb.configure_logging(level=log_level)

    return args, parser
def main(argv):
    """Create a BDBag whose payload is a set of Minid-identified remote sub-bags.

    Reads a file listing Minids, creates the bag directory, writes a README and
    a remote-file-manifest, builds the bag, attaches an RO manifest, rebuilds
    the bag to checksum the RO metadata, and optionally fetches/validates.

    NOTE(review): ``argv`` is accepted but never used -- arguments are parsed
    from ``sys.argv``. Confirm callers rely on that before changing.
    """
    import shutil  # local import: only needed for temp-dir cleanup below

    parser = argparse.ArgumentParser(description='Program to create a BDBag containing a set of Minids for remote content')
    parser.add_argument('-m', '--minids', metavar='<minid file>',
                        help='File listing Minids for new bag', required=True)
    parser.add_argument('-b', '--bagname', metavar='<bag name>',
                        help='Name of directory for new bag.', required=True)
    parser.add_argument('-v', '--verify', action='store_true',
                        help='Validate bag after building it.', required=False)
    parser.add_argument('-q', '--quiet', action="store_true", help="Suppress logging output.")
    parser.add_argument('-d', '--debug', action="store_true", help="Enable debug logging output.")
    parser.add_argument('-n', '--author-name', metavar="<person or entity name>",
                        help="Optional name of the person or entity responsible for the creation of this bag, "
                             "for inclusion in the bag metadata.")
    parser.add_argument('-o', '--author-orcid', metavar="<orcid>",
                        help="Optional ORCID identifier of the bag creator, for inclusion in the bag metadata.")
    args = parser.parse_args()

    # --quiet takes precedence over --debug.
    bdb.configure_logging(level=logging.ERROR if args.quiet else (logging.DEBUG if args.debug else logging.INFO))

    # Create the directory that will hold the new BDBag
    bdb.ensure_bag_path_exists(args.bagname)

    # For each supplied minid, fetch sub-bag to determine its properties
    minid_fields = extract_fields(args.minids)

    # Create 'README' file in the newly created bag directory.
    # (moved to 'data' when bag is created)
    write_readme(args.bagname, minid_fields)

    # Create remote_file_manifest_file, to be used by make_bag.
    # Fix: the scratch directory was previously leaked; remove it once the bag
    # has been built (make_bag reads the manifest during the call, so deleting
    # afterwards is safe).
    working_dir = tempfile.mkdtemp(prefix='encode2bag_')
    try:
        remote_file_manifest_file = osp.abspath(osp.join(working_dir, 'remote-file-manifest.json'))
        generate_remote_manifest_file(minid_fields, remote_file_manifest_file)

        # Create the new bag based on the supplied remote manifest file
        bdb.make_bag(args.bagname,
                     algs=['md5', 'sha256'],
                     remote_file_manifest=remote_file_manifest_file)
    finally:
        shutil.rmtree(working_dir, ignore_errors=True)

    # Create metadata/manifest.json file with Research Object JSON object
    ro_manifest = ro.init_ro_manifest(
        author_name=args.author_name,
        author_orcid=args.author_orcid,
        creator_name='bagofbags using BDBag version: %s (Bagit version: %s)' % (VERSION, BAGIT_VERSION),
        creator_uri='https://github.com/fair-research/bdbag/examples/bagofbags/')
    add_remote_file_manifest_to_ro(ro_manifest, minid_fields)
    ro.write_bag_ro_metadata(ro_manifest, args.bagname, 'manifest.json')

    # Run make_bag again to include manifest.json in the checksums etc.
    bdb.make_bag(args.bagname, update=True)

    if args.verify:
        bdb.resolve_fetch(args.bagname, force=True)
        bdb.validate_bag(args.bagname, fast=False, callback=None)
def parse_cli():
    """Build and parse the bdbag command line, validate argument combinations,
    and configure logging.

    Returns:
        (args, is_bag, is_file): the parsed namespace, whether ``args.path``
        is already a bag, and whether it is a file (bag archive) rather than
        a directory.

    Exits with status 2 (after writing a message to stderr) when ``args.path``
    does not exist or an invalid combination of arguments is supplied.

    Cleanup vs. original: removed local bindings for option strings that were
    never referenced again, and dropped redundant ``str(...)`` wrappers around
    help text; all user-visible strings are unchanged.
    """
    description = 'BDBag utility for working with Bagit/RO archives'

    parser = argparse.ArgumentParser(
        description=description,
        epilog="For more information see: http://github.com/fair-research/bdbag")
    parser.add_argument('--version', action='version', version=VERSION)

    standard_args = parser.add_argument_group('Bag arguments')

    # Option strings that appear in error messages below are kept in locals
    # so the literals are not duplicated.
    update_arg = "--update"
    standard_args.add_argument(
        update_arg, action="store_true",
        help="Update an existing bag dir, regenerating manifests and fetch.txt if necessary.")

    revert_arg = "--revert"
    standard_args.add_argument(
        revert_arg, action="store_true",
        help="Revert an existing bag directory back to a normal directory, deleting all bag metadata files. "
             "Payload files in the \'data\' directory will be moved back to the directory root, and the \'data\' "
             "directory will be deleted.")

    standard_args.add_argument(
        "--archiver", choices=['zip', 'tar', 'tgz'],
        help="Archive a bag using the specified format.")

    checksum_arg = "--checksum"
    standard_args.add_argument(
        checksum_arg, action='append',
        choices=['md5', 'sha1', 'sha256', 'sha512', 'all'],
        help="Checksum algorithm to use: can be specified multiple times with different values. "
             "If \'all\' is specified, every supported checksum will be generated")

    skip_manifests_arg = "--skip-manifests"
    standard_args.add_argument(
        skip_manifests_arg, action='store_true',
        help="If \'skip-manifests\' is specified in conjunction with %s, only tagfile manifests will be "
             "regenerated, with payload manifests and fetch.txt (if any) left as is. This argument should be used "
             "when only bag metadata has changed." % update_arg)

    prune_manifests_arg = "--prune-manifests"
    standard_args.add_argument(
        prune_manifests_arg, action='store_true',
        help="If specified, any existing checksum manifests not explicitly configured via either"
             " the \"checksum\" argument(s) or configuration file will be deleted from the bag during an update.")

    fetch_arg = "--resolve-fetch"
    standard_args.add_argument(
        fetch_arg, "--fetch", choices=['all', 'missing'],
        help="Download remote files listed in the bag's fetch.txt file. "
             "The \"missing\" option only attempts to fetch files that do not "
             "already exist in the bag payload directory. "
             "The \"all\" option causes all fetch files to be re-acquired,"
             " even if they already exist in the bag payload directory.")

    fetch_filter_arg = "--fetch-filter"
    standard_args.add_argument(
        fetch_filter_arg, metavar="<column><operator><value>",
        help="A simple expression of the form <column><operator><value> where: <column> is the name of a column in "
             "the bag's fetch.txt to be filtered on, <operator> is one of the following tokens; %s, and <value> is a "
             "string pattern or integer to be filtered against." % FILTER_DOCSTRING)

    standard_args.add_argument(
        "--validate", choices=['fast', 'full', 'structure'],
        help="Validate a bag directory or bag archive. If \"fast\" is specified, Payload-Oxum (if present) will be "
             "used to check that the payload files are present and accounted for. If \"full\" is specified, "
             "all checksums will be regenerated and compared to the corresponding entries in the manifest. "
             "If \"structure\" is specified, the bag will be checked for structural validity only.")

    standard_args.add_argument(
        "--validate-profile", action="store_true",
        help="Validate a bag against the profile specified by the bag's "
             "\"BagIt-Profile-Identifier\" metadata field, if present.")

    standard_args.add_argument(
        "--config-file", default=DEFAULT_CONFIG_FILE, metavar='<file>',
        help="Optional path to a configuration file. If this argument is not specified, the configuration file "
             "defaults to: %s " % DEFAULT_CONFIG_FILE)

    standard_args.add_argument(
        "--keychain-file", default=DEFAULT_KEYCHAIN_FILE, metavar='<file>',
        help="Optional path to a keychain file. If this argument is not specified, the keychain file "
             "defaults to: %s " % DEFAULT_KEYCHAIN_FILE)

    metadata_file_arg = "--metadata-file"
    standard_args.add_argument(
        metadata_file_arg, metavar='<file>',
        help="Optional path to a JSON formatted metadata file")

    ro_metadata_file_arg = "--ro-metadata-file"
    standard_args.add_argument(
        ro_metadata_file_arg, metavar='<file>',
        help="Optional path to a JSON formatted RO metadata file")

    standard_args.add_argument(
        "--ro-manifest-generate", choices=['overwrite', 'update'],
        help="Automatically generate a basic RO metadata manifest.json file by introspecting a bag's metadata and "
             "structure.")

    remote_file_manifest_arg = "--remote-file-manifest"
    standard_args.add_argument(
        remote_file_manifest_arg, metavar='<file>',
        help="Optional path to a JSON formatted remote file manifest configuration file used to add remote file entries"
             " to the bag manifest(s) and create the bag fetch.txt file.")

    standard_args.add_argument('--quiet', action="store_true", help="Suppress logging output.")
    standard_args.add_argument('--debug', action="store_true", help="Enable debug logging output.")
    standard_args.add_argument(
        'path', metavar="<path>", help="Path to a bag directory or bag archive file.")

    # One optional flag per standard bag-info header, plus Contact-Orcid.
    metadata_args = parser.add_argument_group('Bag metadata arguments')
    headers = list(bagit.STANDARD_BAG_INFO_HEADERS)
    headers.append("Contact-Orcid")
    for header in sorted(headers):
        metadata_args.add_argument('--%s' % header.lower(), action=AddMetadataAction)

    args = parser.parse_args()

    # --quiet takes precedence over --debug.
    bdb.configure_logging(
        level=logging.ERROR if args.quiet else (logging.DEBUG if args.debug else logging.INFO))

    path = os.path.abspath(args.path)
    if not os.path.exists(path):
        sys.stderr.write("Error: file or directory not found: %s\n\n" % path)
        sys.exit(2)

    is_file = os.path.isfile(path)
    # A bag archive (file) cannot be modified in place: reject mutating options.
    if args.archiver and is_file:
        sys.stderr.write(
            "Error: A bag archive cannot be created from an existing bag archive.\n\n")
        sys.exit(2)
    if args.checksum and is_file:
        sys.stderr.write(
            "Error: A checksum manifest cannot be added to an existing bag archive. "
            "The bag must be extracted, updated, and re-archived.\n\n")
        sys.exit(2)
    if args.update and is_file:
        sys.stderr.write(
            "Error: An existing bag archive cannot be updated in-place. "
            "The bag must first be extracted and then updated.\n\n")
        sys.exit(2)
    if args.revert and is_file:
        sys.stderr.write(
            "Error: An existing bag archive cannot be reverted in-place. "
            "The bag must first be extracted and then reverted.\n\n")
        sys.exit(2)
    if args.fetch_filter and not args.resolve_fetch:
        sys.stderr.write(
            "Error: The %s argument can only be used with the %s argument.\n\n" %
            (fetch_filter_arg, fetch_arg))
        sys.exit(2)
    if args.resolve_fetch and is_file:
        sys.stderr.write(
            "Error: It is not possible to resolve remote files directly into a bag archive. "
            "The bag must first be extracted before the %s argument can be specified.\n\n" % fetch_arg)
        sys.exit(2)
    # Mutually exclusive operations.
    if args.update and args.resolve_fetch:
        sys.stderr.write(
            "Error: The %s argument is not compatible with the %s argument.\n\n" %
            (update_arg, fetch_arg))
        sys.exit(2)
    if args.remote_file_manifest and args.resolve_fetch:
        sys.stderr.write(
            "Error: The %s argument is not compatible with the %s argument.\n\n" %
            (remote_file_manifest_arg, fetch_arg))
        sys.exit(2)

    is_bag = bdb.is_bag(path)
    # Changes to an existing bag only take effect with --update.
    if args.checksum and not args.update and is_bag:
        sys.stderr.write(
            "Error: Specifying %s for an existing bag requires the %s argument in order "
            "to apply any changes.\n\n" % (checksum_arg, update_arg))
        sys.exit(2)
    if args.remote_file_manifest and not args.update and is_bag:
        sys.stderr.write(
            "Error: Specifying %s for an existing bag requires the %s argument in order "
            "to apply any changes.\n\n" % (remote_file_manifest_arg, update_arg))
        sys.exit(2)
    if args.metadata_file and not args.update and is_bag:
        sys.stderr.write(
            "Error: Specifying %s for an existing bag requires the %s argument in order "
            "to apply any changes.\n\n" % (metadata_file_arg, update_arg))
        sys.exit(2)
    if args.ro_metadata_file and not args.update and is_bag:
        sys.stderr.write(
            "Error: Specifying %s for an existing bag requires the %s argument in order "
            "to apply any changes.\n\n" % (ro_metadata_file_arg, update_arg))
        sys.exit(2)
    if args.prune_manifests and not args.update and is_bag:
        sys.stderr.write(
            "Error: Specifying %s for an existing bag requires the %s argument in order "
            "to apply any changes.\n\n" % (prune_manifests_arg, update_arg))
        sys.exit(2)
    if args.skip_manifests and not args.update and is_bag:
        sys.stderr.write("Error: Specifying %s requires the %s argument.\n\n" %
                         (skip_manifests_arg, update_arg))
        sys.exit(2)
    # NOTE(review): BAG_METADATA appears to be module-level state collected by
    # AddMetadataAction -- confirm against the enclosing module.
    if BAG_METADATA and not args.update and is_bag:
        sys.stderr.write(
            "Error: Adding or modifying metadata %s for an existing bag requires the %s argument "
            "in order to apply any changes.\n\n" % (BAG_METADATA, update_arg))
        sys.exit(2)
    if args.revert and not is_bag:
        sys.stderr.write(
            "Error: The directory %s is not a bag and therefore cannot be reverted.\n\n" % path)
        sys.exit(2)
    if args.revert and args.update and is_bag:
        sys.stderr.write(
            "Error: The %s argument is not compatible with the %s argument.\n\n" %
            (revert_arg, update_arg))
        sys.exit(2)

    return args, is_bag, is_file
def parse_cli():
    """Build the bdbag-utils argument parser with the 'create-rfm' and
    'generate-rfm' sub-commands, parse the command line, and configure logging.

    Returns:
        (args, parser): the parsed namespace and the parser itself, so the
        caller can print usage on error.

    Fixes vs. original help text: the --url-formatter help described
    'append-path' twice (the second case is 'append-filename'); the
    generate-rfm description read "by from a list"; and both --streaming-json
    helps were missing a space between sentences ("file.Enable"). Unused
    local Action bindings and commented-out code were removed.
    """
    description = 'Utility routines for working with BDBags'

    parser = argparse.ArgumentParser(
        description=description,
        epilog="For more information see: http://github.com/fair-research/bdbag")
    parser.add_argument('--quiet', action="store_true", help="Suppress logging output.")
    parser.add_argument('--debug', action="store_true", help="Enable debug logging output.")
    subparsers = parser.add_subparsers(dest="subparser", help="sub-command help")

    # --- create-rfm: build a remote file manifest from a local directory tree.
    parser_crfm = subparsers.add_parser(
        'create-rfm',
        description="Create a remote file manifest by recursively scanning a directory.",
        help='create-rfm help')
    parser_crfm.add_argument(
        'input_path', metavar="<input path>",
        help="Path to a directory tree which will be traversed for input files.")
    parser_crfm.add_argument(
        'output_file', metavar="<output file>",
        help="Path of the filename where the remote file manifest will be written.")
    parser_crfm.add_argument(
        "--checksum", action='append', required=True,
        choices=['md5', 'sha1', 'sha256', 'sha512', 'all'],
        help="Checksum algorithm to use: can be specified multiple times with different values. "
             "If \'all\' is specified, every supported checksum will be generated")
    parser_crfm.add_argument(
        '--base-payload-path', metavar="<url>",
        help="An optional path prefix to prepend to each relative file path found while walking the input directory "
             "tree. All files will be rooted under this base directory path in any bag created from this manifest.")
    # Keep the Action object: its option_strings are interpolated into the
    # --url-formatter help below.
    base_url_arg = parser_crfm.add_argument(
        '--base-url', metavar="<url>", required=True,
        help="A URL root to prepend to each file listed in the manifest. Can be used to generate fetch URL "
             "fields dynamically.")
    parser_crfm.add_argument(
        "--url-formatter", choices=['none', 'append-path', 'append-filename'], default='none',
        help="Format function for generating remote file URLs. "
             "If \'append-path\' is specified, the existing relative path including the filename will be appended to"
             " the %s argument. If \'append-filename\' is specified, only the filename will be appended. If \"none\" is "
             "specified, the %s argument will be used as-is." %
             (base_url_arg.option_strings, base_url_arg.option_strings))
    parser_crfm.add_argument(
        "--streaming-json", action='store_true', default=False,
        help="If \'streaming-json\' is specified, one JSON tuple object per line will be output to the output file. "
             "Enable this option if the default behavior produces a file that is prohibitively large to parse "
             "entirely into system memory.")
    parser_crfm.set_defaults(func=create_remote_file_manifest)

    # --- generate-rfm: build a remote file manifest by probing a list of URLs.
    parser_grfm = subparsers.add_parser(
        'generate-rfm',
        description="Generate a remote file manifest from a list of HTTP(S) URLs by issuing "
                    "HTTP HEAD requests for Content-Length, Content-Disposition, and Content-MD5 "
                    "headers for each URL",
        help='generate-rfm help')
    parser_grfm.add_argument(
        'input_file', metavar="<input file>",
        help="Path to a newline delimited list of URLs that will be used to generate the remote file manifest.")
    parser_grfm.add_argument(
        'output_file', metavar="<output file>",
        help="Path of the filename where the remote file manifest will be written.")
    parser_grfm.add_argument(
        '--keychain-file', default=DEFAULT_KEYCHAIN_FILE, metavar='<file>',
        help="Optional path to a keychain file. If this argument is not specified, the keychain file "
             "defaults to: %s " % DEFAULT_KEYCHAIN_FILE)
    parser_grfm.add_argument(
        '--base-payload-path', metavar="<url>",
        help="An optional path prefix to prepend to each relative file path found while querying each URL for metadata."
             " All files will be rooted under this base directory path in any bag created from this manifest.")
    parser_grfm.add_argument(
        '--preserve-url-path', default=False, action="store_true",
        help="Preserve the URL file path in the local payload.")
    parser_grfm.add_argument(
        "--streaming-json", action='store_true', default=False,
        help="If \'streaming-json\' is specified, one JSON tuple object per line will be output to the output file. "
             "Enable this option if the default behavior produces a file that is prohibitively large to parse "
             "entirely into system memory.")
    parser_grfm.set_defaults(func=generate_remote_file_manifest)

    args = parser.parse_args()

    # --quiet takes precedence over --debug.
    bdb.configure_logging(
        level=logging.ERROR if args.quiet else (logging.DEBUG if args.debug else logging.INFO))

    return args, parser
# distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import os import gc import sys import shutil import tempfile import unittest import logging from bdbag.bdbag_api import configure_logging from bdbag.bdbagit import open_text_file configure_logging(logpath='test.log', filemode='w', level=logging.DEBUG) class BaseTest(unittest.TestCase): def setUp(self): if sys.version_info < (3, ): self.assertRaisesRegex = self.assertRaisesRegexp self.tmpdir = tempfile.mkdtemp(prefix="bdbag_test_") shutil.copytree(os.path.abspath(os.path.join('test-data')), os.path.join(self.tmpdir, 'test-data')) self.test_data_dir = os.path.join(self.tmpdir, 'test-data', 'test-dir') self.assertTrue(os.path.isdir(self.test_data_dir)) self.test_archive_dir = os.path.join(self.tmpdir, 'test-data',
def parse_cli():
    """Build the (legacy BD2K) bdbag argument parser, parse sys.argv, validate
    argument combinations, and configure logging.

    Returns:
        (args, is_bag, is_file): the parsed argparse namespace, whether
        ``args.bag_path`` is already a bag, and whether it is a file (bag
        archive) rather than a directory.

    Exits with status 2 (message on stderr) when the target path does not
    exist or an invalid combination of arguments is supplied.
    """
    description = 'BD2K BDBag utility for working with Bagit/RO archives'

    parser = argparse.ArgumentParser(
        description=description,
        epilog="For more information see: http://github.com/ini-bdds/bdbag")

    standard_args = parser.add_argument_group('Standard arguments')

    # Action objects are retained so their .option_strings can be interpolated
    # into help and error text below (this renders as e.g. "['--update']").
    update_arg = standard_args.add_argument(
        '--update', action="store_true",
        help="Update an existing bag dir, regenerating manifests and fetch.txt if necessary.")

    standard_args.add_argument(
        "--archiver", choices=['zip', 'tar', 'tgz'],
        help="Archive a bag using the specified format.")

    checksum_arg = standard_args.add_argument(
        "--checksum", action='append',
        choices=['md5', 'sha1', 'sha256', 'sha512', 'all'],
        help="Checksum algorithm to use: can be specified multiple times with different values. "
             "If \'all\' is specified, every supported checksum will be generated")

    skip_manifests_arg = standard_args.add_argument(
        "--skip-manifests", action='store_true',
        help=str("If \'skip-manifests\' is specified in conjunction with %s, only tagfile manifests will be "
                 "regenerated, with payload manifests and fetch.txt (if any) left as is. This argument should be used "
                 "when only bag metadata has changed." % update_arg.option_strings))

    prune_manifests_arg = standard_args.add_argument(
        "--prune-manifests", action='store_true',
        help="If specified, any existing checksum manifests not explicitly configured via either"
             " the \"checksum\" argument(s) or configuration file will be deleted from the bag during an update.")

    fetch_arg = standard_args.add_argument(
        '--resolve-fetch', choices=['all', 'missing'],
        help="Download remote files listed in the bag's fetch.txt file. "
             "The \"missing\" option only attempts to fetch files that do not "
             "already exist in the bag payload directory. "
             "The \"all\" option causes all fetch files to be re-acquired,"
             " even if they already exist in the bag payload directory.")

    standard_args.add_argument(
        '--validate', choices=['fast', 'full'],
        help="Validate a bag directory or bag archive. If \"fast\" is specified, Payload-Oxum (if present) will be "
             "used to check that the payload files are present and accounted for. Otherwise if \"full\" is specified, "
             "all checksums will be regenerated and compared to the corresponding entries in the manifest")

    standard_args.add_argument(
        '--validate-profile', action="store_true",
        help="Validate a bag against the profile specified by the bag's "
             "\"BagIt-Profile-Identifier\" metadata field, if present.")

    standard_args.add_argument(
        '--config-file', default=DEFAULT_CONFIG_FILE, metavar='<file>',
        help="Optional path to a configuration file. If this argument is not specified, the configuration file "
             "defaults to: %s " % DEFAULT_CONFIG_FILE)

    metadata_file_arg = standard_args.add_argument(
        '--metadata-file', metavar='<file>',
        help="Optional path to a JSON formatted metadata file")

    remote_file_manifest_arg = standard_args.add_argument(
        '--remote-file-manifest', metavar='<file>',
        help="Optional path to a JSON formatted remote file manifest configuration file used to add remote file entries"
             " to the bag manifest(s) and create the bag fetch.txt file.")

    standard_args.add_argument(
        '--quiet', action="store_true", help="Suppress logging output.")

    standard_args.add_argument(
        '--debug', action="store_true", help="Enable debug logging output.")

    standard_args.add_argument(
        '--bag-path', metavar="<path>", required=True,
        help="Path to a bag directory or bag archive file.")

    # One optional metadata flag per standard bag-info header
    # (e.g. --source-organization).
    metadata_args = parser.add_argument_group('Bag metadata arguments')
    for header in bagit.STANDARD_BAG_INFO_HEADERS:
        metadata_args.add_argument('--%s' % header.lower(), action=AddMetadataAction)

    args = parser.parse_args()

    # --quiet takes precedence over --debug.
    bdb.configure_logging(level=logging.ERROR if args.quiet else (logging.DEBUG if args.debug else
                                                                  logging.INFO))

    path = os.path.abspath(args.bag_path)
    if not os.path.exists(path):
        sys.stderr.write("Error: file or directory not found: %s\n\n" % path)
        sys.exit(2)

    is_file = os.path.isfile(path)
    # A bag archive (file) cannot be modified in place: reject mutating options.
    if args.archiver and is_file:
        sys.stderr.write("Error: A bag archive cannot be created from an existing bag archive.\n\n")
        sys.exit(2)
    if args.checksum and is_file:
        sys.stderr.write("Error: A checksum manifest cannot be added to an existing bag archive. "
                         "The bag must be extracted, updated, and re-archived.\n\n")
        sys.exit(2)
    if args.update and is_file:
        sys.stderr.write("Error: An existing bag archive cannot be updated in-place. "
                         "The bag must first be extracted and then updated.\n\n")
        sys.exit(2)
    # Mutually exclusive operations.
    if args.update and args.resolve_fetch:
        sys.stderr.write("Error: The %s argument is not compatible with the %s argument.\n\n" %
                         (update_arg.option_strings, fetch_arg.option_strings))
        sys.exit(2)
    if args.remote_file_manifest and args.resolve_fetch:
        sys.stderr.write("Error: The %s argument is not compatible with the %s argument.\n\n" %
                         (remote_file_manifest_arg.option_strings, fetch_arg.option_strings))
        sys.exit(2)

    is_bag = bdb.is_bag(path)
    # Changes to an existing bag only take effect with --update.
    if args.checksum and not args.update and is_bag:
        sys.stderr.write("Error: Specifying %s for an existing bag requires the %s argument in order "
                         "to apply any changes.\n\n" % (checksum_arg.option_strings, update_arg.option_strings))
        sys.exit(2)
    if args.remote_file_manifest and not args.update and is_bag:
        sys.stderr.write("Error: Specifying %s for an existing bag requires the %s argument in order "
                         "to apply any changes.\n\n" % (remote_file_manifest_arg.option_strings, update_arg.option_strings))
        sys.exit(2)
    if args.metadata_file and not args.update and is_bag:
        sys.stderr.write("Error: Specifying %s for an existing bag requires the %s argument in order "
                         "to apply any changes.\n\n" % (metadata_file_arg.option_strings, update_arg.option_strings))
        sys.exit(2)
    if args.prune_manifests and not args.update and is_bag:
        sys.stderr.write("Error: Specifying %s for an existing bag requires the %s argument in order "
                         "to apply any changes.\n\n" % (prune_manifests_arg.option_strings, update_arg.option_strings))
        sys.exit(2)
    if args.skip_manifests and not args.update and is_bag:
        sys.stderr.write("Error: Specifying %s requires the %s argument.\n\n" %
                         (skip_manifests_arg.option_strings, update_arg.option_strings))
        sys.exit(2)
    # NOTE(review): BAG_METADATA appears to be module-level state collected by
    # AddMetadataAction -- confirm against the enclosing module.
    if BAG_METADATA and not args.update and is_bag:
        sys.stderr.write("Error: Adding or modifying metadata %s for an existing bag requires the %s argument "
                         "in order to apply any changes.\n\n" % (BAG_METADATA, update_arg.option_strings))
        sys.exit(2)

    return args, is_bag, is_file