def test_extract_bag_archive_zip_with_relocate_existing(self):
    """Extracting the same zip twice must relocate the first extraction and log it."""
    logger.info(self.getTestHeader('extract bag zip format, relocate existing'))
    try:
        archive = ospj(self.test_archive_dir, 'test-bag.zip')
        # Extract twice: the second pass forces relocation of the existing directory.
        for _ in range(2):
            bag_path = bdb.extract_bag(archive, temp=False)
            self.assertTrue(ospe(bag_path))
            self.assertTrue(bdb.is_bag(bag_path))
        bdb.cleanup_bag(os.path.dirname(bag_path))
        output = self.stream.getvalue()
        self.assertExpectedMessages(["moving existing directory"], output)
    except Exception as e:
        self.fail(get_typed_exception(e))
def parse(self, bag_archive, output_path="out"):
    """Analyze the bag, consuming BagIt-RO metadata into a structure downstream code emitters can use.

    Arguments:
        bag_archive (str): Path to a bag archive file to analyze.
        output_path (str): Directory the bag is extracted into. Default "out".

    Returns:
        dict: Manifest with the extracted bag 'path' and per-CSV 'datasets'
        metadata; empty when the extracted path is not a valid bag.
    """
    manifest = {}
    # Extract the bag.
    bag_path = bdbag_api.extract_bag(bag_archive, output_path=output_path)
    if bdbag_api.is_bag(bag_path):
        logger.debug("Initializing metadata datasets")
        manifest['path'] = bag_path
        manifest['datasets'] = {}
        datasets = manifest['datasets']
        data_path = os.path.join(bag_path, "data")
        # Decompress gzipped CSV payload files in place.
        tar_data_files = glob.glob(os.path.join(data_path, "*.csv.gz"))
        for f in tar_data_files:
            with gzip.open(f, 'rb') as zipped:
                # BUGFIX: strip only the trailing ".gz" suffix; str.replace()
                # would also mangle a ".gz" occurring elsewhere in the path.
                extracted = f[:-len(".gz")]
                with open(extracted, "wb") as stream:
                    stream.write(zipped.read())
        # Collect metadata for each (now-uncompressed) CSV file.
        data_files = glob.glob(os.path.join(data_path, "*.csv"))
        csv_filter = CSVFilter()
        for f in data_files:
            csv_filter.filter_data(f)
            logger.debug(f" --collecting metadata for: {f}")
            datasets[f] = self._get_jsonld_context(f)
            context = datasets[f]['@context']
            # Columns are the context keys whose values are dict-typed entries.
            datasets[f]['columns'] = {
                k: None for k in context if isinstance(context[k], dict)
            }
    return manifest
def ts_validate(data_path, schema=None):
    """Validate a given TableSchema using frictionless.

    Arguments:
        data_path (str): Path to the TableSchema JSON or BDBag directory or
            BDBag archive to validate.
        schema (str): The schema to validate against. If not provided, the data
            is only validated against the defined TableSchema. Default None.

    Raises:
        InvalidInput: If an archive cannot be extracted or is not a valid BDBag.
        ValidationException: If no (or multiple) TableSchema JSON files are
            found, or if the TableSchema fails validation.
    """
    if os.path.isfile(data_path):
        archive_file = data_path
        try:
            data_path = bdbag_api.extract_bag(data_path, temp=True)
        except Exception as e:
            raise InvalidInput("Error extracting %s: %s" % (archive_file, e))
        if not bdbag_api.is_bag(data_path):
            raise InvalidInput(
                "Input %s does not appear to be a valid BDBag. This tool requires a"
                " prepared BDBag archive when invoked on an existing archive file." % archive_file)
    # If data_path is a directory, locate the single TableSchema JSON inside it
    if os.path.isdir(data_path):
        if "data" in os.listdir(data_path):
            data_path = os.path.join(data_path, "data")
        desc_file_list = [filename for filename in os.listdir(data_path)
                          if filename.endswith(".json") and not filename.startswith(".")]
        if len(desc_file_list) < 1:
            raise ValidationException("No TableSchema JSON file found")
        elif len(desc_file_list) > 1:
            # BUGFIX: corrected typo "Mutiple" in the user-facing error message
            raise ValidationException("Multiple JSON files found in directory")
        else:
            data_path = os.path.join(data_path, desc_file_list[0])
    # Read into Package and run frictionless validation
    try:
        pkg = Package(data_path)
        report = validate(pkg, schema=schema)
    except FrictionlessException as e:
        raise ValidationException("Validation error\n%s" % e.error.message)
    if not report.valid:
        # BUGFIX: initialize msg so the final raise cannot hit a NameError when
        # neither report.errors nor any invalid task supplies a message.
        msg = ""
        if report.errors:
            msg = report.errors[0]['message']
        else:
            for task in report['tasks']:
                if not task.valid:
                    msg = task['resource']['path'] + "\n"
                    msg += task['errors'][0]['message']
        raise ValidationException("Validation error in %s" % msg)
def test_extract_bag_archive_tar(self):
    """Extract a tar-format bag archive to a temp dir and verify it is a bag."""
    logger.info(self.getTestHeader('extract bag tar format'))
    try:
        archive = ospj(self.test_archive_dir, 'test-bag.tar')
        bag_path = bdb.extract_bag(archive, temp=True)
        self.assertTrue(ospe(bag_path))
        self.assertTrue(bdb.is_bag(bag_path))
        bdb.cleanup_bag(os.path.dirname(bag_path))
    except Exception as e:
        self.fail(get_typed_exception(e))
def test_extract_bag_archive_tar(self):
    """Tar-archive extraction smoke test (named-exception reporting variant)."""
    logger.info(self.getTestHeader('extract bag tar format'))
    try:
        extracted = bdb.extract_bag(
            ospj(self.test_archive_dir, 'test-bag.tar'), temp=True)
        # The extracted path must exist and be recognized as a bag.
        self.assertTrue(ospe(extracted))
        self.assertTrue(bdb.is_bag(extracted))
        bdb.cleanup_bag(os.path.dirname(extracted))
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))
def test_materialize_from_dir(self):
    """Materialize a bag from a local directory and verify the result is a bag."""
    logger.info(self.getTestHeader('test materialize from dir'))
    saved_cwd = os.getcwd()
    os.chdir(self.tmpdir)
    try:
        materialized = bdb.materialize(self.test_bag_fetch_http_dir)
        self.assertTrue(bdb.is_bag(materialized))
    except Exception as e:
        self.fail(bdbag.get_typed_exception(e))
    finally:
        # Always restore the working directory for subsequent tests.
        os.chdir(saved_cwd)
def test_materialize_non_bag(self):
    """Materializing a plain (non-bag) directory must not yield a bag."""
    logger.info(self.getTestHeader('test materialize non-bag'))
    saved_cwd = os.getcwd()
    os.chdir(self.tmpdir)
    try:
        materialized = bdb.materialize(self.test_data_dir)
        self.assertFalse(bdb.is_bag(materialized))
    except Exception as e:
        self.fail(bdbag.get_typed_exception(e))
    finally:
        # Always restore the working directory for subsequent tests.
        os.chdir(saved_cwd)
def test_materialize_from_file(self):
    """Materialize a bag from an archive file and verify the result is a bag."""
    logger.info(self.getTestHeader('test materialize from file'))
    saved_cwd = os.getcwd()
    os.chdir(self.tmpdir)
    try:
        archive = ospj(self.test_archive_dir, 'test-bag-fetch-http.zip')
        self.assertTrue(bdb.is_bag(bdb.materialize(archive)))
    except Exception as e:
        self.fail(bdbag.get_typed_exception(e))
    finally:
        # Always restore the working directory for subsequent tests.
        os.chdir(saved_cwd)
def test_materialize_from_url(self):
    """Materialize a bag directly from a remote URL and verify the result is a bag."""
    logger.info(self.getTestHeader('test materialize from URL'))
    saved_cwd = os.getcwd()
    os.chdir(self.tmpdir)
    try:
        url = ("https://github.com/fair-research/bdbag/raw/master/test/test-data/test-archives/"
               "test-bag.zip")
        self.assertTrue(bdb.is_bag(bdb.materialize(url)))
    except Exception as e:
        self.fail(bdbag.get_typed_exception(e))
    finally:
        # Always restore the working directory for subsequent tests.
        os.chdir(saved_cwd)
def create_file(self) -> Tuple[str, Optional[str]]:
    """Build a BDBag containing the participants TSV and return the path to its zip archive."""
    with TemporaryDirectory() as staging_dir:
        bag_dir = os.path.join(staging_dir, 'manifest')
        os.makedirs(bag_dir)
        bdbag_api.make_bag(bag_dir)
        tsv_path = os.path.join(bag_dir, 'data', 'participants.tsv')
        with open(tsv_path, 'w') as samples_tsv:
            self._samples_tsv(samples_tsv)
        # Second make_bag(update=True) refreshes manifests with the TSV checksums.
        bag = bdbag_api.make_bag(bag_dir, update=True)
        assert bdbag_api.is_bag(bag_dir)
        bdbag_api.validate_bag(bag_dir)
        assert bdbag_api.check_payload_consistency(bag)
        fd, final_path = mkstemp()
        os.close(fd)
        archive_path = bdbag_api.archive_bag(bag_dir, 'zip')
        # Move the archive out of the temporary directory so it survives the
        # directory's self-destruction when the context manager exits.
        os.rename(archive_path, final_path)
        return final_path, None
def checkIfBag(self):
    """Probe self.currentPath, update self.isBag, and report the result in the status area."""
    # Unset or non-directory paths cannot be bags; no status message is shown.
    if not self.currentPath or not os.path.isdir(self.currentPath):
        self.isBag = False
        return
    # is_bag() can touch the filesystem, so show a busy cursor while it runs.
    QApplication.setOverrideCursor(Qt.WaitCursor)
    self.isBag = bdb.is_bag(self.currentPath)
    QApplication.restoreOverrideCursor()
    if self.isBag:
        self.updateStatus("The directory [%s] is a bag." % self.currentPath, True)
    else:
        self.updateStatus("The directory [%s] is NOT a bag." % self.currentPath, False)
def validate_user_submission(data_path, schema, output_dir=None, delete_dir=False,
                             handle_git_repos=True, bdbag_kwargs=None):
    """Prepare a user submission as a BDBag archive and validate its TableSchema.

    Arguments:
        data_path (str): The path to the data to ingest into DERIVA. The path can be:
                1) A directory to be formatted into a BDBag
                2) A Git repository to be copied into a BDBag
                3) A premade BDBag directory
                4) A premade BDBag in an archive file
        schema (str): The named schema or schema file link to validate data against.
                Default None, to only validate against the declared TableSchema.
        output_dir (str): The path to create an output directory in. The resulting BDBag archive
                will be named after this directory.
                If not set, the directory will be turned into a BDBag in-place.
                For Git repositories, this is automatically set, but can be overridden.
                If data_path is a file, this has no effect.
                This dir MUST NOT be in the `data_path` directory or any subdirectories. Default None.
        delete_dir (bool): Should the output_dir be deleted after submission?
                Has no effect if output_dir is not specified.
                For Git repositories, this is always True. Default False.
        handle_git_repos (bool): Should Git repositories be detected and handled?
                When this is False, Git repositories are handled as simple directories
                instead of Git repositories. Default True.
        bdbag_kwargs (dict): Extra args to pass to bdbag

    Returns:
        str: Path to the validated BDBag archive file.

    Raises:
        FileNotFoundError: If data_path does not exist.
        ValueError: If output_dir is inside data_path, or BDBag creation fails.
        FileExistsError: If output_dir already exists.
    """
    bdbag_kwargs = bdbag_kwargs or {}
    data_path = os.path.abspath(data_path)
    if not os.path.exists(data_path):
        raise FileNotFoundError("Path '{}' does not exist".format(data_path))

    if handle_git_repos:
        logger.debug("Checking for a Git repository")
        # If Git repo, set output_dir appropriately
        try:
            repo = git.Repo(data_path, search_parent_directories=True)
        # Not Git repo
        except git.InvalidGitRepositoryError:
            logger.debug("Not a Git repo")
        # Path not found, turn into standard FileNotFoundError
        except git.NoSuchPathError:
            raise FileNotFoundError("Path '{}' does not exist".format(data_path))
        # Is Git repo
        else:
            logger.debug("Git repo found, collecting metadata")
            # Needs to not have slash at end - is known Git repo already, slash
            # interferes with os.path.basename/dirname
            if data_path.endswith("/"):
                data_path = data_path[:-1]
            # Set output_dir to new dir named with HEAD commit hash
            new_dir_name = "{}_{}".format(os.path.basename(data_path), str(repo.head.commit))
            output_dir = os.path.join(os.path.dirname(data_path), new_dir_name)
            # Delete temp dir after archival
            delete_dir = True

    # If dir and not already BDBag, make BDBag
    if os.path.isdir(data_path) and not bdbag_api.is_bag(data_path):
        logger.debug("Creating BDBag out of directory '{}'".format(data_path))
        # If output_dir specified, copy data to output dir first
        if output_dir:
            logger.debug("Copying data to '{}' before creating BDBag".format(output_dir))
            output_dir = os.path.abspath(output_dir)
            # If shutil.copytree is called when the destination dir is inside the source dir
            # by more than one layer, it will recurse infinitely.
            # (e.g. /source => /source/dir/dest)
            # Exactly one layer is technically okay (e.g. /source => /source/dest),
            # but it's easier to forbid all parent/child dir cases.
            # Check for this error condition by determining if output_dir is a child
            # of data_path.
            if os.path.commonpath([data_path]) == os.path.commonpath([data_path, output_dir]):
                raise ValueError(
                    "The output_dir ('{}') must not be in data_path ('{}')".format(
                        output_dir, data_path))
            try:
                shutil.copytree(data_path, output_dir)
            except FileExistsError:
                raise FileExistsError(
                    ("The output directory must not exist. "
                     "Delete '{}' to submit.\nYou can set delete_dir=True "
                     "to avoid this issue in the future.").format(output_dir))
            # Process new dir instead of old path
            data_path = output_dir
        # If output_dir not specified, never delete data dir
        else:
            delete_dir = False
        # Make bag
        bdbag_api.make_bag(data_path, **bdbag_kwargs)
        if not bdbag_api.is_bag(data_path):
            raise ValueError("Failed to create BDBag from {}".format(data_path))
        logger.debug("BDBag created at '{}'".format(data_path))

    # If dir (must be BDBag at this point), archive
    if os.path.isdir(data_path):
        logger.debug("Archiving BDBag at '{}' using '{}'".format(
            data_path, CONFIG["ARCHIVE_FORMAT"]))
        new_data_path = bdbag_api.archive_bag(data_path, CONFIG["ARCHIVE_FORMAT"])
        logger.debug("BDBag archived to file '{}'".format(new_data_path))
        # If requested (e.g. Git repo copied dir), delete data dir
        if delete_dir:
            logger.debug("Removing old directory '{}'".format(data_path))
            shutil.rmtree(data_path)
        # Overwrite data_path - don't care about dir for uploading
        data_path = new_data_path

    # Validate TableSchema in BDBag
    logger.debug("Validating TableSchema in BDBag '{}'".format(data_path))
    validation_res = ts_validate(data_path, schema=schema)
    if not validation_res["is_valid"]:
        raise exc.ValidationException(
            "TableSchema invalid due to the following errors: "
            "\n{}\n".format(validation_res["error"]))
    logger.debug("Validation successful")
    return data_path
def main():
    """CLI entry point: create/update/extract/validate/archive a bag per parsed arguments.

    Returns 0 on success, 1 on any handled error (the error text is written to
    stderr in the finally block).
    """
    args, is_bag, is_file = parse_cli()
    path = os.path.abspath(args.path)
    archive = None      # path of a created/assumed bag archive, if any
    temp_path = None    # temp extraction dir used by validation of archive inputs
    error = None
    result = 0

    if not args.quiet:
        sys.stderr.write('\n')

    try:
        if not is_file:
            # do not try to create or update the bag if the user just wants to validate
            # or complete an existing bag
            if not ((args.validate or args.validate_profile or args.resolve_fetch) and
                    not (args.update and bdb.is_bag(path))):
                if args.checksum and 'all' in args.checksum:
                    args.checksum = ['md5', 'sha1', 'sha256', 'sha512']
                # create or update the bag depending on the input arguments
                bdb.make_bag(path,
                             algs=args.checksum,
                             update=args.update,
                             save_manifests=not args.skip_manifests,
                             prune_manifests=args.prune_manifests,
                             metadata=BAG_METADATA if BAG_METADATA else None,
                             metadata_file=args.metadata_file,
                             remote_file_manifest=args.remote_file_manifest,
                             config_file=args.config_file,
                             ro_metadata_file=args.ro_metadata_file)
        # otherwise just extract the bag if it is an archive and no other conflicting
        # options specified
        elif not (args.validate or args.validate_profile or args.resolve_fetch):
            bdb.extract_bag(path)
            if not args.quiet:
                sys.stderr.write('\n')
            return result

        if args.ro_manifest_generate:
            bdb.generate_ro_manifest(
                path,
                True if args.ro_manifest_generate == "overwrite" else False,
                config_file=args.config_file)

        if args.resolve_fetch:
            if args.validate == 'full':
                sys.stderr.write(ASYNC_TRANSFER_VALIDATION_WARNING)
            bdb.resolve_fetch(path,
                              force=True if args.resolve_fetch == 'all' else False,
                              keychain_file=args.keychain_file,
                              config_file=args.config_file,
                              filter_expr=args.fetch_filter)

        if args.validate:
            # archive inputs must be extracted to a temp dir before validation
            if is_file:
                temp_path = bdb.extract_bag(path, temp=True)
            if args.validate == 'structure':
                bdb.validate_bag_structure(temp_path if temp_path else path)
            else:
                bdb.validate_bag(temp_path if temp_path else path,
                                 fast=True if args.validate == 'fast' else False,
                                 config_file=args.config_file)

        if args.archiver:
            archive = bdb.archive_bag(path, args.archiver)
        if archive is None and is_file:
            archive = path

        if args.validate_profile:
            if is_file:
                # reuse an extraction done for --validate, if any
                if not temp_path:
                    temp_path = bdb.extract_bag(path, temp=True)
            profile = bdb.validate_bag_profile(temp_path if temp_path else path)
            bdb.validate_bag_serialization(archive if archive else path, profile)

        if args.revert:
            bdb.revert_bag(path)

    except Exception as e:
        result = 1
        error = "Error: %s" % get_typed_exception(e)

    finally:
        # clean up any temp extraction dir and report a deferred error message
        if temp_path:
            bdb.cleanup_bag(os.path.dirname(temp_path))
        if result != 0:
            sys.stderr.write("\n%s" % error)
        if not args.quiet:
            sys.stderr.write('\n')

    return result
def parse_cli():
    """Parse and cross-validate bdbag command-line arguments.

    Builds the argparse parser, parses sys.argv, configures logging, then
    enforces the pairwise compatibility rules between flags and the input path
    (exiting with status 2 on any violation).

    Returns:
        tuple: (args, is_bag, is_file) where args is the parsed Namespace,
        is_bag indicates whether the input path is already a bag, and is_file
        indicates whether the input path is a file (bag archive).
    """
    description = 'BDBag utility for working with Bagit/RO archives'

    parser = argparse.ArgumentParser(
        description=description,
        epilog="For more information see: http://github.com/fair-research/bdbag")

    parser.add_argument('--version', action='version', version=VERSION)

    standard_args = parser.add_argument_group('Bag arguments')

    update_arg = "--update"
    standard_args.add_argument(
        update_arg, action="store_true",
        help="Update an existing bag dir, regenerating manifests and fetch.txt if necessary.")

    revert_arg = "--revert"
    standard_args.add_argument(
        revert_arg, action="store_true",
        help="Revert an existing bag directory back to a normal directory, deleting all bag metadata files. "
             "Payload files in the \'data\' directory will be moved back to the directory root, and the \'data\' "
             "directory will be deleted.")

    archiver_arg = "--archiver"
    standard_args.add_argument(
        archiver_arg, choices=['zip', 'tar', 'tgz'],
        help="Archive a bag using the specified format.")

    checksum_arg = "--checksum"
    standard_args.add_argument(
        checksum_arg, action='append', choices=['md5', 'sha1', 'sha256', 'sha512', 'all'],
        help="Checksum algorithm to use: can be specified multiple times with different values. "
             "If \'all\' is specified, every supported checksum will be generated")

    skip_manifests_arg = "--skip-manifests"
    standard_args.add_argument(
        skip_manifests_arg, action='store_true',
        help=str("If \'skip-manifests\' is specified in conjunction with %s, only tagfile manifests will be "
                 "regenerated, with payload manifests and fetch.txt (if any) left as is. This argument should be used "
                 "when only bag metadata has changed." % update_arg))

    prune_manifests_arg = "--prune-manifests"
    standard_args.add_argument(
        prune_manifests_arg, action='store_true',
        help="If specified, any existing checksum manifests not explicitly configured via either"
             " the \"checksum\" argument(s) or configuration file will be deleted from the bag during an update.")

    fetch_arg = "--resolve-fetch"
    standard_args.add_argument(
        fetch_arg, "--fetch", choices=['all', 'missing'],
        help="Download remote files listed in the bag's fetch.txt file. "
             "The \"missing\" option only attempts to fetch files that do not "
             "already exist in the bag payload directory. "
             "The \"all\" option causes all fetch files to be re-acquired,"
             " even if they already exist in the bag payload directory.")

    fetch_filter_arg = "--fetch-filter"
    standard_args.add_argument(
        fetch_filter_arg, metavar="<column><operator><value>",
        help="A simple expression of the form <column><operator><value> where: <column> is the name of a column in "
             "the bag's fetch.txt to be filtered on, <operator> is one of the following tokens; %s, and <value> is a "
             "string pattern or integer to be filtered against." % FILTER_DOCSTRING)

    validate_arg = "--validate"
    standard_args.add_argument(
        validate_arg, choices=['fast', 'full', 'structure'],
        help="Validate a bag directory or bag archive. If \"fast\" is specified, Payload-Oxum (if present) will be "
             "used to check that the payload files are present and accounted for. If \"full\" is specified, "
             "all checksums will be regenerated and compared to the corresponding entries in the manifest. "
             "If \"structure\" is specified, the bag will be checked for structural validity only.")

    validate_profile_arg = "--validate-profile"
    standard_args.add_argument(
        validate_profile_arg, action="store_true",
        help="Validate a bag against the profile specified by the bag's "
             "\"BagIt-Profile-Identifier\" metadata field, if present.")

    config_file_arg = "--config-file"
    standard_args.add_argument(
        config_file_arg, default=DEFAULT_CONFIG_FILE, metavar='<file>',
        help="Optional path to a configuration file. If this argument is not specified, the configuration file "
             "defaults to: %s " % DEFAULT_CONFIG_FILE)

    keychain_file_arg = "--keychain-file"
    standard_args.add_argument(
        keychain_file_arg, default=DEFAULT_KEYCHAIN_FILE, metavar='<file>',
        help="Optional path to a keychain file. If this argument is not specified, the keychain file "
             "defaults to: %s " % DEFAULT_KEYCHAIN_FILE)

    metadata_file_arg = "--metadata-file"
    standard_args.add_argument(
        metadata_file_arg, metavar='<file>',
        help="Optional path to a JSON formatted metadata file")

    ro_metadata_file_arg = "--ro-metadata-file"
    standard_args.add_argument(
        ro_metadata_file_arg, metavar='<file>',
        help="Optional path to a JSON formatted RO metadata file")

    ro_manifest_generate_arg = "--ro-manifest-generate"
    standard_args.add_argument(
        ro_manifest_generate_arg, choices=['overwrite', 'update'],
        help="Automatically generate a basic RO metadata manifest.json file by introspecting a bag's metadata and "
             "structure.")

    remote_file_manifest_arg = "--remote-file-manifest"
    standard_args.add_argument(
        remote_file_manifest_arg, metavar='<file>',
        help="Optional path to a JSON formatted remote file manifest configuration file used to add remote file entries"
             " to the bag manifest(s) and create the bag fetch.txt file.")

    standard_args.add_argument('--quiet', action="store_true", help="Suppress logging output.")

    standard_args.add_argument('--debug', action="store_true", help="Enable debug logging output.")

    standard_args.add_argument(
        'path', metavar="<path>", help="Path to a bag directory or bag archive file.")

    # One dynamic metadata flag per standard BagIt info header (plus Contact-Orcid)
    metadata_args = parser.add_argument_group('Bag metadata arguments')
    headers = list(bagit.STANDARD_BAG_INFO_HEADERS)
    headers.append("Contact-Orcid")
    for header in sorted(headers):
        metadata_args.add_argument('--%s' % header.lower(), action=AddMetadataAction)

    args = parser.parse_args()

    bdb.configure_logging(
        level=logging.ERROR if args.quiet else (logging.DEBUG if args.debug else logging.INFO))

    path = os.path.abspath(args.path)
    if not os.path.exists(path):
        sys.stderr.write("Error: file or directory not found: %s\n\n" % path)
        sys.exit(2)

    is_file = os.path.isfile(path)

    # Flags below are incompatible with archive-file (as opposed to directory) input.
    if args.archiver and is_file:
        sys.stderr.write("Error: A bag archive cannot be created from an existing bag archive.\n\n")
        sys.exit(2)

    if args.checksum and is_file:
        sys.stderr.write("Error: A checksum manifest cannot be added to an existing bag archive. "
                         "The bag must be extracted, updated, and re-archived.\n\n")
        sys.exit(2)

    if args.update and is_file:
        sys.stderr.write("Error: An existing bag archive cannot be updated in-place. "
                         "The bag must first be extracted and then updated.\n\n")
        sys.exit(2)

    if args.revert and is_file:
        sys.stderr.write("Error: An existing bag archive cannot be reverted in-place. "
                         "The bag must first be extracted and then reverted.\n\n")
        sys.exit(2)

    if args.fetch_filter and not args.resolve_fetch:
        sys.stderr.write("Error: The %s argument can only be used with the %s argument.\n\n" %
                         (fetch_filter_arg, fetch_arg))
        sys.exit(2)

    if args.resolve_fetch and is_file:
        sys.stderr.write("Error: It is not possible to resolve remote files directly into a bag archive. "
                         "The bag must first be extracted before the %s argument can be specified.\n\n" %
                         fetch_arg)
        sys.exit(2)

    if args.update and args.resolve_fetch:
        sys.stderr.write("Error: The %s argument is not compatible with the %s argument.\n\n" %
                         (update_arg, fetch_arg))
        sys.exit(2)

    if args.remote_file_manifest and args.resolve_fetch:
        sys.stderr.write("Error: The %s argument is not compatible with the %s argument.\n\n" %
                         (remote_file_manifest_arg, fetch_arg))
        sys.exit(2)

    is_bag = bdb.is_bag(path)

    # Mutating flags applied to an already-existing bag all require --update.
    if args.checksum and not args.update and is_bag:
        sys.stderr.write("Error: Specifying %s for an existing bag requires the %s argument in order "
                         "to apply any changes.\n\n" % (checksum_arg, update_arg))
        sys.exit(2)

    if args.remote_file_manifest and not args.update and is_bag:
        sys.stderr.write("Error: Specifying %s for an existing bag requires the %s argument in order "
                         "to apply any changes.\n\n" % (remote_file_manifest_arg, update_arg))
        sys.exit(2)

    if args.metadata_file and not args.update and is_bag:
        sys.stderr.write("Error: Specifying %s for an existing bag requires the %s argument in order "
                         "to apply any changes.\n\n" % (metadata_file_arg, update_arg))
        sys.exit(2)

    if args.ro_metadata_file and not args.update and is_bag:
        sys.stderr.write("Error: Specifying %s for an existing bag requires the %s argument in order "
                         "to apply any changes.\n\n" % (ro_metadata_file_arg, update_arg))
        sys.exit(2)

    if args.prune_manifests and not args.update and is_bag:
        sys.stderr.write("Error: Specifying %s for an existing bag requires the %s argument in order "
                         "to apply any changes.\n\n" % (prune_manifests_arg, update_arg))
        sys.exit(2)

    if args.skip_manifests and not args.update and is_bag:
        sys.stderr.write("Error: Specifying %s requires the %s argument.\n\n" %
                         (skip_manifests_arg, update_arg))
        sys.exit(2)

    if BAG_METADATA and not args.update and is_bag:
        sys.stderr.write("Error: Adding or modifying metadata %s for an existing bag requires the %s argument "
                         "in order to apply any changes.\n\n" % (BAG_METADATA, update_arg))
        sys.exit(2)

    if args.revert and not is_bag:
        sys.stderr.write("Error: The directory %s is not a bag and therefore cannot be reverted.\n\n" %
                         path)
        sys.exit(2)

    if args.revert and args.update and is_bag:
        sys.stderr.write("Error: The %s argument is not compatible with the %s argument.\n\n" %
                         (revert_arg, update_arg))
        sys.exit(2)

    return args, is_bag, is_file
def locate_bag(root_dir):
    """Walk root_dir top-down and return the first directory that is a bag.

    Arguments:
        root_dir (str): Root of the directory tree to search.

    Returns:
        str or None: Path of the first bag directory encountered in os.walk
        order, or None when the tree contains no bag.
    """
    # 'dirpath' avoids shadowing the builtin 'dir' used by the original.
    for dirpath, _subdirs, _files in os.walk(root_dir):
        if bdbag_api.is_bag(dirpath):
            return dirpath
    # Explicit None return makes the not-found case visible to callers.
    return None
def restore(self, **kwargs):
    """
    Perform the catalog restore operation. The restore process is broken up into six phases:

    1. Pre-process the input path.
        - If the input path is a file, it is assumed that it is a compressed archive file that can be extracted
          into an input directory via a supported codec: `tar`,`tgz`,`bz2`, or `zip`.
        - If the input directory is a valid _bag_ directory structure, the bag will be materialized.
    2. The catalog schema will be restored first. The schema is restored from a ERMRest JSON schema document file.
       The schema document file must be named `catalog-schema.json` and must appear at the root of the input
       directory. The restore process can be configured to exclude the restoration of an enumerated set both
       schema and tables.
    3. The catalog table data will be restored, if present. The table date restoration process is resilient to
       interruption and may be restarted. However, if the catalog schema or data is mutated outside of the scope
       of the restore function in-between such restarts, the restored catalog's consistency cannot be guaranteed.
       The restore process can be configured to exclude the restoration of table data for a set of tables.
    4. The catalog foreign keys will be restored.
    5. The catalog assets will be restored, if present.
    6. On success, the restore state marker annotations will be deleted and the catalog history will be truncated.

    :param kwargs:
    :return:
    """
    success = True
    start = datetime.datetime.now()

    # Phase 1: pre-process input
    logging.info("Processing input path: %s" % self.input_path)
    is_file, is_dir, is_uri = bdb.inspect_path(self.input_path)
    if not (is_file or is_dir or is_uri):
        raise DerivaRestoreError(
            "Invalid input path [%s]. If the specified input path refers to a locally mounted "
            "file or directory, it does not exist or cannot be accessed. If the specified "
            "path is a URI, the scheme component of the URI could not be determined." % self.input_path)

    if is_file or is_dir:
        self.input_path = os.path.abspath(self.input_path)

    if is_file:
        logging.info(
            "The input path [%s] is a file. Assuming input file is a directory archive and extracting..." %
            self.input_path)
        self.input_path = bdb.extract_bag(self.input_path)

    try:
        if not self.no_bag_materialize:
            self.input_path = bdb.materialize(self.input_path)
    except bdb.bdbagit.BagValidationError as e:
        # a failed bag validation is fatal only in strict mode
        if self.strict_bag_validation:
            raise DerivaRestoreError(format_exception(e))
        else:
            logging.warning(
                "Input bag validation failed and strict validation mode is disabled. %s" %
                format_exception(e))

    is_bag = bdb.is_bag(self.input_path)

    # bag payloads live under "data/"; plain directories hold the schema at the root
    src_schema_file = os.path.abspath(
        os.path.join(self.input_path, "data" if is_bag else "", "catalog-schema.json"))
    # the src_catalog_stub created below will never be "connected" in any kind of network sense,
    # but we need an instance of ErmrestCatalog in order to get a working Model from the schema file.
    src_catalog_stub = ErmrestCatalog("file", src_schema_file, "1")
    src_model = Model.fromfile(src_catalog_stub, src_schema_file)

    # initialize/connect to destination catalog
    if not self.catalog_id:
        self.catalog_id = self.server.create_ermrest_catalog().catalog_id
        self.server_args["catalog_id"] = self.catalog_id
        logging.info("Created new target catalog with ID: %s" % self.catalog_id)
    self.dst_catalog = self.server.connect_ermrest(self.catalog_id)

    # init dcctx cid to a default
    self.dst_catalog.dcctx['cid'] = self.__class__.__name__

    # build up the model content we will copy to destination
    dst_model = self.dst_catalog.getCatalogModel()

    logging.info("Restoring %s to catalog: %s" %
                 (self.input_path, self.dst_catalog.get_server_uri()))

    # set top-level config right away and find fatal usage errors...
    if self.restore_policy:
        logging.info("Restoring top-level catalog ACLs...")
        if not src_model.acls:
            logging.info("Source schema does not contain any ACLs.")
        else:
            src_model.acls.owner.extend(dst_model.acls.owner)
            self.dst_catalog.put('/acl', json=src_model.acls)

    if self.restore_annotations:
        logging.info("Restoring top-level catalog annotations...")
        self.dst_catalog.put('/annotation', json=src_model.annotations)

    # build up the model content we will copy to destination
    new_model = []
    new_columns = []  # ERMrest does not currently allow bulk column creation
    new_keys = []  # ERMrest does not currently allow bulk key creation
    restore_states = {}
    fkeys_deferred = {}
    exclude_schemas = [] if self.exclude_schemas is None else self.exclude_schemas

    try:
        # Phase 2: diff the source model against the destination model
        for sname, schema in src_model.schemas.items():
            if sname in exclude_schemas:
                continue
            if sname not in dst_model.schemas:
                new_model.append(self.copy_sdef(schema))

            for tname, table in schema.tables.items():
                if table.kind != 'table':
                    logging.warning('Skipping restore of %s %s:%s' % (table.kind, sname, tname))
                    continue

                if 'RID' not in table.column_definitions.elements:
                    raise DerivaRestoreError(
                        "Source table %s.%s lacks system-columns and cannot be restored." %
                        (sname, tname))

                # make sure the source table is pruned of any existing restore state markers
                if table.annotations.get(CLONE_STATE_URL) is not None:
                    del table.annotations[CLONE_STATE_URL]
                if table.annotations.get(self.RESTORE_STATE_URL) is not None:
                    del table.annotations[self.RESTORE_STATE_URL]

                if sname not in dst_model.schemas or tname not in dst_model.schemas[sname].tables:
                    # brand-new table: copy its core def, defer its fkeys
                    new_model.append(self.copy_tdef_core(table))
                    restore_states[(sname, tname)] = 1 if self.restore_data else None
                    fkeys_deferred[(sname, tname)] = self.copy_tdef_fkeys(table)
                else:
                    # table exists at destination: reconcile columns and keys
                    src_columns = {c.name: c for c in table.column_definitions}
                    dst_columns = {
                        c.name: c
                        for c in dst_model.schemas[sname].tables[tname].column_definitions}

                    for cname in src_columns:
                        if cname not in dst_columns:
                            new_columns.append(self.copy_cdef(src_columns[cname]))
                        else:
                            self.check_column_compatibility(src_columns[cname], dst_columns[cname])

                    for cname in dst_columns:
                        if cname not in src_columns:
                            raise DerivaRestoreError(
                                "Destination column %s.%s.%s does not exist in source catalog." %
                                (sname, tname, cname))

                    src_keys = {
                        tuple(sorted(c.name for c in key.unique_columns)): key
                        for key in table.keys}
                    dst_keys = {
                        tuple(sorted(c.name for c in key.unique_columns)): key
                        for key in dst_model.schemas[sname].tables[tname].keys}

                    for utuple in src_keys:
                        if utuple not in dst_keys:
                            new_keys.append(self.copy_kdef(src_keys[utuple]))

                    for utuple in dst_keys:
                        if utuple not in src_keys:
                            raise DerivaRestoreError(
                                "Destination key %s.%s(%s) does not exist in source catalog." %
                                (sname, tname, ', '.join(utuple)))

                    restore_states[(sname, tname)] = \
                        dst_model.schemas[sname].tables[tname].annotations.get(self.RESTORE_STATE_URL)
                    if dst_model.schemas[sname].tables[tname].foreign_keys:
                        # assume that presence of any destination foreign keys means we already completed
                        if self.restore_assets:
                            self.upload_assets()
                        return
                    else:
                        fkeys_deferred[(sname, tname)] = self.copy_tdef_fkeys(table)

        # apply the stage 1 model to the destination in bulk
        logging.info("Restoring catalog schema...")
        if new_model:
            self.dst_catalog.post("/schema", json=new_model).raise_for_status()

        for sname, tname, cdef in new_columns:
            self.dst_catalog.post(
                "/schema/%s/table/%s/column" % (urlquote(sname), urlquote(tname)),
                json=cdef).raise_for_status()

        for sname, tname, kdef in new_keys:
            self.dst_catalog.post(
                "/schema/%s/table/%s/key" % (urlquote(sname), urlquote(tname)),
                json=kdef).raise_for_status()

        # Phase 3: copy data in stage 2
        if self.restore_data:
            logging.info("Restoring catalog data...")
            for sname, tname in restore_states.keys():
                tname_uri = "%s:%s" % (urlquote(sname), urlquote(tname))
                if restore_states[(sname, tname)] == 1:
                    # determine current position in (partial?) copy
                    row = self.dst_catalog.get(
                        "/entity/%s@sort(RID::desc::)?limit=1" % tname_uri).json()
                    if row:
                        last = row[0]['RID']
                        logging.info(
                            "Existing data detected in table [%s] -- will attempt partial restore of "
                            "remaining records following last known RID: %s" % (tname_uri, last))
                    else:
                        last = None

                    table = self.get_json_recordset(
                        self.open_json_stream_file(self.get_table_path(sname, tname, is_bag)),
                        self.data_chunk_size, after=last)

                    total = 0
                    table_success = True
                    try:
                        for chunk in table:
                            if chunk:
                                self.dst_catalog.post(
                                    "/entity/%s?nondefaults=RID,RCT,RCB" % tname_uri, json=chunk)
                                total += len(chunk)
                            else:
                                break
                    except:
                        table_success = False
                    finally:
                        table.close()
                    if table_success:
                        logging.info(
                            "Restoration of table data [%s] successful. %s rows restored." %
                            (tname_uri, total))
                    else:
                        logging.warning(
                            "Restoration of table data [%s] failed. %s rows restored." %
                            (tname_uri, total))

                    # record our progress on catalog in case we fail part way through
                    self.dst_catalog.put(
                        "/schema/%s/table/%s/annotation/%s" % (
                            urlquote(sname), urlquote(tname), urlquote(self.RESTORE_STATE_URL),
                        ), json=2)
                elif restore_states[(sname, tname)] is None and (sname, tname) in {
                        ('public', 'ERMrest_Client'),
                        ('public', 'ERMrest_Group'),
                }:
                    # special sync behavior for magic ermrest tables
                    # HACK: these are assumed small enough to join via local merge of arrays
                    want = sorted(self.load_json_file(self.get_table_path(sname, tname, is_bag)),
                                  key=lambda r: r['ID'])
                    have = sorted(self.dst_catalog.get("/entity/%s?limit=none" % tname_uri).json(),
                                  key=lambda r: r['ID'])
                    create = []
                    update = []

                    pos_want = 0
                    pos_have = 0
                    while pos_want < len(want):
                        while pos_have < len(have) and have[pos_have]['ID'] < want[pos_want]['ID']:
                            # dst-only rows will be retained as is
                            pos_have += 1
                        if pos_have >= len(have) or have[pos_have]['ID'] > want[pos_want]['ID']:
                            # src-only rows will be inserted
                            create.append(want[pos_want])
                            pos_want += 1
                        else:
                            # overlapping rows will be updated
                            update.append(want[pos_want])
                            pos_want += 1

                    self.dst_catalog.post("/entity/%s?nondefaults=RCT,RCB" % tname_uri, json=create)
                    self.dst_catalog.put(
                        "/attributegroup/%s/ID;%s" % (
                            tname_uri,
                            ",".join([
                                urlquote(c.name)
                                for c in src_model.schemas[sname].tables[tname].column_definitions
                                if c.name not in {'RID', 'RMT', 'RMB', 'ID'}])
                        ), json=update)

                    # record our progress on catalog in case we fail part way through
                    self.dst_catalog.put(
                        "/schema/%s/table/%s/annotation/%s" % (
                            urlquote(sname), urlquote(tname), urlquote(self.RESTORE_STATE_URL),
                        ), json=2)

        # Phase 4: apply stage 2 model in bulk only... we won't get here unless preceding succeeded
        logging.info("Restoring foreign keys...")
        new_fkeys = []
        for fkeys in fkeys_deferred.values():
            new_fkeys.extend(fkeys)

        # restore fkeys
        if new_fkeys:
            self.dst_catalog.post("/schema", json=new_fkeys)

        # Phase 5: restore assets
        if self.restore_assets:
            self.upload_assets()

        # Phase 6: cleanup
        self.cleanup_restored_catalog()
    except:
        success = False
        raise
    finally:
        elapsed_time = datetime.datetime.now() - start
        total_secs = elapsed_time.total_seconds()
        elapsed = time.strftime('%H:%M:%S', time.gmtime(total_secs))
        logging.info("Restore of catalog %s %s. %s" %
                     (self.dst_catalog.get_server_uri(),
                      "completed successfully" if success else "failed",
                      ("Elapsed time: %s" % elapsed) if (total_secs > 0) else ""))
def start_deriva_flow(self, data_path, dcc_id, catalog_id=None,
                      schema=None, server=None, dataset_acls=None,
                      output_dir=None, delete_dir=False, handle_git_repos=True,
                      dry_run=False, test_sub=False, verbose=False, **kwargs):
    """Start the Globus Automate Flow to ingest CFDE data into DERIVA.

    Arguments:
        data_path (str): The path to the data to ingest into DERIVA. The path can be:
                1) A directory to be formatted into a BDBag
                2) A Git repository to be copied into a BDBag
                3) A premade BDBag directory
                4) A premade BDBag in an archive file
        dcc_id (str): The CFDE-recognized DCC ID for this submission.
        catalog_id (int or str): The ID of the DERIVA catalog to ingest into.
                Default None, to create a new catalog.
        schema (str): The named schema or schema file link to validate data against.
                Default None, to only validate against the declared TableSchema.
        server (str): The DERIVA server to ingest to.
                Default None, to use the Action Provider-set default.
        dataset_acls (dict): The DERIVA ACL(s) to set on the final dataset.
                Default None, to use the CFDE default ACLs.
        output_dir (str): The path to create an output directory in. The resulting
                BDBag archive will be named after this directory.
                If not set, the directory will be turned into a BDBag in-place.
                For Git repositories, this is automatically set, but can be overridden.
                If data_path is a file, this has no effect.
                This dir MUST NOT be in the `data_path` directory or any subdirectories.
                Default None.
        delete_dir (bool): Should the output_dir be deleted after submission?
                Has no effect if output_dir is not specified.
                For Git repositories, this is always True.
                Default False.
        handle_git_repos (bool): Should Git repositories be detected and handled?
                When this is False, Git repositories are handled as simple directories
                instead of Git repositories.
                Default True.
        dry_run (bool): Should the data be validated and bagged without starting the Flow?
                When True, does not ingest into DERIVA or start the Globus Automate Flow,
                and the return value will not have valid DERIVA Flow information.
                Default False.
        test_sub (bool): Should the submission be run in "test mode" where
                the submission will be inegsted into DERIVA and immediately deleted?
                When True, the data wil not remain in DERIVA to be viewed and the
                Flow will terminate before any curation step.
        verbose (bool): Should intermediate status messages be printed out?
                Default False.

    Keyword Arguments:
        force_http (bool): Should the data be sent using HTTP instead of Globus Transfer,
                even if Globus Transfer is available? Because Globus Transfer is more
                robust than HTTP, it is highly recommended to leave this False.
                Default False.

    Other keyword arguments are passed directly to the ``make_bag()`` function of the
    BDBag API (see https://github.com/fair-research/bdbag for details).
    """
    if verbose:
        print("Startup: Validating input")
    data_path = os.path.abspath(data_path)
    if not os.path.exists(data_path):
        raise FileNotFoundError("Path '{}' does not exist".format(data_path))

    # Named catalogs carry their own schema; an explicit schema is therefore
    # rejected, and the catalog's schema is used instead.
    # NOTE(review): assumes self.catalogs maps catalog_id -> schema — confirm.
    if catalog_id in self.catalogs.keys():
        if schema:
            raise ValueError(
                "You may not specify a schema ('{}') when ingesting to "
                "a named catalog ('{}'). Retry without specifying "
                "a schema.".format(schema, catalog_id))
        schema = self.catalogs[catalog_id]
    # Pull out known kwargs
    force_http = kwargs.pop("force_http", False)

    if handle_git_repos:
        if verbose:
            print("Checking for a Git repository")
        # If Git repo, set output_dir appropriately
        try:
            repo = git.Repo(data_path, search_parent_directories=True)
        # Not Git repo
        except git.InvalidGitRepositoryError:
            if verbose:
                print("Not a Git repo")
        # Path not found, turn into standard FileNotFoundError
        except git.NoSuchPathError:
            raise FileNotFoundError("Path '{}' does not exist".format(data_path))
        # Is Git repo
        else:
            if verbose:
                print("Git repo found, collecting metadata")
            # Needs to not have slash at end - is known Git repo already, slash
            # interferes with os.path.basename/dirname
            if data_path.endswith("/"):
                data_path = data_path[:-1]
            # Set output_dir to new dir named with HEAD commit hash
            new_dir_name = "{}_{}".format(os.path.basename(data_path),
                                          str(repo.head.commit))
            output_dir = os.path.join(os.path.dirname(data_path), new_dir_name)
            # Delete temp dir after archival
            delete_dir = True

    # If dir and not already BDBag, make BDBag
    if os.path.isdir(data_path) and not bdbag_api.is_bag(data_path):
        if verbose:
            print("Creating BDBag out of directory '{}'".format(data_path))
        # If output_dir specified, copy data to output dir first
        if output_dir:
            if verbose:
                print("Copying data to '{}' before creating BDBag".format(
                    output_dir))
            output_dir = os.path.abspath(output_dir)
            # If shutil.copytree is called when the destination dir is inside the source dir
            # by more than one layer, it will recurse infinitely.
            # (e.g. /source => /source/dir/dest)
            # Exactly one layer is technically okay (e.g. /source => /source/dest),
            # but it's easier to forbid all parent/child dir cases.
            # Check for this error condition by determining if output_dir is a child
            # of data_path.
            if os.path.commonpath([data_path]) == os.path.commonpath(
                    [data_path, output_dir]):
                raise ValueError(
                    "The output_dir ('{}') must not be in data_path ('{}')"
                    .format(output_dir, data_path))
            try:
                shutil.copytree(data_path, output_dir)
            except FileExistsError:
                raise FileExistsError(
                    ("The output directory must not exist. "
                     "Delete '{}' to submit.\nYou can set delete_dir=True "
                     "to avoid this issue in the future."
                     ).format(output_dir))
            # Process new dir instead of old path
            data_path = output_dir
        # If output_dir not specified, never delete data dir
        else:
            delete_dir = False
        # Make bag
        bdbag_api.make_bag(data_path, **kwargs)
        if not bdbag_api.is_bag(data_path):
            raise ValueError(
                "Failed to create BDBag from {}".format(data_path))
        elif verbose:
            print("BDBag created at '{}'".format(data_path))

    # If dir (must be BDBag at this point), archive
    if os.path.isdir(data_path):
        if verbose:
            print("Archiving BDBag at '{}' using '{}'".format(
                data_path, CONFIG["ARCHIVE_FORMAT"]))
        new_data_path = bdbag_api.archive_bag(data_path,
                                              CONFIG["ARCHIVE_FORMAT"])
        if verbose:
            print("BDBag archived to file '{}'".format(new_data_path))
        # If requested (e.g. Git repo copied dir), delete data dir
        if delete_dir:
            if verbose:
                print("Removing old directory '{}'".format(data_path))
            shutil.rmtree(data_path)
        # Overwrite data_path - don't care about dir for uploading
        data_path = new_data_path

    # Validate TableSchema in BDBag
    if verbose:
        print("Validating TableSchema in BDBag '{}'".format(data_path))
    validation_res = ts_validate(data_path, schema=schema)
    if not validation_res["is_valid"]:
        return {
            "success": False,
            "error":
            ("TableSchema invalid due to the following errors: \n{}\n".
             format(validation_res["error"]))
        }
    elif verbose:
        print("Validation successful")

    # Now BDBag is archived file
    # Set path on destination
    dest_path = "{}{}".format(self.flow_info["cfde_ep_path"],
                              os.path.basename(data_path))

    # If doing dry run, stop here before making Flow input
    if dry_run:
        return {
            "success": True,
            "message": "Dry run validated successfully. No data was transferred."
        }

    # Set up Flow
    if verbose:
        print("Creating input for Flow")
    # If local EP exists (and not force_http), can use Transfer
    # Local EP fetched now in case GCP started after Client creation
    local_endpoint = globus_sdk.LocalGlobusConnectPersonal().endpoint_id
    if local_endpoint and not force_http:
        if verbose:
            print("Using local Globus Connect Personal Endpoint '{}'".format(
                local_endpoint))
        # Populate Transfer fields in Flow
        flow_id = self.flow_info["flow_id"]
        flow_input = {
            "source_endpoint_id": local_endpoint,
            "source_path": data_path,
            "cfde_ep_id": self.flow_info["cfde_ep_id"],
            "cfde_ep_path": dest_path,
            "cfde_ep_url": self.flow_info["cfde_ep_url"],
            "is_directory": False,
            "test_sub": test_sub,
            "dcc_id": dcc_id
        }
        if catalog_id:
            flow_input["catalog_id"] = str(catalog_id)
        if server:
            flow_input["server"] = server
    # Otherwise, we must PUT the BDBag on the server
    else:
        if verbose:
            print("No Globus Endpoint detected; using HTTP upload instead")
        headers = {}
        self.__https_authorizer.set_authorization_header(headers)
        data_url = "{}{}".format(self.flow_info["cfde_ep_url"], dest_path)
        # NOTE(review): the whole archive is read into memory before the PUT;
        # presumably submissions are small enough for this — confirm.
        with open(data_path, 'rb') as bag_file:
            bag_data = bag_file.read()
        put_res = requests.put(data_url, data=bag_data, headers=headers)

        # Regenerate headers on 401
        if put_res.status_code == 401:
            self.__https_authorizer.handle_missing_authorization()
            self.__https_authorizer.set_authorization_header(headers)
            put_res = requests.put(data_url, data=bag_data, headers=headers)

        # Error message on failed PUT or any unexpected response
        if put_res.status_code >= 300:
            return {
                "success": False,
                "error": ("Could not upload BDBag to server (error {}):\n{}".format(
                    put_res.status_code, put_res.content))
            }
        elif put_res.status_code != 200:
            print("Warning: HTTP upload returned status code {}, which was unexpected."
                  .format(put_res.status_code))

        if verbose:
            print("Upload successful to '{}': {} {}".format(
                data_url, put_res.status_code, put_res.content))

        flow_id = self.flow_info["flow_id"]
        flow_input = {
            "source_endpoint_id": False,
            "data_url": data_url,
            "test_sub": test_sub,
            "dcc_id": dcc_id
        }
        if catalog_id:
            flow_input["catalog_id"] = str(catalog_id)
        if server:
            flow_input["server"] = server

    if verbose:
        print("Flow input populated:\n{}".format(
            json.dumps(flow_input, indent=4, sort_keys=True)))
    # Get Flow scope
    flow_def = self.flow_client.get_flow(flow_id)
    flow_scope = flow_def["globus_auth_scope"]
    # Start Flow
    if verbose:
        print("Starting Flow - Submitting data")
    try:
        flow_res = self.flow_client.run_flow(flow_id, flow_scope, flow_input)
    except globus_sdk.GlobusAPIError as e:
        # A 404 here most commonly means a permissions problem; surface a
        # friendly message instead of the raw API error.
        if e.http_status == 404:
            return {
                "success": False,
                "error": ("Could not access ingest Flow. Are you in the CFDE DERIVA "
                          "Demo Globus Group? Check your membership or apply for access "
                          "here: https://app.globus.org/groups/a437abe3-c9a4-11e9-b441-"
                          "0efb3ba9a670/about")
            }
        else:
            raise
    self.last_flow_run = {
        "flow_id": flow_id,
        "flow_instance_id": flow_res["action_id"]
    }
    if verbose:
        print("Flow started successfully.")

    return {
        "success": True,
        "message": ("Started DERIVA ingest Flow\nFlow ID: {}\nFlow Instance ID: {}".
                    format(flow_id, flow_res["action_id"])),
        "flow_id": flow_id,
        "flow_instance_id": flow_res["action_id"],
        "cfde_dest_path": dest_path,
        "http_link": "{}{}".format(self.flow_info["cfde_ep_url"], dest_path),
        "globus_web_link":
        ("https://app.globus.org/file-manager?origin_id={}&origin_path={}".
         format(self.flow_info["cfde_ep_id"], os.path.dirname(dest_path)))
    }
def parse_cli():
    """Build the bdbag argument parser, parse sys.argv, and validate the
    combination of options against the given bag path.

    Returns:
        tuple: (parsed args namespace, is_bag flag, is_file flag).

    Exits with status 2 (after writing a message to stderr) on any invalid
    option combination or missing path.
    """
    description = 'BD2K BDBag utility for working with Bagit/RO archives'
    parser = argparse.ArgumentParser(
        description=description,
        epilog="For more information see: http://github.com/ini-bdds/bdbag")

    standard_args = parser.add_argument_group('Standard arguments')

    # Several actions are kept in locals so their option_strings can be
    # interpolated into the cross-option validation messages below.
    update_opt = standard_args.add_argument(
        '--update', action="store_true",
        help="Update an existing bag dir, regenerating manifests and fetch.txt if necessary.")

    standard_args.add_argument(
        "--archiver", choices=['zip', 'tar', 'tgz'],
        help="Archive a bag using the specified format.")

    checksum_opt = standard_args.add_argument(
        "--checksum", action='append',
        choices=['md5', 'sha1', 'sha256', 'sha512', 'all'],
        help="Checksum algorithm to use: can be specified multiple times with different values. "
             "If \'all\' is specified, every supported checksum will be generated")

    skip_manifests_opt = standard_args.add_argument(
        "--skip-manifests", action='store_true',
        help=str("If \'skip-manifests\' is specified in conjunction with %s, only tagfile manifests will be "
                 "regenerated, with payload manifests and fetch.txt (if any) left as is. This argument should be used "
                 "when only bag metadata has changed." % update_opt.option_strings))

    prune_manifests_opt = standard_args.add_argument(
        "--prune-manifests", action='store_true',
        help="If specified, any existing checksum manifests not explicitly configured via either"
             " the \"checksum\" argument(s) or configuration file will be deleted from the bag during an update.")

    fetch_opt = standard_args.add_argument(
        '--resolve-fetch', choices=['all', 'missing'],
        help="Download remote files listed in the bag's fetch.txt file. "
             "The \"missing\" option only attempts to fetch files that do not "
             "already exist in the bag payload directory. "
             "The \"all\" option causes all fetch files to be re-acquired,"
             " even if they already exist in the bag payload directory.")

    standard_args.add_argument(
        '--validate', choices=['fast', 'full'],
        help="Validate a bag directory or bag archive. If \"fast\" is specified, Payload-Oxum (if present) will be "
             "used to check that the payload files are present and accounted for. Otherwise if \"full\" is specified, "
             "all checksums will be regenerated and compared to the corresponding entries in the manifest")

    standard_args.add_argument(
        '--validate-profile', action="store_true",
        help="Validate a bag against the profile specified by the bag's "
             "\"BagIt-Profile-Identifier\" metadata field, if present.")

    standard_args.add_argument(
        '--config-file', default=DEFAULT_CONFIG_FILE, metavar='<file>',
        help="Optional path to a configuration file. If this argument is not specified, the configuration file "
             "defaults to: %s " % DEFAULT_CONFIG_FILE)

    metadata_file_opt = standard_args.add_argument(
        '--metadata-file', metavar='<file>',
        help="Optional path to a JSON formatted metadata file")

    remote_manifest_opt = standard_args.add_argument(
        '--remote-file-manifest', metavar='<file>',
        help="Optional path to a JSON formatted remote file manifest configuration file used to add remote file entries"
             " to the bag manifest(s) and create the bag fetch.txt file.")

    standard_args.add_argument(
        '--quiet', action="store_true", help="Suppress logging output.")
    standard_args.add_argument(
        '--debug', action="store_true", help="Enable debug logging output.")
    standard_args.add_argument(
        '--bag-path', metavar="<path>", required=True,
        help="Path to a bag directory or bag archive file.")

    # One optional metadata flag per standard bag-info header.
    metadata_args = parser.add_argument_group('Bag metadata arguments')
    for header in bagit.STANDARD_BAG_INFO_HEADERS:
        metadata_args.add_argument('--%s' % header.lower(), action=AddMetadataAction)

    args = parser.parse_args()

    if args.quiet:
        log_level = logging.ERROR
    elif args.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    bdb.configure_logging(level=log_level)

    def fail(message):
        # CLI convention: message to stderr, exit status 2.
        sys.stderr.write(message)
        sys.exit(2)

    path = os.path.abspath(args.bag_path)
    if not os.path.exists(path):
        fail("Error: file or directory not found: %s\n\n" % path)

    is_file = os.path.isfile(path)
    if is_file:
        # Operations that only make sense on an extracted bag directory.
        if args.archiver:
            fail("Error: A bag archive cannot be created from an existing bag archive.\n\n")
        if args.checksum:
            fail("Error: A checksum manifest cannot be added to an existing bag archive. "
                 "The bag must be extracted, updated, and re-archived.\n\n")
        if args.update:
            fail("Error: An existing bag archive cannot be updated in-place. "
                 "The bag must first be extracted and then updated.\n\n")

    # Options that cannot be combined with --resolve-fetch.
    if args.resolve_fetch:
        for enabled, opt in ((args.update, update_opt),
                             (args.remote_file_manifest, remote_manifest_opt)):
            if enabled:
                fail("Error: The %s argument is not compatible with the %s argument.\n\n" %
                     (opt.option_strings, fetch_opt.option_strings))

    is_bag = bdb.is_bag(path)
    if is_bag and not args.update:
        # Changes to an existing bag require an explicit --update.
        for enabled, opt in ((args.checksum, checksum_opt),
                             (args.remote_file_manifest, remote_manifest_opt),
                             (args.metadata_file, metadata_file_opt),
                             (args.prune_manifests, prune_manifests_opt)):
            if enabled:
                fail("Error: Specifying %s for an existing bag requires the %s argument in order "
                     "to apply any changes.\n\n" % (opt.option_strings, update_opt.option_strings))
        if args.skip_manifests:
            fail("Error: Specifying %s requires the %s argument.\n\n" %
                 (skip_manifests_opt.option_strings, update_opt.option_strings))
        if BAG_METADATA:
            fail("Error: Adding or modifying metadata %s for an existing bag requires the %s argument "
                 "in order to apply any changes.\n\n" % (BAG_METADATA, update_opt.option_strings))

    return args, is_bag, is_file
def main():
    """CLI entry point: create/update, extract, fetch, validate, and/or
    archive a bag according to the parsed command-line arguments.

    Returns:
        int: 0 on success, 1 if any operation raised an exception (the
        error is written to stderr).
    """
    sys.stderr.write('\n')

    args, is_bag, is_file = parse_cli()
    path = os.path.abspath(args.bag_path)

    archive = None
    temp_path = None
    error = None
    result = 0

    try:
        if not is_file:
            # do not try to create or update the bag if the user just wants to validate or complete an existing bag
            if not ((args.validate or args.validate_profile or args.resolve_fetch)
                    and not (args.update and bdb.is_bag(path))):
                # 'all' expands to every supported checksum algorithm
                if args.checksum and 'all' in args.checksum:
                    args.checksum = ['md5', 'sha1', 'sha256', 'sha512']
                # create or update the bag depending on the input arguments
                bdb.make_bag(path,
                             args.checksum,
                             args.update,
                             args.skip_manifests,
                             args.prune_manifests,
                             BAG_METADATA or None,
                             args.metadata_file,
                             args.remote_file_manifest,
                             args.config_file)
        # otherwise just extract the bag if it is an archive and no other conflicting options specified
        elif not (args.validate or args.validate_profile or args.resolve_fetch):
            bdb.extract_bag(path)
            sys.stderr.write('\n')
            return result

        if args.resolve_fetch:
            if args.validate == 'full':
                sys.stderr.write(ASYNC_TRANSFER_VALIDATION_WARNING)
            bdb.resolve_fetch(path, args.resolve_fetch == 'all')

        if args.validate:
            # archives must be extracted to a temp dir before validation
            if is_file:
                temp_path = bdb.extract_bag(path, temp=True)
            bdb.validate_bag(temp_path or path,
                             args.validate == 'fast',
                             args.config_file)

        if args.archiver:
            archive = bdb.archive_bag(path, args.archiver)

        # an input archive file serves as its own serialization for profile checks
        if archive is None and is_file:
            archive = path

        if args.validate_profile:
            if is_file and not temp_path:
                temp_path = bdb.extract_bag(path, temp=True)
            profile = bdb.validate_bag_profile(temp_path or path)
            bdb.validate_bag_serialization(archive or path, profile)

    except Exception as e:
        result = 1
        error = "Error: %s" % bdbag.get_named_exception(e)

    finally:
        # always remove any temp extraction dir, then report failure if any
        if temp_path:
            bdb.cleanup_bag(os.path.dirname(temp_path))
        if result != 0:
            sys.stderr.write("\n%s" % error)
        sys.stderr.write('\n')

    return result