def create_bag_archive(manifest, bag_metadata, ro_metadata, name):
    try:
        if not name:
            now = datetime.datetime.now()
            name = 'Concierge-Bag-{}'.format(now.strftime('%B-%d-%Y'))
        base_folder = create_unique_folder()
        bag_name = join(base_folder, name)
        os.mkdir(bag_name)
        remote_manifest_filename = join(base_folder, str(uuid.uuid4()))
        with open(remote_manifest_filename, 'w') as f:
            f.write(json.dumps(manifest))
        bdbag_api.make_bag(
            bag_name,
            metadata=bag_metadata,
            ro_metadata=ro_metadata,
            remote_file_manifest=remote_manifest_filename,
        )
        bdbag_api.archive_bag(bag_name, settings.BAG_ARCHIVE_FORMAT)
        archive_name = '{}.{}'.format(bag_name, settings.BAG_ARCHIVE_FORMAT)
        os.remove(remote_manifest_filename)
        return archive_name
    except Exception as e:
        log.exception(e)
        raise ConciergeException(str(e), code='bdbag_creation_error')

def test_archive_bag_tar(self):
    logger.info(self.getTestHeader('archive bag tar format'))
    try:
        archive_file = bdb.archive_bag(self.test_bag_dir, 'tar')
        self.assertTrue(ospif(archive_file))
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))

def test_archive_bag_tgz(self):
    logger.info(self.getTestHeader('archive bag tgz format'))
    try:
        archive_file = bdb.archive_bag(self.test_bag_dir, 'tgz')
        self.assertTrue(ospif(archive_file))
    except Exception as e:
        self.fail(get_typed_exception(e))

def test_archive_bag_tar_with_trailing_slash(self):
    logger.info(self.getTestHeader('archive bag tar format with trailing slash'))
    try:
        archive_file = bdb.archive_bag(self.test_bag_dir + os.sep, 'tar')
        self.assertTrue(ospif(archive_file))
    except Exception as e:
        self.fail(get_typed_exception(e))

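# Hedged usage sketch tying the three tests above together: archive_bag()
# takes a bag directory (with or without a trailing slash) and a format name
# such as 'tar', 'tgz', or 'zip', and returns the path of the archive it
# creates alongside the bag directory. The bag_dir argument is hypothetical.
def archive_existing_bag(bag_dir, fmt='zip'):
    archive_file = bdb.archive_bag(bag_dir, fmt)
    assert ospif(archive_file)  # same os.path.isfile alias the tests use
    return archive_file
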
def create_bag_from_metadata_file(metadata_file_path,
                                  remote_file_manifest=None,
                                  working_dir=None,
                                  output_name=None,
                                  output_path=None,
                                  archive_format=None,
                                  creator_name=None,
                                  creator_orcid=None,
                                  create_ro_manifest=False):
    temp_path = None
    if remote_file_manifest is None:
        if working_dir is None:
            working_dir = temp_path = tempfile.mkdtemp(prefix="encode2bag_")
        remote_file_manifest = osp.abspath(osp.join(working_dir, "remote-file-manifest.json"))

    ro_manifest = None
    if create_ro_manifest:
        ro_manifest = init_ro_manifest(creator_name=creator_name, creator_orcid=creator_orcid)
    convert_tsv_metadata_to_remote_file_manifest(metadata_file_path, remote_file_manifest, ro_manifest)

    bag_path = get_target_bag_path(output_name=output_name, output_path=output_path)
    ensure_bag_path_exists(bag_path)
    shutil.copy(osp.abspath(metadata_file_path), bag_path)

    bag_metadata = dict()
    if creator_name:
        bag_metadata["Contact-Name"] = creator_name
    if creator_orcid:
        bag_metadata["Contact-Orcid"] = creator_orcid
    bag = bdb.make_bag(bag_path,
                       algs=["md5", "sha256"],
                       metadata=bag_metadata,
                       remote_file_manifest=remote_file_manifest)

    if create_ro_manifest:
        bag_metadata_dir = os.path.abspath(os.path.join(bag_path, "metadata"))
        if not os.path.exists(bag_metadata_dir):
            os.mkdir(bag_metadata_dir)
        ro_manifest_path = osp.join(bag_metadata_dir, "manifest.json")
        ro.write_ro_manifest(ro_manifest, ro_manifest_path)
        bag_metadata.update(
            {'BagIt-Profile-Identifier':
             "http://raw.githubusercontent.com/ini-bdds/bdbag/master/profiles/bdbag-ro-profile.json"})
        bdb.make_bag(bag_path, update=True, metadata=bag_metadata)

    if archive_format:
        bag_path = bdb.archive_bag(bag_path, archive_format)

    if temp_path:
        shutil.rmtree(temp_path)

    return bag_path

def create_bag_archive(metadata, bag_algorithms=('md5', 'sha256'), **bag_metadata):
    bag_name = join(settings.BAG_STAGING_DIR, str(uuid.uuid4()))
    remote_manifest_filename = join(settings.BAG_STAGING_DIR, str(uuid.uuid4()))
    remote_manifest_formatted = _format_remote_file_manifest(metadata, bag_algorithms)
    with open(remote_manifest_filename, 'w') as f:
        f.write(json.dumps(remote_manifest_formatted))
    os.mkdir(bag_name)
    bdbag_api.make_bag(
        bag_name,
        algs=bag_algorithms,
        metadata=dict(bag_metadata),
        remote_file_manifest=remote_manifest_filename,
    )
    bdbag_api.archive_bag(bag_name, settings.BAG_ARCHIVE_FORMAT)
    archive_name = '{}.{}'.format(bag_name, settings.BAG_ARCHIVE_FORMAT)
    os.remove(remote_manifest_filename)
    return archive_name

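# For context on the remote_file_manifest written above: bdbag expects the
# manifest file to contain a JSON array in which each entry names a url,
# length, and filename plus one field per checksum algorithm. A minimal
# illustrative sketch (values are made up; _format_remote_file_manifest
# presumably emits entries of this shape):
illustrative_remote_manifest = [
    {
        "url": "https://example.org/files/sample.txt",  # hypothetical location
        "length": 223,
        "filename": "sample.txt",
        "md5": "<md5-hex-digest>",
        "sha256": "<sha256-hex-digest>",
    }
]
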
def create_file(self) -> Tuple[str, Optional[str]]:
    with TemporaryDirectory() as temp_path:
        bag_path = os.path.join(temp_path, 'manifest')
        os.makedirs(bag_path)
        bdbag_api.make_bag(bag_path)
        with open(os.path.join(bag_path, 'data', 'participants.tsv'), 'w') as samples_tsv:
            self._samples_tsv(samples_tsv)
        bag = bdbag_api.make_bag(bag_path, update=True)  # update TSV checksums
        assert bdbag_api.is_bag(bag_path)
        bdbag_api.validate_bag(bag_path)
        assert bdbag_api.check_payload_consistency(bag)
        temp, temp_path = mkstemp()
        os.close(temp)
        archive_path = bdbag_api.archive_bag(bag_path, 'zip')
        # Move the bdbag archive out of the temporary directory. This prevents
        # the archive from being deleted when the temporary directory self-destructs.
        os.rename(archive_path, temp_path)
        return temp_path, None

def create_bag(output_dir, update):
    """Create/Update and archive a BDBag from the contents of a passed-in directory."""
    bdbag_api.make_bag(output_dir, update=update)
    return bdbag_api.archive_bag(output_dir, "zip")

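# Usage sketch for create_bag() above; the directory and helper are
# hypothetical. The first call bags the directory in place, and a later call
# with update=True would refresh the manifests after the payload changes;
# either way the return value is the path of the resulting .zip archive.
def export_and_bag(output_dir):
    write_export_files(output_dir)  # hypothetical helper that fills output_dir
    return create_bag(output_dir, update=False)
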
def download(self, **kwargs):
    if not self.config:
        raise DerivaDownloadConfigurationError("No configuration specified!")
    if self.config.get("catalog") is None:
        raise DerivaDownloadConfigurationError("Catalog configuration error!")

    ro_manifest = None
    ro_author_name = None
    ro_author_orcid = None
    remote_file_manifest = os.path.abspath(
        ''.join([os.path.join(self.output_dir, 'remote-file-manifest_'), str(uuid.uuid4()), ".json"]))

    catalog_config = self.config['catalog']
    self.envars.update(self.config.get('env', dict()))
    self.envars.update({"hostname": self.hostname})

    # 1. If we don't have a client identity, we need to authenticate
    identity = kwargs.get("identity")
    if not identity:
        try:
            if not self.credentials:
                self.set_credentials(get_credential(self.hostname))
            logging.info("Validating credentials for host: %s" % self.hostname)
            attributes = self.catalog.get_authn_session().json()
            identity = attributes["client"]
        except HTTPError as he:
            if he.response.status_code == 404:
                logging.info("No existing login session found for host: %s" % self.hostname)
        except Exception as e:
            raise DerivaDownloadAuthenticationError(
                "Unable to validate credentials: %s" % format_exception(e))
    wallet = kwargs.get("wallet", {})

    # 2. Check for bagging config and initialize bag related variables
    bag_path = None
    bag_archiver = None
    bag_algorithms = None
    bag_config = self.config.get('bag')
    create_bag = True if bag_config else False
    if create_bag:
        bag_name = bag_config.get(
            'bag_name',
            ''.join(["deriva_bag", '_', time.strftime("%Y-%m-%d_%H.%M.%S")])).format(**self.envars)
        bag_path = os.path.abspath(os.path.join(self.output_dir, bag_name))
        bag_archiver = bag_config.get('bag_archiver')
        bag_algorithms = bag_config.get('bag_algorithms', ['sha256'])
        bag_metadata = bag_config.get(
            'bag_metadata', {"Internal-Sender-Identifier": "deriva@%s" % self.server_url})
        bag_ro = create_bag and stob(bag_config.get('bag_ro', "True"))
    if create_bag:
        bdb.ensure_bag_path_exists(bag_path)
        bag = bdb.make_bag(bag_path, algs=bag_algorithms, metadata=bag_metadata)
        if bag_ro:
            ro_author_name = bag.info.get(
                "Contact-Name",
                None if not identity else identity.get(
                    'full_name', identity.get('display_name', identity.get('id', None))))
            ro_author_orcid = bag.info.get("Contact-Orcid")
            ro_manifest = ro.init_ro_manifest(author_name=ro_author_name,
                                              author_orcid=ro_author_orcid)
            bag_metadata.update({BAG_PROFILE_TAG: BDBAG_RO_PROFILE_ID})

    # 3. Process the set of queries by locating, instantiating, and invoking the specified processor(s)
    outputs = dict()
    base_path = bag_path if bag_path else self.output_dir
    for processor in catalog_config['query_processors']:
        processor_name = processor["processor"]
        processor_type = processor.get('processor_type')
        processor_params = processor.get('processor_params')
        try:
            query_processor = find_query_processor(processor_name, processor_type)
            processor = query_processor(self.envars,
                                        inputs=outputs,
                                        bag=create_bag,
                                        catalog=self.catalog,
                                        store=self.store,
                                        base_path=base_path,
                                        processor_params=processor_params,
                                        remote_file_manifest=remote_file_manifest,
                                        ro_manifest=ro_manifest,
                                        ro_author_name=ro_author_name,
                                        ro_author_orcid=ro_author_orcid,
                                        identity=identity,
                                        wallet=wallet)
            outputs = processor.process()
        except Exception as e:
            logging.error(format_exception(e))
            if create_bag:
                bdb.cleanup_bag(bag_path)
            raise

    # 4. Execute anything in the transform processing pipeline, if configured
    transform_processors = self.config.get('transform_processors', [])
    if transform_processors:
        for processor in transform_processors:
            processor_name = processor["processor"]
            processor_type = processor.get('processor_type')
            processor_params = processor.get('processor_params')
            try:
                transform_processor = find_transform_processor(processor_name, processor_type)
                processor = transform_processor(self.envars,
                                                inputs=outputs,
                                                processor_params=processor_params,
                                                base_path=base_path,
                                                bag=create_bag,
                                                ro_manifest=ro_manifest,
                                                ro_author_name=ro_author_name,
                                                ro_author_orcid=ro_author_orcid,
                                                identity=identity,
                                                wallet=wallet)
                outputs = processor.process()
            except Exception as e:
                logging.error(format_exception(e))
                raise

    # 5. Create the bag, and archive (serialize) if necessary
    if create_bag:
        try:
            if ro_manifest:
                ro.write_bag_ro_metadata(ro_manifest, bag_path)
            if not os.path.isfile(remote_file_manifest):
                remote_file_manifest = None
            bdb.make_bag(bag_path,
                         algs=bag_algorithms,
                         remote_file_manifest=remote_file_manifest
                         if (remote_file_manifest and os.path.getsize(remote_file_manifest) > 0) else None,
                         update=True)
        except Exception as e:
            logging.fatal("Exception while updating bag manifests: %s" % format_exception(e))
            bdb.cleanup_bag(bag_path)
            raise
        finally:
            if remote_file_manifest and os.path.isfile(remote_file_manifest):
                os.remove(remote_file_manifest)

        logging.info('Created bag: %s' % bag_path)
        if bag_archiver is not None:
            try:
                archive = bdb.archive_bag(bag_path, bag_archiver.lower())
                bdb.cleanup_bag(bag_path)
                outputs = {os.path.basename(archive): {LOCAL_PATH_KEY: archive}}
            except Exception as e:
                logging.error("Exception while creating data bag archive: %s" % format_exception(e))
                raise
        else:
            outputs = {os.path.basename(bag_path): {LOCAL_PATH_KEY: bag_path}}

    # 6. Execute anything in the post processing pipeline, if configured
    post_processors = self.config.get('post_processors', [])
    if post_processors:
        for processor in post_processors:
            processor_name = processor["processor"]
            processor_type = processor.get('processor_type')
            processor_params = processor.get('processor_params')
            try:
                post_processor = find_post_processor(processor_name, processor_type)
                processor = post_processor(self.envars,
                                           inputs=outputs,
                                           processor_params=processor_params,
                                           identity=identity,
                                           wallet=wallet)
                outputs = processor.process()
            except Exception as e:
                logging.error(format_exception(e))
                raise

    return outputs

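# Illustrative 'bag' section of a download() configuration; the keys mirror
# the bag_config.get(...) calls above, while the values are made up.
example_bag_config = {
    "bag": {
        "bag_name": "deriva_bag_{hostname}",  # expanded via .format(**envars)
        "bag_archiver": "zip",                # serialized with bdb.archive_bag()
        "bag_algorithms": ["md5", "sha256"],
        "bag_metadata": {"Internal-Sender-Identifier": "deriva@example.org"},
        "bag_ro": "True",                     # parsed with stob()
    }
}
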
def validate_user_submission(data_path, schema, output_dir=None, delete_dir=False,
                             handle_git_repos=True, bdbag_kwargs=None):
    """
    Arguments:
        data_path (str): The path to the data to ingest into DERIVA.
                The path can be:
                1) A directory to be formatted into a BDBag
                2) A Git repository to be copied into a BDBag
                3) A premade BDBag directory
                4) A premade BDBag in an archive file
        schema (str): The named schema or schema file link to validate data against.
                Default None, to only validate against the declared TableSchema.
        output_dir (str): The path to create an output directory in. The resulting
                BDBag archive will be named after this directory.
                If not set, the directory will be turned into a BDBag in-place.
                For Git repositories, this is automatically set, but can be overridden.
                If data_path is a file, this has no effect.
                This dir MUST NOT be in the `data_path` directory or any subdirectories.
                Default None.
        delete_dir (bool): Should the output_dir be deleted after submission?
                Has no effect if output_dir is not specified.
                For Git repositories, this is always True.
                Default False.
        handle_git_repos (bool): Should Git repositories be detected and handled?
                When this is False, Git repositories are handled as simple
                directories instead of Git repositories.
                Default True.
        bdbag_kwargs (dict): Extra args to pass to bdbag
    """
    bdbag_kwargs = bdbag_kwargs or {}
    data_path = os.path.abspath(data_path)
    if not os.path.exists(data_path):
        raise FileNotFoundError("Path '{}' does not exist".format(data_path))

    if handle_git_repos:
        logger.debug("Checking for a Git repository")
        # If Git repo, set output_dir appropriately
        try:
            repo = git.Repo(data_path, search_parent_directories=True)
        # Not Git repo
        except git.InvalidGitRepositoryError:
            logger.debug("Not a Git repo")
        # Path not found, turn into standard FileNotFoundError
        except git.NoSuchPathError:
            raise FileNotFoundError("Path '{}' does not exist".format(data_path))
        # Is Git repo
        else:
            logger.debug("Git repo found, collecting metadata")
            # Needs to not have slash at end - is known Git repo already, slash
            # interferes with os.path.basename/dirname
            if data_path.endswith("/"):
                data_path = data_path[:-1]
            # Set output_dir to new dir named with HEAD commit hash
            new_dir_name = "{}_{}".format(os.path.basename(data_path), str(repo.head.commit))
            output_dir = os.path.join(os.path.dirname(data_path), new_dir_name)
            # Delete temp dir after archival
            delete_dir = True

    # If dir and not already BDBag, make BDBag
    if os.path.isdir(data_path) and not bdbag_api.is_bag(data_path):
        logger.debug("Creating BDBag out of directory '{}'".format(data_path))
        # If output_dir specified, copy data to output dir first
        if output_dir:
            logger.debug("Copying data to '{}' before creating BDBag".format(output_dir))
            output_dir = os.path.abspath(output_dir)
            # If shutil.copytree is called when the destination dir is inside the source dir
            # by more than one layer, it will recurse infinitely.
            # (e.g. /source => /source/dir/dest)
            # Exactly one layer is technically okay (e.g. /source => /source/dest),
            # but it's easier to forbid all parent/child dir cases.
            # Check for this error condition by determining if output_dir is a child
            # of data_path.
            if os.path.commonpath([data_path]) == os.path.commonpath([data_path, output_dir]):
                raise ValueError("The output_dir ('{}') must not be in data_path ('{}')"
                                 .format(output_dir, data_path))
            try:
                shutil.copytree(data_path, output_dir)
            except FileExistsError:
                raise FileExistsError(
                    ("The output directory must not exist. "
                     "Delete '{}' to submit.\nYou can set delete_dir=True "
                     "to avoid this issue in the future.").format(output_dir))
            # Process new dir instead of old path
            data_path = output_dir
        # If output_dir not specified, never delete data dir
        else:
            delete_dir = False
        # Make bag
        bdbag_api.make_bag(data_path, **bdbag_kwargs)
        if not bdbag_api.is_bag(data_path):
            raise ValueError("Failed to create BDBag from {}".format(data_path))
        logger.debug("BDBag created at '{}'".format(data_path))

    # If dir (must be BDBag at this point), archive
    if os.path.isdir(data_path):
        logger.debug("Archiving BDBag at '{}' using '{}'".format(data_path, CONFIG["ARCHIVE_FORMAT"]))
        new_data_path = bdbag_api.archive_bag(data_path, CONFIG["ARCHIVE_FORMAT"])
        logger.debug("BDBag archived to file '{}'".format(new_data_path))
        # If requested (e.g. Git repo copied dir), delete data dir
        if delete_dir:
            logger.debug("Removing old directory '{}'".format(data_path))
            shutil.rmtree(data_path)
        # Overwrite data_path - don't care about dir for uploading
        data_path = new_data_path

    # Validate TableSchema in BDBag
    logger.debug("Validating TableSchema in BDBag '{}'".format(data_path))
    validation_res = ts_validate(data_path, schema=schema)
    if not validation_res["is_valid"]:
        raise exc.ValidationException(
            "TableSchema invalid due to the following errors: "
            "\n{}\n".format(validation_res["error"]))
    logger.debug("Validation successful")
    return data_path

def main():
    args, is_bag, is_file = parse_cli()
    path = os.path.abspath(args.path)
    archive = None
    temp_path = None
    error = None
    result = 0

    if not args.quiet:
        sys.stderr.write('\n')
    try:
        if not is_file:
            # do not try to create or update the bag if the user just wants to validate or complete an existing bag
            if not ((args.validate or args.validate_profile or args.resolve_fetch) and
                    not (args.update and bdb.is_bag(path))):
                if args.checksum and 'all' in args.checksum:
                    args.checksum = ['md5', 'sha1', 'sha256', 'sha512']
                # create or update the bag depending on the input arguments
                bdb.make_bag(path,
                             algs=args.checksum,
                             update=args.update,
                             save_manifests=not args.skip_manifests,
                             prune_manifests=args.prune_manifests,
                             metadata=BAG_METADATA if BAG_METADATA else None,
                             metadata_file=args.metadata_file,
                             remote_file_manifest=args.remote_file_manifest,
                             config_file=args.config_file,
                             ro_metadata_file=args.ro_metadata_file)
        # otherwise just extract the bag if it is an archive and no other conflicting options specified
        elif not (args.validate or args.validate_profile or args.resolve_fetch):
            bdb.extract_bag(path)
            if not args.quiet:
                sys.stderr.write('\n')
            return result

        if args.ro_manifest_generate:
            bdb.generate_ro_manifest(
                path, True if args.ro_manifest_generate == "overwrite" else False,
                config_file=args.config_file)

        if args.resolve_fetch:
            if args.validate == 'full':
                sys.stderr.write(ASYNC_TRANSFER_VALIDATION_WARNING)
            bdb.resolve_fetch(path,
                              force=True if args.resolve_fetch == 'all' else False,
                              keychain_file=args.keychain_file,
                              config_file=args.config_file,
                              filter_expr=args.fetch_filter)

        if args.validate:
            if is_file:
                temp_path = bdb.extract_bag(path, temp=True)
            if args.validate == 'structure':
                bdb.validate_bag_structure(temp_path if temp_path else path)
            else:
                bdb.validate_bag(temp_path if temp_path else path,
                                 fast=True if args.validate == 'fast' else False,
                                 config_file=args.config_file)

        if args.archiver:
            archive = bdb.archive_bag(path, args.archiver)
        if archive is None and is_file:
            archive = path

        if args.validate_profile:
            if is_file:
                if not temp_path:
                    temp_path = bdb.extract_bag(path, temp=True)
            profile = bdb.validate_bag_profile(temp_path if temp_path else path)
            bdb.validate_bag_serialization(archive if archive else path, profile)

        if args.revert:
            bdb.revert_bag(path)

    except Exception as e:
        result = 1
        error = "Error: %s" % get_typed_exception(e)
    finally:
        if temp_path:
            bdb.cleanup_bag(os.path.dirname(temp_path))
        if result != 0:
            sys.stderr.write("\n%s" % error)
        if not args.quiet:
            sys.stderr.write('\n')

    return result

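# For reference, the attributes consumed above (args.checksum, args.update,
# args.archiver, args.validate, args.validate_profile, args.resolve_fetch)
# correspond to bdbag CLI invocations along these lines; treat the exact flag
# spellings as assumptions inferred from the parsed attribute names:
#
#   bdbag ./my-bag --checksum all --archiver tgz
#   bdbag ./my-bag.tgz --validate full --validate-profile
#   bdbag ./my-bag --resolve-fetch missing
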
def download(self, identity=None):
    if not self.config:
        raise RuntimeError("No configuration specified!")
    if self.config.get("catalog") is None:
        raise RuntimeError("Catalog configuration error!")

    if not identity:
        logging.info("Validating credentials")
        try:
            if not self.credentials:
                self.setCredentials(get_credential(self.hostname))
            attributes = self.catalog.get_authn_session().json()
            identity = attributes["client"]
        except Exception as e:
            raise RuntimeError("Unable to validate credentials: %s" % format_exception(e))

    ro_manifest = None
    ro_author_name = None
    ro_author_orcid = None
    remote_file_manifest = os.path.abspath(
        ''.join([os.path.join(self.output_dir, 'remote-file-manifest_'), str(uuid.uuid4()), ".json"]))

    catalog_config = self.config['catalog']
    self.envars.update(self.config.get('env', dict()))

    bag_path = None
    bag_archiver = None
    bag_algorithms = None
    bag_config = self.config.get('bag')
    create_bag = True if bag_config else False
    if create_bag:
        bag_name = bag_config.get('bag_name',
                                  ''.join(["deriva_bag", '_', time.strftime("%Y-%m-%d_%H.%M.%S")]))
        bag_path = os.path.abspath(os.path.join(self.output_dir, bag_name))
        bag_archiver = bag_config.get('bag_archiver')
        bag_algorithms = bag_config.get('bag_algorithms', ['sha256'])
        bag_metadata = bag_config.get('bag_metadata',
                                      {"Internal-Sender-Identifier": "deriva@%s" % self.server_url})
        bag_ro = create_bag and stob(bag_config.get('bag_ro', "True"))
    if create_bag:
        bdb.ensure_bag_path_exists(bag_path)
        bag = bdb.make_bag(bag_path, algs=bag_algorithms, metadata=bag_metadata)
        if bag_ro:
            ro_author_name = bag.info.get(
                "Contact-Name",
                identity.get('full_name', identity.get('display_name', identity.get('id', None))))
            ro_author_orcid = bag.info.get("Contact-Orcid")
            ro_manifest = ro.init_ro_manifest(author_name=ro_author_name,
                                              author_orcid=ro_author_orcid)
            bag_metadata.update({BAG_PROFILE_TAG: BDBAG_RO_PROFILE_ID})

    file_list = list()
    base_path = bag_path if bag_path else self.output_dir
    for query in catalog_config['queries']:
        query_path = query['query_path']
        output_format = query['output_format']
        output_processor = query.get("output_format_processor")
        format_args = query.get('output_format_params', None)
        output_path = query.get('output_path', '')
        try:
            download_processor = findProcessor(output_format, output_processor)
            processor = download_processor(self.envars,
                                           bag=create_bag,
                                           catalog=self.catalog,
                                           store=self.store,
                                           query=query_path,
                                           base_path=base_path,
                                           sub_path=output_path,
                                           format_args=format_args,
                                           remote_file_manifest=remote_file_manifest,
                                           ro_manifest=ro_manifest,
                                           ro_author_name=ro_author_name,
                                           ro_author_orcid=ro_author_orcid)
            file_list.extend(processor.process())
        except Exception as e:
            logging.error(format_exception(e))
            if create_bag:
                bdb.cleanup_bag(bag_path)
            raise

    if create_bag:
        try:
            if ro_manifest:
                ro.write_bag_ro_metadata(ro_manifest, bag_path)
            if not os.path.isfile(remote_file_manifest):
                remote_file_manifest = None
            bdb.make_bag(bag_path, algs=bag_algorithms,
                         remote_file_manifest=remote_file_manifest, update=True)
        except Exception as e:
            logging.fatal("Exception while updating bag manifests: %s", format_exception(e))
            bdb.cleanup_bag(bag_path)
            raise
        finally:
            if remote_file_manifest and os.path.isfile(remote_file_manifest):
                os.remove(remote_file_manifest)

        logging.info('Created bag: %s' % bag_path)
        if bag_archiver is not None:
            try:
                archive = bdb.archive_bag(bag_path, bag_archiver.lower())
                bdb.cleanup_bag(bag_path)
                return [archive]
            except Exception as e:
                logging.error("Exception while creating data bag archive: %s", format_exception(e))
                raise
        else:
            return [bag_path]

    return file_list

def start_deriva_flow(self, data_path, dcc_id, catalog_id=None,
                      schema=None, server=None, dataset_acls=None,
                      output_dir=None, delete_dir=False, handle_git_repos=True,
                      dry_run=False, test_sub=False, verbose=False, **kwargs):
    """Start the Globus Automate Flow to ingest CFDE data into DERIVA.

    Arguments:
        data_path (str): The path to the data to ingest into DERIVA.
                The path can be:
                1) A directory to be formatted into a BDBag
                2) A Git repository to be copied into a BDBag
                3) A premade BDBag directory
                4) A premade BDBag in an archive file
        dcc_id (str): The CFDE-recognized DCC ID for this submission.
        catalog_id (int or str): The ID of the DERIVA catalog to ingest into.
                Default None, to create a new catalog.
        schema (str): The named schema or schema file link to validate data against.
                Default None, to only validate against the declared TableSchema.
        server (str): The DERIVA server to ingest to.
                Default None, to use the Action Provider-set default.
        dataset_acls (dict): The DERIVA ACL(s) to set on the final dataset.
                Default None, to use the CFDE default ACLs.
        output_dir (str): The path to create an output directory in. The resulting
                BDBag archive will be named after this directory.
                If not set, the directory will be turned into a BDBag in-place.
                For Git repositories, this is automatically set, but can be overridden.
                If data_path is a file, this has no effect.
                This dir MUST NOT be in the `data_path` directory or any subdirectories.
                Default None.
        delete_dir (bool): Should the output_dir be deleted after submission?
                Has no effect if output_dir is not specified.
                For Git repositories, this is always True.
                Default False.
        handle_git_repos (bool): Should Git repositories be detected and handled?
                When this is False, Git repositories are handled as simple
                directories instead of Git repositories.
                Default True.
        dry_run (bool): Should the data be validated and bagged without starting the Flow?
                When True, does not ingest into DERIVA or start the Globus Automate
                Flow, and the return value will not have valid DERIVA Flow information.
                Default False.
        test_sub (bool): Should the submission be run in "test mode" where the
                submission will be ingested into DERIVA and immediately deleted?
                When True, the data will not remain in DERIVA to be viewed and
                the Flow will terminate before any curation step.
        verbose (bool): Should intermediate status messages be printed out?
                Default False.

    Keyword Arguments:
        force_http (bool): Should the data be sent using HTTP instead of Globus
                Transfer, even if Globus Transfer is available? Because Globus
                Transfer is more robust than HTTP, it is highly recommended to
                leave this False.
                Default False.

    Other keyword arguments are passed directly to the ``make_bag()`` function of
    the BDBag API (see https://github.com/fair-research/bdbag for details).
    """
    if verbose:
        print("Startup: Validating input")
    data_path = os.path.abspath(data_path)
    if not os.path.exists(data_path):
        raise FileNotFoundError("Path '{}' does not exist".format(data_path))

    if catalog_id in self.catalogs.keys():
        if schema:
            raise ValueError("You may not specify a schema ('{}') when ingesting to "
                             "a named catalog ('{}'). Retry without specifying "
                             "a schema.".format(schema, catalog_id))
        schema = self.catalogs[catalog_id]
    # Pull out known kwargs
    force_http = kwargs.pop("force_http", False)

    if handle_git_repos:
        if verbose:
            print("Checking for a Git repository")
        # If Git repo, set output_dir appropriately
        try:
            repo = git.Repo(data_path, search_parent_directories=True)
        # Not Git repo
        except git.InvalidGitRepositoryError:
            if verbose:
                print("Not a Git repo")
        # Path not found, turn into standard FileNotFoundError
        except git.NoSuchPathError:
            raise FileNotFoundError("Path '{}' does not exist".format(data_path))
        # Is Git repo
        else:
            if verbose:
                print("Git repo found, collecting metadata")
            # Needs to not have slash at end - is known Git repo already, slash
            # interferes with os.path.basename/dirname
            if data_path.endswith("/"):
                data_path = data_path[:-1]
            # Set output_dir to new dir named with HEAD commit hash
            new_dir_name = "{}_{}".format(os.path.basename(data_path), str(repo.head.commit))
            output_dir = os.path.join(os.path.dirname(data_path), new_dir_name)
            # Delete temp dir after archival
            delete_dir = True

    # If dir and not already BDBag, make BDBag
    if os.path.isdir(data_path) and not bdbag_api.is_bag(data_path):
        if verbose:
            print("Creating BDBag out of directory '{}'".format(data_path))
        # If output_dir specified, copy data to output dir first
        if output_dir:
            if verbose:
                print("Copying data to '{}' before creating BDBag".format(output_dir))
            output_dir = os.path.abspath(output_dir)
            # If shutil.copytree is called when the destination dir is inside the source dir
            # by more than one layer, it will recurse infinitely.
            # (e.g. /source => /source/dir/dest)
            # Exactly one layer is technically okay (e.g. /source => /source/dest),
            # but it's easier to forbid all parent/child dir cases.
            # Check for this error condition by determining if output_dir is a child
            # of data_path.
            if os.path.commonpath([data_path]) == os.path.commonpath([data_path, output_dir]):
                raise ValueError("The output_dir ('{}') must not be in data_path ('{}')"
                                 .format(output_dir, data_path))
            try:
                shutil.copytree(data_path, output_dir)
            except FileExistsError:
                raise FileExistsError(
                    ("The output directory must not exist. "
                     "Delete '{}' to submit.\nYou can set delete_dir=True "
                     "to avoid this issue in the future.").format(output_dir))
            # Process new dir instead of old path
            data_path = output_dir
        # If output_dir not specified, never delete data dir
        else:
            delete_dir = False
        # Make bag
        bdbag_api.make_bag(data_path, **kwargs)
        if not bdbag_api.is_bag(data_path):
            raise ValueError("Failed to create BDBag from {}".format(data_path))
        elif verbose:
            print("BDBag created at '{}'".format(data_path))

    # If dir (must be BDBag at this point), archive
    if os.path.isdir(data_path):
        if verbose:
            print("Archiving BDBag at '{}' using '{}'".format(data_path, CONFIG["ARCHIVE_FORMAT"]))
        new_data_path = bdbag_api.archive_bag(data_path, CONFIG["ARCHIVE_FORMAT"])
        if verbose:
            print("BDBag archived to file '{}'".format(new_data_path))
        # If requested (e.g. Git repo copied dir), delete data dir
        if delete_dir:
            if verbose:
                print("Removing old directory '{}'".format(data_path))
            shutil.rmtree(data_path)
        # Overwrite data_path - don't care about dir for uploading
        data_path = new_data_path

    # Validate TableSchema in BDBag
    if verbose:
        print("Validating TableSchema in BDBag '{}'".format(data_path))
    validation_res = ts_validate(data_path, schema=schema)
    if not validation_res["is_valid"]:
        return {
            "success": False,
            "error": ("TableSchema invalid due to the following errors: \n{}\n"
                      .format(validation_res["error"]))
        }
    elif verbose:
        print("Validation successful")

    # Now BDBag is archived file
    # Set path on destination
    dest_path = "{}{}".format(self.flow_info["cfde_ep_path"], os.path.basename(data_path))

    # If doing dry run, stop here before making Flow input
    if dry_run:
        return {
            "success": True,
            "message": "Dry run validated successfully. No data was transferred."
        }

    # Set up Flow
    if verbose:
        print("Creating input for Flow")
    # If local EP exists (and not force_http), can use Transfer
    # Local EP fetched now in case GCP started after Client creation
    local_endpoint = globus_sdk.LocalGlobusConnectPersonal().endpoint_id
    if local_endpoint and not force_http:
        if verbose:
            print("Using local Globus Connect Personal Endpoint '{}'".format(local_endpoint))
        # Populate Transfer fields in Flow
        flow_id = self.flow_info["flow_id"]
        flow_input = {
            "source_endpoint_id": local_endpoint,
            "source_path": data_path,
            "cfde_ep_id": self.flow_info["cfde_ep_id"],
            "cfde_ep_path": dest_path,
            "cfde_ep_url": self.flow_info["cfde_ep_url"],
            "is_directory": False,
            "test_sub": test_sub,
            "dcc_id": dcc_id
        }
        if catalog_id:
            flow_input["catalog_id"] = str(catalog_id)
        if server:
            flow_input["server"] = server
    # Otherwise, we must PUT the BDBag on the server
    else:
        if verbose:
            print("No Globus Endpoint detected; using HTTP upload instead")
        headers = {}
        self.__https_authorizer.set_authorization_header(headers)
        data_url = "{}{}".format(self.flow_info["cfde_ep_url"], dest_path)

        with open(data_path, 'rb') as bag_file:
            bag_data = bag_file.read()

        put_res = requests.put(data_url, data=bag_data, headers=headers)

        # Regenerate headers on 401
        if put_res.status_code == 401:
            self.__https_authorizer.handle_missing_authorization()
            self.__https_authorizer.set_authorization_header(headers)
            put_res = requests.put(data_url, data=bag_data, headers=headers)

        # Error message on failed PUT or any unexpected response
        if put_res.status_code >= 300:
            return {
                "success": False,
                "error": ("Could not upload BDBag to server (error {}):\n{}"
                          .format(put_res.status_code, put_res.content))
            }
        elif put_res.status_code != 200:
            print("Warning: HTTP upload returned status code {}, "
                  "which was unexpected.".format(put_res.status_code))

        if verbose:
            print("Upload successful to '{}': {} {}".format(
                data_url, put_res.status_code, put_res.content))

        flow_id = self.flow_info["flow_id"]
        flow_input = {
            "source_endpoint_id": False,
            "data_url": data_url,
            "test_sub": test_sub,
            "dcc_id": dcc_id
        }
        if catalog_id:
            flow_input["catalog_id"] = str(catalog_id)
        if server:
            flow_input["server"] = server

    if verbose:
        print("Flow input populated:\n{}".format(json.dumps(flow_input, indent=4, sort_keys=True)))
    # Get Flow scope
    flow_def = self.flow_client.get_flow(flow_id)
    flow_scope = flow_def["globus_auth_scope"]
    # Start Flow
    if verbose:
        print("Starting Flow - Submitting data")
    try:
        flow_res = self.flow_client.run_flow(flow_id, flow_scope, flow_input)
    except globus_sdk.GlobusAPIError as e:
        if e.http_status == 404:
            return {
                "success": False,
                "error": ("Could not access ingest Flow. Are you in the CFDE DERIVA "
                          "Demo Globus Group? Check your membership or apply for access "
                          "here: https://app.globus.org/groups/a437abe3-c9a4-11e9-b441-"
                          "0efb3ba9a670/about")
            }
        else:
            raise
    self.last_flow_run = {
        "flow_id": flow_id,
        "flow_instance_id": flow_res["action_id"]
    }
    if verbose:
        print("Flow started successfully.")

    return {
        "success": True,
        "message": ("Started DERIVA ingest Flow\nFlow ID: {}\nFlow Instance ID: {}"
                    .format(flow_id, flow_res["action_id"])),
        "flow_id": flow_id,
        "flow_instance_id": flow_res["action_id"],
        "cfde_dest_path": dest_path,
        "http_link": "{}{}".format(self.flow_info["cfde_ep_url"], dest_path),
        "globus_web_link": ("https://app.globus.org/file-manager?origin_id={}&origin_path={}"
                            .format(self.flow_info["cfde_ep_id"], os.path.dirname(dest_path)))
    }

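# Usage sketch for start_deriva_flow() above. The client instance, path, and
# DCC ID are hypothetical; dry_run=True exercises only the validate-and-bag
# steps without starting the Globus Automate Flow.
def submit_dry_run(client, data_path):
    result = client.start_deriva_flow(data_path,
                                      dcc_id='cfde_registry_dcc:example',
                                      dry_run=True, verbose=True)
    if not result["success"]:
        raise RuntimeError(result.get("error"))
    return result["message"]
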
def update_bag(outdir):
    bdbag_api.make_bag(outdir, update=True)
    return bdbag_api.archive_bag(outdir, "zip")

def main():
    sys.stderr.write('\n')
    args, is_bag, is_file = parse_cli()
    path = os.path.abspath(args.bag_path)
    archive = None
    temp_path = None
    error = None
    result = 0

    try:
        if not is_file:
            # do not try to create or update the bag if the user just wants to validate or complete an existing bag
            if not ((args.validate or args.validate_profile or args.resolve_fetch) and
                    not (args.update and bdb.is_bag(path))):
                if args.checksum and 'all' in args.checksum:
                    args.checksum = ['md5', 'sha1', 'sha256', 'sha512']
                # create or update the bag depending on the input arguments
                bdb.make_bag(path,
                             args.checksum,
                             args.update,
                             args.skip_manifests,
                             args.prune_manifests,
                             BAG_METADATA if BAG_METADATA else None,
                             args.metadata_file,
                             args.remote_file_manifest,
                             args.config_file)
        # otherwise just extract the bag if it is an archive and no other conflicting options specified
        elif not (args.validate or args.validate_profile or args.resolve_fetch):
            bdb.extract_bag(path)
            sys.stderr.write('\n')
            return result

        if args.resolve_fetch:
            if args.validate == 'full':
                sys.stderr.write(ASYNC_TRANSFER_VALIDATION_WARNING)
            bdb.resolve_fetch(path, True if args.resolve_fetch == 'all' else False)

        if args.validate:
            if is_file:
                temp_path = bdb.extract_bag(path, temp=True)
            bdb.validate_bag(temp_path if temp_path else path,
                             True if args.validate == 'fast' else False,
                             args.config_file)

        if args.archiver:
            archive = bdb.archive_bag(path, args.archiver)
        if archive is None and is_file:
            archive = path

        if args.validate_profile:
            if is_file:
                if not temp_path:
                    temp_path = bdb.extract_bag(path, temp=True)
            profile = bdb.validate_bag_profile(temp_path if temp_path else path)
            bdb.validate_bag_serialization(archive if archive else path, profile)

    except Exception as e:
        result = 1
        error = "Error: %s" % bdbag.get_named_exception(e)
    finally:
        if temp_path:
            bdb.cleanup_bag(os.path.dirname(temp_path))
        if result != 0:
            sys.stderr.write("\n%s" % error)
        sys.stderr.write('\n')

    return result