def test_filename_unicode_normalization(self):
    # Filenames may arrive with a different Unicode normalization form than
    # the one recorded in the manifest. Covering both directions portably is
    # difficult because OS X rewrites *every* filename to an NFD variant, so
    # this test covers the basic case: the payload file is created on disk
    # under its NFD name (which HFS+ leaves untouched), the manifests are
    # then rewritten in NFC, and the bag must still validate.
    test_filename = "Núñez Papers.txt"
    nfd_name = unicodedata.normalize("NFD", test_filename)

    os.makedirs(j(self.tmpdir, "unicode-normalization"))
    payload_path = j(self.tmpdir, "unicode-normalization", nfd_name)
    with open(payload_path, "w") as payload:
        payload.write("This is a test filename written using NFD normalization\n")

    bag = bagit.make_bag(self.tmpdir)
    bag.save()
    self.assertTrue(bag.is_valid())

    # Rewrite every manifest as if its whole content had been normalized
    # to NFC somewhere along the way.
    for manifest_path in bag.manifest_files():
        manifest_text = slurp_text_file(manifest_path)
        nfc_bytes = unicodedata.normalize("NFC", manifest_text).encode("utf-8")
        with open(manifest_path, "wb") as manifest:
            manifest.write(nfc_bytes)

    # The manifests changed on disk, so the tag manifests must be refreshed.
    for algorithm in bag.algorithms:
        bagit._make_tagmanifest_file(algorithm, bag.path, encoding=bag.encoding)

    # Load the bag again from scratch and confirm it is still valid.
    bag = bagit.Bag(self.tmpdir)
    self.assertTrue(bag.is_valid())
def write_tag_manifests(self):
    """Regenerate the tag manifest for each distinct algorithm of this bag.

    Every algorithm in ``self.algorithms`` is processed once (duplicates are
    collapsed). A failure for one algorithm is logged, including the
    traceback, and does not prevent the remaining algorithms from being
    processed.

    Returns:
        Always ``True``, even when one or more tag manifests could not be
        written; callers that need to detect failure must inspect the log.
    """
    for alg in set(self.algorithms):
        try:
            bagit._make_tagmanifest_file(alg, self.path)
        except Exception:
            # Catch Exception rather than using a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still propagate. The traceback
            # is logged because the failure is not necessarily a permission
            # problem, whatever the (historical) message says.
            LOGGER.exception("Do not have permission to overwrite tag manifests")
    return True
def write_tag_manifests(self):
    """Regenerate the tag manifest for each distinct algorithm of this bag.

    Every algorithm in ``self.algs`` is processed once (duplicates are
    collapsed). A failure for one algorithm is logged, including the
    traceback, and does not prevent the remaining algorithms from being
    processed.

    Returns:
        Always ``True``, even when one or more tag manifests could not be
        written; callers that need to detect failure must inspect the log.
    """
    for alg in set(self.algs):
        try:
            bagit._make_tagmanifest_file(alg, self.path)
        except Exception:
            # Catch Exception rather than using a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still propagate. The traceback
            # is logged because the failure is not necessarily a permission
            # problem, whatever the (historical) message says.
            LOGGER.exception(
                "Do not have permission to overwrite tag manifests")
    return True
def test_filename_unicode_normalization(self):
    # We need to handle cases where the Unicode normalization form of a
    # filename has changed in-transit. This is hard to do portably in both
    # directions because OS X normalizes *all* filenames to an NFD variant,
    # so we start with a basic test: the payload file is written to the
    # filesystem using the NFD form (which will not be altered when saved to
    # an HFS+ filesystem), the manifests are then rewritten using the NFC
    # form, and we confirm that this does not cause the bag to fail.
    test_filename = 'Núñez Papers.txt'
    test_filename_nfd = unicodedata.normalize('NFD', test_filename)

    os.makedirs(j(self.tmpdir, 'unicode-normalization'))

    with open(j(self.tmpdir, 'unicode-normalization', test_filename_nfd), 'w') as f:
        f.write(
            'This is a test filename written using NFD normalization\n')

    bag = bagit.make_bag(self.tmpdir)
    bag.save()

    self.assertTrue(bag.is_valid())

    # Now rewrite each manifest file as if it had been normalized to NFC:
    for m_f in bag.manifest_files():
        contents = slurp_text_file(m_f)
        normalized_bytes = unicodedata.normalize('NFC', contents).encode('utf-8')
        with open(m_f, 'wb') as f:
            f.write(normalized_bytes)

    # The manifests changed, so the tag manifests have to be regenerated:
    for alg in bag.algs:
        bagit._make_tagmanifest_file(alg, bag.path, encoding=bag.encoding)

    # Now we'll reload the whole thing:
    bag = bagit.Bag(self.tmpdir)
    self.assertTrue(bag.is_valid())
def move_tag_files(source_directory, tags):
    """Move tag files into the ``tags`` folder of their bag and refresh its tag manifest.

    Each tag file name must contain a six-digit identifier delimited by
    underscores (``_123456_``); the bag is expected at
    ``<source_directory>/<identifier>``.

    Args:
        source_directory: Directory containing one bag directory per
            identifier.
        tags: Iterable of tag file paths to move.

    Tag files without an identifier, or whose bag directory does not exist,
    are reported on stdout and skipped. The md5 tag manifest is rewritten
    after every move; not ideal, but not the worst.
    """
    for tag_file in tags:
        id_match = re.search(r'_(\d{6})_', tag_file)
        if id_match is None:
            # Without an identifier there is no way to pick a bag; skip the
            # file instead of aborting the whole batch with AttributeError.
            print('ummm, no cms id in {}'.format(tag_file))
            continue
        cms_id = id_match.group(1)

        # Absolute, because the working directory changes below and a
        # relative path would then be resolved against the bag itself.
        object_bag = os.path.abspath(os.path.join(source_directory, cms_id))

        # tag file for object that didn't get bagged
        if not os.path.exists(object_bag):
            print('ummm, no bag for {}'.format(cms_id))
            continue

        tag_dir = os.path.join(object_bag, 'tags')
        os.makedirs(tag_dir, exist_ok=True)
        shutil.move(tag_file, tag_dir)

        # Update the tag manifest. Messy, but takes advantage of the bagit
        # setup, which is run from inside the bag directory.
        cur_dir = os.getcwd()
        os.chdir(object_bag)
        try:
            bagit._make_tagmanifest_file("md5", object_bag)
        finally:
            # Always restore the caller's working directory, even on error.
            os.chdir(cur_dir)
def save(self, processes=1, manifests=False):
    """
    Write pending changes to the bag metadata (self.info) back to disk.

    Payload manifests are left alone unless ``manifests`` is True. Pass True
    after adding, modifying or removing files under the data directory so
    that the manifests, fetch.txt and Payload-Oxum are regenerated. It is
    off by default so that a plain save can never silently produce fresh
    manifests for a corrupted bag.

    ``processes`` sets how many processes are used when checksums are
    recalculated.

    Raises BagError when no path is set, when the bag directory is not
    accessible, or when files/directories lack the required permissions.
    """
    # --- Preconditions ---------------------------------------------------
    if not self.path:
        raise BagError(_('Bag.save() called before setting the path!'))

    required_access = os.R_OK | os.W_OK | os.X_OK
    if not os.access(self.path, required_access):
        raise BagError(_('Cannot save bag to non-existent or inaccessible directory %s') % self.path)

    not_writable = _can_bag(self.path)
    if not_writable:
        LOGGER.error(_("Missing write permissions for the following directories and files:\n%s"), not_writable)
        raise BagError(_("Missing permissions to move all files and directories"))

    dirs_unreadable, files_unreadable = _can_read(self.path)
    if dirs_unreadable or files_unreadable:
        if dirs_unreadable:
            LOGGER.error(_("The following directories do not have read permissions:\n%s"), dirs_unreadable)
        if files_unreadable:
            LOGGER.error(_("The following files do not have read permissions:\n%s"), files_unreadable)
        raise BagError(_("Read permissions are required to calculate file fixities"))

    # --- Update the bag --------------------------------------------------
    # The helper functions work relative to the bag directory, so switch
    # into it and make sure the previous directory is restored afterwards.
    previous_cwd = os.path.abspath(os.path.curdir)
    try:
        os.chdir(self.path)

        if manifests:
            # Rebuild the payload manifests (local and remote entries)
            self._sync_remote_entries_with_existing_fetch()
            validate_remote_entries(self.remote_entries, self.path)
            local_bytes, local_files = make_manifests(
                'data', processes, algorithms=self.algorithms, encoding=self.encoding)
            remote_bytes, remote_files = update_manifests_from_remote(self.remote_entries, self.path)
            payload_bytes = local_bytes + remote_bytes
            payload_files = local_files + remote_files

            # Refresh fetch.txt
            _make_fetch_file(self.path, self.remote_entries)

            # Refresh Payload-Oxum
            LOGGER.info(_('Updating Payload-Oxum in %s'), self.tag_file_name)
            self.info['Payload-Oxum'] = '%s.%s' % (payload_bytes, payload_files)

        _make_tag_file(self.tag_file_name, self.info)

        # Manifest and bag-info files may have changed: redo tag manifests
        for algorithm in self.algorithms:
            _make_tagmanifest_file(algorithm, self.path, encoding=self.encoding)

        # Pick up the manifests as they now exist on disk
        self._load_manifests()
    except Exception:
        LOGGER.error(_("An error occurred updating bag in %s"), self.path)
        raise
    finally:
        os.chdir(previous_cwd)
def make_bag(bag_dir, bag_info=None, processes=1, checksums=None, encoding='utf-8', remote_entries=None):
    """
    Convert a given directory into a bag, in place.

    The existing content of ``bag_dir`` is moved into a new ``data``
    sub-directory, then the manifests, fetch.txt, bagit.txt, bag-info.txt
    and tag manifests are written.

    Args:
        bag_dir: Directory to convert; made absolute before use.
        bag_info: Optional dictionary of arbitrary key/value pairs for the
            bag-info.txt metadata file. 'Bagging-Date' and
            'Bag-Software-Agent' are filled in only when absent;
            'Payload-Oxum' is always set. A dictionary passed in is updated
            in place.
        processes: Passed to ``make_manifests``.
        checksums: Checksum algorithms to use; ``DEFAULT_CHECKSUMS`` when
            None.
        encoding: Encoding passed to ``make_manifests``.
        remote_entries: Remote file entries, passed to the remote-entry
            validation, manifest update and fetch.txt helpers.

    Returns:
        A ``BDBag`` for ``bag_dir``.

    Raises:
        RuntimeError: If the current directory is located inside ``bag_dir``
            or ``bag_dir`` is not an existing directory.
        BagError: If files or directories lack the required permissions.
    """
    if checksums is None:
        checksums = DEFAULT_CHECKSUMS

    bag_dir = os.path.abspath(bag_dir)
    cwd = os.path.abspath(os.path.curdir)

    # Compare against the separator-terminated prefix: a plain startswith()
    # would also reject unrelated siblings (bagging /tmp/foo while standing
    # in /tmp/foobar). os.path.join(x, '') appends exactly one separator and
    # leaves a filesystem root unchanged.
    if cwd != bag_dir and cwd.startswith(os.path.join(bag_dir, '')):
        raise RuntimeError(_('Bagging a parent of the current directory is not supported'))

    LOGGER.info(_("Creating bag for directory %s"), bag_dir)

    if not os.path.isdir(bag_dir):
        LOGGER.error(_("Bag directory %s does not exist"), bag_dir)
        raise RuntimeError(_("Bag directory %s does not exist") % bag_dir)

    # FIXME: we should do the permissions checks before changing directories
    old_dir = os.path.abspath(os.path.curdir)

    try:
        # TODO: These two checks are currently redundant since an unreadable directory will also
        #       often be unwritable, and this code will require review when we add the option to
        #       bag to a destination other than the source. It would be nice if we could avoid
        #       walking the directory tree more than once even if most filesystems will cache it
        unbaggable = _can_bag(bag_dir)

        if unbaggable:
            LOGGER.error(_("Unable to write to the following directories and files:\n%s"), unbaggable)
            raise BagError(_("Missing permissions to move all files and directories"))

        unreadable_dirs, unreadable_files = _can_read(bag_dir)

        if unreadable_dirs or unreadable_files:
            if unreadable_dirs:
                LOGGER.error(_("The following directories do not have read permissions:\n%s"),
                             unreadable_dirs)
            if unreadable_files:
                LOGGER.error(_("The following files do not have read permissions:\n%s"),
                             unreadable_files)
            raise BagError(_("Read permissions are required to calculate file fixities"))

        LOGGER.info(_("Creating data directory"))

        # FIXME: if we calculate full paths we won't need to deal with changing directories
        os.chdir(bag_dir)
        cwd = os.getcwd()
        temp_data = tempfile.mkdtemp(dir=cwd)

        # Move everything except the temporary directory itself into it,
        # then rename it so it becomes the payload directory.
        for f in os.listdir('.'):
            if os.path.abspath(f) == temp_data:
                continue
            new_f = os.path.join(temp_data, f)
            LOGGER.info(_('Moving %(source)s to %(destination)s'), {'source': f, 'destination': new_f})
            os.rename(f, new_f)

        LOGGER.info(_('Moving %(source)s to %(destination)s'), {'source': temp_data, 'destination': 'data'})
        os.rename(temp_data, 'data')

        # permissions for the payload directory should match those of the
        # original directory
        os.chmod('data', os.stat(cwd).st_mode)

        validate_remote_entries(remote_entries, bag_dir)
        total_bytes, total_files = make_manifests('data', processes, algorithms=checksums, encoding=encoding)
        total_bytes_remote, total_files_remote = update_manifests_from_remote(remote_entries, bag_dir)
        total_bytes += total_bytes_remote
        total_files += total_files_remote

        _make_fetch_file(bag_dir, remote_entries)

        LOGGER.info(_("Creating bagit.txt"))
        txt = """BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8\n"""
        with open_text_file('bagit.txt', 'w') as bagit_file:
            bagit_file.write(txt)

        LOGGER.info(_("Creating bag-info.txt"))
        if bag_info is None:
            bag_info = {}

        # allow 'Bagging-Date' and 'Bag-Software-Agent' to be overridden
        if 'Bagging-Date' not in bag_info:
            bag_info['Bagging-Date'] = date.strftime(date.today(), "%Y-%m-%d")
        if 'Bag-Software-Agent' not in bag_info:
            bag_info['Bag-Software-Agent'] = \
                'BDBag version: %s (Bagit version: %s) <%s>' % (VERSION, BAGIT_VERSION, PROJECT_URL)

        bag_info['Payload-Oxum'] = "%s.%s" % (total_bytes, total_files)
        _make_tag_file('bag-info.txt', bag_info)

        # Tag manifests are written as UTF-8 regardless of ``encoding``,
        # matching the Tag-File-Character-Encoding declared in bagit.txt.
        for c in checksums:
            _make_tagmanifest_file(c, bag_dir, encoding='utf-8')
    except Exception:
        LOGGER.error(_("An error occurred creating a bag in %s"), bag_dir)
        raise
    finally:
        os.chdir(old_dir)

    return BDBag(bag_dir)