Esempio n. 1
0
    def _validate_structure_tag_files(self):
        """
        Check that the required tag files are present in the bag's base
        directory: at least one manifest file plus "bagit.txt".

        Note: we deviate somewhat from v0.96 of the spec in that it allows
        other files and directories to be present in the base directory.
        """
        manifests_present = any(True for _mf in self.manifest_files())
        if not manifests_present:
            raise BagValidationError(_('No manifest files found'))
        if not self._root.relpath("bagit.txt").exists():
            raise BagValidationError(_('Expected %s to contain "bagit.txt"') % self._root)
Esempio n. 2
0
    def _validate_entries(self, processes, callback=None):
        """
        Verify that the actual file contents match the recorded hashes stored
        in the manifest files.

        :param int processes: number of worker processes; 1 hashes serially
                              in the current process (required for callbacks)
        :param callback: optional progress function invoked as
                         callback(done_count, total_count); returning a falsy
                         value aborts validation with BaggingInterruptedError
        :raises BagValidationError: if any computed hash differs from the
                                    manifest entry
        """
        errors = list()

        # On POSIX, forked workers inherit signal handlers; the initializer
        # resets them so interrupts are handled by the parent process only.
        if os.name == 'posix':
            worker_init = posix_multiprocessing_worker_initializer
        else:
            worker_init = None

        args = ((self.path,
                 self.normalized_filesystem_names.get(rel_path, rel_path),
                 hashes, self.algorithms)
                for rel_path, hashes in self.entries.items())

        try:
            if processes == 1:
                hash_results = []
                # Fix: snake_case name and enumerate() instead of a manual
                # counter (was `totalHashes` / `count += 1`).
                total_hashes = len(self.entries)
                for count, arg in enumerate(args, 1):
                    hash_results.append(_calc_hashes(arg))
                    if callback:
                        if not callback(count, total_hashes):
                            raise BaggingInterruptedError(
                                "Bag validation interrupted!")

            else:
                pool = None
                try:
                    pool = multiprocessing.Pool(
                        processes if processes else None,
                        initializer=worker_init)
                    hash_results = pool.map(_calc_hashes, args)
                finally:
                    if pool:
                        pool.terminate()
        except BaggingInterruptedError:
            raise
        # Any unhandled exceptions are probably fatal; log and propagate.
        # Fix: was a bare `except:`, which also intercepted SystemExit and
        # KeyboardInterrupt just to log them.
        except Exception:
            LOGGER.error(_("Unable to calculate file hashes for %s"), self)
            raise

        for rel_path, f_hashes, hashes in hash_results:
            for alg, computed_hash in f_hashes.items():
                stored_hash = hashes[alg]
                if stored_hash.lower() != computed_hash:
                    e = ChecksumMismatch(rel_path, alg, stored_hash.lower(),
                                         computed_hash)
                    LOGGER.warning(force_unicode(e))
                    errors.append(e)

        if errors:
            raise BagValidationError(_("Bag validation failed"), errors)
Esempio n. 3
0
def update_manifests_from_remote(remote_entries, bag_path=".", encoding='utf-8'):
    """
    Append manifest lines for remote (fetch.txt) files to the existing
    manifest-<alg>.txt files.

    :param dict remote_entries: mapping of filename -> metadata dict with a
        'length' key plus one entry per supported checksum algorithm
    :param str bag_path: unused here; retained for interface compatibility
    :param str encoding: text encoding used when appending to the manifests
    :returns: (total_bytes, num_files) totals for the remote entries
    """
    if not remote_entries:
        return 0, 0

    LOGGER.info(_('Generating manifest lines for remote files'))
    num_files = 0
    total_bytes = 0
    entries = []
    # Fix: dropped the redundant `if remote_entries:` (already handled by the
    # early return above) and the unnecessary OrderedDict wrapper -- sorting
    # the items is all that is needed for deterministic manifest order.
    for filename, values in sorted(remote_entries.items()):
        checksums = []
        num_files += 1
        remote_size = int(values['length'])
        total_bytes += remote_size
        for alg in CHECKSUM_ALGOS:
            if alg in values:
                checksums.append(
                    (alg, values[alg], _denormalize_filename(_decode_filename(filename)), remote_size))
        entries.append(checksums)

    # At this point we have a list of tuples which start with the algorithm name:
    manifest_data = {}
    for batch in entries:
        for entry in batch:
            manifest_data.setdefault(entry[0], []).append(entry[1:])

    for algorithm, values in manifest_data.items():
        manifest_filename = 'manifest-%s.txt' % algorithm

        with open_text_file(manifest_filename, 'a+', encoding=encoding) as manifest:
            # _byte_count is carried in the tuples but not written to the file
            for digest, filename, _byte_count in values:
                manifest.write("%s  %s\n" % (digest, _encode_filename(filename)))

    return total_bytes, num_files
Esempio n. 4
0
    def _validate_completeness(self):
        """
        Verify that the file manifests and fetch.txt agree with the files
        actually present in the payload (data) directory.
        """
        only_in_manifests, only_on_fs, only_in_fetch = \
            self.compare_manifests_with_fs_and_fetch()

        errors = []
        for path in only_in_manifests:
            err = FileMissing(path)
            LOGGER.warning(force_unicode(err))
            errors.append(err)
        for path in only_on_fs:
            err = UnexpectedFile(path)
            LOGGER.warning(force_unicode(err))
            errors.append(err)
        for path in only_in_fetch:
            # Non-fatal according to the spec, so warn without recording
            # an error.
            LOGGER.warning(force_unicode(UnexpectedRemoteFile(path)))

        if errors:
            raise BagValidationError(_("Bag validation failed"), errors)
Esempio n. 5
0
def parse_version(version):
    """
    Parse a BagIt version string of the form "MAJOR.MINOR" into a tuple
    of integers, raising BagError for anything that does not parse.
    """
    parts = version.split(".", 1)
    try:
        return tuple(map(int, parts))
    except ValueError:
        raise BagError(
            _("Bag version numbers must be MAJOR.MINOR numbers, not %s") %
            version)
Esempio n. 6
0
    def __init__(self, bagpath, name=None, location=None):
        """
        Open the bag at the given location.

        :param bagpath:  either a Path instance or a filepath to the bag's
                         root directory.  A Path instance must be used if the
                         bag is in a serialized form.
        :type bagpath:   str or Path
        :param str name:  the name of the bag (i.e. its nominal base directory);
                          if None, the name will be the basename of bagpath
        :param str location:  the location of the bag; this can be provided
                          when bagpath is a Path instance to specify the
                          source location of the Path's filesystem.
        """
        if not bagpath:
            raise BagError(_("path to bag root directory not provided"))
        if not isinstance(bagpath, Path):
            # Wrap a plain filesystem path in a Path anchored at its parent.
            bagpath = bagpath.rstrip("/")
            parent = os.path.dirname(bagpath) or "."
            bagname = os.path.basename(bagpath)
            if not location:
                location = bagpath
            bagpath = Path(fs.osfs.OSFS(parent), _unicode(bagname), parent + "/")

        self._name = name if name else os.path.basename(bagpath.path)
        self._root = bagpath.subfspath()

        path = location
        if not path:
            path = _unicode("/" + self._name)
            if path == "/":
                # super __init__ strips a trailing slash, so double it up
                path = "//"
        super(ReadOnlyBag, self).__init__(path)
Esempio n. 7
0
    def __init__(self, bagpath, name=None):
        """
        Open the bag at the given location.

        :param bagpath:  either a Path instance or a filepath to the bag's
                         root directory.  A Path instance must be used if the
                         bag is in a serialized form.
        :type bagpath:   str or Path
        :param str name:  the name of the bag; defaults to the basename of
                          bagpath
        """
        if not bagpath:
            raise BagError(_("path to bag root directory not provided"))
        if not isinstance(bagpath, Path):
            # Wrap a plain filesystem path in a Path anchored at its parent.
            bagpath = bagpath.rstrip("/")
            parent = os.path.dirname(bagpath) or "."
            bagname = os.path.basename(bagpath)
            bagpath = Path(fs.osfs.OSFS(parent), _unicode(bagname), parent + "/")

        self._name = name if name else os.path.basename(bagpath.path)
        self._root = bagpath.subfspath()

        path = _unicode("/" + self._name)
        if path == "/":
            # super __init__ strips a trailing slash, so double it up
            path = "//"
        super(ReadOnlyBag, self).__init__(path)
Esempio n. 8
0
    def _open(self):
        """
        Open the bagit.txt file and load its tags, including the required
        version and encoding, then load the optional info tag file and the
        manifests.

        This overrides the implementation inherited from bagit.Bag so that
        all file access goes through the bag's root filesystem abstraction
        (self._root).

        :raises BagError: if bagit.txt is missing, a required tag is absent,
                          or the version is malformed or unsupported
        :raises BagValidationError: if the declared tag-file encoding is not
                          recognized by the codecs module
        """
        bagit_file = _unicode("bagit.txt")
        bagit_file_path = self._root.relpath(bagit_file)

        if not self._root.fs.isfile(bagit_file):
            raise BagError(_("Expected bagit.txt does not exist: %s") % bagit_file_path)

        self.tags = tags = _load_tag_file(bagit_file_path)

        required_tags = ('BagIt-Version', 'Tag-File-Character-Encoding')
        missing_tags = [i for i in required_tags if i not in tags]
        if missing_tags:
            raise BagError(_("Missing required tag in bagit.txt: %s") % ', '.join(missing_tags))

        # To avoid breaking existing code we'll leave self.version as the string
        # and parse it into a numeric version_info tuple. In version 2.0 we can
        # break that.
        self._version = tags['BagIt-Version']

        try:
            self.version_info = tuple(int(i) for i in self._version.split('.', 1))
        except ValueError:
            raise BagError(_('Bag version numbers must be MAJOR.MINOR numbers, not %s') % self._version)

        # The name of the bag-info tag file changed at v0.96 of the spec.
        if (0, 93) <= self.version_info <= (0, 95):
            self.tag_file_name = "package-info.txt"
        elif (0, 96) <= self.version_info < (2, ):
            self.tag_file_name = "bag-info.txt"
        else:
            raise BagError(_("Unsupported bag version: %s") % self._version)

        # Confirm the declared tag-file encoding is one Python can decode.
        self.encoding = tags['Tag-File-Character-Encoding']
        try:
            codecs.lookup(self.encoding)
        except LookupError:
            raise BagValidationError(_("Unsupported encoding: %s") % self.encoding)

        # The info file is optional; self.info is left untouched if absent.
        info_file_path = self._root.relpath(self.tag_file_name)
        if info_file_path.exists():
            self.info = _load_tag_file(info_file_path, encoding=self.encoding)

        self._load_manifests()
Esempio n. 9
0
    def save(self, processes=1, manifests=False):
        """
        Persist changes made to the bag metadata (self.info).

        This implementation always raises BagError because the bag was
        opened read-only; the parameters exist only to keep the signature
        compatible with the writable implementation.
        """
        raise BagError(_("Unable to save as the bag was opened read-only"))
Esempio n. 10
0
    def _validate_bagittxt(self):
        """
        Verify that bagit.txt conforms to the specification; in particular,
        that it does not begin with a UTF-8 byte-order mark.
        """
        bagit_file_path = self._root.relpath("bagit.txt")

        # Open in binary mode so the raw leading bytes can be inspected
        # for the UTF-8 BOM (3 bytes; we read 4 to be safe).
        with open_bin_file(bagit_file_path) as bagit_file:
            leading_bytes = bagit_file.read(4)
        if leading_bytes.startswith(codecs.BOM_UTF8):
            raise BagValidationError(_("bagit.txt must not contain a byte-order mark"))
Esempio n. 11
0
    def _validate_oxum(self):
        """
        Validate the Payload-Oxum tag (total byte count and file count)
        against the actual payload contents, if the tag is present.
        """
        oxum = self.info.get('Payload-Oxum')
        if oxum is None:
            return

        # If multiple Payload-Oxum tags are present (a bad idea), use the
        # first one listed in bag-info.txt.
        if isinstance(oxum, list):
            LOGGER.warning(_('bag-info.txt defines multiple Payload-Oxum values!'))
            oxum = oxum[0]

        oxum_byte_count, oxum_file_count = oxum.split('.', 1)

        if not (oxum_byte_count.isdigit() and oxum_file_count.isdigit()):
            raise BagError(_("Malformed Payload-Oxum value: %s") % oxum)

        expected_bytes = int(oxum_byte_count)
        expected_files = int(oxum_file_count)

        found_bytes = 0
        found_files = 0
        for payload_file in self.payload_files():
            details = self._root.fs.getinfo(payload_file, namespaces=['details'])
            found_bytes += details.size
            found_files += 1

        if (expected_files, expected_bytes) != (found_files, found_bytes):
            raise BagValidationError(
                _('Payload-Oxum validation failed.'
                  ' Expected %(oxum_file_count)d files and %(oxum_byte_count)d bytes'
                  ' but found %(found_file_count)d files and %(found_byte_count)d bytes') % {
                        'found_file_count': found_files,
                        'found_byte_count': found_bytes,
                        'oxum_file_count': expected_files,
                        'oxum_byte_count': expected_bytes,
                    }
            )
Esempio n. 12
0
    def _validate_fetch(self):
        """Validate the fetch.txt file

        Raises `BagError` for errors and otherwise returns no value
        """
        for url, file_size, filename in self.fetch_entries():
            # fetch_entries will raise a BagError for unsafe filenames
            # so at this point we will check only that the URL is minimally
            # well formed:
            parsed_url = urlparse(url)

            # only check for a scheme component since per the spec the URL field
            # is actually a URI per RFC3986 (https://tools.ietf.org/html/rfc3986)
            #
            # Fix: the previous test was `if not all(parsed_url.scheme):`.
            # all() over a string iterates its characters and returns True for
            # both empty and non-empty schemes, so the check could never fire
            # and schemeless URLs were silently accepted.
            if not parsed_url.scheme:
                raise BagError(_('Malformed URL in fetch.txt: %s') % url)
Esempio n. 13
0
def _calculate_file_hashes(full_path, f_hashers):
    """
    Compute digests for a single file.

    :param full_path: path of the file to hash
    :param dict f_hashers: mapping of algorithm name -> hashlib hasher object
    :returns: dict mapping algorithm name -> hexdigest string
    :raises BagValidationError: if the file cannot be read
    """
    LOGGER.info(_("Verifying checksum for file %s"), full_path)

    try:
        with open_bin_file(full_path) as f:
            # Feed every hasher one block at a time so the file is read
            # only once regardless of how many algorithms are in use.
            while True:
                block = f.read(HASH_BLOCK_SIZE)
                if not block:
                    break
                for hasher in f_hashers.values():
                    hasher.update(block)
    except (OSError, IOError) as e:
        raise BagValidationError(_("Could not read %(filename)s: %(error)s") % {
            'filename': full_path,
            'error': _unicode(e),
        })

    return {alg: h.hexdigest() for alg, h in f_hashers.items()}
Esempio n. 14
0
    def _validate_contents(self, processes=1, fast=False, completeness_only=False, callback=None):
        """
        Validate the bag payload.

        :param int processes: worker processes for checksum verification
        :param bool fast: only verify the Payload-Oxum byte/file counts
        :param bool completeness_only: skip per-file checksum verification
        :param callback: progress callback forwarded to _validate_entries
        """
        if fast:
            # Fast validation is only possible when an oxum was recorded;
            # perform the quick file-count + size check and stop there.
            if not self.has_oxum():
                raise BagValidationError(_('Fast validation requires bag-info.txt to include Payload-Oxum'))
            self._validate_oxum()
            return

        self._validate_completeness()

        if not completeness_only:
            self._validate_entries(processes, callback)
Esempio n. 15
0
def _make_tag_file(bag_info_path, bag_info):
    """
    Write a BagIt tag file (e.g. bag-info.txt) from a dict of headers.

    :param bag_info_path: path of the tag file to (over)write
    :param dict bag_info: mapping of header name -> value or list of values;
        nested dictionary values are unsupported and skipped with a warning
    """
    headers = sorted(bag_info.keys())
    with open_text_file(bag_info_path, 'w') as f:
        for h in headers:
            values = bag_info[h]
            if not isinstance(values, list):
                values = [values]
            for txt in values:
                if isinstance(txt, dict):
                    # Fix: interpolate AFTER the _() call so the constant
                    # template is the gettext msgid; previously the already-
                    # formatted string was passed to _(), defeating lookup.
                    LOGGER.warning(
                        _("Nested dictionary content not supported in tag file: [%s]. "
                          "Skipping element \"%s\" with value %s.") %
                        (bag_info_path, h, json.dumps(txt)))
                    continue
                # strip CR, LF and CRLF so they don't mess up the tag file
                # (character class replaces the old r'\n|\r|(\r\n)' pattern,
                # whose third alternative was unreachable)
                txt = re.sub(r'[\r\n]', '', force_unicode(txt))
                f.write("%s: %s\n" % (h, txt))
Esempio n. 16
0
    def fetch_entries(self):
        """Load fetch.txt if present and iterate over its contents.

        Yields (url, size, filename) tuples.

        Raises BagError for errors such as an unsafe filename referencing
        data outside of the bag directory.
        """
        fetch_file_path = self._root.relpath("fetch.txt")

        if not fetch_file_path.isfile():
            return

        with open_text_file(fetch_file_path, 'r', encoding=self.encoding) as fetch_file:
            for line in fetch_file:
                # Each line is: URL <whitespace> SIZE <whitespace> FILENAME
                url, file_size, filename = line.strip().split(None, 2)

                if self._path_is_dangerous(filename):
                    raise BagError(_('Path "%(payload_file)s" in "%(source_file)s" is unsafe') % {
                        'payload_file': filename,
                        'source_file': str(fetch_file_path),
                    })

                yield url, file_size, filename
Esempio n. 17
0
def make_bag(bag_dir, bag_info=None, processes=1, checksums=None, encoding='utf-8', remote_entries=None):
    """
    Convert a given directory into a bag. You can pass in arbitrary
    key/value pairs to put into the bag-info.txt metadata file as
    the bag_info dictionary.

    :param str bag_dir: directory to convert, in place, into a bag
    :param dict bag_info: extra metadata written to bag-info.txt
    :param int processes: worker processes used to generate manifests
    :param checksums: list of checksum algorithm names; defaults to
                      DEFAULT_CHECKSUMS
    :param str encoding: encoding used when writing the payload manifests
    :param dict remote_entries: remote file entries recorded in fetch.txt
                      and in the manifests
    :returns: a BDBag instance opened on the newly created bag
    :raises RuntimeError: for a missing/unsupported bag directory
    :raises BagError: for insufficient filesystem permissions
    """

    if checksums is None:
        checksums = DEFAULT_CHECKSUMS

    bag_dir = os.path.abspath(bag_dir)
    cwd = os.path.abspath(os.path.curdir)

    # Refuse to bag an ancestor of the current directory: the payload move
    # below would relocate the directory we are currently standing in.
    if cwd.startswith(bag_dir) and cwd != bag_dir:
        raise RuntimeError(_('Bagging a parent of the current directory is not supported'))

    LOGGER.info(_("Creating bag for directory %s"), bag_dir)

    if not os.path.isdir(bag_dir):
        LOGGER.error(_("Bag directory %s does not exist"), bag_dir)
        raise RuntimeError(_("Bag directory %s does not exist") % bag_dir)

    # FIXME: we should do the permissions checks before changing directories
    old_dir = os.path.abspath(os.path.curdir)

    try:
        # TODO: These two checks are currently redundant since an unreadable directory will also
        #       often be unwritable, and this code will require review when we add the option to
        #       bag to a destination other than the source. It would be nice if we could avoid
        #       walking the directory tree more than once even if most filesystems will cache it

        unbaggable = _can_bag(bag_dir)

        if unbaggable:
            LOGGER.error(_("Unable to write to the following directories and files:\n%s"), unbaggable)
            raise BagError(_("Missing permissions to move all files and directories"))

        unreadable_dirs, unreadable_files = _can_read(bag_dir)

        if unreadable_dirs or unreadable_files:
            if unreadable_dirs:
                LOGGER.error(_("The following directories do not have read permissions:\n%s"),
                             unreadable_dirs)
            if unreadable_files:
                LOGGER.error(_("The following files do not have read permissions:\n%s"),
                             unreadable_files)
            raise BagError(_("Read permissions are required to calculate file fixities"))
        else:
            LOGGER.info(_("Creating data directory"))

            # FIXME: if we calculate full paths we won't need to deal with changing directories
            os.chdir(bag_dir)
            cwd = os.getcwd()
            # Stage the payload in a temp dir first so a partially-moved tree
            # is never left sitting at the final "data" name.
            temp_data = tempfile.mkdtemp(dir=cwd)

            for f in os.listdir('.'):
                if os.path.abspath(f) == temp_data:
                    continue
                new_f = os.path.join(temp_data, f)
                LOGGER.info(_('Moving %(source)s to %(destination)s'), {'source': f, 'destination': new_f})
                os.rename(f, new_f)

            LOGGER.info(_('Moving %(source)s to %(destination)s'), {'source': temp_data, 'destination': 'data'})
            os.rename(temp_data, 'data')

            # permissions for the payload directory should match those of the
            # original directory
            os.chmod('data', os.stat(cwd).st_mode)

            validate_remote_entries(remote_entries, bag_dir)
            # Local payload totals plus remote totals feed the Payload-Oxum.
            total_bytes, total_files = make_manifests('data', processes, algorithms=checksums, encoding=encoding)
            total_bytes_remote, total_files_remote = update_manifests_from_remote(remote_entries, bag_dir)
            total_bytes += total_bytes_remote
            total_files += total_files_remote

            _make_fetch_file(bag_dir, remote_entries)

            LOGGER.info(_("Creating bagit.txt"))
            txt = """BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8\n"""
            with open_text_file('bagit.txt', 'w') as bagit_file:
                bagit_file.write(txt)

            LOGGER.info(_("Creating bag-info.txt"))
            if bag_info is None:
                bag_info = {}

            # allow 'Bagging-Date' and 'Bag-Software-Agent' to be overidden
            if 'Bagging-Date' not in bag_info:
                bag_info['Bagging-Date'] = date.strftime(date.today(), "%Y-%m-%d")
            if 'Bag-Software-Agent' not in bag_info:
                bag_info['Bag-Software-Agent'] = \
                    'BDBag version: %s (Bagit version: %s) <%s>' % (VERSION, BAGIT_VERSION, PROJECT_URL)
            bag_info['Payload-Oxum'] = "%s.%s" % (total_bytes, total_files)
            _make_tag_file('bag-info.txt', bag_info)

            # NOTE(review): the tagmanifests (and bagit.txt above) are always
            # written as UTF-8 regardless of the `encoding` parameter, while
            # the payload manifests honor it -- confirm this asymmetry is
            # intentional.
            for c in checksums:
                _make_tagmanifest_file(c, bag_dir, encoding='utf-8')
    except Exception:
        LOGGER.error(_("An error occurred creating a bag in %s"), bag_dir)
        raise
    finally:
        # Always restore the caller's working directory.
        os.chdir(old_dir)

    return BDBag(bag_dir)
Esempio n. 18
0
 def __str__(self):
     # Describe the fetch.txt entry that has no corresponding manifest entry.
     message = _("%s exists in fetch.txt but is not in manifest")
     return message % self.path
Esempio n. 19
0
 def _validate_structure_payload_directory(self):
     """Ensure the mandatory payload directory ("data") exists in the bag."""
     if self._root.fs.isdir("data"):
         return
     raise BagValidationError(_('Expected data directory does not exist in %s') % str(self._root))
Esempio n. 20
0
    def _load_manifests(self):
        """
        Read every manifest (and, for v0.97+, tagmanifest) file in the bag
        and populate self.entries with an OrderedDict mapping each listed
        path to an {algorithm: hash} dict, recording each algorithm seen
        in self.algorithms.

        :raises BagError: for unsafe manifest paths, or for duplicate
                          entries (fatal for same-value duplicates only on
                          bags >= v1.0; always fatal for conflicting values)
        """
        self.entries = OrderedDict()
        manifests = list(self.manifest_files())

        if self.version_info >= (0, 97):
            # v0.97+ requires that optional tagfiles are verified.
            manifests += list(self.tagmanifest_files())

        for manifest_filename in manifests:
            # Fix: was `if not manifest_filename.find("tagmanifest-") is -1:`,
            # which compares an int with `is` (works only via CPython's
            # small-int cache and raises a SyntaxWarning on modern Pythons).
            if "tagmanifest-" in manifest_filename:
                search = "tagmanifest-"
            else:
                search = "manifest-"
            # The algorithm name is whatever sits between the prefix and ".txt".
            alg = os.path.basename(manifest_filename).replace(search, "").replace(".txt", "")
            self.algorithms.append(alg)

            manifest_filename = self._root.relpath(manifest_filename)

            manifest_file = None
            try:
                manifest_file = open_text_file(manifest_filename, 'r', encoding=self.encoding)

                if manifest_file.encoding.startswith('UTF'):
                    # We'll check the first character to see if it's a BOM:
                    if manifest_file.read(1) == UNICODE_BYTE_ORDER_MARK:
                        # We'll skip it either way by letting line decoding
                        # happen at the new offset but we will issue a warning
                        # for UTF-8 since the presence of a BOM is contrary to
                        # the BagIt specification:
                        if manifest_file.encoding == 'UTF-8':
                            LOGGER.warning(_('%s is encoded using UTF-8 but contains an unnecessary'
                                             ' byte-order mark, which is not in compliance with the'
                                             ' BagIt RFC'),
                                           manifest_file.name)
                    else:
                        # Pretend the first read never happened; seek() may not
                        # be available on this stream, so close and reopen
                        # instead of rewinding.
                        manifest_file.close()
                        manifest_file = open_text_file(manifest_filename, 'r', encoding=self.encoding)

                for line in manifest_file:
                    line = line.strip()

                    # Ignore blank lines and comments.
                    if line == "" or line.startswith("#"):
                        continue

                    entry = line.split(None, 1)

                    # Format is FILENAME *CHECKSUM
                    if len(entry) != 2:
                        LOGGER.error(_("%(bag)s: Invalid %(algorithm)s manifest entry: %(line)s"),
                                     {'bag': self, 'algorithm': alg, 'line': line})
                        continue

                    entry_hash = entry[0]
                    # A leading "*" marks binary mode in some tools; strip it.
                    entry_path = os.path.normpath(entry[1].lstrip("*"))
                    entry_path = _decode_filename(entry_path)

                    if self._path_is_dangerous(entry_path):
                        raise BagError(
                            _('Path "%(payload_file)s" in manifest "%(manifest_file)s" is unsafe') % {
                                'payload_file': entry_path,
                                'manifest_file': manifest_file.name,
                            }
                        )

                    entry_hashes = self.entries.setdefault(entry_path, OrderedDict())

                    if alg in entry_hashes:
                        warning_ctx = {'bag': self, 'algorithm': alg, 'filename': entry_path}
                        if entry_hashes[alg] == entry_hash:
                            msg = _('%(bag)s: %(algorithm)s manifest lists %(filename)s'
                                    ' multiple times with the same value')
                            # v1.0+ treats even same-value duplicates as fatal.
                            if self.version_info >= (1, ):
                                raise BagError(msg % warning_ctx)
                            else:
                                LOGGER.warning(msg, warning_ctx)
                        else:
                            raise BagError(_('%(bag)s: %(algorithm)s manifest lists %(filename)s'
                                             ' multiple times with conflicting values') % warning_ctx)

                    entry_hashes[alg] = entry_hash

            finally:
                if manifest_file:
                    manifest_file.close()
                    manifest_file = None

        self.normalized_manifest_names.update(
            (normalize_unicode(i), i) for i in self.entries.keys()
        )
Esempio n. 21
0
    def save(self, processes=1, manifests=False):
        """
        save will persist any changes that have been made to the bag
        metadata (self.info).

        If you have modified the payload of the bag (added, modified,
        removed files in the data directory) and want to regenerate manifests
        set the manifests parameter to True. The default is False since you
        wouldn't want a save to accidentally create a new manifest for
        a corrupted bag.

        If you want to control the number of processes that are used when
        recalculating checksums use the processes parameter.

        :param int processes: worker processes used when regenerating manifests
        :param bool manifests: when True, rebuild manifests, fetch.txt, and
                               the Payload-Oxum from the current payload
        :raises BagError: if the bag path is unset, inaccessible, or lacks
                          the read/write permissions needed to save
        """
        # Error checking
        if not self.path:
            raise BagError(_('Bag.save() called before setting the path!'))

        if not os.access(self.path, os.R_OK | os.W_OK | os.X_OK):
            raise BagError(_('Cannot save bag to non-existent or inaccessible directory %s') % self.path)

        unbaggable = _can_bag(self.path)
        if unbaggable:
            LOGGER.error(_("Missing write permissions for the following directories and files:\n%s"),
                         unbaggable)
            raise BagError(_("Missing permissions to move all files and directories"))

        unreadable_dirs, unreadable_files = _can_read(self.path)
        if unreadable_dirs or unreadable_files:
            if unreadable_dirs:
                LOGGER.error(_("The following directories do not have read permissions:\n%s"),
                             unreadable_dirs)
            if unreadable_files:
                LOGGER.error(_("The following files do not have read permissions:\n%s"),
                             unreadable_files)
            raise BagError(_("Read permissions are required to calculate file fixities"))

        # Change working directory to bag directory so helper functions work
        old_dir = os.path.abspath(os.path.curdir)
        try:
            os.chdir(self.path)

            # Generate new manifest files
            if manifests:
                self._sync_remote_entries_with_existing_fetch()
                validate_remote_entries(self.remote_entries, self.path)
                # Local payload totals plus remote totals feed Payload-Oxum.
                total_bytes, total_files = make_manifests('data', processes,
                                                          algorithms=self.algorithms,
                                                          encoding=self.encoding)
                total_bytes_remote, total_files_remote = update_manifests_from_remote(self.remote_entries, self.path)
                total_bytes += total_bytes_remote
                total_files += total_files_remote

                # Update fetch.txt
                _make_fetch_file(self.path, self.remote_entries)

                # Update Payload-Oxum
                LOGGER.info(_('Updating Payload-Oxum in %s'), self.tag_file_name)
                self.info['Payload-Oxum'] = '%s.%s' % (total_bytes, total_files)

            _make_tag_file(self.tag_file_name, self.info)

            # Update tag-manifest for changes to manifest & bag-info files
            for alg in self.algorithms:
                _make_tagmanifest_file(alg, self.path, encoding=self.encoding)

            # Reload the manifests
            self._load_manifests()

        except Exception:
            LOGGER.error(_("An error occurred updating bag in %s"), self.path)
            raise

        finally:
            # Always restore the caller's working directory.
            os.chdir(old_dir)
Esempio n. 22
0
def make_manifests(data_dir,
                   processes,
                   algorithms=DEFAULT_CHECKSUMS,
                   encoding='utf-8',
                   remote=None,
                   strict=False):
    """
    Generate manifest-<alg>.txt files for the payload directory, optionally
    including remote file entries.

    :param str data_dir: payload directory to walk
    :param int processes: number of worker processes for hashing
    :param algorithms: iterable of checksum algorithm names
    :param str encoding: encoding used when writing the manifest files
    :param dict remote: optional remote entries (filename -> metadata dict)
    :param bool strict: require every algorithm to cover the same files/bytes
    :returns: (total_bytes, total_files) suitable for Payload-Oxum
    :raises RuntimeError: in strict mode, when per-algorithm totals disagree
    """
    LOGGER.info(
        _('Using %(process_count)d processes to generate manifests: %(algorithms)s'
          ), {
              'process_count': processes,
              'algorithms': ', '.join(algorithms)
          })

    manifest_line_generator = partial(generate_manifest_lines,
                                      algorithms=algorithms)

    if processes > 1:
        pool = multiprocessing.Pool(processes=processes)
        checksums = pool.map(manifest_line_generator, _walk(data_dir))
        pool.close()
        pool.join()
    else:
        checksums = [manifest_line_generator(i) for i in _walk(data_dir)]

    # At this point we have a list of tuples which start with the algorithm name:
    manifest_data = {}
    for batch in checksums:
        for entry in batch:
            manifest_data.setdefault(entry[0], []).append(entry[1:])

    # These will be keyed on the algorithm name so we can perform sanity checks
    # below to catch failures in the hashing process:
    num_files = defaultdict(lambda: 0)
    total_bytes = defaultdict(lambda: 0)

    remote_entries = []
    if remote:
        LOGGER.info(_('Generating manifest lines for remote files'))
        sorted_remote_entries = OrderedDict(
            sorted(remote.items(), key=lambda t: t[0]))
        for filename, values in sorted_remote_entries.items():
            # Fix: this per-file list previously shadowed the outer
            # `checksums` variable, which was confusing to read.
            remote_checksums = []
            remote_size = int(values['length'])
            for alg in CHECKSUM_ALGOS:
                if alg in values:
                    remote_checksums.append(
                        (alg, values[alg],
                         _denormalize_filename(_decode_filename(filename)),
                         remote_size))
            remote_entries.append(remote_checksums)

    for batch in remote_entries:
        for entry in batch:
            manifest_data.setdefault(entry[0], []).append(entry[1:])

    file_entries = {}
    for algorithm, values in manifest_data.items():
        manifest_filename = 'manifest-%s.txt' % algorithm

        with open_text_file(manifest_filename, 'w',
                            encoding=encoding) as manifest:
            for digest, filename, byte_count in values:
                manifest.write("%s  %s\n" %
                               (digest, _encode_filename(filename)))
                num_files[algorithm] += 1
                total_bytes[algorithm] += byte_count
                file_entries[filename] = byte_count

    # We'll use sets of the values for the error checks and eventually return the payload oxum values:
    byte_value_set = set(total_bytes.values())
    file_count_set = set(num_files.values())

    # allow a bag with an empty payload
    if not byte_value_set and not file_count_set:
        return 0, 0

    if strict:
        if len(file_count_set) != 1:
            raise RuntimeError(
                _('Expected the same number of files for each checksum'))

        if len(byte_value_set) != 1:
            raise RuntimeError(
                _('Expected the same number of bytes for each checksum'))

        return byte_value_set.pop(), file_count_set.pop()

    # Non-strict mode: dedupe by filename so every distinct payload file is
    # counted once regardless of how many algorithms listed it.
    byte_total = file_total = 0
    for file_size in file_entries.values():
        file_total += 1
        byte_total += file_size

    return byte_total, file_total