def main(source, dest, splitter, partition=False):
    """Split part of the payload of the bag at ``source`` into a bag at ``dest``.

    ``payload_filter`` divides the source payload into the files selected by
    ``splitter`` and the remainder. Manifests for the selected files are
    written to ``dest`` and the files are moved there; non-manifest tag files
    are copied alongside them.

    Args:
        source: Path of the existing bag; the process changes directory into it.
        dest: Directory for the new bag; created when it does not exist.
        splitter: Selection criterion forwarded to ``payload_filter``.
        partition: When true, the source bag's manifests are rewritten for the
            remaining payload and, once the new bag validates, the split files
            are deleted and the source bag is saved.

    Raises:
        SystemExit: When no payload file matches ``splitter``.
    """
    source_bag = bagit.Bag(source)
    new_payload, old_payload = payload_filter(source_bag, splitter)
    if not new_payload:
        print('No files to split!')
        # Same effect as the interactive-only ``exit()`` helper, without
        # depending on the ``site`` module having injected it.
        raise SystemExit
    if not os.path.exists(dest):
        os.mkdir(dest)
    os.chdir(source)
    new_oxum = write_manifests(new_payload, source_bag.algorithms, dest)
    move_files(new_payload, dest)

    logging.info('Writing manifests')
    old_oxum = None
    if partition:
        for manifest in source_bag.manifest_files():
            os.remove(manifest)
        old_oxum = write_manifests(
            old_payload, source_bag.algorithms, source_bag.path)

    for tag_file in source_bag.tagfile_entries():
        if 'manifest' not in tag_file:
            logging.info('Copying tag file {} to {}'.format(tag_file, dest))
            shutil.copy(tag_file, dest)

    new_bag = bagit.Bag(dest)
    new_bag.info['Payload-Oxum'] = new_oxum
    if partition:
        source_bag.info['Payload-Oxum'] = old_oxum
    new_bag.save()

    logging.info('Validating bag {}'.format(new_bag.path))
    if not new_bag.validate():
        logging.error('bag {} is invalid'.format(new_bag.path))
    elif partition:
        # Only drop the originals once the new bag is known to be good.
        del_files(new_payload)
        source_bag.save()

    logging.info('Validating bag {}'.format(source_bag.path))
    source_bag.validate()
def test_validate_optional_tagfile(self):
    """Validate an optional tag file listed in tagmanifest-md5.txt.

    Covers three states: a wrong checksum (validation error), the correct
    checksum (valid), and the tag file missing from disk (validation error).
    """
    bagit.make_bag(self.tmpdir, checksums=['md5'])
    tagdir = tempfile.mkdtemp(dir=self.tmpdir)
    tagfile_path = j(tagdir, "tagfile")
    with open(tagfile_path, "w") as tagfile:
        tagfile.write("test")

    # str.replace returns a new string, so the result must be kept for the
    # manifest entry to actually use forward slashes.
    relpath = tagfile_path.replace(self.tmpdir + os.sep, "")
    relpath = relpath.replace("\\", "/")

    tagmanifest_path = j(self.tmpdir, "tagmanifest-md5.txt")
    with open(tagmanifest_path, "w") as tagman:
        # Incorrect checksum.
        tagman.write("8e2af7a0143c7b8f4de0b3fc90f27354 " + relpath + "\n")
    bag = bagit.Bag(self.tmpdir)
    self.assertRaises(bagit.BagValidationError, self.validate, bag)

    hasher = hashlib.new("md5")
    contents = slurp_text_file(tagfile_path).encode('utf-8')
    hasher.update(contents)
    with open(tagmanifest_path, "w") as tagman:
        tagman.write(hasher.hexdigest() + " " + relpath + "\n")
    bag = bagit.Bag(self.tmpdir)
    self.assertTrue(self.validate(bag))

    # Missing tagfile.
    os.remove(tagfile_path)
    bag = bagit.Bag(self.tmpdir)
    self.assertRaises(bagit.BagValidationError, self.validate, bag)
def test_validate_optional_tagfile_in_directory(self):
    """Validate an optional tag file that lives in a nested tag directory.

    Covers three states: a wrong checksum (validation error), the correct
    checksum (valid), and the tag file missing from disk (validation error).
    """
    bagit.make_bag(self.tmpdir, checksums=["md5"])
    tagdir = tempfile.mkdtemp(dir=self.tmpdir)
    tagfolder = j(tagdir, "tagfolder")
    if not os.path.exists(tagfolder):
        os.makedirs(tagfolder)
    tagfile_path = j(tagfolder, "tagfile")
    with open(tagfile_path, "w") as tagfile:
        tagfile.write("test")

    # str.replace returns a new string, so the result must be kept for the
    # manifest entry to actually use forward slashes.
    relpath = tagfile_path.replace(self.tmpdir + os.sep, "")
    relpath = relpath.replace("\\", "/")

    tagmanifest_path = j(self.tmpdir, "tagmanifest-md5.txt")
    with open(tagmanifest_path, "w") as tagman:
        # Incorrect checksum.
        tagman.write("8e2af7a0143c7b8f4de0b3fc90f27354 " + relpath + "\n")
    bag = bagit.Bag(self.tmpdir)
    self.assertRaises(bagit.BagValidationError, self.validate, bag)

    hasher = hashlib.new("md5")
    with open(tagfile_path, "r") as tf:
        contents = tf.read().encode("utf-8")
    hasher.update(contents)
    with open(tagmanifest_path, "w") as tagman:
        tagman.write(hasher.hexdigest() + " " + relpath + "\n")
    bag = bagit.Bag(self.tmpdir)
    self.assertTrue(self.validate(bag))

    # Missing tagfile.
    os.remove(tagfile_path)
    bag = bagit.Bag(self.tmpdir)
    self.assertRaises(bagit.BagValidationError, self.validate, bag)
def callback(ch, method, properties, body):
    """Create a SIP for the 'jobname/timestamp' in ``body`` and announce it.

    The SIP is built and validated as a bag, copied to the DLS drop area,
    validated again, moved to the DLS watch folder, and copied to HDFS, and
    then an index message is sent. Any failure along the way is logged and
    reported through ``send_error_message`` instead of being raised.
    """
    try:
        logger.info("Message received: %s." % body)
        if not verify_message(body):
            raise Exception("Could not verify message: %s" % body)

        sip_dir = create_sip(body)
        logger.debug("Created SIP: %s" % sip_dir)
        # Check the freshly created SIP as a Bagit bag.
        if not bagit.Bag(sip_dir).validate():
            raise Exception("Invalid Bagit: %s" % sip_dir)

        logger.debug("Moving %s to %s." % (body, settings.DLS_DROP))
        dls = copy_to_dls(body)
        if not bagit.Bag(dls).validate():
            raise Exception("Invalid Bagit after copy: %s" % dls)

        logger.debug("Moving %s to %s." % (dls, settings.DLS_WATCH))
        watch_target = "%s/%s" % (settings.DLS_WATCH, os.path.basename(body))
        shutil.move(dls, watch_target)
        gztar = copy_to_hdfs(sip_dir)
        logger.debug("SIP tarball at hdfs://%s" % gztar)
        logger.debug("Sending message to '%s': %s" %
                     (settings.SUBMITTED_QUEUE_NAME, body))
        send_index_message(body)
    except Exception as failure:
        reason = str(failure)
        logger.error("%s [%s]" % (reason, body))
        send_error_message("%s|%s" % (body, reason))
def test_validate_optional_tagfile(self):
    """Validate an optional tag file listed in tagmanifest-md5.txt.

    Covers three states: a wrong checksum (validation error), the correct
    checksum (valid), and the tag file missing from disk (validation error).
    """
    bagit.make_bag(self.tmpdir)
    tagdir = tempfile.mkdtemp(dir=self.tmpdir)
    tagfile_path = os.path.join(tagdir, "tagfile")
    with open(tagfile_path, "w") as tagfile:
        tagfile.write("test")

    # str.replace returns a new string, so the result must be kept for the
    # manifest entry to actually use forward slashes.
    relpath = tagfile_path.replace(self.tmpdir + os.sep, "")
    relpath = relpath.replace("\\", "/")

    tagmanifest_path = os.path.join(self.tmpdir, "tagmanifest-md5.txt")
    with open(tagmanifest_path, "w") as tagman:
        # Incorrect checksum.
        tagman.write("8e2af7a0143c7b8f4de0b3fc90f27354 " + relpath + "\n")
    bag = bagit.Bag(self.tmpdir)
    self.assertRaises(bagit.BagValidationError, bag.validate)

    hasher = hashlib.new("md5")
    # Read through a context manager so the handle is closed deterministically.
    with open(tagfile_path, "rb") as tagfile:
        hasher.update(tagfile.read())
    with open(tagmanifest_path, "w") as tagman:
        tagman.write(hasher.hexdigest() + " " + relpath + "\n")
    bag = bagit.Bag(self.tmpdir)
    self.assertTrue(bag.validate())

    # Missing tagfile.
    os.remove(tagfile_path)
    bag = bagit.Bag(self.tmpdir)
    self.assertRaises(bagit.BagValidationError, bag.validate)
def test_save_baginfo(self):
    """Check that bag-info edits survive save() and leave the bag valid."""
    bag = bagit.make_bag(self.tmpdir)

    bag.info["foo"] = "bar"
    bag.save()
    bag = bagit.Bag(self.tmpdir)
    self.assertEqual(bag.info["foo"], "bar")
    self.assertTrue(bag.is_valid())

    bag.info["x"] = ["a", "b", "c"]
    bag.save()
    b = bagit.Bag(self.tmpdir)
    self.assertEqual(b.info["x"], ["a", "b", "c"])
    self.assertTrue(bag.is_valid())
    # Also validate the instance re-read from disk, not only the one that
    # performed the save.
    self.assertTrue(b.is_valid())
def test_multiple_oxum_values(self):
    """Fast validation passes after a second Payload-Oxum line is appended."""
    bagit.make_bag(self.tmpdir)
    info_path = os.path.join(self.tmpdir, "bag-info.txt")
    with open(info_path, "a") as info_file:
        info_file.write('Payload-Oxum: 7.7\n')
    reopened = bagit.Bag(self.tmpdir)
    self.assertTrue(reopened.validate(fast=True))
def bag_derivatives(taskid, update_manifest=True):
    """Bag every entry in a task's derivative directory.

    Each entry is opened as a bag, or turned into one when opening raises
    ``bagit.BagError``. Its External-Description and External-Identifier
    fields are set and the bag is saved; an ``IOError`` during save is
    logged and the loop continues.

    args:
        taskid: cybercommons generated task id for derivative
        update_manifest: boolean to update bag manifest - default is True

    Returns the task URL built from ``base_url`` and ``taskid``.
    """
    derivative_root = "{0}/oulib_tasks/{1}/derivative/".format(basedir, taskid)
    for entry in os.listdir(derivative_root):
        entry_path = "{0}/{1}".format(derivative_root, entry)
        try:
            derivative_bag = bagit.Bag(entry_path)
        except bagit.BagError:
            derivative_bag = bagit.make_bag(entry_path)

        derivative_bag.info['External-Description'] = entry
        derivative_bag.info['External-Identifier'] = 'University of Oklahoma Libraries'

        try:
            derivative_bag.save(manifests=update_manifest)
        except IOError as save_error:
            logging.error(save_error)

    # point back at task
    task_url = "{0}/oulib_tasks/{1}".format(base_url, taskid)
    return task_url
def test_validate_slow_without_oxum_extra_file(self):
    """Full validation fails when bag-info.txt is gone and a payload file is added."""
    bagit.make_bag(self.tmpdir)
    os.remove(j(self.tmpdir, "bag-info.txt"))

    extra_path = j(self.tmpdir, "data", "extra_file")
    with open(extra_path, "w") as extra:
        extra.write("foo")

    reopened = bagit.Bag(self.tmpdir)
    self.assertRaises(
        bagit.BagValidationError, self.validate, reopened, fast=False)
def bag_load(self, bag_path):
    """Extract a zipped bag into a temporary directory and inspect it.

    The extracted bag and its temporary directory are stored in the
    module-level globals ``bag`` and ``tempdir``.

    Returns:
        ``(False, False)`` when the extracted directory cannot be opened as
        a bag; ``(bag.info, encrypted_files)`` when the bag is valid, where
        ``encrypted_files`` lists encrypted PDF and ZIP payload files;
        ``(False, bad_files)`` when the bag is invalid, where ``bad_files``
        lists the paths that failed checksum verification.
    """
    global bag
    global tempdir
    tempdir = tempfile.TemporaryDirectory()
    # Close the archive handle as soon as extraction is done.
    with ZipFile(bag_path) as archive:
        archive.extractall(path=tempdir.name)
    try:
        bag = bagit.Bag(path=tempdir.name)
    except Exception:
        # Best effort: anything that prevents opening the bag counts as
        # "not a bag", but KeyboardInterrupt/SystemExit are not swallowed.
        return False, False

    if not bag.is_valid():
        bad_files = []
        try:
            bag.validate()
        except bagit.BagValidationError as e:
            bad_files = [
                d.path for d in e.details
                if isinstance(d, bagit.ChecksumMismatch)
            ]
        return False, bad_files

    encrypted_files = []
    for payload_file in bag.payload_files():
        full_path = os.path.join(tempdir.name, payload_file)
        suffix = Path(payload_file).suffix
        if suffix == '.pdf':
            with open(full_path, mode='rb') as pdf:
                if PyPDF2.PdfFileReader(pdf).isEncrypted:
                    encrypted_files.append(payload_file)
        elif suffix == '.zip':
            with ZipFile(full_path) as zippy:
                members = zippy.namelist()
                if not members:
                    # An empty archive has nothing to probe.
                    continue
                try:
                    # Opening a member of an encrypted archive without a
                    # password raises RuntimeError.
                    with zippy.open(members[0]):
                        pass
                except RuntimeError:
                    encrypted_files.append(payload_file)
    return bag.info, encrypted_files
def test_cron(self):
    """Run the discover and deliver cron jobs over the reference bags.

    After delivery no archive may remain in ACCESSIONING_STARTED, the number
    of DELIVERED archives must match the delivery queue contents, and each
    delivered bag must carry an 'Origin' entry.
    """
    for ref in BAGS_REF:
        helpers.create_target_bags(
            ref[0], settings.TEST_BAGS_DIR, self.orgs[0],
            username=self.user.username)

    discovered = DiscoverTransfers().do()
    self.assertIsNot(False, discovered)

    validated = Archives.objects.filter(process_status=Archives.VALIDATED)
    for archive in validated:
        archive.process_status = Archives.ACCESSIONING_STARTED
        archive.save()

    delivered = DeliverTransfers().do()
    self.assertIsNot(False, delivered)

    still_accessioning = Archives.objects.filter(
        process_status=Archives.ACCESSIONING_STARTED)
    self.assertEqual(len(still_accessioning), 0)

    delivered_archives = Archives.objects.filter(
        process_status=Archives.DELIVERED)
    self.assertEqual(
        len(delivered_archives),
        len(os.listdir(settings.DELIVERY_QUEUE_DIR)))

    for bag_name in os.listdir(settings.DELIVERY_QUEUE_DIR):
        queued_bag = bagit.Bag(
            os.path.join(settings.DELIVERY_QUEUE_DIR, bag_name))
        self.assertTrue('Origin' in queued_bag.bag_info)
def find_arcp(base_path):
    """Return the bag's External-Identifier when it is an arcp URI.

    Raises:
        Exception: When the External-Identifier is not an arcp URI.
    """
    # First try to find External-Identifier
    external_id = bagit.Bag(base_path).info.get("External-Identifier")
    if not arcp.is_arcp_uri(external_id):
        raise Exception("Can't find External-Identifier")
    return external_id
def read_bag(bag_path):
    """Open the bag at ``bag_path`` and check that it is valid.

    ``bag_path`` may be a bag directory or a zip archive of one. An archive
    is unpacked into a temporary directory that is always removed before
    returning.

    :param bag_path: path of a bag directory or zipped bag
    :return: None
    :raises HsBagitException: when the path does not exist, the file is not
        a zip archive, the archive is empty, or the bag is not valid
    """
    tmpdir = None
    try:
        if not os.path.exists(bag_path):
            raise HsBagitException('Bag does not exist')
        if os.path.isdir(bag_path):
            unpacked_bag_path = bag_path
        else:
            mtype = mimetypes.guess_type(bag_path)
            if mtype[0] != 'application/zip':
                msg = "Expected bag to have MIME type application/zip, " \
                      "but it has {0} instead.".format(mtype[0])
                raise HsBagitException(msg)
            tmpdir = tempfile.mkdtemp()
            # Context manager closes the archive handle even on failure.
            with zipfile.ZipFile(bag_path) as zfile:
                names = zfile.namelist()
                if not names:
                    raise HsBagitException('Bag archive is empty')
                # Zip member names always use '/', whatever os.sep is.
                zroot = names[0].split('/')[0]
                zfile.extractall(tmpdir)
            unpacked_bag_path = os.path.join(tmpdir, zroot)

        bag = bagit.Bag(unpacked_bag_path)
        if not bag.is_valid():
            msg = "Bag is not valid"
            raise HsBagitException(msg)
    finally:
        if tmpdir:
            shutil.rmtree(tmpdir)
def test_mixed_case_checksums(self):
    """A bag still validates when a payload checksum is written in upper case."""
    bag = bagit.make_bag(self.tmpdir)
    hashstr = {}
    # Extract entries only for the payload and ignore
    # entries from the tagmanifest file
    for key in bag.entries:
        if key.startswith('data' + os.sep):
            hashstr = bag.entries[key]
    # next(iter(...)) works on both Python 2 and Python 3 dictionaries.
    hashstr = next(iter(hashstr.values()))

    manifest_path = os.path.join(self.tmpdir, "manifest-md5.txt")
    with open(manifest_path, "r") as manifest_file:
        manifest = manifest_file.read()
    manifest = manifest.replace(hashstr, hashstr.upper())
    with open(manifest_path, "w") as manifest_file:
        manifest_file.write(manifest)

    # Since manifest-md5.txt file is updated, re-calculate its
    # md5 checksum and update it in the tagmanifest-md5.txt file
    hasher = hashlib.new('md5')
    # Hash the raw bytes on disk; hashlib needs bytes on Python 3.
    with open(manifest_path, "rb") as manifest_file:
        hasher.update(manifest_file.read())

    tagmanifest_path = os.path.join(self.tmpdir, "tagmanifest-md5.txt")
    with open(tagmanifest_path, "r") as tagmanifest_file:
        tagmanifest = tagmanifest_file.read()
    tagmanifest = tagmanifest.replace(
        bag.entries['manifest-md5.txt']['md5'], hasher.hexdigest())
    with open(tagmanifest_path, "w") as tagmanifest_file:
        tagmanifest_file.write(tagmanifest)

    bag = bagit.Bag(self.tmpdir)
    self.assertTrue(bag.validate())
def main(bagdir, outdir, parent_ref):
    """Convert the ``data/objects`` tree of a bagit package into a SIP.

    Traverses a bagit package with an object directory, converting it to a
    V6 SIP using existing checksums. Every directory becomes a structural
    object under the object created for its own parent directory, and every
    file becomes an asset carrying the checksums recorded in the manifests.

    Args:
        bagdir: Path of the source bag; the process changes directory into it.
        outdir: Directory where ``<bag name>.zip`` is written.
        parent_ref: Reference under which the top-level object is created.

    Raises:
        ValueError: When a file does not have exactly one manifest entry.
    """
    bag = bagit.Bag(bagdir)
    os.chdir(bag.path)
    bag_path = pathlib.Path(bagdir)
    sip_path = pathlib.Path(outdir, bag_path.name + '.zip')
    sip = siplib.Sip(sip_path, parent_ref)

    # Index the manifest once instead of rescanning it for every file.
    checksums_by_path = {}
    for entry_path, checksums in bag.payload_entries().items():
        checksums_by_path.setdefault(
            pathlib.Path(entry_path), []).append(checksums)

    objects_root = 'data/objects'
    # Reference of the structural object created for each walked directory.
    # os.walk is top-down, so a parent is always registered before its
    # children; looking the parent up keeps siblings from being nested
    # under one another.
    refs_by_dir = {}
    for root, _dirs, files in os.walk(objects_root):
        if root == objects_root:
            dir_ref = sip.add_structobj(
                bag.info['identifier'], parent_ref=parent_ref)
            sip.add_identifier(dir_ref, bag.info['identifier'])
        else:
            dir_ref = sip.add_structobj(
                os.path.split(root)[1],
                parent_ref=refs_by_dir[os.path.dirname(root)])
        refs_by_dir[root] = dir_ref

        for filename in files:
            fpath = pathlib.Path(root) / filename
            matches = checksums_by_path.get(fpath, [])
            if len(matches) != 1:
                raise ValueError(
                    'Expected exactly one manifest entry for {}, found {}'
                    .format(fpath, len(matches)))
            norm_hash = {alg.upper(): val for alg, val in matches[0].items()}
            sip.add_asset_tree(dir_ref, fpath, checksum=norm_hash)

    sip.serialise()
    sip.close()
def update_bag_info(bag_path, data):
    """Write every key/value pair of the dict ``data`` into the bag's info and save it."""
    assert isinstance(data, dict)
    target = bagit.Bag(bag_path)
    for field in data:
        target.info[field] = data[field]
    target.save()
def survey_bag(bag_path):
    """Summarise the bag at ``bag_path``.

    The path is first loaded and validated as an AMI bag. If that fails for
    any reason it is opened as a plain bag and reported as invalid with no
    type information.

    Returns:
        ``[bag_path, bag_type, bag_subtype, bag_size, bag_files, bag_valid]``
        where ``bag_size`` is the total size in bytes and ``bag_files`` the
        count of files matching ``data/**/*.*``.
    """
    try:
        bag = ami_bag(bag_path)
        bag_valid = bag.validate_amibag(metadata=True)
        bag_type = bag.type
        bag_subtype = bag.subtype
    except Exception:
        # Still opened as a generic bag, so a path that is not a bag at all
        # raises from here as it did before.
        bag = bagit.Bag(bag_path)
        bag_valid = False
        bag_type = None
        bag_subtype = None

    # Materialise the matches once so they can be both counted and sized.
    all_files = glob.glob(os.path.join(bag_path, 'data/**/*.*'), recursive=True)
    bag_files = len(all_files)
    bag_size = sum(os.stat(filepath).st_size for filepath in all_files)

    return [bag_path, bag_type, bag_subtype, bag_size, bag_files, bag_valid]
def is_bag(bag_path):
    """Report whether ``bag_path`` can be opened as a bag.

    Returns False when opening raises ``bagit.BagError`` or
    ``bagit.BagValidationError``; otherwise the truthiness of the opened bag.
    """
    try:
        opened = bagit.Bag(bag_path)
    except (bagit.BagError, bagit.BagValidationError):
        return False
    return bool(opened)
def validate_bag(bag_path, fast=False, callback=None, config_file=bdbag.DEFAULT_CONFIG_FILE):
    """Validate the bag at ``bag_path``.

    The number of validation processes comes from the ``bag_processes``
    entry of the configuration's ``bag_config`` section (default 1); a
    single process is used whenever ``callback`` is given.

    Args:
        bag_path: Path of the bag to validate.
        fast: Forwarded to ``Bag.validate``.
        callback: Forwarded to ``Bag.validate``.
        config_file: Configuration file read with ``read_config``.

    Raises:
        bagit.BagIncompleteError: Re-raised after logging a warning.
        bagit.BagValidationError: Raised with a message joining the named
            form of every detail of the original error.
        bagit.InterruptedError: Re-raised after logging a warning.
        RuntimeError: For any other exception raised during validation.
    """
    config = read_config(config_file)
    bag_config = config['bag_config']
    bag_processes = bag_config.get('bag_processes', 1)

    try:
        logger.info("Validating bag: %s", bag_path)
        bag = bagit.Bag(bag_path)
        bag.validate(bag_processes if not callback else 1, fast=fast, callback=callback)
        logger.info("Bag %s is valid", bag_path)
    except bagit.BagIncompleteError as e:
        logger.warning(
            "BagIncompleteError: %s %s", e,
            "This validation error may be transient if the bag contains unresolved remote file references "
            "from a fetch.txt file. In this case the bag is incomplete but not necessarily invalid. "
            "Resolve remote file references (if any) and re-validate.")
        # Bare raise keeps the original traceback intact.
        raise
    except bagit.BagValidationError as e:
        errors = [bdbag.get_named_exception(d) for d in e.details]
        raise bagit.BagValidationError('\nError: '.join(errors))
    except bagit.InterruptedError as e:
        # Logger.warn is a deprecated alias that newer Python versions dropped.
        logger.warning(bdbag.get_named_exception(e))
        raise
    except Exception as e:
        raise RuntimeError("Unhandled exception while validating bag: %s" % e)
def extract_bag(local_bag_archive_path):
    """Unarchive a local bdbag and return it as a ``bagit.Bag``.

    The archive is extracted into its own parent directory, and the bag is
    opened at the archive path minus its extension.
    """
    bag_dir = os.path.splitext(local_bag_archive_path)[0]
    extract_parent = os.path.dirname(bag_dir)
    bdbag_api.extract_bag(local_bag_archive_path, extract_parent)
    return bagit.Bag(bag_dir)
def test_validation_error_details(self):
    """A corrupted README yields one ChecksumMismatch with full details."""
    bagit.make_bag(self.tmpdir, checksums=['md5'],
                   bag_info={'Bagging-Date': '1970-01-01'})

    # Change the first character of the README so its md5 no longer matches.
    readme = j(self.tmpdir, "data", "README")
    original_text = slurp_text_file(readme)
    with open(readme, "w") as r:
        r.write('A' + original_text[1:])

    bag = bagit.Bag(self.tmpdir)
    expected_message = 'data/README md5 validation failed: expected="8e2af7a0143c7b8f4de0b3fc90f27354" found="fd41543285d17e7c29cd953f5cf5b955"'

    try:
        self.validate(bag)
    except bagit.BagValidationError as e:
        self.assertIn(expected_message, str(e))
        self.assertEqual(len(e.details), 1)

        readme_error = e.details[0]
        self.assertEqual(expected_message, str(readme_error))
        self.assertIsInstance(readme_error, bagit.ChecksumMismatch)
        self.assertEqual(readme_error.algorithm, 'md5')
        self.assertEqual(readme_error.path, 'data/README')
        self.assertEqual(readme_error.expected, '8e2af7a0143c7b8f4de0b3fc90f27354')
        self.assertEqual(readme_error.found, 'fd41543285d17e7c29cd953f5cf5b955')
    else:
        self.fail("didn't get BagValidationError")
def populate_data_from_files(self, es_client, transfer_backlog_dir):
    """Index every transfer directory found in the transfer backlog.

    A directory that opens and passes a completeness-only validation as a
    bag and carries an External-Identifier is imported as self-describing;
    every other directory goes through the pipeline-dependent import.
    Plain files, including ``.gitignore``, are skipped.
    """
    backlog = Path(transfer_backlog_dir)
    processed = 0
    for transfer_dir in backlog.glob("*"):
        if transfer_dir.name == ".gitignore" or transfer_dir.is_file():
            continue

        dir_str = str(transfer_dir)
        try:
            bag = bagit.Bag(dir_str)
            bag.validate(
                processes=multiprocessing.cpu_count(), completeness_only=True)
        except bagit.BagError:
            bag = None

        # The last 36 characters of the directory name are used as the UUID.
        transfer_uuid = transfer_dir.name[-36:]

        if bag and "External-Identifier" in bag.info:
            self.info("Importing self-describing transfer {}.".format(
                transfer_uuid))
            size = am.get_bag_size(bag, dir_str)
            importer = _import_self_describing_transfer
        else:
            self.info(
                "Rebuilding known transfer {}.".format(transfer_uuid))
            size = am.get_bag_size(bag, dir_str) if bag else am.walk_dir(dir_str)
            importer = _import_pipeline_dependant_transfer

        importer(self, es_client, self.stdout, transfer_dir, transfer_uuid, size)
        processed += 1

    self.success("{} transfers indexed!".format(processed))
def test_restructure_sip(self, mock_validate, mock_processing_config):
    """Asserts the RestructurePackageRoutine adds expected data and does not replace files."""
    config_path = join(processing_config_fixture_dir, "processingMCP.xml")
    with open(config_path, "r") as config_file:
        mock_processing_config.return_value = config_file.read()
    mock_validate.return_value = {"valid": "true"}

    self.set_process_status(SIP.CREATED)
    total_sips = len(SIP.objects.all())

    # One extraction run per SIP, then one restructure run per SIP.
    for _ in range(total_sips):
        ExtractPackageRoutine().run()

    for _ in range(total_sips):
        message, sip_id = RestructurePackageRoutine().run()
        self.assertEqual(message, "SIP restructured.")
        self.assertEqual(len(sip_id), 1)

    for sip in SIP.objects.filter(process_status=SIP.RESTRUCTURED):
        restructured_bag = bagit.Bag(sip.bag_path)
        self.assertEqual(
            sip.bag_identifier,
            restructured_bag.info["Internal-Sender-Identifier"])
        self.assertTrue(isfile(join(sip.bag_path, "processingMCP.xml")))
        self.assert_files_not_removed(sip)
def test_is_valid(self):
    """is_valid() is true for a fresh bag and false once an extra payload file appears."""
    bagit.make_bag(self.tmpdir)
    reopened = bagit.Bag(self.tmpdir)
    self.assertTrue(reopened.is_valid())

    extra_path = j(self.tmpdir, "data", "extra_file")
    with open(extra_path, "w") as extra:
        extra.write("bar")
    self.assertFalse(reopened.is_valid())
def bag_as_source(srcbag, metadata):
    """Validate the bag at srcbag and use its bag-info.txt to fill in metadata.

    A valid bag is also a complete one, i.e. it has no fetch.txt pointing at
    files missing from the local filesystem, so the is_valid() check is all
    that is needed here.

    Parameters:
        srcbag - The directory of the bag on disk
        metadata - A VersionMetadata object that will be updated with
            metadata from the bag

    Raises BaggerError when the bag is not valid.

    Returns the srcdir for OCFL object content within the bag as it should
    be expressed in the state block.
    """
    source = bagit.Bag(srcbag)
    if not source.is_valid():
        raise BaggerError("Source Bagit bag at %s is not valid" % (srcbag))

    info = source.info
    # Values already set on metadata win; bag-info.txt only fills the gaps.
    if not metadata.id and 'External-Identifier' in info:
        metadata.id = info['External-Identifier']
    if not metadata.created and 'Bagging-Date' in info:
        # FIXME - timezone fudge
        metadata.created = info['Bagging-Date'] + 'T00:00:00Z'
    if not metadata.message and 'External-Description' in info:
        metadata.message = info['External-Description']
    if not metadata.name and 'Contact-Name' in info:
        metadata.name = info['Contact-Name']
    if not metadata.address and 'Contact-Email' in info:
        metadata.address = 'mailto:' + info['Contact-Email']

    return os.path.join(srcbag, 'data')
def test_mixed_case_checksums(self):
    """A bag still validates when a payload checksum is written in upper case."""
    bag = bagit.make_bag(self.tmpdir, checksums=["md5"])

    # Only payload entries are of interest here; tagmanifest entries are
    # skipped. The last payload entry seen is the one that gets used.
    payload_prefix = "data" + os.sep
    payload_entry = {}
    for entry_path, entry_hashes in bag.entries.items():
        if entry_path.startswith(payload_prefix):
            payload_entry = entry_hashes
    checksum = next(iter(payload_entry.values()))

    manifest_path = j(self.tmpdir, "manifest-md5.txt")
    manifest_text = slurp_text_file(manifest_path)
    manifest_text = manifest_text.replace(checksum, checksum.upper())
    with open(manifest_path, "wb") as m:
        m.write(manifest_text.encode("utf-8"))

    # manifest-md5.txt changed, so its md5 in tagmanifest-md5.txt has to be
    # recomputed and replaced.
    hasher = hashlib.new("md5")
    hasher.update(slurp_text_file(manifest_path).encode("utf-8"))

    tagmanifest_path = j(self.tmpdir, "tagmanifest-md5.txt")
    with open(tagmanifest_path, "r") as tagmanifest:
        tagman_contents = tagmanifest.read()
    old_digest = bag.entries["manifest-md5.txt"]["md5"]
    tagman_contents = tagman_contents.replace(old_digest, hasher.hexdigest())
    with open(tagmanifest_path, "w") as tagmanifest:
        tagmanifest.write(tagman_contents)

    reopened = bagit.Bag(self.tmpdir)
    self.assertTrue(self.validate(reopened))
def __init__(self, path=None):
    """Record the absolute ``path``, try to open it as a bag, then check its bag info.

    When the path cannot be opened as a bag, "not bag" is printed and
    ``self.bag`` is left unset; ``check_baginfo`` is called either way.
    """
    self.path = os.path.abspath(path)
    try:
        self.bag = bagit.Bag(path)
    except Exception:
        # Best effort, but without swallowing KeyboardInterrupt/SystemExit
        # the way a bare ``except:`` does.
        print("not bag")
    self.check_baginfo()
def test_update_bag_info(self):
    """update_bag_info stores dict entries in the bag and rejects non-dict data."""
    field, expected = "foo", "bar"
    bagit_helpers.update_bag_info(self.bag_path, {field: expected})
    updated = bagit.Bag(self.bag_path)
    self.assertEqual(updated.info[field], expected)

    # A list is not accepted in place of a dict.
    with self.assertRaises(AssertionError):
        bagit_helpers.update_bag_info(self.bag_path, [field, expected])
def test_validate_fast_without_oxum(self):
    """Fast validation raises once bag-info.txt has been removed."""
    bagit.make_bag(self.tmpdir)
    info_path = j(self.tmpdir, "bag-info.txt")
    os.remove(info_path)
    reopened = bagit.Bag(self.tmpdir)
    self.assertRaises(
        bagit.BagValidationError, self.validate, reopened, fast=True)
def find_arcp(self):
    """Return the bag's External-Identifier if it is an arcp URI, else a random arcp URI."""
    # First try to find External-Identifier
    external_id = bagit.Bag(self.folder).info.get("External-Identifier")
    if arcp.is_arcp_uri(external_id):
        return external_id
    return arcp.arcp_random()