def test_data_package_structure(self):
    """Check that the package descriptor is valid"""
    with open(self.wacz_json, "rb") as f:
        json_parse = json.loads(f.read())

    # Make sure it's recording the correct number of resources
    self.assertEqual(len(json_parse["resources"]), 4)

    # Check that the correct hash was recorded for the warc
    # (sha256 matches the hash type recorded in the datapackage)
    with open(self.warc_file, "rb") as f:
        original_warc = support_hash_file("sha256", f.read())
    warc_resource = self.find_resource(
        json_parse["resources"], "example-collection.warc"
    )
    self.assertEqual(original_warc, warc_resource["stats"]["hash"])

    # Check that the correct hash was recorded for the index.idx
    with open(self.wacz_index_idx, "rb") as f:
        original_wacz_index_idx = support_hash_file("sha256", f.read())
    idx_resource = self.find_resource(json_parse["resources"], "idx")
    self.assertEqual(original_wacz_index_idx, idx_resource["stats"]["hash"])

    # Check that the correct hash was recorded for the index.cdx.gz
    with open(self.wacz_index_cdx, "rb") as f:
        original_wacz_index_cdx = support_hash_file("sha256", f.read())
    cdx_resource = self.find_resource(json_parse["resources"], "cdx")
    self.assertEqual(original_wacz_index_cdx, cdx_resource["stats"]["hash"])

    # Use frictionless validation
    valid = validate(self.wacz_json)
    self.assertTrue(valid.valid)
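# The test above relies on a find_resource lookup helper that is not shown in
# this section. A minimal sketch of what it presumably looks like, inferred
# from how it is called (an assumption, not the verified implementation):
def find_resource(self, resources, filename):
    """Return the first datapackage resource whose path contains filename."""
    for resource in resources:
        if filename in resource["path"]:
            return resource
    return None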
def test_util_hash(self):
    """When invoking the util hash method a hash should be returned"""
    test_hash = hashlib.sha256("test".encode("utf-8")).hexdigest()
    self.assertEqual(support_hash_file("sha256", "test".encode("utf-8")), test_hash)
    test_hash = hashlib.md5("test".encode("utf-8")).hexdigest()
    self.assertEqual(support_hash_file("md5", "test".encode("utf-8")), test_hash)
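# support_hash_file is the helper under test. A minimal sketch of its
# presumable shape, inferred from the two calls above (an assumption, not the
# verified implementation): hashlib.new dispatches on the algorithm name, so
# a single code path covers both the "sha256" and "md5" cases.
import hashlib


def support_hash_file(hash_type, data):
    """Hash the passed bytes with the selected algorithm and return the hex digest."""
    return hashlib.new(hash_type, data).hexdigest()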
def test_archive_structure(self):
    """Check that the hash of the original warc file matches that of the warc file in the archive folder"""
    with open(self.warc_file, "rb") as f:
        original_warc = support_hash_file("sha256", f.read())
    with open(self.wacz_archive, "rb") as f:
        archive_warc = support_hash_file("sha256", f.read())
    self.assertEqual(original_warc, archive_warc)
def check_file_hashes(self):
    """Use the datapackage to check that the hash of every file in the data folder matches the one recorded in the datapackage"""
    for filepath in pathlib.Path(self.dir.name).glob("**/*.*"):
        if os.path.basename(filepath).endswith("datapackage.json"):
            continue
        with open(filepath, "rb") as f:
            file_hash = support_hash_file(self.hash_type, f.read())
        # Resource paths in the datapackage are relative, e.g.
        # "archive/example.warc", so compare against the last two
        # components of the extracted file's path
        rel_path = "/".join(str(filepath).split("/")[-2:])
        res = None
        for item in self.datapackage["resources"]:
            if item["path"] == rel_path:
                res = item
        if res is None or res["stats"]["hash"] != file_hash:
            print(
                "\nfile %s's hash does not match the hash listed in the datapackage"
                % rel_path
            )
            return False
    return True
def check_indexes(self):
    """Re-index the existing WARC; the result should match the index stored in the wacz"""
    if not os.path.exists(os.path.join(self.dir.name, "indexes/index.cdx.gz")):
        return False

    # Look up the hash recorded for the index in the datapackage
    cdx = None
    for resource in self.datapackage["resources"]:
        if resource["path"] == "indexes/index.cdx.gz":
            cdx = resource["stats"]["hash"]
    if cdx is None:
        return False

    # Locate the WARC to re-index in the archive folder
    warc = None
    archive_folder = os.listdir(os.path.join(self.dir.name, "archive"))
    for item in archive_folder:
        if ".warc" in item:
            warc = item

    # Re-index into a temporary wacz
    wacz_file = tempfile.NamedTemporaryFile(delete=False)
    wacz = zipfile.ZipFile(wacz_file.name, "w")
    data_file = zipfile.ZipInfo("indexes/index.cdx.gz", now())
    index_buff = BytesIO()
    text_wrap = TextIOWrapper(index_buff, "utf-8", write_through=True)
    with wacz.open(data_file, "w") as data:
        wacz_indexer = WACZIndexer(
            text_wrap,
            {},
            sort=True,
            compress=data,
            fields="referrer",
            data_out_name="index.cdx.gz",
            records="all",
            main_url="",
            detect_pages="",
        )
        wacz_indexer.process_all()
    wacz.close()

    # Extract the wacz under validation and hash its stored index
    tmp_dir = tempfile.TemporaryDirectory()
    with zipfile.ZipFile(self.wacz, "r") as zip_ref:
        zip_ref.extractall(tmp_dir.name)
    with open(os.path.join(tmp_dir.name, "indexes/index.cdx.gz"), "rb") as fd:
        index_hash = support_hash_file(self.hash_type, fd.read())
    return cdx == index_hash
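# A hedged usage sketch of the two checks above (the Validation wrapper name
# and its constructor arguments are assumptions; the real driver wiring may
# differ):
#
#     validation = Validation("example-collection.wacz")
#     assert validation.check_file_hashes()
#     assert validation.check_indexes()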
def generate_metadata(self, res, wacz):
    package_dict = {}
    package_dict["profile"] = "data-package"
    package_dict["resources"] = []
    for i, file in enumerate(wacz.infolist()):
        package_dict["resources"].append({})
        package_dict["resources"][i]["path"] = file.filename
        with wacz.open(file, "r") as myfile:
            content = myfile.read()
            package_dict["resources"][i]["hash"] = support_hash_file(
                self.hash_type, content
            )
            package_dict["resources"][i]["bytes"] = len(content)

    # set optional metadata
    desc = res.desc or self.desc
    title = res.title or self.title

    if title:
        package_dict["title"] = title

    if desc:
        package_dict["description"] = desc

    if self.main_url:
        package_dict["mainPageURL"] = self.main_url
        if self.main_ts:
            package_dict["mainPageDate"] = timestamp_to_iso_date(self.main_ts)

    if res.date:
        package_dict["mainPageDate"] = res.date

    package_dict["created"] = datetime.datetime.utcnow().strftime(
        "%Y-%m-%dT%H:%M:%SZ"
    )
    package_dict["wacz_version"] = WACZ_VERSION
    package_dict["software"] = "py-wacz " + get_py_wacz_version()
    return json.dumps(package_dict, indent=2)
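# For reference, a resource entry emitted by the variant above is flat, e.g.
# (illustrative placeholder values):
#
#     {"path": "archive/example.warc", "hash": "<hex digest>", "bytes": 12345}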
def generate_metadata(self, res, wacz):
    # Variant of generate_metadata that nests hash/bytes under "stats" and
    # groups the optional fields under a "metadata" key
    package_dict = {}
    metadata = {}
    package_dict["profile"] = "data-package"
    package_dict["resources"] = []
    for i, file in enumerate(wacz.infolist()):
        package_dict["resources"].append({})
        package_dict["resources"][i]["path"] = file.filename
        with wacz.open(file, "r") as myfile:
            content = myfile.read()
            package_dict["resources"][i]["stats"] = {}
            # pass the hash type explicitly, matching the "hashing" value
            # recorded below
            package_dict["resources"][i]["stats"]["hash"] = support_hash_file(
                "sha256", content
            )
            package_dict["resources"][i]["stats"]["bytes"] = len(content)
            package_dict["resources"][i]["hashing"] = "sha256"

    # set optional metadata
    desc = res.desc or self.desc
    title = res.title or self.title

    if title:
        metadata["title"] = title

    if desc:
        metadata["desc"] = desc

    if self.main_url:
        metadata["mainPageURL"] = self.main_url

    if self.main_ts:
        metadata["mainPageTS"] = self.main_ts

    if res.date:
        metadata["mainPageTS"] = res.date

    package_dict["metadata"] = metadata
    package_dict["wacz_version"] = WACZ_VERSION
    return json.dumps(package_dict, indent=2)
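# For reference, the descriptor produced by this variant looks roughly like
# the following (illustrative placeholder values); this is the shape that
# test_data_package_structure and check_file_hashes above read back:
#
#     {
#         "profile": "data-package",
#         "resources": [
#             {
#                 "path": "archive/example.warc",
#                 "stats": {"hash": "<hex digest>", "bytes": 12345},
#                 "hashing": "sha256"
#             }
#         ],
#         "metadata": {"title": "...", "mainPageURL": "...", "mainPageTS": "..."},
#         "wacz_version": "<WACZ_VERSION>"
#     }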