def test_data_package_structure(self):
        """Check that the package_descriptor is valid"""
        with open(self.wacz_json, "rb") as f:
            json_parse = json.loads(f.read())
        # Make sure it's recording the correct number of resources
        self.assertEqual(len(json_parse["resources"]), 4)

        # Check that the correct hash was recorded for a warc
        with open(self.warc_file, "rb") as f:
            original_warc = support_hash_file(f.read())

        warc_resource = self.find_resource(json_parse["resources"],
                                           "example-collection.warc")
        self.assertEqual(original_warc, warc_resource["stats"]["hash"])

        # Check that the correct hash was recorded for the index.idx
        with open(self.wacz_index_idx, "rb") as f:
            original_wacz_index_idx = support_hash_file(f.read())
        idx_resource = self.find_resource(json_parse["resources"], "idx")
        self.assertEqual(original_wacz_index_idx,
                         idx_resource["stats"]["hash"])

        # Check that the correct hash was recorded for the index.cdx.gz
        with open(self.wacz_index_cdx, "rb") as f:
            original_wacz_index_cdx = support_hash_file(f.read())
        cdx_resource = self.find_resource(json_parse["resources"], "cdx")
        self.assertEqual(original_wacz_index_cdx,
                         cdx_resource["stats"]["hash"])

        # Use frictionless validation
        valid = validate(self.wacz_json)
        self.assertTrue(valid.valid)
Example #2
    def test_util_hash(self):
        """When invoking the util hash method a  hash should be returned"""
        test_hash = hashlib.sha256("test".encode("utf-8")).hexdigest()
        self.assertEqual(support_hash_file("sha256", "test".encode("utf-8")),
                         test_hash)

        test_hash = hashlib.md5("test".encode("utf-8")).hexdigest()
        self.assertEqual(support_hash_file("md5", "test".encode("utf-8")),
                         test_hash)

    def test_archive_structure(self):
        """Check that the hash of the original warc file matches that of the warc file in the archive folder"""
        with open(self.warc_file, "rb") as f:
            original_warc = support_hash_file(f.read())

        with open(self.wacz_archive, "rb") as f:
            archive_warc = support_hash_file(f.read())

        self.assertEqual(original_warc, archive_warc)
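
For reference, a minimal sketch of the support_hash_file helper these tests exercise, assuming it simply wraps hashlib; the real py-wacz implementation may differ, and some of these snippets call it with the content argument only (an older, sha256-only variant).

import hashlib

def support_hash_file(hash_type, data):
    # Hex digest of data using the named hashlib algorithm (assumed signature),
    # e.g. support_hash_file("sha256", b"test") or support_hash_file("md5", b"test")
    return hashlib.new(hash_type, data).hexdigest()
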
Example #4
    def check_file_hashes(self):
        """Use the datapackage to check that the hash of every file in the data folder matches the one recorded in the datapackage"""
        for filepath in pathlib.Path(self.dir.name).glob("**/*.*"):
            if not os.path.basename(filepath).endswith("datapackage.json"):
                with open(filepath, "rb") as fh:
                    file_hash = support_hash_file(self.hash_type, fh.read())
                # Keep only the last two path components so it matches the
                # relative paths recorded in the datapackage
                file = "/".join(str(filepath).split("/")[-2:])
                res = None
                for item in self.datapackage["resources"]:
                    if item["path"] == file:
                        res = item
                if res is None or res["stats"]["hash"] != file_hash:
                    print(
                        "\nfile %s's hash does not match the hash listed in the datapackage"
                        % file)
                    return False
        return True
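
For context, the path handling above reduces an absolute path to the two-component form used in the datapackage, and the resource entries it is compared against are assumed to have the shape that generate_metadata builds in Example #7 below:

# e.g. "/tmp/xyz/archive/example-collection.warc" -> "archive/example-collection.warc",
# matched against an entry of the (assumed) form
# {"path": "archive/example-collection.warc", "stats": {"hash": "<hex digest>", "bytes": 12345}}
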
Example #5
    def check_indexes(self):
        """Indexing existing WARC which should match the index in the wacz"""
        if os.path.exists(os.path.join(self.dir.name, "indexes/index.cdx.gz")):
            for resource in self.datapackage["resources"]:
                if resource["path"] == "indexes/index.cdx.gz":
                    cdx = resource["stats"]["hash"]
        else:
            return False

        archive_folder = os.listdir(os.path.join(self.dir.name, "archive"))
        for item in archive_folder:
            if ".warc" in item:
                warc = item
        wacz_file = tempfile.NamedTemporaryFile(delete=False)
        wacz = zipfile.ZipFile(wacz_file.name, "w")
        data_file = zipfile.ZipInfo("indexes/index.cdx.gz", now())
        index_buff = BytesIO()
        text_wrap = TextIOWrapper(index_buff, "utf-8", write_through=True)
        wacz_indexer = None
        with wacz.open(data_file, "w") as data:
            wacz_indexer = WACZIndexer(
                text_wrap,
                {},
                sort=True,
                compress=data,
                fields="referrer",
                data_out_name="index.cdx.gz",
                records="all",
                main_url="",
                detect_pages="",
            )

            wacz_indexer.process_all()
        wacz.close()
        dir = tempfile.TemporaryDirectory()
        with zipfile.ZipFile(self.wacz, "r") as zip_ref:
            zip_ref.extractall(dir.name)

        with open(os.path.join(dir.name, "indexes/index.cdx.gz"), "rb") as fd:
            hash = support_hash_file(self.hash_type, fd.read())

        return cdx == hash
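
The now() helper passed to ZipInfo above is assumed to return the current time as the six-field tuple zipfile expects; a minimal sketch under that assumption:

import datetime

def now():
    # (year, month, day, hour, minute, second) tuple for zipfile.ZipInfo (assumed helper)
    return tuple(datetime.datetime.utcnow().timetuple()[:6])
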
Example #6
    def generate_metadata(self, res, wacz):
        package_dict = {}

        package_dict["profile"] = "data-package"
        package_dict["resources"] = []
        for i, file in enumerate(wacz.infolist()):
            package_dict["resources"].append({})
            package_dict["resources"][i]["path"] = file.filename
            with wacz.open(file, "r") as myfile:
                content = myfile.read()
                package_dict["resources"][i]["hash"] = support_hash_file(
                    self.hash_type, content)
                package_dict["resources"][i]["bytes"] = len(content)

        # set optional metadata
        desc = res.desc or self.desc
        title = res.title or self.title

        if title:
            package_dict["title"] = title

        if desc:
            package_dict["description"] = desc

        if self.main_url:
            package_dict["mainPageURL"] = self.main_url
            if self.main_ts:
                package_dict["mainPageDate"] = timestamp_to_iso_date(
                    self.main_ts)

        if res.date:
            package_dict["mainPageDate"] = res.date

        package_dict["created"] = datetime.datetime.utcnow().strftime(
            "%Y-%m-%dT%H:%M:%SZ")

        package_dict["wacz_version"] = WACZ_VERSION

        package_dict["software"] = "py-wacz " + get_py_wacz_version()

        return json.dumps(package_dict, indent=2)
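
The timestamp_to_iso_date helper used for mainPageDate is assumed to turn a 14-digit WARC-style timestamp into an ISO 8601 date string; a minimal sketch, not necessarily the real implementation:

import datetime

def timestamp_to_iso_date(timestamp):
    # e.g. "20210715103000" -> "2021-07-15T10:30:00Z" (assumed behaviour)
    return datetime.datetime.strptime(timestamp, "%Y%m%d%H%M%S").strftime("%Y-%m-%dT%H:%M:%SZ")
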
Example #7
    def generate_metadata(self, res, wacz):
        package_dict = {}
        metadata = {}

        package_dict["profile"] = "data-package"
        package_dict["resources"] = []
        for i, file in enumerate(wacz.infolist()):
            package_dict["resources"].append({})
            package_dict["resources"][i]["path"] = file.filename
            with wacz.open(file, "r") as myfile:
                content = myfile.read()
                package_dict["resources"][i]["stats"] = {}
                package_dict["resources"][i]["stats"]["hash"] = support_hash_file(
                    content
                )
                package_dict["resources"][i]["stats"]["bytes"] = len(content)
                package_dict["resources"][i]["hashing"] = "sha256"

        # set optional metadata
        desc = res.desc or self.desc
        title = res.title or self.title

        if title:
            metadata["title"] = title

        if desc:
            metadata["desc"] = desc

        if self.main_url:
            metadata["mainPageURL"] = self.main_url
            if self.main_ts:
                metadata["mainPageTS"] = self.main_ts

        if res.date:
            metadata["mainPageTS"] = res.date

        package_dict["metadata"] = metadata
        package_dict["wacz_version"] = WACZ_VERSION

        return json.dumps(package_dict, indent=2)
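
The datapackage.json produced by this version of generate_metadata would therefore look roughly like the following; all values are placeholders:

{
  "profile": "data-package",
  "resources": [
    {
      "path": "archive/example-collection.warc",
      "stats": {
        "hash": "<sha256 hex digest>",
        "bytes": 12345
      },
      "hashing": "sha256"
    }
  ],
  "metadata": {
    "title": "<title>",
    "mainPageURL": "<main url>"
  },
  "wacz_version": "<WACZ_VERSION>"
}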