def write_page_list(self, wacz, filename, page_iter):
    pages_file = zipfile.ZipInfo(filename, now())
    pages_file.compress_type = zipfile.ZIP_DEFLATED

    with wacz.open(pages_file, "w") as pg_fh:
        for line in page_iter:
            pg_fh.write(line.encode("utf-8"))
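# A minimal usage sketch of write_page_list (illustrative, not part of the
# module): it assumes an indexer instance exposing the method above, and shows
# the expected shape of page_iter, an iterable of newline-terminated JSON
# strings whose first line is the page-list header. The archive name, the page
# URL, and the "pages/pages.jsonl" entry name are hypothetical placeholders.
def _example_write_page_list(indexer):
    import json
    import zipfile

    pages = [
        json.dumps({"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}) + "\n",
        json.dumps({"url": "https://example.com/", "title": "Example Domain"}) + "\n",
    ]
    with zipfile.ZipFile("example.wacz", "w") as wacz:
        indexer.write_page_list(wacz, "pages/pages.jsonl", iter(pages))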
def check_indexes(self):
    """Re-index the existing WARC and verify the result matches the index recorded in the WACZ."""
    if os.path.exists(os.path.join(self.dir.name, "indexes/index.cdx.gz")):
        for resource in self.datapackage["resources"]:
            if resource["path"] == "indexes/index.cdx.gz":
                cdx = resource["stats"]["hash"]
    else:
        return False

    # Locate the WARC file in the extracted archive folder
    archive_folder = os.listdir(os.path.join(self.dir.name, "archive"))
    for item in archive_folder:
        if ".warc" in item:
            warc = item

    # Rebuild the CDX index from the WARC into a temporary WACZ
    wacz_file = tempfile.NamedTemporaryFile(delete=False)
    wacz = zipfile.ZipFile(wacz_file.name, "w")
    data_file = zipfile.ZipInfo("indexes/index.cdx.gz", now())

    index_buff = BytesIO()
    text_wrap = TextIOWrapper(index_buff, "utf-8", write_through=True)

    wacz_indexer = None
    with wacz.open(data_file, "w") as data:
        wacz_indexer = WACZIndexer(
            text_wrap,
            {},
            sort=True,
            compress=data,
            fields="referrer",
            data_out_name="index.cdx.gz",
            records="all",
            main_url="",
            detect_pages="",
        )
        wacz_indexer.process_all()
    wacz.close()

    # Extract the original WACZ and hash its stored index
    dir = tempfile.TemporaryDirectory()
    with zipfile.ZipFile(self.wacz, "r") as zip_ref:
        zip_ref.extractall(dir.name)

    with open(os.path.join(dir.name, "indexes/index.cdx.gz"), "rb") as fd:
        hash = support_hash_file(self.hash_type, fd.read())

    return cdx == hash
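# A standalone sketch of the hash comparison that check_indexes relies on. It
# assumes the "stats" hash in datapackage.json follows a
# "<hash_type>:<hexdigest>" convention like the one support_hash_file produces;
# the helper below recomputes that value with hashlib directly, reading the
# stored index straight out of the WACZ instead of extracting it first. The
# function name and parameters are hypothetical.
def _example_verify_index_hash(wacz_path, recorded_hash, hash_type="sha256"):
    import hashlib
    import zipfile

    with zipfile.ZipFile(wacz_path, "r") as zf:
        data = zf.read("indexes/index.cdx.gz")
    actual = "%s:%s" % (hash_type, getattr(hashlib, hash_type)(data).hexdigest())
    return actual == recorded_hash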
def create_wacz(res):
    wacz = zipfile.ZipFile(res.output, "w")

    print("Generating indexes...")

    # write index
    data_file = zipfile.ZipInfo("indexes/index.cdx.gz", now())
    index_file = zipfile.ZipInfo("indexes/index.idx", now())
    index_file.compress_type = zipfile.ZIP_DEFLATED

    index_buff = BytesIO()
    text_wrap = TextIOWrapper(index_buff, "utf-8", write_through=True)

    wacz_indexer = None
    passed_pages_dict = {}

    # If a pages.jsonl file was passed on the command line, validate it first
    if res.pages is not None:
        print("Attempting to validate the passed pages.jsonl file")
        with open(res.pages, "r") as pages_fh:
            passed_content = pages_fh.read().split("\n")

        # Drop the trailing blank line that editors sometimes add to jsonl files
        if passed_content and passed_content[-1] == "":
            passed_content.pop()

        # Confirm the passed jsonl file has valid JSON on each line
        for page_str in passed_content:
            page_json = validateJSON(page_str)
            if not page_json:
                print(
                    "The passed jsonl file cannot be validated. Error found on the following line\n %s"
                    % page_str
                )
                return 1

        # Build a dict of the passed pages to be used when constructing the index
        passed_pages_dict = construct_passed_pages_dict(passed_content)

    with wacz.open(data_file, "w") as data:
        wacz_indexer = WACZIndexer(
            text_wrap,
            res.inputs,
            sort=True,
            post_append=True,
            compress=data,
            fields="referrer",
            data_out_name="index.cdx.gz",
            hash_type=res.hash_type,
            main_url=res.url,
            main_ts=res.ts,
            detect_pages=res.detect_pages,
            passed_pages_dict=passed_pages_dict,
            extract_text=res.text,
        )
        wacz_indexer.process_all()

    index_buff.seek(0)
    with wacz.open(index_file, "w") as index:
        shutil.copyfileobj(index_buff, index)

    # write archives
    print("Writing archives...")
    for _input in res.inputs:
        archive_file = zipfile.ZipInfo.from_file(
            _input, "archive/" + os.path.basename(_input)
        )
        with wacz.open(archive_file, "w") as out_fh:
            with open(_input, "rb") as in_fh:
                shutil.copyfileobj(in_fh, out_fh)

    # Any entries left in passed_pages_dict were never matched against a record
    if wacz_indexer.passed_pages_dict:
        for key in wacz_indexer.passed_pages_dict:
            print("Invalid passed page. We were unable to find a match for %s" % str(key))

    if len(wacz_indexer.pages) > 0 and res.pages is None:
        print("Generating page index...")
        # generate pages/text
        wacz_indexer.write_page_list(
            wacz,
            PAGE_INDEX,
            wacz_indexer.serialize_json_pages(
                wacz_indexer.pages.values(),
                id="pages",
                title="All Pages",
                has_text=wacz_indexer.has_text,
            ),
        )

    if len(wacz_indexer.pages) > 0 and res.pages is not None:
        print("Generating page index from passed pages...")
        # Start from the default header id and title
        id_value = "pages"
        title_value = "All Pages"

        # If the file's header line provides an id or a title, use those instead
        # of the defaults
        header = json.loads(passed_content[0])
        if "format" in header:
            print("Header detected in the passed pages.jsonl file")
            if "id" in header:
                id_value = header["id"]
            if "title" in header:
                title_value = header["title"]

        wacz_indexer.write_page_list(
            wacz,
            PAGE_INDEX,
            wacz_indexer.serialize_json_pages(
                wacz_indexer.pages.values(),
                id=id_value,
                title=title_value,
                has_text=wacz_indexer.has_text,
            ),
        )

    if len(wacz_indexer.extra_page_lists) > 0:
        print("Generating extra page lists...")
        for name, pagelist in wacz_indexer.extra_page_lists.items():
            if name == "pages":
                name = shortuuid.uuid()
            filename = PAGE_INDEX_TEMPLATE.format(name)
            wacz_indexer.write_page_list(wacz, filename, pagelist)

    # generate metadata
    print("Generating metadata...")
    metadata = wacz_indexer.generate_metadata(res, wacz)
    metadata_file = zipfile.ZipInfo("datapackage.json", now())
    metadata_file.compress_type = zipfile.ZIP_DEFLATED
    wacz.writestr(metadata_file, metadata.encode("utf-8"))

    return 0
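# A hypothetical end-to-end invocation sketch: create_wacz reads its options
# from an argparse-style namespace, so the attribute names below mirror the
# ones the function body accesses (output, inputs, pages, url, ts,
# detect_pages, text, hash_type); generate_metadata may expect additional
# fields not shown here. The file names and the pages.jsonl content, including
# the header line whose "id"/"title" override the defaults above, are
# illustrative only.
def _example_create_wacz():
    from argparse import Namespace

    pages_path = "example-pages.jsonl"
    with open(pages_path, "w") as fh:
        fh.write('{"format": "json-pages-1.0", "id": "pages", "title": "Crawled Pages"}\n')
        fh.write('{"url": "https://example.com/", "title": "Example Domain"}\n')

    res = Namespace(
        output="example.wacz",
        inputs=["example.warc.gz"],  # existing WARC(s) to package
        pages=pages_path,            # or None to skip the passed-pages path
        url=None,                    # main page URL, if any
        ts=None,                     # main page timestamp, if any
        detect_pages=False,          # pages come from the jsonl file instead
        text=False,                  # skip full-text extraction
        hash_type="sha256",
    )
    return create_wacz(res)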