# Standard-library imports needed by the variants below; html_extractor,
# cld2, and word_seg are project-local modules.
import hashlib
import json
import zlib

def do_content_extraction(args):
    docid, page, baseurl = args
    # Pages are stored zlib-compressed; treat an undecompressable blob
    # as an empty (bytes) page rather than crashing the worker.
    try:
        page = zlib.decompress(page)
    except zlib.error:
        page = b''
    extr = html_extractor.ExtractedContent(baseurl, page)
    # Per-chunk language detection, then word segmentation of each chunk
    # in its detected language.
    lang = cld2.detect(extr.text_pruned, want_chunks=True)
    segmented = [{"l": c[0].code,
                  "t": list(word_seg.segment(c[0].code, c[1]))}
                 for c in lang.chunks]
    pagelen = len(page)
    # Serialize each extracted facet and compute its SHA-256 digest
    # alongside the payload.
    content = extr.text_content.encode("utf-8")
    chash   = hashlib.sha256(content).digest()
    pruned  = extr.text_pruned.encode("utf-8")
    phash   = hashlib.sha256(pruned).digest()
    segmtd  = json.dumps(segmented).encode("utf-8")
    heads   = json.dumps(extr.headings).encode("utf-8")
    hhash   = hashlib.sha256(heads).digest()
    links   = json.dumps(extr.links).encode("utf-8")
    lhash   = hashlib.sha256(links).digest()
    rsrcs   = json.dumps(extr.resources).encode("utf-8")
    rhash   = hashlib.sha256(rsrcs).digest()
    domst   = json.dumps(extr.dom_stats.to_json()).encode("utf-8")
    dhash   = hashlib.sha256(domst).digest()
    return (docid, pagelen,
            chash, content,
            phash, pruned, segmtd,
            hhash, heads,
            lhash, links,
            rhash, rsrcs,
            dhash, domst)
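# A minimal usage sketch, not part of the original code: the single-tuple
# argument suggests this variant is meant to run under a worker pool, with
# each job a (docid, compressed_page, baseurl) tuple.  "jobs" and
# "store_row" are hypothetical names introduced here for illustration.
def example_run_extraction(jobs, store_row):
    import multiprocessing
    with multiprocessing.Pool() as pool:
        for row in pool.imap_unordered(do_content_extraction, jobs):
            store_row(row)  # hypothetical sink, e.g. a database insert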
def do_content_extraction(args):
    (page, url, locale, sources, access_time,
     result, detail, ourl, rurl) = args
    page = zlib.decompress(page)
    pagelen = len(page)
    # Unlike the other variants, this one hashes the raw decompressed page.
    pagehash = hashlib.sha256(page).digest()
    extr = html_extractor.ExtractedContent(url, page)
    # Whole-document language detection; only the top guess is kept.
    langs = cld2.detect(extr.text_pruned)
    # All document-derived fields are returned zlib-compressed; the crawl
    # metadata fields are passed through unchanged.
    return (zlib.compress(extr.text_pruned.encode("utf-8")),
            zlib.compress(extr.text_content.encode("utf-8")),
            zlib.compress(json.dumps(extr.headings).encode("utf-8")),
            zlib.compress(json.dumps(extr.links).encode("utf-8")),
            zlib.compress(json.dumps(extr.resources).encode("utf-8")),
            zlib.compress(
                json.dumps(extr.dom_stats.to_json()).encode("utf-8")),
            langs[0].code, langs[0].percent,
            locale, sources, access_time, result, detail, ourl, rurl,
            pagelen, pagehash)
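# A small consumer-side sketch, assuming the tuple layout above: the first
# six fields come back zlib-compressed, so a reader must decompress (and,
# for the JSON fields, deserialize) before use.  "row" is a hypothetical
# result of the function above.
def example_unpack_result(row):
    pruned = zlib.decompress(row[0]).decode("utf-8")
    links = json.loads(zlib.decompress(row[3]))
    lang_code, lang_percent = row[6], row[7]
    return pruned, links, lang_code, lang_percent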
def do_content_extraction(args):
    origin, page, baseurl = args
    # Pages are stored zlib-compressed; treat an undecompressable blob
    # as an empty (bytes) page rather than crashing the worker.
    try:
        page = zlib.decompress(page)
    except zlib.error:
        page = b''
    pagelen = len(page)
    extr = html_extractor.ExtractedContent(baseurl, page)
    langs = cld2.detect(extr.text_pruned)
    # Note: phash is the digest of the *compressed* pruned text here,
    # not of the raw UTF-8 bytes as in the first variant.
    pcontent = zlib.compress(extr.text_pruned.encode("utf-8"))
    phash = hashlib.sha256(pcontent).digest()
    headings = zlib.compress(json.dumps(extr.headings).encode("utf-8"))
    links = zlib.compress(json.dumps(extr.links).encode("utf-8"))
    resources = zlib.compress(json.dumps(extr.resources).encode("utf-8"))
    domstats = zlib.compress(
        json.dumps(extr.dom_stats.to_json()).encode("utf-8"))
    return (origin, pagelen, phash,
            langs[0].code, langs[0].percent,
            pcontent, links, resources, headings, domstats)
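# A verification sketch, assuming the tuple layout above.  Because phash is
# taken over the compressed pruned text in this variant, an integrity check
# must hash pcontent exactly as stored.  "row" is a hypothetical result of
# the function above.
def example_verify_row(row):
    origin, pagelen, phash = row[0], row[1], row[2]
    pcontent = row[5]
    assert hashlib.sha256(pcontent).digest() == phash
    return origin, pagelen, zlib.decompress(pcontent).decode("utf-8")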