Example #1
import hashlib
import json
import zlib

# cld2, html_extractor, and word_seg are helper modules from the
# surrounding codebase (language detection, HTML content extraction,
# word segmentation) and are assumed to be importable.

def do_content_extraction(args):
    docid, page, baseurl = args
    # Stored pages are zlib-compressed; fall back to an empty document
    # if decompression fails.
    try:
        page = zlib.decompress(page)
    except zlib.error:
        page = ''
    extr = html_extractor.ExtractedContent(baseurl, page)
    lang = cld2.detect(extr.text_pruned, want_chunks=True)

    # One entry per detected-language chunk: the language code plus the
    # word-segmented text of that chunk.
    segmented = [{
        "l": c[0].code,
        "t": list(word_seg.segment(c[0].code, c[1]))
    } for c in lang.chunks]

    # Serialize each artifact as UTF-8 bytes and keep a SHA-256 digest
    # alongside the raw bytes.
    pagelen = len(page)
    content = extr.text_content.encode("utf-8")
    chash = hashlib.sha256(content).digest()
    pruned = extr.text_pruned.encode("utf-8")
    phash = hashlib.sha256(pruned).digest()
    segmtd = json.dumps(segmented).encode("utf-8")
    heads = json.dumps(extr.headings).encode("utf-8")
    hhash = hashlib.sha256(heads).digest()
    links = json.dumps(extr.links).encode("utf-8")
    lhash = hashlib.sha256(links).digest()
    rsrcs = json.dumps(extr.resources).encode("utf-8")
    rhash = hashlib.sha256(rsrcs).digest()
    domst = json.dumps(extr.dom_stats.to_json()).encode("utf-8")
    dhash = hashlib.sha256(domst).digest()

    return (docid, pagelen, chash, content, phash, pruned, segmtd, hhash,
            heads, lhash, links, rhash, rsrcs, dhash, domst)
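
These workers take their inputs as a single tuple, which suggests they are meant to be fanned out over a process pool. A minimal driver sketch, assuming do_content_extraction above is importable and that pages are stored zlib-compressed (the sample row and URL are made up):

import zlib
from multiprocessing import Pool

# Hypothetical input rows: (docid, compressed page bytes, base URL).
pages = [
    (1, zlib.compress(b"<html><body><p>hello world</p></body></html>"),
     "http://example.com/"),
]

if __name__ == "__main__":
    with Pool() as pool:
        for row in pool.imap_unordered(do_content_extraction, pages):
            docid, pagelen = row[0], row[1]
            print(docid, pagelen)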
Example #2
def do_content_extraction(args):
    # This variant assumes the stored page always decompresses cleanly
    # and threads the crawl metadata straight through to the result.
    page, url, locale, sources, access_time, result, detail, ourl, rurl = args
    page = zlib.decompress(page)
    pagelen = len(page)
    pagehash = hashlib.sha256(page).digest()
    extr = html_extractor.ExtractedContent(url, page)
    langs = cld2.detect(extr.text_pruned)
    return (zlib.compress(extr.text_pruned.encode("utf-8")),
            zlib.compress(extr.text_content.encode("utf-8")),
            zlib.compress(json.dumps(extr.headings).encode("utf-8")),
            zlib.compress(json.dumps(extr.links).encode("utf-8")),
            zlib.compress(json.dumps(extr.resources).encode("utf-8")),
            zlib.compress(
                json.dumps(extr.dom_stats.to_json()).encode("utf-8")),
            langs[0].code, langs[0].percent, locale, sources, access_time,
            result, detail, ourl, rurl, pagelen, pagehash)
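
The result tuple carries the crawl metadata back out alongside the derived artifacts. A sketch of consuming one result, assuming args was assembled in the order the function unpacks it (the variable names here are illustrative):

(pruned_z, content_z, headings_z, links_z, resources_z, domstats_z,
 lang_code, lang_percent, locale, sources, access_time,
 result, detail, ourl, rurl, pagelen, pagehash) = do_content_extraction(args)

# The compressed fields round-trip through zlib; the JSON ones also
# need json.loads.
pruned_text = zlib.decompress(pruned_z).decode("utf-8")
headings = json.loads(zlib.decompress(headings_z))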
Example #3

def do_content_extraction(args):
    origin, page, baseurl = args
    # As in Example #1, fall back to an empty document if the stored
    # page fails to decompress.
    try:
        page = zlib.decompress(page)
    except zlib.error:
        page = ''
    pagelen = len(page)
    extr = html_extractor.ExtractedContent(baseurl, page)
    langs = cld2.detect(extr.text_pruned)

    # Note: this digest covers the *compressed* bytes, so it depends on
    # the zlib settings; Example #1 hashes the raw UTF-8 text instead.
    pcontent = zlib.compress(extr.text_pruned.encode("utf-8"))
    phash = hashlib.sha256(pcontent).digest()
    headings = zlib.compress(json.dumps(extr.headings).encode("utf-8"))
    links = zlib.compress(json.dumps(extr.links).encode("utf-8"))
    resources = zlib.compress(json.dumps(extr.resources).encode("utf-8"))
    domstats = zlib.compress(
        json.dumps(extr.dom_stats.to_json()).encode("utf-8"))

    return (origin, pagelen, phash, langs[0].code, langs[0].percent, pcontent,
            links, resources, headings, domstats)
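
Because this variant hashes the compressed blob, the digest varies with the zlib level and library version, not just the text. A small round-trip check, assuming origin, page_z, and baseurl form a hypothetical input row:

import hashlib
import zlib

row = do_content_extraction((origin, page_z, baseurl))  # hypothetical inputs
origin, pagelen, phash, lang_code, lang_percent, pcontent = row[:6]

# The stored digest should match a fresh hash of the compressed bytes,
# and the pruned text should survive the round trip.
assert hashlib.sha256(pcontent).digest() == phash
pruned_text = zlib.decompress(pcontent).decode("utf-8")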