def do_content_extraction(args):
    docid, page, baseurl = args
    try:
        page = zlib.decompress(page)
    except zlib.error:
        # Fall back to an empty page if the stored blob fails to decompress.
        page = ''
    extr = html_extractor.ExtractedContent(baseurl, page)
    lang = cld2.detect(extr.text_pruned, want_chunks=True)

    segmented = [ { "l": c[0].code,
                    "t": list(word_seg.segment(c[0].code, c[1])) }
                  for c in lang.chunks ]

    pagelen = len(page)
    content = extr.text_content.encode("utf-8")
    chash   = hashlib.sha256(content).digest()
    pruned  = extr.text_pruned.encode("utf-8")
    phash   = hashlib.sha256(pruned).digest()
    segmtd  = json.dumps(segmented).encode("utf-8")
    heads   = json.dumps(extr.headings).encode("utf-8")
    hhash   = hashlib.sha256(heads).digest()
    links   = json.dumps(extr.links).encode("utf-8")
    lhash   = hashlib.sha256(links).digest()
    rsrcs   = json.dumps(extr.resources).encode("utf-8")
    rhash   = hashlib.sha256(rsrcs).digest()
    domst   = json.dumps(extr.dom_stats.to_json()).encode("utf-8")
    dhash   = hashlib.sha256(domst).digest()

    return (docid, pagelen,
            chash, content,
            phash, pruned, segmtd,
            hhash, heads,
            lhash, links,
            rhash, rsrcs,
            dhash, domst)
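A minimal invocation sketch for the function above, assuming the project's html_extractor, cld2, and word_seg modules are importable; the document id, sample page, and URL are made up for illustration.

import zlib

html = b"<html><body><h1>Title</h1><p>Hello, world.</p></body></html>"
args = (42, zlib.compress(html), "http://example.com/")  # (docid, page, baseurl)

(docid, pagelen,
 chash, content,         # SHA-256 digest and UTF-8 bytes of the full text
 phash, pruned, segmtd,  # same for the pruned text, plus its segmentation as JSON
 hhash, heads,           # headings as JSON
 lhash, links,           # links as JSON
 rhash, rsrcs,           # resources as JSON
 dhash, domst) = do_content_extraction(args)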
Example 2
def corpus_wide_statistics(lang, db):
    """Compute corpus-wide frequency and raw document frequency per term,
       and count the number of documents."""

    corpus_word_freq = collections.Counter()
    raw_doc_freq = collections.Counter()
    n_documents = 0

    for text in db.get_page_texts(
            where_clause="p.has_boilerplate=false and p.lang_code='{}'".format(
                lang)):

        n_documents += 1
        already_this_document = set()
        for word in word_seg.segment(lang, text.contents):
            corpus_word_freq[word] += 1
            if word not in already_this_document:
                raw_doc_freq[word] += 1
                already_this_document.add(word)

    idf = compute_idf(n_documents, raw_doc_freq)
    db.update_corpus_statistics(lang, False, n_documents,
                                [('cwf', corpus_word_freq),
                                 ('rdf', raw_doc_freq), ('idf', idf)])

    return idf
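The compute_idf helper is not shown in this listing; below is a minimal sketch under the conventional definition idf(w) = log(N / df(w)). The project's real helper may smooth or scale the values differently.

import math

def compute_idf(n_documents, raw_doc_freq):
    # Conventional inverse document frequency: log of the corpus size over
    # the number of documents containing each term (assumed formulation).
    return {word: math.log(n_documents / df)
            for word, df in raw_doc_freq.items()}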
Example 3
def do_content_extraction(args):
    docid, page, baseurl = args
    try:
        page = zlib.decompress(page)
    except zlib.error:
        # Fall back to an empty page if the stored blob fails to decompress.
        page = ''
    extr = html_extractor.ExtractedContent(baseurl, page)
    lang = cld2.detect(extr.text_pruned, want_chunks=True)

    segmented = [{
        "l": c[0].code,
        "t": list(word_seg.segment(c[0].code, c[1]))
    } for c in lang.chunks]

    pagelen = len(page)
    content = extr.text_content.encode("utf-8")
    chash = hashlib.sha256(content).digest()
    pruned = extr.text_pruned.encode("utf-8")
    phash = hashlib.sha256(pruned).digest()
    segmtd = json.dumps(segmented).encode("utf-8")
    heads = json.dumps(extr.headings).encode("utf-8")
    hhash = hashlib.sha256(heads).digest()
    links = json.dumps(extr.links).encode("utf-8")
    lhash = hashlib.sha256(links).digest()
    rsrcs = json.dumps(extr.resources).encode("utf-8")
    rhash = hashlib.sha256(rsrcs).digest()
    domst = json.dumps(extr.dom_stats.to_json()).encode("utf-8")
    dhash = hashlib.sha256(domst).digest()

    return (docid, pagelen, chash, content, phash, pruned, segmtd, hhash,
            heads, lhash, links, rhash, rsrcs, dhash, domst)
Example 4
def do_resegment(args):
    docid, text_pruned = args
    lang = cld2.detect(text_pruned, want_chunks=True)
    segmented = [ { "l": c[0].code,
                    "t": list(word_seg.segment(c[0].code, c[1])) }
                  for c in lang.chunks ]
    return (docid, json.dumps(segmented))
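For illustration, the JSON string returned above groups segmented tokens by detected-language chunk; the language codes and tokens below are hypothetical.

import json

example = [{"l": "en", "t": ["hello", "world"]},
           {"l": "fr", "t": ["bonjour", "le", "monde"]}]
print(json.dumps(example))  # one {"l": ..., "t": [...]} object per detected chunk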
Example 5
def corpus_wide_statistics(lang, db):
    """Compute corpus-wide frequency and raw document frequency per term,
       and count the number of documents."""

    corpus_word_freq = collections.Counter()
    raw_doc_freq     = collections.Counter()
    n_documents      = 0

    for text in db.get_page_texts(where_clause="lang_code='{}'"
                                  .format(lang)):

        n_documents += 1
        already_this_document = set()
        for word in word_seg.segment(lang, text.contents):
            corpus_word_freq[word] += 1
            if word not in already_this_document:
                raw_doc_freq[word] += 1
                already_this_document.add(word)

    idf = compute_idf(n_documents, raw_doc_freq)
    db.update_corpus_statistics(lang, n_documents,
                                [('cwf', corpus_word_freq),
                                 ('rdf', raw_doc_freq),
                                 ('idf', idf)])

    return idf
Example 6
def do_segmentation(args):
    id, text = args
    lang = cld2.detect(text, want_chunks=True)
    segmented = [{
        "l": c[0].code,
        "t": list(word_seg.segment(c[0].code, c[1]))
    } for c in lang.chunks]
    return id, quote_utf8_as_text(json.dumps(segmented).encode("utf-8"))
Example 7
def compute_tfidf(db, lang, text, idf):
    # This is baseline tf-idf: no corrections for document length or
    # anything like that.
    tf = collections.Counter()
    for word in word_seg.segment(lang, text.contents):
        tf[word] += 1

    for word in tf.keys():
        tf[word] *= idf[word]

    db.update_text_statistic('tfidf', text.id, tf)
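A toy run of the weighting above, with made-up counts and IDF values, showing that each in-document frequency is simply scaled by the corpus-wide weight.

import collections

tf = collections.Counter({"cat": 4, "dog": 1})
idf = {"cat": 0.5, "dog": 1.5}   # hypothetical corpus-wide weights
for word in tf.keys():
    tf[word] *= idf[word]
print(tf)  # Counter({'cat': 2.0, 'dog': 1.5})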
Example 8
def compute_tfidf(db, lang, text, idf):
    # This is baseline tf-idf: no corrections for document length or
    # anything like that.
    tf = collections.Counter()
    for word in word_seg.segment(lang, text.contents):
        tf[word] += 1

    for word in tf.keys():
        tf[word] *= idf[word]

    db.update_text_statistic('tfidf', text.origin, tf)
Example 9
def compute_nfidf(db, lang, text, idf):
    # This is "augmented normalized" tf-idf: the term frequency within
    # each document is normalized by the maximum term frequency within
    # that document, so long documents cannot over-influence scoring
    # of the entire corpus.
    tf = collections.Counter()
    for word in word_seg.segment(lang, text.contents):
        tf[word] += 1

    try:
        max_tf = max(tf.values())
    except ValueError:
        max_tf = 1

    for word in tf.keys():
        tf[word] = (0.5 + (0.5 * tf[word]) / max_tf) * idf[word]

    db.update_text_statistic('nfidf', text.id, tf)
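A toy run of the augmented-normalized weighting above, with made-up values: raw counts are mapped into the range [0.5, 1.0] relative to the document's most frequent term before the IDF multiplication, so a term that appears once still keeps half the weight of the dominant term.

import collections

tf = collections.Counter({"cat": 4, "dog": 1})
idf = {"cat": 1.0, "dog": 1.0}   # hypothetical corpus-wide weights
max_tf = max(tf.values())        # 4
for word in tf.keys():
    tf[word] = (0.5 + (0.5 * tf[word]) / max_tf) * idf[word]
print(tf)  # Counter({'cat': 1.0, 'dog': 0.625})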
Example 10
def compute_nfidf(db, lang, text, idf):
    # This is "augmented normalized" tf-idf: the term frequency within
    # each document is normalized by the maximum term frequency within
    # that document, so long documents cannot over-influence scoring
    # of the entire corpus.
    tf = collections.Counter()
    for word in word_seg.segment(lang, text.contents):
        tf[word] += 1

    try:
        max_tf = max(tf.values())
    except ValueError:
        max_tf = 1

    for word in tf.keys():
        tf[word] = (0.5 + (0.5 * tf[word])/max_tf) * idf[word]

    db.update_text_statistic('nfidf', text.origin, tf)