import collections
import hashlib
import json
import zlib

# cld2 (language detection), word_seg (word segmentation) and
# html_extractor are project-local helpers; they are assumed to be
# importable as plain modules here.
import cld2
import html_extractor
import word_seg


def do_content_extraction(args):
    docid, page, baseurl = args
    try:
        page = zlib.decompress(page)
    except Exception:
        # Treat any decompression failure as an empty page.
        page = ''
    extr = html_extractor.ExtractedContent(baseurl, page)
    lang = cld2.detect(extr.text_pruned, want_chunks=True)
    segmented = [{"l": c[0].code,
                  "t": list(word_seg.segment(c[0].code, c[1]))}
                 for c in lang.chunks]
    pagelen = len(page)
    content = extr.text_content.encode("utf-8")
    chash = hashlib.sha256(content).digest()
    pruned = extr.text_pruned.encode("utf-8")
    phash = hashlib.sha256(pruned).digest()
    segmtd = json.dumps(segmented).encode("utf-8")
    heads = json.dumps(extr.headings).encode("utf-8")
    hhash = hashlib.sha256(heads).digest()
    links = json.dumps(extr.links).encode("utf-8")
    lhash = hashlib.sha256(links).digest()
    rsrcs = json.dumps(extr.resources).encode("utf-8")
    rhash = hashlib.sha256(rsrcs).digest()
    domst = json.dumps(extr.dom_stats.to_json()).encode("utf-8")
    dhash = hashlib.sha256(domst).digest()
    # One row's worth of data: document id, raw page length, then a
    # SHA-256 digest alongside each serialized payload.
    return (docid, pagelen, chash, content, phash, pruned, segmtd,
            hhash, heads, lhash, links, rhash, rsrcs, dhash, domst)
def corpus_wide_statistics(lang, db):
    """Compute corpus-wide frequency and raw document frequency per
       term, and count the number of documents."""
    corpus_word_freq = collections.Counter()
    raw_doc_freq = collections.Counter()
    n_documents = 0
    for text in db.get_page_texts(
            where_clause="p.has_boilerplate=false and p.lang_code='{}'"
                         .format(lang)):
        n_documents += 1
        already_this_document = set()
        for word in word_seg.segment(lang, text.contents):
            corpus_word_freq[word] += 1
            if word not in already_this_document:
                raw_doc_freq[word] += 1
                already_this_document.add(word)

    idf = compute_idf(n_documents, raw_doc_freq)
    db.update_corpus_statistics(lang, False, n_documents,
                                [('cwf', corpus_word_freq),
                                 ('rdf', raw_doc_freq),
                                 ('idf', idf)])
    return idf
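# compute_idf() is used above but not defined in this excerpt.  A minimal
# sketch of what it might do, assuming the conventional logarithmic
# inverse document frequency idf(w) = log(N / df(w)) with no smoothing;
# the real implementation may differ.
import math

def compute_idf(n_documents, raw_doc_freq):
    # Hypothetical stand-in.  Returning a Counter means terms absent from
    # the corpus statistics simply score 0 downstream; terms present in
    # every document also get an idf of 0.
    return collections.Counter({word: math.log(n_documents / df)
                                for word, df in raw_doc_freq.items()})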
def do_resegment(args):
    docid, text_pruned = args
    lang = cld2.detect(text_pruned, want_chunks=True)
    segmented = [{"l": c[0].code,
                  "t": list(word_seg.segment(c[0].code, c[1]))}
                 for c in lang.chunks]
    return (docid, json.dumps(segmented))
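# For reference, the "segmented" payload built here (and in
# do_content_extraction / do_segmentation) is a JSON array with one entry
# per language chunk reported by cld2; the values below are purely
# illustrative:
#
#   [{"l": "en", "t": ["some", "english", "words"]},
#    {"l": "fr", "t": ["puis", "des", "mots", "français"]}]
#
# "l" is the detected language code for the chunk and "t" is the token
# list produced by word_seg.segment for that chunk.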
def corpus_wide_statistics(lang, db):
    """Compute corpus-wide frequency and raw document frequency per
       term, and count the number of documents."""
    corpus_word_freq = collections.Counter()
    raw_doc_freq = collections.Counter()
    n_documents = 0
    for text in db.get_page_texts(
            where_clause="lang_code='{}'".format(lang)):
        n_documents += 1
        already_this_document = set()
        for word in word_seg.segment(lang, text.contents):
            corpus_word_freq[word] += 1
            if word not in already_this_document:
                raw_doc_freq[word] += 1
                already_this_document.add(word)

    idf = compute_idf(n_documents, raw_doc_freq)
    db.update_corpus_statistics(lang, n_documents,
                                [('cwf', corpus_word_freq),
                                 ('rdf', raw_doc_freq),
                                 ('idf', idf)])
    return idf
def do_segmentation(args):
    id, text = args
    lang = cld2.detect(text, want_chunks=True)
    segmented = [{"l": c[0].code,
                  "t": list(word_seg.segment(c[0].code, c[1]))}
                 for c in lang.chunks]
    return id, quote_utf8_as_text(json.dumps(segmented).encode("utf-8"))
def compute_tfidf(db, lang, text, idf):
    # This is baseline tf-idf: no corrections for document length or
    # anything like that.
    tf = collections.Counter()
    for word in word_seg.segment(lang, text.contents):
        tf[word] += 1
    for word in tf.keys():
        tf[word] *= idf[word]
    db.update_text_statistic('tfidf', text.id, tf)
def compute_tfidf(db, lang, text, idf):
    # This is baseline tf-idf: no corrections for document length or
    # anything like that.
    tf = collections.Counter()
    for word in word_seg.segment(lang, text.contents):
        tf[word] += 1
    for word in tf.keys():
        tf[word] *= idf[word]
    db.update_text_statistic('tfidf', text.origin, tf)
def compute_nfidf(db, lang, text, idf):
    # This is "augmented normalized" tf-idf: the term frequency within
    # each document is normalized by the maximum term frequency within
    # that document, so long documents cannot over-influence scoring
    # of the entire corpus.
    tf = collections.Counter()
    for word in word_seg.segment(lang, text.contents):
        tf[word] += 1
    try:
        max_tf = max(tf.values())
    except ValueError:
        max_tf = 1
    for word in tf.keys():
        tf[word] = (0.5 + (0.5 * tf[word]) / max_tf) * idf[word]
    db.update_text_statistic('nfidf', text.id, tf)
def compute_nfidf(db, lang, text, idf):
    # This is "augmented normalized" tf-idf: the term frequency within
    # each document is normalized by the maximum term frequency within
    # that document, so long documents cannot over-influence scoring
    # of the entire corpus.
    tf = collections.Counter()
    for word in word_seg.segment(lang, text.contents):
        tf[word] += 1
    try:
        max_tf = max(tf.values())
    except ValueError:
        max_tf = 1
    for word in tf.keys():
        tf[word] = (0.5 + (0.5 * tf[word]) / max_tf) * idf[word]
    db.update_text_statistic('nfidf', text.origin, tf)
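# Worked example with illustrative numbers (not taken from any corpus):
# suppose a document's most frequent term occurs max_tf = 4 times and a
# term w occurs tf[w] = 1 time with idf[w] = 2.0.
#
#   baseline tf-idf  (compute_tfidf): 1 * 2.0                   = 2.0
#   augmented tf-idf (compute_nfidf): (0.5 + 0.5 * 1 / 4) * 2.0 = 1.25
#
# The augmented form keeps every occurring term's weight between
# 0.5 * idf[w] and 1.0 * idf[w] regardless of document length, which is
# the normalization described in the comment above.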