Example #1
def prep_database(dbname, runs, start):
    db = pagedb.PageDB(dbname, runs)
    sys.stderr.write("[{}] preparation...\n".format(fmt_elapsed(start)))
    db.prepare_text_statistic('tfidf')
    db.prepare_text_statistic('nfidf')
    sys.stderr.write("[{}] preparation complete.\n".format(fmt_elapsed(start)))
    return db
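A minimal driver sketch for the function above, assuming the database name and optional run identifiers come from the command line and that fmt_elapsed (defined elsewhere in the original script) formats elapsed seconds; the argument handling here is illustrative, not from the original:

import sys
import time

if __name__ == '__main__':
    start = time.time()
    runs = sys.argv[2:] or None   # assumed: optional run identifiers
    db = prep_database(sys.argv[1], runs, start)
    sys.stderr.write("[{}] database ready.\n".format(fmt_elapsed(start)))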
Example #2
def main():
    db = pagedb.PageDB("ts_analysis")
    all_clusters = load_clusters(sys.argv[1])
    interesting_pages = process_clusters(db, all_clusters)

    for thing in sorted(interesting_pages.values(), key=lambda x: x.sortkey()):
        thing.report(sys.stdout)
Example #3
def main():
    db = pagedb.PageDB(sys.argv[1])
    lang_codes = db.lang_codes
    # The handle is only needed for the language list; each worker process
    # opens its own via worker_init below.
    del db

    pool = multiprocessing.Pool(initializer=worker_init,
                                initargs=(sys.argv[1],))
    start = time.time()
    sys.stderr.write("{}: processing {} languages...\n"
                     .format(fmt_interval(0), len(lang_codes)))
    for finished in pool.imap_unordered(fwfl_shim, lang_codes):
        sys.stderr.write("{}: {}\n".format(fmt_interval(time.time() - start),
                                           finished))
Example #4
def main():
    db = pagedb.PageDB(sys.argv[1])

    stats = collections.defaultdict(page_data)
    for page in db.get_pages(limit=100000):
        #sys.stderr.write("{!r}\t{!r}\n"
        #                 .format(page.page_id, page.url))

        # Deduplicate pages by the SHA-256 hash of their text content.
        text = page.text_content.encode('utf-8')
        h = hashlib.sha256(text).digest()
        s = stats[h]
        s.count += 1
        if s.text is None:
            s.text = text
            s.length = len(text)
            s.url = page.url
        elif s.text != text:
            # Same digest but different text would be a hash collision.
            sys.stderr.write("COLLISION: {}: {} != {}\n".format(
                base64.b64encode(h).decode('ascii'), s.url, page.url))

    agg = collections.defaultdict(agg_data)
    for stat in stats.values():
        a = agg[stat.count]
        a.count += 1
        a.length += stat.length
        a.total_length += stat.count * stat.length

    # Columns: n = number of copies of a document, count = distinct documents
    # seen n times, len_uniq / len_total = bytes of those unique documents /
    # of all their copies, plus running totals of each.
    sys.stdout.write(
        "n\tcount\tlen_uniq\tcumlen_uniq\tlen_total\tcumlen_total\n")
    cumlen_uniq = 0
    cumlen_total = 0
    for n, a in sorted(agg.items()):
        cumlen_uniq += a.length
        cumlen_total += a.total_length
        sys.stdout.write(
            "{n}\t{count}\t{len_uniq}\t{cumlen_uniq}\t{len_total}\t{cumlen_total}\n"
            .format(n=n,
                    count=a.count,
                    len_uniq=a.length,
                    cumlen_uniq=cumlen_uniq,
                    len_total=a.total_length,
                    cumlen_total=cumlen_total))
Example #5
def main():
    db = pagedb.PageDB(sys.argv[1])
    stats = collections.defaultdict(page_data)
    ident = langid.LanguageIdentifier.from_model(norm_probs=False)
    for page in db.get_pages(limit=int(sys.argv[2])):

        text = page.text_content.encode('utf-8')
        h = hashlib.sha256(text).digest()
        s = stats[h]
        s.count += 1
        if s.lang is None:
            s.length = len(text)
            s.lang = ident.classify(text)[0]

    # Sum unique-text bytes per detected language.
    agg = collections.Counter()
    for doc in stats.values():
        agg[doc.lang] += doc.length

    for lang, nbytes in sorted(agg.items(), key=lambda kv: -kv[1]):
        sys.stdout.write("{}\t{}\n".format(lang, nbytes))
Example #6
def main():
    db = pagedb.PageDB("ts_analysis")

    interesting_pages = {}

    for obs in db.get_page_observations(load=['dom_stats']):
        if obs.result == 'crawler failure': continue
        maxdepth = max((int(x) for x in obs.dom_stats.tags_at_depth.keys()),
                       default=0)
        #if maxdepth > 1 and obs.html_len > 512:
        #    continue
        if maxdepth < 95:
            continue

        if obs.html_hash not in interesting_pages:
            interesting_pages[obs.html_hash] = Thing(obs, maxdepth)
        else:
            interesting_pages[obs.html_hash].add_obs(obs)

    for thing in sorted(interesting_pages.values(), key=lambda x: x.sortkey()):
        thing.report(sys.stdout)
Example #7
def main():
    db = pagedb.PageDB(sys.argv[1])

    start = time.process_time()
    stats = collections.defaultdict(page_data)
    for n, page in enumerate(db.get_pages(limit=100000)):
        text = page.text_content.encode('utf-8')
        h = hashlib.sha256(text).digest()
        s = stats[h]
        s.count += 1
        # Run the compression trial only once per unique, non-empty document.
        if len(text) > 0 and s.len_zd == 0:
            s.compression_trial(text)

        # Progress report every 1000 pages.
        if n and not n % 1000:
            sys.stderr.write("%d pages in %.4fs\n" %
                             (n, time.process_time() - start))

    out = csv.DictWriter(sys.stdout,
                         page_data.__slots__,
                         dialect='unix',
                         quoting=csv.QUOTE_MINIMAL)
    out.writeheader()
    for row in stats.values():
        out.writerow(row.asdict())
Example #8
def worker_init(dbname):
    global DATABASE
    DATABASE = pagedb.PageDB(dbname)
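For context, a hedged sketch of how this initializer is wired into a worker pool, following the Pool setup shown in Example #3 (fwfl_shim and lang_codes come from that example and are not defined here):

import multiprocessing
import sys

# Each worker process opens its own PageDB handle via worker_init;
# the task function then uses the module-global DATABASE.
pool = multiprocessing.Pool(initializer=worker_init,
                            initargs=(sys.argv[1],))
results = list(pool.imap_unordered(fwfl_shim, lang_codes))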
Example #9
def prep_database(dbname):
    db = pagedb.PageDB(dbname)
    langs = db.prepare_text_statistic('tfidf')
    langs |= db.prepare_text_statistic('nfidf')
    return langs
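A minimal usage sketch, assuming prepare_text_statistic returns the set of language codes it processed (as the |= union above implies); the command-line handling is illustrative:

import sys

if __name__ == '__main__':
    langs = prep_database(sys.argv[1])
    sys.stderr.write("prepared text statistics for {} languages\n"
                     .format(len(langs)))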