def prep_database(dbname, runs, start):
    db = pagedb.PageDB(dbname, runs)
    sys.stderr.write("[{}] preparation...\n".format(fmt_elapsed(start)))
    db.prepare_text_statistic('tfidf')
    db.prepare_text_statistic('nfidf')
    sys.stderr.write("[{}] preparation complete.\n".format(fmt_elapsed(start)))
    return db
def main():
    db = pagedb.PageDB("ts_analysis")
    all_clusters = load_clusters(sys.argv[1])
    interesting_pages = process_clusters(db, all_clusters)
    for thing in sorted(interesting_pages.values(), key=lambda x: x.sortkey()):
        thing.report(sys.stdout)
def main():
    db = pagedb.PageDB(sys.argv[1])
    lang_codes = db.lang_codes
    del db

    pool = multiprocessing.Pool(initializer=worker_init,
                                initargs=(sys.argv[1],))
    start = time.time()
    sys.stderr.write("{}: processing {} languages...\n"
                     .format(fmt_interval(0), len(lang_codes)))
    for finished in pool.imap_unordered(fwfl_shim, lang_codes):
        sys.stderr.write("{}: {}\n".format(fmt_interval(time.time() - start),
                                           finished))
def main():
    db = pagedb.PageDB(sys.argv[1])

    # Bucket pages by the SHA-256 hash of their text content to find
    # exact duplicates.
    stats = collections.defaultdict(page_data)
    for page in db.get_pages(limit=100000):
        #sys.stderr.write("{!r}\t{!r}\n"
        #                 .format(page.page_id, page.url))
        text = page.text_content.encode('utf-8')
        h = hashlib.sha256(text).digest()
        s = stats[h]
        s.count += 1
        if s.text is None:
            s.text = text
            s.length = len(text)
            s.url = page.url
        else:
            if s.text != text:
                sys.stderr.write("COLLISION: {}: {} != {}\n".format(
                    base64.b64encode(h), s.url, page.url))

    # Aggregate by duplication count: how many distinct documents were seen
    # exactly n times, and how many bytes they account for.
    agg = collections.defaultdict(agg_data)
    for stat in stats.values():
        a = agg[stat.count]
        a.count += 1
        a.length += stat.length
        a.total_length += stat.count * stat.length

    sys.stdout.write(
        "n\tcount\tlen_uniq\tcumlen_uniq\tlen_total\tcumlen_total\n")
    cumlen_uniq = 0
    cumlen_total = 0
    for n, a in sorted(agg.items()):
        cumlen_uniq += a.length
        cumlen_total += a.total_length
        sys.stdout.write(
            "{n}\t{count}\t{len_uniq}\t{cumlen_uniq}\t{len_total}\t{cumlen_total}\n"
            .format(n=n, count=a.count,
                    len_uniq=a.length, cumlen_uniq=cumlen_uniq,
                    len_total=a.total_length, cumlen_total=cumlen_total))
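# Neither page_data nor agg_data is defined in this excerpt; the loops above
# only need small mutable records. A minimal sketch consistent with how the
# fields are used (the real definitions may differ) could be:
class page_data:
    # One record per unique document, keyed by SHA-256 of its text content.
    __slots__ = ('count', 'text', 'length', 'url')
    def __init__(self):
        self.count = 0    # number of pages carrying this exact text
        self.text = None  # the text itself, kept to detect hash collisions
        self.length = 0   # length of the text in bytes
        self.url = None   # URL of the first page seen with this text

class agg_data:
    # Aggregate over all documents that occurred exactly n times.
    __slots__ = ('count', 'length', 'total_length')
    def __init__(self):
        self.count = 0         # number of distinct documents
        self.length = 0        # bytes counting each document once
        self.total_length = 0  # bytes counting every occurrence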
def main():
    db = pagedb.PageDB(sys.argv[1])
    stats = collections.defaultdict(page_data)
    ident = langid.LanguageIdentifier.from_model(norm_probs=False)

    # Deduplicate by content hash; classify each unique document once.
    for page in db.get_pages(limit=int(sys.argv[2])):
        text = page.text_content.encode('utf-8')
        h = hashlib.sha256(text).digest()
        s = stats[h]
        s.count += 1
        if s.lang is None:
            s.length = len(text)
            s.lang = ident.classify(text)[0]

    # Total bytes of unique text per detected language, largest first.
    agg = collections.Counter()
    for doc in stats.values():
        agg[doc.lang] += doc.length

    for lang, nbytes in sorted(agg.items(), key=lambda kv: -kv[1]):
        sys.stdout.write("{}\t{}\n".format(lang, nbytes))
def main():
    db = pagedb.PageDB("ts_analysis")
    interesting_pages = {}
    for obs in db.get_page_observations(load=['dom_stats']):
        if obs.result == 'crawler failure':
            continue

        # Deepest tag nesting level recorded for this observation's DOM.
        maxdepth = max((int(x) for x in obs.dom_stats.tags_at_depth.keys()),
                       default=0)
        #if maxdepth > 1 and obs.html_len > 512:
        #    continue

        # Only extremely deep documents are interesting here.
        if maxdepth < 95:
            continue
        if obs.html_hash not in interesting_pages:
            interesting_pages[obs.html_hash] = Thing(obs, maxdepth)
        else:
            interesting_pages[obs.html_hash].add_obs(obs)

    for thing in sorted(interesting_pages.values(), key=lambda x: x.sortkey()):
        thing.report(sys.stdout)
def main():
    db = pagedb.PageDB(sys.argv[1])
    start = time.process_time()

    # Deduplicate by content hash; run one compression trial per unique,
    # non-empty document.
    stats = collections.defaultdict(page_data)
    for n, page in enumerate(db.get_pages(limit=100000)):
        text = page.text_content.encode('utf-8')
        h = hashlib.sha256(text).digest()
        s = stats[h]
        s.count += 1
        if len(text) > 0 and s.len_zd == 0:
            s.compression_trial(text)
        if n and not n % 1000:
            sys.stderr.write("%d pages in %.4fs\n"
                             % (n, time.process_time() - start))

    # One CSV row per unique document, columns taken from page_data's slots.
    out = csv.DictWriter(sys.stdout, page_data.__slots__,
                         dialect='unix', quoting=csv.QUOTE_MINIMAL)
    out.writeheader()
    for row in stats.values():
        out.writerow(row.asdict())
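# This script's page_data must expose __slots__ (used as the CSV field names),
# a len_zd counter, a compression_trial method, and asdict(). None of those are
# shown in this excerpt; a plausible sketch, assuming len_zd is the
# zlib-deflated size and that the real class records more fields than this, is:
import zlib

class page_data:
    __slots__ = ('count', 'len_zd')
    def __init__(self):
        self.count = 0   # occurrences of this exact text content
        self.len_zd = 0  # compressed size of one representative copy

    def compression_trial(self, text):
        # Record the zlib-compressed size of this document's text.
        self.len_zd = len(zlib.compress(text))

    def asdict(self):
        # DictWriter rows are keyed by the slot names.
        return {slot: getattr(self, slot) for slot in self.__slots__}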
def worker_init(dbname):
    global DATABASE
    DATABASE = pagedb.PageDB(dbname)
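# worker_init pairs with the multiprocessing.Pool(initializer=worker_init, ...)
# call in the per-language driver above: each worker process opens its own
# PageDB handle rather than trying to share or pickle one. The dispatched
# function fwfl_shim is not shown in this excerpt; the sketch below is a
# hypothetical illustration of the pattern, not the real implementation.
def fwfl_shim(lang_code):
    # The real per-language work would go here, reading from the
    # per-process DATABASE handle established by worker_init.
    assert DATABASE is not None
    return lang_code  # echoed back in the driver's progress messages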
def prep_database(dbname):
    db = pagedb.PageDB(dbname)
    langs = db.prepare_text_statistic('tfidf')
    langs |= db.prepare_text_statistic('nfidf')
    return langs