# Assumes datasketch's HyperLogLog, whose update() takes bytes and whose
# .reg attribute exposes the raw register array.
from datasketch import HyperLogLog

def createcounter(nodes):
    # Build one HyperLogLog sketch per node and keep only its register array.
    counter = {}
    for node in nodes:
        h = HyperLogLog()
        h.update(str(node).encode('utf8'))  # update() expects bytes
        counter[node] = h.reg
    return counter
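# Usage sketch (hedged): assumes datasketch's HyperLogLog(reg=...) can rebuild
# a sketch from a stored register array; the node IDs here are invented.
registers = createcounter(['node-1', 'node-2'])
restored = HyperLogLog(reg=registers['node-1'])
print(restored.count())  # close to 1.0: each sketch saw a single node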
def hyperloglog(stream):
    # stream: iterable of node pairs (edges); estimates the number of
    # distinct nodes with a datasketch HyperLogLog sketch.
    h = HyperLogLog()
    data = []
    for pair in stream:
        for node in pair:
            data.append(node)
    for item in data:
        h.update(str(item).encode('utf8'))
    print("the number of nodes")
    num = h.count()  # datasketch's cardinality estimate
    print(num)
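# Usage sketch (hedged: the edge list is invented; any iterable of pairs works):
edges = [('a', 'b'), ('b', 'c'), ('a', 'c')]
hyperloglog(edges)  # prints an estimate close to 3 distinct nodes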
def cumulative_size(self):
    total_pages = 0
    for crawl in sorted(self.crawls):
        total_pages += self.size['page'][self.crawls[crawl]]
        self.add_by_type(crawl, 'page cumul.', total_pages)
    for item_type in self.hll.keys():
        item_type_cumul = ' '.join([item_type, 'cumul.'])
        item_type_new = ' '.join([item_type, 'new'])
        cumul_hll = HyperLogLog(HYPERLOGLOG_ERROR)
        n = 0
        hlls = []
        for crawl in sorted(self.hll[item_type]):
            n += 1
            hll = self.hll[item_type][crawl]
            last_cumul_hll_len = len(cumul_hll)
            cumul_hll.update(hll)
            # cumulative size
            self.add_by_type(crawl, item_type_cumul, len(cumul_hll))
            # new unseen items this crawl (since the first analyzed crawl)
            unseen = (len(cumul_hll) - last_cumul_hll_len)
            if unseen > len(hll):
                # 1% error rate for cumulative HLLs is large in comparison
                # to crawl size, adjust to size of items in this crawl
                # (there can be no more new items than the size of the crawl)
                unseen = len(hll)
            self.add_by_type(crawl, item_type_new, unseen)
            hlls.append(hll)
            # cumulative size for last N crawls
            for n_crawls in [2, 3, 4, 6, 9, 12]:
                item_type_n_crawls = '{} cumul. last {} crawls'.format(
                    item_type, n_crawls)
                if n_crawls <= len(hlls):
                    cum_hll = HyperLogLog(HYPERLOGLOG_ERROR)
                    for i in range(1, (n_crawls + 1)):
                        if i > len(hlls):
                            break
                        cum_hll.update(hlls[-i])
                    size_last_n = len(cum_hll)
                else:
                    size_last_n = 'nan'
                self.add_by_type(crawl, item_type_n_crawls, size_last_n)
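# A minimal standalone sketch of the union-delta trick used in cumulative_size(),
# assuming the `hyperloglog` package: HyperLogLog(error_rate) builds a sketch,
# add() inserts an item, update() merges another sketch, and len() returns the
# cardinality estimate. The crawl contents below are invented for illustration.
from hyperloglog import HyperLogLog

crawl_a = HyperLogLog(0.01)
crawl_b = HyperLogLog(0.01)
for url in ('u1', 'u2', 'u3'):
    crawl_a.add(url)
for url in ('u2', 'u3', 'u4', 'u5'):
    crawl_b.add(url)

cumul = HyperLogLog(0.01)
cumul.update(crawl_a)
before = len(cumul)
cumul.update(crawl_b)
unseen = len(cumul) - before        # items in crawl_b not seen before
unseen = min(unseen, len(crawl_b))  # clamp: no more new items than the crawl holds
print(unseen)                       # ~2 ('u4' and 'u5')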
def count_reducer(self, key, values):
    outputType = key[0]
    if outputType in (CST.size.value, CST.size_robotstxt.value):
        yield key, sum(values)
    elif outputType == CST.histogram.value:
        yield key, sum(values)
    elif outputType in (CST.url.value, CST.digest.value):
        # only with --exact-counts
        crawls = MonthlyCrawlSet()
        new_crawls = set()
        page_count = MultiCount(2)
        for val in values:
            if type(val) is list:
                if (outputType == CST.url.value):
                    (crawl, pages) = val
                    page_count.incr(crawl, pages, 1)
                else:  # digest
                    (crawl, (pages, urls)) = val
                    page_count.incr(crawl, pages, urls)
                crawls.add(crawl)
                new_crawls.add(crawl)
            else:
                # crawl set bit mask
                crawls.update(val)
        yield key, crawls.get_bits()
        for new_crawl in new_crawls:
            if crawls.is_new(new_crawl):
                self.counters[(CST.new_items.value,
                               outputType, new_crawl)] += 1
        # url/digest duplicate histograms
        for crawl, counts in page_count.items():
            items = (1 + counts[0] - counts[1])
            self.counters[(CST.histogram.value, outputType, crawl,
                           CST.page.value, items)] += 1
        # size in terms of unique URLs and unique content digests
        for crawl, counts in page_count.items():
            self.counters[(CST.size.value, outputType, crawl)] += 1
    elif outputType in (CST.mimetype.value, CST.mimetype_detected.value,
                        CST.charset.value, CST.languages.value,
                        CST.primary_language.value, CST.scheme.value,
                        CST.tld.value, CST.domain.value,
                        CST.surt_domain.value, CST.host.value,
                        CST.http_status.value, CST.robotstxt_status.value):
        yield key, MultiCount.sum_values(values)
    elif outputType == CST.size_estimate.value:
        hll = HyperLogLog(HYPERLOGLOG_ERROR)
        for val in values:
            hll.update(CrawlStatsJSONDecoder.json_decode_hyperloglog(val))
        yield (key, CrawlStatsJSONEncoder.json_encode_hyperloglog(hll))
    elif outputType == CST.size_estimate_for.value:
        res = None
        hll = None
        cnt = 0
        for val in values:
            if res:
                if hll is None:
                    cnt = res[0]
                    hll = CrawlStatsJSONDecoder.json_decode_hyperloglog(
                        res[1])
                cnt += val[0]
                hll.update(
                    CrawlStatsJSONDecoder.json_decode_hyperloglog(val[1]))
            else:
                res = val
        if hll is not None and cnt >= MIN_SURT_HLL_SIZE:
            yield (key,
                   (cnt, CrawlStatsJSONEncoder.json_encode_hyperloglog(hll)))
        elif res[0] >= MIN_SURT_HLL_SIZE:
            yield (key, res)
    else:
        raise UnhandledTypeError(outputType)
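# A hedged, self-contained sketch of the size_estimate branch above: sketches
# arrive serialized, are deserialized, merged by union, and re-serialized.
# The encode/decode helpers are simplified stand-ins for the project's
# CrawlStatsJSONEncoder/CrawlStatsJSONDecoder and use pickle + base64 instead
# of the project's JSON scheme.
import base64
import pickle
from hyperloglog import HyperLogLog

def encode_hll(hll):
    return base64.b64encode(pickle.dumps(hll)).decode('ascii')

def decode_hll(blob):
    return pickle.loads(base64.b64decode(blob))

def merge_serialized(values, error_rate=0.01):
    merged = HyperLogLog(error_rate)
    for val in values:
        merged.update(decode_hll(val))  # update() unions another sketch in
    return encode_hll(merged)

h1, h2 = HyperLogLog(0.01), HyperLogLog(0.01)
h1.add('example.com/a')
h2.add('example.com/b')
merged = merge_serialized([encode_hll(h1), encode_hll(h2)])
print(len(decode_hll(merged)))  # ~2 distinct URLs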