Example #1
from datasketch import HyperLogLog  # assumption: the datasketch implementation

def createcounter(nodes):
    # Build one single-element HyperLogLog per node and keep its registers.
    counter = {}
    for node in nodes:
        h = HyperLogLog()
        h.update(str(node).encode('utf8'))  # datasketch hashes bytes, not str
        counter[node] = h.reg  # raw register array of the sketch
    return counter
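A quick usage sketch (hypothetical input; assumes the datasketch package, whose sketches expose their raw registers via reg):

counter = createcounter(['a', 'b', 'c'])
print(counter['a'])  # numpy array of HyperLogLog registers for node 'a'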
Example #2
from datasketch import HyperLogLog  # assumption: the datasketch implementation

def hyperloglog(stream):
    h = HyperLogLog()
    # flatten the stream of node pairs and feed every node into the sketch
    for pair in stream:
        for node in pair:
            h.update(str(node).encode('utf8'))
    num = h.count()  # estimated number of distinct nodes
    print("the number of nodes:", num)
    return num
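A hedged usage example, treating stream as a list of edges (node pairs) and checking the estimate against the exact distinct count:

stream = [(1, 2), (2, 3), (3, 1), (1, 2)]
estimate = hyperloglog(stream)
exact = len({node for pair in stream for node in pair})
print('exact:', exact)  # 3; the estimate should be close for small inputs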
Example #3
def cumulative_size(self):
    total_pages = 0
    for crawl in sorted(self.crawls):
        total_pages += self.size['page'][self.crawls[crawl]]
        self.add_by_type(crawl, 'page cumul.', total_pages)
    for item_type in self.hll.keys():
        item_type_cumul = ' '.join([item_type, 'cumul.'])
        item_type_new = ' '.join([item_type, 'new'])
        cumul_hll = HyperLogLog(HYPERLOGLOG_ERROR)
        n = 0
        hlls = []
        for crawl in sorted(self.hll[item_type]):
            n += 1
            hll = self.hll[item_type][crawl]
            last_cumul_hll_len = len(cumul_hll)
            cumul_hll.update(hll)
            # cumulative size
            self.add_by_type(crawl, item_type_cumul, len(cumul_hll))
            # new unseen items this crawl (since the first analyzed crawl)
            unseen = (len(cumul_hll) - last_cumul_hll_len)
            if unseen > len(hll):
                # 1% error rate for cumulative HLLs is large in comparison
                # to crawl size, adjust to size of items in this crawl
                # (there can be no more new items than the size of the crawl)
                unseen = len(hll)
            self.add_by_type(crawl, item_type_new, unseen)
            hlls.append(hll)
            # cumulative size for last N crawls
            for n_crawls in [2, 3, 4, 6, 9, 12]:
                item_type_n_crawls = '{} cumul. last {} crawls'.format(
                    item_type, n_crawls)
                if n_crawls <= len(hlls):
                    cum_hll = HyperLogLog(HYPERLOGLOG_ERROR)
                    for i in range(1, (n_crawls + 1)):
                        if i > len(hlls):
                            break
                        cum_hll.update(hlls[-i])
                    size_last_n = len(cum_hll)
                else:
                    size_last_n = 'nan'
                self.add_by_type(crawl, item_type_n_crawls, size_last_n)
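Example #3 (and the near-identical Example #4 below) leans on HyperLogLog sketches being mergeable: cumul_hll.update(hll) turns cumul_hll into a sketch of the union, so len(cumul_hll) estimates the number of distinct items across all crawls seen so far. A minimal sketch of that property, assuming the PyPI hyperloglog package this code appears to use (HyperLogLog(error_rate), add, update, len):

from hyperloglog import HyperLogLog  # assumption: the PyPI 'hyperloglog' package

a = HyperLogLog(0.01)  # ~1% relative error, like HYPERLOGLOG_ERROR above
b = HyperLogLog(0.01)
for item in ('u1', 'u2', 'u3'):
    a.add(item)
for item in ('u3', 'u4'):
    b.add(item)
a.update(b)    # merge: 'a' now covers the union of both item sets
print(len(a))  # ~4 distinct items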
Example #4
def cumulative_size(self):
    total_pages = 0
    for crawl in sorted(self.crawls):
        total_pages += self.size['page'][self.crawls[crawl]]
        self.add_by_type(crawl, 'page cumul.', total_pages)
    for item_type in self.hll.keys():
        item_type_cumul = ' '.join([item_type, 'cumul.'])
        item_type_new = ' '.join([item_type, 'new'])
        cumul_hll = HyperLogLog(HYPERLOGLOG_ERROR)
        n = 0
        hlls = []
        for crawl in sorted(self.hll[item_type]):
            n += 1
            hll = self.hll[item_type][crawl]
            last_cumul_hll_len = len(cumul_hll)
            cumul_hll.update(hll)
            # cumulative size
            self.add_by_type(crawl, item_type_cumul, len(cumul_hll))
            # new unseen items this crawl (since the first analyzed crawl)
            unseen = (len(cumul_hll) - last_cumul_hll_len)
            if unseen > len(hll):
                # 1% error rate for cumulative HLLs is large in comparison
                # to crawl size, adjust to size of items in this crawl
                # (there can be no more new items than the size of the crawl)
                unseen = len(hll)
            self.add_by_type(crawl, item_type_new, unseen)
            hlls.append(hll)
            # cumulative size for last N crawls
            for n_crawls in [2, 3, 6, 12]:
                item_type_n_crawls = '{} cumul. last {} crawls'.format(
                    item_type, n_crawls)
                if n_crawls <= len(hlls):
                    cum_hll = HyperLogLog(HYPERLOGLOG_ERROR)
                    for i in range(1, (n_crawls + 1)):
                        if i > len(hlls):
                            break
                        cum_hll.update(hlls[-i])
                    size_last_n = len(cum_hll)
                else:
                    size_last_n = 'nan'
                self.add_by_type(crawl, item_type_n_crawls, size_last_n)
Example #5
def count_reducer(self, key, values):
    outputType = key[0]
    if outputType in (CST.size.value, CST.size_robotstxt.value):
        yield key, sum(values)
    elif outputType == CST.histogram.value:
        yield key, sum(values)
    elif outputType in (CST.url.value, CST.digest.value):
        # only with --exact-counts
        crawls = MonthlyCrawlSet()
        new_crawls = set()
        page_count = MultiCount(2)
        for val in values:
            if type(val) is list:
                if (outputType == CST.url.value):
                    (crawl, pages) = val
                    page_count.incr(crawl, pages, 1)
                else:  # digest
                    (crawl, (pages, urls)) = val
                    page_count.incr(crawl, pages, urls)
                crawls.add(crawl)
                new_crawls.add(crawl)
            else:
                # crawl set bit mask
                crawls.update(val)
        yield key, crawls.get_bits()
        for new_crawl in new_crawls:
            if crawls.is_new(new_crawl):
                self.counters[(CST.new_items.value, outputType,
                               new_crawl)] += 1
        # url/digest duplicate histograms
        for crawl, counts in page_count.items():
            items = (1 + counts[0] - counts[1])
            self.counters[(CST.histogram.value, outputType, crawl,
                           CST.page.value, items)] += 1
        # size in terms of unique URLs and unique content digests
        for crawl, counts in page_count.items():
            self.counters[(CST.size.value, outputType, crawl)] += 1
    elif outputType in (CST.mimetype.value, CST.mimetype_detected.value,
                        CST.charset.value, CST.languages.value,
                        CST.primary_language.value, CST.scheme.value,
                        CST.tld.value, CST.domain.value,
                        CST.surt_domain.value, CST.host.value,
                        CST.http_status.value, CST.robotstxt_status.value):
        yield key, MultiCount.sum_values(values)
    elif outputType == CST.size_estimate.value:
        hll = HyperLogLog(HYPERLOGLOG_ERROR)
        for val in values:
            hll.update(CrawlStatsJSONDecoder.json_decode_hyperloglog(val))
        yield (key, CrawlStatsJSONEncoder.json_encode_hyperloglog(hll))
    elif outputType == CST.size_estimate_for.value:
        res = None
        hll = None
        cnt = 0
        for val in values:
            if res:
                if hll is None:
                    cnt = res[0]
                    hll = CrawlStatsJSONDecoder.json_decode_hyperloglog(
                        res[1])
                cnt += val[0]
                hll.update(
                    CrawlStatsJSONDecoder.json_decode_hyperloglog(val[1]))
            else:
                res = val
        if hll is not None and cnt >= MIN_SURT_HLL_SIZE:
            yield (key,
                   (cnt,
                    CrawlStatsJSONEncoder.json_encode_hyperloglog(hll)))
        elif res[0] >= MIN_SURT_HLL_SIZE:
            yield (key, res)
    else:
        raise UnhandledTypeError(outputType)
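The size_estimate branch is the same union-by-merge pattern in reducer form: each mapper emits a JSON-encoded sketch and the reducer folds them together. A hedged distillation of that step (the JSON codec helpers are specific to this codebase; merge_sketches is a hypothetical name):

from hyperloglog import HyperLogLog  # assumption: the PyPI 'hyperloglog' package

def merge_sketches(decoded_sketches, error_rate=0.01):
    # Union already-decoded per-mapper HyperLogLog sketches into one,
    # as the size_estimate branch does after json_decode_hyperloglog().
    merged = HyperLogLog(error_rate)
    for sketch in decoded_sketches:
        merged.update(sketch)
    return merged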