# Assumes datasketch's HyperLogLog, whose update() takes bytes and whose
# .reg attribute exposes the raw register array.
from datasketch import HyperLogLog

def createcounter(nodes):
    # Build one HyperLogLog sketch per node and keep only its register array.
    counter = {}
    for node in nodes:
        h = HyperLogLog()
        h.update(str(node).encode('utf8'))  # update() expects bytes
        counter[node] = h.reg
    return counter
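# Usage sketch (hedged): assumes datasketch's HyperLogLog(reg=...) can rebuild
# a sketch from a stored register array; the node IDs here are invented.
registers = createcounter(['node-1', 'node-2'])
restored = HyperLogLog(reg=registers['node-1'])
print(restored.count())  # close to 1.0: each sketch saw a single node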
def hyperloglog(stream):
    # stream: iterable of node pairs (edges); estimates the number of
    # distinct nodes with a datasketch HyperLogLog sketch.
    h = HyperLogLog()
    data = []
    for pair in stream:
        for node in pair:
            data.append(node)
    for item in data:
        h.update(str(item).encode('utf8'))
    print("the number of nodes")
    num = h.count()  # datasketch's cardinality estimate
    print(num)
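# Usage sketch (hedged: the edge list is invented; any iterable of pairs works):
edges = [('a', 'b'), ('b', 'c'), ('a', 'c')]
hyperloglog(edges)  # prints an estimate close to 3 distinct nodes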
def cumulative_size(self):
    total_pages = 0
    for crawl in sorted(self.crawls):
        total_pages += self.size['page'][self.crawls[crawl]]
        self.add_by_type(crawl, 'page cumul.', total_pages)
    for item_type in self.hll.keys():
        item_type_cumul = ' '.join([item_type, 'cumul.'])
        item_type_new = ' '.join([item_type, 'new'])
        cumul_hll = HyperLogLog(HYPERLOGLOG_ERROR)
        n = 0
        hlls = []
        for crawl in sorted(self.hll[item_type]):
            n += 1
            hll = self.hll[item_type][crawl]
            last_cumul_hll_len = len(cumul_hll)
            cumul_hll.update(hll)
            # cumulative size
            self.add_by_type(crawl, item_type_cumul, len(cumul_hll))
            # new unseen items this crawl (since the first analyzed crawl)
            unseen = (len(cumul_hll) - last_cumul_hll_len)
            if unseen > len(hll):
                # 1% error rate for cumulative HLLs is large in comparison
                # to crawl size, adjust to size of items in this crawl
                # (there can be no more new items than the size of the crawl)
                unseen = len(hll)
            self.add_by_type(crawl, item_type_new, unseen)
            hlls.append(hll)
            # cumulative size for last N crawls
            for n_crawls in [2, 3, 4, 6, 9, 12]:
                item_type_n_crawls = '{} cumul. last {} crawls'.format(
                    item_type, n_crawls)
                if n_crawls <= len(hlls):
                    cum_hll = HyperLogLog(HYPERLOGLOG_ERROR)
                    for i in range(1, (n_crawls + 1)):
                        if i > len(hlls):
                            break
                        cum_hll.update(hlls[-i])
                    size_last_n = len(cum_hll)
                else:
                    size_last_n = 'nan'
                self.add_by_type(crawl, item_type_n_crawls, size_last_n)
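# A minimal standalone sketch of the union-delta trick used in cumulative_size(),
# assuming the `hyperloglog` package: HyperLogLog(error_rate) builds a sketch,
# add() inserts an item, update() merges another sketch, and len() returns the
# cardinality estimate. The crawl contents below are invented for illustration.
from hyperloglog import HyperLogLog

crawl_a = HyperLogLog(0.01)
crawl_b = HyperLogLog(0.01)
for url in ('u1', 'u2', 'u3'):
    crawl_a.add(url)
for url in ('u2', 'u3', 'u4', 'u5'):
    crawl_b.add(url)

cumul = HyperLogLog(0.01)
cumul.update(crawl_a)
before = len(cumul)
cumul.update(crawl_b)
unseen = len(cumul) - before        # items in crawl_b not seen before
unseen = min(unseen, len(crawl_b))  # clamp: no more new items than the crawl holds
print(unseen)                       # ~2 ('u4' and 'u5')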
def count_reducer(self, key, values):
    outputType = key[0]
    if outputType in (CST.size.value, CST.size_robotstxt.value):
        yield key, sum(values)
    elif outputType == CST.histogram.value:
        yield key, sum(values)
    elif outputType in (CST.url.value, CST.digest.value):
        # only with --exact-counts
        crawls = MonthlyCrawlSet()
        new_crawls = set()
        page_count = MultiCount(2)
        for val in values:
            if type(val) is list:
                if (outputType == CST.url.value):
                    (crawl, pages) = val
                    page_count.incr(crawl, pages, 1)
                else:  # digest
                    (crawl, (pages, urls)) = val
                    page_count.incr(crawl, pages, urls)
                crawls.add(crawl)
                new_crawls.add(crawl)
            else:
                # crawl set bit mask
                crawls.update(val)
        yield key, crawls.get_bits()
        for new_crawl in new_crawls:
            if crawls.is_new(new_crawl):
                self.counters[(CST.new_items.value,
                               outputType, new_crawl)] += 1
        # url/digest duplicate histograms
        for crawl, counts in page_count.items():
            items = (1 + counts[0] - counts[1])
            self.counters[(CST.histogram.value, outputType, crawl,
                           CST.page.value, items)] += 1
        # size in terms of unique URLs and unique content digests
        for crawl, counts in page_count.items():
            self.counters[(CST.size.value, outputType, crawl)] += 1
    elif outputType in (CST.mimetype.value, CST.mimetype_detected.value,
                        CST.charset.value, CST.languages.value,
                        CST.primary_language.value, CST.scheme.value,
                        CST.tld.value, CST.domain.value,
                        CST.surt_domain.value, CST.host.value,
                        CST.http_status.value, CST.robotstxt_status.value):
        yield key, MultiCount.sum_values(values)
    elif outputType == CST.size_estimate.value:
        hll = HyperLogLog(HYPERLOGLOG_ERROR)
        for val in values:
            hll.update(CrawlStatsJSONDecoder.json_decode_hyperloglog(val))
        yield (key, CrawlStatsJSONEncoder.json_encode_hyperloglog(hll))
    elif outputType == CST.size_estimate_for.value:
        res = None
        hll = None
        cnt = 0
        for val in values:
            if res:
                if hll is None:
                    cnt = res[0]
                    hll = CrawlStatsJSONDecoder.json_decode_hyperloglog(
                        res[1])
                cnt += val[0]
                hll.update(
                    CrawlStatsJSONDecoder.json_decode_hyperloglog(val[1]))
            else:
                res = val
        if hll is not None and cnt >= MIN_SURT_HLL_SIZE:
            yield (key,
                   (cnt, CrawlStatsJSONEncoder.json_encode_hyperloglog(hll)))
        elif res[0] >= MIN_SURT_HLL_SIZE:
            yield (key, res)
    else:
        raise UnhandledTypeError(outputType)
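# A hedged, self-contained sketch of the size_estimate branch above: sketches
# arrive serialized, are deserialized, merged by union, and re-serialized.
# The encode/decode helpers are simplified stand-ins for the project's
# CrawlStatsJSONEncoder/CrawlStatsJSONDecoder and use pickle + base64 instead
# of the project's JSON scheme.
import base64
import pickle
from hyperloglog import HyperLogLog

def encode_hll(hll):
    return base64.b64encode(pickle.dumps(hll)).decode('ascii')

def decode_hll(blob):
    return pickle.loads(base64.b64decode(blob))

def merge_serialized(values, error_rate=0.01):
    merged = HyperLogLog(error_rate)
    for val in values:
        merged.update(decode_hll(val))  # update() unions another sketch in
    return encode_hll(merged)

h1, h2 = HyperLogLog(0.01), HyperLogLog(0.01)
h1.add('example.com/a')
h2.add('example.com/b')
merged = merge_serialized([encode_hll(h1), encode_hll(h2)])
print(len(decode_hll(merged)))  # ~2 distinct URLs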