def test_json_hyperloglog():
    hll1 = HyperLogLog(.01)
    for i in range(0, 50):
        hll1.add(i)
    jsons = json.dumps(hll1, cls=CrawlStatsJSONEncoder)
    hll2 = json.loads(jsons, cls=CrawlStatsJSONDecoder)
    assert hll1.card() == hll2.card()
    # test jsonpickle serialization
    jsonp = jsonpickle.encode(hll2)
    hll3 = jsonpickle.decode(jsonp)
    assert hll1.card() == hll3.card()
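# Hedged sketch (not part of the original test suite): HyperLogLog sketches
# built with the same error rate can be merged in place via update(), which is
# how the count/stats reducers below combine per-mapper estimates. The test
# name and the error tolerance are illustrative assumptions.
def test_hyperloglog_merge_sketch():
    hll_a = HyperLogLog(.01)
    hll_b = HyperLogLog(.01)
    for i in range(0, 50):
        hll_a.add(str(i))
    for i in range(25, 75):
        hll_b.add(str(i))
    hll_a.update(hll_b)  # in-place union of the two sketches
    # the union contains 75 distinct items; allow a generous error margin
    assert abs(hll_a.card() - 75) <= 5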
def output(self, crawl, exact_count=True, min_surt_hll_size=50000):
    counts = (self.pages, self.unique_urls())
    host_domain_count = HostDomainCount()
    surt_hll = None
    if self.unique_urls() >= min_surt_hll_size:
        surt_hll = HyperLogLog(HYPERLOGLOG_ERROR)
    for url, count in self.url.items():
        host_domain_count.add(url, count)
        if exact_count:
            yield (CST.url.value, self.surt_domain, url), (crawl, count)
        if surt_hll is not None:
            surt_hll.add(url)
    if exact_count:
        for digest, counts in self.digest.items():
            yield (CST.digest.value, digest), (crawl, counts)
    for mime, counts in self.mime.items():
        yield (CST.mimetype.value, mime, crawl), counts
    for mime, counts in self.mime_detected.items():
        yield (CST.mimetype_detected.value, mime, crawl), counts
    for charset, counts in self.charset.items():
        yield (CST.charset.value, charset, crawl), counts
    for languages, counts in self.languages.items():
        yield (CST.languages.value, languages, crawl), counts
        # yield primary language
        prim_l = languages.split(',')[0]
        yield (CST.primary_language.value, prim_l, crawl), counts
    for key, val in host_domain_count.output(crawl):
        yield key, val
    yield ((CST.surt_domain.value, self.surt_domain, crawl),
           (self.pages, self.unique_urls(), len(host_domain_count.hosts)))
    if surt_hll is not None:
        yield ((CST.size_estimate_for.value, CST.surt_domain.value,
                self.surt_domain, CST.url.value, crawl),
               (self.unique_urls(),
                CrawlStatsJSONEncoder.json_encode_hyperloglog(surt_hll)))
    for status, counts in self.http_status.items():
        yield (CST.http_status.value, status, crawl), counts
    for url, count in self.robotstxt_url.items():
        yield (CST.size_robotstxt.value, CST.url.value, crawl), 1
        yield (CST.size_robotstxt.value, CST.page.value, crawl), count
    for status, counts in self.robotstxt_status.items():
        yield (CST.robotstxt_status.value, status, crawl), counts
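# Illustrative note (not from the original source): with exact_count=True the
# generator above emits flat key/value tuples roughly shaped like
#   (CST.url.value, 'org,example', '/index.html')  -> (crawl, 3)
#   (CST.mimetype.value, 'text/html', crawl)       -> counts
#   (CST.surt_domain.value, 'org,example', crawl)  -> (pages, unique_urls, hosts)
# 'org,example' and '/index.html' are made-up sample values. The per-domain
# HyperLogLog sketch is only emitted once the SURT domain has at least
# min_surt_hll_size unique URLs.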
def get_unique_states(self, states=None, limits=None):
    if states is None:
        states = copy.deepcopy(self.states)
    for axis in range(len(states[0])):
        if limits is None:
            axmin, axmax = np.min(states[:, axis]), np.max(states[:, axis])
        else:
            axmin, axmax = limits[axis * 2:axis * 2 + 2]
        states[:, axis] = np.digitize(states[:, axis],
                                      np.linspace(axmin, axmax, num=100))
    # cast the digitized bin indices to int (the original discarded the
    # result of astype, which is a no-op)
    states = states.astype(int)
    hll = HyperLogLog(0.01)
    for state in tqdm(states,
                      desc=f"Search for Unique States in whole dataset ({self.environment} @ {self.buffer_type})",
                      total=len(states)):
        hll.add(",".join([str(s) for s in state]))
    return len(hll)
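# Minimal standalone sketch of the same pattern (illustrative names, synthetic
# data): bin each continuous state dimension into 100 buckets and estimate the
# number of distinct discretized states with a HyperLogLog.
import numpy as np
from hyperloglog import HyperLogLog

def count_unique_discretized(states, bins=100, error_rate=0.01):
    states = np.asarray(states, dtype=float).copy()
    for axis in range(states.shape[1]):
        edges = np.linspace(states[:, axis].min(), states[:, axis].max(),
                            num=bins)
        states[:, axis] = np.digitize(states[:, axis], edges)
    hll = HyperLogLog(error_rate)
    for state in states.astype(int):
        hll.add(",".join(str(s) for s in state))
    return len(hll)

# e.g. roughly 10**4 distinct grid cells for uniform 2-D noise:
# count_unique_discretized(np.random.rand(50000, 2))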
class CCStatsJob(MRJob):
    '''Job to get crawl statistics from Common Crawl index
         --job=count
             run count job (first step) to get counts
             from Common Crawl index files (cdx-*.gz)
         --job=stats
             run statistics job (second step)
             on output from count job'''

    OUTPUT_PROTOCOL = JSONProtocol

    JOBCONF = {
        'mapreduce.task.timeout': '9600000',
        'mapreduce.map.speculative': 'false',
        'mapreduce.reduce.speculative': 'false',
        'mapreduce.job.jvm.numtasks': '-1',
    }

    s3pattern = re.compile('^s3://([^/]+)/(.+)')
    gzpattern = re.compile('\.gz$')
    crawlpattern = re.compile('(CC-MAIN-2\d{3}-\d{2})')

    def configure_args(self):
        """Custom command line options for common crawl index statistics"""
        super(CCStatsJob, self).configure_args()
        self.add_passthru_arg(
            '--job', dest='job_to_run',
            default='', choices=['count', 'stats', ''],
            help='''Job(s) to run ("count", "stats", or empty to run both)''')
        self.add_passthru_arg(
            '--exact-counts', dest='exact_counts',
            action='store_true', default=None,
            help='''Exact counts for URLs and content digests,
            this increases the output size significantly''')
        self.add_passthru_arg(
            '--no-exact-counts', dest='exact_counts',
            action='store_false', default=None,
            help='''No exact counts for URLs and content digests
            to save storage space and computation time''')
        self.add_passthru_arg(
            '--max-top-hosts-domains', dest='max_hosts',
            type=int, default=200,
            help='''Max. number of most frequent hosts or domains shown
            in final statistics (cf. --min-urls-top-host-domain)''')
        self.add_passthru_arg(
            '--min-urls-top-host-domain', dest='min_domain_frequency',
            type=int, default=1,
            help='''Min. number of URLs required per host or domain shown
            in final statistics (cf. --max-top-hosts-domains).''')
        self.add_passthru_arg(
            '--min-lang-comb-freq', dest='min_lang_comb_freq',
            type=int, default=1,
            help='''Min. number of pages required for a combination of
            detected languages to be shown in final statistics.''')
        self.add_passthru_arg(
            '--crawl', dest='crawl', default=None,
            help='''ID/name of the crawl analyzed (if not given
            detected from input path)''')

    def input_protocol(self):
        if self.options.job_to_run != 'stats':
            LOG.debug('Reading text input from cdx files')
            return RawValueProtocol()
        LOG.debug('Reading JSON input from count job')
        return JSONProtocol()

    def hadoop_input_format(self):
        input_format = self.HADOOP_INPUT_FORMAT
        if self.options.job_to_run != 'stats':
            input_format = 'org.apache.hadoop.mapred.TextInputFormat'
        LOG.info("Setting input format for {} job: {}".format(
            self.options.job_to_run, input_format))
        return input_format

    def count_mapper_init(self):
        """Because cdx.gz files cannot be split and
        mapreduce.input.fileinputformat.split.minsize is set to a value
        larger than any cdx.gz file, the mapper is guaranteed to process
        the content of a single cdx file. Input lines of a cdx file are
        sorted by SURT URL which allows to aggregate URL counts for one
        SURT domain in memory. It may happen that one SURT domain spans
        over multiple cdx files. In this case (and without --exact-counts)
        the count of unique URLs and the URL histograms may be slightly
        off in case the same URL occurs also in a second cdx file.
        However, this problem is negligible because there are only
        300 cdx files."""
        self.counters = Counter()
        self.cdx_path = os.environ['mapreduce_map_input_file']
        LOG.info('Reading {0}'.format(self.cdx_path))
        self.crawl_name = None
        self.crawl = None
        if self.options.crawl is not None:
            self.crawl_name = self.options.crawl
        else:
            crawl_name_match = self.crawlpattern.search(self.cdx_path)
            if crawl_name_match is not None:
                self.crawl_name = crawl_name_match.group(1)
            else:
                raise InputError(
                    "Cannot determine ID of monthly crawl from input path {}"
                    .format(self.cdx_path))
        if self.crawl_name is None:
            raise InputError("Name of crawl not given")
        self.crawl = MonthlyCrawl.get_by_name(self.crawl_name)
        self.fetches_total = 0
        self.pages_total = 0
        self.urls_total = 0
        self.urls_hll = HyperLogLog(HYPERLOGLOG_ERROR)
        self.digest_hll = HyperLogLog(HYPERLOGLOG_ERROR)
        self.url_histogram = Counter()
        self.count = None
        # first and last SURT may continue in previous/next cdx
        self.min_surt_hll_size = 1
        self.increment_counter('cdx-stats', 'cdx files processed', 1)

    def count_mapper(self, _, line):
        self.fetches_total += 1
        if (self.fetches_total % 1000) == 0:
            self.increment_counter('cdx-stats', 'cdx lines read', 1000)
            if (self.fetches_total % 100000) == 0:
                LOG.info('Read {0} cdx lines'.format(self.fetches_total))
            else:
                LOG.debug('Read {0} cdx lines'.format(self.fetches_total))
        parts = line.split(' ')
        [surt_domain, path] = parts[0].split(')', 1)
        if self.count is None:
            self.count = SurtDomainCount(surt_domain)
        if surt_domain != self.count.surt_domain:
            # output accumulated statistics for one SURT domain
            for pair in self.count.output(self.crawl,
                                          self.options.exact_counts,
                                          self.min_surt_hll_size):
                yield pair
            self.urls_total += self.count.unique_urls()
            for url, cnt in self.count.url.items():
                self.urls_hll.add(url)
                self.url_histogram[cnt] += 1
            for digest in self.count.digest:
                self.digest_hll.add(digest)
            self.pages_total += self.count.pages
            self.count = SurtDomainCount(surt_domain)
            self.min_surt_hll_size = MIN_SURT_HLL_SIZE
        json_string = ' '.join(parts[2:])
        try:
            metadata = ujson.loads(json_string)
            self.count.add(path, metadata)
        except ValueError as e:
            LOG.error('Failed to parse json: {0} - {1}'.format(
                e, json_string))

    def count_mapper_final(self):
        self.increment_counter('cdx-stats', 'cdx lines read',
                               self.fetches_total % 1000)
        if self.count is None:
            return
        for pair in self.count.output(self.crawl,
                                      self.options.exact_counts, 1):
            yield pair
        self.urls_total += self.count.unique_urls()
        for url, cnt in self.count.url.items():
            self.urls_hll.add(url)
            self.url_histogram[cnt] += 1
        for digest in self.count.digest:
            self.digest_hll.add(digest)
        self.pages_total += self.count.pages
        if not self.options.exact_counts:
            for count, frequency in self.url_histogram.items():
                yield ((CST.histogram.value, CST.url.value, self.crawl,
                        CST.page.value, count), frequency)
        yield (CST.size.value, CST.page.value, self.crawl), self.pages_total
        yield (CST.size.value, CST.fetch.value, self.crawl), self.fetches_total
        if not self.options.exact_counts:
            yield (CST.size.value, CST.url.value, self.crawl), self.urls_total
        yield ((CST.size_estimate.value, CST.url.value, self.crawl),
               CrawlStatsJSONEncoder.json_encode_hyperloglog(self.urls_hll))
        yield ((CST.size_estimate.value, CST.digest.value, self.crawl),
               CrawlStatsJSONEncoder.json_encode_hyperloglog(self.digest_hll))
        self.increment_counter('cdx-stats', 'cdx files finished', 1)

    def reducer_init(self):
        self.counters = Counter()
        self.mostfrequent = defaultdict(list)

    def count_reducer(self, key, values):
        outputType = key[0]
        if outputType in (CST.size.value, CST.size_robotstxt.value):
            yield key, sum(values)
        elif outputType == CST.histogram.value:
            yield key, sum(values)
        elif outputType in (CST.url.value, CST.digest.value):
            # only with --exact-counts
            crawls = MonthlyCrawlSet()
            new_crawls = set()
            page_count = MultiCount(2)
            for val in values:
                if type(val) is list:
                    if (outputType == CST.url.value):
                        (crawl, pages) = val
                        page_count.incr(crawl, pages, 1)
                    else:  # digest
                        (crawl, (pages, urls)) = val
                        page_count.incr(crawl, pages, urls)
                    crawls.add(crawl)
                    new_crawls.add(crawl)
                else:
                    # crawl set bit mask
                    crawls.update(val)
            yield key, crawls.get_bits()
            for new_crawl in new_crawls:
                if crawls.is_new(new_crawl):
                    self.counters[(CST.new_items.value,
                                   outputType, new_crawl)] += 1
            # url/digest duplicate histograms
            for crawl, counts in page_count.items():
                items = (1 + counts[0] - counts[1])
                self.counters[(CST.histogram.value, outputType, crawl,
                               CST.page.value, items)] += 1
            # size in terms of unique URLs and unique content digests
            for crawl, counts in page_count.items():
                self.counters[(CST.size.value, outputType, crawl)] += 1
        elif outputType in (CST.mimetype.value,
                            CST.mimetype_detected.value,
                            CST.charset.value,
                            CST.languages.value,
                            CST.primary_language.value,
                            CST.scheme.value,
                            CST.tld.value,
                            CST.domain.value,
                            CST.surt_domain.value,
                            CST.host.value,
                            CST.http_status.value,
                            CST.robotstxt_status.value):
            yield key, MultiCount.sum_values(values)
        elif outputType == CST.size_estimate.value:
            hll = HyperLogLog(HYPERLOGLOG_ERROR)
            for val in values:
                hll.update(CrawlStatsJSONDecoder.json_decode_hyperloglog(val))
            yield (key, CrawlStatsJSONEncoder.json_encode_hyperloglog(hll))
        elif outputType == CST.size_estimate_for.value:
            res = None
            hll = None
            cnt = 0
            for val in values:
                if res:
                    if hll is None:
                        cnt = res[0]
                        hll = CrawlStatsJSONDecoder.json_decode_hyperloglog(
                            res[1])
                    cnt += val[0]
                    hll.update(
                        CrawlStatsJSONDecoder.json_decode_hyperloglog(val[1]))
                else:
                    res = val
            if hll is not None and cnt >= MIN_SURT_HLL_SIZE:
                yield (key,
                       (cnt,
                        CrawlStatsJSONEncoder.json_encode_hyperloglog(hll)))
            elif res[0] >= MIN_SURT_HLL_SIZE:
                yield (key, res)
        else:
            raise UnhandledTypeError(outputType)

    def stats_mapper_init(self):
        self.counters = Counter()

    def stats_mapper(self, key, value):
        if key[0] in (CST.url.value, CST.digest.value,
                      CST.size_estimate_for.value):
            return
        if ((self.options.min_domain_frequency > 1) and
                (key[0] in (CST.host.value, CST.domain.value,
                            CST.surt_domain.value))):
            # quick skip of infrequent host and domains,
            # significantly limits amount of tuples processed in reducer
            page_count = MultiCount.get_count(0, value)
            url_count = MultiCount.get_count(1, value)
            self.counters[(CST.size.value, key[0], key[2])] += 1
            self.counters[(CST.histogram.value, key[0], key[2],
                           CST.page.value, page_count)] += 1
            self.counters[(CST.histogram.value, key[0], key[2],
                           CST.url.value, url_count)] += 1
            if key[0] in (CST.domain.value, CST.surt_domain.value):
                host_count = MultiCount.get_count(2, value)
                self.counters[(CST.histogram.value, key[0], key[2],
                               CST.host.value, host_count)] += 1
            if url_count < self.options.min_domain_frequency:
                return
        if key[0] == CST.languages.value:
            # yield only frequent language combinations (if configured)
            page_count = MultiCount.get_count(0, value)
            if ((self.options.min_lang_comb_freq > 1) and
                    (page_count < self.options.min_lang_comb_freq) and
                    (',' in key[1])):
                return
        yield key, value

    def stats_mapper_final(self):
        for (counter, count) in self.counters.items():
            yield counter, count

    def stats_reducer(self, key, values):
        outputType = CST(key[0])
        item = key[1]
        crawl = MonthlyCrawl.to_name(key[2])
        if outputType in (CST.size, CST.new_items,
                          CST.size_estimate, CST.size_robotstxt):
            verbose_key = (outputType.name, CST(item).name, crawl)
            if outputType in (CST.size, CST.size_robotstxt):
                val = sum(values)
            elif outputType == CST.new_items:
                val = MultiCount.sum_values(values)
            elif outputType == CST.size_estimate:
                # already "reduced" in count job
                for val in values:
                    break
            yield verbose_key, val
        elif outputType == CST.histogram:
            yield ((outputType.name, CST(item).name, crawl,
                    CST(key[3]).name, key[4]), sum(values))
        elif outputType in (CST.mimetype, CST.mimetype_detected, CST.charset,
                            CST.languages, CST.primary_language, CST.scheme,
                            CST.surt_domain, CST.tld, CST.domain, CST.host,
                            CST.http_status, CST.robotstxt_status):
            item = key[1]
            for counts in values:
                page_count = MultiCount.get_count(0, counts)
                url_count = MultiCount.get_count(1, counts)
                if outputType in (CST.domain, CST.surt_domain, CST.tld):
                    host_count = MultiCount.get_count(2, counts)
                if (self.options.min_domain_frequency <= 1 or
                        outputType not in (CST.host, CST.domain,
                                           CST.surt_domain)):
                    self.counters[(CST.size.name,
                                   outputType.name, crawl)] += 1
                    self.counters[(CST.histogram.name, outputType.name,
                                   crawl, CST.page.name, page_count)] += 1
                    self.counters[(CST.histogram.name, outputType.name,
                                   crawl, CST.url.name, url_count)] += 1
                    if outputType in (CST.domain, CST.surt_domain, CST.tld):
                        self.counters[(CST.histogram.name, outputType.name,
                                       crawl, CST.host.name,
                                       host_count)] += 1
                    if outputType == CST.tld:
                        domain_count = MultiCount.get_count(3, counts)
                        self.counters[(CST.histogram.name, outputType.name,
                                       crawl, CST.domain.name,
                                       domain_count)] += 1
                if outputType in (CST.domain, CST.host, CST.surt_domain):
                    outKey = (outputType.name, crawl)
                    outVal = (page_count, url_count, item)
                    if outputType in (CST.domain, CST.surt_domain):
                        outVal = (page_count, url_count, host_count, item)
                    # take most common
                    if len(self.mostfrequent[outKey]) < self.options.max_hosts:
                        heapq.heappush(self.mostfrequent[outKey], outVal)
                    else:
                        heapq.heappushpop(self.mostfrequent[outKey], outVal)
                else:
                    yield ((outputType.name, item, crawl), counts)
        else:
            raise UnhandledTypeError(outputType)

    def reducer_final(self):
        for (counter, count) in self.counters.items():
            yield counter, count
        for key, mostfrequent in self.mostfrequent.items():
            (outputType, crawl) = key
            if outputType in (CST.domain.name, CST.surt_domain.name):
                for (pages, urls, hosts, item) in mostfrequent:
                    yield ((outputType, item, crawl),
                           MultiCount.compress(3, [pages, urls, hosts]))
            else:
                for (pages, urls, item) in mostfrequent:
                    yield ((outputType, item, crawl),
                           MultiCount.compress(2, [pages, urls]))

    def steps(self):
        reduces = 10
        cdxminsplitsize = 2**32  # do not split cdx map input files
        if self.options.exact_counts:
            # with exact counts need many reducers to aggregate the counts
            # in reasonable time and to get not too large partitions
            reduces = 200
        count_job = \
            MRStep(mapper_init=self.count_mapper_init,
                   mapper=self.count_mapper,
                   mapper_final=self.count_mapper_final,
                   reducer_init=self.reducer_init,
                   reducer=self.count_reducer,
                   reducer_final=self.reducer_final,
                   jobconf={'mapreduce.job.reduces': reduces,
                            'mapreduce.input.fileinputformat.split.minsize':
                                cdxminsplitsize,
                            'mapreduce.output.fileoutputformat.compress':
                                "true",
                            'mapreduce.output.fileoutputformat.compress.codec':
                                'org.apache.hadoop.io.compress.BZip2Codec'})
        stats_job = \
            MRStep(mapper_init=self.stats_mapper_init,
                   mapper=self.stats_mapper,
                   mapper_final=self.stats_mapper_final,
                   reducer_init=self.reducer_init,
                   reducer=self.stats_reducer,
                   reducer_final=self.reducer_final,
                   jobconf={'mapreduce.job.reduces': 1,
                            'mapreduce.output.fileoutputformat.compress':
                                "true",
                            'mapreduce.output.fileoutputformat.compress.codec':
                                'org.apache.hadoop.io.compress.GzipCodec'})
        if self.options.job_to_run == 'count':
            return [count_job]
        if self.options.job_to_run == 'stats':
            return [stats_job]
        return [count_job, stats_job]
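# Hedged usage sketch (not part of the class above): like any mrjob job,
# CCStatsJob would typically be launched from a small __main__ block; the
# script and file names below are placeholder assumptions.
if __name__ == '__main__':
    CCStatsJob.run()

# e.g. (assumed invocation, two-step pipeline driven by the --job option):
#   python crawlstats.py --job=count cdx/*.gz > counts.json
#   python crawlstats.py --job=stats counts.json > stats.json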
            continue
        # Try to parse event data
        try:
            data = loads(event.data)
        except ValueError:
            continue
        # Ignore other wiki sources like
        # wikimedia, wikidata, wikisource, wiktionary, ...
        server_search = 'wikipedia.org'
        if server_search not in data['server_name']:
            continue
        # Add page title to HyperLogLog and Set
        hll.add(data['title'])
        hll_custom.add(data['title'])
        naive.add(data['title'])
        write_counter += 1
        # ... Keep track of current approximation and size
        dtn = datetime.now()
        print(f'[{dtn}] +{dtn - start_time}', end='\r')
        # Write Log
        if dtn - start_time >= duration:
            # ... Log results
            duration_counter += 1
            duration = timedelta(hours=duration_counter)
            if duration_counter > max_duration:
from hyperloglog import HyperLogLog

sheep_seen = set()
sheep_seen_hll = HyperLogLog(0.01)

for m in range(0, 100000):
    sheep_id = str(m)
    sheep_seen.add(sheep_id)
    sheep_seen_hll.add(sheep_id)

print(f"There are {len(sheep_seen)} sheep (set).")
print(f"There are {len(sheep_seen_hll)} sheep (hyperloglog).")
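# Follow-up sketch (illustrative, not from the original snippet): the two
# counts can be compared directly to see the approximation error of the
# HyperLogLog estimate, which should stay within a few percent here.
exact = len(sheep_seen)
estimate = len(sheep_seen_hll)
print(f"Relative error: {abs(estimate - exact) / exact:.4%}")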