def transform_data(self):
    """Flatten per-TLD, per-crawl counts into columnar data and build a DataFrame.

    Iterates self.tlds (tld -> crawl -> multi-count value), fills the
    column dicts in self.tld_stats row by row (row index self.N), and
    finally replaces self.tld_stats with a pandas.DataFrame.  For crawls
    which never report distinct URL/host counts, the 'hosts' and
    'domains' cells are removed so they show up as missing data.
    """
    # crawl -> True if at least one record shows real host/domain counts
    crawl_has_host_domain_counts = {}
    for tld in self.tlds:
        tld_repr = tld
        tld_obj = None
        if tld in ('', '(ip address)'):
            # no meaningful suffix to report
            continue
        try:
            tld_obj = TopLevelDomain(tld)
            tld_repr = tld_obj.tld
        except Exception:
            # was a bare `except:` — narrowed so SystemExit and
            # KeyboardInterrupt are no longer swallowed
            print('error', tld)
            continue
        for crawl in self.tlds[tld]:
            self.tld_stats['suffix'][self.N] = tld_repr
            self.tld_stats['crawl'][self.N] = crawl
            date = pandas.Timestamp(MonthlyCrawl.date_of(crawl))
            self.tld_stats['date'][self.N] = date
            if tld_obj:
                self.tld_stats['type'][self.N] \
                    = TopLevelDomain.short_type(tld_obj.tld_type)
                self.tld_stats['subtype'][self.N] = tld_obj.sub_type
                self.tld_stats['tld'][self.N] = tld_obj.first_level
            else:
                self.tld_stats['type'][self.N] = ''
                self.tld_stats['subtype'][self.N] = ''
                self.tld_stats['tld'][self.N] = ''
            value = self.tlds[tld][crawl]
            n_pages = MultiCount.get_count(0, value)
            self.tld_stats['pages'][self.N] = n_pages
            n_urls = MultiCount.get_count(1, value)
            self.tld_stats['urls'][self.N] = n_urls
            n_hosts = MultiCount.get_count(2, value)
            self.tld_stats['hosts'][self.N] = n_hosts
            n_domains = MultiCount.get_count(3, value)
            self.tld_stats['domains'][self.N] = n_domains
            if n_urls != n_hosts:
                # multi counts including host counts are not (yet)
                # available for all crawls
                crawl_has_host_domain_counts[crawl] = True
            elif crawl not in crawl_has_host_domain_counts:
                crawl_has_host_domain_counts[crawl] = False
            self.N += 1
    for crawl in crawl_has_host_domain_counts:
        if not crawl_has_host_domain_counts[crawl]:
            print('No host and domain counts for', crawl)
            # drop the host/domain cells of every row belonging to this
            # crawl so pandas treats them as NaN
            for n in self.tld_stats['crawl']:
                if self.tld_stats['crawl'][n] == crawl:
                    del self.tld_stats['hosts'][n]
                    del self.tld_stats['domains'][n]
    self.tld_stats = pandas.DataFrame(self.tld_stats)
def add_check_type(self, key, val, requ_type_cst):
    """Accumulate the count record *val* if its CST tag equals *requ_type_cst*.

    key is a tuple (cst_name, type_value, crawl).  The value is summed
    into self.types[type_value][crawl] and the page count is added to
    self.types_total[type_value].
    """
    if CST[key[0]] != requ_type_cst:
        return  # record of a different type, ignore
    crawl = key[2]
    self.crawls.add(crawl)
    typeval = self.norm_value(key[1])
    per_crawl = self.types[typeval]
    if crawl in per_crawl:
        per_crawl[crawl] = MultiCount.sum_values([val, per_crawl[crawl]])
    else:
        per_crawl[crawl] = val
    self.types_total[typeval] += MultiCount.get_count(0, val)
def add(self, key, val):
    """Register one per-crawl MIME type count record.

    Ignores records whose CST tag is not CST.mimetype.  The value is
    summed into self.types[mimetype][crawl]; the page count (position 0
    of the multi-count) feeds self.types_total[mimetype].
    """
    if CST[key[0]] != CST.mimetype:
        return  # not a MIME type record
    crawl = key[2]
    self.crawls.add(crawl)
    mimetype = MimeTypeStats.norm_mimetype(key[1])
    counts = self.types[mimetype]
    if crawl in counts:
        counts[crawl] = MultiCount.sum_values([val, counts[crawl]])
    else:
        counts[crawl] = val
    self.types_total[mimetype] += MultiCount.get_count(0, val)
def add_check_type(self, key, val, requ_type_cst):
    """Accumulate a count record, also tracking per-crawl 'size' records.

    Records tagged CST.size carry a plain integer and are stored in
    self.size[crawl]; records matching *requ_type_cst* are summed into
    self.types and counted against self.size[crawl]['known_values'].
    All other records are ignored.
    """
    cst = CST[key[0]]
    if cst not in (requ_type_cst, CST.size):
        return
    crawl = key[2]
    self.crawls.add(crawl)
    typeval = self.norm_value(key[1])
    if cst == CST.size:
        # size records hold a single integer value per crawl
        self.size[crawl][typeval] = int(val)
        return
    per_crawl = self.types[typeval]
    if crawl in per_crawl:
        per_crawl[crawl] = MultiCount.sum_values([val, per_crawl[crawl]])
    else:
        per_crawl[crawl] = val
    npages = MultiCount.get_count(0, val)
    self.types_total[typeval] += npages
    # track how many pages have a known type value for this crawl
    self.size[crawl]['known_values'] = \
        self.size[crawl].get('known_values', 0) + npages
def transform_data(self, top_n, min_avg_count, check_pattern=None):
    """Keep the top_n most frequent type values and build self.type_stats.

    Low-frequency or invalid type values (those not matching
    *check_pattern*, if given) are merged into a synthetic '<other>'
    entry; the surviving per-crawl counts are then flattened into the
    column dicts of self.type_stats and converted to a DataFrame.

    NOTE(fix): the "not in top" message was a string literal broken
    across a line break (a SyntaxError); reconstructed to match the
    sibling log messages.
    """
    print("Number of different values after first normalization: {}".format(
        len(self.types)))
    typevals_for_deletion = set()
    # min-heap of (total_count, typeval) holding the top_n candidates
    typevals_mostfrequent = []
    for typeval in self.types:
        total_count = self.types_total[typeval]
        average_count = int(total_count / len(self.crawls))
        if average_count >= top_n:
            if not check_pattern or check_pattern.match(typeval):
                print('{}\t{}\t{}'.format(typeval, average_count,
                                          total_count))
                fval = (total_count, typeval)
                if len(typevals_mostfrequent) < top_n:
                    heapq.heappush(typevals_mostfrequent, fval)
                else:
                    heapq.heappushpop(typevals_mostfrequent, fval)
                continue  # ok, keep this type value
            else:
                print('Type value frequent but invalid: <{}> (avg. count = {})'
                      .format(typeval, average_count))
        elif average_count >= (min_avg_count / 10):
            if not check_pattern or check_pattern.match(typeval):
                print('Skipped type value because of low frequency:'
                      ' <{}> (avg. count = {})'
                      .format(typeval, average_count))
        typevals_for_deletion.add(typeval)
    # map low frequency or invalid type values to empty type
    keep_typevals = {typeval for (_, typeval) in typevals_mostfrequent}
    for typeval in self.types:
        if (typeval not in keep_typevals
                and typeval not in typevals_for_deletion):
            print('Skipped type value because not in top {}:'
                  ' <{}> (avg. count = {})'
                  .format(top_n, typeval,
                          int(self.types_total[typeval] / len(self.crawls))))
            typevals_for_deletion.add(typeval)
    typevals_other = dict()
    for typeval in typevals_for_deletion:
        for crawl in self.types[typeval]:
            val = typevals_other.get(crawl, 0)
            typevals_other[crawl] = \
                MultiCount.sum_values([val, self.types[typeval][crawl]])
        self.types.pop(typeval, None)
    self.types['<other>'] = typevals_other
    print('Number of different type values after cleaning and'
          ' removal of low frequency types: {}'.format(len(self.types)))
    for typeval in self.types:
        for crawl in self.types[typeval]:
            self.type_stats['type'][self.N] = typeval
            self.type_stats['crawl'][self.N] = crawl
            value = self.types[typeval][crawl]
            self.type_stats['pages'][self.N] = MultiCount.get_count(0, value)
            self.type_stats['urls'][self.N] = MultiCount.get_count(1, value)
            self.N += 1
    self.type_stats = pandas.DataFrame(self.type_stats)
def transform_data(self, top_n, min_avg_count):
    """Keep the top_n most frequent MIME types and build self.type_stats.

    MIME types that are invalid (do not match
    MimeTypeStats.mime_pattern) or too infrequent are merged into a
    synthetic '<other>' entry; the surviving per-crawl counts are
    flattened into the column dicts of self.type_stats and converted to
    a pandas.DataFrame.

    NOTE(fix): the "not in top" message was a string literal broken
    across a line break (a SyntaxError); reconstructed to match the
    sibling log messages.
    """
    print("Number of different MIME types after first normalization: {}".
          format(len(self.types)))
    mimetypes_for_deletion = set()
    # min-heap of (total_count, mimetype) holding the top_n candidates
    mimetypes_mostfrequent = []
    for mimetype in self.types:
        total_count = self.types_total[mimetype]
        average_count = int(total_count / len(self.crawls))
        if average_count >= top_n:
            if MimeTypeStats.mime_pattern.match(mimetype):
                print('{}\t{}\t{}'.format(mimetype, average_count,
                                          total_count))
                fval = (total_count, mimetype)
                if len(mimetypes_mostfrequent) < top_n:
                    heapq.heappush(mimetypes_mostfrequent, fval)
                else:
                    heapq.heappushpop(mimetypes_mostfrequent, fval)
                continue  # ok, keep this MIME type
            else:
                print('MIME type frequent but invalid: <{}> (avg. count = {})'
                      .format(mimetype, average_count))
        elif average_count >= (min_avg_count / 10):
            if MimeTypeStats.mime_pattern.match(mimetype):
                print('Skipped MIME type because of low frequency:'
                      ' <{}> (avg. count = {})'
                      .format(mimetype, average_count))
        mimetypes_for_deletion.add(mimetype)
    # map low frequency or invalid MIME types to empty type
    keep_mimetypes = {mimetype for (_, mimetype) in mimetypes_mostfrequent}
    for mimetype in self.types:
        if (mimetype not in keep_mimetypes
                and mimetype not in mimetypes_for_deletion):
            print('Skipped MIME type because not in top {}:'
                  ' <{}> (avg. count = {})'
                  .format(top_n, mimetype,
                          int(self.types_total[mimetype] / len(self.crawls))))
            mimetypes_for_deletion.add(mimetype)
    mimetypes_other = dict()
    for mimetype in mimetypes_for_deletion:
        for crawl in self.types[mimetype]:
            val = mimetypes_other.get(crawl, 0)
            mimetypes_other[crawl] = \
                MultiCount.sum_values([val, self.types[mimetype][crawl]])
        self.types.pop(mimetype, None)
    self.types['<other>'] = mimetypes_other
    print('Number of different MIME types after cleaning and'
          ' removal of low frequency types: {}'.format(len(self.types)))
    for mimetype in self.types:
        for crawl in self.types[mimetype]:
            self.type_stats['mimetype'][self.N] = mimetype
            self.type_stats['crawl'][self.N] = crawl
            value = self.types[mimetype][crawl]
            self.type_stats['pages'][self.N] = MultiCount.get_count(0, value)
            self.type_stats['urls'][self.N] = MultiCount.get_count(1, value)
            self.N += 1
    self.type_stats = pandas.DataFrame(self.type_stats)
def test_multicount():
    """Exercise MultiCount increment, retrieval and static summation."""
    cnt = MultiCount(2)
    cnt.incr('a', 1, 1)
    assert cnt.get('a') == [1, 1]
    # equal counts compress to a single number
    assert cnt.get_compressed('a') == 1
    cnt.incr('a', 2, 1)
    # counts now differ, so the full list is returned
    assert cnt.get_compressed('a') == [3, 2]
    assert MultiCount.sum_values(2, [[2, 1], 1]) == [3, 2]
    assert MultiCount.sum_values(3, [[3, 2, 1], [2, 1], 1]) == [6, 4, 3]
    cnt.incr('b', *[2, 1])
def test_multicount():
    """Check MultiCount counting behavior and sum_values aggregation."""
    counter = MultiCount(2)
    counter.incr('a', 1, 1)
    assert [1, 1] == counter.get('a')
    assert 1 == counter.get_compressed('a')
    counter.incr('a', 2, 1)
    assert [3, 2] == counter.get_compressed('a')
    # sum_values pads shorter values with the last element
    assert [3, 2] == MultiCount.sum_values(2, [[2, 1], 1])
    assert [6, 4, 3] == MultiCount.sum_values(3, [[3, 2, 1], [2, 1], 1])
    counter.incr('b', *[2, 1])
def transform_data(self, top_n, min_avg_count, check_pattern=None):
    """Keep the top_n most frequent type values and build self.type_stats.

    Like the generic variant, but additionally derives per-crawl
    '<unknown>' counts from self.size[crawl]: the difference between the
    total page count ('page') and the pages with a known type value
    ('known_values').

    NOTE(fix): the "not in top" message was a string literal broken
    across a line break (a SyntaxError); reconstructed to match the
    sibling log messages.
    """
    print("Number of different values after first normalization: {}"
          .format(len(self.types)))
    typevals_for_deletion = set()
    # min-heap of (total_count, typeval) holding the top_n candidates
    typevals_mostfrequent = []
    for typeval in self.types:
        total_count = self.types_total[typeval]
        average_count = int(total_count / len(self.crawls))
        if average_count >= top_n:
            if not check_pattern or check_pattern.match(typeval):
                print('{}\t{}\t{}'.format(typeval, average_count,
                                          total_count))
                fval = (total_count, typeval)
                if len(typevals_mostfrequent) < top_n:
                    heapq.heappush(typevals_mostfrequent, fval)
                else:
                    heapq.heappushpop(typevals_mostfrequent, fval)
                continue  # ok, keep this type value
            else:
                print('Type value frequent but invalid: <{}> (avg. count = {})'
                      .format(typeval, average_count))
        elif average_count >= (min_avg_count / 10):
            if not check_pattern or check_pattern.match(typeval):
                print('Skipped type value because of low frequency:'
                      ' <{}> (avg. count = {})'
                      .format(typeval, average_count))
        typevals_for_deletion.add(typeval)
    # map low frequency or invalid type values to empty type
    keep_typevals = {typeval for (_, typeval) in typevals_mostfrequent}
    for typeval in self.types:
        if (typeval not in keep_typevals
                and typeval not in typevals_for_deletion):
            print('Skipped type value because not in top {}:'
                  ' <{}> (avg. count = {})'
                  .format(top_n, typeval,
                          int(self.types_total[typeval] / len(self.crawls))))
            typevals_for_deletion.add(typeval)
    typevals_other = dict()
    for typeval in typevals_for_deletion:
        for crawl in self.types[typeval]:
            val = typevals_other.get(crawl, 0)
            typevals_other[crawl] = \
                MultiCount.sum_values([val, self.types[typeval][crawl]])
        self.types.pop(typeval, None)
    self.types['<other>'] = typevals_other
    print('Number of different type values after cleaning and'
          ' removal of low frequency types: {}'
          .format(len(self.types)))
    # unknown values: pages without a recognized type value
    for crawl in self.crawls:
        known_values = self.size[crawl].get('known_values', 0)
        unknown = (self.size[crawl]['page'] - known_values)
        if unknown > 0:
            print("{} unknown values in {}".format(unknown, crawl))
            self.types['<unknown>'][crawl] = unknown
    for typeval in self.types:
        for crawl in self.types[typeval]:
            self.type_stats['type'][self.N] = typeval
            self.type_stats['crawl'][self.N] = crawl
            value = self.types[typeval][crawl]
            self.type_stats['pages'][self.N] = MultiCount.get_count(0, value)
            self.type_stats['urls'][self.N] = MultiCount.get_count(1, value)
            self.N += 1
    self.type_stats = pandas.DataFrame(self.type_stats)