Example #1
0
 def transform_data(self):
     """Flatten the per-TLD, per-crawl counts into columnar dicts and
     convert ``self.tld_stats`` into a pandas DataFrame.

     For crawls without host/domain counts (detected when the URL count
     equals the host count for every TLD of that crawl) the 'hosts' and
     'domains' cells are removed so they show up as missing values in
     the resulting DataFrame.
     """
     crawl_has_host_domain_counts = {}
     for tld in self.tlds:
         tld_repr = tld
         tld_obj = None
         if tld in ('', '(ip address)'):
             continue  # skip pseudo-TLD buckets
         try:
             tld_obj = TopLevelDomain(tld)
             tld_repr = tld_obj.tld
         # was a bare "except:": narrowed so KeyboardInterrupt/SystemExit
         # are no longer swallowed
         except Exception:
             print('error', tld)
             continue
         for crawl in self.tlds[tld]:
             self.tld_stats['suffix'][self.N] = tld_repr
             self.tld_stats['crawl'][self.N] = crawl
             date = pandas.Timestamp(MonthlyCrawl.date_of(crawl))
             self.tld_stats['date'][self.N] = date
             if tld_obj:
                 self.tld_stats['type'][self.N] \
                     = TopLevelDomain.short_type(tld_obj.tld_type)
                 self.tld_stats['subtype'][self.N] = tld_obj.sub_type
                 self.tld_stats['tld'][self.N] = tld_obj.first_level
             else:
                 self.tld_stats['type'][self.N] = ''
                 self.tld_stats['subtype'][self.N] = ''
                 self.tld_stats['tld'][self.N] = ''
             value = self.tlds[tld][crawl]
             n_pages = MultiCount.get_count(0, value)
             self.tld_stats['pages'][self.N] = n_pages
             n_urls = MultiCount.get_count(1, value)
             self.tld_stats['urls'][self.N] = n_urls
             n_hosts = MultiCount.get_count(2, value)
             self.tld_stats['hosts'][self.N] = n_hosts
             n_domains = MultiCount.get_count(3, value)
             self.tld_stats['domains'][self.N] = n_domains
             if n_urls != n_hosts:
                 # multi counts including host counts are not (yet)
                 # available for all crawls
                 crawl_has_host_domain_counts[crawl] = True
             elif crawl not in crawl_has_host_domain_counts:
                 crawl_has_host_domain_counts[crawl] = False
             self.N += 1
     for crawl in crawl_has_host_domain_counts:
         if not crawl_has_host_domain_counts[crawl]:
             print('No host and domain counts for', crawl)
             # drop cells of this crawl so pandas records them as NaN
             for n in self.tld_stats['crawl']:
                 if self.tld_stats['crawl'][n] == crawl:
                     del self.tld_stats['hosts'][n]
                     del self.tld_stats['domains'][n]
     self.tld_stats = pandas.DataFrame(self.tld_stats)
Example #2
0
 def add_check_type(self, key, val, requ_type_cst):
     """Accumulate *val* under the normalized type value found in *key*.

     Records whose crawl-stat type (first key element) differs from
     *requ_type_cst* are ignored.
     """
     if CST[key[0]] != requ_type_cst:
         return  # not the record type this collector handles
     raw_value, crawl = key[1], key[2]
     self.crawls.add(crawl)
     norm = self.norm_value(raw_value)
     per_crawl = self.types[norm]
     # merge multi-counts if this crawl was already seen for the value
     per_crawl[crawl] = (MultiCount.sum_values([val, per_crawl[crawl]])
                         if crawl in per_crawl else val)
     self.types_total[norm] += MultiCount.get_count(0, val)
Example #3
0
 def add(self, key, val):
     """Add one MIME-type record to the per-crawl statistics.

     Keys whose crawl-stat type is not ``CST.mimetype`` are skipped.
     """
     if CST[key[0]] != CST.mimetype:
         return
     crawl = key[2]
     self.crawls.add(crawl)
     mime = MimeTypeStats.norm_mimetype(key[1])
     counts = self.types[mime]
     if crawl not in counts:
         counts[crawl] = val
     else:
         counts[crawl] = MultiCount.sum_values([val, counts[crawl]])
     # first component of the multi-count is the page count
     self.types_total[mime] += MultiCount.get_count(0, val)
Example #4
0
 def add_check_type(self, key, val, requ_type_cst):
     """Accumulate *val* for the type value in *key*.

     Size records (``CST.size``) are stored directly in ``self.size``;
     records matching *requ_type_cst* are summed into ``self.types``
     and counted toward ``self.size[crawl]['known_values']``.
     """
     cst = CST[key[0]]
     if cst not in (requ_type_cst, CST.size):
         return
     typeval, crawl = key[1], key[2]
     self.crawls.add(crawl)
     typeval = self.norm_value(typeval)
     if cst == CST.size:
         # plain size record: remember the raw count and stop
         self.size[crawl][typeval] = int(val)
         return
     per_crawl = self.types[typeval]
     if crawl in per_crawl:
         per_crawl[crawl] = MultiCount.sum_values([val, per_crawl[crawl]])
     else:
         per_crawl[crawl] = val
     npages = MultiCount.get_count(0, val)
     self.types_total[typeval] += npages
     size_info = self.size[crawl]
     size_info['known_values'] = size_info.get('known_values', 0) + npages
Example #5
0
 def add_check_type(self, key, val, requ_type_cst):
     """Sum type-value counts per crawl; size records go to ``self.size``."""
     cst = CST[key[0]]
     is_size = cst == CST.size
     if not is_size and cst != requ_type_cst:
         return  # neither a size record nor the requested type
     crawl = key[2]
     self.crawls.add(crawl)
     typeval = self.norm_value(key[1])
     if is_size:
         self.size[crawl][typeval] = int(val)
         return
     bucket = self.types[typeval]
     bucket[crawl] = (val if crawl not in bucket
                      else MultiCount.sum_values([val, bucket[crawl]]))
     pages = MultiCount.get_count(0, val)
     self.types_total[typeval] += pages
     # track how many pages have a known value for this crawl
     if 'known_values' not in self.size[crawl]:
         self.size[crawl]['known_values'] = 0
     self.size[crawl]['known_values'] += pages
Example #6
0
 def transform_data(self, top_n, min_avg_count, check_pattern=None):
     """Prune collected type values and build the final DataFrame.

     Keeps at most *top_n* type values (by total count); all pruned or
     invalid values are folded into a single '<other>' bucket, then the
     per-type/per-crawl counts are flattened into ``self.type_stats``.

     :param top_n: maximum number of distinct type values to keep
     :param min_avg_count: values with average count above
         ``min_avg_count / 10`` are reported when skipped
     :param check_pattern: optional compiled regex; non-matching values
         are treated as invalid and folded into '<other>'
     """
     print(
         "Number of different values after first normalization: {}".format(
             len(self.types)))
     typevals_for_deletion = set()
     # min-heap of (total_count, typeval) holding the top_n candidates
     typevals_mostfrequent = []
     for typeval in self.types:
         total_count = self.types_total[typeval]
         average_count = int(total_count / len(self.crawls))
         # NOTE(review): compares an average *count* against top_n (a
         # rank) — presumably an intentional cheap pre-filter; confirm
         if average_count >= top_n:
             if not check_pattern or check_pattern.match(typeval):
                 print('{}\t{}\t{}'.format(typeval, average_count,
                                           total_count))
                 fval = (total_count, typeval)
                 if len(typevals_mostfrequent) < top_n:
                     heapq.heappush(typevals_mostfrequent, fval)
                 else:
                     # heap full: push new value, evict current minimum
                     heapq.heappushpop(typevals_mostfrequent, fval)
                 continue  # ok, keep this type value
             else:
                 print(
                     'Type value frequent but invalid: <{}> (avg. count = {})'
                     .format(typeval, average_count))
         elif average_count >= (min_avg_count / 10):
             if not check_pattern or check_pattern.match(typeval):
                 print(
                     'Skipped type value because of low frequency: <{}> (avg. count = {})'
                     .format(typeval, average_count))
         # falls through for invalid / low-frequency values
         typevals_for_deletion.add(typeval)
     # map low frequency or invalid type values to empty type
     keep_typevals = set()
     for (_, typeval) in typevals_mostfrequent:
         keep_typevals.add(typeval)
     # values that passed the pre-filter but were evicted from the heap
     for typeval in self.types:
         if (typeval not in keep_typevals
                 and typeval not in typevals_for_deletion):
             print(
                 'Skipped type value because not in top {}: <{}> (avg. count = {})'
                 .format(top_n, typeval,
                         int(self.types_total[typeval] / len(self.crawls))))
             typevals_for_deletion.add(typeval)
     # fold all deleted values into a single '<other>' bucket per crawl
     typevals_other = dict()
     for typeval in typevals_for_deletion:
         for crawl in self.types[typeval]:
             if crawl in typevals_other:
                 val = typevals_other[crawl]
             else:
                 val = 0
             typevals_other[crawl] = \
                 MultiCount.sum_values([val, self.types[typeval][crawl]])
         self.types.pop(typeval, None)
     self.types['<other>'] = typevals_other
     print('Number of different type values after cleaning and'
           ' removal of low frequency types: {}'.format(len(self.types)))
     # flatten remaining counts into the columnar type_stats dicts
     for typeval in self.types:
         for crawl in self.types[typeval]:
             self.type_stats['type'][self.N] = typeval
             self.type_stats['crawl'][self.N] = crawl
             value = self.types[typeval][crawl]
             n_pages = MultiCount.get_count(0, value)
             self.type_stats['pages'][self.N] = n_pages
             n_urls = MultiCount.get_count(1, value)
             self.type_stats['urls'][self.N] = n_urls
             self.N += 1
     self.type_stats = pandas.DataFrame(self.type_stats)
Example #7
0
 def transform_data(self, top_n, min_avg_count):
     """Prune collected MIME types and build the final DataFrame.

     Keeps at most *top_n* MIME types (by total count); pruned or
     invalid types are folded into a single '<other>' bucket, then the
     per-type/per-crawl counts are flattened into ``self.type_stats``.

     :param top_n: maximum number of distinct MIME types to keep
     :param min_avg_count: types with average count above
         ``min_avg_count / 10`` are reported when skipped
     """
     print("Number of different MIME types after first normalization: {}".
           format(len(self.types)))
     mimetypes_for_deletion = set()
     # min-heap of (total_count, mimetype) holding the top_n candidates
     mimetypes_mostfrequent = []
     for mimetype in self.types:
         total_count = self.types_total[mimetype]
         average_count = int(total_count / len(self.crawls))
         # NOTE(review): compares an average *count* against top_n (a
         # rank) — presumably an intentional cheap pre-filter; confirm
         if average_count >= top_n:
             if MimeTypeStats.mime_pattern.match(mimetype):
                 print('{}\t{}\t{}'.format(mimetype, average_count,
                                           total_count))
                 fval = (total_count, mimetype)
                 if len(mimetypes_mostfrequent) < top_n:
                     heapq.heappush(mimetypes_mostfrequent, fval)
                 else:
                     # heap full: push new value, evict current minimum
                     heapq.heappushpop(mimetypes_mostfrequent, fval)
                 continue  # ok, keep this MIME type
             else:
                 print(
                     'MIME type frequent but invalid: <{}> (avg. count = {})'
                     .format(mimetype, average_count))
         elif average_count >= (min_avg_count / 10):
             if MimeTypeStats.mime_pattern.match(mimetype):
                 print(
                     'Skipped MIME type because of low frequency: <{}> (avg. count = {})'
                     .format(mimetype, average_count))
         # falls through for invalid / low-frequency MIME types
         mimetypes_for_deletion.add(mimetype)
     # map low frequency or invalid MIME types to empty type
     keep_mimetypes = set()
     for (_, mimetype) in mimetypes_mostfrequent:
         keep_mimetypes.add(mimetype)
     # types that passed the pre-filter but were evicted from the heap
     for mimetype in self.types:
         if (mimetype not in keep_mimetypes
                 and mimetype not in mimetypes_for_deletion):
             print(
                 'Skipped MIME type because not in top {}: <{}> (avg. count = {})'
                 .format(top_n, mimetype,
                         int(self.types_total[mimetype] /
                             len(self.crawls))))
             mimetypes_for_deletion.add(mimetype)
     # fold all deleted MIME types into one '<other>' bucket per crawl
     mimetypes_other = dict()
     for mimetype in mimetypes_for_deletion:
         for crawl in self.types[mimetype]:
             if crawl in mimetypes_other:
                 val = mimetypes_other[crawl]
             else:
                 val = 0
             mimetypes_other[crawl] = \
                 MultiCount.sum_values([val, self.types[mimetype][crawl]])
         self.types.pop(mimetype, None)
     self.types['<other>'] = mimetypes_other
     print('Number of different MIME types after cleaning and'
           ' removal of low frequency types: {}'.format(len(self.types)))
     # flatten remaining counts into the columnar type_stats dicts
     for mimetype in self.types:
         for crawl in self.types[mimetype]:
             self.type_stats['mimetype'][self.N] = mimetype
             self.type_stats['crawl'][self.N] = crawl
             value = self.types[mimetype][crawl]
             n_pages = MultiCount.get_count(0, value)
             self.type_stats['pages'][self.N] = n_pages
             n_urls = MultiCount.get_count(1, value)
             self.type_stats['urls'][self.N] = n_urls
             self.N += 1
     self.type_stats = pandas.DataFrame(self.type_stats)
def test_multicount():
    """Exercise MultiCount increments, compression and static summing."""
    counter = MultiCount(2)
    counter.incr('a', 1, 1)
    assert counter.get('a') == [1, 1]
    assert counter.get_compressed('a') == 1
    counter.incr('a', 2, 1)
    assert counter.get_compressed('a') == [3, 2]
    assert MultiCount.sum_values(2, [[2, 1], 1]) == [3, 2]
    assert MultiCount.sum_values(3, [[3, 2, 1], [2, 1], 1]) == [6, 4, 3]
    counter.incr('b', 2, 1)
Example #9
0
def test_multicount():
    """Check MultiCount accumulation and value compression behavior."""
    mc = MultiCount(2)
    mc.incr('a', *(1, 1))
    assert [1, 1] == mc.get('a')
    # a multi-count with equal components compresses to a scalar
    assert 1 == mc.get_compressed('a')
    mc.incr('a', *(2, 1))
    assert [3, 2] == mc.get_compressed('a')
    # sum_values accepts a mix of lists and scalars
    assert [3, 2] == MultiCount.sum_values(2, [[2, 1], 1])
    assert [6, 4, 3] == MultiCount.sum_values(3, [[3, 2, 1], [2, 1], 1])
    mc.incr('b', *(2, 1))
Example #10
0
 def transform_data(self, top_n, min_avg_count, check_pattern=None):
     """Prune collected type values, add an '<unknown>' bucket, and
     build the final DataFrame.

     Keeps at most *top_n* type values (by total count); pruned or
     invalid values are folded into '<other>'. Pages without a known
     value (per ``self.size``) are recorded under '<unknown>'. Finally
     the per-type/per-crawl counts are flattened into
     ``self.type_stats``.

     :param top_n: maximum number of distinct type values to keep
     :param min_avg_count: values with average count above
         ``min_avg_count/10`` are reported when skipped
     :param check_pattern: optional compiled regex; non-matching values
         are treated as invalid and folded into '<other>'
     """
     print("Number of different values after first normalization: {}"
           .format(len(self.types)))
     typevals_for_deletion = set()
     # min-heap of (total_count, typeval) holding the top_n candidates
     typevals_mostfrequent = []
     for typeval in self.types:
         total_count = self.types_total[typeval]
         average_count = int(total_count / len(self.crawls))
         # NOTE(review): compares an average *count* against top_n (a
         # rank) — presumably an intentional cheap pre-filter; confirm
         if average_count >= top_n:
             if not check_pattern or check_pattern.match(typeval):
                 print('{}\t{}\t{}'.format(typeval,
                                           average_count, total_count))
                 fval = (total_count, typeval)
                 if len(typevals_mostfrequent) < top_n:
                     heapq.heappush(typevals_mostfrequent, fval)
                 else:
                     # heap full: push new value, evict current minimum
                     heapq.heappushpop(typevals_mostfrequent, fval)
                 continue  # ok, keep this type value
             else:
                 print('Type value frequent but invalid: <{}> (avg. count = {})'
                       .format(typeval, average_count))
         elif average_count >= (min_avg_count/10):
             if not check_pattern or check_pattern.match(typeval):
                 print('Skipped type value because of low frequency: <{}> (avg. count = {})'
                       .format(typeval, average_count))
         # falls through for invalid / low-frequency values
         typevals_for_deletion.add(typeval)
     # map low frequency or invalid type values to empty type
     keep_typevals = set()
     for (_, typeval) in typevals_mostfrequent:
         keep_typevals.add(typeval)
     # values that passed the pre-filter but were evicted from the heap
     for typeval in self.types:
         if (typeval not in keep_typevals and
                 typeval not in typevals_for_deletion):
             print('Skipped type value because not in top {}: <{}> (avg. count = {})'
                   .format(top_n, typeval,
                           int(self.types_total[typeval]/len(self.crawls))))
             typevals_for_deletion.add(typeval)
     # fold all deleted values into a single '<other>' bucket per crawl
     typevals_other = dict()
     for typeval in typevals_for_deletion:
         for crawl in self.types[typeval]:
             if crawl in typevals_other:
                 val = typevals_other[crawl]
             else:
                 val = 0
             typevals_other[crawl] = \
                 MultiCount.sum_values([val, self.types[typeval][crawl]])
         self.types.pop(typeval, None)
     self.types['<other>'] = typevals_other
     print('Number of different type values after cleaning and'
           ' removal of low frequency types: {}'
           .format(len(self.types)))
     # unknown values
     for crawl in self.crawls:
         known_values = 0
         if 'known_values' in self.size[crawl]:
             known_values = self.size[crawl]['known_values']
         # pages not covered by any known value for this crawl
         unknown = (self.size[crawl]['page'] - known_values)
         if unknown > 0:
             print("{} unknown values in {}".format(unknown, crawl))
             self.types['<unknown>'][crawl] = unknown
     # flatten remaining counts into the columnar type_stats dicts
     for typeval in self.types:
         for crawl in self.types[typeval]:
             self.type_stats['type'][self.N] = typeval
             self.type_stats['crawl'][self.N] = crawl
             value = self.types[typeval][crawl]
             n_pages = MultiCount.get_count(0, value)
             self.type_stats['pages'][self.N] = n_pages
             n_urls = MultiCount.get_count(1, value)
             self.type_stats['urls'][self.N] = n_urls
             self.N += 1
     self.type_stats = pandas.DataFrame(self.type_stats)