def read_data(datadir, category_index, domain_level=None):
    """Build a nested click-count index from the rows of generate_pig_data.

    The result maps ``category -> user -> target -> number of clicks``.

    Args:
        datadir: Passed straight through to ``generate_pig_data``.
        category_index: Position in each row to read the category from
            (per the row layout below, 5 selects refsite and 6 refcat).
        domain_level: When not None, each target (``row[4]``) is reduced via
            ``nth_level_domain(target, domain_level)`` before counting.

    Returns:
        A plain dict of dicts of dicts holding integer click counts.
    """
    index = {}
    total_clicks = 0
    userless_clicks = 0

    for record in generate_pig_data(datadir):
        total_clicks += 1

        # Row layout:
        # (dt, sid, bcookie, referrer, target, refsite, refcat, targetsite, targetcat)
        category = record[category_index]
        user = record[1].strip()

        # Rows with a blank user field are tallied for the debug log only.
        if user == '':
            userless_clicks += 1
            continue

        target = (
            record[4]
            if domain_level is None
            else nth_level_domain(record[4], domain_level)
        )

        per_user = index.setdefault(category, {}).setdefault(user, {})
        per_user[target] = per_user.get(target, 0) + 1

    logging.debug('All clicks: %d' % total_clicks)
    logging.debug('Clicks without users: %d' % userless_clicks)
    return index
def is_member(h, hosts):
    """Tell whether host ``h`` belongs to the collection ``hosts``.

    A ``hosts`` list consisting of exactly one empty string is a special
    case: only the empty host matches it. Otherwise ``h`` is reduced with
    ``nth_level_domain`` at every level reported by ``domain_levels(hosts)``
    and matches as soon as one of those reductions is found in ``hosts``.
    """
    # Special case: the single-empty-string list only matches the empty host.
    if len(hosts) == 1 and hosts[0] == '':
        return h == ''

    return any(
        nth_level_domain(h, level) in hosts
        for level in domain_levels(hosts)
    )
def should_skip_host(h):
    """Decide whether host ``h`` should be excluded.

    A host is skipped when any of these hold, checked in order:
      1. ``is_ip_address(h)`` is true;
      2. ``domain_level(h)`` is 1 or less;
      3. its reduction at some level in ``DOMAIN_LEVELS`` is listed in
         ``UNWANTED_DOMAINS``.
    If none of them hold, the result is whatever
    ``fnmatches_multiple(UNWANTED_PATTERNS, h)`` returns.
    """
    if is_ip_address(h) or domain_level(h) <= 1:
        return True

    listed_as_unwanted = any(
        nth_level_domain(h, level) in UNWANTED_DOMAINS
        for level in DOMAIN_LEVELS
    )
    if listed_as_unwanted:
        return True

    # Last resort: pattern match against the unwanted patterns.
    return fnmatches_multiple(UNWANTED_PATTERNS, h)
def parents(url):
    """Return the parent domains of ``url``, most specific first.

    Levels are walked downward from one below ``domain_level(url)``. The walk
    stops above level 1, or above level 2 when ``is_exception(url)`` is true
    (presumably multi-part suffixes such as ``co.uk``, per the last example).

    >>> parents('facebook.com')
    []
    >>> parents('indiana.facebook.com')
    ['facebook.com']
    >>> parents('1.2.3.news.bbc.co.uk')
    ['2.3.news.bbc.co.uk', '3.news.bbc.co.uk', 'news.bbc.co.uk', 'bbc.co.uk']
    """
    depth = domain_level(url)
    # Exclusive lower bound of the walk; exception hosts stop one level higher.
    floor = 2 if is_exception(url) else 1
    return [
        nth_level_domain(url, level)
        for level in range(depth - 1, floor, -1)
    ]
def should_skip_host(h):
    """Return True when ``h`` falls under a domain in ``UNWANTED_DOMAINS``.

    The host is reduced with ``nth_level_domain`` at each level in
    ``DOMAIN_LEVELS``; a single hit is enough to skip it.
    """
    return any(
        nth_level_domain(h, level) in UNWANTED_DOMAINS
        for level in DOMAIN_LEVELS
    )
def should_skip_host(h):
    """Return True unless ``h`` falls under a domain in ``WANTED_DOMAINS``.

    The host is reduced with ``nth_level_domain`` at each level in
    ``DOMAIN_LEVELS``; it is kept (False) as soon as one reduction is found
    in ``WANTED_DOMAINS`` and skipped (True) otherwise.
    """
    is_wanted = any(
        nth_level_domain(h, level) in WANTED_DOMAINS
        for level in DOMAIN_LEVELS
    )
    return not is_wanted
def should_skip_host(h):
    """Return True when ``h`` falls under a domain listed in ``IU_TRAFFIC``.

    The host is reduced with ``nth_level_domain`` at each level in
    ``DOMAIN_LEVELS``; a single hit is enough to skip it.
    """
    return any(
        nth_level_domain(h, level) in IU_TRAFFIC
        for level in DOMAIN_LEVELS
    )