def read_data(datadir, category_index, domain_level=None):
    """Build a nested click-count index from the rows found under *datadir*.

    Rows are produced by ``generate_pig_data(datadir)``. Every row whose
    user field (column 1, whitespace-stripped) is non-empty increments the
    counter stored at ``index[category][user][target]``.

    Args:
        datadir: Passed unchanged to ``generate_pig_data``.
        category_index: Column of the row used as the category key. The
            original note says this is 5 (refsite) or 6 (refcat).
        domain_level: When not ``None``, the target (column 4) is reduced
            with ``nth_level_domain(target, domain_level)`` before counting.

    Returns:
        A plain ``dict`` mapping category -> user -> target -> click count.
    """
    counts = {}
    total_rows = 0
    anonymous_rows = 0

    for record in generate_pig_data(datadir):
        total_rows += 1
        # Row layout:
        # (dt, sid, bcookie, referrer, target, refsite, refcat, targetsite, targetcat)
        category = record[category_index]
        user = record[1].strip()

        # Rows without a user are tallied for the debug log but not indexed.
        if user == '':
            anonymous_rows += 1
            continue

        target = (
            record[4]
            if domain_level is None
            else nth_level_domain(record[4], domain_level)
        )

        per_user = counts.setdefault(category, {}).setdefault(user, {})
        per_user[target] = per_user.get(target, 0) + 1

    logging.debug('All clicks: %d' % total_rows)
    logging.debug('Clicks without users: %d' % anonymous_rows)
    return counts
# Example #2
# 0
def is_member(h, hosts):
    """Report whether host *h* is covered by the list *hosts*.

    A list holding only the empty string matches nothing but the empty
    host. Otherwise *h* is a member when, for at least one level returned
    by ``domain_levels(hosts)``, ``nth_level_domain(h, level)`` appears in
    *hosts*.
    """
    # Special case: the host list is exactly [''].
    if len(hosts) == 1 and hosts[0] == '':
        return h == ''

    return any(
        nth_level_domain(h, level) in hosts
        for level in domain_levels(hosts)
    )
# Example #3
# 0
def should_skip_host(h):
    """Decide whether host *h* should be skipped.

    The host is skipped when any of these holds, checked in order:

    * ``is_ip_address(h)`` is true;
    * ``domain_level(h)`` is at most 1;
    * ``nth_level_domain(h, level)`` is in ``UNWANTED_DOMAINS`` for some
      level in ``DOMAIN_LEVELS``.

    If none applies, the result of
    ``fnmatches_multiple(UNWANTED_PATTERNS, h)`` is returned as-is.
    """
    if is_ip_address(h) or domain_level(h) <= 1:
        return True

    if any(
        nth_level_domain(h, level) in UNWANTED_DOMAINS
        for level in DOMAIN_LEVELS
    ):
        return True

    return fnmatches_multiple(UNWANTED_PATTERNS, h)
# Example #4
# 0
def parents(url):
    """List the parent domains of *url*, longest parent first.

    >>> parents('facebook.com')
    []
    >>> parents('indiana.facebook.com')
    ['facebook.com']
    >>> parents('1.2.3.news.bbc.co.uk')
    ['2.3.news.bbc.co.uk', '3.news.bbc.co.uk', 'news.bbc.co.uk', 'bbc.co.uk']
    """
    depth = domain_level(url)
    # URLs flagged by is_exception() stop one level earlier (exclusive
    # floor of 2 instead of 1) -- presumably multi-part suffixes such as
    # 'co.uk', as the last example above suggests.
    floor = 2 if is_exception(url) else 1
    return [
        nth_level_domain(url, level)
        for level in range(depth - 1, floor, -1)
    ]
def should_skip_host(h):
    """Return True when *h* falls under one of the unwanted domains.

    For each level in ``DOMAIN_LEVELS`` the host is reduced with
    ``nth_level_domain`` and looked up in ``UNWANTED_DOMAINS``; the first
    hit makes the host skippable.
    """
    return any(
        nth_level_domain(h, level) in UNWANTED_DOMAINS
        for level in DOMAIN_LEVELS
    )
def should_skip_host(h):
    """Return True unless *h* falls under one of the wanted domains.

    For each level in ``DOMAIN_LEVELS`` the host is reduced with
    ``nth_level_domain`` and looked up in ``WANTED_DOMAINS``; a single hit
    means the host is kept (False), and no hit means it is skipped (True).
    """
    wanted = any(
        nth_level_domain(h, level) in WANTED_DOMAINS
        for level in DOMAIN_LEVELS
    )
    return not wanted
def should_skip_host(h):
    """Return True when *h* reduces to an entry of ``IU_TRAFFIC``.

    For each level in ``DOMAIN_LEVELS`` the host is reduced with
    ``nth_level_domain`` and looked up in ``IU_TRAFFIC``; the first hit
    makes the host skippable.
    """
    return any(
        nth_level_domain(h, level) in IU_TRAFFIC
        for level in DOMAIN_LEVELS
    )