Esempio n. 1
0
def find_popular_targets(datadir, site, dest):
    """Write per-user target click counts for one site as a TSV file.

    Every row yielded by ``generate_pig_data(datadir)`` whose ``SITE_IDX``
    column equals ``site`` contributes one click for the pair
    (stripped ``row[1]``, stripped ``row[4]``).

    Args:
        datadir: Location handed to ``generate_pig_data``.
        site: Value the ``SITE_IDX`` column must equal for a row to count.
        dest: Path of the tab-separated output file (overwritten).

    Raises:
        AssertionError: If a row has an empty user field after stripping.
    """
    clicks_by_user = {}

    logging.debug('Reading data.')
    for record in generate_pig_data(datadir):
        # Column layout noted by the original author:
        # (dt, sid, bcookie, referrer, target, refsite, refcat, targetsite, targetcat)
        visitor = record[1].strip()
        record_site = record[SITE_IDX]
        clicked = record[4].strip()

        assert visitor != ''

        if record_site != site:
            continue

        per_visitor = clicks_by_user.setdefault(visitor, {})
        per_visitor[clicked] = per_visitor.get(clicked, 0) + 1

    logging.debug('Writing.')
    with open(dest, 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        for visitor, per_visitor in clicks_by_user.items():
            # Most-clicked targets first for each user.
            ranked = sorted(per_visitor.items(),
                            key=itemgetter(1),
                            reverse=True)
            writer.writerows([visitor, clicked, hits]
                             for clicked, hits in ranked)
def read_data(datadir, category_index, domain_level=None):
    """Build a nested click-count index: category -> user -> target -> count.

    Args:
        datadir: Location handed to ``generate_pig_data``.
        category_index: Column of each row used as the category key (the
            original comment says 5 or 6, for refsite or refcat).
        domain_level: When not ``None``, ``row[4]`` is passed through
            ``nth_level_domain(row[4], domain_level)`` before counting;
            otherwise ``row[4]`` is used unchanged.

    Returns:
        dict: ``{category: {user: {target: click_count}}}``. Rows whose
        stripped ``row[1]`` is empty are skipped (but still counted in the
        debug totals).
    """
    index = {}

    total_clicks = 0
    anonymous_clicks = 0
    for record in generate_pig_data(datadir):
        total_clicks += 1
        # Column layout noted by the original author:
        # (dt, sid, bcookie, referrer, target, refsite, refcat, targetsite, targetcat)
        category = record[category_index]
        user = record[1].strip()

        if user == '':
            anonymous_clicks += 1
            continue

        target = (record[4] if domain_level is None
                  else nth_level_domain(record[4], domain_level))

        per_user = index.setdefault(category, {}).setdefault(user, {})
        per_user[target] = per_user.get(target, 0) + 1

    logging.debug('All clicks: %d' % total_clicks)
    logging.debug('Clicks without users: %d' % anonymous_clicks)
    return index
def read_pagerank_counts(dirpath):
    """Count how many rows carry each pagerank value.

    Args:
        dirpath: Location handed to ``generate_pig_data``.

    Returns:
        dict: Maps ``float(row[1])`` to the number of rows with that value.
    """
    histogram = {}
    for record in generate_pig_data(dirpath):
        value = float(record[1])
        histogram[value] = histogram.get(value, 0) + 1
    return histogram
def create_news_list(src_dir, dest):
    """Write the distinct targets referred by 'googlenews', one per line.

    Args:
        src_dir: Location handed to ``generate_pig_data``.
        dest: Path of the output text file (overwritten).
    """
    news_targets = set()
    for record in generate_pig_data(src_dir):
        url = record[4].lower().strip()
        referring_site = record[5].strip()
        if referring_site != 'googlenews':
            continue
        # Adding an element that is already present is a no-op for a set.
        news_targets.add(url)

    with open(dest, 'w') as f:
        f.writelines('%s\n' % url for url in news_targets)
Esempio n. 5
0
def create_news_dataset(datadir, dest, num_clicks):
    """Copy rows whose target is a known news URL into a TSV file.

    A row is kept when its lower-cased, stripped ``row[4]`` is a member of
    ``news_urls``.

    NOTE(review): ``news_urls`` is not a parameter of this function; it is
    looked up as a free (presumably module-level) name that must exist at
    call time -- confirm where it is defined.

    Args:
        datadir: Location handed to ``generate_pig_data``.
        dest: Path of the tab-separated output file (overwritten). Missing
            parent directories are created.
        num_clicks: Accepted for interface compatibility; not used here.
    """
    # os.path.dirname() returns '' for a bare filename, and os.makedirs('')
    # raises, so only create the directory when there is one to create.
    dest_dir = os.path.dirname(dest)
    if dest_dir and not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    with open(dest, 'w') as f:
        writer = csv.writer(f, delimiter='\t')

        for row in generate_pig_data(datadir):
            target = row[4].lower().strip()
            if target in news_urls:
                writer.writerow(row)
def collect_pr_vs_volume_data(clicks_dir, dest):
    """Write click volume per pagerank value, highest pagerank first.

    The last column of every row is treated as the pagerank string. Output
    rows are ``[pagerank_string, click_count]``, ordered by descending
    ``(float(pagerank_string), pagerank_string)``.

    Args:
        clicks_dir: Location handed to ``generate_pig_data``.
        dest: Path of the tab-separated output file (overwritten).
    """
    volume_by_pr = {}
    for record in generate_pig_data(clicks_dir):
        raw_pr = record[-1]
        volume_by_pr[raw_pr] = volume_by_pr.get(raw_pr, 0) + 1

    # Sort numerically; the raw string is kept to look the count back up and
    # to write the value exactly as it appeared in the input.
    ranked = [(float(raw_pr), raw_pr) for raw_pr in volume_by_pr]
    ranked.sort(reverse=True)

    with open(dest, 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        for _, raw_pr in ranked:
            writer.writerow([raw_pr, volume_by_pr[raw_pr]])
Esempio n. 7
0
def create_pageranks_dataset(pageranks_dir, dest, news_urls):
    """Copy pagerank rows whose domain is a known news URL into a TSV file.

    Args:
        pageranks_dir: Location handed to ``generate_pig_data``.
        dest: Path of the tab-separated output file (overwritten). Missing
            parent directories are created.
        news_urls: Container tested with ``in`` against the lower-cased,
            stripped ``row[0]`` of every row.

    Raises:
        ValueError: If ``row[1]`` of any row cannot be converted to float.
    """
    # os.path.dirname() returns '' for a bare filename, and os.makedirs('')
    # raises, so only create the directory when there is one to create.
    dest_dir = os.path.dirname(dest)
    if dest_dir and not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    with open(dest, 'w') as f:
        writer = csv.writer(f, delimiter='\t')

        for row in generate_pig_data(pageranks_dir):
            domain = row[0].strip().lower()
            # The converted value is not used, but the conversion is kept so
            # that a malformed pagerank field still raises as it did before.
            float(row[1])
            if domain in news_urls:
                writer.writerow(row)
Esempio n. 8
0
def index_data(datadir, category_index):
    """Build a nested count index: category -> user -> pagerank -> count.

    Args:
        datadir: Location handed to ``generate_pig_data``.
        category_index: Column of each row used as the category key (the
            original comment says 5 or 6, for refsite or refcat).

    Returns:
        dict: ``{category: {user: {pagerank: row_count}}}`` where ``user`` is
        ``row[1]`` as-is and ``pagerank`` is ``float(row[9])``.
    """
    index = {}
    for record in generate_pig_data(datadir):
        # Column layout noted by the original author:
        # (dt, sid, bcookie, referrer, target, refsite, refcat, targetsite, targetcat)
        category = record[category_index]
        user = record[1]
        pagerank = float(record[9])

        per_user = index.setdefault(category, {}).setdefault(user, {})
        per_user[pagerank] = per_user.get(pagerank, 0) + 1
    return index
Esempio n. 9
0
def reduce_news_dataset(datadir, dest, num_clicks):
    """Keep only the rows of users having at least ``num_clicks`` clicks.

    The data is read twice: once through ``compute_user_click_counts`` to
    get per-user totals, and once more to copy the rows of qualifying users.

    Args:
        datadir: Location handed to ``compute_user_click_counts`` and
            ``generate_pig_data``.
        dest: Path of the tab-separated output file (overwritten). Missing
            parent directories are created.
        num_clicks: Minimum click count (inclusive) for a user to be kept.
    """
    # os.path.dirname() returns '' for a bare filename, and os.makedirs('')
    # raises, so only create the directory when there is one to create.
    dest_dir = os.path.dirname(dest)
    if dest_dir and not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    logging.info('Computing click counts.')
    click_counts = compute_user_click_counts(datadir)
    logging.debug('Num users: %d' % len(click_counts))

    logging.info('Computing good users.')
    # Only membership and size are needed below, so a set is sufficient.
    good_users = {user for user, click_count in click_counts.items()
                  if click_count >= num_clicks}
    logging.debug('Num good users: %d' % len(good_users))

    logging.info('Writing results.')
    with open(dest, 'w') as f:
        writer = csv.writer(f, delimiter="\t")
        for row in generate_pig_data(datadir):
            user = row[1].strip()
            if user in good_users:
                writer.writerow(row)
Esempio n. 10
0
def count_users_per_cat(datadir, least_num_clicks):
    """Log, per site and per category, how many users clicked.

    Site values (``row[SITE_IDX]``) and category values (``row[CAT_IDX]``)
    share one key space: each row adds one click for the user under both
    keys. One ``'<key>\\t<user count>'`` line is logged at INFO level per
    key, in sorted key order.

    Args:
        datadir: Location handed to ``generate_pig_data``.
        least_num_clicks: When not ``None``, only users with at least this
            many clicks under a key are counted for it; when ``None``, every
            user seen under the key is counted.

    Raises:
        AssertionError: If a row has an empty user field after stripping.
    """
    counts = {}

    logging.debug('Counting click counts per user per cat.')
    for record in generate_pig_data(datadir):
        # Column layout noted by the original author:
        # (dt, sid, bcookie, referrer, target, refsite, refcat, targetsite, targetcat)
        user = record[1].strip()
        site = record[SITE_IDX]
        cat = record[CAT_IDX]

        assert user != ''

        # The same tally is kept under the site key and the category key.
        for key in (site, cat):
            per_user = counts.setdefault(key, {})
            per_user[user] = per_user.get(user, 0) + 1

    logging.debug('Counts computed. Processing results.')
    totals = {}
    for key, per_user in counts.items():
        if least_num_clicks is None:
            totals[key] = len(per_user)
        else:
            totals[key] = sum(1 for clicks in per_user.values()
                              if clicks >= least_num_clicks)

    for key in sorted(totals):
        logging.info('%s\t%d' % (key, totals[key]))
Esempio n. 11
0
def compute_user_click_counts(datadir):
    """Count clicks per user.

    Args:
        datadir: Location handed to ``generate_pig_data``.

    Returns:
        dict: Maps each non-empty stripped ``row[1]`` to its number of rows.
        Rows with an empty user are skipped and only reported in the debug
        totals.
    """
    click_counts = {}

    total_clicks = 0
    anonymous_clicks = 0
    for record in generate_pig_data(datadir):
        total_clicks += 1
        # Column layout noted by the original author:
        # (dt, sid, bcookie, referrer, target, refsite, refcat, targetsite, targetcat)
        user = record[1].strip()

        if user == '':
            anonymous_clicks += 1
            continue

        click_counts[user] = click_counts.get(user, 0) + 1

    logging.debug('All clicks: %d' % total_clicks)
    logging.debug('Clicks without users: %d' % anonymous_clicks)
    return click_counts