import csv
import logging
import os
from operator import itemgetter

# Row schema shared by all readers below:
# (dt, sid, bcookie, referrer, target, refsite, refcat, targetsite, targetcat)
# SITE_IDX and CAT_IDX were referenced but never defined in this file; the
# values below assume the refsite/refcat columns (use 7/8 if
# targetsite/targetcat was intended).
SITE_IDX = 5
CAT_IDX = 6


def find_popular_targets(datadir, site, dest):
    targets = {}
    logging.debug('Reading data.')
    for row in generate_pig_data(datadir):
        user = row[1].strip()
        curr_site = row[SITE_IDX]
        target = row[4].strip()
        assert user != ''
        if curr_site == site:
            if user not in targets:
                targets[user] = {}
            if target not in targets[user]:
                targets[user][target] = 0
            targets[user][target] += 1
    logging.debug('Writing.')
    with open(dest, 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        for user in targets:
            sorted_targets = sorted(targets[user].items(), reverse=True, key=itemgetter(1))
            for target, count in sorted_targets:
                writer.writerow([user, target, count])

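
# generate_pig_data() is called throughout but not defined here. A minimal
# sketch of the assumed behavior: iterate over the part-* files that Pig
# writes into an output directory and yield each line as a list of
# tab-separated fields. The part-* naming and the tab delimiter are
# assumptions, not taken from this file.
def generate_pig_data_sketch(datadir):
    for name in sorted(os.listdir(datadir)):
        if not name.startswith('part-'):
            continue  # skip _SUCCESS markers and other non-data files
        with open(os.path.join(datadir, name)) as f:
            for line in f:
                yield line.rstrip('\n').split('\t')
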
def read_data(datadir, category_index, domain_level=None):
    index = {}
    all_clicks = 0
    clicks_without_user = 0
    for row in generate_pig_data(datadir):
        all_clicks += 1
        # category_index is 5 for refsite or 6 for refcat.
        category = row[category_index]
        user = row[1].strip()
        if user == '':
            clicks_without_user += 1
            continue
        if domain_level is not None:
            target = nth_level_domain(row[4], domain_level)
        else:
            target = row[4]
        if category not in index:
            index[category] = {}
        if user not in index[category]:
            index[category][user] = {}
        if target not in index[category][user]:
            index[category][user][target] = 0
        index[category][user][target] += 1
    logging.debug('All clicks: %d' % all_clicks)
    logging.debug('Clicks without users: %d' % clicks_without_user)
    return index

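
# nth_level_domain() is likewise assumed to come from elsewhere in the
# package. A minimal sketch, assuming it truncates a hostname to its last
# n labels, e.g. nth_level_domain('news.example.com', 2) -> 'example.com';
# special cases such as country-code TLDs are not handled here.
def nth_level_domain_sketch(domain, n):
    labels = domain.strip().lower().split('.')
    return '.'.join(labels[-n:])
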
def read_pagerank_counts(dirpath):
    pageranks = {}
    for row in generate_pig_data(dirpath):
        pr = float(row[1])
        if pr not in pageranks:
            pageranks[pr] = 0
        pageranks[pr] += 1
    return pageranks

def create_news_list(src_dir, dest):
    news_targets = set()
    for row in generate_pig_data(src_dir):
        target = row[4].lower().strip()
        refsite = row[5].strip()
        if refsite == 'googlenews':
            # Set membership makes an explicit "not in" check redundant.
            news_targets.add(target)
    with open(dest, 'w') as f:
        for target in news_targets:
            f.write('%s\n' % target)

def create_news_dataset(datadir, dest, news_urls):
    # news_urls: set of news target URLs, e.g. loaded from the file written
    # by create_news_list(). The original signature took an unused num_clicks
    # argument while the body referenced an undefined news_urls name.
    if not os.path.exists(os.path.dirname(dest)):
        os.makedirs(os.path.dirname(dest))
    with open(dest, 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        for row in generate_pig_data(datadir):
            target = row[4].lower().strip()
            if target in news_urls:
                writer.writerow(row)

def collect_pr_vs_volume_data(clicks_dir, dest):
    prs = {}
    for row in generate_pig_data(clicks_dir):
        pr = row[-1]
        if pr not in prs:
            prs[pr] = 0
        prs[pr] += 1
    # Sort by numeric pagerank, descending, but keep the original string keys
    # so the output preserves the input formatting.
    sorted_prs = sorted([(float(pr), pr) for pr in prs.keys()], reverse=True)
    with open(dest, 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        for (pr_fl, pr_str) in sorted_prs:
            writer.writerow([pr_str, prs[pr_str]])

def create_pageranks_dataset(pageranks_dir, dest, news_urls):
    if not os.path.exists(os.path.dirname(dest)):
        os.makedirs(os.path.dirname(dest))
    with open(dest, 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        for row in generate_pig_data(pageranks_dir):
            domain = row[0].strip().lower()
            # Parsed only to fail fast on malformed pagerank values; the
            # original string row is what gets written.
            pr = float(row[1])
            if domain in news_urls:
                writer.writerow(row)

def index_data(datadir, category_index):
    index = {}
    for row in generate_pig_data(datadir):
        # category_index is 5 for refsite or 6 for refcat. These rows are
        # expected to carry a tenth column at index 9: the pagerank joined
        # onto each click.
        category = row[category_index]
        user = row[1]
        pagerank = float(row[9])
        if category not in index:
            index[category] = {}
        if user not in index[category]:
            index[category][user] = {}
        if pagerank not in index[category][user]:
            index[category][user][pagerank] = 0
        index[category][user][pagerank] += 1
    return index

def reduce_news_dataset(datadir, dest, num_clicks):
    if not os.path.exists(os.path.dirname(dest)):
        os.makedirs(os.path.dirname(dest))
    logging.info('Computing click counts.')
    click_counts = compute_user_click_counts(datadir)
    logging.debug('Num users: %d' % len(click_counts))
    logging.info('Computing good users.')
    good_users = {user: click_count
                  for user, click_count in click_counts.items()
                  if click_count >= num_clicks}
    logging.debug('Num good users: %d' % len(good_users))
    logging.info('Writing results.')
    with open(dest, 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        for row in generate_pig_data(datadir):
            user = row[1].strip()
            if user in good_users:
                writer.writerow(row)

def count_users_per_cat(datadir, least_num_clicks):
    counts = {}
    logging.debug('Counting click counts per user per cat.')
    for row in generate_pig_data(datadir):
        user = row[1].strip()
        site = row[SITE_IDX]
        cat = row[CAT_IDX]
        assert user != ''
        if site not in counts:
            counts[site] = {}
        if cat not in counts:
            counts[cat] = {}
        if user not in counts[site]:
            counts[site][user] = 0
        if user not in counts[cat]:
            counts[cat][user] = 0
        counts[site][user] += 1
        counts[cat][user] += 1
    logging.debug('Counts computed. Processing results.')
    # counts is keyed by both site and category names, so the totals below
    # cover both groupings.
    cat_counts = {}
    for cat in counts:
        if cat not in cat_counts:
            cat_counts[cat] = 0
        if least_num_clicks is not None:
            for user in counts[cat]:
                if counts[cat][user] >= least_num_clicks:
                    cat_counts[cat] += 1
        else:
            cat_counts[cat] = len(counts[cat])
    for cat in sorted(cat_counts.keys()):
        logging.info('%s\t%d' % (cat, cat_counts[cat]))

def compute_user_click_counts(datadir):
    click_counts = {}
    all_clicks = 0
    clicks_without_user = 0
    for row in generate_pig_data(datadir):
        all_clicks += 1
        user = row[1].strip()
        if user == '':
            clicks_without_user += 1
            continue
        if user not in click_counts:
            click_counts[user] = 0
        click_counts[user] += 1
    logging.debug('All clicks: %d' % all_clicks)
    logging.debug('Clicks without users: %d' % clicks_without_user)
    return click_counts
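

# A hypothetical end-to-end run of the news pipeline above; the paths and the
# 10-click threshold are made up for illustration, not taken from this file.
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    create_news_list('data/clicks', 'out/news_urls.txt')
    with open('out/news_urls.txt') as f:
        news_urls = set(line.strip() for line in f)
    create_news_dataset('data/clicks', 'out/news_clicks.tsv', news_urls)
    # reduce_news_dataset() reads a Pig output directory, keeping only rows
    # from users with at least 10 clicks.
    reduce_news_dataset('data/news_clicks', 'out/reduced_news_clicks.tsv', 10)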