from hashmapd.token_counts import TokenCounts

data_dir = "projects/word_vectors/raw/amdata"  # 'amdata'
truncate_at_num_users = 200
truncate_at_num_tokens = 400

# load all data
data = TokenCounts(file_prefix="inc_", data_dir=data_dir)
data.load_from_csv()

# define the truncate filter functions
def truncate_user_filter(user):
    return data.user_ids[user] < truncate_at_num_users

def truncate_token_filter(token):
    return data.token_ids[token] < truncate_at_num_tokens

# this is a really crappy way to do things but anyway
data.write_to_csv(
    file_prefix="truncated_",
    token_filter_fun=truncate_token_filter,
    user_filter_fun=truncate_user_filter
)
# reload the truncated data (not the original "inc_" files) before writing the training CSV
data.load_from_csv(file_prefix="truncated_")
data.write_to_training_csv()
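# ------------------------------------------------------------
# For reference, a minimal sketch of the cutoff-filter idea above, runnable
# without the hashmapd package.  The (user, token, count) row layout and the
# toy dicts below are assumptions for illustration only, not the real
# TokenCounts internals; write_to_csv is simply expected to drop any row
# whose user or token fails its filter.

sample_user_ids = {"alice": 0, "bob": 1, "carol": 2, "dave": 3}  # rank 0 = most frequent
sample_token_ids = {"the": 0, "cat": 1, "sat": 2}
sample_rows = [("alice", "the", 10), ("bob", "cat", 4),
               ("dave", "the", 7), ("carol", "sat", 1)]

def keep_user(user):
    return sample_user_ids[user] < 3     # cf. truncate_at_num_users

def keep_token(token):
    return sample_token_ids[token] < 2   # cf. truncate_at_num_tokens

# keep only rows whose user AND token pass their filters
kept = [(u, t, c) for (u, t, c) in sample_rows if keep_user(u) and keep_token(t)]
print(kept)   # -> [('alice', 'the', 10), ('bob', 'cat', 4)]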
import numpy as np

from hashmapd.token_counts import TokenCounts

# NOTE: data_dir, min_token_count, skip_common_tokens_cutoff, min_user_total
# and included_tokens are set earlier in this script (not shown in this excerpt).

def pre_token_filter(token):
    if ...:  # original filtering condition truncated in this excerpt
        return False
    return True

def user_filter(user):
    if data.user_totals[user] < min_user_total:
        return False
    return True

def token_filter_by_variance(token):
    return token in included_tokens

# ------------------------------------------------------------
# main program starts here

data = TokenCounts(data_dir=data_dir)
data.load_from_csv(min_token_count=min_token_count,
                   skip_common_tokens_cutoff=skip_common_tokens_cutoff,
                   min_user_total=min_user_total)
data.write_to_csv(file_prefix='stage0_',
                  token_filter_fun=pre_token_filter,
                  user_filter_fun=user_filter)
data.load_from_csv(file_prefix='stage0_')

# data.user_token_counts is a list, each element being a tuple of:
# (user_id, token_id, count, token_user_prob)
# token_user_prob is prob(token | user), so these sum to 1 per user.
probs = np.zeros(len(data.user_token_counts), dtype=float)
for i, (user_id, token_id, count, token_user_prob) in enumerate(data.user_token_counts):
    probs[i] = token_user_prob  # But isn't there some vastly more efficient way to do this?!
probs = np.sort(probs)
print 'THE FIRST TEN: ', probs[0:10]
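# ------------------------------------------------------------
# A sketch of the vectorised alternative hinted at in the comment above.
# It assumes user_token_counts really is a plain list of
# (user_id, token_id, count, token_user_prob) tuples, as documented;
# the toy rows below are illustrative only, not real data.
import numpy as np

sample_counts = [(0, 5, 3, 0.30), (0, 9, 7, 0.70), (1, 2, 4, 1.00)]

# pull the probability column in one pass instead of indexing row by row
sample_probs = np.fromiter((row[3] for row in sample_counts),
                           dtype=float, count=len(sample_counts))
sample_probs.sort()
print(sample_probs[:10])   # -> [ 0.3  0.7  1. ]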