import numpy as np

from hashmapd.token_counts import TokenCounts

data_dir = "projects/word_vectors/raw/amdata"  #'amdata'
truncate_at_num_users = 200
truncate_at_num_tokens = 400

# load all data
data = TokenCounts(file_prefix="inc_", data_dir=data_dir)
data.load_from_csv()

# define truncate filters: keep only users/tokens whose ids fall below the cutoffs above
def truncate_user_filter(user):
    return data.user_ids[user] < truncate_at_num_users


def truncate_token_filter(token):
    return data.token_ids[token] < truncate_at_num_tokens


# clunky, but it works: write the truncated counts out to CSV, then re-load them
data.write_to_csv(
    file_prefix="truncated_", token_filter_fun=truncate_token_filter, user_filter_fun=truncate_user_filter
)
data.load_from_csv(file_prefix="truncated_")

data.write_to_training_csv()
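

# ------------------------------------------------------------
# second stage: filter the raw counts, then look at prob(token | user).
# The cutoff values below are illustrative placeholders; tune them to the data set.
min_token_count = 10
skip_common_tokens_cutoff = 50
min_user_total = 100
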
def pre_token_filter(token):
    # assumed condition, by analogy with user_filter below: drop tokens whose
    # total count falls below min_token_count (this assumes TokenCounts keeps
    # a token_totals mapping analogous to user_totals)
    if (data.token_totals[token] < min_token_count):
        return False
    return True

def user_filter(user):
    if (data.user_totals[user]<min_user_total):
        return False
    return True

def token_filter_by_variance(token):
    # keep only tokens in the precomputed, variance-selected included_tokens set
    return (token in included_tokens)


# ------------------------------------------------------------
# main program starts here

data = TokenCounts(data_dir = data_dir)
data.load_from_csv(min_token_count=min_token_count, skip_common_tokens_cutoff=skip_common_tokens_cutoff, min_user_total=min_user_total)

data.write_to_csv(file_prefix='stage0_', token_filter_fun=pre_token_filter, user_filter_fun=user_filter)
data.load_from_csv(file_prefix='stage0_')
# data.user_token_counts is a list, each element being a tuple of:
# (user_id, token_id, count, token_user_prob)
# token_user_prob is prob(token | user), so these sum to 1 per user.
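
# quick sanity check of the claim above (a sketch): accumulate token_user_prob
# per user and confirm each per-user total comes out close to 1.0
from collections import defaultdict
per_user_prob = defaultdict(float)
for user_id, token_id, count, token_user_prob in data.user_token_counts:
    per_user_prob[user_id] += token_user_prob
print('per-user prob totals (min, max): ', min(per_user_prob.values()), max(per_user_prob.values()))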


# collect prob(token | user) for every (user, token) pair, then sort ascending
probs = np.zeros(len(data.user_token_counts), dtype=float)
for i, (user_id, token_id, count, token_user_prob) in enumerate(data.user_token_counts):
    probs[i] = token_user_prob
    # But isn't there some vastly more efficient way to do this?!
    # (see the vectorized sketch below)
probs = np.sort(probs)
print('THE FIRST TEN: ', probs[0:10])
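
# In answer to the question in the loop above: a vectorized sketch that avoids
# the Python-level loop (assuming every field of the user_token_counts tuples
# is numeric, as described above).
counts_arr = np.asarray(data.user_token_counts, dtype=float)
probs_vec = np.sort(counts_arr[:, 3])   # column 3 is token_user_prob
print('THE FIRST TEN (vectorized): ', probs_vec[0:10])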