def train_wordfrequency(n_dims = 50):
    """Build per-class token frequency distributions and a labelled token corpus.

    Steps performed by the statements below:

    1. Create a ``csv_dataloader`` and call its ``load`` method with
       'output/data_cache.pk'.
    2. Split the loaded ids into a positive and a negative id group through
       ``dataloader.balance`` and retrieve the data for each group.
    3. Flatten the per-entry token lists into one list for the whole dataset,
       one for the positive group and one for the negative group.
    4. Build a ``FreqDist`` for each of the three token lists and pass the
       positive and negative ones through ``normalize`` together with the
       whole-dataset distribution.
    5. Print up to 100 keys of each normalized distribution.
    6. Assemble ``corpus`` (positive tokens followed by negative tokens) and
       a parallel ``labels`` list (1 for a positive token, 0 for a negative
       one).

    Args:
        n_dims: Defaults to 50. It is not referenced by any statement below.
            NOTE(review): presumably meant for a later training step --
            confirm against the rest of the file.

    NOTE(review): the statements below contain no ``return``; ``labels`` and
    ``corpus`` are only bound as locals. Confirm how they are consumed.

    This is Python 2 code: it uses the ``print`` statement and the
    ``dict.viewvalues()`` / ``dict.viewkeys()`` methods.
    """
    ### Load data
    # NOTE(review): the '.pk' suffix suggests a pickled cache -- confirm in
    # csv_dataloader.load.
    dataloader = csv_dataloader()
    dataloader.load('output/data_cache.pk')
    print "Read in finished"

    # Use every id held by the loader as the training set.
    train_id = dataloader.id
    # balance() returns three values; the first is discarded here.
    # NOTE(review): the meaning of the 'full' mode is defined in
    # csv_dataloader.balance -- verify there.
    _, pos_id, neg_id = dataloader.balance(train_id, 'full')
    train_data_pos = dataloader.data_retrieve(pos_id)
    train_data_neg = dataloader.data_retrieve(neg_id)
    # sum(<lists>, []) concatenates all values into one flat list, so the
    # values of these mappings must be lists.
    # NOTE(review): this is repeated list concatenation and may get slow for
    # many entries; itertools.chain.from_iterable would be the usual
    # alternative.
    tokens = sum(dataloader.data.viewvalues(), [])
    tokens_pos = sum(train_data_pos['data'].viewvalues(), [])
    tokens_neg = sum(train_data_neg['data'].viewvalues(), [])

    # Frequency distribution over all tokens, handed to normalize() as the
    # second argument for both classes.
    fdist_base = FreqDist(tokens)

    # Per-class distributions; each name is rebound to whatever normalize()
    # returns.
    # NOTE(review): presumably normalize() rescales class counts relative to
    # fdist_base -- confirm in its definition.
    fdist_pos = FreqDist(tokens_pos)
    fdist_pos = normalize(fdist_pos, fdist_base)
    fdist_neg = FreqDist(tokens_neg)
    fdist_neg = normalize(fdist_neg, fdist_base)

    # Debug output: at most the first 100 keys of each normalized distribution.
    print list(fdist_pos.viewkeys())[:100]
    print list(fdist_neg.viewkeys())[:100]

    # One label per token: 1 for tokens from the positive group, 0 for tokens
    # from the negative group.
    labels_pos = [1] * len(tokens_pos)
    labels_neg = [0] * len(tokens_neg)

    # labels and corpus are built in the same order (positive first, then
    # negative), so labels[i] belongs to corpus[i].
    labels = labels_pos + labels_neg
    corpus = tokens_pos + tokens_neg