def train_LDA(n_topics=100):
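    ### Fit (or load from cache) an LDA topic model with n_topics over all tokens
    ### from data/extra_statuses.csv; artifacts are written under output/.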

    ### Load extra data
    dataloader = csv_dataloader(datafile='data/extra_statuses.csv')
    if not os.path.exists('output/extra_cache.pk'):
        dataloader.read_csv(applyfun=preprocess, verbose=True)
        dataloader.save('output/extra_cache.pk')
    else:
        dataloader.load('output/extra_cache.pk')
    tokens = sum(dataloader.ldata.viewvalues(), [])
    print '#Tokens from training data: ' + str(len(tokens))
    print 'Read-in done'

    ### Get word2id first
    word2id = get_word2id()
    if not os.path.exists('output/word2id.pk'):
        word2id.fit(tokens)
        word2id.save('output/word2id.pk')
    else:
        word2id.load('output/word2id.pk')
    ids = word2id.ids()
    print "#Id: " + str(len(ids.keys()))

    ### Train LDA
    topics = get_topics(id2word=ids, method='lda', n_topics=n_topics)
    if not os.path.exists('output/lda_all_'+str(n_topics)+'.pk'):
        print 'Training LDA...'
        topics.fit(tokens)
        topics.save('output/lda_all_'+str(n_topics)+'.pk')
        topics.summary()
    else:
        topics.load('output/lda_all_'+str(n_topics)+'.pk')


def train_clustering(n_topics=100, method='gmm'):
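    ### Embed tokens from data/extra_statuses.csv with a pretrained word2vec model and
    ### fit (or load from cache) an n_topics-component clustering model ('gmm' by default).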

    ### Load extra data
    dataloader = csv_dataloader(datafile='data/extra_statuses.csv')
    CACHE_FILE = 'output/extra_cache.pk'
    if not os.path.exists(CACHE_FILE):
        dataloader.read_csv(applyfun=preprocess, verbose=True)
        dataloader.save(CACHE_FILE)
    else:
        dataloader.load(CACHE_FILE)
    dataloader.summary()
    tokens = sum(dataloader.data.viewvalues(), [])
    print '#Tokens from training data: ' + str(len(tokens))
    print 'Read-in done'

    ### Load pre-trained word2vec model
    word2vec = get_word2vec(model='data/GoogleNews-vectors-negative300.bin', binary=True, size=300)
    print 'Pretrained word2vec loaded'
    ### Convert word to vector

    train_vectors = word2vec.batch_convert(tokens)
    print '#Vectors from training data: ' + str(len(train_vectors))
    save(train_vectors, 'output/extra_vectors.pk')

    ### Train Clustering
    clusters = get_clusters(method=method, n_topics=n_topics)
    if not os.path.exists('output/clustering_'+ method + '_' + str(n_topics) + '.pk'):
        print 'Training Clusters...'
        clusters.fit(train_vectors)
        clusters.save('output/clustering_'+ method + '_' + str(n_topics) + '.pk')
        clusters.summary()
    else:
        print 'Cluster Model Loaded...'
        clusters.load('output/clustering_'+ method + '_' + str(n_topics) + '.pk')


def train_wordfrequency(n_dims=50):
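    ### Build token frequency distributions for positive and negative users, normalize each
    ### against the overall distribution, and print the first 100 keys of each.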
    ### Load data
    dataloader = csv_dataloader()
    dataloader.load('output/data_cache.pk')
    print "Read in finished"

    train_id = dataloader.id
    _, pos_id, neg_id = dataloader.balance(train_id, 'full')
    train_data_pos = dataloader.data_retrieve(pos_id)
    train_data_neg = dataloader.data_retrieve(neg_id)
    tokens = sum(dataloader.data.viewvalues(), [])
    tokens_pos = sum(train_data_pos['data'].viewvalues(), [])
    tokens_neg = sum(train_data_neg['data'].viewvalues(), [])

    fdist_base = FreqDist(tokens)

    fdist_pos = FreqDist(tokens_pos)
    fdist_pos = normalize(fdist_pos, fdist_base)
    fdist_neg = FreqDist(tokens_neg)
    fdist_neg = normalize(fdist_neg, fdist_base)

    print list(fdist_pos.viewkeys())[:100]
    print list(fdist_neg.viewkeys())[:100]

    labels_pos = [1] * len(tokens_pos)
    labels_neg = [0] * len(tokens_neg)

    labels = labels_pos + labels_neg
    corpus = tokens_pos + tokens_neg


def draw_word2vec():
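    ### Embed the most frequent tokens with pretrained word2vec, project them to 2-D with MDS,
    ### and plot the resulting embedding.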
    ### Load data
    dataloader = csv_dataloader()
    dataloader.load("output/data_cache.pk")
    print "Read in finished"

    ### Load pre-trained word2vec model
    word2vec = get_word2vec(model="data/GoogleNews-vectors-negative300.bin", binary=True, size=300)
    print "Pretrained word2vec loaded"

    all_tokens = sum(dataloader.data.viewvalues(), [])
    print "#Tokens: " + str(len(all_tokens))
    fdist = FreqDist(all_tokens)
    tokens = fdist.keys()[1:500]
    print tokens
    tokens_has_vectors = []
    for token in tokens:
        if word2vec[token] is not None:
            tokens_has_vectors.append(token)

    print "#Unique Tokens \w Vectors: " + str(len(tokens_has_vectors))
    vectors = word2vec.encode(tokens_has_vectors)
    print "#Unique Vectors: " + str(len(vectors))

    print ("Computing MDS embedding")
    clf = manifold.MDS(n_components=2, n_init=1, max_iter=2000)
    # clf = manifold.Isomap(n_components=2, max_iter=100)
    vectors_mds = clf.fit_transform(vectors)
    print ("Done. Stress: %f" % clf.stress_)
    plot_embedding(vectors_mds, tokens_has_vectors, "MDS embedding of the words")


def train_LLDA(n_topics=2):
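    ### Fit (or load from cache) a labeled-LDA model on per-user token lists with score-derived labels.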
    ### Load data
    dataloader = csv_dataloader()
    dataloader.load('output/data_cache.pk')
    print "Read in finished"

    train_id = dataloader.id
    train_data = dataloader.data_retrieve(train_id)

    labels = []
    corpus = []

    for id in train_id:
        corpus.append(train_data['ldata'][id])
        labels.append(score2label(train_data['score'][id]))

    ### Train LLDA
    topics = get_topics2(method='LLDA', max_iter=2)
    model_file = 'output/LLDA_8.pk'
    if not os.path.exists(model_file):
        print 'Training LLDA...'
        topics.fit(corpus, labels, verbose=True)
        topics.save(model_file)
    else:
        topics.load(model_file)
    topics.summary()


def predict():
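    ### Encode the test set with the saved LDA + LIWC features, apply the saved classifier,
    ### and write per-user predictions to output/result.csv.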

    dataloader = csv_dataloader(datafile='data/test_statuses.csv', extrafile='data/test_metadata.csv', nolabel=True, extra=True)
    if not os.path.exists('output/test_cache.pk'):
        dataloader.read_csv(applyfun=preprocess)
        dataloader.save('output/test_cache.pk')
    else:
        dataloader.load('output/test_cache.pk')
    dataloader.summary()
    print "Read in finished"

    word2id = get_word2id()
    word2id.load('output/word2id.pk')
    ids = word2id.ids()

    n_topics = 100
    topics = get_topics(id2word=ids, method='lda', n_topics=n_topics)
    topics.load('output/lda_all_100.pk')

    # ### Load pre-trained word2vec model
    # word2vec = get_word2vec(model='data/GoogleNews-vectors-negative300.bin', binary=True, size=300)
    # print 'Pretrained word2vec loaded'
    #
    # n_topics = 100
    # model_file = 'output/clustering_gmm_100.pk';
    # clusters = get_clusters(method='gmm', n_topics=n_topics)
    # print 'Cluster Model Loaded...'
    # clusters.load(model_file)

    ### Calculate LIWC hist
    LIWC = get_LIWC()

    test_id = dataloader.id
    test_data = dataloader.data_retrieve(test_id)

    ## Generate Test Data Encodings
    encode = encode_feature(test_data, test_id, [topics, LIWC])

    print encode

    encode = preprocessing.scale(encode)
    classifier = load('output/model_LDA_100_0.461538461538.pk')
    predict_label = classifier.predict(encode)
    predict_prob = classifier.predict_proba(encode)

    with open('output/result.csv', 'w') as f:
        f.write('userID, binaryPrediction, confidence, regression\n')

        for i in range(len(predict_label)):
            string = test_id[i] + ', '
            if predict_label[i]==1:
                string += '+, '
            else:
                string += '-, '
            string += str(predict_prob[i][1]) + ', '
            string += 'N\n'
            f.write(string)

    print str(sum(predict_label)) + '/' + str(len(predict_label))


def draw_LIWC_hist():
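    ### Plot LIWC category histograms: raw counts over all users (top panel) and normalized
    ### positive vs. negative distributions (bottom panel).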
    ### Load data
    dataloader = csv_dataloader()
    dataloader.load('output/data_cache.pk')
    print "Read in finished"

    train_id = dataloader.id
    train_data = dataloader.data_retrieve(train_id)
    _, pos_id, neg_id = dataloader.balance(train_id, 'full')
    train_data_pos = dataloader.data_retrieve(pos_id)
    train_data_neg = dataloader.data_retrieve(neg_id)

    tokens = sum(train_data['data'].viewvalues(), [])
    tokens_pos = sum(train_data_pos['data'].viewvalues(), [])
    tokens_neg = sum(train_data_neg['data'].viewvalues(), [])

    ### Calculate LIWC hist
    LIWC = get_LIWC()
    LIWC_hist = LIWC.encode(tokens, normalize=False)
    LIWC_hist_pos = LIWC.encode(tokens_pos, normalize=True)
    LIWC_hist_neg = LIWC.encode(tokens_neg, normalize=True)


    fig = plt.figure()
    ax = fig.add_subplot(2,1,1)
    width = 0.3
    bar0 = ax.bar(np.arange(67)+width, LIWC_hist, width)
    #ax.set_xlabel('Category')
    ax.set_ylabel('Frequency')
    ax.set_title('(a)')
    ax = fig.add_subplot(2,1,2)
    bar1 = ax.bar(np.arange(67)+width, LIWC_hist_pos, width, color='r')
    bar2 = ax.bar(np.arange(67)+2*width, LIWC_hist_neg, width, color='g')

    labels = list(LIWC.category.viewvalues())
    ax.legend((bar1[0], bar2[0]), ('Positive', 'Negative'))
    ax.set_xticks(np.arange(67)+2*width)
    ax.set_xticklabels(labels, rotation='vertical')
    ax.set_xlabel('Category')
    ax.set_ylabel('Percentage')
    ax.set_title('(b)')
    ax.grid(True)

    plt.show()


def draw_gmm():
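    ### For each GMM cluster mean, print its nearest-neighbor words in the pretrained word2vec space.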
    ### Load data
    dataloader = csv_dataloader()
    dataloader.load('output/data_cache.pk')
    print "Read in finished"

    ### Load pre-trained word2vec model
    word2vec = get_word2vec(model='data/GoogleNews-vectors-negative300.bin', binary=True, size=300)
    print 'Pretrained word2vec loaded'

    ### Reverse engineering, build vector2word dictionary
    # vec2word = []
    # words =[]
    # for voc, obj in word2vec.model.vocab.items():
    #     words.append(voc)
    #     vec2word.append(word2vec.model.syn0[obj.index])

    all_vectors = word2vec.model.syn0


    ### Load trained GMM clustering model
    n_topics = 100
    model_file = 'output/clustering_gmm_100.pk'
    clusters = get_clusters(method='gmm', n_topics=n_topics)

    clusters.load(model_file)
    print clusters.clusters.means_

    knn = NearestNeighbors(n_neighbors=10)
    knn.fit(all_vectors)
    save(knn, 'output/draw_gmm_knn.pk')
    nns =  knn.kneighbors(clusters.clusters.means_, return_distance=False)
    for i in range(np.shape(nns)[0]):
        print 'Topic ' + str(i+1) + ': ',
        for j in range(np.shape(nns)[1]):
            print str(word2vec.model.index2word[nns[i,j]]) + ' ',
        print ''


def main_LDA(n_fold=10):
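    ### n-fold cross-validation with LDA topic + LIWC features: SMOTE-oversample the positive
    ### class, fit a LinearSVC per fold, and save the best model by F1 score.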
    ### Load training data
    dataloader = csv_dataloader(extrafile='data/fixed_train_gender_class.csv', extra=True)
    if not os.path.exists('output/data_cache.pk'):
        dataloader.read_csv(applyfun=preprocess)
        dataloader.save('output/data_cache.pk')
    else:
        dataloader.load('output/data_cache.pk')
    dataloader.summary()
    print "Read in finished"

    ### Get word2id first
    tokens = sum(dataloader.ldata.viewvalues(), [])
    word2id = get_word2id()
    if not os.path.exists('output/word2id.pk'):
        word2id.fit(tokens)
        word2id.save('output/word2id.pk')
    else:
        word2id.load('output/word2id.pk')
    ids = word2id.ids()
    print "#Id: " + str(len(ids.keys()))
    print '#Tokens from training data: ' + str(len(tokens))

    ### Calculate LIWC hist
    LIWC = get_LIWC()
    #print LIWC.calculate_hist(tokens, normalize=False)

    ### Train and load LDA
    n_topics = 25
    model_file = 'output/lda_all_25.pk'
    topics = get_topics(id2word=ids, method='lda', n_topics=n_topics)
    if not os.path.exists(model_file):
        print 'Training LDA...'
        topics.fit(tokens)
        topics.save(model_file)
        topics.summary()
    else:
        topics.load(model_file)


    ### ============================================================
    ###                         n fold
    ### ============================================================

    nfolds = dataloader.nfold(n_fold)
    fscores = []
    models = []

    for fold_ind in range(n_fold):

        print '======================== FOLD ' + str(fold_ind+1) + '========================'

        test_id = nfolds[fold_ind]
        train_id = []
        for i in range(n_fold):
            if i != fold_ind:
                train_id += nfolds[i]


        ### ============================================================
        ###                         Train Part
        ### ============================================================
        print 'Training>>>>>>>>>>>>>>>>>>>>>>>>>'

        train_data = dataloader.data_retrieve(train_id)

        ### Balance Train Data
        _, train_pos_id, train_neg_id = dataloader.balance(train_id, K=2)

        encode_pos = encode_feature(train_data, train_pos_id, [topics, LIWC])
        encode_pos = SMOTE(encode_pos, 200, len(train_pos_id)/4)
        label_pos = np.ones(len(encode_pos))

        encode_neg = encode_feature(train_data, train_neg_id, [topics, LIWC])
        label_neg = np.zeros(len(encode_neg))

        encode = np.concatenate((encode_pos, encode_neg), axis=0)
        label = np.concatenate((label_pos, label_neg), axis=0)
        print encode.shape
        print label.shape

        ### Train
        encode = preprocessing.scale(encode)
        classifier = svm.LinearSVC(verbose=True)
        classifier.fit(encode, label)
        print 'F1 score: ' + str(f1_score(label, classifier.predict(encode)))


        ### ============================================================
        ###                         Test Part
        ### ============================================================
        print 'Testing>>>>>>>>>>>>>>>>>>>>>>>>>'

        test_data = dataloader.data_retrieve(test_id)

        ### Generate Test Data Encodings
        encode = encode_feature(test_data, test_id, [topics, LIWC])
        label = dataloader.label_retrieve(test_id)

        ### Test
        encode = preprocessing.scale(encode)
        print 'F1 score: ' + str(f1_score(label, classifier.predict(encode)))

        fscores.append(f1_score(label, classifier.predict(encode)))
        models.append(classifier)

    print 'MEAN F1 score: ' + str(np.mean(fscores))
    print 'BEST F1 score: ' + str(np.max(fscores)) + ' by Model ' + str(np.argmax(fscores)+1)
    print 'VAR F1 score: ' + str(np.var(fscores))

    save(models[np.argmax(fscores)], 'output/model_LDA_' + str(n_topics) + '_' + str(fscores[np.argmax(fscores)]) + '.pk')


def main_LLDA(n_fold=10):
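    ### Same n-fold pipeline as main_LDA, but with labeled-LDA (LLDA) topic features plus LIWC.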
    ### Load training data
    dataloader = csv_dataloader(extrafile="data/fixed_train_gender_class.csv", extra=True)
    if not os.path.exists("output/data_cache.pk"):
        dataloader.read_csv(applyfun=preprocess)
        dataloader.save("output/data_cache.pk")
    else:
        dataloader.load("output/data_cache.pk")
    dataloader.summary()
    print "Read in finished"

    ### Calculate LIWC hist
    LIWC = get_LIWC()
    # print LIWC.calculate_hist(tokens, normalize=False)

    ### Train and Load LLDA
    topics = get_topics2(method="LLDA", max_iter=20, n_topics=8)
    model_file = "output/LLDA_8_20.pk"
    if not os.path.exists(model_file):
        train_id = dataloader.id
        train_data = dataloader.data_retrieve(train_id)

        labels = []
        corpus = []

        for id in train_id:
            corpus.append(train_data["data"][id])
            labels.append(score2label(train_data["score"][id]))
        print "Training LLDA..."
        topics.fit(corpus, labels, verbose=True)
        topics.save(model_file)
        print "Saved"
    else:
        topics.load(model_file)
    topics.summary()

    ### ============================================================
    ###                         n fold
    ### ============================================================

    nfolds = dataloader.nfold(n_fold)
    fscores = []
    models = []

    for fold_ind in range(n_fold):

        print "======================== FOLD " + str(fold_ind + 1) + "========================"

        test_id = nfolds[fold_ind]
        train_id = []
        for i in range(n_fold):
            if i != fold_ind:
                train_id += nfolds[i]

        ### ============================================================
        ###                         Train Part
        ### ============================================================
        print "Training>>>>>>>>>>>>>>>>>>>>>>>>>"

        train_data = dataloader.data_retrieve(train_id)

        ### Balance Train Data
        _, train_pos_id, train_neg_id = dataloader.balance(train_id, K=2)

        encode_pos = encode_feature(train_data, train_pos_id, [topics, LIWC])
        encode_pos = SMOTE(encode_pos, 200, len(train_pos_id) / 4)
        label_pos = np.ones(len(encode_pos))

        encode_neg = encode_feature(train_data, train_neg_id, [topics, LIWC])
        label_neg = np.zeros(len(encode_neg))

        encode = np.concatenate((encode_pos, encode_neg), axis=0)
        label = np.concatenate((label_pos, label_neg), axis=0)
        print encode.shape
        print label.shape

        ### Train
        encode = preprocessing.scale(encode)
        classifier = svm.LinearSVC(verbose=True)
        classifier.fit(encode, label)
        print "F1 score: " + str(f1_score(label, classifier.predict(encode)))

        ### ============================================================
        ###                         Test Part
        ### ============================================================
        print "Testing>>>>>>>>>>>>>>>>>>>>>>>>>"

        test_data = dataloader.data_retrieve(test_id)

        ### Generate Test Data Encodings
        encode = encode_feature(test_data, test_id, [topics, LIWC])
        label = dataloader.label_retrieve(test_id)

        ### Test
        encode = preprocessing.scale(encode)
        print "F1 score: " + str(f1_score(label, classifier.predict(encode)))

        fscores.append(f1_score(label, classifier.predict(encode)))
        models.append(classifier)

    print "MEAN F1 score: " + str(np.mean(fscores))
    print "BEST F1 score: " + str(np.max(fscores)) + " by Model " + str(np.argmax(fscores) + 1)
    print "VAR F1 score: " + str(np.var(fscores))

    save(models[np.argmax(fscores)], "output/model_LLDA_8_" + str(fscores[np.argmax(fscores)]) + ".pk")


def main_clustering(n_fold=10):
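    ### Same n-fold pipeline, but with pretrained word2vec + GMM cluster features plus LIWC.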
    ### Load data
    dataloader = csv_dataloader(extrafile='data/fixed_train_gender_class.csv', extra=True)
    if not os.path.exists('output/data_cache.pk'):
        dataloader.read_csv(applyfun=preprocess)
        dataloader.save('output/data_cache.pk')
    else:
        dataloader.load('output/data_cache.pk')
    dataloader.summary()
    print "Read in finished"

    ### Load pre-trained word2vec model
    word2vec = get_word2vec(model='data/GoogleNews-vectors-negative300.bin', binary=True, size=300)
    print 'Pretrained word2vec loaded'

    ### Train BoW
    n_topics = 25
    model_file = 'output/clustering_gmm_25.pk'
    clusters = get_clusters(method='gmm', n_topics=n_topics)
    if not os.path.exists(model_file):
        ### Convert word to vector
        tokens = sum(dataloader.data.viewvalues(), [])
        print '#Tokens from training data: ' + str(len(tokens))
        train_vectors = word2vec.encode(tokens)
        print '#Vectors from training data: ' + str(len(train_vectors))
        print 'Training Clusters...'
        clusters.fit(train_vectors)
        clusters.save(model_file)
        clusters.summary()
    else:
        print 'Cluster Model Loaded...'
        clusters.load(model_file)

    ### Calculate LIWC hist
    LIWC = get_LIWC()
    #print LIWC.calculate_hist(tokens, normalize=False)

    ### ============================================================
    ###                         n fold
    ### ============================================================

    nfolds = dataloader.nfold(n_fold)
    fscores = []
    models = []

    for fold_ind in range(n_fold):

        print '======================== FOLD ' + str(fold_ind+1) + '========================'

        test_id = nfolds[fold_ind]
        train_id = []
        for i in range(n_fold):
            if i != fold_ind:
                train_id += nfolds[i]


        ### ============================================================
        ###                         Train Part
        ### ============================================================
        print 'Training>>>>>>>>>>>>>>>>>>>>>>>>>'

        train_data = dataloader.data_retrieve(train_id)

        ### Balance Train Data
        _, train_pos_id, train_neg_id = dataloader.balance(train_id, K=2)

        encode_pos = encode_feature(train_data, train_pos_id, [word2vec, clusters, LIWC])
        encode_pos = SMOTE(encode_pos, 200, len(train_pos_id)/4)
        label_pos = np.ones(len(encode_pos))

        encode_neg = encode_feature(train_data, train_neg_id, [word2vec, clusters, LIWC])
        label_neg = np.zeros(len(encode_neg))

        encode = np.concatenate((encode_pos, encode_neg), axis=0)
        label = np.concatenate((label_pos, label_neg), axis=0)
        print encode.shape
        print label.shape

        encode = preprocessing.scale(encode)
        classifier = svm.LinearSVC(verbose=True)
        #weight = label+1 # pos:neg = 2:1 for imbalanced training
        classifier.fit(encode, label)
        print 'F1 score: ' + str(f1_score(label, classifier.predict(encode)))


        ### ============================================================
        ###                         Test Part
        ### ============================================================
        print 'Testing>>>>>>>>>>>>>>>>>>>>>>>>>'

        test_data = dataloader.data_retrieve(test_id)
        encode = encode_feature(test_data, test_id, [word2vec, clusters, LIWC])
        label = dataloader.label_retrieve(test_id)

        encode = preprocessing.scale(encode)
        print 'F1 score: ' + str(f1_score(label, classifier.predict(encode)))
        fscores.append(f1_score(label, classifier.predict(encode)))
        models.append(classifier)

    print 'MEAN F1 score: ' + str(np.mean(fscores))
    print 'BEST F1 score: ' + str(np.max(fscores)) + ' by Model ' + str(np.argmax(fscores)+1)
    print 'VAR F1 score: ' + str(np.var(fscores))

    save(models[np.argmax(fscores)], 'output/model_clustering_' + str(fscores[np.argmax(fscores)]) + '.pk')
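

### Minimal driver sketch -- an assumption: each experiment entry point is meant to be run
### on its own; uncomment the call that matches the experiment being reproduced.
if __name__ == '__main__':
    main_LDA(n_fold=10)
    # main_LLDA(n_fold=10)
    # main_clustering(n_fold=10)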