def predict():
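    """Encode the held-out test set with LDA topic and LIWC features, score it
    with a previously trained classifier, and write output/result.csv."""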

    dataloader = csv_dataloader(datafile='data/test_statuses.csv', extrafile='data/test_metadata.csv', nolabel=True, extra=True)
    if not os.path.exists('output/test_cache.pk'):
        dataloader.read_csv(applyfun=preprocess)
        dataloader.save('output/test_cache.pk')
    else:
        dataloader.load('output/test_cache.pk')
    dataloader.summary()
    print "Read in finished"

    word2id = get_word2id()
    word2id.load('output/word2id.pk')
    ids = word2id.ids()

    n_topics = 100
    topics = get_topics(id2word=ids, method='lda', n_topics=n_topics)
    topics.load('output/lda_all_100.pk')

    # ### Load pre-trained word2vec model
    # word2vec = get_word2vec(model='data/GoogleNews-vectors-negative300.bin', binary=True, size=300)
    # print 'Pretrained word2vec loaded'
    #
    # n_topics = 100
    # model_file = 'output/clustering_gmm_100.pk'
    # clusters = get_clusters(method='gmm', n_topics=n_topics)
    # print 'Cluster Model Loaded...'
    # clusters.load(model_file)

    ### Load LIWC encoder
    LIWC = get_LIWC()

    test_id = dataloader.id
    test_data = dataloader.data_retrieve(test_id)

    ### Generate Test Data Encodings
    encode = encode_feature(test_data, test_id, [topics, LIWC])

    print encode

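    # preprocessing.scale standardizes each feature column over this batch
    # alone; this assumes the test set's feature statistics are close to the
    # training set's, since no fitted scaler is carried over from training.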
    encode = preprocessing.scale(encode)
    classifier = load('output/model_LDA_100_0.461538461538.pk')
    predict_label = classifier.predict(encode)
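    # Note: predict_proba assumes the saved model is probability-capable
    # (e.g. LogisticRegression, or an SVC wrapped in CalibratedClassifierCV);
    # sklearn's LinearSVC, as used in training below, does not provide it.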
    predict_prob = classifier.predict_proba(encode)

    with open('output/result.csv', 'w') as f:
        f.write('userID, binaryPrediction, confidence, regression\n')

        for i in range(len(predict_label)):
            row = test_id[i] + ', '
            if predict_label[i] == 1:
                row += '+, '
            else:
                row += '-, '
            row += str(predict_prob[i][1]) + ', '
            row += 'N\n'
            f.write(row)

    print 'Positive predictions: ' + str(int(sum(predict_label))) + '/' + str(len(predict_label))
def draw_LIWC_hist():
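    """Plot LIWC category histograms: raw counts over the full training set on
    top, and normalized positive- vs. negative-class frequencies below."""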
    ### Load data
    dataloader = csv_dataloader()
    dataloader.load('output/data_cache.pk')
    print "Read in finished"

    train_id = dataloader.id
    train_data = dataloader.data_retrieve(train_id)
    _, pos_id, neg_id = dataloader.balance(train_id, 'full')
    train_data_pos = dataloader.data_retrieve(pos_id)
    train_data_neg = dataloader.data_retrieve(neg_id)

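    # Each value of the 'data' dict is one user's token list; summing with an
    # empty-list start flattens them into a single corpus-wide token list.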
    tokens = sum(train_data['data'].viewvalues(), [])
    tokens_pos = sum(train_data_pos['data'].viewvalues(), [])
    tokens_neg = sum(train_data_neg['data'].viewvalues(), [])

    ### Calculate LIWC hist
    LIWC = get_LIWC()
    LIWC_hist = LIWC.encode(tokens, normalize=False)
    LIWC_hist_pos = LIWC.encode(tokens_pos, normalize=True)
    LIWC_hist_neg = LIWC.encode(tokens_neg, normalize=True)


    fig = plt.figure()
    ax = fig.add_subplot(2,1,1)
    width = 0.3
    bar0 = ax.bar(np.arange(67)+width, LIWC_hist, width)
    #ax.set_xlabel('Category')
    ax.set_ylabel('Frequency')
    ax.set_title('(a)')
    ax = fig.add_subplot(2,1,2)
    bar1 = ax.bar(np.arange(67)+width, LIWC_hist_pos, width, color='r')
    bar2 = ax.bar(np.arange(67)+2*width, LIWC_hist_neg, width, color='g')

    labels = list(LIWC.category.viewvalues())
    ax.legend((bar1[0], bar2[0]), ('Positive', 'Negative'))
    ax.set_xticks(np.arange(67)+2*width)
    ax.set_xticklabels(labels, rotation='vertical')
    ax.set_xlabel('Category')
    ax.set_ylabel('Percentage')
    ax.set_title('(b)')
    ax.grid(True)

    plt.show()
def main_lda(n_fold=10):
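    """Run n-fold cross-validation of a LinearSVC on LDA-topic + LIWC
    features and save the best fold's model under output/."""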
    ### Load training data
    dataloader = csv_dataloader(extrafile='data/fixed_train_gender_class.csv', extra=True)
    if not os.path.exists('output/data_cache.pk'):
        dataloader.read_csv(applyfun=preprocess)
        dataloader.save('output/data_cache.pk')
    else:
        dataloader.load('output/data_cache.pk')
    dataloader.summary()
    print "Read in finished"

    ### Get word2id first
    tokens = sum(dataloader.ldata.viewvalues(), [])
    word2id = get_word2id()
    if not os.path.exists('output/word2id.pk'):
        word2id.fit(tokens)
        word2id.save('output/word2id.pk')
    else:
        word2id.load('output/word2id.pk')
    ids = word2id.ids()
    print "#Id: " + str(len(ids.keys()))
    print '#Tokens from training data: ' + str(len(tokens))

    ### Calculate LIWC hist
    LIWC = get_LIWC()
    #print LIWC.calculate_hist(tokens, normalize=False)

    ### Train and load LDA
    n_topics = 25
    model_file = 'output/lda_all_25.pk'
    topics = get_topics(id2word=ids, method='lda', n_topics=n_topics)
    if not os.path.exists(model_file):
        print 'Training LDA...'
        topics.fit(tokens)
        topics.save(model_file)
        topics.summary()
    else:
        topics.load(model_file)


    ### ============================================================
    ###                         n fold
    ### ============================================================

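    # nfold() partitions the user ids into n_fold disjoint folds; each
    # iteration below holds one fold out for testing and trains on the rest.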
    nfolds = dataloader.nfold(n_fold)
    fscores = []
    models = []

    for fold_ind in range(n_fold):

        print '======================== FOLD ' + str(fold_ind+1) + ' ========================'

        test_id = nfolds[fold_ind]
        train_id = []
        for i in range(n_fold):
            if i != fold_ind:
                train_id += nfolds[i]


        ### ============================================================
        ###                         Train Part
        ### ============================================================
        print 'Training>>>>>>>>>>>>>>>>>>>>>>>>>'

        train_data = dataloader.data_retrieve(train_id)

        ### Balance Train Data
        _, train_pos_id, train_neg_id = dataloader.balance(train_id, K=2)

        encode_pos = encode_feature(train_data, train_pos_id, [topics, LIWC])
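        # Oversample the minority (positive) class with SMOTE. Assuming this
        # project's SMOTE(samples, N, k) follows Chawla et al.'s convention,
        # N=200 requests 200% synthetic oversampling with k nearest neighbours.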
        encode_pos = SMOTE(encode_pos, 200, len(train_pos_id)/4)
        label_pos = np.ones(len(encode_pos))

        encode_neg = encode_feature(train_data, train_neg_id, [topics, LIWC])
        label_neg = np.zeros(len(encode_neg))

        encode = np.concatenate((encode_pos, encode_neg), axis=0)
        label = np.concatenate((label_pos, label_neg), axis=0)
        print encode.shape
        print label.shape

        ### Train
        encode = preprocessing.scale(encode)
        classifier = svm.LinearSVC(verbose=True)
        classifier.fit(encode, label)
        print 'F1 score: ' + str(f1_score(label, classifier.predict(encode)))


        ### ============================================================
        ###                         Test Part
        ### ============================================================
        print 'Testing>>>>>>>>>>>>>>>>>>>>>>>>>'

        test_data = dataloader.data_retrieve(test_id)

        ### Generate Test Data Encodings
        encode = encode_feature(test_data, test_id, [topics, LIWC])
        label = dataloader.label_retrieve(test_id)

        ### Test
        encode = preprocessing.scale(encode)
        fscore = f1_score(label, classifier.predict(encode))
        print 'F1 score: ' + str(fscore)

        fscores.append(fscore)
        models.append(classifier)

    print 'MEAN F1 score: ' + str(np.mean(fscores))
    print 'BEST F1 score: ' + str(np.max(fscores)) + ' by Model ' + str(np.argmax(fscores)+1)
    print 'VAR F1 score: ' + str(np.var(fscores))

    save(models[np.argmax(fscores)], 'output/model_LDA_' + str(n_topics) + '_' + str(fscores[np.argmax(fscores)]) + '.pk')
def main_llda(n_fold=10):
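    """Same n-fold pipeline as main_lda, but with Labeled LDA (LLDA) topics
    supervised by the users' score labels instead of unsupervised LDA."""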
    ### Load training data
    dataloader = csv_dataloader(extrafile="data/fixed_train_gender_class.csv", extra=True)
    if not os.path.exists("output/data_cache.pk"):
        dataloader.read_csv(applyfun=preprocess)
        dataloader.save("output/data_cache.pk")
    else:
        dataloader.load("output/data_cache.pk")
    dataloader.summary()
    print "Read in finished"

    ### Calculate LIWC hist
    LIWC = get_LIWC()
    # print LIWC.calculate_hist(tokens, normalize=False)

    ### Train and Load LLDA
    topics = get_topics2(method="LLDA", max_iter=20, n_topics=8)
    model_file = "output/LLDA_8_20.pk"
    if not os.path.exists(model_file):
        train_id = dataloader.id
        train_data = dataloader.data_retrieve(train_id)

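        # Build the Labeled-LDA training corpus: one document per user, with
        # the user's score mapped to a supervision label via score2label.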
        labels = []
        corpus = []

        for uid in train_id:
            corpus.append(train_data["data"][uid])
            labels.append(score2label(train_data["score"][uid]))
        print "Training LLDA..."
        topics.fit(corpus, labels, verbose=True)
        topics.save(model_file)
        print "Saved"
    else:
        topics.load(model_file)
    topics.summary()

    ### ============================================================
    ###                         n fold
    ### ============================================================

    nfolds = dataloader.nfold(n_fold)
    fscores = []
    models = []

    for fold_ind in range(n_fold):

        print "======================== FOLD " + str(fold_ind + 1) + "========================"

        test_id = nfolds[fold_ind]
        train_id = []
        for i in range(n_fold):
            if i != fold_ind:
                train_id += nfolds[i]

        ### ============================================================
        ###                         Train Part
        ### ============================================================
        print "Training>>>>>>>>>>>>>>>>>>>>>>>>>"

        train_data = dataloader.data_retrieve(train_id)

        ### Balance Train Data
        _, train_pos_id, train_neg_id = dataloader.balance(train_id, K=2)

        encode_pos = encode_feature(train_data, train_pos_id, [topics, LIWC])
        encode_pos = SMOTE(encode_pos, 200, len(train_pos_id) / 4)
        label_pos = np.ones(len(encode_pos))

        encode_neg = encode_feature(train_data, train_neg_id, [topics, LIWC])
        label_neg = np.zeros(len(encode_neg))

        encode = np.concatenate((encode_pos, encode_neg), axis=0)
        label = np.concatenate((label_pos, label_neg), axis=0)
        print encode.shape
        print label.shape

        ### Train
        encode = preprocessing.scale(encode)
        classifier = svm.LinearSVC(verbose=True)
        classifier.fit(encode, label)
        print "F1 score: " + str(f1_score(label, classifier.predict(encode)))

        ### ============================================================
        ###                         Test Part
        ### ============================================================
        print "Testing>>>>>>>>>>>>>>>>>>>>>>>>>"

        test_data = dataloader.data_retrieve(test_id)

        ### Generate Test Data Encodings
        encode = encode_feature(test_data, test_id, [topics, LIWC])
        label = dataloader.label_retrieve(test_id)

        ### Test
        encode = preprocessing.scale(encode)
        print "F1 score: " + str(f1_score(label, classifier.predict(encode)))

        fscores.append(f1_score(label, classifier.predict(encode)))
        models.append(classifier)

    print "MEAN F1 score: " + str(np.mean(fscores))
    print "BEST F1 score: " + str(np.max(fscores)) + " by Model " + str(np.argmax(fscores) + 1)
    print "VAR F1 score: " + str(np.var(fscores))

    save(models[np.argmax(fscores)], "output/model_LLDA_8_" + str(fscores[np.argmax(fscores)]) + ".pk")
def main_word2vec_clusters(n_fold=10):
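    """Same n-fold pipeline again, but encoding users with pretrained word2vec
    vectors grouped into GMM clusters (a bag-of-clusters model) plus LIWC."""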
    ### Load data
    dataloader = csv_dataloader(extrafile='data/fixed_train_gender_class.csv', extra=True)
    if not os.path.exists('output/data_cache.pk'):
        dataloader.read_csv(applyfun=preprocess)
        dataloader.save('output/data_cache.pk')
    else:
        dataloader.load('output/data_cache.pk')
    dataloader.summary()
    print "Read in finished"

    ### Load pre-trained word2vec model
    word2vec = get_word2vec(model='data/GoogleNews-vectors-negative300.bin', binary=True, size=300)
    print 'Pretrained word2vec loaded'

    ### Train bag-of-clusters model (GMM over word2vec embeddings)
    n_topics = 25
    model_file = 'output/clustering_gmm_25.pk'
    clusters = get_clusters(method='gmm', n_topics=n_topics)
    if not os.path.exists(model_file):
        ### Convert word to vector
        tokens = sum(dataloader.data.viewvalues(), [])
        print '#Tokens from training data: ' + str(len(tokens))
        train_vectors = word2vec.encode(tokens)
        print '#Vectors from training data: ' + str(len(train_vectors))
        print 'Training Clusters...'
        clusters.fit(train_vectors)
        clusters.save(model_file)
        clusters.summary()
    else:
        print 'Cluster Model Loaded...'
        clusters.load(model_file)

    ### Calculate LIWC hist
    LIWC = get_LIWC()
    #print LIWC.calculate_hist(tokens, normalize=False)

    ### ============================================================
    ###                         n fold
    ### ============================================================

    nfolds = dataloader.nfold(n_fold)
    fscores = []
    models = []

    for fold_ind in range(n_fold):

        print '======================== FOLD ' + str(fold_ind+1) + ' ========================'

        test_id = nfolds[fold_ind]
        train_id = []
        for i in range(n_fold):
            if i != fold_ind:
                train_id += nfolds[i]


        ### ============================================================
        ###                         Train Part
        ### ============================================================
        print 'Training>>>>>>>>>>>>>>>>>>>>>>>>>'

        train_data = dataloader.data_retrieve(train_id)

        ### Balance Train Data
        _, train_pos_id, train_neg_id = dataloader.balance(train_id, K=2)

        encode_pos = encode_feature(train_data, train_pos_id, [word2vec, clusters, LIWC])
        encode_pos = SMOTE(encode_pos, 200, len(train_pos_id)/4)
        label_pos = np.ones(len(encode_pos))

        encode_neg = encode_feature(train_data, train_neg_id, [word2vec, clusters, LIWC])
        label_neg = np.zeros(len(encode_neg))

        encode = np.concatenate((encode_pos, encode_neg), axis=0)
        label = np.concatenate((label_pos, label_neg), axis=0)
        print encode.shape
        print label.shape

        encode = preprocessing.scale(encode)
        classifier = svm.LinearSVC(verbose=True)
        #weight = label+1 # pos:neg = 2:1 for imbalanced training
        classifier.fit(encode, label)
        print 'F1 score: ' + str(f1_score(label, classifier.predict(encode)))


        ### ============================================================
        ###                         Test Part
        ### ============================================================
        print 'Testing>>>>>>>>>>>>>>>>>>>>>>>>>'

        test_data = dataloader.data_retrieve(test_id)
        encode = encode_feature(test_data, test_id, [word2vec, clusters, LIWC])
        label = dataloader.label_retrieve(test_id)

        encode = preprocessing.scale(encode)
        fscore = f1_score(label, classifier.predict(encode))
        print 'F1 score: ' + str(fscore)
        fscores.append(fscore)
        models.append(classifier)

    print 'MEAN F1 score: ' + str(np.mean(fscores))
    print 'BEST F1 score: ' + str(np.max(fscores)) + ' by Model ' + str(np.argmax(fscores)+1)
    print 'VAR F1 score: ' + str(np.var(fscores))

    save(models[np.argmax(fscores)], 'output/model_clustering_' + str(fscores[np.argmax(fscores)]) + '.pk')
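

# Minimal entry point (an assumption: the original scripts likely invoked one
# of these pipelines directly; any of the three main_* variants can be run).
if __name__ == '__main__':
    main_lda(n_fold=10)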