def train_LDA(n_topics=100):
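    """Fit an LDA topic model with n_topics topics on the extra statuses
    corpus, caching the tokens, the vocabulary, and the fitted model."""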

    ### Load extra data
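    # Preprocessed tokens are cached to disk so repeated runs skip CSV parsing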
    dataloader = csv_dataloader(datafile='data/extra_statuses.csv')
    if not os.path.exists('output/extra_cache.pk'):
        dataloader.read_csv(applyfun=preprocess, verbose=True)
        dataloader.save('output/extra_cache.pk')
    else:
        dataloader.load('output/extra_cache.pk')
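    # Flatten the per-user token lists (dict values) into one corpus-wide list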
    tokens = sum(dataloader.ldata.viewvalues(), [])
    print '#Tokens from training data: ' + str(len(tokens))
    print 'Read-in done'

    ### Get word2id first
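    # word2id builds the token <-> integer-id vocabulary used as the LDA dictionary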
    word2id = get_word2id()
    if not os.path.exists('output/word2id.pk'):
        word2id.fit(tokens)
        word2id.save('output/word2id.pk')
    else:
        word2id.load('output/word2id.pk')
    ids = word2id.ids()
    print "#Id: " + str(len(ids.keys()))

    ### Train LDA
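    # Fit an LDA model over the full token stream, or load a cached one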
    model_file = 'output/lda_all_' + str(n_topics) + '.pk'
    topics = get_topics(id2word=ids, method='lda', n_topics=n_topics)
    if not os.path.exists(model_file):
        print 'Training LDA...'
        topics.fit(tokens)
        topics.save(model_file)
        topics.summary()
    else:
        topics.load(model_file)

def predict():
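    """Encode the unlabeled test statuses with LDA + LIWC features, score
    them with a previously trained classifier, and write output/result.csv."""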

    dataloader = csv_dataloader(datafile='data/test_statuses.csv', extrafile='data/test_metadata.csv', nolabel=True, extra=True)
    if not os.path.exists('output/test_cache.pk'):
        dataloader.read_csv(applyfun=preprocess)
        dataloader.save('output/test_cache.pk')
    else:
        dataloader.load('output/test_cache.pk')
    dataloader.summary()
    print "Read in finished"

    word2id = get_word2id()
    word2id.load('output/word2id.pk')
    ids = word2id.ids()

    n_topics = 100
    topics = get_topics(id2word=ids, method='lda', n_topics=n_topics)
    topics.load('output/lda_all_' + str(n_topics) + '.pk')

    # ### Load pre-train word2vector model
    # word2vec = get_word2vec(model='data/GoogleNews-vectors-negative300.bin', binary=True, size=300)
    # print 'Pretrained word2vec loaded'
    #
    # n_topics = 100
    # model_file = 'output/clustering_gmm_100.pk'
    # clusters = get_clusters(method='gmm', n_topics=n_topics)
    # print 'Cluster Model Loaded...'
    # clusters.load(model_file)

    ### Calculate LIWC hist
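    # LIWC = Linguistic Inquiry and Word Count (lexicon-based features)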
    LIWC = get_LIWC()

    test_id = dataloader.id
    test_data = dataloader.data_retrieve(test_id)

    ### Generate Test Data Encodings
    encode = encode_feature(test_data, test_id, [topics, LIWC])

    print encode

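    # Note: preprocessing.scale standardizes the test matrix on its own
    # statistics; also, predict_proba below requires a classifier that
    # implements it (sklearn's plain LinearSVC, as trained in main(), does not)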
    encode = preprocessing.scale(encode)
    classifier = load('output/model_LDA_100_0.461538461538.pk')
    predict_label = classifier.predict(encode)
    predict_prob = classifier.predict_proba(encode)

    with open('output/result.csv', 'w') as fout:
        fout.write('userID, binaryPrediction, confidence, regression\n')

        for i in range(len(predict_label)):
            string = test_id[i] + ', '
            if predict_label[i] == 1:
                string += '+, '
            else:
                string += '-, '
            string += str(predict_prob[i][1]) + ', '
            string += 'N\n'
            fout.write(string)

    print str(sum(predict_label)) + '/' + str(len(predict_label))

def show_topics(n_topics=100):
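    """Print the top words of each topic from a cached LDA model."""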
    ### Get word2id first
    word2id = get_word2id()
    word2id.load('output/word2id.pk')
    ids = word2id.ids()
    print "#Id: " + str(len(ids))

    ### Show LDA
    topics = get_topics(id2word=ids, method='lda', n_topics=n_topics)
    topics.load('output/lda_all_' + str(n_topics) + '.pk')
    topics.summary(n_topics=n_topics)

def main(n_fold=10):
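    """Run n_fold cross-validation: encode statuses with LDA + LIWC features,
    train a linear SVM per fold, report F1 scores, and save the best model."""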
    ### Load training data
    dataloader = csv_dataloader(extrafile='data/fixed_train_gender_class.csv', extra=True)
    if not os.path.exists('output/data_cache.pk'):
        dataloader.read_csv(applyfun=preprocess)
        dataloader.save('output/data_cache.pk')
    else:
        dataloader.load('output/data_cache.pk')
    dataloader.summary()
    print "Read in finished"

    ### Get word2id first
    tokens = sum(dataloader.ldata.viewvalues(), [])
    word2id = get_word2id()
    if not os.path.exists('output/word2id.pk'):
        word2id.fit(tokens)
        word2id.save('output/word2id.pk')
    else:
        word2id.load('output/word2id.pk')
    ids = word2id.ids()
    print "#Id: " + str(len(ids.keys()))
    print '#Tokens from training data: ' + str(len(tokens))

    ### Calculate LIWC hist
    LIWC = get_LIWC()
    #print LIWC.calculate_hist(tokens, normalize=False)

    ### Train and load LDA
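    # Note: main() trains/loads a 25-topic model, while predict() expects the
    # 100-topic model produced by train_LDA()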
    n_topics = 25
    model_file = 'output/lda_all_25.pk'
    topics = get_topics(id2word=ids, method='lda', n_topics=n_topics)
    if not os.path.exists(model_file):
        print 'Training LDA...'
        topics.fit(tokens)
        topics.save(model_file)
        topics.summary()
    else:
        topics.load(model_file)


    ### ============================================================
    ###                         n fold
    ### ============================================================

    nfolds = dataloader.nfold(n_fold)
    fscores = []
    models = []

    for fold_ind in range(n_fold):

        print '======================== FOLD ' + str(fold_ind+1) + ' ========================'

        test_id = nfolds[fold_ind]
        train_id = []
        for i in range(n_fold):
            if i != fold_ind:
                train_id += nfolds[i]


        ### ============================================================
        ###                         Train Part
        ### ============================================================
        print 'Training>>>>>>>>>>>>>>>>>>>>>>>>>'

        train_data = dataloader.data_retrieve(train_id)

        ### Balance Train Data
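        # balance() splits the training ids into positive and negative groups
        # (K=2); SMOTE then synthetically oversamples the positive class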
        _, train_pos_id, train_neg_id = dataloader.balance(train_id, K=2)

        encode_pos = encode_feature(train_data, train_pos_id, [topics, LIWC])
        encode_pos = SMOTE(encode_pos, 200, len(train_pos_id)/4)
        label_pos = np.ones(len(encode_pos))

        encode_neg = encode_feature(train_data, train_neg_id, [topics, LIWC])
        label_neg = np.zeros(len(encode_neg))

        encode = np.concatenate((encode_pos, encode_neg), axis=0)
        label = np.concatenate((label_pos, label_neg), axis=0)
        print encode.shape
        print label.shape

        ### Train
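        # Standardize features and fit a linear SVM; the F1 printed here is on
        # the training set itself, so it is only a sanity check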
        encode = preprocessing.scale(encode)
        classifier = svm.LinearSVC(verbose=True)
        classifier.fit(encode, label)
        print 'F1 score: ' + str(f1_score(label, classifier.predict(encode)))


        ### ============================================================
        ###                         Test Part
        ### ============================================================
        print 'Testing>>>>>>>>>>>>>>>>>>>>>>>>>'

        test_data = dataloader.data_retrieve(test_id)

        ### Generate Test Data Encodings
        encode = encode_feature(test_data, test_id, [topics, LIWC])
        label = dataloader.label_retrieve(test_id)

        ### Test
        encode = preprocessing.scale(encode)
        fscore = f1_score(label, classifier.predict(encode))
        print 'F1 score: ' + str(fscore)

        fscores.append(fscore)
        models.append(classifier)
        models.append(classifier)

    print 'MEAN F1 score: ' + str(np.mean(fscores))
    print 'BEST F1 score: ' + str(np.max(fscores)) + ' by Model ' + str(np.argmax(fscores)+1)
    print 'VAR F1 score: ' + str(np.var(fscores))

    save(models[np.argmax(fscores)], 'output/model_LDA_' + str(n_topics) + '_' + str(np.max(fscores)) + '.pk')
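

# Minimal entry point, assuming this module is meant to be run directly
# (an assumption; the original may be driven from elsewhere). By default it
# runs the cross-validated training pipeline.
if __name__ == '__main__':
    main()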