def train_LLDA(n_topics=2):
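    """Fit a Labeled LDA model on the cached training data and print a topic summary."""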
    ### Load data
    dataloader = csv_dataloader()
    dataloader.load('output/data_cache.pk')
    print "Read in finished"

    train_id = dataloader.id
    train_data = dataloader.data_retrieve(train_id)

    labels = []
    corpus = []

    for tid in train_id:
        corpus.append(train_data['ldata'][tid])
        labels.append(score2label(train_data['score'][tid]))
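
    # score2label above is assumed to map the raw numeric score onto the discrete
    # class label that the Labeled LDA model is trained against.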

    ### Train LLDA
    topics = get_topics2(method='LLDA', max_iter=2, n_topics=n_topics)
    model_file = 'output/LLDA_%d.pk' % n_topics
    if not os.path.exists(model_file):
        print 'Training LLDA...'
        topics.fit(corpus, labels, verbose=True)
        topics.save(model_file)
    else:
        topics.load(model_file)
    topics.summary()


def main(n_fold=10):
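    """Train/load an 8-topic LLDA, then run n_fold cross-validation of a linear SVM
    over LLDA + LIWC features, reporting per-fold and aggregate F1 scores."""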
    ### Load training data
    dataloader = csv_dataloader(extrafile="data/fixed_train_gender_class.csv", extra=True)
    if not os.path.exists("output/data_cache.pk"):
        dataloader.read_csv(applyfun=preprocess)
        dataloader.save("output/data_cache.pk")
    else:
        dataloader.load("output/data_cache.pk")
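    # read_csv presumably applies `preprocess` to each raw document; the result is
    # pickled so later runs can reload the cache instead of re-parsing the CSV.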
    dataloader.summary()
    print "Read in finished"

    ### Calculate LIWC hist
    LIWC = get_LIWC()
    # print LIWC.calculate_hist(tokens, normalize=False)
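    # get_LIWC is assumed to wrap the LIWC lexicon; calculate_hist would return a
    # per-category count histogram that encode_feature folds into each feature vector.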

    ### Train and Load LLDA
    topics = get_topics2(method="LLDA", max_iter=20, n_topics=8)
    model_file = "output/LLDA_8_20.pk"
    if not os.path.exists(model_file):
        train_id = dataloader.id
        train_data = dataloader.data_retrieve(train_id)

        labels = []
        corpus = []

        for tid in train_id:
            corpus.append(train_data["data"][tid])
            labels.append(score2label(train_data["score"][tid]))
        print "Training LLDA..."
        topics.fit(corpus, labels, verbose=True)
        topics.save(model_file)
        print "Saved"
    else:
        topics.load(model_file)
    topics.summary()

    ### ============================================================
    ###                         n fold
    ### ============================================================

    nfolds = dataloader.nfold(n_fold)
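    # dataloader.nfold is assumed to split the user ids into n_fold disjoint folds.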
    fscores = []
    models = []

    for fold_ind in range(n_fold):

        print "======================== FOLD " + str(fold_ind + 1) + "========================"

        test_id = nfolds[fold_ind]
        train_id = []
        for i in range(n_fold):
            if i != fold_ind:
                train_id += nfolds[i]

        ### ============================================================
        ###                         Train Part
        ### ============================================================
        print "Training>>>>>>>>>>>>>>>>>>>>>>>>>"

        train_data = dataloader.data_retrieve(train_id)

        ### Balance Train Data
        _, train_pos_id, train_neg_id = dataloader.balance(train_id, K=2)
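        # dataloader.balance is assumed to return (balanced ids, positive ids, negative ids),
        # keeping roughly K negative examples per positive example.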

        encode_pos = encode_feature(train_data, train_pos_id, [topics, LIWC])
        encode_pos = SMOTE(encode_pos, 200, len(train_pos_id) / 4)
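        # The SMOTE call above is assumed to follow the classic (samples, N%, k) signature:
        # oversample the positive encodings by ~200% using len(train_pos_id)/4 nearest neighbours.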
        label_pos = np.ones(len(encode_pos))

        encode_neg = encode_feature(train_data, train_neg_id, [topics, LIWC])
        label_neg = np.zeros(len(encode_neg))

        encode = np.concatenate((encode_pos, encode_neg), axis=0)
        label = np.concatenate((label_pos, label_neg), axis=0)
        print encode.shape
        print label.shape

        ### Train
        # Fit the scaler on the training features only and reuse it at test time.
        scaler = preprocessing.StandardScaler().fit(encode)
        encode = scaler.transform(encode)
        classifier = svm.LinearSVC(verbose=True)
        classifier.fit(encode, label)
        print "F1 score: " + str(f1_score(label, classifier.predict(encode)))

        ### ============================================================
        ###                         Test Part
        ### ============================================================
        print "Testing>>>>>>>>>>>>>>>>>>>>>>>>>"

        test_data = dataloader.data_retrieve(test_id)

        ### Generate Test Data Encodings
        encode = encode_feature(test_data, test_id, [topics, LIWC])
        label = dataloader.label_retrieve(test_id)
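        # label_retrieve is assumed to return the ground-truth binary label for each held-out id.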

        ### Test
        encode = scaler.transform(encode)
        print "F1 score: " + str(f1_score(label, classifier.predict(encode)))

        fscores.append(f1_score(label, classifier.predict(encode)))
        models.append(classifier)

    print "MEAN F1 score: " + str(np.mean(fscores))
    print "BEST F1 score: " + str(np.max(fscores)) + " by Model " + str(np.argmax(fscores) + 1)
    print "VAR F1 score: " + str(np.var(fscores))

    save(models[np.argmax(fscores)], "output/model_LLDA_8_" + str(np.max(fscores)) + ".pk")
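

if __name__ == "__main__":
    # Entry-point guard added on the assumption this module is run as a script;
    # train_LLDA() is left as a standalone helper and is not called here.
    main()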