def main(s1, e1, s2, e2, n_topic):
    """Classify test documents with a previously trained LDA topic model.

    Loads the gensim dictionary, corpus and LDA model saved by an earlier
    training run, projects train/test documents into the n_topic-dimensional
    topic space, fits a logistic-regression classifier and reports metrics.

    Parameters:
        s1, e1  -- start/end indices of training documents (e.g. 10, 1010)
        s2, e2  -- start/end indices of testing documents (e.g. 1500, 1800)
        n_topic -- number of LDA topics (dimensionality of the vector space)

    Returns:
        The sklearn classification report for the test set.

    NOTE(review): `main` is re-defined twice later in this file; only the
    last definition survives at import time — confirm which one is intended.
    """
    # ---- load LDA artifacts directly, no re-training ----
    dictionary = corpora.Dictionary.load('./ldaResult/sogou.dict')
    corpus = corpora.MmCorpus('./ldaResult/corpus.mm')
    lda = models.LdaModel.load('./ldaResult/model.lda')

    preTrain = preProcess(testDir, folder, './ldaResult/trainWordlist.txt')
    preTrain.loadText(s1, e1)
    n_train = len(preTrain.label)

    # ---- training vector space: matrix of shape n_doc x n_topic ----
    train_data = getVSM(lda, corpus, n_train, n_topic)
    train_label = np.array(preTrain.label)

    # ---- load and vectorize the testing documents ----
    preTest = preProcess(testDir, folder, './ldaResult/testWordlist.txt')
    preTest.loadText(s2, e2)
    n_test = len(preTest.label)
    test_set = preTest.textList
    new_bow = [dictionary.doc2bow(text) for text in test_set]
    test_data = getVSM(lda, new_bow, n_test, n_topic)
    test_label = np.array(preTest.label)

    # ---- classifier ----
    from sklearn import linear_model
    clf = linear_model.LogisticRegression(C=0.8)
    clf.fit(train_data, train_label)
    predict = clf.predict(test_data)

    # ---- per-class metrics and plots ----
    pre_list = metrics.precision_score(test_label, predict, average=None)
    # BUG FIX: recall was previously computed with precision_score,
    # so the plotted "recall" bars duplicated precision.
    rec_list = metrics.recall_score(test_label, predict, average=None)
    f1_list = metrics.f1_score(test_label, predict, average=None)
    from plotBar import plotBar
    from plotMat import plotMat
    from plotCurve import plotPR, plotROC
    plotBar(pre_list, rec_list, f1_list)
    plotMat(test_label, predict)
    # Precision-Recall and ROC curves (8 classes assumed by the plot helpers)
    plotPR(train_data, train_label, test_data, test_label, 8)
    plotROC(train_data, train_label, test_data, test_label, 8)

    r1 = metrics.classification_report(test_label, predict)
    r2 = metrics.confusion_matrix(test_label, predict)
    print(str(r1) + '\n' + str(r2))
    return r1
def main(s1, e1, s2, e2, n_topic):
    """Train an LDA topic model, then compare two classifiers on top of it.

    Builds a dictionary/bag-of-words corpus from the training documents,
    trains an LDA model (saving all artifacts under ./ldaResult/), projects
    train/test documents into topic space, then fits and evaluates both a
    random forest and a logistic regression.

    Parameters:
        s1, e1  -- start/end indices of training documents
        s2, e2  -- start/end indices of testing documents
        n_topic -- number of LDA topics to train

    Returns:
        (classification_report, confusion_matrix) for the logistic
        regression predictions.

    NOTE(review): this shadows the previous `main` definition — confirm
    which variant is meant to be the entry point.
    """
    # ---- load training file ----
    preTrain = preProcess(testDir, folder, './ldaResult/trainWordlist.txt')
    preTrain.loadText(s1, e1)
    n_train = len(preTrain.label)
    train_set = preTrain.textList

    # ---- build dictionary and bag-of-words corpus ----
    print('Mapping token to id...')
    dictionary = corpora.Dictionary(train_set)  # word -> id mapping
    dictionary.save('./ldaResult/sogou.dict')

    print('Generating bag of words...')
    corpus = [dictionary.doc2bow(text) for text in train_set]
    corpora.MmCorpus.serialize('./ldaResult/corpus.mm', corpus)

    # ---- train the LDA model ----
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=n_topic)
    lda.save('./ldaResult/model.lda')

    # range instead of xrange: identical iteration, portable to Python 3
    for i in range(n_topic):
        print('topic %d : %s' % (i, lda.print_topic(i, topn=15)))

    # ---- training vector space: matrix of shape n_doc x n_topic ----
    train_data = getVSM(lda, corpus, n_train, n_topic)
    train_label = np.array(preTrain.label)

    # ---- load and vectorize the testing documents ----
    preTest = preProcess(testDir, folder, './ldaResult/testWordlist.txt')
    preTest.loadText(s2, e2)
    n_test = len(preTest.label)
    test_set = preTest.textList
    new_bow = [dictionary.doc2bow(text) for text in test_set]
    test_data = getVSM(lda, new_bow, n_test, n_topic)
    test_label = np.array(preTest.label)

    # ---- classifiers ----
    # BUG FIX: `linear_model` was used below without any import in this
    # function; import both sklearn modules locally so the function is
    # self-contained.
    from sklearn import ensemble
    from sklearn import linear_model
    rf_clf = ensemble.RandomForestClassifier(n_estimators=100, max_depth=40)
    lr_clf = linear_model.LogisticRegression()

    rf_clf.fit(train_data, train_label)
    lr_clf.fit(train_data, train_label)

    predict1 = rf_clf.predict(test_data)
    predict2 = lr_clf.predict(test_data)

    r1 = metrics.classification_report(test_label, predict1)
    r2 = metrics.confusion_matrix(test_label, predict1)
    print(str(r1) + '\n' + str(r2))
    r1 = metrics.classification_report(test_label, predict2)
    r2 = metrics.confusion_matrix(test_label, predict2)
    print(str(r1) + '\n' + str(r2))

    # ---- per-class metrics and plots (logistic regression predictions) ----
    pre_list = metrics.precision_score(test_label, predict2, average=None)
    # BUG FIX: recall was previously computed with precision_score
    rec_list = metrics.recall_score(test_label, predict2, average=None)
    f1_list = metrics.f1_score(test_label, predict2, average=None)
    from plotBar import plotBar
    from plotMat import plotMat
    from plotCurve import plotPR, plotROC
    plotBar(pre_list, rec_list, f1_list)
    plotMat(test_label, predict2)
    # Precision-Recall and ROC curves (8 classes assumed by the plot helpers)
    plotPR(train_data, train_label, test_data, test_label, 8)
    plotROC(train_data, train_label, test_data, test_label, 8)

    return r1, r2
def main(s1, e1, s2, e2):
    print 'Now loading training data, text file...'
    start = time.clock()
    preTrain = preProcess(testDir,folder,'./trainResult/trainWordlist.txt')
    preTrain.loadText(s1,e1)
    end = time.clock()
    print 'Loading data costs',end-start,'seconds'

    print 'Now training model...'
    trainM = trainModel(preTrain)
    trainM.train()
    train_data, train_label = trainM.featureVector, np.array(preTrain.label)
    s = "training (sample x feature):%s\n" % str(np.shape(train_data))
    endt = time.clock()
    print 'Generating feature costs',endt-end,'seconds'

    #=======================================
    # cross-validation
    # from sklearn import svm
    # from sklearn import grid_search
    # C_range = np.logspace(-2, 10, 13)
    # gamma_range = np.logspace(-9, 3, 13)
    # param_grid = dict(gamma=gamma_range, C=C_range)
    # cv = cross_validation.StratifiedShuffleSplit(train_label, n_iter=5, test_size=0.2, random_state=42)
    # grid = grid_search.GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv)
    # grid.fit(train_data, train_label)
    #
    # print("The best parameters are %s with a score of %0.2f"
    #       % (grid.best_params_, grid.best_score_))
    #
    # svm_clf = svm.SVC(gamma=grid.best_params_['gamma'], C=grid.best_params_['C'])
    # svm_score = cross_validation.cross_val_score(svm_clf, train_data, train_label, cv=10)
    # print "score = %s\n" % np.mean(svm_score)

    # from sklearn import neighbors
    # knn_clf = neighbors.KNeighborsClassifier()
    # knn_score = cross_validation.cross_val_score(knn_clf, train_data, train_label, cv=10)
    # print "knn score:%s\n" % np.mean(knn_score), knn_score

    # from sklearn import naive_bayes
    # nb_clf = naive_bayes.GaussianNB()
    # nb_score = cross_validation.cross_val_score(nb_clf, train_data, train_label, cv=10)
    # print "naive bayes score:%s\n" % np.mean(nb_score), nb_score

    from sklearn import linear_model
    lr_clf = linear_model.LogisticRegression(C=1.0)
    print lr_clf
    # # cross-validation
    # print '10-folder cross-validation...'
    # from sklearn import grid_search
    # C_range = np.logspace(0,4,5)
    # param_grid = dict(C=C_range)
    # cv = cross_validation.StratifiedShuffleSplit(train_label, n_iter=5, test_size=0.2, random_state=42)
    # grid = grid_search.GridSearchCV(lr_clf, param_grid=param_grid, cv=cv)
    # grid.fit(train_data, train_label)
    # print("The best parameters are %s with a score of %0.2f"
    #       % (grid.best_params_, grid.best_score_))
    #
    # lr_clf.C = grid.best_params_['C']
    # -------------------------------------------------

    #lr_score = cross_validation.cross_val_score(lr_clf, train_data, train_label, cv=10)
    #print "logistic regression score:%s\n" % np.mean(lr_score), lr_score

    sgd_clf = linear_model.SGDClassifier()
    print sgd_clf
    # from sklearn import tree
    # dt_clf = tree.DecisionTreeClassifier()
    # dt_score = cross_validation.cross_val_score(dt_clf, train_data, train_label, cv=10)
    # print "decision tree score:%s\n" % np.mean(dt_score), dt_score

    from sklearn import ensemble
    rf_clf = ensemble.RandomForestClassifier(n_estimators=100, max_depth=40)
    print rf_clf
    # rf_score = cross_validation.cross_val_score(rf_clf, train_data, train_label, cv=10)
    # print "random forest score:%s\n" % np.mean(rf_score), rf_score

    gbc_clf = ensemble.GradientBoostingClassifier()
    print gbc_clf
    # gbc_score = cross_validation.cross_val_score(gbc, train_data, train_label, cv=10)
    # print "gradient boosting classifier score:%s\n" % np.mean(gbc_score), gbc_score

    #=======================================
    # fit model
    lr_clf.fit(train_data, train_label)
    sgd_clf.fit(train_data, train_label)
    rf_clf.fit(train_data, train_label)
    gbc_clf.fit(train_data, train_label)
    ###############################################################################
    # # Models we will use
    # from sklearn import linear_model
    # from sklearn.neural_network import BernoulliRBM
    # from sklearn.pipeline import Pipeline
    # logistic = linear_model.LogisticRegression()
    # rbm = BernoulliRBM(random_state=0, verbose=True)
    # classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    #
    # # Training
    #
    # # Hyper-parameters. These were set by cross-validation,
    # # using a GridSearchCV. Here we are not performing cross-validation to
    # # save time.
    # # -------------------------------------------------
    # rbm.learning_rate = 0.01
    # rbm.n_iter = 40
    # # More components tend to give better prediction performance, but larger fitting time
    # rbm.n_components = 500
    # logistic.C = 1.0 # grid.best_params_['C'] # 6000.0
    #
    # # Training RBM-Logistic Pipeline
    # classifier.fit(train_data, train_label)
    # ###############################################################################

    print 'Now testing model...'
    preTest = preProcess(testDir,folder,'./testResult/testWordlist.txt')
    preTest.loadText(s2,e2)
    testM = testModel(preTest)
    testM.getFeature(trainM.featureWord)

    test_data, test_label = testM.featureVector, np.array(preTest.label)
    s += "testing (sample, feature):%s " % str(np.shape(test_data))
    #=======================================
    # prediction
    # predict = lr_clf.predict(test_data)
    # predict = rf_clf.predict(test_data)
    # predict = classifier.predict(test_data)
    # print "LR using RBM: \n"
    str1 = 'Logistic Regression Classifier:\n%s\n' % metrics.classification_report(test_label, lr_clf.predict(test_data))
    print str1
    print metrics.confusion_matrix(test_label, lr_clf.predict(test_data))

# =====================
    pre_list = metrics.precision_score(test_label, lr_clf.predict(test_data), average=None)
    rec_list = metrics.precision_score(test_label, lr_clf.predict(test_data), average=None)
    f1_list = metrics.f1_score(test_label, lr_clf.predict(test_data), average=None)
    from plotBar import plotBar
    from plotMat import plotMat
    from plotCurve import plotPR, plotROC
    plotBar(pre_list, rec_list, f1_list)
    plotMat(test_label, lr_clf.predict(test_data))
    # Plot Precision-Recall Curve
    plotPR(train_data, train_label, test_data, test_label, 8)
    plotROC(train_data, train_label, test_data, test_label, 8)
# =====================

    str2 = 'LR with SGD(Stochastic Gradient Descent):\n%s\n' % metrics.classification_report(test_label, sgd_clf.predict(test_data))
    print str2
    print metrics.confusion_matrix(test_label, sgd_clf.predict(test_data))

    str3 =  'Random Forest Classifier:\n%s\n' % metrics.classification_report(test_label, rf_clf.predict(test_data))
    print str3
    print metrics.confusion_matrix(test_label, rf_clf.predict(test_data))

    str4 =  'Gradient Boosting Classifier:\n%s\n' % metrics.classification_report(test_label, gbc_clf.predict(test_data))
    print str4
    print metrics.confusion_matrix(test_label, gbc_clf.predict(test_data))
    # print 'LR with RBM Classifier:\n'
    # print metrics.classification_report(test_label, classifier.predict(test_data))
    # print metrics.confusion_matrix(test_label, classifier.predict(test_data))
    return s,str1+str2+str3+str4