# LDA variant 1: load a previously trained LDA model and classify.
def main(s1, e1, s2, e2, n_topic):
    # ========== loading LDA model directly without training =======
    dictionary = corpora.Dictionary.load('./ldaResult/sogou.dict')
    corpus = corpora.MmCorpus('./ldaResult/corpus.mm')
    lda = models.LdaModel.load('./ldaResult/model.lda')
    # n_topic = 100

    preTrain = preProcess(testDir, folder, './ldaResult/trainWordlist.txt')
    preTrain.loadText(s1, e1)  # s1 = 10, e1 = 1010
    n_train = len(preTrain.label)

    # =========== generating vector space =====================
    train_data = getVSM(lda, corpus, n_train, n_topic)  # matrix: n_doc * n_topic
    train_label = np.array(preTrain.label)

    # ============== loading testing file =======================
    preTest = preProcess(testDir, folder, './ldaResult/testWordlist.txt')
    preTest.loadText(s2, e2)  # s2 = 1500, e2 = 1800
    n_test = len(preTest.label)
    test_set = preTest.textList
    new_bow = [dictionary.doc2bow(text) for text in test_set]

    # ============== generating vector space ====================
    test_data = getVSM(lda, new_bow, n_test, n_topic)
    test_label = np.array(preTest.label)
    # print('distribution:\n' + (lda[new_bow])[0])

    # ============== classifier ===================
    from sklearn import linear_model
    from sklearn import ensemble
    # clf = ensemble.RandomForestClassifier(n_estimators=100, max_depth=40)
    clf = linear_model.LogisticRegression(C=0.8)
    clf.fit(train_data, train_label)
    predict = clf.predict(test_data)

    # =====================
    pre_list = metrics.precision_score(test_label, predict, average=None)
    rec_list = metrics.recall_score(test_label, predict, average=None)  # per-class recall
    f1_list = metrics.f1_score(test_label, predict, average=None)

    from plotBar import plotBar
    from plotMat import plotMat
    from plotCurve import plotPR, plotROC
    plotBar(pre_list, rec_list, f1_list)
    plotMat(test_label, predict)
    # Plot Precision-Recall and ROC curves
    plotPR(train_data, train_label, test_data, test_label, 8)
    plotROC(train_data, train_label, test_data, test_label, 8)

    # =====================
    r1 = metrics.classification_report(test_label, predict)
    r2 = metrics.confusion_matrix(test_label, predict)
    print(str(r1) + '\n' + str(r2))
    return r1
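# Both LDA variants call a getVSM helper that is not included in this file.
# A minimal sketch, assuming it densifies the sparse (topic_id, probability)
# lists that gensim's lda[bow] returns into an n_doc x n_topic numpy matrix;
# the real helper may differ.
def getVSM(lda, bow, n_doc, n_topic):
    vsm = np.zeros((n_doc, n_topic))
    for i, doc in enumerate(bow):
        for topic_id, prob in lda[doc]:  # sparse per-document topic distribution
            vsm[i, topic_id] = prob
    return vsm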
# LDA variant 2: train the LDA model from scratch, then classify.
def main(s1, e1, s2, e2, n_topic):
    # ============ loading training file ==========================
    preTrain = preProcess(testDir, folder, './ldaResult/trainWordlist.txt')
    preTrain.loadText(s1, e1)
    n_train = len(preTrain.label)
    train_set = preTrain.textList

    # ============= building dictionary and bow ====================
    print('Mapping token to id...')
    dictionary = corpora.Dictionary(train_set)  # mapping word to id
    dictionary.save('./ldaResult/sogou.dict')   # saving dictionary
    print('Generating bag of words...')
    corpus = [dictionary.doc2bow(text) for text in train_set]
    corpora.MmCorpus.serialize('./ldaResult/corpus.mm', corpus)  # saving corpus (bag of words)
    # tfidf = models.TfidfModel(corpus)
    # corpus_tfidf = tfidf[corpus]

    # ============== training LDA model ===========================
    # n_topic = 100
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=n_topic)
    # lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=n_topic)
    lda.save('./ldaResult/model.lda')  # saving LDA model
    for i in xrange(n_topic):
        print('topic %d : %s' % (i, lda.print_topic(i, topn=15)))

    # ============== generating vector space =====================
    train_data = getVSM(lda, corpus, n_train, n_topic)  # matrix: n_doc * n_topic
    train_label = np.array(preTrain.label)

    # ============== loading testing file =======================
    preTest = preProcess(testDir, folder, './ldaResult/testWordlist.txt')
    preTest.loadText(s2, e2)
    n_test = len(preTest.label)
    test_set = preTest.textList
    new_bow = [dictionary.doc2bow(text) for text in test_set]

    # ============== generating vector space ====================
    test_data = getVSM(lda, new_bow, n_test, n_topic)
    test_label = np.array(preTest.label)
    # print('distribution:\n' + (lda[new_bow])[0])

    # ============== classifier ===================
    from sklearn import ensemble
    from sklearn import linear_model
    rf_clf = ensemble.RandomForestClassifier(n_estimators=100, max_depth=40)
    lr_clf = linear_model.LogisticRegression()
    rf_clf.fit(train_data, train_label)
    lr_clf.fit(train_data, train_label)
    predict1 = rf_clf.predict(test_data)
    predict2 = lr_clf.predict(test_data)

    r1 = metrics.classification_report(test_label, predict1)
    r2 = metrics.confusion_matrix(test_label, predict1)
    print(str(r1) + '\n' + str(r2))
    r1 = metrics.classification_report(test_label, predict2)
    r2 = metrics.confusion_matrix(test_label, predict2)
    print(str(r1) + '\n' + str(r2))

    # =====================
    pre_list = metrics.precision_score(test_label, predict2, average=None)
    rec_list = metrics.recall_score(test_label, predict2, average=None)  # per-class recall
    f1_list = metrics.f1_score(test_label, predict2, average=None)

    from plotBar import plotBar
    from plotMat import plotMat
    from plotCurve import plotPR, plotROC
    plotBar(pre_list, rec_list, f1_list)
    plotMat(test_label, predict2)
    # Plot Precision-Recall and ROC curves
    plotPR(train_data, train_label, test_data, test_label, 8)
    plotROC(train_data, train_label, test_data, test_label, 8)

    # =====================
    return r1, r2
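# Example invocation of the training pipeline above. The sample ranges
# (train 10-1010, test 1500-1800) come from the inline comments in the
# first variant and 100 topics from the commented n_topic default;
# illustrative values, not verified defaults.
if __name__ == '__main__':
    main(10, 1010, 1500, 1800, n_topic=100)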
# Feature-vector variant: trainModel/testModel features with several classifiers.
def main(s1, e1, s2, e2):
    print 'Now loading training data, text file...'
    start = time.clock()
    preTrain = preProcess(testDir, folder, './trainResult/trainWordlist.txt')
    preTrain.loadText(s1, e1)
    end = time.clock()
    print 'Loading data costs', end - start, 'seconds'

    print 'Now training model...'
    trainM = trainModel(preTrain)
    trainM.train()
    train_data, train_label = trainM.featureVector, np.array(preTrain.label)
    s = "training (sample x feature):%s\n" % str(np.shape(train_data))
    endt = time.clock()
    print 'Generating feature costs', endt - end, 'seconds'

    # =======================================
    # cross-validation
    # from sklearn import svm
    # from sklearn import grid_search
    # C_range = np.logspace(-2, 10, 13)
    # gamma_range = np.logspace(-9, 3, 13)
    # param_grid = dict(gamma=gamma_range, C=C_range)
    # cv = cross_validation.StratifiedShuffleSplit(train_label, n_iter=5, test_size=0.2, random_state=42)
    # grid = grid_search.GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv)
    # grid.fit(train_data, train_label)
    # print("The best parameters are %s with a score of %0.2f"
    #       % (grid.best_params_, grid.best_score_))
    # svm_clf = svm.SVC(gamma=grid.best_params_['gamma'], C=grid.best_params_['C'])
    # svm_score = cross_validation.cross_val_score(svm_clf, train_data, train_label, cv=10)
    # print "score = %s\n" % np.mean(svm_score)

    # from sklearn import neighbors
    # knn_clf = neighbors.KNeighborsClassifier()
    # knn_score = cross_validation.cross_val_score(knn_clf, train_data, train_label, cv=10)
    # print "knn score:%s\n" % np.mean(knn_score), knn_score

    # from sklearn import naive_bayes
    # nb_clf = naive_bayes.GaussianNB()
    # nb_score = cross_validation.cross_val_score(nb_clf, train_data, train_label, cv=10)
    # print "naive bayes score:%s\n" % np.mean(nb_score), nb_score

    from sklearn import linear_model
    lr_clf = linear_model.LogisticRegression(C=1.0)
    print lr_clf

    # # 10-fold cross-validation over C
    # from sklearn import grid_search
    # C_range = np.logspace(0, 4, 5)
    # param_grid = dict(C=C_range)
    # cv = cross_validation.StratifiedShuffleSplit(train_label, n_iter=5, test_size=0.2, random_state=42)
    # grid = grid_search.GridSearchCV(lr_clf, param_grid=param_grid, cv=cv)
    # grid.fit(train_data, train_label)
    # print("The best parameters are %s with a score of %0.2f"
    #       % (grid.best_params_, grid.best_score_))
    # lr_clf.C = grid.best_params_['C']
    # -------------------------------------------------
    # lr_score = cross_validation.cross_val_score(lr_clf, train_data, train_label, cv=10)
    # print "logistic regression score:%s\n" % np.mean(lr_score), lr_score

    sgd_clf = linear_model.SGDClassifier()
    print sgd_clf

    # from sklearn import tree
    # dt_clf = tree.DecisionTreeClassifier()
    # dt_score = cross_validation.cross_val_score(dt_clf, train_data, train_label, cv=10)
    # print "decision tree score:%s\n" % np.mean(dt_score), dt_score

    from sklearn import ensemble
    rf_clf = ensemble.RandomForestClassifier(n_estimators=100, max_depth=40)
    print rf_clf
    # rf_score = cross_validation.cross_val_score(rf_clf, train_data, train_label, cv=10)
    # print "random forest score:%s\n" % np.mean(rf_score), rf_score

    gbc_clf = ensemble.GradientBoostingClassifier()
    print gbc_clf
    # gbc_score = cross_validation.cross_val_score(gbc_clf, train_data, train_label, cv=10)
    # print "gradient boosting classifier score:%s\n" % np.mean(gbc_score), gbc_score

    # =======================================
    # fit models
    lr_clf.fit(train_data, train_label)
    sgd_clf.fit(train_data, train_label)
    rf_clf.fit(train_data, train_label)
    gbc_clf.fit(train_data, train_label)

    ###########################################################################
    # # RBM-Logistic pipeline (disabled)
    # from sklearn import linear_model
    # from sklearn.neural_network import BernoulliRBM
    # from sklearn.pipeline import Pipeline
    # logistic = linear_model.LogisticRegression()
    # rbm = BernoulliRBM(random_state=0, verbose=True)
    # classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    # # Hyper-parameters. These were set by cross-validation, using a
    # # GridSearchCV. Here we are not performing cross-validation to save time.
    # rbm.learning_rate = 0.01
    # rbm.n_iter = 40
    # # More components tend to give better prediction performance,
    # # but larger fitting time.
    # rbm.n_components = 500
    # logistic.C = 1.0  # grid.best_params_['C']  # 6000.0
    # # Training RBM-Logistic Pipeline
    # classifier.fit(train_data, train_label)
    ###########################################################################

    print 'Now testing model...'
    preTest = preProcess(testDir, folder, './testResult/testWordlist.txt')
    preTest.loadText(s2, e2)
    testM = testModel(preTest)
    testM.getFeature(trainM.featureWord)
    test_data, test_label = testM.featureVector, np.array(preTest.label)
    s += "testing (sample, feature):%s " % str(np.shape(test_data))

    # =======================================
    # prediction
    predict = lr_clf.predict(test_data)
    # predict = rf_clf.predict(test_data)
    # predict = classifier.predict(test_data)
    # print "LR using RBM: \n"
    str1 = 'Logistic Regression Classifier:\n%s\n' % metrics.classification_report(test_label, predict)
    print str1
    print metrics.confusion_matrix(test_label, predict)

    # =====================
    pre_list = metrics.precision_score(test_label, predict, average=None)
    rec_list = metrics.recall_score(test_label, predict, average=None)  # per-class recall
    f1_list = metrics.f1_score(test_label, predict, average=None)

    from plotBar import plotBar
    from plotMat import plotMat
    from plotCurve import plotPR, plotROC
    plotBar(pre_list, rec_list, f1_list)
    plotMat(test_label, predict)
    # Plot Precision-Recall and ROC curves
    plotPR(train_data, train_label, test_data, test_label, 8)
    plotROC(train_data, train_label, test_data, test_label, 8)

    # =====================
    str2 = 'LR with SGD (Stochastic Gradient Descent):\n%s\n' % metrics.classification_report(test_label, sgd_clf.predict(test_data))
    print str2
    print metrics.confusion_matrix(test_label, sgd_clf.predict(test_data))
    str3 = 'Random Forest Classifier:\n%s\n' % metrics.classification_report(test_label, rf_clf.predict(test_data))
    print str3
    print metrics.confusion_matrix(test_label, rf_clf.predict(test_data))
    str4 = 'Gradient Boosting Classifier:\n%s\n' % metrics.classification_report(test_label, gbc_clf.predict(test_data))
    print str4
    print metrics.confusion_matrix(test_label, gbc_clf.predict(test_data))
    # print 'LR with RBM Classifier:\n'
    # print metrics.classification_report(test_label, classifier.predict(test_data))
    # print metrics.confusion_matrix(test_label, classifier.predict(test_data))
    return s, str1 + str2 + str3 + str4
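# plotPR and plotROC live in a local plotCurve module that is not shown.
# A minimal sketch of what plotPR might look like, assuming the trailing
# argument (8) is the number of classes and that a fresh one-vs-rest
# LogisticRegression supplies per-class decision scores; the actual module
# may be implemented differently.
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import label_binarize

def plotPR(train_data, train_label, test_data, test_label, n_class):
    y_test = label_binarize(test_label, classes=range(n_class))  # one-hot test labels
    clf = linear_model.LogisticRegression()
    clf.fit(train_data, train_label)
    y_score = clf.decision_function(test_data)  # (n_samples, n_class) scores
    for i in range(n_class):
        precision, recall, _ = precision_recall_curve(y_test[:, i], y_score[:, i])
        plt.plot(recall, precision, label='class %d' % i)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc='best')
    plt.show()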