def test_lda_predict():
    # Test LDA classification.
    # This checks that LDA implements fit and predict and returns correct
    # values for simple toy data.
    for test_case in solver_shrinkage:
        solver, shrinkage = test_case
        clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
        y_pred = clf.fit(X, y).predict(X)
        assert_array_equal(y_pred, y, "solver %s" % solver)

        # Assert that it works with 1D data
        y_pred1 = clf.fit(X1, y).predict(X1)
        assert_array_equal(y_pred1, y, "solver %s" % solver)

        # Test probability estimates
        y_proba_pred1 = clf.predict_proba(X1)
        assert_array_equal((y_proba_pred1[:, 1] > 0.5) + 1, y, "solver %s" % solver)
        y_log_proba_pred1 = clf.predict_log_proba(X1)
        assert_allclose(
            np.exp(y_log_proba_pred1),
            y_proba_pred1,
            rtol=1e-6,
            atol=1e-6,
            err_msg="solver %s" % solver,
        )

        # Primarily test for commit 2f34950 -- "reuse" of priors
        y_pred3 = clf.fit(X, y3).predict(X)
        # LDA shouldn't be able to separate those
        assert np.any(y_pred3 != y3), "solver %s" % solver

    clf = LinearDiscriminantAnalysis(solver="svd", shrinkage="auto")
    with pytest.raises(NotImplementedError):
        clf.fit(X, y)

    clf = LinearDiscriminantAnalysis(
        solver="lsqr", shrinkage=0.1, covariance_estimator=ShrunkCovariance()
    )
    with pytest.raises(
        ValueError,
        match=(
            "covariance_estimator and shrinkage "
            "parameters are not None. "
            "Only one of the two can be set."
        ),
    ):
        clf.fit(X, y)

    # test bad solver with covariance_estimator
    clf = LinearDiscriminantAnalysis(solver="svd", covariance_estimator=LedoitWolf())
    with pytest.raises(
        ValueError, match="covariance estimator is not supported with svd"
    ):
        clf.fit(X, y)

    # test bad covariance estimator
    clf = LinearDiscriminantAnalysis(
        solver="lsqr", covariance_estimator=KMeans(n_clusters=2, n_init="auto")
    )
    with pytest.raises(ValueError):
        clf.fit(X, y)
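# A minimal, self-contained sketch of the *valid* pairing that the error
# branches above guard against: the "lsqr" (or "eigen") solver accepts a
# covariance_estimator, while "svd" does not, and an explicit shrinkage value
# may not be combined with one. Requires scikit-learn >= 0.24; the synthetic
# dataset below is an illustrative assumption, not part of the test module.
import numpy as np
from sklearn.covariance import LedoitWolf
from sklearn.datasets import make_classification
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

X_demo, y_demo = make_classification(n_samples=200, n_features=20, random_state=0)
lda = LinearDiscriminantAnalysis(solver="lsqr", covariance_estimator=LedoitWolf())
lda.fit(X_demo, y_demo)
# predict_log_proba is consistent with predict_proba up to floating point error
assert np.allclose(np.exp(lda.predict_log_proba(X_demo)), lda.predict_proba(X_demo))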
def test_lda_predict():
    # Test LDA classification.
    # This checks that LDA implements fit and predict and returns correct
    # values for simple toy data.
    for test_case in solver_shrinkage:
        solver, shrinkage = test_case
        clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
        y_pred = clf.fit(X, y).predict(X)
        assert_array_equal(y_pred, y, "solver %s" % solver)

        # Assert that it works with 1D data
        y_pred1 = clf.fit(X1, y).predict(X1)
        assert_array_equal(y_pred1, y, "solver %s" % solver)

        # Test probability estimates
        y_proba_pred1 = clf.predict_proba(X1)
        assert_array_equal((y_proba_pred1[:, 1] > 0.5) + 1, y, "solver %s" % solver)
        y_log_proba_pred1 = clf.predict_log_proba(X1)
        assert_array_almost_equal(np.exp(y_log_proba_pred1), y_proba_pred1,
                                  8, "solver %s" % solver)

        # Primarily test for commit 2f34950 -- "reuse" of priors
        y_pred3 = clf.fit(X, y3).predict(X)
        # LDA shouldn't be able to separate those
        assert_true(np.any(y_pred3 != y3), "solver %s" % solver)

    # Test invalid shrinkages
    clf = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=-0.2231)
    assert_raises(ValueError, clf.fit, X, y)
    clf = LinearDiscriminantAnalysis(solver="eigen", shrinkage="dummy")
    assert_raises(ValueError, clf.fit, X, y)
    clf = LinearDiscriminantAnalysis(solver="svd", shrinkage="auto")
    assert_raises(NotImplementedError, clf.fit, X, y)
    # Test unknown solver
    clf = LinearDiscriminantAnalysis(solver="dummy")
    assert_raises(ValueError, clf.fit, X, y)
def test_lda_predict():
    # Test LDA classification.
    # This checks that LDA implements fit and predict and returns correct
    # values for simple toy data.
    for test_case in solver_shrinkage:
        solver, shrinkage = test_case
        clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
        y_pred = clf.fit(X, y).predict(X)
        assert_array_equal(y_pred, y, 'solver %s' % solver)

        # Assert that it works with 1D data
        y_pred1 = clf.fit(X1, y).predict(X1)
        assert_array_equal(y_pred1, y, 'solver %s' % solver)

        # Test probability estimates
        y_proba_pred1 = clf.predict_proba(X1)
        assert_array_equal((y_proba_pred1[:, 1] > 0.5) + 1, y, 'solver %s' % solver)
        y_log_proba_pred1 = clf.predict_log_proba(X1)
        assert_allclose(np.exp(y_log_proba_pred1), y_proba_pred1,
                        rtol=1e-6, atol=1e-6, err_msg='solver %s' % solver)

        # Primarily test for commit 2f34950 -- "reuse" of priors
        y_pred3 = clf.fit(X, y3).predict(X)
        # LDA shouldn't be able to separate those
        assert np.any(y_pred3 != y3), 'solver %s' % solver

    # Test invalid shrinkages
    clf = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=-0.2231)
    assert_raises(ValueError, clf.fit, X, y)
    clf = LinearDiscriminantAnalysis(solver="eigen", shrinkage="dummy")
    assert_raises(ValueError, clf.fit, X, y)
    clf = LinearDiscriminantAnalysis(solver="svd", shrinkage="auto")
    assert_raises(NotImplementedError, clf.fit, X, y)
    # Test unknown solver
    clf = LinearDiscriminantAnalysis(solver="dummy")
    assert_raises(ValueError, clf.fit, X, y)
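# The three test variants above rely on module-level fixtures (X, y, X1, y3,
# solver_shrinkage) defined elsewhere in the test module. A minimal sketch of
# plausible stand-ins so the tests can be read stand-alone -- illustrative
# values, not the original fixtures:
import numpy as np

X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]], dtype=float)
y = np.array([1, 1, 1, 2, 2, 2])       # two linearly separable classes
y3 = np.array([1, 1, 2, 2, 3, 3])      # labels LDA should not be able to recover from X
X1 = np.array([[-2], [-1], [-1], [1], [1], [2]], dtype=float)  # one-dimensional data
solver_shrinkage = [("svd", None), ("lsqr", None), ("eigen", None),
                    ("lsqr", "auto"), ("eigen", "auto"),
                    ("lsqr", 0.43), ("eigen", 0.43)]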
def func(path, repath):
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    # path = '/home/hao/桌面/学科分类新/2gram/geog.txt'  # geog.txt
    # mapPath = '/home/hao/PycharmProjects/subjectClassify/TFIDF/map/aa/all.txt'
    # Three text categories
    lablemap = {"识记与理解": '0', "分析与应用": '1', "综合与拓展": '2'}
    lables = []   # labels y
    corpus = []   # tokenized text, tokens separated by spaces
    list2 = []
    for line in open(path, 'r').readlines():
        words = line.strip().split(' ')
        lable = lablemap.get(words[0])
        line = line[line.find(' ') + 1:]
        corpus.append(line)
        lables.append(lable)
    # for ti in range(0, 4, 1):
    #     for ind in range(0, len(list2), 1):
    #         lables.append('2')
    #         corpus.append(list2[ind])

    print os.path.basename(path) + '------------------------------------------------------------'
    fwrite = open(repath + os.path.basename(path), 'w')
    fwrite.write(os.path.basename(path) + '\n')

    # 5-fold cross-validation
    # lables = np.array(lables)
    kf = StratifiedKFold(lables, n_folds=5)  # old (pre-0.18) scikit-learn cross_validation API
    # kf = KFold(len(lables), n_folds=5)
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    tfidf = SVD_Vec(tfidf, 1000)
    i = 0
    for train, test in kf:
        i = i + 1
        print 'fold' + str(i)
        fwrite.write('fold' + str(i) + '\n')
        clf = LogisticRegression()
        clf2 = LDA()
        clf4 = LinearSVC()
        X = []
        y = []
        for ti in train:
            # if lables[ti] == '2':
            #     for time in range(0, 10, 1):
            #         X.append(tfidf[ti])
            #         y.append(lables[ti])
            # if lables[ti] == '1':
            #     for time in range(0, 4, 1):
            #         X.append(tfidf[ti])
            #         y.append(lables[ti])
            # else:
            X.append(tfidf[ti])
            y.append(lables[ti])
        # Stack classifiers: LogisticRegression log-probas feed the LDA,
        # whose log-probas feed the LinearSVC.
        clf.fit(X, y)
        X = clf.predict_log_proba(X)
        clf2.fit(X, y)
        X = clf2.predict_log_proba(X)
        clf4.fit(X, y)
        Xt = []
        yt = []
        for xti in test:
            yt.append(lables[xti])
            Xt.append(tfidf[xti])
        Xt = clf.predict_log_proba(Xt)
        Xt = clf2.predict_log_proba(Xt)
        predicted = clf4.predict(Xt)
        fwrite.write(classification_report(yt, predicted).replace('\n\n', '\n'))
        print classification_report(yt, predicted).replace('\n\n', '\n')
        # print accuracy_score(testlables, predicted)
        # scores = scv.cross_val_score(clf, tfidf, lables1, cv=5, scoring='accuracy')
        # print scores
        # predicted = scv.cross_val_predict(clf, tfidf1, lables1, cv=5)
        # predicted = clf.predict(tfidf1)
        # print os.path.basename(path)
        # print classification_report(lables1, predicted)
        # print len(predicted)
        # prere = mt.accuracy_score(lables, predicted)
        # fwrite = open('/home/hao/桌面/学科分类新/pre/lr/' + os.path.basename(path), 'w')
        # for pre in predicted:
        #     fwrite.write(pre)
        #     fwrite.write('\n')
    fwrite.close()
def func(trainpath, testpath, repath, testFile):
    clf = LogisticRegression()
    clf2 = LDA()
    clf4 = LinearSVC()
    repath = repath + os.path.basename(trainpath)
    testpath = testpath + os.path.basename(trainpath)
    repath = repath + os.path.basename(trainpath).replace('.txt', '.csv')
    testFile = testFile + os.path.basename(trainpath).replace('.txt', '.csv')
    lablemap = {"识记与理解": '0', "分析与应用": '1', "综合与拓展": '2'}
    lablemap2 = {'0': "识记与理解", '1': "分析与应用", '2': "综合与拓展"}
    y = []        # labels y
    corpus = []   # tokenized text, tokens separated by spaces
    list2 = []
    for line in open(trainpath, 'r').readlines():
        words = line.strip().split(' ')
        lable = lablemap.get(words[0])
        line = line[line.find(' ') + 1:]
        corpus.append(line)
        y.append(lable)
        if lable == '2':
            list2.append(line)
    # for ti in range(0, 4, 1):
    #     for ind in range(0, len(list2), 1):
    #         y.append('2')
    #         corpus.append(list2[ind])

    X = transformer.fit_transform(vectorizer.fit_transform(corpus))
    X = SVD_Vec(X, 1000)
    print os.path.basename(trainpath) + '------------------------------------------------------------'

    # Stack classifiers: LogisticRegression log-probas feed the LDA,
    # whose log-probas feed the LinearSVC.
    clf.fit(X, y)
    X = clf.predict_log_proba(X)
    clf2.fit(X, y)
    X = clf2.predict_log_proba(X)
    clf4.fit(X, y)

    csvfile = file(testFile, 'rb')
    testAll = csv.reader(csvfile)
    csvtest = []
    for line in testAll:
        csvtest.append(line)
    csvout = file(repath, 'wb')
    csvwriter = csv.writer(csvout)
    testX = loadTest(testpath)
    predicted = clf.predict_log_proba(testX)
    predicted = clf2.predict_log_proba(predicted)
    predicted = clf4.predict(predicted)
    for preindex in range(0, len(predicted)):
        pre = predicted[preindex]
        csvnub = csvtest[preindex]
        lableStr = lablemap2.get(pre)
        csvreline = []
        csvreline.append(csvnub[0])
        csvreline.append(csvnub[1])
        csvreline.append(lableStr)
        csvwriter.writerow(csvreline)
    csvout.close()
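# The two functions above stack classifiers through predict_log_proba:
# TF-IDF features -> truncated SVD -> LogisticRegression log-probabilities ->
# LDA log-probabilities -> LinearSVC. A minimal self-contained sketch of the
# same idea against the current scikit-learn API; the toy corpus, labels and
# component counts below are illustrative assumptions, not the original data.
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

docs = ["recall the definition of the term", "state the definition and the fact",
        "list the basic terms", "apply the definition to a new case",
        "use the method on this case", "apply the rule to the example",
        "compare the methods and extend the case", "combine the ideas across the topics",
        "extend the example into an open question"]
labels = np.array(['0', '0', '0', '1', '1', '1', '2', '2', '2'])

tfidf = TfidfVectorizer().fit_transform(docs)
features = TruncatedSVD(n_components=5, random_state=0).fit_transform(tfidf)

lr = LogisticRegression().fit(features, labels)
stage1 = lr.predict_log_proba(features)        # class log-probabilities become new features
lda = LinearDiscriminantAnalysis().fit(stage1, labels)
stage2 = np.clip(lda.predict_log_proba(stage1), -30, 0)  # clip guards against -inf from underflow
svc = LinearSVC().fit(stage2, labels)
print(svc.predict(stage2))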
def type(X, Y):  # note: shadows the built-in type()
    rfc = RandomForestClassifier()
    classifier = LogisticRegression()  # SVC(kernel="linear")  # svm.SVC(kernel='rbf', C=1, gamma='auto')
    gnb = GaussianNB()    # BernoulliNB()  # MultinomialNB()
    gnb2 = BernoulliNB()
    gnb3 = MultinomialNB()
    svc = LinearSVC(C=0.5)
    EXT = ExtraTreesClassifier(criterion='gini', bootstrap=True, n_estimators=80, oob_score=True)
    EXT2 = ExtraTreesClassifier(criterion='entropy', bootstrap=True, n_estimators=125, oob_score=True)
    bag = BaggingClassifier(DecisionTreeClassifier(), n_estimators=100)
    model = GradientBoostingClassifier()
    model2 = AdaBoostClassifier()
    model3 = GradientBoostingClassifier()
    model4 = LinearDiscriminantAnalysis()
    model5 = QuadraticDiscriminantAnalysis()

    Y = shuffle(Y)  # asymmetric shuffle: Y is shuffled independently of X
    X = shuffle(X)  # asymmetric shuffle: X is shuffled independently of Y

    bag.fit(X, Y)
    classifier.fit(X, Y)
    rfc.fit(X, Y)
    gnb.fit(X, Y)
    gnb2.fit(X, Y)
    gnb3.fit(X, Y)
    EXT2.fit(X, Y)
    EXT.fit(X, Y)
    svc.fit(X, Y)
    model.fit(X, Y)
    model2.fit(X, Y)
    model3.fit(X, Y)
    model4.fit(X, Y)
    model5.fit(X, Y)

    # Predict on X + Y (concatenation of the two sequences) and flatten to 1-D
    pred = EXT.predict(X + Y).ravel()
    pred_2 = EXT2.predict(X + Y).ravel()
    pred2 = gnb.predict(X + Y).ravel()
    pred2_2 = gnb2.predict(X + Y).ravel()
    pred2_3 = gnb3.predict(X + Y).ravel()
    pred3 = svc.predict(X + Y).ravel()
    pred4 = bag.predict(X + Y).ravel()
    pred5 = classifier.predict(X + Y).ravel()
    pred6 = rfc.predict(X + Y).ravel()
    pred7 = model.predict(X + Y).ravel()
    pred7_2 = model2.predict(X + Y).ravel()
    pred7_3 = model3.predict(X + Y).ravel()
    pred7_4 = model4.predict(X + Y).ravel()
    pred7_5 = model5.predict(X + Y).ravel()

    print("ExtraTreesClassifier_gini", pred)
    print("ExtraTreesClassifier_entropy", pred_2)
    print("GaussianNB", pred2)
    print("BernoulliNB", pred2_2)
    print("MultinomialNB", pred2_3)
    print("LinearSVC(C=0.5)", pred3)
    print("BaggingClassifier(DecisionTreeClassifier(), n_estimators=100)", pred4)
    print("LogisticRegression", pred5)
    print("RandomForestClassifier", pred6)
    print('''model = GradientBoostingClassifier()
model2=AdaBoostClassifier()
model3=GradientBoostingClassifier()
model4=LinearDiscriminantAnalysis()
model5=QuadraticDiscriminantAnalysis()''')
    print(pred7)
    print(pred7_2)
    print(pred7_3)
    print(pred7_4)
    print(pred7_5)
    print(model4.predict_log_proba(X + Y).ravel())
    print(model4.predict_proba(X + Y).ravel())
X, y)  # Returns the mean accuracy on the given test data and labels.

print "true label of 121st sample is ", y[120]
print "predicted label of 121st sample is ", lda.predict(X[120])

i = 0
for i in range(149):
    if y[i] != pred[i]:
        print "The misclassified item:", i

Zx = [[5, 5, 5, 5], [3, 3, 3, 3]]  # An item I have made up with 4 features
Z = np.array(Zx)                   # converted to a numpy array
print Z
print lda.predict_log_proba(Z)   # posterior log-probabilities for each class on the test vectors Z
print lda.predict_proba(Z)       # posterior probabilities for each class on the test vectors Z
print lda.predict(Z)             # classification of the test vectors Z
print lda.decision_function(Z)   # decision function values for each class on the test vectors Z
print confusion_matrix(pred, y)
# print fit.score(X, y)  # 96% of accuracy
print accuracy_score(y, pred)    # another way to compute accuracy (correct_predictions / all_predictions)
# Shuffle training data for each iteration
X_train, y_train = unison_shuffled_copies(X_train, y_train)
X_l, X_u, y_l, y_u = prepare_labeled_unlabeled(X_train, y_train,
                                               num_labeled, num_unlabeled)

# Semi-supervised 1
# Train on labeled data first, predict labels for unlabeled data,
# and train classifier further with these predicted labels
clf = LinearDiscriminantAnalysis()
clf = self_training(clf, X_l, y_l, X_u)

# Do predictions for test set and evaluate
y_pred = clf.predict(X_test)
y_probs = np.sum(np.max(clf.predict_log_proba(X_test), axis=1))
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
error_rates[j, i, 0] = 1 - accuracy
log_probs[j, i, 0] = y_probs

# Semi-supervised 2
# Find labels for unlabeled data with label propagation

# Set up data for LabelPropagation
X_l, X_u, y_l, y_u = prepare_labeled_unlabeled(X_train, y_train,
                                               num_labeled, num_unlabeled)

if num_unlabeled == 0:
    # First iteration
    error_rates[j, i, 1] = 1 - accuracy
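# The snippet above calls a project-specific helper, self_training(clf, X_l, y_l, X_u),
# whose implementation is not shown. A minimal sketch of the usual idea (fit on the
# labeled pool, pseudo-label the unlabeled pool, refit on both) might look like the
# code below. This is an assumption about the helper, not its original code.
import numpy as np

def self_training(clf, X_l, y_l, X_u):
    clf.fit(X_l, y_l)                      # train on the labeled samples only
    if len(X_u) == 0:
        return clf                         # nothing to pseudo-label
    y_u_pred = clf.predict(X_u)            # pseudo-labels for the unlabeled pool
    X_all = np.vstack([X_l, X_u])
    y_all = np.concatenate([y_l, y_u_pred])
    clf.fit(X_all, y_all)                  # refit on labeled + pseudo-labeled data
    return clf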
class LDA(object):
    def __init__(self, solver="svd", shrinkage=None, priors=None,
                 n_components=None, store_covariance=False, tol=1e-4):
        """
        :param solver: string, optional: "svd", "lsqr" or "eigen". The default "svd"
            does not compute the covariance matrix and suits data with many features;
            "lsqr" is a least-squares solution that can be combined with shrinkage;
            "eigen" uses eigenvalue decomposition and can also be combined with shrinkage.
        :param shrinkage: str or float, optional. Default None; "auto" for automatic
            shrinkage, or a float in [0, 1] for a fixed shrinkage parameter.
        :param priors: array, optional, shape (n_classes,). Class prior probabilities.
        :param n_components: int, optional. Number of components, default None.
        :param store_covariance: bool, optional. Only used with "svd": additionally
            compute the class covariance matrix.
        :param tol: float, default 1e-4. Threshold used for rank estimation in "svd".
        """
        self.model = LinearDiscriminantAnalysis(
            solver=solver, shrinkage=shrinkage, priors=priors,
            n_components=n_components, store_covariance=store_covariance, tol=tol)

    def fit(self, x, y):
        self.model.fit(X=x, y=y)

    def transform(self, x):
        return self.model.transform(X=x)

    def fit_transform(self, x, y):
        return self.model.fit_transform(X=x, y=y)

    def get_params(self, deep=True):
        return self.model.get_params(deep=deep)

    def set_params(self, **params):
        self.model.set_params(**params)

    def decision_function(self, x):
        return self.model.decision_function(X=x)

    def predict(self, x):
        return self.model.predict(X=x)

    def predict_log_proba(self, x):
        return self.model.predict_log_proba(X=x)

    def predict_proba(self, x):
        return self.model.predict_proba(X=x)

    def score(self, x, y, sample_weight=None):
        return self.model.score(X=x, y=y, sample_weight=sample_weight)

    def get_attributes(self):
        # These attributes are only available after the model has been fitted.
        coef = self.model.coef_                     # weight vector(s)
        intercept = self.model.intercept_           # intercept term(s)
        covariance = self.model.covariance_         # covariance matrix
        explained_variance_ratio = self.model.explained_variance_ratio_
        means = self.model.means_                   # class means
        priors = self.model.priors_                 # class priors, sum to 1, shape (n_classes,)
        scalings = self.model.scalings_             # shape (rank, n_classes - 1), feature scaling
        xbar = self.model.xbar_                     # overall mean
        classes = self.model.classes_               # class labels
        return (coef, intercept, covariance, explained_variance_ratio,
                means, priors, scalings, xbar, classes)
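# A short usage sketch for the wrapper above on synthetic data; the dataset and
# parameter choices are illustrative assumptions, not part of the original class.
import numpy as np
from sklearn.datasets import make_classification

X_toy, y_toy = make_classification(n_samples=100, n_features=5, n_informative=3,
                                   n_classes=3, random_state=0)
wrapped = LDA(solver="lsqr", shrinkage="auto")
wrapped.fit(X_toy, y_toy)
print(wrapped.predict(X_toy[:3]))
print(wrapped.predict_log_proba(X_toy[:3]))
print(wrapped.score(X_toy, y_toy))   # sample_weight defaults to None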
# train the model
clf.fit(x_train, y_train)

# looking at the attributes
coef = clf.coef_
intercept = clf.intercept_
# covariance_mat = clf.covariance_  # covariance matrix; not available with the 'svd' solver unless store_covariance=True
perc_vari = clf.explained_variance_ratio_
means = clf.means_
priors = clf.priors_
scalings = clf.scalings_
overall_mean = clf.xbar_
classes = clf.classes_

# looking at the methods
decision_function = clf.decision_function(x_test)
fit_transform = clf.fit_transform(x_test, y_test)
get_params = clf.get_params()
prediction = clf.predict(x_test)
predict_log_proba = clf.predict_log_proba(x_test)
predict_proba = clf.predict_proba(x_test)
mean_accuracy_train = clf.score(x_train, y_train)
mean_accuracy_test = clf.score(x_test, y_test)
transform = clf.transform(x_test)

print('The mean accuracy of the train dataset is: %.3f and the mean accuracy of the '
      'test dataset is: %.3f' % (mean_accuracy_train, mean_accuracy_test))

pdb.set_trace()