def test_lda_predict():
    """Check that LDA fit/predict recover the labels on simple toy data and
    that invalid shrinkage / covariance_estimator combinations raise."""
    for solver, shrinkage in solver_shrinkage:
        msg = "solver %s" % solver
        clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)

        # The toy data is separable, so predictions must match the labels.
        assert_array_equal(clf.fit(X, y).predict(X), y, msg)

        # The same model must also handle 1D data.
        assert_array_equal(clf.fit(X1, y).predict(X1), y, msg)

        # Probabilities: thresholding the positive-class probability at 0.5
        # must recover the labels, and exponentiated log-probabilities must
        # agree with the probabilities.
        proba = clf.predict_proba(X1)
        assert_array_equal((proba[:, 1] > 0.5) + 1, y, msg)
        log_proba = clf.predict_log_proba(X1)
        assert_allclose(np.exp(log_proba), proba, rtol=1e-6, atol=1e-6,
                        err_msg=msg)

        # Regression test for commit 2f34950 -- "reuse" of priors.
        # y3 is not separable, so at least one prediction must differ.
        assert np.any(clf.fit(X, y3).predict(X) != y3), msg

    # shrinkage is not implemented for the svd solver.
    clf = LinearDiscriminantAnalysis(solver="svd", shrinkage="auto")
    with pytest.raises(NotImplementedError):
        clf.fit(X, y)

    # shrinkage and covariance_estimator are mutually exclusive.
    clf = LinearDiscriminantAnalysis(solver="lsqr",
                                     shrinkage=0.1,
                                     covariance_estimator=ShrunkCovariance())
    with pytest.raises(
            ValueError,
            match=("covariance_estimator and shrinkage "
                   "parameters are not None. "
                   "Only one of the two can be set."),
    ):
        clf.fit(X, y)

    # covariance_estimator is not supported by the svd solver.
    clf = LinearDiscriminantAnalysis(solver="svd",
                                     covariance_estimator=LedoitWolf())
    with pytest.raises(ValueError,
                       match="covariance estimator is not supported with svd"):
        clf.fit(X, y)

    # An estimator without covariance semantics must be rejected.
    clf = LinearDiscriminantAnalysis(solver="lsqr",
                                     covariance_estimator=KMeans(
                                         n_clusters=2, n_init="auto"))
    with pytest.raises(ValueError):
        clf.fit(X, y)
def test_lda_predict():
    """Check that LDA implements fit and predict and returns correct values
    for simple toy data, and that invalid shrinkage/solver settings raise."""
    for test_case in solver_shrinkage:
        solver, shrinkage = test_case
        clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
        y_pred = clf.fit(X, y).predict(X)
        assert_array_equal(y_pred, y, "solver %s" % solver)

        # Assert that it works with 1D data
        y_pred1 = clf.fit(X1, y).predict(X1)
        assert_array_equal(y_pred1, y, "solver %s" % solver)

        # Test probability estimates: thresholding at 0.5 recovers the labels,
        # and exponentiated log-probabilities match the probabilities.
        y_proba_pred1 = clf.predict_proba(X1)
        assert_array_equal((y_proba_pred1[:, 1] > 0.5) + 1, y, "solver %s" % solver)
        y_log_proba_pred1 = clf.predict_log_proba(X1)
        assert_array_almost_equal(np.exp(y_log_proba_pred1), y_proba_pred1, 8, "solver %s" % solver)

        # Primarily test for commit 2f34950 -- "reuse" of priors
        y_pred3 = clf.fit(X, y3).predict(X)
        # LDA shouldn't be able to separate those.
        # Plain assert instead of the deprecated assert_true helper, matching
        # the other copies of this test in the file.
        assert np.any(y_pred3 != y3), "solver %s" % solver

    # Test invalid shrinkages
    clf = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=-0.2231)
    assert_raises(ValueError, clf.fit, X, y)
    clf = LinearDiscriminantAnalysis(solver="eigen", shrinkage="dummy")
    assert_raises(ValueError, clf.fit, X, y)
    clf = LinearDiscriminantAnalysis(solver="svd", shrinkage="auto")
    assert_raises(NotImplementedError, clf.fit, X, y)
    # Test unknown solver
    clf = LinearDiscriminantAnalysis(solver="dummy")
    assert_raises(ValueError, clf.fit, X, y)
def test_lda_predict():
    """LDA classification sanity checks on toy data: fit/predict, 1D input,
    probability estimates, prior reuse, and invalid-parameter errors."""
    for solver, shrinkage in solver_shrinkage:
        err = 'solver %s' % solver
        clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)

        # Separable toy data: predictions must equal the labels.
        assert_array_equal(clf.fit(X, y).predict(X), y, err)

        # 1D feature matrices must work as well.
        assert_array_equal(clf.fit(X1, y).predict(X1), y, err)

        # predict_proba thresholded at 0.5 recovers the labels, and
        # exponentiated log-probabilities match the probabilities.
        probas = clf.predict_proba(X1)
        assert_array_equal((probas[:, 1] > 0.5) + 1, y, err)
        log_probas = clf.predict_log_proba(X1)
        assert_allclose(np.exp(log_probas), probas, rtol=1e-6, atol=1e-6,
                        err_msg=err)

        # Regression test for commit 2f34950 -- "reuse" of priors:
        # y3 is not separable by LDA, so some prediction must differ.
        assert np.any(clf.fit(X, y3).predict(X) != y3), err

    # Invalid shrinkage settings must raise (same order as before).
    for bad_solver, bad_shrinkage, exc in (
            ("lsqr", -0.2231, ValueError),
            ("eigen", "dummy", ValueError),
            ("svd", "auto", NotImplementedError),
    ):
        clf = LinearDiscriminantAnalysis(solver=bad_solver,
                                         shrinkage=bad_shrinkage)
        assert_raises(exc, clf.fit, X, y)

    # Unknown solver names must raise as well.
    clf = LinearDiscriminantAnalysis(solver="dummy")
    assert_raises(ValueError, clf.fit, X, y)
# Example #4 (score: 0)
def func(path, repath):
    """Train and evaluate a stacked text classifier with 5-fold CV.

    Reads one document per line from *path* ("<label> <tokens...>"), maps the
    Chinese label to a class id, builds TF-IDF features reduced to 1000
    dimensions via SVD_Vec, then trains a LogisticRegression -> LDA ->
    LinearSVC stack (each stage consumes the previous stage's
    log-probabilities).  Per-fold classification reports are printed and
    written to repath + basename(path).

    NOTE(review): Python 2 code (print statements, old StratifiedKFold API);
    the input file handle opened below is never closed.
    """
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    #path = '/home/hao/桌面/学科分类新/2gram/geog.txt'  # geog.txt
    #mapPath = '/home/hao/PycharmProjects/subjectClassify/TFIDF/map/aa/all.txt'
    # Three text categories: Chinese label -> class id.
    lablemap = {"识记与理解": '0', "分析与应用": '1', "综合与拓展": '2'}

    lables = []  # class labels y
    corpus = []  # tokenised documents (tokens separated by spaces)
    list2 = []
    for line in open(path, 'r').readlines():
        words = line.strip().split(' ')
        lable = lablemap.get(words[0])

        # Strip the leading label token; the remainder is the document text.
        line = line[line.find(' ') + 1:]
        corpus.append(line)
        lables.append(lable)
    # for ti in range(0, 4, 1):
    #     for ind in range(0, len(list2), 1):
    #         lables.append('2')
    #         corpus.append(list2[ind])
    print os.path.basename(
        path) + '------------------------------------------------------------'
    fwrite = open(repath + os.path.basename(path), 'w')
    fwrite.write(os.path.basename(path) + '\n')
    # 5-fold cross-validation
    #lables = np.array(lables)
    kf = StratifiedKFold(lables, n_folds=5)
    #kf = KFold(len(lables), n_folds=5)
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    tfidf = SVD_Vec(tfidf, 1000)
    i = 0
    for train, test in kf:
        i = i + 1
        print 'fold' + str(i) + ''
        fwrite.write('fold' + str(i) + '\n')

        clf = LogisticRegression()
        clf2 = LDA()
        clf4 = LinearSVC()

        # Gather the training split for this fold.
        X = []
        y = []
        for ti in train:
            # if(lables[ti]=='2'):
            #     for time in range(0,10,1):
            #         X.append(tfidf[ti])
            #         y.append(lables[ti])
            # if (lables[ti] == '1'):
            #     for time in range(0, 4, 1):
            #         X.append(tfidf[ti])
            #         y.append(lables[ti])
            # else:
            X.append(tfidf[ti])
            y.append(lables[ti])

        # Stacking: each classifier is fit on the log-probabilities produced
        # by the previous one.
        clf.fit(X, y)
        X = clf.predict_log_proba(X)
        clf2.fit(X, y)
        X = clf2.predict_log_proba(X)
        clf4.fit(X, y)

        # Gather the held-out split for this fold.
        Xt = []
        yt = []
        for xti in test:
            yt.append(lables[xti])
            Xt.append(tfidf[xti])

        # Apply the same stacked transforms before the final prediction.
        Xt = clf.predict_log_proba(Xt)
        Xt = clf2.predict_log_proba(Xt)
        predicted = clf4.predict(Xt)
        fwrite.write(
            classification_report(yt, predicted).replace('\n\n', '\n'))
        print classification_report(yt, predicted).replace('\n\n', '\n')
        #print accuracy_score(testlables, predicted)

    #scores = scv.cross_val_score(clf, tfidf, lables1, cv=5, scoring='accuracy')
    #print scores
    #predicted = scv.cross_val_predict(clf, tfidf1, lables1, cv=5)

    #predicted = clf.predict(tfidf1)
    #print os.path.basename(path)
    #print classification_report(lables1,predicted)
    #print len(predicted)
    # prere = mt.accuracy_score(lables,predicted)
    #fwrite = open('/home/hao/桌面/学科分类新/pre/lr/'+ os.path.basename(path),'w')
    #for pre in predicted:
    #    fwrite.write(pre)
    #    fwrite.write('\n')
    fwrite.close()
# Example #5 (score: 0)
def func(trainpath,testpath,repath,testFile):
    """Train the stacked classifier on *trainpath* and label a test set.

    Trains the LogisticRegression -> LDA -> LinearSVC stack on TF-IDF
    features (SVD-reduced to 1000 dims) from *trainpath*, then predicts
    labels for the vectors loaded from *testpath* and writes a CSV pairing
    the first two columns of each test row with the predicted Chinese label.

    NOTE(review): Python 2 code (print statement, file()).  Relies on
    module-level transformer/vectorizer and helpers SVD_Vec/loadTest.
    """
    clf = LogisticRegression()
    clf2 = LDA();
    clf4 = LinearSVC();
    repath = repath + os.path.basename(trainpath)
    testpath = testpath + os.path.basename(trainpath)
    # NOTE(review): repath/testFile get basename(trainpath) appended a second
    # time here, yielding paths like ".../name.txtname.csv" — confirm this is
    # intentional against the caller.
    repath = repath + os.path.basename(trainpath).replace('.txt','.csv')
    testFile = testFile + os.path.basename(trainpath).replace('.txt','.csv')

    # Chinese label <-> class id mappings (forward and reverse).
    lablemap = {"识记与理解": '0', "分析与应用": '1', "综合与拓展": '2'}
    lablemap2 = {'0': "识记与理解", '1': "分析与应用", '2':"综合与拓展"}

    y = []# class labels y
    corpus = []# tokenised documents (tokens separated by spaces)
    list2=[]
    for line in open(trainpath, 'r').readlines():
        words = line.strip().split(' ')
        lable = lablemap.get(words[0])
        # Strip the leading label token; the remainder is the document text.
        line = line[line.find(' ') + 1:]

        corpus.append(line)
        y.append(lable)
        if lable == '2':
            list2.append(line)

    # for ti in range(0, 4, 1):
    #     for ind in range(0, len(list2), 1):
    #         y.append('2')
    #         corpus.append(list2[ind])

    X = transformer.fit_transform(vectorizer.fit_transform(corpus))
    X = SVD_Vec(X, 1000)
    print os.path.basename(trainpath)+'------------------------------------------------------------'

    # Stacking: each classifier is fit on the previous one's log-probabilities.
    clf.fit(X,y)
    X = clf.predict_log_proba(X)
    clf2.fit(X,y)
    X = clf2.predict_log_proba(X)
    clf4.fit(X, y)

    csvfile = file(testFile,'rb')
    testAll = csv.reader(csvfile)

    csvtest = []  # raw rows of the test CSV; columns 0 and 1 are echoed below
    for line in testAll:
        csvtest.append(line)

    csvout=file(repath, 'wb')
    csvwriter = csv.writer(csvout)

    # Push the test vectors through the same stacked transforms.
    testX = loadTest(testpath)
    predicted = clf.predict_log_proba(testX)
    predicted = clf2.predict_log_proba(predicted);
    predicted = clf4.predict(predicted);
    for preindex in range(0,len(predicted)):
        pre = predicted[preindex]
        csvnub = csvtest[preindex]
        lableStr=lablemap2.get(pre)
        csvreline=[]
        csvreline.append(csvnub[0])
        csvreline.append(csvnub[1])
        csvreline.append(lableStr)
        csvwriter.writerow(csvreline)
    csvout.close()
# Example #6 (score: 0)
def type(X,Y):
    """Fit a battery of sklearn classifiers on (X, Y) and print each model's
    predictions on the concatenation X + Y.

    NOTE(review): this function shadows the builtin `type`; renaming it would
    change the public interface, so it is only flagged here.  X and Y are
    shuffled independently below, which destroys the sample/label pairing —
    confirm that this is intentional.  `X + Y` presumably concatenates two
    lists of samples — verify against the caller.
    """
    rfc = RandomForestClassifier()
    classifier =LogisticRegression()  # SVC(kernel="linear") #svm.SVC(kernel='rbf',C=1,gamma='auto')
    gnb =GaussianNB() #BernoulliNB()#MultinomialNB()#
    gnb2=BernoulliNB()
    gnb3=MultinomialNB()
    svc = LinearSVC(C=0.5)
    EXT =ExtraTreesClassifier(criterion='gini', bootstrap=True,n_estimators=80,oob_score=True)
    EXT2 = ExtraTreesClassifier(criterion='entropy', bootstrap=True,n_estimators=125,oob_score=True)
    bag = BaggingClassifier(DecisionTreeClassifier(), n_estimators=100)
    model = GradientBoostingClassifier()
    model2=AdaBoostClassifier()
    model3=GradientBoostingClassifier()
    model4=LinearDiscriminantAnalysis()
    model5=QuadraticDiscriminantAnalysis()

    Y=shuffle(Y)# shuffled independently of X (asymmetric shuffle)
    X=shuffle(X)# shuffled independently of Y (asymmetric shuffle)

    # Fit every model on the same (shuffled) data.
    bag.fit(X, Y)
    classifier.fit(X, Y)
    rfc.fit(X, Y)
    gnb.fit(X, Y)
    gnb2.fit(X, Y)
    gnb3.fit(X, Y)
    EXT2.fit(X, Y)
    EXT.fit(X, Y)
    svc.fit(X,Y)
    model.fit(X,Y)
    model2.fit(X,Y)
    model3.fit(X,Y)
    model4.fit(X,Y)
    model5.fit(X,Y)


    # Predict on X + Y and flatten each result to 1-D.
    pred = EXT.predict(X+Y).ravel()  # predict, flatten to 1-D
    pred_2=EXT2.predict(X+Y).ravel()  # predict, flatten to 1-D
    pred2 = gnb.predict(X + Y).ravel()  # predict, flatten to 1-D
    pred2_2 = gnb2.predict(X + Y).ravel()  # predict, flatten to 1-D
    pred2_3 = gnb3.predict(X + Y).ravel()  # predict, flatten to 1-D
    pred3=svc.predict(X + Y).ravel()
    pred4=bag.predict(X + Y).ravel()
    pred5=classifier.predict(X + Y).ravel()
    pred6=rfc.predict(X + Y).ravel()
    pred7=model.predict(X + Y).ravel()
    pred7_2 = model2.predict(X + Y).ravel()
    pred7_3 = model3.predict(X + Y).ravel()
    pred7_4 = model4.predict(X + Y).ravel()
    pred7_5= model5.predict(X + Y).ravel()



    # Report each model's predictions.
    print("ExtraTreesClassifier_gini",pred)
    print("ExtraTreesClassifier_entropy",pred_2)
    print("GaussianNB",pred2)
    print("BernoulliNB", pred2_2)
    print("MultinomialNB",pred2_3)
    print("LinearSVC(C=0.5)",pred3)
    print("BaggingClassifier(DecisionTreeClassifier(), n_estimators=100)", pred4)
    print("LogisticRegression", pred5)
    print("RandomForestClassifier", pred6)
    print('''model = GradientBoostingClassifier()  
    model2=AdaBoostClassifier()
    model3=GradientBoostingClassifier()
    model4=LinearDiscriminantAnalysis()
    model5=QuadraticDiscriminantAnalysis()''')
    print(pred7)
    print(pred7_2)
    print(pred7_3)
    print(pred7_4)
    print(pred7_5)
    print(model4.predict_log_proba(X + Y).ravel())
    print(model4.predict_proba(X+Y).ravel())
# ... (X, y)  # Returns the mean accuracy on the given test data and labels.
# NOTE(review): truncated fragment of a `score(X, y)` call from the original
# snippet; commented out because it is not valid on its own.

# NOTE(review): Python 2 script fragment; relies on X, y, pred, lda,
# confusion_matrix and accuracy_score defined earlier in the file.
print "true label of 121th sample is ", y[120]
print "preidcted label of 121th sample is ", lda.predict(X[120])

# NOTE(review): the trailing comma makes this the 1-tuple (0,); harmless
# because the for loop below rebinds i, but probably a typo for `i = 0`.
i = 0,
# Report every index where the prediction disagrees with the true label.
for i in range(149):
    if (y[i] != pred[i]):
        print "The misclassified item:", i

Zx = [[5, 5, 5, 5],
      [3, 3, 3, 3]]  # This is the item that I have made up with 4 features
Z = np.array(Zx)  # I have changed it as a numpy array
print Z
print lda.predict_log_proba(
    Z
)  # This function returns posterior log-probabilities of classification according to each class on an array of test vectors X.
print lda.predict_proba(
    Z
)  # This function returns posterior probabilities of classification according to each class on an array of test vectors X.
print lda.predict(
    Z)  # This function does classification on an array of test vectors X.
print lda.decision_function(
    Z
)  # This function returns the decision function values related to each class on an array of test vectors X.

print confusion_matrix(pred, y)  # confusion matrix of predictions vs. true labels
# print fit.score(X, y)  # 96% of accuracy
print accuracy_score(
    y, pred
)  # the use of another function for calculating the accuracy (correct_predictions / all_predictions)
        # Shuffle training data for each iteration
        X_train, y_train = unison_shuffled_copies(X_train, y_train)

        X_l, X_u, y_l, y_u = prepare_labeled_unlabeled(X_train, y_train,
                                                       num_labeled,
                                                       num_unlabeled)

        # Semi-supervised 1
        # Train on labeled data first, predict labels for unlabeled data,
        # and train classifier further with these predicted labels
        clf = LinearDiscriminantAnalysis()
        clf = self_training(clf, X_l, y_l, X_u)

        # Do predictions for test set and evaluate
        y_pred = clf.predict(X_test)
        y_probs = np.sum(np.max(clf.predict_log_proba(X_test), axis=1))
        accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
        error_rates[j, i, 0] = 1 - accuracy
        log_probs[j, i, 0] = y_probs

        # ## Semi-supervised 2
        # Find labels for unlabeled data with label propagation

        # Set up data for LabelPropagation
        X_l, X_u, y_l, y_u = prepare_labeled_unlabeled(X_train, y_train,
                                                       num_labeled,
                                                       num_unlabeled)

        if num_unlabeled == 0:
            # First iteration
            error_rates[j, i, 1] = 1 - accuracy
# Example #9 (score: 0)
class LDA(object):
    """Thin wrapper around sklearn's LinearDiscriminantAnalysis that exposes
    the same estimator API (fit/transform/predict/score/...) and a helper to
    collect the fitted attributes in one call."""

    def __init__(self,
                 solver="svd",
                 shrinkage=None,
                 priors=None,
                 n_components=None,
                 store_covariance=False,
                 tol=1e-4):
        """
        :param solver: string, one of "svd" (default; does not compute the
            covariance matrix, suited to data with many features), "lsqr"
            (least squares, usable with shrinkage) or "eigen"
            (eigendecomposition, usable with shrinkage).
        :param shrinkage: str or float, optional. None (default), "auto" for
            automatic shrinkage, or a float in [0, 1] for a fixed shrinkage
            parameter.
        :param priors: array, optional, shape (n_classes,). Class priors.
        :param n_components: int, optional. Number of components; default None.
        :param store_covariance: bool, optional. Only used with "svd":
            additionally compute the class covariance matrix.
        :param tol: float, default 1e-4. Threshold used for rank estimation
            in the "svd" solver.
        """
        self.model = LinearDiscriminantAnalysis(
            solver=solver,
            shrinkage=shrinkage,
            priors=priors,
            n_components=n_components,
            store_covariance=store_covariance,
            tol=tol)

    def fit(self, x, y):
        """Fit the model; returns self so calls can be chained.

        (Previously returned None, so this is backward compatible.)
        """
        self.model.fit(X=x, y=y)
        return self

    def transform(self, x):
        """Project x onto the discriminant components."""
        return self.model.transform(X=x)

    def fit_transform(self, x, y):
        """Fit the model on (x, y) and return the transformed x."""
        return self.model.fit_transform(X=x, y=y)

    def get_params(self, deep=True):
        """Return the underlying estimator's parameters."""
        return self.model.get_params(deep=deep)

    def set_params(self, **params):
        """Set the underlying estimator's parameters; returns self."""
        self.model.set_params(**params)
        return self

    def decision_function(self, x):
        """Return per-class decision function values for x.

        Bug fix: the result was previously computed but not returned.
        """
        return self.model.decision_function(X=x)

    def predict(self, x):
        """Return the predicted class labels for x."""
        return self.model.predict(X=x)

    def predict_log_proba(self, x):
        """Return per-class log-probabilities for x."""
        return self.model.predict_log_proba(X=x)

    def predict_proba(self, x):
        """Return per-class probabilities for x."""
        return self.model.predict_proba(X=x)

    def score(self, x, y, sample_weight=None):
        """Return the mean accuracy on (x, y).

        sample_weight now defaults to None (matching sklearn's signature);
        existing positional callers are unaffected.
        """
        return self.model.score(X=x, y=y, sample_weight=sample_weight)

    def get_attributes(self):
        """Return the fitted attributes; only valid after fit()."""
        coef = self.model.coef_  # weight vector(s)
        intercept = self.model.intercept_  # intercept term(s)
        covariance = self.model.covariance_  # covariance matrix
        explained_variance_ratio = self.model.explained_variance_ratio_
        means = self.model.means_  # per-class means
        priors = self.model.priors_  # class priors, sum to 1, shape (n_classes,)
        scalings = self.model.scalings_  # shape (rank, n_classes - 1)
        xbar = self.model.xbar_  # overall mean
        classes = self.model.classes_  # class labels

        return coef, intercept, covariance, explained_variance_ratio, means, priors, scalings, xbar, classes
# train the model
clf.fit(x_train, y_train)

# looking at the attributes (all set during fit)
coef = clf.coef_  # weight vectors
intercept = clf.intercept_  # intercept terms
#covariance_mat = clf.covariance_ # gives the covariance matrix, does not work for the solver 'svd'
perc_vari = clf.explained_variance_ratio_  # variance explained per component
means = clf.means_  # per-class feature means
priors = clf.priors_  # class prior probabilities
scalings = clf.scalings_
overall_mean = clf.xbar_  # overall mean of the training data
classes = clf.classes_  # class labels

# looking at the methods
decison_function = clf.decision_function(x_test)  # NOTE(review): variable name typo ("decison")
# NOTE(review): fit_transform refits the model on the TEST set, so the
# attributes read above and the scores below reflect that refit — confirm
# this is intentional.
fit_transform = clf.fit_transform(x_test, y_test)
get_params = clf.get_params()
prediction = clf.predict(x_test)
predict_log_proba = clf.predict_log_proba(x_test)
predict_proba = clf.predict_proba(x_test)
mean_accuracy_train = clf.score(x_train, y_train)
mean_accuracy_test = clf.score(x_test, y_test)
transform = clf.transform(x_test)

print(
    'The mean accuracy of the train dataset is: %.3f and the mean accuracy of the test dataset is: %.3f'
    % (mean_accuracy_train, mean_accuracy_test))

# Drop into the interactive debugger for inspection.
pdb.set_trace()