Example 1
def test_ecoc_delegate_sparse_base_estimator():
    # Non-regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/17218
    X, y = iris.data, iris.target
    X_sp = sp.csc_matrix(X)

    # create an estimator that does not support sparse input
    base_estimator = CheckingClassifier(
        check_X=check_array,
        check_X_params={
            "ensure_2d": True,
            "accept_sparse": False
        },
    )
    ecoc = OutputCodeClassifier(base_estimator, random_state=0)

    with pytest.raises(TypeError, match="A sparse matrix was passed"):
        ecoc.fit(X_sp, y)

    ecoc.fit(X, y)
    with pytest.raises(TypeError, match="A sparse matrix was passed"):
        ecoc.predict(X_sp)

    # smoke test to check when sparse input should be supported
    ecoc = OutputCodeClassifier(LinearSVC(random_state=0))
    ecoc.fit(X_sp, y).predict(X_sp)
    assert len(ecoc.estimators_) == 4
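The final assertion holds because code_size defaults to 1.5 and iris has 3 classes, so the code book has int(3 * 1.5) = 4 columns with one fitted estimator per column. A minimal sketch of that relationship, reusing X and y from the test above:

ecoc = OutputCodeClassifier(LinearSVC(random_state=0), random_state=0)
ecoc.fit(X, y)
# one binary estimator per code-book column: int(n_classes * code_size)
assert ecoc.code_book_.shape == (len(ecoc.classes_), len(ecoc.estimators_))
assert len(ecoc.estimators_) == int(len(ecoc.classes_) * 1.5)  # == 4 for iris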
Example 2
def run_test(**kwargs):
    b = fetch_sw_orl()
    tic = time.time()

    # split the data into train and test sets
    X_train, X_test, y_train, y_true = train_test_split(b.data,
                                                        b.target,
                                                        test_size=0.2,
                                                        stratify=b.target)

    hog_train = []
    for img_array in X_train:
        fd, _ = hog(img_array.reshape(b.shape),
                    orientations=8,
                    pixels_per_cell=(PPC, PPC),
                    cells_per_block=(1, 1),
                    visualize=True,
                    multichannel=False)
        hog_train.append(fd)

    clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2)
    clf.fit(hog_train, y_train)
    tok = time.time()

    hog_test = []
    for img_array in X_test:
        fd, _ = hog(img_array.reshape(b.shape),
                    orientations=8,
                    pixels_per_cell=(PPC, PPC),
                    cells_per_block=(1, 1),
                    visualize=True,
                    multichannel=False)
        hog_test.append(fd)
    y_pred = clf.predict(hog_test)
    return tok - tic, accuracy_score(y_true, y_pred)
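A minimal usage sketch for run_test, assuming fetch_sw_orl and PPC are defined as in the snippet above:

# elapsed covers everything up to the end of fit; acc is test-set accuracy
elapsed, acc = run_test()
print(f'trained in {elapsed:.2f}s, test accuracy {acc:.3f}')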
Example 3
def clasificar_ECOC(X, y, df, trainInputs, trainOutputs, testInputs, testOutputs, graphname):
	print("\n[" + str(graphname) + "]")
	kernelRBF = 1.0 * RBF(1.0)  # defined but unused by the ECOC model below
	clf = OutputCodeClassifier(estimator=DecisionTreeClassifier())
	clf = clf.fit(trainInputs, trainOutputs)
	precisionTrain = clf.score(trainInputs, trainOutputs)
	precisionTest = clf.score(testInputs, testOutputs)
	print("\tCCR train = %.2f%% | CCR test = %.2f%%" % (precisionTrain * 100, precisionTest * 100))
	prediccion_test = clf.predict(testInputs)
	print(prediccion_test)
	print(testOutputs)
	return precisionTest
Example 4
def evaluateOutputCode(X, Y, printReport=False):
    start = datetime.datetime.now()
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.2,
                                                        random_state=42)
    clf = OutputCodeClassifier(LinearSVC(random_state=0),
                               code_size=2,
                               random_state=0)
    clf.fit(X_train, Y_train)
    if printReport:
        print('Training time: ' + str(datetime.datetime.now() - start))
        print('Evaluation result: OutputCode: ' +
              str(clf.score(X_test, Y_test)))
    # keep the true labels intact; store the predictions separately
    Y_pred = clf.predict(X_test)
    if printReport:
        print('0: ' + str((Y_pred == 0).sum()))
        print('1: ' + str((Y_pred == 1).sum()))
        print('2: ' + str((Y_pred == 2).sum()))
    return [clf.score(X_test, Y_test), (Y_pred == 1).sum(), clf]
Example 5
def OutputCodeClassifier(data, label, pred_data, pred_last):
    '''
    0.76473194506
    Number of mislabeled points out of a total 841 points : 211
    0.749108204518
    The inputs need to be normalized.
    '''
    data = np.array(data)
    pred_data = np.array(pred_data)
    label = np.array(label)
    pred_last = np.array(pred_last)
    # imported inside the function so the class is not shadowed by the
    # function's own name
    from sklearn.multiclass import OutputCodeClassifier
    from sklearn.svm import LinearSVC
    clf = OutputCodeClassifier(LinearSVC(random_state=0),
                               code_size=2,
                               random_state=0)
    clf.fit(data, label)

    print(clf.score(data, label))
    pred_result = clf.predict(pred_data)
    print("Number of mislabeled points out of a total %d points : %d" %
          (pred_data.shape[0], (pred_last != pred_result).sum()))
    print(clf.score(pred_data, pred_last))
    return pred_result
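The docstring's note that the inputs need normalization can be handled inside the model; a minimal sketch using a standard sklearn pipeline as the base estimator (not part of the original snippet):

from sklearn.multiclass import OutputCodeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# scale features to zero mean and unit variance before each binary LinearSVC
base = make_pipeline(StandardScaler(), LinearSVC(random_state=0))
clf = OutputCodeClassifier(base, code_size=2, random_state=0)
# fit and predict exactly as before: clf.fit(data, label); clf.predict(pred_data)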
Example 6
    # build the Haar-feature table for the training images
    train_ecoc_table = np.zeros(shape=(np.shape(images)[0], 200))
    for ind, im in enumerate(images):
        row = []
        for (top_left, bottom_right) in rectangles:
            row += get_haar_features(im, top_left, bottom_right)

        train_ecoc_table[ind] = row

    test_ecoc_table = np.zeros(shape=(np.shape(test_images)[0], 200))
    for ind, im in enumerate(test_images):
        row = []
        for (top_left, bottom_right) in rectangles:
            row += get_haar_features(im, top_left, bottom_right)

        test_ecoc_table[ind] = row

    clf = OutputCodeClassifier(AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200), code_size=5, random_state=0)
    clf.fit(train_ecoc_table, labels)

    train_pred = np.array(clf.predict(train_ecoc_table))
    print("Digits Training Accuracy: %f" %
          (np.sum(train_pred == np.array(labels)).astype(float) /
           np.shape(train_pred)[0]))

    test_pred = np.array(clf.predict(test_ecoc_table))
    print("Digits Testing Accuracy: %f" %
          (np.sum(test_pred == np.array(test_labels)).astype(float) /
           np.shape(test_pred)[0]))

    # ecoc_table = []
    # for im in images:
    #
    #     im_preprocess = np.matrix([[np.sum(im[:i,:j]) for i in range(1, 29)] for j in range(1, 29)])
    #
    #     def get_black_rectangle(top_left, bottom_right):
    #         x1, y1 = top_left
    #         x2, y2 = bottom_right
    #
    #         return im_preprocess[x2, y2] - im_preprocess[x2, y1] - im_preprocess[x1, y2] + im_preprocess[x1, y1]
Example 7
def test_ecoc_exceptions():
    ecoc = OutputCodeClassifier(LinearSVC(random_state=0))
    with pytest.raises(NotFittedError):
        ecoc.predict([])
Example 8
                                                    breast.target,
                                                    test_size=0.2)

# create the classifiers
clf_1 = MLPClassifier(solver='lbfgs',
                      alpha=1e-5,
                      hidden_layer_sizes=(5, 2),
                      random_state=42)
clf_2 = OutputCodeClassifier(LinearSVC(random_state=0),
                             code_size=2,
                             random_state=42)

# train the classifier with training data
clf_1.fit(x_train, y_train)
clf_2.fit(x_train, y_train)

# predict on the x_test data
y_pred_1 = clf_1.predict(x_test)
y_pred_2 = clf_2.predict(x_test)

# calculate accuracy of y_pred using y_test
print(f'MLPClassifier accuracy {accuracy_score(y_test, y_pred_1)}')
print(f'OutputCodeClassifier accuracy {accuracy_score(y_test, y_pred_2)}')

# use classification_report to print more information
print(
    f'\n\nClassification report for MLPClassifier is\n {classification_report(y_test, y_pred_1)}'
)
print(
    f'\n\nClassification report for OutputCodeClassifier is\n {classification_report(y_test, y_pred_2)}'
)
Example 9
@author: 凯风
"""

from sklearn.datasets import load_iris
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

iris_data = load_iris()
X, Y = iris_data.data, iris_data.target
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.3)
'''
    Error-correcting output codes (ECOC)
        A different approach from one-vs-one and one-vs-rest:
        classes are represented as code words in Euclidean space.
        For a textbook treatment, see Zhou Zhihua's "Machine Learning".
'''

clf = LinearSVC(random_state=0)
ecoc = OutputCodeClassifier(clf, code_size=1.5, random_state=None, n_jobs=1)
ecoc.fit(trainX, trainY)
ecoc.predict(testX)
ecoc.code_book_
'''
    estimator           the base estimator
    code_size           size of the code space relative to the number of classes
    random_state        seed used to generate the code book
    n_jobs              number of CPU jobs to run in parallel
'''
Example 10
def get_key(val):
    identifiedKey = [k for k, v in category_codes.items() if v == val]
    if len(identifiedKey) == 0:
        return "No value"
    return identifiedKey[0]

## Actual code to run for large data prediction
#labels_predict = svc.predict(features_test)
#print(accuracy_score(labels_test_test, labels_predict_test))

## Test predict only for 2000 rows due to machine constraints
very_small_sample_size = 2000
labels_test_test = labels_test.head(very_small_sample_size)
input_test = X_test.head(very_small_sample_size)
features_test_input = tfidf.transform(input_test).toarray()
labels_predict_test = occ.predict(features_test_input)

## Print the test results accuracy compared to actual
print("The training accuracy is: ")
print(accuracy_score(labels_test_test, labels_predict_test))

print("Classification report is as follows: ")
print(classification_report(labels_test_test, labels_predict_test))

## Uncomment below for printing confusion matrix
# cm=confusion_matrix(labels_test_test, labels_predict_test)
# sns.heatmap(cm, annot=True)

## For very small given sample size print - Item Desc || Actual test category || Predicted test category
for i in range(very_small_sample_size):
    print (input_test.values[i], " || ", get_key(labels_test.values[i]), " || ", get_key(labels_predict_test[i]))
Example 11
# split the dataset
welfare_data_train, welfare_data_test, welfare_target_train, welfare_target_test = \
train_test_split(welfare_data, welfare_target, test_size=0.2, random_state=666)

# standardize the data
# stdScaler = StandardScaler().fit(welfare_data_train)
# welfare_data_train_std = stdScaler.transform(welfare_data_train)
# welfare_data_test_std = stdScaler.transform(welfare_data_test)

# build the SVM model with a linear kernel
model = OutputCodeClassifier(LinearSVC())
model = model.fit(welfare_data_train, welfare_target_train)
# save the model
joblib.dump(model, 'welfare_predict.pkl')

welfare_target_predict = model.predict(welfare_data_test)
print('First 20 predictions:\n', welfare_target_predict[:20])

print('SVM prediction accuracy:',
      accuracy_score(welfare_target_test, welfare_target_predict))
print('SVM prediction precision:',
      precision_score(welfare_target_test, welfare_target_predict, average='micro'))
print('SVM prediction recall:',
      recall_score(welfare_target_test, welfare_target_predict, average='micro'))
print('SVM prediction F1 score:',
      f1_score(welfare_target_test, welfare_target_predict, average='micro'))
print("SVM prediction Cohen's Kappa:",
      cohen_kappa_score(welfare_target_test, welfare_target_predict))
# SVM prediction accuracy:  0.9966957044157405
# SVM prediction precision: 0.9966957044157405
# SVM prediction recall:    0.9966957044157405
# (with micro averaging on single-label multiclass data, precision and recall
#  both equal accuracy, which is why the three numbers are identical)
Example 12
                           random_state=42)
clf.fit(hog_train, y_train)
tok = time.time()

if control[1]:
    # create the HOG features for X_test
    hog_test = []
    for img_array in X_test:
        fd, _ = hog(img_array.reshape(b.shape),
                    orientations=8,
                    pixels_per_cell=(PPC, PPC),
                    cells_per_block=(1, 1),
                    visualize=True,
                    multichannel=False)
        hog_test.append(fd)
    y_pred = clf.predict(hog_test)

    print(
        f'the number of correct example is {accuracy_score(y_true, y_pred, normalize=False)}, with accuracy score of {accuracy_score(y_true, y_pred)}'
    )
    print(classification_report(y_true, y_pred, zero_division=0.0))
    print(f'time to train : {tok - tic:.5}')


def run_test(**kwargs):
    b = fetch_sw_orl()
    tic = time.time()

    # split the data into train and test sets
    X_train, X_test, y_train, y_true = train_test_split(b.data,
                                                        b.target,
Example 13
# Initialise scaler to scale the data
scaler = StandardScaler()

train_set = np.empty(train_x.shape, dtype=float)
test_set = np.empty(test_x.shape, dtype=float)

# Make training data suitable for scaling
for index in range(len(train_x)):
    train_set[index] = train_x[index].astype(float)

# Make testing data suitable for scaling
for index in range(len(test_x)):
    test_set[index] = test_x[index].astype(float)

# Fit the training data
scaler.fit(train_set)

# Scale the training and testing data w.r.t scaler
data_train = scaler.transform(train_set)
data_test = scaler.transform(test_set)

occ = OutputCodeClassifier(BaggingClassifier())
occ.fit(data_train, train_y)
prediction = occ.predict(data_test)
accuracy = accuracy_score(test_y, prediction)

# -------------------- Print the final result -------------------- #

print("\nAccuracy using Output Code Classifier :", round(accuracy * 100, 3),
      "%\n")
Example 14
        for w in words:
            for i, word in enumerate(vocab):
                if word == w:
                    bag_vector[i] += 1

        print("{0} \n{1}\n".format(sentence, numpy.array(bag_vector)))


allsentences = [
    "Joe waited`s for the train", "The train was late",
    "Mary and Samantha took the bus",
    "I looked for Mary and Samantha at the bus station",
    "Mary and Samantha arrived at the bus station early but waited until noon for the bus"
]

generate_bow(allsentences)

from sklearn import datasets
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC
iris = datasets.load_iris()
X, y = iris.data, iris.target
print(X)
clf = OutputCodeClassifier(LinearSVC(random_state=0),
                           code_size=2,
                           random_state=0)

clf.fit(X, y)
m = clf.predict(X)
print(m)
Example 15
y= np.array(np.ones(15050), ndmin=1)     #label 1 for malignant
y_train=np.concatenate((x,y), axis=0)

'''

#labeling y_test
x1= np.array(np.zeros(50), ndmin=1)  #label 0 for benign
y1= np.array(np.ones(50), ndmin=1)   #label 1 for malignant
y_test=np.concatenate((x1,y1), axis=0)

################ Using LinearSVC
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 5000, 10000]}
clf = OutputCodeClassifier(LinearSVC(random_state=0, verbose=5),
                           code_size=3, random_state=0)
clf.fit(X_train, y_train)
predictions=clf.predict(X_test)

print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

########### Using GridSearchCV
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 5000, 10000],
              'gamma': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001]}
model_grid = GridSearchCV(SVC(), param_grid, verbose=5, cv=10)


ecoc = OutputCodeClassifier(LinearSVC(random_state=0), random_state=0)
Cs = [0.0001, 0.001, 0.01, 0.5, 0.8, 0.1, 1, 10, 100, 1000, 5000, 10000]
cv = GridSearchCV(ecoc, {'estimator__C': Cs}, verbose=5, cv=10)

cv.fit(X_train, y_train)
grid_pred = cv.predict(X_test)
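The 'estimator__C' key works because GridSearchCV routes double-underscore parameter names through the OutputCodeClassifier wrapper to its base estimator; a quick sketch to list the tunable names on the ecoc object defined above:

# includes 'code_size', 'random_state', and every LinearSVC parameter
# under the 'estimator__' prefix, e.g. 'estimator__C'
for name in sorted(ecoc.get_params().keys()):
    print(name)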
Example 16
# Test
threshold_test = np.where((y_test == 0) | (y_test == 1) | (y_test == 7)
                          | (y_test == 8))
y_test_thres, x_test_thres = y_test[threshold_test], x_test[threshold_test]

###################################################################################################
################################# Training a classifier (4  numbers) ##############################

num_iter = 5

start_time_OCC = time.time()

OCC = OutputCodeClassifier(Perceptron(max_iter=num_iter, random_state=0))
OCC.fit(x_train_thres, y_train_thres)
predictionsOCC = OCC.predict(x_test_thres)
scoreOCC = OCC.score(x_test_thres, y_test_thres)

cmOCC = metrics.confusion_matrix(y_test_thres, predictionsOCC)
plt.figure(figsize=(9, 9))
sns.heatmap(cmOCC,
            annot=True,
            fmt=".3f",
            linewidths=.5,
            square=True,
            cmap='Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'OCC - Accuracy Score: {0}'.format(scoreOCC)
plt.title(all_sample_title, size=15)
plt.show()
Example 17
"""

from sklearn import datasets
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

# load the data
iris = datasets.load_iris()
x, y = iris.data, iris.target
print('number of samples: %d, number of features: %d' % x.shape)

# create the model object
# code_size controls how many sub-models are used; the actual number of
# sub-models is code_size * number_of_labels
clf = OutputCodeClassifier(LinearSVC(random_state=0),
                           code_size=30,
                           random_state=0)
# fit the model
clf.fit(x, y)

# print the predictions
print(clf.predict(x))
print('accuracy %.3f' % accuracy_score(y, clf.predict(x)))

# inspect the fitted model attributes
k = 1
for item in clf.estimators_:
    print('model %d' % k)
    print(item)
    k += 1
print(clf.classes_)
Example 18
def main():

    filenameLB = 'mfcc_lb.csv'
    allsongcat = pickle.load(open('mfcc_fv.p', 'rb'))
    hcdf = pickle.load(open('hcdf_fv.p', 'rb'))

    with open('mfcc_lb.csv') as f:
        reader = csv.reader(f)
        for row in reader:
            labels = row

    # select training and test sets
    '''
    TEidx = np.array(random.sample(range(0,1000), 100))
    
    training = []
    test = []
    
    trainingLB = []
    testLB = []

    # make numpy arrays
    for i in range(1000):
        if i in TEidx:
            test.append(featureDict[i])
            testLB.append(int(labels[i]))
        else:
            training.append(featureDict[i])
            trainingLB.append(int(labels[i]))
        
    # fit with classifier and predict
    X = np.array(training)
    Y = np.array(trainingLB)

    '''
    l = [allsongcat, hcdf]
    all_feats = combineFeatures(l)
    feats_shuf = []
    labels_shuf = []
    index_shuf = list(range(len(labels)))
    shuffle(index_shuf)
    for i in index_shuf:
        feats_shuf.append(all_feats[i])
        labels_shuf.append(labels[i])

    X = np.array(feats_shuf)
    Y = np.array(labels_shuf)

    kf = KFold(n_splits=10)
    #rf = RandomForestClassifier(n_estimators=50, max_features = 'log2')
    sgd = SGDClassifier(loss="hinge", penalty="l2")
    #svc = svm.SVC(kernel='linear')
    dtree = DecisionTreeClassifier(max_depth=3)
    lsvc = LinearSVC(random_state=0)
    cla = OutputCodeClassifier(sgd, code_size=128, random_state=0)

    cm_all = np.zeros((10, 10), dtype=int)

    cb = np.zeros((10, 20))
    losses = []

    with open('ECOC_sgd_error.csv', 'w') as f1:
        wrtest = csv.writer(f1,
                            quoting=csv.QUOTE_NONNUMERIC,
                            lineterminator='\n')
        scores = 0.0
        for train, test in kf.split(X):
            X_train, X_test, y_train, y_test = (X[train], X[test],
                                                Y[train], Y[test])
            cla.fit(X_train, y_train)
            predictions = cla.predict(X_test)
            loss = zero_one_loss(predictions, y_test)
            losses.append(loss)
            scores += loss
            # print y_test
            # print predictions

            cb = cla.code_book_

            np.savetxt('codebook.csv', cb, delimiter=',')

            # Compute confusion matrix
            cm = confusion_matrix(
                y_test,
                predictions,
                labels=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
            np.set_printoptions(precision=2)
            #print(cm_all)
            cm_all = np.add(cm_all, cm)

        # make the ECOC coding matrix 0-1 binary (entries are +/-1 for
        # margin-based estimators such as SGDClassifier)
        cb[cb <= 0] = 0
        wrtest.writerow(losses)
    print(cb)

    print(scores / 10)
Example 19
def ml_models(train, test, lab, labt):
    #Random Forest
    forest = RandomForestClassifier(n_estimators=200,
                                    max_leaf_nodes=50,
                                    criterion="entropy")
    forest = forest.fit(train, lab)
    output_rf = forest.predict(test).astype(int)
    suc_rf = 0
    totals_rf = [0 for m in range(num)]
    preds_rf = [0 for m in range(num)]
    for i in range(0, len(labt)):
        totals_rf[labt[i]] += 1
        if output_rf[i] == labt[i]:
            suc_rf = suc_rf + 1
            preds_rf[labt[i]] += 1

    accuracy_rf = suc_rf / len(labt)

    #KNearest Neighbour

    neigh = KNeighborsClassifier(n_neighbors=7)
    neigh.fit(train, lab)
    output_kn = neigh.predict(test)
    suc_kn = 0
    totals_kn = [0 for m in range(num)]
    preds_kn = [0 for m in range(num)]
    for i in range(0, len(labt)):
        totals_kn[labt[i]] += 1
        if output_kn[i] == labt[i]:
            suc_kn = suc_kn + 1
            preds_kn[labt[i]] += 1

    accuracy_kn = suc_kn / len(labt)

    # Logistic Regression

    model = LogisticRegression()
    model.fit(train, lab)
    output_lr = model.predict(test)
    suc_lr = 0
    totals_lr = [0 for m in range(num)]
    preds_lr = [0 for m in range(num)]
    for i in range(0, len(labt)):
        totals_lr[labt[i]] += 1
        if output_lr[i] == labt[i]:
            suc_lr = suc_lr + 1
            preds_lr[labt[i]] += 1

    accuracy_lr = suc_lr / len(labt)

    # Naive Bayes

    model = GaussianNB()
    model.fit(train, lab)
    # print(model)
    # make predictions
    # expected = y
    output_nb = model.predict(test)

    suc_nb = 0
    totals_nb = [0 for m in range(num)]
    preds_nb = [0 for m in range(num)]
    for i in range(0, len(labt)):
        totals_nb[labt[i]] += 1
        if output_nb[i] == labt[i]:
            suc_nb = suc_nb + 1
            preds_nb[labt[i]] += 1

    accuracy_nb = suc_nb / len(labt)

    # Decision Tree Classifier

    model = DecisionTreeClassifier()
    model.fit(train, lab)
    output_dt = model.predict(test)

    suc_dt = 0
    totals_dt = [0 for m in range(num)]
    preds_dt = [0 for m in range(num)]
    for i in range(0, len(labt)):
        totals_dt[labt[i]] += 1
        if output_dt[i] == labt[i]:
            suc_dt = suc_dt + 1
            preds_dt[labt[i]] += 1

    accuracy_dt = suc_dt / len(labt)

    # Support Vector Machine

    clf = OutputCodeClassifier(LinearSVC(random_state=0),
                               code_size=2,
                               random_state=0)
    clf.fit(train, lab)
    output_sv = clf.predict(test)

    suc_sv = 0
    totals_sv = [0 for m in range(num)]
    preds_sv = [0 for m in range(num)]
    for i in range(0, len(labt)):
        totals_sv[labt[i]] += 1
        if output_sv[i] == labt[i]:
            suc_sv = suc_sv + 1
            preds_sv[labt[i]] += 1

    accuracy_sv = suc_sv / len(labt)

    # Majority voting

    def Most_Common(lst):
        data = Counter(lst)
        return data.most_common(1)[0][0]

    output_mv = []
    for i in range(0, len(labt)):
        c = [output_dt[i], output_rf[i], output_lr[i]]
        output_mv.append(Most_Common(c))

    suc_mv = 0
    totals_mv = [0 for m in range(num)]
    preds_mv = [0 for m in range(num)]
    for i in range(0, len(labt)):
        totals_mv[labt[i]] += 1
        if output_mv[i] == labt[i]:
            suc_mv = suc_mv + 1
            preds_mv[labt[i]] += 1

    accuracy_mv = suc_mv / len(labt)

    return accuracy_rf, accuracy_kn, accuracy_lr, accuracy_nb, accuracy_dt, accuracy_sv, accuracy_mv, \
           preds_rf, preds_kn, preds_lr, preds_nb, preds_dt, preds_sv, preds_mv, \
           totals_rf, totals_kn, totals_lr, totals_nb, totals_dt, totals_sv, totals_mv
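The per-class tally loop above is repeated verbatim for each model, with only the prediction array changing; a hypothetical helper (not in the original) would collapse each repetition into one call:

def tally(output, labt, num):
    # per-class sample counts, per-class correct counts, and overall accuracy
    totals = [0] * num
    preds = [0] * num
    correct = 0
    for pred, true in zip(output, labt):
        totals[true] += 1
        if pred == true:
            correct += 1
            preds[true] += 1
    return correct / len(labt), preds, totals

# e.g. accuracy_rf, preds_rf, totals_rf = tally(output_rf, labt, num)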