def test_ecoc_delegate_sparse_base_estimator():
    """Check that ECOC leaves sparse-input validation to its base estimator.

    Non-regression test for
    https://github.com/scikit-learn/scikit-learn/issues/17218
    """
    dense_X, target = iris.data, iris.target
    sparse_X = sp.csc_matrix(dense_X)
    sparse_error = "A sparse matrix was passed"

    # Base estimator whose own check_array call refuses sparse input.
    dense_only = CheckingClassifier(
        check_X=check_array,
        check_X_params={"ensure_2d": True, "accept_sparse": False},
    )
    ecoc = OutputCodeClassifier(dense_only, random_state=0)

    # Fitting on sparse data must surface the TypeError.
    with pytest.raises(TypeError, match=sparse_error):
        ecoc.fit(sparse_X, target)

    # After a dense fit, predicting on sparse data must fail the same way.
    ecoc.fit(dense_X, target)
    with pytest.raises(TypeError, match=sparse_error):
        ecoc.predict(sparse_X)

    # Smoke test: with LinearSVC as base estimator, sparse fit/predict works.
    ecoc = OutputCodeClassifier(LinearSVC(random_state=0))
    ecoc.fit(sparse_X, target).predict(sparse_X)
    assert len(ecoc.estimators_) == 4
def _hog_descriptors(images, image_shape):
    """Return one HOG feature vector per flattened image in ``images``.

    Each image is reshaped to ``image_shape`` before the descriptor is
    computed. The cell size comes from the module-level ``PPC`` constant.
    """
    # visualize=False: the rendered HOG image was computed and then discarded,
    # so only the feature vector is requested.
    return [
        hog(image.reshape(image_shape),
            orientations=8,
            pixels_per_cell=(PPC, PPC),
            cells_per_block=(1, 1),
            visualize=False,
            multichannel=False)
        for image in images
    ]


def run_test(**kwargs):
    """Train an ECOC classifier on HOG features and evaluate it.

    The data set returned by ``fetch_sw_orl()`` is split 80/20 (stratified on
    the target), HOG descriptors are extracted for both parts, and an
    ``OutputCodeClassifier`` wrapping ``LinearSVC`` is fitted on the training
    descriptors.

    Parameters
    ----------
    **kwargs
        Accepted for call compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(elapsed, accuracy)``. ``elapsed`` is the wall-clock time in seconds
        spent on the split, the training-set HOG extraction and the fit.
        ``accuracy`` is ``accuracy_score`` on the held-out part.
    """
    b = fetch_sw_orl()
    tic = time.time()

    # split the data into train and test parts
    X_train, X_test, y_train, y_true = train_test_split(
        b.data, b.target, test_size=0.2, stratify=b.target)

    hog_train = _hog_descriptors(X_train, b.shape)
    clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2)
    clf.fit(hog_train, y_train)
    tok = time.time()

    # Test-set feature extraction and prediction are outside the timed span.
    hog_test = _hog_descriptors(X_test, b.shape)
    y_pred = clf.predict(hog_test)
    return tok - tic, accuracy_score(y_true, y_pred)
def clasificar_ECOC(X, y, df, trainInputs, trainOutputs, testInputs, testOutputs, graphname):
    """Fit an ECOC classifier over decision trees and report its accuracy.

    Parameters
    ----------
    X, y, df
        Not used by this function; kept so existing callers keep working.
    trainInputs, trainOutputs
        Samples and labels the classifier is fitted on.
    testInputs, testOutputs
        Samples and labels the classifier is scored on.
    graphname
        Label printed in brackets as a header for the report.

    Returns
    -------
    The value of ``clf.score(testInputs, testOutputs)``.
    """
    print("\n[" + str(graphname) + "]")

    # The unused RBF kernel local that used to be built here was removed.
    clf = OutputCodeClassifier(estimator=DecisionTreeClassifier())
    clf = clf.fit(trainInputs, trainOutputs)

    precisionTrain = clf.score(trainInputs, trainOutputs)
    precisionTest = clf.score(testInputs, testOutputs)
    print("\tCCR train = %.2f%% | CCR test = %.2f%%" % (precisionTrain*100, precisionTest*100))

    # Show the predictions next to the expected labels.
    prediccion_test = clf.predict(testInputs)
    print(prediccion_test)
    print(testOutputs)

    return precisionTest
def evaluateOutputCode(X, Y, printReport=False):
    """Train and evaluate an ECOC classifier on an 80/20 split of ``X``/``Y``.

    Parameters
    ----------
    X, Y
        Samples and labels; split with ``train_test_split`` using
        ``test_size=0.2`` and ``random_state=42``.
    printReport : bool
        When true, print the training time, the test score and how many test
        samples were predicted as class 0, 1 and 2.

    Returns
    -------
    list
        ``[score, n_predicted_as_1, clf]``. ``score`` is ``clf.score`` on the
        held-out labels, ``n_predicted_as_1`` counts test predictions equal to
        1, and ``clf`` is the fitted classifier.
    """
    start = datetime.datetime.now()
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0)
    clf.fit(X_train, Y_train)
    elapsed = datetime.datetime.now() - start

    # Score against the true held-out labels. The predictions are kept in a
    # separate variable: overwriting Y_test with them made the returned score
    # compare the predictions with themselves, i.e. always 1.0.
    score = clf.score(X_test, Y_test)
    Y_pred = clf.predict(X_test)

    if printReport:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Training time:' + str(elapsed))
        print('Evaluation result: OutputCode: ' + str(score))
        print('0: ' + str((Y_pred == 0).sum()))
        print('1: ' + str((Y_pred == 1).sum()))
        print('2: ' + str((Y_pred == 2).sum()))

    return [score, (Y_pred == 1).sum(), clf]
def OutputCodeClassifier(data, label, pred_data, pred_last):
    '''
    Fit an ECOC classifier (LinearSVC base) on ``data`` and predict ``pred_data``.

    The training score, the number of predictions that differ from
    ``pred_last`` and the score on ``pred_data`` are printed.

    Recorded output of an earlier run:
        0.76473194506
        Number of mislabeled points out of a total 841 points : 211
        0.749108204518
    Original author note: needs normalization.

    Parameters:
        data, label: training samples and their labels.
        pred_data: samples to predict.
        pred_last: reference labels for ``pred_data``.

    Returns:
        The predictions for ``pred_data``.
    '''
    data = np.array(data)
    pred_data = np.array(pred_data)
    label = np.array(label)
    pred_last = np.array(pred_last)

    # This function shares its name with the scikit-learn class, so the class
    # is imported under an alias to keep the two apart.
    from sklearn.multiclass import OutputCodeClassifier as _EcocClassifier
    from sklearn.svm import LinearSVC

    clf = _EcocClassifier(LinearSVC(random_state=0), code_size=2, random_state=0)
    clf.fit(data, label)

    # Single-argument print(...) prints the same on Python 2 and parses on
    # Python 3, unlike the former print statements.
    print(clf.score(data, label))
    pred_result = clf.predict(pred_data)
    print("Number of mislabeled points out of a total %d points : %d" % (pred_data.shape[0], (pred_last != pred_result).sum()))
    print(clf.score(pred_data, pred_last))
    return pred_result
# NOTE(review): the next two statements look like the tail of the loop that
# fills train_ecoc_table; its header precedes this excerpt, so their real
# indentation is not visible here — confirm against the full file.
row += get_haar_features(im, top_left, bottom_right)
train_ecoc_table[ind] = row

# Build the test feature table: one row per image, holding the concatenated
# Haar features of every rectangle (200 values per row).
test_ecoc_table = np.zeros(shape=(np.shape(test_images)[0], 200))
for ind, im in enumerate(test_images):
    row = []
    for (top_left, bottom_right) in rectangles:
        row += get_haar_features(im, top_left, bottom_right)
    test_ecoc_table[ind] = row

# ECOC over boosted decision stumps.
clf = OutputCodeClassifier(AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200), code_size=5, random_state=0)
clf.fit(train_ecoc_table, labels)

# Accuracy = matching predictions / number of predictions. The builtin float
# replaces np.float, an alias that newer NumPy releases no longer provide.
train_pred = np.array(clf.predict(train_ecoc_table))
print("Digits Training Accuracy: %f" % (np.sum(train_pred == np.array(labels)).astype(float)/np.shape(train_pred)[0]))

test_pred = np.array(clf.predict(test_ecoc_table))
print("Digits Testing Accuracy: %f" % (np.sum(test_pred == np.array(test_labels)).astype(float)/np.shape(test_pred)[0]))

# ecoc_table = []
# for im in images:
#
#     im_preprocess = np.matrix([[np.sum(im[:i,:j]) for i in range(1, 29)] for j in range(1, 29)])
#
#     def get_black_rectangle(top_left, bottom_right):
#         x1, y1 = top_left
#         x2, y2 = bottom_right
#
#         return im_preprocess[x2, y2] - im_preprocess[x2, y1] - im_preprocess[x1, y2] + im_preprocess[x1, y1]
def test_ecoc_exceptions():
    """Predicting with an unfitted OutputCodeClassifier raises NotFittedError."""
    unfitted = OutputCodeClassifier(LinearSVC(random_state=0))
    with pytest.raises(NotFittedError):
        unfitted.predict([])
breast.target, test_size=0.2) # creating a classification clf_1 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=42) clf_2 = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=42) # train the classifier with training data clf_1.fit(x_train, y_train) clf_2.fit(x_train, y_train) # find y_pred prediction best on x_test data y_pred_1 = clf_1.predict(x_test) y_pred_2 = clf_2.predict(x_test) # calculate accuracy of y_pred using y_test print(f'accuracy {accuracy_score(y_test, y_pred_1)}') print(f'accuracy {accuracy_score(y_test, y_pred_2)}') # use classification_report function to print more information print( f'\n\nClassification report for MLPClassifier is\n {classification_report(y_test, y_pred_2)}' ) print( f'\n\nClassification report for MLPClassifierOutpuCodeClassifier is\n {classification_report(y_test, y_pred_2)}' )
@author: 凯风 """

from sklearn.datasets import load_iris
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVR
from sklearn.model_selection import train_test_split

# Load the iris data and hold out 30% of it for testing.
iris_data = load_iris()
X, Y = iris_data.data, iris_data.target
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.3)

# English rendering of the note in the string below:
#   Error-correcting output codes: a method that differs from one-vs-one and
#   one-vs-rest; the classes are mainly represented in a Euclidean space.
#   For a textual explanation the author points to a "Machine Learning" book.
''' 纠错输出码 和O-vs-O、O-vs-Rest不太一样的方法 主要是在欧几里得空间表示 具体的文本解释,看《机器学习》周治平的那个本里面有提到 '''

# NOTE(review): LinearSVR is a regressor; confirm that it is accepted as the
# base estimator here (LinearSVC may have been intended).
clf = LinearSVR(random_state=0)
ovrc = OutputCodeClassifier(clf, code_size=1.5, random_state=None, n_jobs=1)
ovrc.fit(trainX, trainY)
# The next two expressions are evaluated and their values discarded; they only
# display something when run interactively.
ovrc.predict(testX)
ovrc.code_book_

# English rendering of the parameter notes in the string below:
#   estimator     the estimator
#   code_size     size of the (code) space?
#   random_state  random generator
#   n_jobs        number of CPU jobs
''' estimator 评估器 code_size 空间尺寸? random_state 随机器 n_jobs CPU的作业数量 '''
def get_key(val):
    """Return the first key of ``category_codes`` whose value equals ``val``.

    Returns the string ``"No value"`` when no entry matches.
    """
    # Stop at the first match instead of collecting every matching key.
    return next((k for k, v in category_codes.items() if v == val), "No value")


## Actual code to run for large data prediction
#labels_predict = svc.predict(features_test)
#print(accuracy_score(labels_test_test, labels_predict_test))

## Test predict only for 2000 rows due to machine constraints
very_small_sample_size = 2000
labels_test_test = labels_test.head(very_small_sample_size)
input_test = X_test.head(very_small_sample_size)
features_test_input = tfidf.transform(input_test).toarray()
labels_predict_test = occ.predict(features_test_input)

## Print the test results accuracy compared to actual
# The score is computed on the test subset, so it is reported as test accuracy.
print("The test accuracy is: ")
print(accuracy_score(labels_test_test, labels_predict_test))
print("Classification report is as follows: ")
print(classification_report(labels_test_test, labels_predict_test))

## Uncomment below for printing confusion matrix
# cm=confusion_matrix(labels_test_test, labels_predict_test)
# sns.heatmap(cm, annot=True)

## For very small given sample size print - Item Desc || Actual test category || Predicted test category
# Walk the rows that are actually present: head() returns fewer than
# very_small_sample_size rows when the test set is smaller, and a fixed
# range(very_small_sample_size) then ran past the end.
for item_desc, actual, predicted in zip(input_test.values,
                                        labels_test_test.values,
                                        labels_predict_test):
    print (item_desc, " || ", get_key(actual), " || ", get_key(predicted))
# 划分数据集合 welfare_data_train,welfare_data_test,welfare_target_train,welfare_target_test = \ train_test_split(welfare_data,welfare_target,test_size=0.2,random_state=666) # 数据标准化 # stdScaler = StandardScaler().fit(welfare_data_train) # welfare_data_train_std = stdScaler.transform(welfare_data_train) # welfare_data_test_std = stdScaler.transform(welfare_data_test) # 建立svm模型,使用线性核函数 model = OutputCodeClassifier(LinearSVC()) model = model.fit(welfare_data_train,welfare_target_train) # 保存模型 joblib.dump(model, 'welfare_predict.pkl') welfare_target_predict = model.predict(welfare_data_test) print('预测前20个结果为:\n',welfare_target_predict[:20]) print('使用SVM预测数据的准确率为:', accuracy_score(welfare_target_test,welfare_target_predict)) print('使用SVM预测数据的精确率为:', precision_score(welfare_target_test,welfare_target_predict,average='micro')) print('使用SVM预测数据的召回率为:', recall_score(welfare_target_test,welfare_target_predict,average='micro')) print('使用SVM预测数据的F1值为:', f1_score(welfare_target_test,welfare_target_predict,average='micro')) print('使用SVM预测数据的Cohen’s Kappa系数为:', cohen_kappa_score(welfare_target_test,welfare_target_predict)) # 使用SVM预测数据的准确率为: 0.9966957044157405 # 使用SVM预测数据的精确率为: 0.9966957044157405 # 使用SVM预测数据的召回率为: 0.9966957044157405
random_state=42) clf.fit(hog_train, y_train) tok = time.time() if control[1]: # create the hog fro the X_test hog_test = [] for img_arry in X_test: fd, _ = hog(img_arry.reshape(b.shape), orientations=8, pixels_per_cell=(PPC, PPC), cells_per_block=(1, 1), visualize=True, multichannel=False) hog_test.append(fd) y_pred = clf.predict(hog_test) print( f'the number of correct example is {accuracy_score(y_true, y_pred, normalize=False)}, with accuracy score of {accuracy_score(y_true, y_pred)}' ) print(classification_report(y_true, y_pred, zero_division=0.0)) print(f'time to train : {tok - tic:.5}') def run_test(**kwargs): b = fetch_sw_orl() tic = time.time() # split the data in X_train, X_test, y_train, y_true = train_test_split(b.data, b.target,
# Initialise scaler to scale the data scaler = StandardScaler() train_set = np.empty(train_x.shape, dtype=float) test_set = np.empty(test_x.shape, dtype=float) # Make training data suitable for scaling for index in range(len(train_x)): train_set[index] = train_x[index].astype(float) # Make testing data suitable for scaling for index in range(len(test_x)): test_set[index] = test_x[index].astype(float) # Fit the training data scaler.fit(train_set) # Scale the training and testing data w.r.t scaler data_train = scaler.transform(train_set) data_test = scaler.transform(test_set) occ = OutputCodeClassifier(BaggingClassifier()) occ.fit(data_train, train_y) prediction = occ.predict(data_test) accuracy = accuracy_score(test_y, prediction) # -------------------- Print the final result -------------------- # print("\nAccuracy using Output Code Classifier :", round(accuracy * 100, 3), "%\n")
# NOTE(review): the statements down to the print call look like the tail of
# generate_bow, whose header precedes this excerpt; their real indentation is
# not visible here — confirm against the full file.
# Count, for every word of the sentence, its occurrences per vocabulary slot.
for w in words:
    for i, word in enumerate(vocab):
        if word == w:
            bag_vector[i] += 1
print("{0} \n{1}\n".format(sentence, numpy.array(bag_vector)))

# Sample sentences passed to generate_bow
allsentences = [
    "Joe waited`s for the train", "The train was late",
    "Mary and Samantha took the bus",
    "I looked for Mary and Samantha at the bus station",
    "Mary and Samantha arrived at the bus station early but waited until noon for the bus"
]
generate_bow(allsentences)

from sklearn import datasets
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC

# Fit an ECOC classifier on the iris data and predict the same samples it was
# trained on
iris = datasets.load_iris()
X, y = iris.data, iris.target
print(X)
clf = OutputCodeClassifier(LinearSVC(random_state=0),
                           code_size=2,
                           random_state=0)
clf.fit(X, y)
m = clf.predict(X)
print(m)
y= np.array(np.ones(15050), ndmin=1) #label 1 for malignant y_train=np.concatenate((x,y), axis=0) ''' #labeling y_test x1= np.array(np.zeros(50), ndmin=1) #label 0 for benign y1= np.array(np.ones(50), ndmin=1) #label 1 for malignant y_test=np.concatenate((x1,y1), axis=0) ################Using LinearSVC param_grid = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000,5000, 10000]} clf = OutputCodeClassifier(LinearSVC(random_state=0, verbose=5), code_size=3, random_state=0) clf.fit(X_train, y_train) predictions=clf.predict(X_test) print(confusion_matrix(y_test, predictions)) print(classification_report(y_test, predictions)) ###########Using GridSearchCV param_grid = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000,5000, 10000], 'gamma':[100,10,1,0.1,0.01,0.001,0.0001]} model_grid = GridSearchCV(SVC(), param_grid, verbose=5,cv=10) ecoc = OutputCodeClassifier(LinearSVC(random_state=0), random_state=0) Cs = [0.0001,0.001, 0.01,0.5, 0.8, 0.1, 1, 10, 100, 1000, 5000, 10000] cv = GridSearchCV(ecoc, {'estimator__C': Cs}, verbose=5, cv=10) cv.fit(X_train,y_train) grid_pred = cv.predict(X_test)
# Test threshold_test = np.where((y_test == 0) | (y_test == 1) | (y_test == 7) | (y_test == 8)) y_test_thres, x_test_thres = y_test[threshold_test], x_test[threshold_test] ################################################################################################### ################################# Training a classifier (4 numbers) ############################## num_iter = 5 start_time_OCC = time.time() OCC = OutputCodeClassifier(Perceptron(max_iter=num_iter, random_state=0)) OCC.fit(x_train_thres, y_train_thres) predictionsOCC = OCC.predict(x_test_thres) scoreOCC = OCC.score(x_test_thres, y_test_thres) cmOCC = metrics.confusion_matrix(y_test_thres, predictionsOCC) plt.figure(figsize=(9, 9)) sns.heatmap(cmOCC, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Blues_r') plt.ylabel('Actual label') plt.xlabel('Predicted label') all_sample_title = 'OCC - Accuracy Score: {0}'.format(scoreOCC) plt.title(all_sample_title, size=15) plt.show()
"""

from sklearn import datasets
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

# Load the data
iris = datasets.load_iris()
x, y = iris.data, iris.target
# Prints the number of samples and the number of features (the two entries of
# x.shape)
print('样本数量,%d,特征数量%d' % x.shape)

# Create the model object
# code_size sets how many sub-models are used in the end;
# actual number of sub-models = code_size * label_number
clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=30, random_state=0)

# Build (fit) the model
clf.fit(x, y)

# Print the predicted values, then the accuracy on the same samples the model
# was fitted on
print(clf.predict(x))
print('准确率%.3f' % accuracy_score(y, clf.predict(x)))

# Print model attributes: every fitted sub-model with a 1-based counter,
# then the class labels
k = 1
for item in clf.estimators_:
    print('第%d个模型' % k)
    print(item)
    k += 1
print(clf.classes_)
def main():
    """Cross-validate an SGD-based ECOC classifier on the combined features.

    Reads the feature pickles ``mfcc_fv.p`` and ``hcdf_fv.p`` and the label
    file ``mfcc_lb.csv``, combines the features with ``combineFeatures``,
    shuffles samples and labels together, and runs a 10-fold cross-validation
    of ``OutputCodeClassifier(SGDClassifier)``.

    Side effects: writes the per-fold zero-one losses to
    ``ECOC_sgd_error.csv``, writes the code book to ``codebook.csv`` (rewritten
    on every fold), and prints the binarised code book and the mean loss.

    Raises:
        ValueError: if the label file contains no rows.
    """
    filenameLB = 'mfcc_lb.csv'

    # NOTE: pickle.load can execute arbitrary code from the file; only load
    # feature files that come from a trusted source.
    # The files are opened with `with` so the handles are closed again.
    with open('mfcc_fv.p', 'rb') as fv_file:
        allsongcat = pickle.load(fv_file)
    with open('hcdf_fv.p', 'rb') as fv_file:
        hcdf = pickle.load(fv_file)

    # When the file has several rows, the last one provides the labels.
    labels = None
    with open(filenameLB) as f:
        for row in csv.reader(f):
            labels = row
    if labels is None:
        raise ValueError('no labels found in ' + filenameLB)

    all_feats = combineFeatures([allsongcat, hcdf])

    # Shuffle features and labels with one shared permutation. list(...) is
    # needed because shuffle() cannot reorder a Python 3 range object.
    index_shuf = list(range(len(labels)))
    shuffle(index_shuf)
    X = np.array([all_feats[i] for i in index_shuf])
    Y = np.array([labels[i] for i in index_shuf])

    # NOTE(review): the sample count 1000 is hard-coded; confirm it matches
    # the data set.
    kf = KFold(1000, n_folds=10)
    sgd = SGDClassifier(loss="hinge", penalty="l2")
    cla = OutputCodeClassifier(sgd, code_size=128, random_state=0)

    # Builtin int replaces np.int, an alias newer NumPy releases dropped.
    cm_all = np.zeros((10, 10), dtype=int)
    cb = np.zeros((10, 20))
    losses = []
    np.set_printoptions(precision=2)

    with open('ECOC_sgd_error.csv', 'w') as f1:
        wrtest = csv.writer(f1,
                            quoting=csv.QUOTE_NONNUMERIC,
                            lineterminator='\n')
        scores = 0.0
        for train, test in kf:
            X_train, X_test = X[train], X[test]
            y_train, y_test = Y[train], Y[test]

            cla.fit(X_train, y_train)
            predictions = cla.predict(X_test)

            loss = zero_one_loss(y_test, predictions)
            losses.append(loss)
            scores += loss

            cb = cla.code_book_
            np.savetxt('codebook.csv', cb, delimiter=',')

            # Compute the confusion matrix of this fold and accumulate it
            cm = confusion_matrix(
                y_test,
                predictions,
                labels=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
            cm_all = np.add(cm_all, cm)

        # make ECOC coding matrix 0-1 binary (cb aliases cla.code_book_, so
        # this modifies the classifier's code book in place)
        cb[cb <= 0] = 0
        wrtest.writerow(losses)

    # Single-argument print(...) prints the same on Python 2 and 3.
    print(cb)
    print(scores / 10)
def _tally_predictions(predicted, labt):
    """Return ``(accuracy, correct_per_class, total_per_class)`` for one model.

    Both lists have ``num`` entries (module-level class count) and are indexed
    by the true label. ``accuracy`` is the fraction of positions where
    ``predicted`` equals ``labt``.
    """
    totals = [0 for _ in range(num)]
    correct = [0 for _ in range(num)]
    hits = 0
    # Index-based access is kept on purpose so labt[i] is looked up the same
    # way as before for whatever container type the caller passes.
    for i in range(0, len(labt)):
        totals[labt[i]] += 1
        if predicted[i] == labt[i]:
            hits += 1
            correct[labt[i]] += 1
    return hits / len(labt), correct, totals


def ml_models(train, test, lab, labt):
    """Fit several classifiers and report their accuracy on ``test``.

    The models are a random forest, k-nearest neighbours, logistic regression,
    Gaussian naive Bayes, a decision tree and an ECOC classifier over
    LinearSVC. A majority vote over the decision-tree, random-forest and
    logistic-regression predictions is evaluated as well.

    Parameters
    ----------
    train, lab
        Training samples and their labels.
    test, labt
        Test samples and their labels; the labels are used as list indices.

    Returns
    -------
    tuple
        21 values: the seven accuracies, then the seven per-class
        correct-prediction lists, then the seven per-class total lists, each
        group in the order rf, kn, lr, nb, dt, sv, mv.
    """
    # Random Forest
    forest = RandomForestClassifier(n_estimators=200,
                                    max_leaf_nodes=50,
                                    criterion="entropy")
    forest = forest.fit(train, lab)
    output_rf = forest.predict(test).astype(int)
    accuracy_rf, preds_rf, totals_rf = _tally_predictions(output_rf, labt)

    # KNearest Neighbour
    neigh = KNeighborsClassifier(n_neighbors=7)
    neigh.fit(train, lab)
    output_kn = neigh.predict(test)
    accuracy_kn, preds_kn, totals_kn = _tally_predictions(output_kn, labt)

    # Logistic Regression
    model = LogisticRegression()
    model.fit(train, lab)
    output_lr = model.predict(test)
    accuracy_lr, preds_lr, totals_lr = _tally_predictions(output_lr, labt)

    # Naive Bayes
    model = GaussianNB()
    model.fit(train, lab)
    output_nb = model.predict(test)
    accuracy_nb, preds_nb, totals_nb = _tally_predictions(output_nb, labt)

    # Decision Tree Classifier
    model = DecisionTreeClassifier()
    model.fit(train, lab)
    output_dt = model.predict(test)
    accuracy_dt, preds_dt, totals_dt = _tally_predictions(output_dt, labt)

    # Support Vector Machine (LinearSVC wrapped in an output-code classifier)
    clf = OutputCodeClassifier(LinearSVC(random_state=0),
                               code_size=2,
                               random_state=0)
    clf.fit(train, lab)
    output_sv = clf.predict(test)
    accuracy_sv, preds_sv, totals_sv = _tally_predictions(output_sv, labt)

    # Majority voting over decision tree, random forest and logistic
    # regression; the vote order is kept because it decides which label wins
    # a three-way tie.
    output_mv = []
    for i in range(0, len(labt)):
        votes = [output_dt[i], output_rf[i], output_lr[i]]
        output_mv.append(Counter(votes).most_common(1)[0][0])
    accuracy_mv, preds_mv, totals_mv = _tally_predictions(output_mv, labt)

    return accuracy_rf, accuracy_kn, accuracy_lr, accuracy_nb, accuracy_dt, accuracy_sv, accuracy_mv, \
           preds_rf, preds_kn, preds_lr, preds_nb, preds_dt, preds_sv, preds_mv, \
           totals_rf, totals_kn, totals_lr, totals_nb, totals_dt, totals_sv, totals_mv