def LDA_test(k=2, i_of_e=0, fun_type=None, data_name='orl', n_used_class=None, dimension=None):
    """Rotation test of LDA projection + KNN recognition accuracy.

    :param k: tenths of each class used for training (k=2 -> 20%)
    :param i_of_e: eigen-solution selector passed to LDATrain (0 or -1 means none)
    :param fun_type: variant selector passed to LDATrain
    :param data_name: dataset key passed to get_data
    :param n_used_class: optionally restrict to the first n classes
    :param dimension: cap on projection dimension; must satisfy 0 < dimension < n_class
    :return: mean recognition accuracy over all n_one_class rotations
    """
    print('K = ' + str(k) + ':')
    data, n_class, n_one_class, _ = get_data(data_name)
    if n_used_class is not None:
        # keep only the first n_used_class classes
        n_class = n_used_class
        data = data[:n_class * n_one_class]
    # LDA yields at most n_class - 1 useful directions; honor a smaller cap if given
    if dimension is not None and 0 < dimension < n_class:
        best_d = dimension
    else:
        best_d = n_class - 1
    n_train = int(n_one_class * k / 10)
    if n_train == 1:
        n_train = 2  # avoid a degenerate single-sample training set
    print('n of train set: %d' % n_train)
    accuracy = 0
    total_time = 0
    for fold in range(n_one_class):
        train_data, train_data_index, test_data, test_data_index = separateData(
            data, n_train, fold, n_class)
        train_norm = get_normalize(train_data)
        test_norm = get_normalize(test_data)
        t_start = time.time()
        W = lda.LDATrain(train_norm, n_class, n_train, i_of_e, fun_type)
        proj = np.array(W[:best_d, :])  # leading best_d projection directions
        train_mat, test_mat = get_mat(proj, train_norm, test_norm)
        fold_acc = np.mean(
            test_data_index == KNN(train_mat, test_mat, train_data_index, n_train))
        print("res %d" % (fold + 1))
        print(fold_acc)
        accuracy += fold_acc
        total_time += time.time() - t_start
    accuracy /= n_one_class
    total_time /= n_one_class
    print('total_time:')
    print(total_time)
    print('accuracy:')
    print(accuracy)
    return accuracy
def getORL(n_train=6, valid_per_class=1, size=46):
    """Load the ORL faces and split them into train/validation/test tensors.

    Generalized from hard-coded constants (defaults preserve the original
    behavior): n_train images per class go to the training set,
    valid_per_class of each class's remaining images go to validation,
    and the rest go to the test set.

    :param n_train: training images taken from each class (was hard-coded 6)
    :param valid_per_class: validation images taken from each class (was 1)
    :param size: image side length in pixels — assumes get_data returns
        flattened size*size images (46 by default; TODO confirm)
    :return: (train_data, train_labels, validX, validY, testX, testY),
        images normalized and reshaped to [-1, size, size, 1],
        labels one-hot encoded to n_class columns
    """
    data, n_class, n_one_class, _ = get_data()
    train_data, train_data_index, test_data, test_data_index = separateData(
        data, n_train, 0, n_class)
    # one-hot encode the labels
    train_data_index = keras.utils.to_categorical(train_data_index, n_class)
    test_data_index = keras.utils.to_categorical(test_data_index, n_class)
    n_rest = n_one_class - n_train           # leftover (non-training) images per class
    testNum = n_rest - valid_per_class       # test images per class
    validX = np.zeros((n_class * valid_per_class, size * size))
    validY = np.zeros((n_class * valid_per_class, n_class))
    testX = np.zeros((n_class * testNum, size * size))
    testY = np.zeros((n_class * testNum, n_class))
    valid_num = 0  # validation rows filled so far
    test_num = 0   # test rows filled so far
    # the first valid_per_class leftover images of each class go to validation
    for j in range(test_data.shape[0]):
        if j % n_rest < valid_per_class:
            validX[valid_num] = test_data[j]
            validY[valid_num] = test_data_index[j]
            valid_num += 1
        else:
            testX[test_num] = test_data[j]
            testY[test_num] = test_data_index[j]
            test_num += 1
    print(train_data.shape)
    print(validX.shape)
    print(testX.shape)
    train_data = normalize_image_array(train_data)
    validX = normalize_image_array(validX)
    testX = normalize_image_array(testX)
    train_data = np.reshape(train_data, [-1, size, size, 1])
    validX = np.reshape(validX, [-1, size, size, 1])
    testX = np.reshape(testX, [-1, size, size, 1])
    print(train_data.shape)
    print(validX.shape)
    print(testX.shape)
    print(train_data_index.shape)
    print(validY.shape)
    print(testY.shape)
    return train_data, train_data_index, validX, validY, testX, testY
#starter.py import dataprocess import measurement import models import numpy as np import matplotlib.pyplot as plt from sklearn.metrics import roc_auc_score, accuracy_score, brier_score_loss, zero_one_loss from sklearn import preprocessing if __name__ == '__main__': #-------GET DATA FROM CSV FILE------------- X_train, Y_train = dataprocess.get_data(path='../data/trainingset.csv') X_cross, Y_cross = dataprocess.get_data(path='../data/crossset.csv') X_test, Y_test = dataprocess.get_data(path='../data/testset.csv') X_train_cross, Y_train_cross = dataprocess.get_data( path='../data/train_and_cross.csv') #Check size of data # print(len(Y_train)) # print(len(Y_cross)) # print(len(Y_test)) # print(len(Y_train_cross)) #-------Preprocess DATA------------- # Add more features for data # X_train = dataprocess.addDimensional(X_train) # X_cross = dataprocess.addDimensional(X_cross) # X_test = dataprocess.addDimensional(X_test) # X_train = dataprocess.addDimensionalCube(X_train) # X_cross = dataprocess.addDimensionalCube(X_cross) # X_test = dataprocess.addDimensionalCube(X_test) #normalize:
def pca_test(k=8, data_name='orl', n_used_class=None, dimension=120):
    """Rotation test of PCA (eigenface) projection + KNN recognition accuracy.

    :param k: tenths of each class used for training (k=8 -> 80%)
    :param data_name: dataset key passed to get_data
    :param n_used_class: optionally restrict to the first n classes
    :param dimension: number of leading principal components kept
    :return: mean recognition accuracy over all n_one_class rotations
    """
    # (Large commented-out eigenface/fisherface visualization code removed.)
    print('K = ' + str(k) + ':')
    data, n_class, n_one_class, _ = get_data(data_name)
    data = data / np.max(data)  # scale pixel values to [0, 1]
    if n_used_class is not None:
        # keep only the first n_used_class classes
        n_class = n_used_class
        data = data[:n_class * n_one_class]
    accuracy = 0
    total_time = 0
    n_train = int(n_one_class * k / 10)
    if n_train == 1:
        n_train = 2  # avoid a degenerate single-sample training set
    print('n of train set: %d' % n_train)
    for i in range(n_one_class):
        train_data, train_data_index, test_data, test_data_index = separateData(
            data, n_train, i, n_class)
        time1 = time.time()
        pca_W = pca(train_data)  # eigenvector matrix
        res_W = np.array(pca_W[:dimension])  # keep the leading `dimension` components
        train_mat, test_mat = get_mat(res_W, train_data, test_data, dimension)
        res = np.mean(
            test_data_index == pcaKNN(train_mat, test_mat, train_data_index))
        print("res %d " % (i + 1))
        print(res)
        accuracy += res
        time2 = time.time()
        # BUG FIX: accumulate per-rotation time (was `total_time = time2 - time1`,
        # which kept only the last rotation yet was still divided by n_one_class)
        total_time += time2 - time1
    total_time /= n_one_class
    print('total_time:')
    print(total_time)
    accuracy /= n_one_class
    print('accuracy:')
    print(accuracy)
    return accuracy
def B2DPCA_test(k=8, data_name='orl', dimension=5, fun_name='B2DPCA'):
    """Rotation test of (bilateral) 2D-PCA projection + KNN recognition accuracy.

    :param k: tenths of each class used for training (k=8 -> 80%)
    :param data_name: dataset key passed to get_data (images kept 2-D via keepDim)
    :param dimension: number of projection columns kept
    :param fun_name: B2DPCA (bilateral) or 2DPCA (right-side projection only)
    :return: mean recognition accuracy over all n_one_class rotations
    :raises ValueError: if fun_name is not a supported method
    """
    print('K = ' + str(k) + ':')
    data, n_class, n_one_class, _ = get_data(data_name, keepDim=True)
    method = fun_name.lower()
    if method not in ('b2dpca', '2dpca'):
        # BUG FIX: fail fast — an unknown name previously left pca_W1 unbound
        # and crashed later with a confusing NameError
        raise ValueError('unknown fun_name: %s' % fun_name)
    accuracy = 0
    total_time = 0
    n_train = int(n_one_class * k / 10)
    if n_train == 1:
        n_train = 2  # avoid a degenerate single-sample training set
    print('n of train set: %d' % n_train)
    for i in range(n_one_class):
        train_data, train_data_index, test_data, test_data_index = separateData(
            data, n_train, i, n_class)
        time1 = time.time()
        if method == 'b2dpca':
            pca_W1, pca_W2 = BTwoDPCA(train_data, dimension)
            # bilateral: project rows then columns; stored transposed as in original
            project = lambda img: ((img @ pca_W1).T @ pca_W2).T
        else:
            pca_W1 = twoDPCA(train_data, dimension)
            # one-sided: (b, c) @ (c, k) = (b, k), stored transposed as (k, b)
            project = lambda img: (img @ pca_W1).T
        train_mat = np.array([project(train_data[j, :, :])
                              for j in range(train_data.shape[0])])
        test_mat = np.array([project(test_data[j, :, :])
                             for j in range(test_data.shape[0])])
        res = np.mean(
            test_data_index == pcaKNN(train_mat, test_mat, train_data_index))
        print("res %d " % (i + 1))
        print(res)
        accuracy += res
        time2 = time.time()
        # BUG FIX: accumulate per-rotation time (was `=`, keeping only the last
        # rotation while still dividing by n_one_class below)
        total_time += time2 - time1
    total_time /= n_one_class
    print('total_time:')
    print(total_time)
    accuracy /= n_one_class
    print('accuracy:')
    print(accuracy)
    return accuracy
def clusterTest(fun_name=KMeans, n_cluster=2, data_name='orl', k=5, kernel_type=None):
    """
    Run one clustering experiment, report per-class purity, and plot the result.

    :param n_cluster: number of clusters
    :param fun_name: clustering function object (e.g. KMeans)
    :param data_name: dataset key passed to get_data
    :param k: training sample percent
    :param kernel_type: none
    :return:
    """
    print('In cluster:\n' + 'cluster type: ' + fun_name.__name__)
    print('Kernel type: ' + str(kernel_type))
    print('In ' + data_name)
    print('K = %d' % k)
    # load the dataset
    oldData, n_class, n_one_class, oldLabel = get_data(data_name)
    # note: optional PCA preprocessing
    # data = pcaProject(oldData, dimension=160)
    # ---------------use only the first 3 classes-------------
    # n_class = 3
    # n = None
    # # PCA preprocessing
    # data = pcaProject(oldData[0:n_class * n_one_class], dimension=160)
    # # raw data (no PCA)
    # # data = oldData[:n_class * n_one_class]
    # if oldLabel is not None:
    #     label = oldLabel[0:n_class * n_one_class]
    # ---------------------------------------
    # ---------------or use only the first n images-------------
    n = 100
    """ 修改张数 """  # i.e. "adjust the number of images used here"
    # with PCA: 0.791 0.797 0.626 0.99 / 0.221 1.27 5.76
    # (presumably recorded accuracies / timings — TODO confirm)
    data = pcaProject(oldData[0:n], dimension=120)
    # raw data (no PCA)
    # data = oldData[:n]
    if oldLabel is not None:
        label = oldLabel[:n]
    else:
        label = None
    # ---------------------------------------
    # synthetic point-set alternative
    # n_class = 3
    # n_one_class = 10
    # data = getPointSet(n_class=n_class, n_one_class=n_one_class, dimension=2)
    print(data.shape)
    accuracy = 0
    if n is None and n_one_class is not None:
        n_train = int(n_one_class * k / 10)
    elif n is not None:
        n_train = n
    else:
        print('something wrong...')
        return
    # original author's note: "forgot what this is for" — guards a 1-sample train set
    if n_train == 1:
        n_train = 2
    print('n of train set: %d' % n_train)
    # clustering produces no projection matrix for held-out accuracy,
    # so the test split is effectively unused
    train_data, train_data_index, test_data, test_data_index = separateData(data, 1, 0, n_class, k_is_rate=True)
    if oldLabel is not None:
        train_data_index, _, test_data_index, _ = separateData(label, 1, 0, n_class, k_is_rate=True)
    print(train_data.shape)
    t1 = time()
    centroids, clusterAssignment = fun_name(train_data, n_cluster, distFun=distEuclidean, createCent=randCent)
    t2 = time()
    print("运行时间:", t2 - t1)
    # variant for samples stored class-by-class in sequential order:
    # res = 0
    # for i in range(n_class):  # n_class: number of classes; n_train: samples per class
    #     a = clusterAssignment[i * n_train:(i + 1) * n_train, 0]  # label
    #     temp = max([np.sum(a == j) for j in set(a.flat)])  # if res == label
    #     print('temp' + str(temp))
    #     res += temp / n_train
    # res /= n_class
    # samples not in sequential order: group by ground-truth label instead
    res = 0
    for i in range(n_class):  # n_class: number of classes; n_train: samples per class
        a = np.array(
            [clusterAssignment[j, 0] for j in range(clusterAssignment.shape[0]) if train_data_index[j] == i])  # label
        # size of the majority cluster within class i
        temp = max([np.sum(a == j) for j in set(a.flat)])  # if res == label
        print('class ' + str(i) + ' num ' + str(temp) + ' of ' + str(a.size))
        res += temp / a.size
    res /= n_class
    print(res)
    # 2-D scatter of the clustering for visual inspection
    # plotData = train_data
    plotData = pcaProject(oldData[:train_data.shape[0]], dimension=2)
    print('plotData.shape: ', plotData.shape)
    fig1 = plt.figure()
    ax = fig1.add_subplot(111)
    for i in range(plotData.shape[0]):
        t = int(clusterAssignment[i, 0])
        # color encodes the cluster assignment, marker the ground-truth class
        ax.scatter(plotData[i, 0], plotData[i, 1], s=30, c=colors[t], marker=markers[int(train_data_index[i])])
    plt.savefig('E:\\最优化方法\\实验\\test'+str(n)+'.png', dpi=1000)
    plt.show()
def linear_test(k=5, fun_name='linear', data_name='orl', kernel_type='linear', n_used_class=None):
    """
    Rotation test of several regression-based classifiers.

    :param k: training samples per class passed to separateData
    :param fun_name: linear, Softmax, USSL, Locality, KRRC
    :param data_name: dataset key passed to get_data
    :param kernel_type: rbf, poly, linear (used by KRRC only)
    :param n_used_class: optionally restrict to the first n classes
    :return: mean recognition accuracy over all n_one_class rotations
    """
    print('In linear regression:\n' + 'Regression type: ' + fun_name)
    print('In ' + data_name)
    print('K = %d' % k)
    # BUG FIX: get_data returns 4 values in this module
    # (data, n_class, n_one_class, label); the original 3-name unpack
    # would raise ValueError at runtime.
    data, n_class, n_one_class, _ = get_data(data_name)
    method = fun_name.lower()
    if method != 'ussl':
        # scale pixel values to [0, 1]; USSL operates on the raw data
        data = data / np.max(data)
    if n_used_class is not None:
        # keep only the first n_used_class classes
        n_class = n_used_class
        data = data[:n_class * n_one_class]
    accuracy = 0
    n_iters = 2000  # gradient-descent iterations (softmax only)
    for i in range(n_one_class):
        train_data, train_data_index, test_data, test_data_index = separateData(
            data, k, i, n_class)
        res = 0
        if method == 'linear':
            # ridge-regularized normal-equation linear regression
            re1 = Regression()
            re1.train_normal_equation(train_data, train_data_index, k, n_class,
                                      type='ridge', lam=0.01)
            predict1 = re1.normal_equation_predict(test_data)
            res = np.mean(test_data_index == predict1)
        elif method == 'softmax':
            # softmax regression trained by gradient descent
            re1 = Regression()
            W, costs = re1.train_gradient_descent(train_data, train_data_index,
                                                  learning_rate=0.1,
                                                  n_iters=n_iters,
                                                  type='softmax',
                                                  n_class=n_class,
                                                  n_batch=None)
            res = np.mean(test_data_index == re1.softmax_predict(test_data))
        elif method == 'ussl':
            # USSL spectral regression
            re1 = USSL(train_data, train_data_index)
            re1.ussl_fit(type='ussl')
            res = np.mean(test_data_index == re1.classify(test_data))
        elif method == 'locality':
            # locality-weighted regression
            re1 = LocalityRegression(train_data, train_data_index, n_class,
                                     lam=0.005, sigma=1)
            res = np.mean(
                test_data_index == re1.classify(test_data, type='hat'))
        elif method == 'krrc':
            # kernel ridge regression classification
            print('kernel type: ' + kernel_type)
            re1 = KRRC(train_data, train_data_index, n_class, lam=0.005,
                       sigma=3, kernel=kernel_type)
            res = np.mean(test_data_index == re1.classify(test_data))
        print("res %d" % (i + 1))
        print(res)
        accuracy += res
    accuracy /= n_one_class
    print('In ' + fun_name + ' accuracy: ')
    print(accuracy)
    return accuracy