def lda(k, sam_tr, ind_tr, sam_te, ind_te, c, classifier=0):
    """
    fisherface implementation
    -------------------------
    Projects the data onto a k-dimensional LDA subspace and classifies the
    projected test samples.

    inputs:
        k          --size of subspace
        sam_tr     --training set, one sample per column (dm x n_train); the
                     columns must be grouped by class, with the same number
                     of samples in every class
        ind_tr     --training labels
        sam_te     --testing set, one sample per column
        ind_te     --testing labels
        c          --num of classes
        classifier --0 for kNN(default), 1 for SVM
    outputs:
        accu   --classification accuracy
        co_mat --confusion mat
        W      --projection mat
    raises:
        ValueError --if classifier is not 0 or 1, or if the number of
                     training samples is not a multiple of c
    """
    if classifier not in (0, 1):
        # Previously an unknown value skipped both branches and crashed later
        # on an undefined confusion matrix.
        raise ValueError(
            "classifier must be 0 (kNN) or 1 (SVM), got %r" % (classifier,))

    # data preparation
    dm = sam_tr.shape[0]
    n_tr = sam_tr.shape[1]
    # Integer division: a float here breaks reshape() under Python 3.
    num_tr, leftover = divmod(n_tr, c)
    if leftover:
        raise ValueError(
            "number of training samples (%d) is not a multiple of c (%d)"
            % (n_tr, c))

    # training
    # view the samples as (class, sample-in-class, feature)
    grouped = sam_tr.T.reshape(c, num_tr, dm)
    class_means = grouped.mean(axis=1, keepdims=True)  # (c, 1, dm)
    mu_c = class_means[:, 0, :].T  # within class means, dm x c
    mu = np.mean(sam_tr, 1)  # overall mean
    mu_col = mu.reshape(dm, 1)

    # within class scatter matrix Sw: every sample minus its own class mean
    B = (grouped - class_means).reshape(num_tr * c, dm).T
    Sw = B.dot(B.T)

    # between class scatter matrix Sb, each class weighted by its sample count
    C = mu_c - mu_col
    Sb = C.dot(num_tr * C.T)

    # solve generalized eigenvalue problem
    W = mf.eigs(np.linalg.inv(Sw).dot(Sb), k)  # the projection matrix

    # testing: centre with the training mean, then project
    sam_tr_p = W.T.dot(sam_tr - mu_col)  # projected training samples
    sam_te_p = W.T.dot(sam_te - mu_col)  # projected testing samples

    if classifier == 0:
        # 1NN
        nei = kNN(n_neighbors=1)
        nei.fit(sam_tr_p.T, ind_tr)
        p_label = nei.predict(sam_te_p.T)
        co_mat = met.confusion_matrix(ind_te, p_label)
        accu = np.trace(co_mat).astype(float) / np.sum(co_mat).astype(float)
    else:
        # SVM
        lb_tr, ins_tr, lb_te, ins_te = mf.np2libsvm(ind_tr, sam_tr_p,
                                                    ind_te, sam_te_p)
        accu, co_mat = libsvm(lb_tr, ins_tr, lb_te, ins_te)
        # presumably libsvm() reports accuracy as a percentage -- TODO confirm
        accu = accu / 100
    return accu, co_mat, W
def pca(k, sam_tr, ind_tr, sam_te, ind_te, c, classifier=0):
    """
    eigenface implementation
    ----------------------------------------
    Projects the data onto a k-dimensional PCA subspace and classifies the
    projected test samples.

    inputs:
        k          --size of subspace
        sam_tr     --training set, one sample per column (dm x n_train)
        ind_tr     --training labels
        sam_te     --testing set, one sample per column
        ind_te     --testing labels
        c          --num of classes (not needed by the computation; kept so
                     the signature matches lda)
        classifier --0 for kNN(default), 1 for SVM
    outputs:
        accu   --classification accuracy
        co_mat --confusion mat
        W      --projection mat
    raises:
        ValueError --if classifier is not 0 or 1
    """
    if classifier not in (0, 1):
        # Previously an unknown value skipped both branches and crashed later
        # on an undefined confusion matrix.
        raise ValueError(
            "classifier must be 0 (kNN) or 1 (SVM), got %r" % (classifier,))

    # data preparation
    dm = sam_tr.shape[0]

    # training
    mu = np.mean(sam_tr, axis=1)  # overall mean
    mu_col = mu.reshape(dm, 1)
    # Broadcasting replaces np.tile(..., (1, num_tr*c)), whose repeat count
    # became a float under Python 3 true division.
    B = sam_tr - mu_col
    S = B.T.dot(B)  # n_train x n_train matrix instead of dm x dm
    # NOTE(review): mf.eigs1 presumably maps eigenvectors of S back to
    # feature space using B -- verify against its definition.
    W = mf.eigs1(S, B, k)  # the projection matrix
    W = np.real(W)

    # testing: centre with the training mean, then project
    sam_tr_p = W.T.dot(sam_tr - mu_col)  # projected training samples
    sam_te_p = W.T.dot(sam_te - mu_col)  # projected testing samples

    if classifier == 0:
        # 1NN
        nei = kNN(n_neighbors=1)
        nei.fit(sam_tr_p.T, ind_tr)
        p_label = nei.predict(sam_te_p.T)
        co_mat = met.confusion_matrix(ind_te, p_label)
        accu = np.trace(co_mat).astype(float) / np.sum(co_mat).astype(float)
    else:
        # SVM
        lb_tr, ins_tr, lb_te, ins_te = mf.np2libsvm(ind_tr, sam_tr_p,
                                                    ind_te, sam_te_p)
        accu, co_mat = libsvm(lb_tr, ins_tr, lb_te, ins_te)
        # presumably libsvm() reports accuracy as a percentage -- TODO confirm
        accu = accu / 100
    return accu, co_mat, W
def handwritingClassTest():
    """Fit a 3-nearest-neighbour classifier on 'trainingDigits' and print the
    per-file predictions and the error rate measured on 'testDigits'.

    The true class of every file is the integer before the first '_' in its
    file name. Each file is turned into one row by img2vector.
    """
    # labels of the training files, in directory-listing order
    hwLabels = []
    train_names = listdir('trainingDigits')
    # one row of 1024 values per training file
    trainingMat = np.zeros((len(train_names), 1024))
    for row, name in enumerate(train_names):
        hwLabels.append(int(name.split('_')[0]))
        trainingMat[row, :] = img2vector('trainingDigits/%s' % (name))

    # build and fit the classifier
    classifier = kNN(n_neighbors=3, algorithm='auto')
    classifier.fit(trainingMat, hwLabels)

    # classify every test file and count the mismatches
    test_names = listdir('testDigits')
    mistakes = 0.0
    for name in test_names:
        truth = int(name.split('_')[0])
        guess = classifier.predict(img2vector('testDigits/%s' % (name)))
        print("分类返回结果为%d\t真实结果为%d" % (guess, truth))
        if guess != truth:
            mistakes += 1.0
    print("总共错了%d个数据\n错误率为%f%%" % (mistakes, mistakes / len(test_names) * 100))
def handwritingClassTest():
    """Fit a 3-nearest-neighbour digit classifier and print its test error.

    Training files are read from 'machinelearning/Ch02/trainingDigits' and
    evaluation files from 'machinelearning/Ch02/testDigits'. The true class
    of every file is the integer before the first '_' in its file name.

    Raises:
        ValueError: if the test directory contains no files.
    """
    # The directories are named once so the training and test phases cannot
    # silently point at the same folder (the test phase used to re-read the
    # training folder, which made the reported error rate meaningless).
    train_dir = 'machinelearning/Ch02/trainingDigits'
    test_dir = 'machinelearning/Ch02/testDigits'

    hwLabels = []  # class label of every training file
    trainingFileList = listdir(train_dir)
    m = len(trainingFileList)
    # one image per row, 1024 values per image
    trainingMat = np.zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        # the label is the part of the file name before the first '_'
        hwLabels.append(int(fileNameStr.split('_')[0]))
        trainingMat[i, :] = img2vector('%s/%s' % (train_dir, fileNameStr))

    # kNN classifier with 3 neighbours; 'auto' lets the library pick the
    # search structure
    neigh = kNN(n_neighbors=3, algorithm='auto')
    neigh.fit(trainingMat, hwLabels)

    testFileList = listdir(test_dir)
    mTest = len(testFileList)
    if mTest == 0:
        # avoid an opaque ZeroDivisionError when computing the error rate
        raise ValueError('no test files found in %s' % test_dir)

    errorCount = 0.0
    for fileNameStr in testFileList:
        classNumber = int(fileNameStr.split('_')[0])
        vectorUnderTest = img2vector('%s/%s' % (test_dir, fileNameStr))
        # predict() returns one label per query row; a single image is
        # passed, so take the only element instead of formatting an array
        classifierResult = neigh.predict(vectorUnderTest)[0]
        print("分类返回结果为%d\t真实结果为%d" % (classifierResult, classNumber))
        if classifierResult != classNumber:
            errorCount += 1.0
    print("总共错了%d个数据\n错误率为%f%%" % (errorCount, errorCount/mTest * 100))
def handwritingClassTest():
    """Train a 3-NN classifier on 'trainingDigits' and report how many files
    of 'testDigits' it mislabels.

    The expected class of a file is the integer prefix of its name (the text
    before the first '_'). Prints one line per test file and a final summary
    with the error count and error rate.
    """
    hwLabels = []
    training_files = listdir('trainingDigits')
    sample_count = len(training_files)
    trainingMat = np.zeros((sample_count, 1024))
    row = 0
    while row < sample_count:
        current = training_files[row]
        hwLabels.append(int(current.split('_')[0]))
        trainingMat[row, :] = img2vector('trainingDigits/%s' % (current))
        row += 1

    model = kNN(n_neighbors=3, algorithm='auto')
    model.fit(trainingMat, hwLabels)

    test_files = listdir('testDigits')
    wrong = 0.0
    total = len(test_files)
    for current in test_files:
        expected = int(current.split('_')[0])
        sample = img2vector('testDigits/%s' % (current))
        predicted = model.predict(sample)
        print('分类返回结果为%d\t真实值结果为%d' % (predicted, expected))
        if predicted != expected:
            wrong += 1.0
    print('总错误了%d个数据\n错误率为%f%%' % (wrong, wrong / total * 100))
def handwritingClassTest():
    """Fit a 3-NN classifier on the files in 'trainingDigits' and print its
    predictions and error rate for the files in 'testDigits'.

    File names carry the true label as the integer before the first '_'.
    Every file is converted to one feature row by imageToVector.
    """
    # labels of the training samples
    hwLabels = []
    train_listing = listdir("trainingDigits")
    # zero-initialised matrix: one row per training file, 1024 columns
    trainingMat = np.zeros((len(train_listing), 1024))
    for position, entry in enumerate(train_listing):
        hwLabels.append(int(entry.split('_')[0]))
        trainingMat[position, :] = imageToVector('trainingDigits/' + str(entry))

    # build the classifier and fit it on the training matrix
    knn_model = kNN(n_neighbors=3, algorithm='auto')
    knn_model.fit(trainingMat, hwLabels)

    # number of misclassified test files
    errorCount = 0.0
    test_listing = listdir('testDigits')
    for entry in test_listing:
        actual = int(entry.split('_')[0])
        outcome = knn_model.predict(imageToVector('testDigits/' + str(entry)))
        print("分类返回结果为：" + str(outcome) + "真实结果为：" + str(actual))
        if outcome != actual:
            errorCount += 1.0
    print("共算错了%d个数据\n错误率为%f%%" % (errorCount, errorCount / len(test_listing) * 100))
def score_classifier(symbol, market_df, start_date, end_date, base=60, span=4, profit=.05):
    """Build the labelled feature set for one symbol and score a kNN on it.

    The stock data is fetched, merged with ``market_df``, enriched with
    technical indicators, turned into features (``base``) and labels
    (``span``, ``profit``), standardised, reduced to 10 principal components
    and evaluated with 10 stratified shuffle splits.

    Returns:
        (mean precision, std of precision, share of samples labelled 1)
    """
    # 1-5. assemble the labelled data frame
    frame = get_stock_data(symbol=symbol, start_date=start_date, end_date=end_date)
    frame = merge_datasets(frame, market_df)
    frame = get_tech_indicators(frame)
    frame = create_features(frame, base=base)
    frame = create_labels(frame, span=span, profit=profit)

    # 6. split features and labels
    X, y = split_features_labels(frame)

    # 7. scale, then reduce with PCA so the kNN trains faster
    # NOTE(review): scaler and PCA are fitted on all rows before
    # cross-validation, so every fold sees statistics of its held-out rows.
    scaled = StandardScaler().fit_transform(X)
    reduced = PCA(n_components=10, random_state=42).fit_transform(scaled)  # whiten=True

    # 8. fit classifier
    # NOTE(review): this fit looks redundant if cross_val_score refits a
    # clone per split -- confirm before removing.
    model = kNN()
    model.fit(reduced, y)

    # 9. calculate precision
    splitter = StratifiedShuffleSplit(n_splits=10, test_size=.1, random_state=42)
    fold_precision = cross_val_score(model, reduced, y, cv=splitter, scoring='precision')
    positive_share = np.bincount(y)[1] / len(y)
    return fold_precision.mean(), fold_precision.std(), positive_share
def Knn_train(data, label):
    """Fit a distance-weighted 10-NN classifier and print its accuracy on the
    validation set returned by ``Data_change(train=False)``.

    Args:
        data: training samples passed straight to ``fit``.
        label: training labels passed straight to ``fit``.

    Raises:
        ValueError: if the validation set is empty or its data and labels
            differ in length.
    """
    # build and fit the kNN classifier
    neigh = kNN(n_neighbors=10, algorithm='auto', weights='distance', n_jobs=1)
    neigh.fit(data, label)

    # validation data set
    valid_data, valid_label = Data_change(train=False)
    mTest = len(valid_data)
    if mTest == 0:
        # the accuracy below would otherwise fail with ZeroDivisionError
        raise ValueError('validation set is empty')
    if len(valid_label) != mTest:
        raise ValueError('validation data and labels differ in length')

    # One flattened row per sample, exactly what the per-sample
    # reshape(1, -1) produced, but predicted in a single call instead of
    # one predict() call per sample.
    valid_data = np.array(valid_data).reshape(mTest, -1)
    valid_label = np.array(valid_label)
    predicted = neigh.predict(valid_data)

    # A sample counts as correct only if every label component matches;
    # reshaping both sides to (mTest, -1) keeps the comparison row-wise for
    # both scalar and vector labels.
    hits = (predicted.reshape(mTest, -1) == valid_label.reshape(mTest, -1)).all(axis=1)
    resultCount = float(hits.sum())
    print("总共对了%d个数据\n准确率率为%f%%" % (resultCount, resultCount / mTest * 100))
def train():
    """Fit a 3-NN classifier on './digits/trainSet' and print the error rate
    obtained on './digits/testSet'.

    The true label of each file is the integer before the first '_' in its
    name. One line comparing true and predicted label is printed per test
    file, followed by the overall error rate.
    """
    labels = []
    train_files = listdir('./digits/trainSet')
    # one 1024-value row per image (32*32 img size)
    trainMatrix = np.zeros((len(train_files), 1024))
    for row, name in enumerate(train_files):
        labels.append(int(name.split('_')[0]))
        trainMatrix[row, :] = img2vector('./digits/trainSet/%s' % (name))

    model = kNN(n_neighbors=3, algorithm='auto')
    model.fit(trainMatrix, labels)

    test_files = listdir('./digits/testSet')
    misses = 0.0
    for name in test_files:
        truth = int(name.split('_')[0])
        predicted = model.predict(img2vector('./digits/testSet/%s' % (name)))
        print('label: %d vs predLabel: %d' % (truth, predicted))
        if truth != predicted:
            misses += 1.0
    print('Error Rate : %f%%' % (misses / len(test_files) * 100))
def main(args):
    """Run the capture -> detect -> describe -> match loop forever.

    Reads from ``args``: people, use_sklearn_knn, k, framerate, resolution,
    model, side, detection_confidence, match_confidence. Sets
    ``args.use_caffe`` to whether 'senet' occurs in ``args.model``.

    When ``args.people`` is given, ``args.people[0]`` is a text file with one
    name per line and ``args.people[1]`` a descriptor matrix readable by
    ``np.loadtxt``; ids are assigned assuming 10 descriptors per person.
    Timing and match information is printed for every frame. The function
    does not return.
    """
    if args.people:
        with open(args.people[0]) as f:
            people = [line.rstrip() for line in f]
        people = np.array(people)
        auth_descriptors = np.loadtxt(args.people[1], dtype=np.float32)
        # 10 consecutive descriptor rows belong to each person
        auth_id = np.arange(len(people)).repeat(10)
        print('Authorized Features:', auth_descriptors.shape)

        # The classifier needs the descriptors loaded above and is only used
        # for matching, which itself requires args.people.
        if args.use_sklearn_knn:
            from sklearn.neighbors import KNeighborsClassifier as kNN
            knn = kNN(args.k, weights='distance', n_jobs=4)
            knn.fit(auth_descriptors, auth_id)

    # initialize camera
    vs = VideoStream(usePiCamera=True,
                     framerate=args.framerate,
                     resolution=args.resolution)
    vs.start()

    # initialize face detector
    DET_NET_DEF = 'detector/res10_300x300_ssd.prototxt'
    DET_NET_WEIGHTS = 'detector/res10_300x300_ssd_iter_140000.caffemodel'
    start = time.time()
    detector = cv2.dnn.readNetFromCaffe(DET_NET_DEF, DET_NET_WEIGHTS)
    end = time.time()
    print('Detector loading time:', (end - start))

    # initialize the extraction network
    SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
    MODEL = os.path.join(
        SCRIPT_DIR,
        'extractor/caffe_models/{0}_caffe/{0}'.format(args.model))
    EXT_NET_DEF = '{}.prototxt'.format(MODEL)
    EXT_NET_WEIGHTS = '{}.caffemodel'.format(MODEL)
    LAYER = 'pool5/7x7_s1'  # 'classifier'
    mean = np.array([91.4953, 103.8827, 131.0912]).reshape(3, 1, 1)
    args.use_caffe = 'senet' in args.model

    start = time.time()
    if args.use_caffe:
        os.environ['GLOG_minloglevel'] = '2'
        import caffe
        extractor = caffe.Net(EXT_NET_DEF, caffe.TEST, weights=EXT_NET_WEIGHTS)
        extractor.blobs['data'].reshape(1, 3, args.side, args.side)
    else:
        extractor = cv2.dnn.readNetFromCaffe(EXT_NET_DEF, EXT_NET_WEIGHTS)
    end = time.time()
    print('Extractor loading time:', (end - start))

    while True:
        start_whole = time.time()

        # capture image from camera
        start = time.time()
        img = vs.read()
        if img is None:
            continue  # skip initial empty frames due to camera init. delay
        end = time.time()
        capture_time = end - start
        print('\n\tCapture:', capture_time, 's')

        # detect faces
        start = time.time()
        img_det = cv2.resize(img, (300, 300))
        img_det = cv2.dnn.blobFromImage(img_det, 1.0, (300, 300),
                                        (104.0, 177.0, 123.0), swapRB=False)
        detector.setInput(img_det)
        faces = detector.forward().squeeze()
        end = time.time()
        del img_det
        detection_time = end - start

        # column 2 holds the detection score, columns 3..6 the box corners
        confidences = faces[:, 2]
        faces = faces[confidences > args.detection_confidence, 3:7]
        print('\tDetect :', detection_time, 's,', len(faces), 'faces')

        for face in faces:
            # scale the box by the resolution repeated twice (x, y, x, y)
            face *= np.tile(args.resolution, 2)
            (startX, startY, endX, endY) = face.astype("int")
            # A negative start index would wrap around to the other side of
            # the image when slicing; clamp it to the frame instead.
            startX, startY = max(startX, 0), max(startY, 0)
            face = img[startY:endY, startX:endX]
            if face.size == 0:
                print('\tDiscarded empty bounding box')
                continue

            # preprocess face
            start = time.time()
            face = preprocess_image(face, mean, side=args.side)
            end = time.time()
            preproc_time = end - start
            print('\tPreProc:', preproc_time, 's')

            # get the description
            start = time.time()
            if args.use_caffe:
                extractor.blobs['data'].data[...] = face
                descriptor = extractor.forward(end=LAYER)[LAYER].squeeze()
            else:
                extractor.setInput(face)
                descriptor = extractor.forward(LAYER)
            end = time.time()
            extraction_time = end - start
            print('\tExtract:', extraction_time, 's')

            if args.people:
                start = time.time()
                descriptor = normalize(descriptor.reshape(1, -1))
                if args.use_sklearn_knn:  # sklearn knn
                    # predict_proba returns one row per query; a single
                    # descriptor is passed, so row 0 holds the class scores.
                    # Indexing the 2-D result with the argmax raised or
                    # returned a whole row.
                    confidences = knn.predict_proba(descriptor)[0]
                    best = np.argmax(confidences)
                    # score columns follow knn.classes_; map back to the id
                    person_id = knn.classes_[best]
                    confidence = confidences[best]
                else:  # VIR knn
                    person_id, confidence = knn_score(
                        descriptor, auth_descriptors, auth_id, args.k)
                end = time.time()
                match_time = end - start
                match = (people[person_id]
                         if confidence > args.match_confidence
                         else 'Unauthorized')
                match = '{} (Conf = {:.2f})'.format(match, confidence)
                print('\tMatch :', match_time, 's,', match)

        end_whole = time.time()
        whole = end_whole - start_whole
        print('\tTOTAL :', whole, 's')
#split training data and test data, The ratio is 4: 1 X_train, X_test, y_train, y_test = train_test_split(select_X, y1, test_size=0.2, random_state=0) print(X_train.shape) print(y_train.shape) print(X_test.shape) print(y_test.shape) #cross validation and grid search for hyperparameter estimation param_dist = {'weights': ["uniform", "distance"]} cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0) clf = GridSearchCV(kNN(), param_grid=param_dist, cv=cv) clf = clf.fit(X_train, y_train.values.ravel()) print("Best estimator found by grid search:") print(clf.best_estimator_) #apply the classifier on the test data and show the accuracy of the model print('the acuracy for all is:') print(clf.score(X_test, y_test.values.ravel())) prediction = clf.predict(X_test) #use the metrics.classification to report. print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, prediction)) print("Classification report:\n %s\n" % metrics.classification_report(y_test, prediction)) ########################################################################################################################
# SVM on the LOL data: learning-curve plot followed by evaluation.
estimator_lol = SVC(C=C_val, gamma=gamma_val, kernel='rbf', random_state=100)
samp_lol, SVM_train_score_lol, SVM_train_time_lol, SVM_pred_time_lol = plotTraining(
    estimator_lol, X_train, y_train, title="SVM for LOL games")
evaluation(estimator_lol, X_train, X_test, y_train, y_test)

# ### 5. KNN

# In[19]:

# NBA data: new 80/20 split (no random_state, so it changes on every run),
# neighbour-count sweep, then a 20-neighbour model.
X_train, X_test, y_train, y_test = train_test_split(X1, Y1, test_size=0.20)
hyperKNN(X_train, y_train, X_test, y_test, title="F1 Score(NBA games)\nHyperparameter : No. Neighbors")
estimator_nba = kNN(n_neighbors=20, n_jobs=-1)
# NOTE(review): the title says "kNN SVM" although the estimator is a kNN --
# looks like a leftover from the SVM cell; confirm and rename.
samp_nba, kNN_train_score_nba, kNN_train_time_nba, kNN_pred_time_nba = plotTraining(
    estimator_nba, X_train, y_train, title="kNN SVM for NBA games")
evaluation(estimator_nba, X_train, X_test, y_train, y_test)

# In[20]:

# LOL data: same procedure with a 5-neighbour model. This rebinds
# estimator_lol, replacing the SVM created above.
X_train, X_test, y_train, y_test = train_test_split(X2, Y2, test_size=0.20)
hyperKNN(X_train, y_train, X_test, y_test, title="F1 Score(LOL games)\nHyperparameter: No. Neighbors")
estimator_lol = kNN(n_neighbors=5, n_jobs=-1)
# NOTE(review): same "kNN SVM" wording in the title as in the NBA cell.
samp_lol, kNN_train_score_lol, kNN_train_time_lol, kNN_pred_time_lol = plotTraining(
    estimator_lol, X_train, y_train, title="kNN SVM for LOL games")
print(f"Eigenvectors {evecs}") # list the eigenvectors transf = evecs * LA.inv(np.sqrt( np.diagflat(evals))) # compute the transformation matrix print("Transformation Matrix = \n", transf) transf_x = data_inp * transf # compute the transformed matrix print("The Transformed x = \n", transf_x) # Proof for orthogonalization xtx = transf_x.transpose( ) * transf_x # this should yield an identity matrix as orthogonalized variables would # have zero correlation between them. print("Expect an Identity Matrix = \n", xtx) kNNSpec = kNN(n_neighbors=5, algorithm='brute') # specify the parameters for KNN classifier. knn_fit = kNNSpec.fit(transf_x, fraud_df['FRAUD']) # fit the model. print( f"Accuracy for model: {round(knn_fit.score(transf_x, fraud_df['FRAUD']), 4)}" ) # compute accuracy of model on # training data. new_inp = [[7500, 15, 3, 127, 2, 2]] # test input. transf_inp = new_inp * transf # transform test input. transf_inp_nbrs = knn_fit.kneighbors( transf_inp, return_distance=False) # compute the nearest neighbors for the # test input, the distance values are not needed and hence, has been omitted. # print the input and output values for nearest neighbors. for nbr in transf_inp_nbrs.tolist()[0]:
# MACs of networks in dataset networks = [z for (_, _, z) in list(dataset)[3:]] # If we want only selected wifis in dataset: network_names = ["ap", "hub0", "hub1", "hub2"] networks = [z for (x, _, z) in list(dataset)[3:] if x in network_names] features = np.asarray(dataset[network_names]) found_networks = [0] * len(networks) # a = dataset.hist() # plt.plot(a) clf = kNN(n_neighbors=2) print("-" * 30) print(clf) clf.fit(features, labels) # wifi_monitor = net.wifi_mon(interface = 'ap') stop = False while not stop: try: # cells = net.parse_scan(None, wifi_monitor) # print(cells) # unit test mock some test data cells = [ { "bssid": "00:0b:6b:de:ea:36", "frequency": "2437",
def fMAP(samples,nnn=25):
    """Return the sample whose farthest of its ``nnn`` neighbours is closest.

    A neighbour model is fitted on ``samples`` and queried with the same
    samples; the row with the smallest value in the last distance column is
    returned.
    """
    fitted = kNN(n_neighbors=nnn).fit(samples)
    neighbour_dist, _ = fitted.kneighbors(samples)
    # last column = distance to the farthest of the nnn returned neighbours
    farthest = neighbour_dist[:, -1]
    return samples[np.argmin(farthest)]
def execute_program():
    """Interactive end-to-end run: rebuild the database, generate samples,
    train a kNN model, save it to './model.m' and validate it on freshly
    generated data.

    Every stage reports through the nested ``status`` helper, which
    terminates the process on the first failure. The shell commands used to
    reset 'train_data' (``rd``/``md``) are Windows commands.
    """

    def status(word):
        # Print a green status line for 'OK' (case-insensitive). Anything
        # else is reported in red, waits for Enter and then ends the process
        # via os._exit(0), so code after a failing status() call never runs.
        word = str(word)
        if word.upper() != 'OK':
            beautiful_output.red_normal('--> Status: ' + '[' + "Error because " + str(word) + ']')
            input('Enter to quit...')
            os._exit(0)
        beautiful_output.green_normal('--> Status: ' + '[' + str(word) + ']')

    # Reset the database and the train_data directory
    print('Begin to reconstruct...', end=' ')
    try:
        DB.reconstruct()
        # `rd` is used rather than `del` to keep the CLI output clean
        os.system('rd /s/q train_data')
        os.system('md train_data')
        status('OK')
    except Exception as e:
        status(e)

    print('\nExecute generating progress...')
    try:
        num = int(input('The number of code you wanna generate: '))
        print('Generating...')
        generator.generate(num)
    except Exception as e:
        # the fallback only keeps `num` bound; status() exits the process
        num = 0
        status(e)

    # print sample database
    print('\n')
    beautiful_output.underline('DATABASE CHECK:')
    DB.disp_DB()
    print('\nInitialize database...', end=' ')
    status('OK')

    # Build the cleaned data set
    print('Remake the database, it may take a while...')
    try:
        data = data_generator.generate()
    except Exception as e:
        data = []
        status(e)

    # print out the clean data: first item of the first value in `data`
    beautiful_output.underline('\nData check:')
    try:
        print(list(data.values())[0][0])
        print('\nData Check...', end=' ')
        status('OK')
    except Exception as e:
        status(e)

    # print log
    beautiful_output.underline('\nLog:')
    print('-------------------------------')
    print('train data number: ' + str(num * 4))
    print('train data Pairs : ' + str(len(list(data.keys()))))
    # share of the 26 possible keys present, cut to 4 characters
    print('covered data rate: ' + str(len(list(data.keys())) / 26 * 100)[:4] + '%')
    print('data shape : ' + str(list(data.values())[0][0].shape))
    print('-------------------------------')

    # data constructor for training
    def construct_data():
        # Flatten every 2-D item stored under each key of `data` into a 1-D
        # feature row and pair it with its key as label. Reads whatever
        # `data` is bound to at call time, so it can be reused after `data`
        # is regenerated. Returns [features, labels].
        label_total = list(data.keys())
        feature = []
        label = []
        for i in label_total:
            for ii in data[i]:
                # flatten rows x cols into a single vector
                ii = ii.reshape(1, ii.shape[1] * ii.shape[0])[0]
                feature.append(ii)
                label.append(i)
        return [feature, label]

    # Build the training arrays
    print('\nReconstruct the feature and label array...')
    try:
        temp = construct_data()
        feature = temp[0]
        label = temp[1]
        beautiful_output.underline('\nCheck the feature:')
        print(feature[0][:10])
        print('\nReconstruct data...', end=' ')
        status('OK')
    except Exception as e:
        label = []
        feature = []
        status(e)

    print('\nTraining...', end=' ')
    try:
        # the neighbour count is set to the number of distinct labels
        neighbor_num = len(np.unique(label))
        mode = kNN(n_neighbors=neighbor_num, algorithm='auto')
        mode.fit(feature, label)
        status('OK')
    except Exception as e:
        neighbor_num = 0
        mode = None
        status(e)

    # save model
    print('\nSave the model...', end=' ')
    try:
        joblib.dump(mode, './model.m')
        status('OK')
    except Exception as e:
        status(e)

    # Validate accuracy on a freshly generated data set (num / 4 requested)
    print('\nValidate model accuracy')
    print('processing...')
    try:
        print('\nReconstruct...')
        DB.reconstruct()
        os.system('rd /s/q train_data')
        os.system('md train_data')
        print('\nGenerating test data...')
        generator.generate(int(num / 4))
        print('Clean the data')
        data = data_generator.generate()
        print('Reconstruct the data', end=' ')
        temp = construct_data()
        feature = temp[0]
        label = temp[1]
        predict_label = mode.predict(feature)
        # number of predictions equal to their label
        compare = sum(list(map(lambda x, y: x == y, predict_label, label)))
        # percentage as text, cut to 4 characters
        accuracy = str(compare / len(label) * 100)[:4]
        status('OK')
    except Exception as e:
        predict_label = []
        label = []
        accuracy = None
        status(e)

    beautiful_output.underline('\nModel accuracy: ')
    print('---------------------')
    print('Predict: ' + str(predict_label[:10]) + '...')
    print('Actual: ' + str(label[:10]) + '...')
    print('---------------------')
    print(accuracy + '%')

    # print final summary
    # NOTE(review): predict_label holds the validation predictions at this
    # point, so the 'Train data' / 'Test data' figures are derived from the
    # validation set size -- confirm that is what should be reported.
    beautiful_output.underline('\nSummary:')
    print('---------------------')
    print('Train data: ' + str(len(predict_label)))
    print('Test data: ' + str(int(len(predict_label) * 0.2)))
    print('Neighbor: ' + str(neighbor_num) + '/26')
    print('Model Accuracy: ' + accuracy + '%')
    print('Model Address: ' + './model.m')
    print('Train method: ' + 'Knn')
    print('---------------------')
# pip install scikit-optimize from sklearn.neighbors import KNeighborsClassifier as kNN from skopt import BayesSearchCV import warnings warnings.filterwarnings("ignore") import sklearn.datasets as ds # 数据准备 iris = ds.load_iris(as_frame=True) irisData = iris.data irisTarget = iris.target.values print('irisData:\n', irisData) print('irisTarget:\n', irisTarget) # parameter ranges are specified by one of below # from skopt.space import Real, Categorical, Integer knn = kNN() # 定义贝叶斯搜索的参数组合 grid_param = { 'n_neighbors': list(range(2, 11)), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'] } # 实例化贝叶斯搜索器,传入分类模型和参数组合以及迭代次数 Bayes = BayesSearchCV(knn, grid_param, n_iter=10, random_state=14) Bayes.fit(irisData, irisTarget) # best parameter combination print("最优参数:", Bayes.best_params_) # Score achieved with best parameter combination print("最优分数:", Bayes.best_score_)
# define X and y training and testing sets X, y = data[symbol + '_X'], data[symbol + '_y'] X_test, y_test = test_data[symbol + '_X'], test_data[symbol + '_y'] ##### Training ################################## # scale scaler = StandardScaler() X = scaler.fit_transform(X) data[symbol + '_scaler'] = scaler # predict pca = PCA(n_components=10, random_state=42) X = pca.fit_transform(X) clf = kNN() clf.fit(X, y) data[symbol + '_pca'] = pca data[symbol + '_clf'] = clf ##### Cross Validating ################################## # validate cv = StratifiedShuffleSplit(n_splits=10, test_size=.1, random_state=42) scores = cross_val_score(clf, X, y, cv=cv, scoring='precision') # save results row = (strategy, symbol) try: results.loc[row, 'CV_Success%'] = np.bincount(y)[1] / len(y) results.loc[row, 'CV_Precision_Mean'] = scores.mean()
from sklearn.neighbors import KNeighborsClassifier as kNN

# Rows with a missing 'D.1_values' become the "test" frame, rows with a
# value become the training frame; only numeric columns are kept.
X_test = happiness_num_df[happiness_num_df['D.1_values'].isna()].select_dtypes([np.number])  # 4k obs
X_train = happiness_num_df[happiness_num_df['D.1_values'].notna()].select_dtypes([np.number])  # 36.3k obs

# NOTE(review): 'N_Entrevista' is dropped from happiness_num_df only after
# X_train / X_test were sliced from it, so both feature frames appear to
# still contain that column -- confirm whether it should be excluded.
happiness_num_df = happiness_num_df.select_dtypes([np.number]).\
    drop(columns=['N_Entrevista'])
print(happiness_num_df.dtypes)

# keep references that still include the target column
Xtr, Xtt = X_train, X_test
# features: target column removed, remaining gaps filled with 0
X_test, X_train = X_test.drop(columns=['D.1_values']).fillna(0), X_train.drop(columns=['D.1_values']).fillna(0)
# NOTE(review): y_test comes from the rows where 'D.1_values' is missing,
# so it holds only missing values and cannot serve as ground truth.
y_train, y_test = np.array(Xtr['D.1_values']), np.array(Xtt['D.1_values'])
print(X_train.shape, len(y_train), X_test.shape, len(y_test))

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

knn = kNN(n_neighbors=10)  # it1 (n=20 OR n=10, error = 0.02), it2 (n=100, error = 0.03)
knn.fit(X_train, y_train)

dt = DecisionTreeClassifier(max_depth=24, min_samples_leaf=0.05)  # it1 <md=8, msl=0.15> ~ lightly underfitted,
# itn <md=24, msl=0.05> ~ lightly underfitted
dt.fit(X_train, y_train)

classifiers = [('Logistic Regression', logreg),
               ('K Nearest Neighbours', knn),
               ('Classification Tree', dt)]

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

# every classifier is fitted again here, on the same data as above
for clf_name, clf in classifiers:
    clf.fit(X_train, y_train)
def __init__(self, nn_model, layer='fc6', k=3):
    """Set up the feature extractor and the kNN classifier.

    Args:
        nn_model: network providing ``get_layer(name)`` and ``input``.
        layer: name of the layer whose output is used as the feature.
        k: number of neighbours for the distance-weighted kNN classifier.
    """
    network_output = nn_model.get_layer(layer).output
    # Build the truncated model from the network that was passed in; the
    # previous code read `model.input`, a name this method never defines.
    self.feature_extraction_model = Model(nn_model.input, network_output)
    self.clf = kNN(n_neighbors=k, weights='distance')
def _tune_classifier(estimator, param_grid, X, y, label):
    """Grid-search ``estimator`` (5-fold CV, balanced accuracy) and report the best setting.

    Returns the fitted ``GridSearchCV`` object (``refit=True``).
    """
    search = GridSearchCV(estimator,
                          param_grid,
                          refit=True,
                          scoring='balanced_accuracy',
                          cv=5).fit(X, y)
    # The search optimises balanced accuracy, so report it under that name
    # (it used to be logged as "F1-score").
    print(
        f'balanced accuracy for {label} {search.best_params_} is {search.best_score_}'
    )
    return search


def _dump_model(model, path):
    """Pickle ``model`` to ``path``, closing the file even if pickling fails."""
    with open(path, 'wb') as handle:
        pickle.dump(model, handle)


def combinational_cost(data1, data2, data3, data4, reg_type, image_tag,
                       no_of_folds, number_rep):
    '''
    Compare supervised classifiers that flag mis-registered images from their
    cost values, save the tuned models and screen the test subjects with the
    tuned kNN model.

    Parameters
    ----------
    data1 : arrays
        matrix of all costs of group1 (normal) for training/testing. It is
        transposed on entry so that each individual cost (feature) becomes a column
    data2 : arrays
        matrix of all costs of group2 (abnormal) for training/testing,
        transposed in the same way
    data3 : arrays
        matrix of all costs of group1 (normal) for validation, transposed in
        the same way
    data4 : arrays
        matrix of all costs of group2 (abnormal) for validation. Currently
        unused: the validation set is built from data3 only
    reg_type : str
        registration type, either rigid (6-dof) or affine (12-dof), or it could be non-linear.
    image_tag : str
        image identifier; appears in log messages and saved-model file names
        and is used to select the cost files of each test subject
    no_of_folds : int
        specify number of folds for nested cross-validation
    number_rep : int
        number of repetitions of the stratified k-fold split

    Returns
    -------
    None. Results are printed, the tuned models are pickled under the
    hard-coded ``save_model`` directory, and a message is printed for every
    test subject whose predicted probability of a correct registration is
    below 50%.

    Notes
    -----
    Reads the module-level names ``subpath2``, ``voi_size`` and ``step_size``.
    '''
    print(f'classifier comparison for {image_tag}-{reg_type}--------------')

    # transposing and creating labels for data1 (label 0 = correctly aligned)
    X_normal = np.transpose(data1)  # to make each feature into a column
    x_normal_label = np.zeros(len(X_normal))
    print(
        f'number of correctly aligned images for cross-validation are {len(x_normal_label)}'
    )

    balance_data = 0  # Since the generated misaligned images are 5 times higher, the two classes can be balanced by considering this flag variable one

    # transposing and creating labels for data2 (label 1 = misaligned)
    if balance_data:
        X_misaligned = np.transpose(data2)[:np.shape(X_normal)[0], :]
    else:
        X_misaligned = np.transpose(data2)
    x_misaligned_label = np.ones(len(X_misaligned))
    print(
        f'number of misaligned images for cross-validation are {len(x_misaligned_label)}'
    )

    # data for validation: only the correctly aligned images (data3) are used
    X_normal_val = np.transpose(data3)
    x_normal_val_label = np.zeros(len(X_normal_val))
    print(f'number of images for testing are {len(x_normal_val_label)}')
    X_val = X_normal_val
    y_val = x_normal_val_label

    # combining data1 and data2 and the corresponding labels
    X = np.concatenate((X_normal, X_misaligned))
    y = np.concatenate((x_normal_label, x_misaligned_label))

    visualize_costs = 0  # This will do a 3D plot to visualize the costs
    if visualize_costs:
        visualize_cost_values(X, y, standardize=1)

    # The costs are deliberately left unscaled: scaling may not be necessary
    # in this case as all these 3 costs lie between 0 and 1.

    unsupervised_learning = 0  # clustering models created without giving labels
    if unsupervised_learning:
        kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
        visualize_cost_values(X, 1 - kmeans.labels_, standardize=1)
        print(
            f'balanced accuracy using KMeans Clustering algorithm is {metrics.balanced_accuracy_score(y, 1-kmeans.labels_)}'
        )
        print(
            f'tested BA score using KMeans Clustering algorithm is {metrics.balanced_accuracy_score(y_val, 1-kmeans.predict(X_val))}'
        )

        agglo = AgglomerativeClustering(n_clusters=2).fit(X)
        visualize_cost_values(X, 1 - agglo.labels_, standardize=1)
        print(
            f'balanced accuracy using Agglomerative Clustering algorithm is {metrics.balanced_accuracy_score(y, 1-agglo.labels_)}'
        )

    # doing Grid Search CV for hyper-parameter tuning
    X_gridCV = X
    print(
        'doing hyperparameter tuning of the ML models using Grid Search Cross-Validation'
    )

    lda_parameters = {
        'solver': ('svd', 'lsqr', 'eigen'),
        'tol': [0.001, 0.0001, 0.00001]
    }
    gridCV_lda = _tune_classifier(LDA(n_components=1), lda_parameters,
                                  X_gridCV, y, 'LDA')

    # NOTE(review): max_features='auto' was removed from RandomForestClassifier
    # in newer scikit-learn releases -- confirm against the installed version.
    rfc_parameters = {
        'n_estimators': [1, 10, 20, 40, 60, 80, 100],
        'criterion': ('gini', 'entropy'),
        'max_depth': [2, 4, 6, 8, 10],
        'max_features': ('auto', 'sqrt', 'log2')
    }
    gridCV_rfc = _tune_classifier(RandomForestClassifier(), rfc_parameters,
                                  X_gridCV, y, 'Random Forest')

    svc_parameters = {
        'kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
        'gamma': [1e-3, 1e-4],
        'C': [1, 10, 100, 1000]
    }
    gridCV_svc = _tune_classifier(SVC(), svc_parameters, X_gridCV, y,
                                  'Support Vector Machine')

    knn_parameters = {
        'n_neighbors': [3, 7, 11, 15, 19, 21],
        'weights': ('uniform', 'distance'),
        'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute')
    }
    gridCV_knn = _tune_classifier(kNN(), knn_parameters, X_gridCV, y,
                                  'K Nearest Neighbors')

    ada_parameters = {
        'n_estimators': [1, 10, 20, 40, 60, 80, 100],
        'learning_rate': [0.8, 0.9, 1]
    }
    gridCV_ada = _tune_classifier(AdaBoostClassifier(), ada_parameters,
                                  X_gridCV, y, 'Adaptive Boosting')

    # Repeated K-fold cross validation, n_splits specifies the number of folds, n_repeats specifies the no.of repetitions
    folds = RepeatedStratifiedKFold(n_splits=no_of_folds,
                                    n_repeats=number_rep)

    # Classifiers evaluated in every fold, in this order. Each entry builds a
    # fresh, unfitted estimator so that no state leaks between folds.
    cv_models = (
        ('lda', lambda: LDA(solver='eigen', shrinkage='auto', n_components=1)),
        ('qda', lambda: QDA()),
        # (it could be done in LDA transformed space if you have large number of features)
        ('rfc', lambda: RandomForestClassifier(criterion='gini',
                                               n_estimators=100)),
        ('svm', lambda: SVC(kernel='rbf', gamma='scale', probability=True)),
        ('gnb', lambda: GaussianNB()),
        ('knn', lambda: kNN(n_neighbors=15)),
        ('lor', lambda: LogisticRegression()),
        ('ada', lambda: AdaBoostClassifier(n_estimators=100)),
    )
    scores = {name: [] for name, _ in cv_models}
    aucs = {name: [] for name, _ in cv_models}

    for train_index, test_index in folds.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        for name, make_model in cv_models:
            score, roc_auc, _ = classifier_accuracy(make_model(), X_train,
                                                    X_test, y_train, y_test)
            scores[name].append(score)
            aucs[name].append(roc_auc)

    # Note: 'cross_val_score' method from sklearn could be used directly on the classifier model to avoid the above for loop. Further, f1-score could be used instead of accuracy metric if number of positive samples (mis-aligned) are low.
    report_cv_summary = 0  # set to one to print the averaged fold results
    if report_cv_summary:
        for name, label in (('lda', 'LDA'), ('rfc', 'RandomForest'),
                            ('svm', 'SVM'), ('gnb', 'Naive Bayes'),
                            ('knn', 'kNN'), ('ada', 'Ada Boost')):
            print(
                f'accuracy using {label} classifier for {image_tag}-{reg_type} is: {np.average(scores[name])}, AUC is: {np.average(aucs[name])}'
            )

    # saving the tuned models; pickle.load can be used to load a model later
    # and its predict method to categorize new cases
    save_model = '/home/tummala/mri/ml_classifier_models_checking_reg'
    os.makedirs(save_model, exist_ok=True)
    model_suffix = reg_type + image_tag
    _dump_model(gridCV_lda, save_model + '/' + 'lda_' + model_suffix)
    _dump_model(gridCV_rfc, save_model + '/' + 'rfc_' + model_suffix)
    _dump_model(gridCV_svc, save_model + '/' + 'svm_' + model_suffix)
    _dump_model(GaussianNB().fit(X, y),
                save_model + '/' + 'gnb_' + model_suffix)
    _dump_model(gridCV_knn, save_model + '/' + 'knn_' + model_suffix)
    _dump_model(gridCV_ada, save_model + '/' + 'ada_boost_' + model_suffix)

    # Screen every test subject with the tuned kNN model.
    for subject in os.listdir(subpath2):
        cost_folder = subpath2 + '/' + subject + '/cost' + str(voi_size) + str(
            step_size)
        local_cost_vector = []
        # NOTE(review): os.listdir order is arbitrary, so the order of the
        # costs in the sample depends on the directory listing -- confirm it
        # matches the feature order used for training.
        for data_file in os.listdir(cost_folder):
            if (reg_type in data_file and image_tag in data_file
                    and 'alignedToT1' not in data_file):
                cost_data = np.loadtxt(cost_folder + '/' + data_file)
                # second value of each cost file is the local cost
                local_cost_vector.append(cost_data[1])

        if not local_cost_vector:
            print(f'No {image_tag} image for {subject}')
            continue

        # One row holding however many costs were found (three in the
        # original setup); the classifier rejects a mismatching feature count.
        sample = np.reshape(np.array(local_cost_vector), (1, -1))
        # Column 0 is the probability of label 0, i.e. correctly aligned.
        reg_quality = gridCV_knn.predict_proba(sample)[0][0] * 100
        if reg_quality < 50:
            print(
                f'Quality of {reg_type} registration for {subject} using kNN Classifier is {reg_quality}'
            )

    # plotting ROC curve for Sensitivity/Specificity all above classifiers
    # NOTE(review): metrics.plot_roc_curve / plot_confusion_matrix /
    # plot_precision_recall_curve were removed from newer scikit-learn
    # releases -- confirm the installed version before enabling these flags.
    plot_roc_curves = 0
    if plot_roc_curves:
        lda_disp = metrics.plot_roc_curve(gridCV_lda,
                                          X_val,
                                          y_val,
                                          drop_intermediate=False)
        print('accuracy for LDA',
              metrics.accuracy_score(y_val, gridCV_lda.predict(X_val)))
        svm_disp = metrics.plot_roc_curve(gridCV_svc,
                                          X_val,
                                          y_val,
                                          ax=lda_disp.ax_)
        print('accuracy for SVM',
              metrics.accuracy_score(y_val, gridCV_svc.predict(X_val)))
        gnb_disp = metrics.plot_roc_curve(GaussianNB().fit(X, y),
                                          X_val,
                                          y_val,
                                          ax=lda_disp.ax_)
        print('accuracy for GNB',
              metrics.accuracy_score(y_val,
                                     GaussianNB().fit(X, y).predict(X_val)))
        knn_disp = metrics.plot_roc_curve(gridCV_knn,
                                          X_val,
                                          y_val,
                                          ax=lda_disp.ax_)
        print('accuracy for kNN',
              metrics.accuracy_score(y_val, gridCV_knn.predict(X_val)))
        rfc_disp = metrics.plot_roc_curve(gridCV_rfc,
                                          X_val,
                                          y_val,
                                          ax=lda_disp.ax_)
        print('accuracy for RFC',
              metrics.accuracy_score(y_val, gridCV_rfc.predict(X_val)))
        metrics.plot_confusion_matrix(gridCV_rfc,
                                      X_val,
                                      y_val,
                                      colorbar=False)
        plt.show()
        ada_disp = metrics.plot_roc_curve(gridCV_ada,
                                          X_val,
                                          y_val,
                                          ax=lda_disp.ax_)
        print('accuracy for Ada Boost',
              metrics.accuracy_score(y_val, gridCV_ada.predict(X_val)))

    # plotting Precision-Recall ROC curve for all above classifiers
    plot_precision_recall_curves = 0
    if plot_precision_recall_curves:
        lda_disp = metrics.plot_precision_recall_curve(
            LDA(solver='eigen', shrinkage='auto', n_components=1).fit(X, y),
            X_val, y_val)
        svm_disp = metrics.plot_precision_recall_curve(SVC(
            kernel='linear', gamma='scale', probability=True).fit(X, y),
                                                       X_val,
                                                       y_val,
                                                       ax=lda_disp.ax_)
        gnb_disp = metrics.plot_precision_recall_curve(GaussianNB().fit(X, y),
                                                       X_val,
                                                       y_val,
                                                       ax=lda_disp.ax_)
        knn_disp = metrics.plot_precision_recall_curve(kNN(n_neighbors=15).fit(
            X, y),
                                                       X_val,
                                                       y_val,
                                                       ax=lda_disp.ax_)
        rfc_disp = metrics.plot_precision_recall_curve(RandomForestClassifier(
            criterion='gini', n_estimators=100).fit(X, y),
                                                       X_val,
                                                       y_val,
                                                       ax=lda_disp.ax_)
        ada_disp = metrics.plot_precision_recall_curve(
            AdaBoostClassifier(n_estimators=100).fit(X, y),
            X_val,
            y_val,
            ax=lda_disp.ax_)
def main():
    """Run the full classifier comparison on the breast-cancer and phishing data.

    Loads both CSV files from the working directory, preprocesses them, splits
    each 70/30 into train and test sets, and then, for each of decision tree,
    kNN, neural network, SVM and boosted tree: draws a model-complexity curve,
    builds the estimator, plots its learning curve and evaluates it on the
    test split. Finishes with fit-time, prediction-time and ROC comparisons
    and two bar charts of hard-coded F1 scores. Takes no arguments and returns
    nothing; all results are shown through the sibling plotting helpers and
    matplotlib.
    """
    # ---- Loading the datasets ----
    cancer_data = pd.read_csv('breast_cancer.csv')
    print("Data has", len(cancer_data), "rows and", len(cancer_data.columns),
          "columns.")
    phishing_data = pd.read_csv('PhishingWebsites.csv')
    print("Data has", len(phishing_data), "rows and",
          len(phishing_data.columns), "columns.")

    # ---- Preprocessing: breast cancer ----
    # Drop the unnecessary columns and replace the target values with 0/1.
    y_can = cancer_data.diagnosis  # malignant (M) or benign (B)

    # The column "Unnamed: 32" feature includes NaN so drop it from the data. Also drop "id" as it is not a feature and
    # "diagnosis" as it is the label
    column_names = ['Unnamed: 32', 'id', 'diagnosis']
    X_can = cancer_data.drop(column_names, axis=1)
    X_can = preprocessing.scale(X_can)

    # Convert string labels to numerical values
    # NOTE(review): the assignments below modify the array returned by
    # `.values` in place, which may also write through to `cancer_data` --
    # confirm this is intended.
    y_can = y_can.values
    y_can[y_can == 'M'] = 1
    y_can[y_can == 'B'] = 0
    y_can = y_can.astype(int)

    column_order = list(cancer_data)
    column_order.insert(0, column_order.pop(
        column_order.index('diagnosis')))  # move target diagnosis to front
    cancer_data = cancer_data.loc[:, column_order]
    # The result of describe() is not used here.
    cancer_data.describe(include='all')

    # ---- Preprocessing: phishing ----
    # Several columns are categorical with the levels {-1,0,1} and the rest
    # are all binary with levels {-1,1}. The 3-level columns are passed to
    # pd.get_dummies to create additional {0,1} features, and -1 is then
    # replaced by 0 so that the binary features also use the levels {0,1}.
    # NOTE(review): pd.get_dummies only expands object/category columns by
    # default -- confirm these columns are not loaded as integers.
    column_names = [
        'URL_Length', 'having_Sub_Domain', 'SSLfinal_State', 'URL_of_Anchor',
        'Links_in_tags', 'SFH', 'web_traffic', 'Links_pointing_to_page'
    ]
    data_1hot = phishing_data[column_names]
    data_1hot = pd.get_dummies(data_1hot)
    data_others = phishing_data.drop(column_names, axis=1)
    phishing_data = pd.concat([data_1hot, data_others], axis=1)
    phishing_data = phishing_data.replace(-1, 0).astype('category')
    column_order = list(phishing_data)
    column_order.insert(0, column_order.pop(column_order.index('Result')))
    phishing_data = phishing_data.loc[:,
                                      column_order]  # move the target variable 'Result' to the front
    # The result of describe() is not used here.
    phishing_data.describe(include='all')

    # ---- Splitting the datasets ----
    # Training set is 70% of each dataset, test set the remaining 30%.
    X_can_train, X_can_test, y_can_train, y_can_test = train_test_split(
        X_can, y_can, test_size=0.3, random_state=18)

    # Column 0 is the target ('Result' was moved to the front above).
    X_phish = np.array(phishing_data.values[:, 1:], dtype='int64')
    y_phish = np.array(phishing_data.values[:, 0], dtype='int64')
    X_ph_train, X_ph_test, y_ph_train, y_ph_test = train_test_split(
        X_phish, y_phish, test_size=0.3, random_state=18)

    # Global matplotlib font sizes for every figure below.
    plt.rcParams['xtick.labelsize'] = 12
    plt.rcParams['ytick.labelsize'] = 12
    plt.rcParams['axes.titlesize'] = 12
    plt.rcParams['font.size'] = 12

    # ---- Decision Tree Classifier ----
    # Entropy-based splits; the tree is pre-pruned through 'max_depth' and
    # 'min_samples_leaf', both taken from TreeGridSearchCV.
    DTree_classifier(
        X_can_train,
        y_can_train,
        X_can_test,
        y_can_test,
        title=
        "Model Complexity Curve for Decision Tree (Breast Cancer Data)\nHyperparameter : Tree Max Depth"
    )
    start_leaf_n = round(0.005 * len(X_can_train))
    end_leaf_n = round(
        0.05 * len(X_can_train))  # leaf nodes of size [0.5%, 5%] will be tested
    max_depth, min_samples_leaf = TreeGridSearchCV(start_leaf_n, end_leaf_n,
                                                   X_can_train, y_can_train)
    estimator_can = DecisionTreeClassifier(max_depth=max_depth,
                                           min_samples_leaf=min_samples_leaf,
                                           random_state=100,
                                           criterion='entropy')
    train_samp_can, DT_train_score_can, DT_fit_time_can, DT_pred_time_can = plot_learning_curve(
        estimator_can,
        5,
        X_can_train,
        y_can_train,
        title="Decision Tree Breast CancerData")
    DTree_fp_can, DTree_tp_can = final_classifier_evaluation(
        estimator_can,
        X_can_train,
        X_can_test,
        y_can_train,
        y_can_test,
        title="Decision Tree Breast CancerData")

    DTree_classifier(
        X_ph_train,
        y_ph_train,
        X_ph_test,
        y_ph_test,
        title=
        "Model Complexity Curve for Decision Tree (Phishing Data)\nHyperparameter : Tree Max Depth"
    )
    start_leaf_n = round(0.005 * len(X_ph_train))
    end_leaf_n = round(
        0.05 * len(X_ph_train))  # leaf nodes of size [0.5%, 5%] will be tested
    max_depth, min_samples_leaf = TreeGridSearchCV(start_leaf_n, end_leaf_n,
                                                   X_ph_train, y_ph_train)
    estimator_phish = DecisionTreeClassifier(max_depth=max_depth,
                                             min_samples_leaf=min_samples_leaf,
                                             random_state=100,
                                             criterion='entropy')
    train_samp_phish, DT_train_score_phish, DT_fit_time_phish, DT_pred_time_phish = plot_learning_curve(
        estimator_phish,
        10,
        X_ph_train,
        y_ph_train,
        title="Decision Tree Phishing Data")
    DTree_fp_phish, DTree_tp_phish = final_classifier_evaluation(
        estimator_phish,
        X_ph_train,
        X_ph_test,
        y_ph_train,
        y_ph_test,
        title="Decision Tree Phishing Data")

    # ---- k-Nearest Neighbours (neighbour counts are hard-coded) ----
    kNN_classifier(
        X_can_train,
        y_can_train,
        X_can_test,
        y_can_test,
        title=
        "Model Complexity Curve for kNN (Breast Cancer Data)\nHyperparameter : No. Neighbors"
    )
    estimator_can = kNN(n_neighbors=14, n_jobs=-1)
    train_samp_can, kNN_train_score_can, kNN_fit_time_can, kNN_pred_time_can = plot_learning_curve(
        estimator_can,
        5,
        X_can_train,
        y_can_train,
        title="kNN Breast Cancer Data")
    KNN_fp_can, KNN_tp_can = final_classifier_evaluation(
        estimator_can,
        X_can_train,
        X_can_test,
        y_can_train,
        y_can_test,
        title="kNN Breast Cancer Data")

    kNN_classifier(
        X_ph_train,
        y_ph_train,
        X_ph_test,
        y_ph_test,
        title=
        "Model Complexity Curve for kNN (Phishing Data)\nHyperparameter : No. Neighbors"
    )
    estimator_phish = kNN(n_neighbors=20, n_jobs=-1)
    train_samp_phish, kNN_train_score_phish, kNN_fit_time_phish, kNN_pred_time_phish = plot_learning_curve(
        estimator_phish,
        10,
        X_ph_train,
        y_ph_train,
        title="kNN Phishing Data")
    KNN_fp_ph, KNN_tp_ph = final_classifier_evaluation(
        estimator_phish,
        X_ph_train,
        X_ph_test,
        y_ph_train,
        y_ph_test,
        title="kNN Phishing Data")

    # ---- Neural network ----
    # hlist is not referenced again inside this function.
    hlist = np.linspace(1, 150, 30).astype('int')
    NN_classifier(
        X_can_train,
        y_can_train,
        X_can_test,
        y_can_test,
        title=
        "Model Complexity Curve for NN (Breast Cancer Data)\nHyperparameter : No. Hidden Units"
    )
    h_units, learn_rate = NNGridSearchCV(X_can_train, y_can_train)
    estimator_can = MLPClassifier(hidden_layer_sizes=(h_units, ),
                                  solver='adam',
                                  activation='logistic',
                                  learning_rate_init=learn_rate,
                                  random_state=100)
    train_samp_can, NN_train_score_can, NN_fit_time_can, NN_pred_time_can = plot_learning_curve(
        estimator_can,
        5,
        X_can_train,
        y_can_train,
        title="Neural Net Breast Cancer Data")
    NN_fp_can, NN_tp_can = final_classifier_evaluation(
        estimator_can,
        X_can_train,
        X_can_test,
        y_can_train,
        y_can_test,
        title="Neural Net Breast Cancer Data")

    NN_classifier(
        X_ph_train,
        y_ph_train,
        X_ph_test,
        y_ph_test,
        title=
        "Model Complexity Curve for NN (Phishing Data)\nHyperparameter : No. Hidden Units"
    )
    h_units, learn_rate = NNGridSearchCV(X_ph_train, y_ph_train)
    estimator_phish = MLPClassifier(hidden_layer_sizes=(h_units, ),
                                    solver='adam',
                                    activation='logistic',
                                    learning_rate_init=learn_rate,
                                    random_state=100)
    train_samp_phish, NN_train_score_phish, NN_fit_time_phish, NN_pred_time_phish = plot_learning_curve(
        estimator_phish,
        10,
        X_ph_train,
        y_ph_train,
        title="Neural Net Phishing Data")
    NN_fp_ph, NN_tp_ph = final_classifier_evaluation(
        estimator_phish,
        X_ph_train,
        X_ph_test,
        y_ph_train,
        y_ph_test,
        title="Neural Net Phishing Data")

    # ---- Neural network loss curves (fixed hyperparameters) ----
    # NOTE(review): the "test" curve comes from refitting the same estimator
    # on the test split, i.e. it is a training loss on the test data rather
    # than a held-out loss -- confirm this is what the plot should show.
    estimator_can = MLPClassifier(hidden_layer_sizes=(30, ),
                                  solver='adam',
                                  activation='logistic',
                                  learning_rate_init=0.01,
                                  random_state=100,
                                  max_iter=200)
    estimator_can.fit(X_can_train, y_can_train)
    loss_values_can = estimator_can.loss_curve_
    estimator_can.fit(X_can_test, y_can_test)
    loss_can_test = estimator_can.loss_curve_

    estimator_phish = MLPClassifier(hidden_layer_sizes=(50, ),
                                    solver='adam',
                                    activation='logistic',
                                    learning_rate_init=0.05,
                                    random_state=100,
                                    max_iter=200)
    estimator_phish.fit(X_ph_train, y_ph_train)
    loss_values_ph = estimator_phish.loss_curve_
    estimator_phish.fit(X_ph_test, y_ph_test)
    loss_ph_test = estimator_phish.loss_curve_

    plt.figure()
    plt.title("Loss Curve for NN")
    plt.xlabel("Number of Iterations")
    plt.ylabel("Loss")
    plt.plot(loss_values_can,
             'o-',
             color="g",
             label="Train Breast Cancer Data")
    plt.plot(loss_can_test, 'o-', color="r", label="Test Breast Cancer Data")
    plt.legend(loc="best")
    plt.show()

    plt.figure()
    plt.title("Loss Curve for NN")
    plt.xlabel("Number of Iterations")
    plt.ylabel("Loss")
    plt.plot(loss_values_ph, 'o-', color="g", label="Train Phishing Data")
    plt.plot(loss_ph_test, 'o-', color="r", label="Test Phishing Data")
    plt.legend(loc="best")
    plt.show()

    # ---- Support vector machine ----
    # NOTE(review): gamma_val is passed together with kernel='linear';
    # presumably gamma has no effect for a linear kernel -- confirm.
    SVM_classifier(
        X_can_train,
        y_can_train,
        X_can_test,
        y_can_test,
        title=
        "Model Complexity Curve for SVM (Breast Cancer Data)\nHyperparameter : Kernel Function"
    )
    C_val, gamma_val = SVMGridSearchCV(X_can_train, y_can_train)
    # Despite its name, estimator_bank is the breast-cancer SVM.
    estimator_bank = svm.SVC(C=C_val,
                             gamma=gamma_val,
                             kernel='linear',
                             random_state=100)
    train_samp_can, SVM_train_score_can, SVM_fit_time_can, SVM_pred_time_can = plot_learning_curve(
        estimator_bank,
        5,
        X_can_train,
        y_can_train,
        title="SVM Breast Cancer Data")
    SVM_fp_can, SVM_tp_can = svm_classifier_evaluation(
        estimator_bank,
        X_can_train,
        X_can_test,
        y_can_train,
        y_can_test,
        title="SVM Breast Cancer Data")

    SVM_classifier(
        X_ph_train,
        y_ph_train,
        X_ph_test,
        y_ph_test,
        title=
        "Model Complexity Curve for SVM (Phishing Data)\nHyperparameter : Kernel Function"
    )
    C_val, gamma_val = SVMGridSearchCV(X_ph_train, y_ph_train)
    estimator_phish = svm.SVC(C=C_val,
                              gamma=gamma_val,
                              kernel='linear',
                              random_state=100)
    train_samp_phish, SVM_train_score_phish, SVM_fit_time_phish, SVM_pred_time_phish = plot_learning_curve(
        estimator_phish,
        10,
        X_ph_train,
        y_ph_train,
        title="SVM Phishing Data")
    SVM_fp_ph, SVM_tp_ph = svm_classifier_evaluation(estimator_phish,
                                                     X_ph_train,
                                                     X_ph_test,
                                                     y_ph_train,
                                                     y_ph_test,
                                                     title="SVM Phishing Data")

    # ---- Boosted tree (AdaBoost over depth-1 stumps) ----
    # NOTE(review): learn_rate returned by BoostedGridSearchCV is not passed
    # to AdaBoostClassifier below -- confirm whether that is intended.
    AdaBoost_classifier(
        X_can_train,
        y_can_train,
        X_can_test,
        y_can_test,
        3,
        50,
        title=
        "Model Complexity Curve for Boosted Tree (Breast Cancer Data)\nHyperparameter : No. Estimators"
    )
    start_leaf_n = round(0.005 * len(X_can_train))
    end_leaf_n = round(
        0.05 * len(X_can_train))  # leaf nodes of size [0.5%, 5%] will be tested
    n_est, learn_rate = BoostedGridSearchCV(start_leaf_n, end_leaf_n,
                                            X_can_train, y_can_train)
    dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
    estimator_can = AdaBoostClassifier(base_estimator=dt_stump,
                                       n_estimators=n_est,
                                       random_state=100)
    train_samp_can, BT_train_score_can, BT_fit_time_can, BT_pred_time_can = plot_learning_curve(
        estimator_can,
        5,
        X_can_train,
        y_can_train,
        title="Boosted Tree Breast Cancer Data")
    BT_fp_can, BT_tp_can = final_classifier_evaluation(
        estimator_can,
        X_can_train,
        X_can_test,
        y_can_train,
        y_can_test,
        title="Boosted Tree Breast Cancer Data")

    AdaBoost_classifier(
        X_ph_train,
        y_ph_train,
        X_ph_test,
        y_ph_test,
        3,
        50,
        title=
        "Model Complexity Curve for Boosted Tree (Phishing Data)\nHyperparameter : No. Estimators"
    )
    start_leaf_n = round(0.005 * len(X_ph_train))
    end_leaf_n = round(
        0.05 * len(X_ph_train))  # leaf nodes of size [0.5%, 5%] will be tested
    n_est, learn_rate = BoostedGridSearchCV(start_leaf_n, end_leaf_n,
                                            X_ph_train, y_ph_train)
    dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
    estimator_phish = AdaBoostClassifier(base_estimator=dt_stump,
                                         n_estimators=n_est,
                                         random_state=100)
    train_samp_phish, BT_train_score_phish, BT_fit_time_phish, BT_pred_time_phish = plot_learning_curve(
        estimator_phish,
        10,
        X_ph_train,
        y_ph_train,
        title="Boosted Tree Phishing Data")
    BT_fp_phish, BT_tp_phish = final_classifier_evaluation(
        estimator_phish,
        X_ph_train,
        X_ph_test,
        y_ph_train,
        y_ph_test,
        title="Boosted Tree Phishing Data")

    # ---- Cross-model comparisons ----
    # train_samp_* holds the values from the most recent plot_learning_curve
    # call for each dataset (the boosted-tree one).
    compare_fit_time(train_samp_can, NN_fit_time_can, SVM_fit_time_can,
                     kNN_fit_time_can, DT_fit_time_can, BT_fit_time_can,
                     'Breast Cancer Dataset')
    compare_pred_time(train_samp_can, NN_pred_time_can, SVM_pred_time_can,
                      kNN_pred_time_can, DT_pred_time_can, BT_pred_time_can,
                      'Breast Cancer Dataset')
    compare_roc(NN_fp_can, NN_tp_can, SVM_fp_can, SVM_tp_can, KNN_fp_can,
                KNN_tp_can, DTree_fp_can, DTree_tp_can, BT_fp_can, BT_tp_can,
                'Breast Cancer Dataset')

    compare_fit_time(train_samp_phish, NN_fit_time_phish, SVM_fit_time_phish,
                     kNN_fit_time_phish, DT_fit_time_phish, BT_fit_time_phish,
                     'Phishing Dataset')
    compare_pred_time(train_samp_phish, NN_pred_time_phish,
                      SVM_pred_time_phish, kNN_pred_time_phish,
                      DT_pred_time_phish, BT_pred_time_phish,
                      'Phishing Dataset')
    compare_roc(NN_fp_ph, NN_tp_ph, SVM_fp_ph, SVM_tp_ph, KNN_fp_ph, KNN_tp_ph,
                DTree_fp_phish, DTree_tp_phish, BT_fp_phish, BT_tp_phish,
                'Phishing Dataset')

    # ---- F1-score bar charts ----
    # The scores below are hard-coded constants, not computed in this function.
    classifiers = ('Decision tree', 'kNN', 'Neural network', 'SVM', 'AdaBoost')
    y_pos = np.arange(len(classifiers))
    f1_score_ph = (0.92, 0.93, 0.95, 0.93, 0.93)
    f1_score_can = (0.95, 0.91, 0.98, 0.97, 0.96)

    plt.figure()
    plt.barh(y_pos, f1_score_ph)
    plt.gca().set_yticks(y_pos)
    plt.gca().set_xlim(0.9, 1.0)
    plt.gca().set_yticklabels(classifiers)
    plt.gca().invert_yaxis()  # labels read top-to-bottom
    plt.title('F1 score: Phishing Dataset')
    plt.xlabel('F1 score')
    plt.show()

    plt.figure()
    plt.barh(y_pos, f1_score_can)
    plt.gca().set_yticks(y_pos)
    plt.gca().set_xlim(0.9, 1.0)
    plt.gca().set_yticklabels(classifiers)
    plt.gca().invert_yaxis()  # labels read top-to-bottom
    plt.title('F1 score: Breast Cancer Dataset')
    plt.xlabel('F1 score')
    plt.show()
#scale the attributes attributes = [ "c1", "c2", "c3", "m1", "m2", "m3", "n1", "n2", "n3", "p1", "p2", "p3" ] scaled_attributes = scaledata(dataframe, attributes) scaled_predict = scaledata(dataframe_predict, attributes) #fitting training data model.fit(scaled_attributes, target) #calculate predictions according to fitted model probeB_tna = model.predict(scaled_predict) #calculate Rsquared r2 = metrics.r2_score(target, model.predict(scaled_attributes)) return r2, probeB_tna #kNNreg model kNNregModel = kNN(n_neighbors=6) #Rsquared of kNNreg print "kNNreg:", Rsquared(kNNregModel, probeAs, probeA["tna"], probeBs)[0] #output probeB_tna to csv file "tnaB.csv" with open("tnaB.csv", "wb") as f: thewriter = csv.writer(f) thewriter.writerow(["tna"]) for val in Rsquared(kNNregModel, probeAs, probeA["tna"], probeBs)[1]: thewriter.writerow([val])
print('irisData:\n', irisData) print('irisTarget:\n', irisTarget) # 选择模型 # RandomForest ''' model = RandomForestClassifier() # 参数搜索空间 param_grid = { 'max_depth': np.arange(1, 20, 1), 'n_estimators': np.arange(1, 50, 10), 'max_leaf_nodes': np.arange(2, 100, 10) } ''' # kNN model = kNN() # 参数搜索空间 param_grid = { 'n_neighbors': list(range(2, 11)), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'] } # 网格搜索模型参数 grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_micro') grid_search.fit(irisData, irisTarget) print('best_params_:\n', grid_search.best_params_) print('grid_search best_score_:\n', grid_search.best_score_) print('best_estimator_:\n', type(grid_search.best_estimator_)) '''' # 随机搜索模型参数 rd_search = RandomizedSearchCV(model, param_grid, n_iter=200, cv=5, scoring='f1_micro') rd_search.fit(x, y)
import numpy as np
import pandas
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors as kNN

# Load the fraud data set (the path is specific to the author's machine).
data = pandas.read_csv('/Users/snehamitta/Desktop/ML/Assignment1/Fraud(1).csv', delimiter=',')

# Columns used as the feature space for both the neighbour search and the classifier.
trainData = data[[
    'TOTAL_SPEND', 'DOCTOR_VISITS', 'NUM_CLAIMS', 'MEMBER_DURATION',
    'OPTOM_PRESC', 'NUM_MEMBERS'
]]
print(trainData.shape)

# Build nearest neighbors
kNNSpec = kNN(n_neighbors=5, algorithm='brute', metric='euclidean')
nbrs = kNNSpec.fit(trainData)
# fit() returns the estimator itself, which has no ``shape`` attribute
# (the original ``print(nbrs.shape)`` raised AttributeError); show the
# fitted estimator instead.
print(nbrs)

# Distances to, and row indices of, the 5 nearest training rows of every row.
distances, indices = nbrs.kneighbors(trainData)
print(distances)
print(indices)

target = data[['FRAUD']]
print(target.shape)

neigh = KNeighborsClassifier(n_neighbors=5, algorithm='brute', metric='euclidean')
# ``target`` is a one-column frame; hand the classifier a 1-D label vector so
# scikit-learn does not have to convert a column vector (with a warning).
nbrs = neigh.fit(trainData, target.values.ravel())
class_result = nbrs.predict(trainData)
def kNN_pack(xtrain, xtest, ytrain, k):
    """Fit a ``kNN`` model with ``k`` neighbours and predict the test inputs.

    Args:
        xtrain: Training inputs passed to ``fit``.
        xtest: Inputs passed to ``predict``.
        ytrain: Training targets passed to ``fit``.
        k: Value forwarded as ``n_neighbors``.

    Returns:
        Whatever ``predict`` returns for ``xtest``.
    """
    neighbours_model = kNN(n_neighbors=k)
    neighbours_model.fit(xtrain, ytrain)
    return neighbours_model.predict(xtest)
#k-Nearest Neighbours
start = time.time()

#To create TFxIDF Matrix by converting document terms into a TF-IDF matrix
tfidfvector = TfIdf(min_df=0.01,
                    max_df=0.5,
                    sublinear_tf=True,
                    stop_words='english')

#Model for Dataset 1
# Learn the vocabulary/IDF weights on the training documents and transform
# them in one pass; the test documents reuse the fitted vectorizer.
x_train1_tfidf_matrix = tfidfvector.fit_transform(x_variableTrain1)
x_test1_tfidf_matrix = tfidfvector.transform(x_variableTest1)

# scikit-learn's nearest-neighbour estimators do not accept ``random_state``;
# passing it raised TypeError in the original, so it is dropped here.
k_nearest = kNN(n_neighbors=12, metric="minkowski")
k_n_n = k_nearest.fit(x_train1_tfidf_matrix, y_variableTrain1)
knn_pred1 = k_n_n.predict(x_test1_tfidf_matrix)
knn_pred1_train = k_n_n.predict(x_train1_tfidf_matrix)

# Accuracy on the held-out and on the training documents, as percentages.
accuracy_dataset1 = metrics.accuracy_score(y_variableTest1, knn_pred1)
accuracy_dataset1_train = metrics.accuracy_score(y_variableTrain1,
                                                 knn_pred1_train)
accuracy_dataset1 *= 100
accuracy_dataset1_train *= 100
print(
    "Accuray on Testing Dataset1 (Debate Data) Using k-Nearest Neighbour: %.2f"
    % (accuracy_dataset1) + "%")
print(
    "Accuray on Training Dataset1 (Debate Data) Using k-Nearest Neighbour: %.2f"
    % (accuracy_dataset1_train) + "%")
train.head() # In[ ]: from mlxtend.classifier import StackingCVClassifier as SCVC # In[ ]: from sklearn.neighbors import KNeighborsClassifier as kNN from sklearn.svm import SVC from sklearn.ensemble import RandomForestClassifier as RFC from sklearn.linear_model import RidgeClassifier as RC # In[ ]: clf1 = kNN() clf2 = SVC(probability=True) clf3 = RFC() meta_clf = RC() # In[ ]: stacker = SCVC(classifiers=[clf1, clf2, clf3, clf1], meta_classifier=meta_clf, use_probas=True, use_features_in_secondary=True) # In[ ]: for c in train.columns: train[c] = train[c].fillna(train[c].median())
# for iter in range(1000): # print nca(A, X, y) # Ascale.append( np.sum(A) ) # # sns.plt.plot(Ascale) # sns.plt.show() # from sklearn.datasets import make_classification from sklearn.neighbors import KNeighborsClassifier as kNN from sklearn.model_selection import cross_val_score X, y = make_classification(n_samples=100, n_features=2, n_redundant=0) plt.scatter(X[:, 0], X[:, 1], c=y) plt.show() clf = kNN(weights='distance') scores = cross_val_score(clf, X, y, scoring='neg_log_loss', cv=25) print(np.mean(scores)) A = np.eye(X.shape[1]) Xt = transform(A, X) print(Xt.shape) Ascale = [] for iter in range(20): if iter % 5 == 0: print('Iteration', iter) nca(A, X, y) # print 'A',A flattenedA = np.sum(A) Ascale.append(np.sum(A))
# One-hot encode the hour of day of each training record.
hour = train.Dates.dt.hour
hour = pd.get_dummies(hour)
#Build new array
# Combine the hour, day and district indicator columns and attach the target.
train_data = pd.concat([hour, days, district], axis=1)
train_data['crime_category']= crime_category
#Repeat for test data
# NOTE: days, district and hour are rebound to the *test* frame's dummies here.
days = pd.get_dummies(test.DayOfWeek)
district = pd.get_dummies(test.PdDistrict)
hour = test.Dates.dt.hour
hour = pd.get_dummies(hour)
# Array for the test data
test_data = pd.concat([hour, days, district], axis=1)
# Hold out part of the labelled data for validation (66% kept for training).
training, validation = train_test_split(train_data, train_size=.66)
# BUILDING THE CLASSIFIER
# Feature names are the column labels of the dummy frames.
# NOTE(review): these frames were rebuilt from the test set above, so this
# assumes train and test produce the same dummy columns -- confirm.
daysfeatures = [x for x in days]
districtfeatures = [x for x in district]
hoursf = [h for h in hour]
features = daysfeatures + districtfeatures + hoursf
classifier = kNN()
classifier.fit(training[features], training['crime_category'])
validation_predict = classifier.predict(validation[features])
# Testing the model
# Python 2 print statement: reports accuracy on the validation split.
print accuracy_score(validation['crime_category'], validation_predict)
f1_train.append(f1_score(y_train, y_pred_train)) plt.plot(klist, f1_test, 'o-', color='r', label='Test F1 Score') plt.plot(klist, f1_train, 'o-', color='b', label='Train F1 Score') plt.ylabel('Model F1 Score') plt.xlabel('No. Neighbors') plt.title(title) plt.legend(loc='best') plt.tight_layout() # plt.show() plt.savefig(algoName + "hyper" + ".png") print("stage 1") hyperKNN( X_train, y_train, X_test, y_test, title= "Model Complexity Curve for kNN (Phishing Data)\nHyperparameter : No. Neighbors" ) print("stage 2") estimator_phish = kNN(n_neighbors=25, n_jobs=-1) print("stage 3") train_samp_phish, kNN_train_score_phish, kNN_fit_time_phish, kNN_pred_time_phish = plot_learning_curve( estimator_phish, X_train, y_train, title="kNN Phishing Data") print("stage 4") final_classifier_evaluation(estimator_phish, X_train, X_test, y_train, y_test)
def combinational_cost(data1, data2, no_of_folds):
    '''
    Compare several supervised classifiers at separating two groups of costs.

    Each classifier is evaluated once per stratified fold through
    ``classifier_accuracy``; the mean accuracy and mean AUC over the folds are
    printed, and the scaler plus the models from the last fold are pickled.

    Parameters
    ----------
    data1 : arrays
        matrix of all costs of group1 (normal). Each individual feature should be arranged as a column
    data2 : arrays
        matrix of all costs of group2 (abnormal). Each individual feature should be arranged as a column
    no_of_folds : int
        specify number of folds for nested cross-validation

    Returns
    -------
    None
        The code shown returns nothing; accuracy and AUC of the combinational
        cost function for each classifier are printed.
    '''
    print(
        f'classifier comparison for DTI metrics with {no_of_folds} fold cross-validation --------------'
    )

    # transposing and creating labels for data1 (label 0)
    X_normal = np.transpose(data1)
    x_normal_label = np.zeros(len(X_normal))

    # transposing and creating labels for data2 (label 1)
    X_misaligned = np.transpose(data2)
    x_misaligned_label = np.ones(len(X_misaligned))

    # combining data1 and data2 and the corresponding labels
    X = np.concatenate((X_normal, X_misaligned))
    y = np.concatenate((x_normal_label, x_misaligned_label))

    # scaling the costs (features) to make sure the ranges of individual
    # features are same to avoid the effect of features that have relatively
    # large values. It may not be necessary in this case as all these 3 costs
    # lie between 0 and 1
    scale = MaxAbsScaler()
    X = scale.fit_transform(X)

    # K-fold cross validation, n_splits specifies the number of folds
    folds = StratifiedKFold(n_splits=no_of_folds)

    # (key, name used in the report, pickle file name or None, factory).
    # A fresh, unfitted estimator is created for every fold.
    classifier_specs = [
        # 1. Linear Discriminant Analysis Classifier
        ('lda', 'LDA', 'lda',
         lambda: LDA(solver='eigen', shrinkage='auto', n_components=1)),
        # 1a. Quadratic Discriminant Analysis Classifier
        ('qda', 'QDA', 'qda', lambda: QDA()),
        # 2. Random Forest Classifier (it could be done in LDA transformed
        # space if you have large number of features)
        ('rfc', 'RandomForest', 'rfc',
         lambda: RandomForestClassifier(criterion='gini', n_estimators=100)),
        # 3. Support Vector Machine Classifier
        ('svm', 'SVM', 'svm',
         lambda: SVC(kernel='rbf', gamma=2, probability=True)),
        # 4. Gaussian Naive Bayes Classifier
        ('gnb', 'Naive Bayes', 'gnb', lambda: GaussianNB()),
        # 5. k-Nearest Neighbour Classifier
        ('knn', 'kNN', 'knn', lambda: kNN(n_neighbors=15)),
        # 6. Logistic Regression Classifier
        ('lor', 'Logistic Regression', 'lor', lambda: LogisticRegression()),
        # 7. Ada Boost Classifier
        ('ada', 'Ada Boost', 'ada_boost',
         lambda: AdaBoostClassifier(n_estimators=100)),
        # 7a. Gradient Boosting Classifier (reported, but never pickled by the
        # original code, so no file name is given)
        ('gra', 'Gradient boosting', None,
         lambda: GradientBoostingClassifier(random_state=0)),
    ]

    accuracies = {spec[0]: [] for spec in classifier_specs}
    aucs = {spec[0]: [] for spec in classifier_specs}
    last_models = {}

    for train_index, test_index in folds.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        for key, _label, _file_name, make_model in classifier_specs:
            model = make_model()
            # Evaluate ONCE per fold so accuracy ([0]) and AUC ([1]) describe
            # the same fit; the original evaluated twice, refitting unseeded
            # models such as the random forest in between.
            result = classifier_accuracy(model, X_train, X_test, y_train,
                                         y_test)
            accuracies[key].append(result[0])  # Accuracy
            aucs[key].append(result[1])  # AUC
            # Presumably classifier_accuracy fits ``model`` in place; the
            # instance from the last fold is what gets pickled below.
            last_models[key] = model

        # 8. Arteficial Neural Network (Deep Learning)
        # model_ann = tf.keras.models.Sequential()
        # model_ann.add(tf.keras.layers.Dense(units = np.shape(X_train)[1] + 1, activation = 'relu', input_shape = (np.shape(X_train)[1],))) # input_shape takes height of the input layer which is usually fed during first dense layer allocation
        # model_ann.add(tf.keras.layers.Dense(units = np.shape(X_train)[1] + 1, activation = 'relu')) # hidden layer
        # model_ann.add(tf.keras.layers.Dense(units = np.shape(X_train)[1] + 2, activation = 'relu')) # hidden layer
        # model_ann.add(tf.keras.layers.Dense(units = np.shape(X_train)[1] + 1, activation = 'relu')) # hidden layer
        # model_ann.add(tf.keras.layers.Dense(units = 2, activation = 'softmax')) # hidden layer
        # model_ann.compile(optimizer = 'sgd', loss = 'binary_crossentropy', metrics = ['accuracy']) # compile the neural network
        # model_ann.fit(X_train, y_train, epochs = 20) # fit the neural network on the training data
        # scores_ann.append(model_ann.evaluate(X_test, y_test)) # network accuracy
        # auc_ann.append(metrics.roc_auc_score(y_test, model_ann.predict_proba(X_test)[:, 1])) # network AUC

    # Note: 'cross_val_score' method from sklearn could be used directly on the
    # classifier model to avoid the above for loop. Further, f1-score could be
    # used instead of accuracy metric if number of positive samples
    # (mis-aligned) are low.
    for key, label, _file_name, _make_model in classifier_specs:
        print(
            f'accuracy using {label} classifier for dti measures is: {np.average(accuracies[key])}, AUC is: {np.average(aucs[key])}\n'
        )
    #print(f'accuracy using ANN for dti measures is: {np.average(scores_ann)}, AUC is: {np.average(auc_ann)}\n')

    save_model = 'D:/Tummala/Parkinson-Data/ml-models-dti1000'
    os.makedirs(save_model, exist_ok=True)

    # saving the fitted scaler and the models from the last fold; each file is
    # closed as soon as it is written
    to_save = [('scale', scale)]
    to_save.extend((file_name, last_models[key])
                   for key, _label, file_name, _make_model in classifier_specs
                   if file_name is not None)
    for file_name, obj in to_save:
        with open(save_model + '/' + file_name, 'wb') as handle:
            pickle.dump(obj, handle)
'Alcalinity of ash', 'Magnesium', 'Total phenols' 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline' ] """ Classes = data[0] Features = data.drop(0, axis = 1) from sklearn.cross_validation import KFold from sklearn.preprocessing import scale from sklearn.grid_search import GridSearchCV from sklearn.neighbors import KNeighborsClassifier as kNN CV = KFold(Classes.count(), n_folds = 5, shuffle = True, random_state = 42) params = {'n_neighbors':list(range(1,50))} grid = GridSearchCV(kNN(), params, cv = CV) grid.fit(X = Features, y = Classes) print(grid.best_score_) print(grid.best_params_) X_scaled = scale(Features) grid_scaled = GridSearchCV(kNN(), params, cv = CV) grid_scaled.fit(X = X_scaled, y = Y) print(grid_scaled.best_score_) print(grid_scaled.best_params_)
# In[15]: phishX, phishY, bankX, bankY = import_data() X_train, X_test, y_train, y_test = train_test_split(np.array(phishX), np.array(phishY), test_size=0.20) hyperKNN( X_train, y_train, X_test, y_test, title= "Model Complexity Curve for kNN (Phishing Data)\nHyperparameter : No. Neighbors" ) estimator_phish = kNN(n_neighbors=20, n_jobs=-1) train_samp_phish, kNN_train_score_phish, kNN_fit_time_phish, kNN_pred_time_phish = plot_learning_curve( estimator_phish, X_train, y_train, title="kNN Phishing Data") final_classifier_evaluation(estimator_phish, X_train, X_test, y_train, y_test) X_train, X_test, y_train, y_test = train_test_split(np.array(bankX), np.array(bankY), test_size=0.20) hyperKNN( X_train, y_train, X_test, y_test, title= "Model Complexity Curve for kNN (Banking Data)\nHyperparameter : No. Neighbors" )
def classifier(self, sdss, des, des_tags=None, sdss_tags = None, train=None, train_size=60000):
    """Train a kNN classifier on DES data to reproduce the SDSS CMASS selection.

    ``sdss`` and ``des`` are used as already row-matched catalogues (no
    matching is done here). A random subset of ``train_size`` rows trains a
    distance-weighted kNN on the DES features, labelled by
    ``self._SDSS_cmass_criteria(sdss)``; the remaining rows form the test set.
    A second ("reverse") kNN is then trained on the SDSS features against the
    forward classifier's predictions, and its completeness/purity is printed.

    Parameters
    ----------
    sdss, des : arrays indexable by integer arrays and exposing ``.size``
        presumably numpy structured arrays with one row per object -- confirm.
    des_tags, sdss_tags : passed as ``tags`` to ``self._arrange_data_for_fitting``.
    train : ignored; it is overwritten by the random training indices (the
        branch that used a caller-supplied training sample is disabled).
    train_size : int
        Number of rows drawn without replacement for training;
        ``np.random.choice`` raises ValueError if it exceeds ``des.size``.

    Returns
    -------
    (pl, mask, mask) where ``pl`` is the fitted forward classifier and ``mask``
    is the boolean forward prediction for every DES row (returned twice, as in
    the original).
    """
    print('after prior cut (s/d) : %s %s' % (len(sdss), len(des)))
    print('train_size  %s' % (train_size,))

    sdss_matched, des_matched = sdss, des
    sdss_cuts = self._SDSS_cmass_criteria(sdss_matched)

    # Random train/test partition of the row indices.
    train = np.random.choice(np.arange(des_matched.size), size=train_size, replace=False)
    train_mask = np.zeros(des_matched.size, dtype=bool)
    train_mask[train] = 1
    test = np.where(~train_mask)[0]
    test_size = np.sum((~train_mask))

    # Forward problem: DES features -> SDSS CMASS membership.
    x = self._arrange_data_for_fitting(des_matched[train], tags=des_tags)
    y = np.zeros(train_size)
    y[sdss_cuts[train]] = 1

    x_test = self._arrange_data_for_fitting(des_matched[test], tags=des_tags)
    y_test = np.zeros(test_size)
    y_test[sdss_cuts[test]] = 1

    # Features of every DES row. The original only defined this in
    # commented-out code, so ``pl.predict(x_all)`` raised NameError.
    x_all = self._arrange_data_for_fitting(des_matched, tags=des_tags)

    print('train set  %s  test set  %s' % (np.sum(train_mask), np.sum((~train_mask))))
    print('cmass/train %s  cmass/test %s  total %s'
          % (np.sum(sdss_cuts[train]), np.sum(sdss_cuts[test]), np.sum(sdss_cuts)))

    from sklearn.neighbors import KNeighborsClassifier as kNN
    n_neighbors = 100  # int(train_size * 0.02)
    print('n_neighbors %s' % (n_neighbors,))

    pl = kNN(n_neighbors=n_neighbors, weights='distance', p=2, n_jobs=-1)
    pl.fit(x, y)
    predict_test = pl.predict(x_test)
    truth_test = y_test == 1
    predict_test_all = pl.predict(x_all)

    print("Classifier completeness: %s" % (np.sum(predict_test * truth_test) * 1. / np.sum(truth_test),))
    print("Classifier purity: %s" % (np.sum(predict_test * truth_test) * 1. / np.sum(predict_test),))
    print("number (test/all) %s %s" % (np.sum(predict_test), np.sum(predict_test_all)))

    # Now reverse it, and see what SDSS galaxies we can be sure are selected
    # by the classifier.
    # Use the same feature helper as above (the original called an unqualified
    # ``arrange_data_for_fitting``) and size the labels to the training rows
    # (the original allocated ``sdss.size`` labels for ``train_size`` samples).
    x = self._arrange_data_for_fitting(sdss[train], tags=sdss_tags)
    y = np.zeros(train_size)
    y[predict_test_all[train] == 1] = 1
    x_test = self._arrange_data_for_fitting(sdss_matched[test], tags=sdss_tags)

    pl2 = kNN(n_neighbors=n_neighbors, weights='distance', p=2, n_jobs=-1)
    pl2.fit(x, y)
    predict_rev = pl2.predict(x_test)

    # NOTE(review): the original also built a ``good`` mask from the reverse
    # predictions but never used or returned it; confirm whether the second
    # return value was meant to be that mask.
    print("Reverse classifier completeness: %s" % (np.sum(predict_rev * predict_test) * 1. / np.sum(predict_test),))
    print("Reverse classifier purity: %s" % (np.sum(predict_rev * predict_test) * 1. / np.sum(predict_rev),))

    return pl, (predict_test_all == 1), (predict_test_all == 1)
# print(image_test, label_test);
image_test = read_image(test_image_path)
label_test = read_label(test_label_path)
image_train = read_image(train_image_path)
label_train = read_label(train_label_path)
# print_num(image_train[0])
# print(label_train)

# Binarize the images. Per the original note: values >= 128 become 1 and
# values < 128 become 0; the error rate rises but the model generalizes better.
image_test = pic_binary(image_test)
image_train = pic_binary(image_train)
# print_num(image_train[0])

knn = kNN(n_neighbors = 5, algorithm = 'auto')
knn.fit(image_train, label_train)
# Only the first 100 test images are classified.
res = knn.predict(image_test[:100])

# Tally hits and misses; zip() stops after the 100 predictions.
right = 0
wrong = 0
for predicted, actual in zip(res, label_test):
    if predicted == actual:
        right += 1
        print("预测值:%s,真是值:%s" % (predicted, actual))
    else:
        wrong += 1
        # Highlight misses in red. The original emitted a bare ESC character
        # ("\033") with no colour code, which is not a complete ANSI sequence.
        print("\033[31m预测值:%s,真是值:%s\033[0m" % (predicted, actual))

# "%.2f" prints two decimals; the original "%2f" only set a minimum width.
print("准确率:%.2f%%" % (right*100/(right+wrong)))
plt.title('Twin moon dataset')
plt.show()

#Divide train and test, forget about the latter for a while
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

# One entry per tested k: test-set error, cross-validated error, and the
# standard deviation of the cross-validation scores.
test_errors = []
errors = []
stds = []
cv_numb = 10
# Largest neighbour count tried; presumably chosen so that k never exceeds the
# number of samples in a cross-validation training fold -- confirm.
upper_bound_k = len(X_train) - (len(X_train) // cv_numb) - 1
print('K from 1 to {}'.format(upper_bound_k))
# k runs over 0, 10, 20, ...; the classifiers below use k + 1 neighbours.
K = range(0, upper_bound_k, 10)
for k in K:
    #Create knn classifier and fit with
    knn = kNN(n_neighbors=k + 1)
    #Perform cross validation with cv_numb divisions
    results = cross_val_score(knn, X_train, y_train, cv=cv_numb)
    #Mean error and std for this step of cross-validation
    # NOTE: despite its name, mean_error is the mean of the cross-validation
    # *scores*; the error stored below is 1 minus that mean.
    mean_error = np.array(results).sum() / cv_numb
    std = np.array(results).std()
    errors.append(1 - mean_error)
    stds.append(std)
    #How this k performs on future data? Retrain with that k and whole set and score test
    tstknn = kNN(n_neighbors=k + 1).fit(X_train, y_train)
    y_pred = tstknn.predict(X_test)
    # Misclassification rate on the held-out test set.
    test_errors.append(1 - np.sum(y_pred == y_test) / len(y_pred))
from sklearn import datasets
from sklearn import neighbors, metrics
from sklearn.neighbors import KNeighborsClassifier as kNN
import matplotlib.pyplot as plt

Digits = datasets.load_digits()

# Preview: the first four images whose label is 3 or 7.
Imglabels = [pair for pair in zip(Digits.images, Digits.target)
             if pair[1] in (3, 7)]
for ind, (image, label) in enumerate(Imglabels[:4]):
    plt.subplot(2, 4, ind + 1)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    # Show the actual digit; the original title was the literal text
    # 'Training: label' on every subplot.
    plt.title('Training: %i' % label)
plt.show()

num = len(Digits.images)  # number of samples
# Flatten every image into one feature row per sample.
imgs = Digits.images.reshape((num, -1))
labs = Digits.target

# First 70% of the samples train the model, the remaining 30% test it.
split = int(num*.7)
y_trainset, x_trainset = labs[:split], imgs[:split]
y_testset, x_testset = labs[split:], imgs[split:]

neighbor = kNN(n_neighbors=3)
neighbor.fit(x_trainset, y_trainset)
new_val = neighbor.predict(x_testset)

print("kNN classifirer reports: %s:\n%s\n" % (neighbor, metrics.classification_report(y_testset, new_val)))
print("Confusion matrix is:", metrics.confusion_matrix(y_testset, new_val))