def test_RadiusNeighborsClassifier_multioutput():
    """Check the radius-neighbors classifier on multi-output targets.

    For every algorithm / weighting combination, the prediction of one
    classifier fitted on all output columns at once must agree with the
    column-wise stack of classifiers fitted on each output separately.
    """
    rng = check_random_state(0)
    n_features = 2
    n_samples = 40
    n_output = 3

    X = rng.rand(n_samples, n_features)
    y = rng.randint(0, 3, (n_samples, n_output))
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    weight_options = [None, 'uniform', 'distance', _weight_func]

    for algorithm, weights in product(ALGORITHMS, weight_options):
        # Reference result: fit one classifier per output column and stack
        # the individual predictions side by side.
        per_output = []
        for column in range(n_output):
            single = neighbors.RadiusNeighborsClassifier(weights=weights,
                                                         algorithm=algorithm)
            single.fit(X_train, y_train[:, column])
            per_output.append(single.predict(X_test))
        y_pred_so = np.vstack(per_output).T
        assert_equal(y_pred_so.shape, y_test.shape)

        # Result under test: a single classifier fitted on all outputs.
        multi = neighbors.RadiusNeighborsClassifier(weights=weights,
                                                    algorithm=algorithm)
        multi.fit(X_train, y_train)
        y_pred_mo = multi.predict(X_test)

        assert_equal(y_pred_mo.shape, y_test.shape)
        assert_array_almost_equal(y_pred_mo, y_pred_so)
def get_classifier(classifier_str):
    """Map a classifier name to a freshly constructed classifier object.

    The classifier is built from the module-level default parameter
    dictionaries (``svm_default_param``, ``libsvm_default_param``,
    ``kn_default_param``, ``rn_default_param``) where one exists, and with
    the library defaults otherwise.

    The per-name override (``kernel`` or ``weights``) is merged into a copy
    of the shared default dictionary, so calling this function does not
    modify the module-level defaults.

    Parameters
    ----------
    classifier_str : str
        One of ``'linearsvc'``, ``'svc_linear'``, ``'svc_rbf'``,
        ``'kn_uniform'``, ``'kn_distance'``, ``'rn_uniform'``,
        ``'rn_distance'``, ``'nc'``, ``'lda'``, ``'qda'``, ``'gnb'``,
        ``'mnb'``, ``'bnb'``, ``'dtree'`` or ``'rforest'``.

    Returns
    -------
    The unfitted classifier instance.

    Raises
    ------
    ValueError
        If ``classifier_str`` is not one of the supported names.
    """
    # Support vector classifiers (polynomial / sigmoid kernels and nuSVC are
    # not offered).
    if classifier_str == 'linearsvc':
        return svm.LinearSVC(**svm_default_param)
    if classifier_str == 'svc_linear':
        return svm.SVC(**dict(libsvm_default_param, kernel='linear'))
    if classifier_str == 'svc_rbf':
        return svm.SVC(**dict(libsvm_default_param, kernel='rbf'))

    # Nearest neighbours (Euclidean distance is the library default).
    if classifier_str == 'kn_uniform':
        return neighbors.KNeighborsClassifier(
            **dict(kn_default_param, weights='uniform'))
    if classifier_str == 'kn_distance':
        return neighbors.KNeighborsClassifier(
            **dict(kn_default_param, weights='distance'))
    if classifier_str == 'rn_uniform':
        return neighbors.RadiusNeighborsClassifier(
            **dict(rn_default_param, weights='uniform'))
    if classifier_str == 'rn_distance':
        return neighbors.RadiusNeighborsClassifier(
            **dict(rn_default_param, weights='distance'))
    if classifier_str == 'nc':
        return neighbors.NearestCentroid()

    # Linear / quadratic discriminant analysis with library-default priors.
    if classifier_str == 'lda':
        return lda.LDA()
    if classifier_str == 'qda':
        return qda.QDA()

    # Naive Bayes variants, all with library-default priors.
    if classifier_str == 'gnb':
        return naive_bayes.GaussianNB()
    if classifier_str == 'mnb':
        return naive_bayes.MultinomialNB()
    if classifier_str == 'bnb':
        return naive_bayes.BernoulliNB()

    # Tree-based models.
    if classifier_str == 'dtree':
        return tree.DecisionTreeClassifier()
    if classifier_str == 'rforest':
        return ensemble.RandomForestClassifier()

    # No supported name matched.
    raise ValueError('Classifier not implemented: %s' % (classifier_str))
def _init_model(self, parms=None):
    """Build the neighbors classifier selected by ``self.args.radius``.

    A radius of 0 selects ``KNeighborsClassifier``; any other value selects
    ``RadiusNeighborsClassifier``. When ``parms`` is given it is unpacked as
    keyword arguments for the estimator; otherwise the estimator is created
    with its defaults.

    Returns the tuple ``('Neighbors', model)``.
    """
    from sklearn import neighbors

    if self.args.radius == 0:
        estimator_cls = neighbors.KNeighborsClassifier
    else:
        estimator_cls = neighbors.RadiusNeighborsClassifier

    estimator_kwargs = {} if parms is None else parms
    return 'Neighbors', estimator_cls(**estimator_kwargs)
def test_radius_neighbors_classifier_outlier_labeling():
    """Check outlier labelling of the radius-based classifier.

    Query points with no training sample within the radius must receive
    ``outlier_label`` (-1 here); all other points get their regular label.
    """
    radius = 0.1
    X = np.array([[1.0, 1.0], [2.0, 2.0], [0.99, 0.99],
                  [0.98, 0.98], [2.01, 2.01]])
    y = np.array([1, 2, 1, 1, 2])

    # (query points, expected labels)
    cases = [
        # no outliers
        (np.array([[1.01, 1.01], [2.01, 2.01]]), np.array([1, 2])),
        # one outlier: the first query has no neighbor within the radius
        (np.array([[1.4, 1.4], [1.01, 1.01], [2.01, 2.01]]),
         np.array([-1, 1, 2])),
    ]

    for algorithm in ALGORITHMS:
        for weights in ('uniform', 'distance', _weight_func):
            clf = neighbors.RadiusNeighborsClassifier(radius=radius,
                                                      weights=weights,
                                                      algorithm=algorithm,
                                                      outlier_label=-1)
            clf.fit(X, y)
            for queries, expected in cases:
                assert_array_equal(expected, clf.predict(queries))
def test_radius_neighbors_classifier(n_samples=40, n_features=5,
                                     n_test_pts=10, radius=0.5,
                                     random_state=0):
    """Test radius-based classification with integer and string labels.

    Training points are drawn uniformly from [-1, 1)^n_features and labelled
    1 when their squared norm is below 0.5, else 0. For every algorithm and
    weighting, the classifier must reproduce the labels of the first
    ``n_test_pts`` training points after a tiny perturbation, both for the
    integer labels and for their string form.
    """
    rng = np.random.RandomState(random_state)
    X = 2 * rng.rand(n_samples, n_features) - 1
    # Builtin ``int`` replaces the ``np.int`` alias, which was removed from
    # NumPy; the resulting dtype is the same.
    y = ((X ** 2).sum(axis=1) < .5).astype(int)
    y_str = y.astype(str)

    weight_func = _weight_func

    for algorithm in ALGORITHMS:
        for weights in ['uniform', 'distance', weight_func]:
            neigh = neighbors.RadiusNeighborsClassifier(radius=radius,
                                                        weights=weights,
                                                        algorithm=algorithm)
            neigh.fit(X, y)
            # Small random offset so queries are close to, but not exactly
            # on, the training points. Drawn inside the loop on purpose so
            # the random stream is consumed once per configuration.
            epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1)
            y_pred = neigh.predict(X[:n_test_pts] + epsilon)
            assert_array_equal(y_pred, y[:n_test_pts])

            # Same check with string class labels.
            neigh.fit(X, y_str)
            y_pred = neigh.predict(X[:n_test_pts] + epsilon)
            assert_array_equal(y_pred, y_str[:n_test_pts])
def KNNski(x, r, X, Y, W='uniform'):
    """Classify ``x`` with a radius-neighbors classifier fitted on (X, Y).

    ``r`` is passed as the classifier's first positional argument, ``W`` as
    its ``weights`` and the metric is fixed to ``'seuclidean'``. ``x`` is
    promoted to at least 2-D before prediction, so a single sample may be
    given as a 1-D vector.

    Returns the array produced by ``predict``.
    """
    model = neighbors.RadiusNeighborsClassifier(r, weights=W,
                                                metric='seuclidean')
    model.fit(X, Y)
    return model.predict(np.atleast_2d(x))
def _fit(cls, dataset, **hp):
    """Fit a radius-neighbors classifier and score it on the test split.

    The classifier uses ``hp['radius']`` as its radius and labels points
    without neighbors as 0. It is trained on ``dataset.train_data`` /
    ``dataset.train_target``; the predictions for ``dataset.test_data`` are
    passed with ``dataset.test_target`` to ``cls._score``, whose result is
    returned.
    """
    model = neighbors.RadiusNeighborsClassifier(radius=hp['radius'],
                                                outlier_label=0)
    model.fit(dataset.train_data, dataset.train_target)
    return cls._score(model.predict(dataset.test_data), dataset.test_target)
def EvalKNNRadius(XT, YT, p=0.25, weights='uniform'):
    """Fit a radius-neighbors classifier on a train split and report on it.

    Parameters
    ----------
    XT, YT : features and targets, split with ``V.train_test_split``.
    p : float
        Fraction of the data held out as the test split.
    weights : passed through as the classifier's ``weights`` argument.

    Prints the value of ``score`` on the train and test splits and then
    hands the test targets and predictions to ``eval_regres``. Returns
    ``None``.
    """
    xtrain, xtest, ytrain, ytest = V.train_test_split(
        XT, YT, test_size=p, random_state=0)
    knn = neighbors.RadiusNeighborsClassifier(radius=30, weights=weights)

    # Single-argument print calls give the same output on Python 2 and 3.
    print("Learning KNN model with " + str(weights) + " weights ")
    kmodel = knn.fit(xtrain, ytrain)
    y_scores = kmodel.predict(xtest)

    # ``score`` of a classifier is its mean accuracy, not an error rate, so
    # the printed label says "accuracy".
    print("k-radius Train accuracy: " + str(kmodel.score(xtrain, ytrain)))
    print("k-radius Test accuracy: " + str(kmodel.score(xtest, ytest)))

    # NOTE(review): eval_regres is defined elsewhere; presumably it reports
    # further metrics on (truth, prediction) - confirm.
    eval_regres(ytest, y_scores)
def radNearestNeighborsGridSeach(X, y):
    """Grid-search a radius-neighbors classifier on (X, y) and print the
    best cross-validated score and parameter set.

    The grid covers 20 radii evenly spaced in [0.1, 2], both weightings and
    the ``'auto'`` / ``'kd_tree'`` algorithms, with points lacking neighbors
    labelled -1. Five-fold cross-validation is used. Returns ``None``.
    """
    param_grid = [{
        'radius': np.linspace(0.1, 2, 20),
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'kd_tree'],
        'outlier_label': [-1],
    }]
    grid_search = skgs.GridSearchCV(skn.RadiusNeighborsClassifier(),
                                    param_grid, cv=5)
    grid_search.fit(X, y)

    # Single-argument print calls give the same output on Python 2 and 3.
    print('Best Score of Grid Search: ' + str(grid_search.best_score_))
    print('Best Params of Grid Search: ' + str(grid_search.best_params_))
def rnc_tunning(Train, Test, sampling=None, scores='f1', label='FRAUDE'):
    """Grid-search the radius and leaf size of a radius-neighbors classifier.

    ``Train`` and ``Test`` are concatenated into one development set. The
    set is optionally resampled, and a 10-fold ``GridSearchCV`` is run over
    ``radius`` (1..100) and ``leaf_size`` (30..300 in steps of 30), scored
    with ``'<scores>_macro'``.

    Parameters
    ----------
    Train, Test : pandas.DataFrame
        Frames containing the ``label`` column and an ``'id_siniestro'``
        column (the latter is dropped before fitting).
    sampling : None, ``'ALLKNN'`` or another value
        ``None`` keeps the data as is, ``'ALLKNN'`` calls ``under_sampling``
        and any other value is forwarded to ``over_sampling`` as ``model``.
    scores : str
        Metric prefix for the grid-search scoring string.
    label : str
        Name of the target column.

    Returns
    -------
    (radius, leaf_size, sampling) : the best radius and leaf size as ints,
    and the ``sampling`` argument unchanged.

    Side effects: prints the search results and writes the best parameter
    values to ``final_files\\sup_rnc.csv``.
    """
    Train = pd.concat([Train, Test], axis=0, ignore_index=True)
    yTrain = Train[[label]]
    xTrain = Train.drop(label, axis=1)

    if sampling == 'ALLKNN':
        xTrain, yTrain = under_sampling(xTrain, yTrain)
    elif sampling is not None:
        xTrain, yTrain = over_sampling(xTrain, yTrain, model=sampling)

    tuned_parameters = [{
        'radius': list(numpy.arange(1, 101, 1)),
        'weights': ['distance'],
        'algorithm': ['auto'],
        'leaf_size': list(numpy.arange(30, 301, 30)),
        'p': [2],
        'outlier_label': [-1],
    }]

    fileModel = GridSearchCV(neighbors.RadiusNeighborsClassifier(),
                             param_grid=tuned_parameters, cv=10,
                             scoring='%s_macro' % scores)
    fileModel.fit(xTrain.drop(['id_siniestro'], axis=1).values,
                  yTrain[label].values)

    print("Best parameters set found on development set:")
    print()
    dict_values = fileModel.best_params_
    print(dict_values)
    print()
    print("Grid scores on development set:")
    print()
    means = fileModel.cv_results_['mean_test_score']
    stds = fileModel.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, fileModel.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

    radius = int(dict_values['radius'])
    leaf_size = int(dict_values['leaf_size'])

    # NOTE(review): with orient="index" the parameter names become the frame
    # index, and index=False leaves the index out of the CSV, so only the
    # values appear to be written - confirm this is intended.
    df = pd.DataFrame.from_dict(dict_values, orient="index")
    df.to_csv('final_files\\sup_rnc.csv', sep=';', encoding='latin1',
              index=False)

    return radius, leaf_size, sampling
def test_radius_neighbors_classifier_zero_distance():
    """Check the radius-based classifier when a query coincides with a
    training sample, i.e. the distance to that sample is zero."""
    train_points = np.array([[1.0, 1.0], [2.0, 2.0]])
    train_labels = np.array([1, 2])
    radius = 0.1

    # The second query is exactly the second training point.
    queries = np.array([[1.01, 1.01], [2.0, 2.0]])
    expected = np.array([1, 2])

    configurations = [(algorithm, weights)
                      for algorithm in ALGORITHMS
                      for weights in ('uniform', 'distance', _weight_func)]

    for algorithm, weights in configurations:
        clf = neighbors.RadiusNeighborsClassifier(radius=radius,
                                                  weights=weights,
                                                  algorithm=algorithm)
        clf.fit(train_points, train_labels)
        assert_array_equal(expected, clf.predict(queries))
def test_radius_neighbors_classifier_when_no_neighbors():
    """Check the radius-based classifier when a query has no neighbors.

    Without an outlier label, predicting for a point that has no training
    sample within the radius must raise a ``ValueError``; predicting for
    points that all have neighbors must succeed.
    """
    train_points = np.array([[1.0, 1.0], [2.0, 2.0]])
    train_labels = np.array([1, 2])
    radius = 0.1

    all_covered = np.array([[1.01, 1.01], [2.01, 2.01]])  # no outliers
    with_outlier = np.array([[1.01, 1.01], [1.4, 1.4]])   # one outlier

    configurations = [(algorithm, weights)
                      for algorithm in ALGORITHMS
                      for weights in ('uniform', 'distance', _weight_func)]

    for algorithm, weights in configurations:
        clf = neighbors.RadiusNeighborsClassifier(radius=radius,
                                                  weights=weights,
                                                  algorithm=algorithm)
        clf.fit(train_points, train_labels)
        # Must not raise: every query has a neighbor.
        clf.predict(all_covered)
        assert_raises(ValueError, clf.predict, with_outlier)
def knnRadiusTuning():
    """Grid-search radius and weighting of a radius-neighbors classifier on
    the data loaded from ``train.csv`` and report the results.

    The ``Exited`` column is the target; all other columns are standardised
    and used as features. Scores and ROC output are produced for both the
    training and the held-out split, and the top-ranked grid rows are
    printed. Returns ``None``.
    """
    train = getTrainingData('train.csv', visualize=False)
    y = train.Exited
    X = StandardScaler().fit_transform(train.drop(['Exited'], axis=1))

    # Hold out 30% of the rows for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)

    search_space = {
        "radius": list(np.arange(5.0, 8.0, 0.1)),
        "weights": ['uniform', 'distance'],
    }
    grid_search_cv = GridSearchCV(neighbors.RadiusNeighborsClassifier(),
                                  search_space, verbose=1, n_jobs=-1, cv=3,
                                  scoring='accuracy')
    grid_search_cv.fit(X_train, y_train)

    # Report on the training split first, then on the held-out split.
    for on_train in (True, False):
        print_score(grid_search_cv, X_train, y_train, X_test, y_test,
                    train=on_train)
    for on_train in (True, False):
        ROC(grid_search_cv, X_train, y_train, X_test, y_test,
            train=on_train)

    results = pd.DataFrame(grid_search_cv.cv_results_)
    printFullRow(results[results['rank_test_score'] == 1])
    # Earlier note kept from the original: "best param setting:
    # n_neighbors == 11/13, p == 2, weights = distance".
    # NOTE(review): n_neighbors and p are not part of this grid, so the note
    # looks carried over from a k-NN search - confirm or remove.


# knnRadiusTuning()
# knnTuning()
def radius_neighbours(X_train, Y_train, X_test, Y_test):
    """Fit a distance-weighted radius-neighbors classifier and return the
    class probabilities for ``X_test``.

    Prints the elapsed time, the accuracy on the test set and the share of
    class-1 predictions that are wrong ("False Ia detection").

    Returns
    -------
    The array from ``predict_proba(X_test)``, or the sentinel ``-9999`` if a
    ``ValueError`` was raised anywhere in fitting, predicting or reporting.
    """
    try:
        started = time.time()
        radius = 0.2  # Come back to this - need a rigorous approach
        clf = neighbors.RadiusNeighborsClassifier(radius, weights='distance')
        clf.fit(X_train, Y_train)
        preds = clf.predict(X_test)

        # Single-argument print calls give the same output on Python 2 and
        # Python 3; print('') replaces the bare Python 2 ``print``.
        print('')
        print('Radius nearest neighbours')
        print('Time taken %s s' % (time.time() - started,))
        print('Accuracy %s' % (sum(preds == Y_test) / float(len(preds)),))

        mismatched = preds[preds != Y_test]
        print('False Ia detection %s'
              % (sum(mismatched == 1) / float(sum(preds == 1)),))

        probs = clf.predict_proba(X_test)
        # ROC computation (fpr, tpr, auc) is currently disabled.
    except ValueError:
        print('ValueError in RNN - probably due to no neighbours within radius')
        probs = -9999

    return probs
# Build the 2-D grid of points that covers the plotting area.
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
mesh_points = np.c_[xx.ravel(), yy.ravel()]

# Colour maps: light colours for the class regions, bold colours for the
# individual samples.
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#003300', '#0000FF'])


def _show_regions(region_labels):
    """Draw the predicted class regions with the training samples on top."""
    plt.figure()
    plt.pcolormesh(xx, yy, region_labels, cmap=cmap_light)
    # Also draw all the training samples.
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.show()


# Predicted class regions of the already fitted classifier.
Z = clf.predict(mesh_points).reshape(xx.shape)
_show_regions(Z)

# Same plot for a distance-weighted radius-neighbors classifier.
clf1 = neighbors.RadiusNeighborsClassifier(10.0, weights='distance')
clf1.fit(X, y)
Z1 = clf1.predict(mesh_points).reshape(xx.shape)
_show_regions(Z1)
# Original author's remark: the figures show that a smaller sparsity
# coefficient is more favourable for the representation result.
def classify(dataSet, labels):
    """Return a radius-neighbors classifier (radius 100.0) fitted on
    ``dataSet`` with targets ``labels``."""
    radius_clf = neighbors.RadiusNeighborsClassifier(radius=100.0)
    radius_clf.fit(dataSet, labels)
    return radius_clf
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Build the mesh used for plotting.
h = .02
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# Train the radius-neighbors classifier.
# ``n_neighbors`` is not a parameter of RadiusNeighborsClassifier (the
# neighborhood is defined by ``radius`` alone), so it is no longer passed.
f = 5.0
clf = neighbors.RadiusNeighborsClassifier(radius=f, weights='distance')
clf.fit(x_train, y_train)

# Print the results. Predictions are made for the whole data set X, which
# includes the samples the classifier was trained on.
answer = clf.predict(X)
# print ('X :', X)
print('answer :', answer.shape)
print('y :', y.shape)
print('avg:', np.mean(answer == y))

# Precision and recall.
# precision_recall_curve is meant for binary classification problems:
# precision, recall, thresholds = precision_recall_curve(y_train, clf.predict(x_train))
# answer = clf.predict_proba(X)[:,1]
print(classification_report(y, answer, target_names=['0', '1', '2']))

# Next: colour the classification result over the whole test space, one
# colour per class.
# Build the training data: rows 0-9, 50-59 and 100-109, stacked vertically.
train_data = np.concatenate((data[0:10, ], data[50:60, ], data[100:110, ]),
                            axis=0)
train_label = np.concatenate(
    (labels[0:10, ], labels[50:60, ], labels[100:110, ]), axis=0)

# Build the test data from the remaining rows, stacked vertically.
test_data = np.concatenate((data[10:50, ], data[60:100, ], data[110:150, ]),
                           axis=0)
test_label_expected = np.concatenate(
    (labels[10:50, ], labels[60:100, ], labels[110:150, ]), axis=0)

# Create the classifier.
# Alternative kept from the original:
# classifier = nb.KNeighborsClassifier(n_neighbors=3,weights='uniform',algorithm='auto')
# ``n_neighbors`` is not a parameter of RadiusNeighborsClassifier, so it is
# no longer passed; the classifier's default radius is used.
classifier = nb.RadiusNeighborsClassifier(weights='uniform',
                                          algorithm='auto')

# Train the classifier.
classifier.fit(train_data, train_label)

# Predict the labels of the test data.
test_label_predicted = classifier.predict(test_data)

# Compare: ``outer`` holds 1 where the prediction differs from the expected
# label and 0 where it matches.
size = len(test_label_predicted)
outer = (np.asarray(test_label_expected)
         != np.asarray(test_label_predicted)).astype(int)
result = np.vstack((test_label_expected, test_label_predicted, outer))
def _rnc_binary_predictions(model, frame, label, threshold):
    """Return flat 0/1 predictions: 1 where the probability left after
    dropping the first ``predict_proba`` column exceeds ``threshold``."""
    proba = model.predict_proba(frame.drop(label, axis=1).values)
    proba = np.delete(proba, 0, axis=1)
    return (proba > threshold).astype(int).ravel()


def rnc_treshold(Train, Valid, Test, radius, leaf_size, sampling=None,
                 label='FRAUDE', beta=2):
    """Choose the probability threshold that maximises F-beta on ``Valid``
    for a radius-neighbors classifier, then report test-set metrics.

    Parameters
    ----------
    Train, Valid, Test : pandas.DataFrame
        Frames containing the ``label`` column plus the feature columns.
        ``Train`` is no longer modified in place.
    radius, leaf_size : passed to ``RadiusNeighborsClassifier``.
    sampling : None, ``'ALLKNN'`` or another value
        ``None`` keeps the training data as is, ``'ALLKNN'`` calls
        ``under_sampling`` and any other value is forwarded to
        ``over_sampling`` as ``model``.
    label : str
        Name of the target column.
    beta : number
        Beta of the F-beta score used both to select the threshold and to
        report the test score. Values above 1 weight recall more heavily
        than precision; beta=1 weights them equally.

    Returns
    -------
    The selected threshold.

    Side effects: prints diagnostics, shows the score-vs-threshold plot and
    calls ``plot_confusion_matrix`` for the test predictions.
    """
    yTrain = Train[label]
    # Work on a copy without the label column instead of deleting the column
    # from the caller's frame.
    xTrain = Train.drop(label, axis=1)

    if sampling == 'ALLKNN':
        xTrain, yTrain = under_sampling(xTrain, yTrain)
    elif sampling is not None:
        xTrain, yTrain = over_sampling(xTrain, yTrain, model=sampling)

    fileModel = neighbors.RadiusNeighborsClassifier(
        radius=radius, weights='distance', algorithm='auto',
        leaf_size=leaf_size, p=2, outlier_label=-1)
    fileModel.fit(xTrain.values, yTrain.values)

    # Median predicted probability for each true class of the validation set.
    print(np.median(fileModel.predict_proba(
        Valid[Valid[label] == 0].drop(label, axis=1).values)))
    print(np.median(fileModel.predict_proba(
        Valid[Valid[label] == 1].drop(label, axis=1).values)))

    valid_true = Valid[label].values
    y_pred_score = fileModel.predict_proba(Valid.drop(label, axis=1).values)
    y_pred_score = np.delete(y_pred_score, 0, axis=1)
    print('min', y_pred_score.min())
    print('max', y_pred_score.max())

    # Evaluate recall, precision and F-beta for every candidate threshold.
    tresholds = np.linspace(0.1, 1.0, 200)
    scores = []
    for treshold in tresholds:
        y_hat = (y_pred_score > treshold).astype(int).ravel()
        scores.append([
            recall_score(y_pred=y_hat, y_true=valid_true),
            precision_score(y_pred=y_hat, y_true=valid_true),
            # Use the caller's beta here too, so the threshold is selected
            # with the same metric that is reported on the test set.
            fbeta_score(y_pred=y_hat, y_true=valid_true, beta=beta)])
    scores = np.array(scores)
    print('max_scores', scores[:, 2].max(), scores[:, 2].argmax())

    plot.plot(tresholds, scores[:, 0], label='$Recall$')
    plot.plot(tresholds, scores[:, 1], label='$Precision$')
    plot.plot(tresholds, scores[:, 2], label='$F_{%s}$' % beta)
    plot.ylabel('Score')
    plot.xlabel('Threshold')
    plot.legend(loc='best')
    plot.show()

    final_tresh = tresholds[scores[:, 2].argmax()]

    test_true = Test[label].values
    y_hat_test = _rnc_binary_predictions(fileModel, Test, label, final_tresh)

    print('Final threshold: %.3f' % final_tresh)
    print('Test Recall Score: %.3f'
          % recall_score(y_pred=y_hat_test, y_true=test_true))
    print('Test Precision Score: %.3f'
          % precision_score(y_pred=y_hat_test, y_true=test_true))
    print('Test F%s Score: %.3f'
          % (beta, fbeta_score(y_pred=y_hat_test, y_true=test_true,
                               beta=beta)))

    cnf_matrix = confusion_matrix(test_true, y_hat_test)
    plot_confusion_matrix(cnf_matrix, classes=['Normal', 'Anormal'],
                          title='Confusion matrix')

    # The former "Variable Importance" bar chart was removed: it read
    # ``feature_importances_``, which RadiusNeighborsClassifier does not
    # provide, so the function raised AttributeError before returning.
    return final_tresh
# import some data to play with
iris = datasets.load_iris()
# Only the first two features are used so the decision regions can be drawn
# in 2-D; a two-dimensional dataset would avoid this slicing.
X = iris.data[:, :2]
y = iris.target

h = .02  # step size in the mesh

# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

for weights in ['uniform', 'distance']:
    # Create a radius-neighbors classifier and fit the data.
    # ``n_neighbors`` is not a parameter of RadiusNeighborsClassifier (the
    # neighborhood is defined by ``radius``), so it is no longer passed.
    clf = neighbors.RadiusNeighborsClassifier(weights=weights)
    clf.fit(X, y)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    # NOTE(review): the mesh extends one unit beyond the data, so with the
    # default radius some mesh points may have no training sample in range;
    # predict then fails unless ``outlier_label`` is set - confirm.
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)