Example No. 1
def test_RadiusNeighborsClassifier_multioutput():
    """Test k-NN classifier on multioutput data"""
    rng = check_random_state(0)
    n_features = 2
    n_samples = 40
    n_output = 3

    X = rng.rand(n_samples, n_features)
    y = rng.randint(0, 3, (n_samples, n_output))

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    weights = [None, 'uniform', 'distance', _weight_func]

    for algorithm, weights in product(ALGORITHMS, weights):
        # Stack single output prediction
        y_pred_so = []
        for o in range(n_output):
            rnn = neighbors.RadiusNeighborsClassifier(weights=weights,
                                                      algorithm=algorithm)
            rnn.fit(X_train, y_train[:, o])
            y_pred_so.append(rnn.predict(X_test))

        y_pred_so = np.vstack(y_pred_so).T
        assert_equal(y_pred_so.shape, y_test.shape)

        # Multioutput prediction
        rnn_mo = neighbors.RadiusNeighborsClassifier(weights=weights,
                                                     algorithm=algorithm)
        rnn_mo.fit(X_train, y_train)
        y_pred_mo = rnn_mo.predict(X_test)

        assert_equal(y_pred_mo.shape, y_test.shape)
        assert_array_almost_equal(y_pred_mo, y_pred_so)
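Note: Example No. 1 comes from scikit-learn's own test suite and assumes helpers defined there (ALGORITHMS, _weight_func, plus the usual imports). A minimal stand-in, under the assumption that these approximate the originals:

import numpy as np
from itertools import product
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.utils import check_random_state
# assert_equal / assert_array_almost_equal came from sklearn's (old)
# testing utilities; plain numpy.testing asserts work the same way.

# Approximations of the sklearn test-suite helpers, not the exact originals.
ALGORITHMS = ('ball_tree', 'brute', 'kd_tree', 'auto')

def _weight_func(dist):
    # inverse-squared-distance weighting, ignoring divide-by-zero warnings
    with np.errstate(divide='ignore'):
        retval = 1.0 / dist
    return retval ** 2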
Example No. 2
def get_classifier(classifier_str):
    '''
    This function maps the classifier string classifier_str to the
    corresponding classifier object with the default parameters set.
    '''

    # SVC
    if (classifier_str == 'linearsvc'):
        cl = svm.LinearSVC(**svm_default_param)
    elif (classifier_str == 'svc_linear'):
        libsvm_default_param['kernel'] = 'linear'
        cl = svm.SVC(**libsvm_default_param)
    elif (classifier_str == 'svc_rbf'):
        libsvm_default_param['kernel'] = 'rbf'
        cl = svm.SVC(**libsvm_default_param)
    # polynomial, sigmoid kernel
    # nuSVC
    # Nearest Neighbors (Euclidean distance used by default)
    elif (classifier_str == 'kn_uniform'):
        kn_default_param['weights'] = 'uniform'
        cl = neighbors.KNeighborsClassifier(**kn_default_param)
    elif (classifier_str == 'kn_distance'):
        kn_default_param['weights'] = 'distance'
        cl = neighbors.KNeighborsClassifier(**kn_default_param)
    elif (classifier_str == 'rn_uniform'):
        rn_default_param['weights'] = 'uniform'
        cl = neighbors.RadiusNeighborsClassifier(**rn_default_param)
    elif (classifier_str == 'rn_distance'):
        rn_default_param['weights'] = 'distance'
        cl = neighbors.RadiusNeighborsClassifier(**rn_default_param)
    elif (classifier_str == 'nc'):
        cl = neighbors.NearestCentroid()
    # LDA and QDA, priors are by default set to 1/len(class) for each class
    elif (classifier_str == 'lda'):
        cl = lda.LDA()
    elif (classifier_str == 'qda'):
        cl = qda.QDA()
    # Gaussian naive Bayes
    # from the code it is unclear how priors are set
    elif (classifier_str == 'gnb'):
        cl = naive_bayes.GaussianNB()
    elif (classifier_str == 'mnb'):
        cl = naive_bayes.MultinomialNB()
    elif (classifier_str == 'bnb'):
        cl = naive_bayes.BernoulliNB()
    # Decision tree
    elif (classifier_str == 'dtree'):
        cl = tree.DecisionTreeClassifier()
    elif (classifier_str == 'rforest'):
        cl = ensemble.RandomForestClassifier()
    else:
        # raise error if classifier not found
        raise ValueError('Classifier not implemented: %s' % (classifier_str))

    return cl
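A brief usage sketch for get_classifier; the *_default_param dictionaries are assumed to be module-level defaults defined alongside it, and X_train, y_train, X_test are hypothetical arrays:

# Hypothetical usage: 'rn_uniform' selects RadiusNeighborsClassifier
# with uniform weights and the module's rn_default_param defaults.
cl = get_classifier('rn_uniform')
cl.fit(X_train, y_train)
y_pred = cl.predict(X_test)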
Example No. 3
    def _init_model(self, parms=None):
        """Set ML model"""

        from sklearn import neighbors
        if self.args.radius == 0:
            if parms is not None:
                model = neighbors.KNeighborsClassifier(**parms)
            else:
                model = neighbors.KNeighborsClassifier()

        else:
            if parms is not None:
                model = neighbors.RadiusNeighborsClassifier(**parms)
            else:
                model = neighbors.RadiusNeighborsClassifier()
        return 'Neighbors', model
Example No. 4
def test_radius_neighbors_classifier_outlier_labeling():
    # Test radius-based classifier when no neighbors found and outliers
    # are labeled.

    X = np.array([[1.0, 1.0], [2.0, 2.0], [0.99, 0.99], [0.98, 0.98],
                  [2.01, 2.01]])
    y = np.array([1, 2, 1, 1, 2])
    radius = 0.1

    z1 = np.array([[1.01, 1.01], [2.01, 2.01]])  # no outliers
    z2 = np.array([[1.4, 1.4], [1.01, 1.01], [2.01, 2.01]])  # one outlier
    correct_labels1 = np.array([1, 2])
    correct_labels2 = np.array([-1, 1, 2])

    weight_func = _weight_func

    for algorithm in ALGORITHMS:
        for weights in ['uniform', 'distance', weight_func]:
            clf = neighbors.RadiusNeighborsClassifier(radius=radius,
                                                      weights=weights,
                                                      algorithm=algorithm,
                                                      outlier_label=-1)
            clf.fit(X, y)
            assert_array_equal(correct_labels1, clf.predict(z1))
            assert_array_equal(correct_labels2, clf.predict(z2))
Example No. 5
def test_radius_neighbors_classifier(n_samples=40,
                                     n_features=5,
                                     n_test_pts=10,
                                     radius=0.5,
                                     random_state=0):
    """Test radius-based classification"""
    rng = np.random.RandomState(random_state)
    X = 2 * rng.rand(n_samples, n_features) - 1
    y = ((X ** 2).sum(axis=1) < .5).astype(int)  # np.int was removed from modern NumPy
    y_str = y.astype(str)

    weight_func = _weight_func

    for algorithm in ALGORITHMS:
        for weights in ['uniform', 'distance', weight_func]:
            neigh = neighbors.RadiusNeighborsClassifier(radius=radius,
                                                        weights=weights,
                                                        algorithm=algorithm)
            neigh.fit(X, y)
            epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1)
            y_pred = neigh.predict(X[:n_test_pts] + epsilon)
            assert_array_equal(y_pred, y[:n_test_pts])
            neigh.fit(X, y_str)
            y_pred = neigh.predict(X[:n_test_pts] + epsilon)
            assert_array_equal(y_pred, y_str[:n_test_pts])
Example No. 6
def KNNski(x, r, X, Y, W='uniform'):
    # we create an instance of Neighbours Classifier and fit the data.
    clf = neighbors.RadiusNeighborsClassifier(radius=r,
                                              weights=W,
                                              metric='seuclidean')
    clf.fit(X, Y)
    x = np.atleast_2d(x)
    return clf.predict(x)
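One caveat for Example No. 6: on recent scikit-learn/SciPy versions the 'seuclidean' metric generally needs its per-feature variance vector V supplied explicitly. A hedged variant of the call above:

# Sketch: pass the variances that 'seuclidean' normalizes by; without V,
# newer versions may raise instead of inferring it from the data.
clf = neighbors.RadiusNeighborsClassifier(radius=r,
                                          weights=W,
                                          metric='seuclidean',
                                          metric_params={'V': np.var(X, axis=0)})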
Example No. 7
    def _fit(cls, dataset, **hp):
        classifier = neighbors.RadiusNeighborsClassifier(outlier_label=0,
                                                         radius=hp['radius'])

        # Train the model using the training set
        classifier.fit(dataset.train_data, dataset.train_target)

        # Return score on test set
        return cls._score(classifier.predict(dataset.test_data),
                          dataset.test_target)
Example No. 8
def EvalKNNRadius(XT, YT, p=0.25, weights='uniform'):
    xtrain, xtest, ytrain, ytest = V.train_test_split(XT,
                                                      YT,
                                                      test_size=p,
                                                      random_state=0)
    knn = neighbors.RadiusNeighborsClassifier(radius=30, weights=weights)
    print "Learning KNN model with " + str(weights) + " weights "
    kmodel = knn.fit(xtrain, ytrain)
    y_scores = kmodel.predict(xtest)
    print "k-radius Train error: " + str(kmodel.score(xtrain, ytrain))
    print "k-radius Test error: " + str(kmodel.score(xtest, ytest))
    eval_regres(ytest, y_scores)
Example No. 9
def radNearestNeighborsGridSearch(X, y):
    param_grid = [{
        'radius': np.linspace(0.1, 2, 20),
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'kd_tree'],
        'outlier_label': [-1]
    }]
    grid_search = skgs.GridSearchCV(skn.RadiusNeighborsClassifier(),
                                    param_grid,
                                    cv=5)
    grid_search.fit(X, y)
    print('Best Score of Grid Search: ' + str(grid_search.best_score_))
    print('Best Params of Grid Search: ' + str(grid_search.best_params_))
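GridSearchCV refits the winning configuration on the full data by default, so the tuned model can be reused directly; a hedged follow-up, assuming grid_search is kept in scope or returned and X_new is a hypothetical query set:

# Sketch: best_estimator_ is the RadiusNeighborsClassifier refit with the
# best parameters on all of (X, y).
best_clf = grid_search.best_estimator_
y_new = best_clf.predict(X_new)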
Example No. 10
    def rnc_tuning(Train, Test, sampling=None, scores='f1', label='FRAUDE'):

        Train = pd.concat([Train, Test], axis=0, ignore_index=True)
        yTrain = Train[[label]]
        xTrain = Train
        del xTrain[label]


        if sampling is None:
            pass
        elif sampling == 'ALLKNN':
            xTrain, yTrain = under_sampling(xTrain, yTrain)
        else:
            xTrain, yTrain = over_sampling(xTrain, yTrain, model=sampling)

        tuned_parameters = [{'radius': list(numpy.arange(1, 101, 1)),
                             'weights': ['distance'], 'algorithm': ['auto'],
                             'leaf_size': list(numpy.arange(30, 301, 30)),
                             'p': [2],
                             'outlier_label': [-1]
                             }]

        fileModel = GridSearchCV(neighbors.RadiusNeighborsClassifier(), param_grid=tuned_parameters, cv=10,
                                 scoring='%s_macro' % scores)


        fileModel.fit(xTrain.drop(['id_siniestro'], axis=1).values, yTrain[label].values)

        print("Best parameters set found on development set:")
        print()
        dict_values = fileModel.best_params_
        print(dict_values)
        print()
        print("Grid scores on development set:")
        print()
        means = fileModel.cv_results_['mean_test_score']
        stds = fileModel.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, fileModel.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))

        radius = int(dict_values['radius'])
        leaf_size = int(dict_values['leaf_size'])

        df = pd.DataFrame.from_dict(dict_values, orient="index")
        df.to_csv('final_files\\sup_rnc.csv', sep=';', encoding='latin1', index=False)

        return radius, leaf_size, sampling
Example No. 11
def test_radius_neighbors_classifier_zero_distance():
    """ Test radius-based classifier, when distance to a sample is zero. """

    X = np.array([[1.0, 1.0], [2.0, 2.0]])
    y = np.array([1, 2])
    radius = 0.1

    z1 = np.array([[1.01, 1.01], [2.0, 2.0]])
    correct_labels1 = np.array([1, 2])

    weight_func = _weight_func

    for algorithm in ALGORITHMS:
        for weights in ['uniform', 'distance', weight_func]:
            clf = neighbors.RadiusNeighborsClassifier(radius=radius,
                                                      weights=weights,
                                                      algorithm=algorithm)
            clf.fit(X, y)
            assert_array_equal(correct_labels1, clf.predict(z1))
Example No. 12
def test_radius_neighbors_classifier_when_no_neighbors():
    """ Test radius-based classifier when no neighbors found.
    In this case it should rise an informative exception """

    X = np.array([[1.0, 1.0], [2.0, 2.0]])
    y = np.array([1, 2])
    radius = 0.1

    z1 = np.array([[1.01, 1.01], [2.01, 2.01]])  # no outliers
    z2 = np.array([[1.01, 1.01], [1.4, 1.4]])  # one outlier

    weight_func = _weight_func

    for algorithm in ALGORITHMS:
        for weights in ['uniform', 'distance', weight_func]:
            clf = neighbors.RadiusNeighborsClassifier(radius=radius,
                                                      weights=weights,
                                                      algorithm=algorithm)
            clf.fit(X, y)
            clf.predict(z1)
            assert_raises(ValueError, clf.predict, z2)
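Note that this ValueError is tied to outlier_label being left at its default of None; giving queries with empty neighborhoods an explicit label (as Example No. 4 does) makes them predictable instead. A minimal sketch:

# Sketch: with an explicit outlier_label, the isolated query in z2 is
# labeled -1 rather than raising a ValueError.
clf = neighbors.RadiusNeighborsClassifier(radius=radius, outlier_label=-1)
clf.fit(X, y)
print(clf.predict(z2))  # the no-neighbor query comes back as -1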
Example No. 13
def knnRadiusTuning():
    train = getTrainingData('train.csv', visualize=False)
    X = train.drop(['Exited'], axis=1)
    sc = StandardScaler()
    X = sc.fit_transform(X)
    y = train.Exited
    # hold out 30% of the training data for testing
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)
    params = {
        "radius": list(np.arange(5.0, 8.0, 0.1)),
        "weights": ['uniform', 'distance']
    }
    model = neighbors.RadiusNeighborsClassifier()
    grid_search_cv = GridSearchCV(model,
                                  params,
                                  verbose=1,
                                  n_jobs=-1,
                                  cv=3,
                                  scoring='accuracy')
    # print(grid_search_cv.best_params_)
    grid_search_cv.fit(X_train, y_train)
    print_score(grid_search_cv, X_train, y_train, X_test, y_test, train=True)
    print_score(grid_search_cv, X_train, y_train, X_test, y_test, train=False)
    ROC(grid_search_cv, X_train, y_train, X_test, y_test, train=True)
    ROC(grid_search_cv, X_train, y_train, X_test, y_test, train=False)
    results = pd.DataFrame(grid_search_cv.cv_results_)
    printFullRow(results[results['rank_test_score'] == 1])
    # best params from the earlier k-NN tuning: n_neighbors == 11/13, p == 2, weights == 'distance'
    return


# knnRadiusTuning()
# knnTuning()
Example No. 14
def radius_neighbours(X_train, Y_train, X_test, Y_test):
    
    try:
        a = time.time()
        radius = 0.2  # Come back to this - need a rigorous approach
        clf = neighbors.RadiusNeighborsClassifier(radius, weights='distance')
        clf.fit(X_train, Y_train)
        preds = clf.predict(X_test)

        print()
        print('Radius nearest neighbours')
        print('Time taken', time.time() - a, 's')
        print('Accuracy', sum(preds == Y_test) / float(len(preds)))
        mismatched = preds[preds != Y_test]
        print('False Ia detection', sum(mismatched == 1) / float(sum(preds == 1)))

        probs = clf.predict_proba(X_test)
        # fpr, tpr, auc = roc(probs, Y_test)
    except ValueError:
        print('ValueError in RNN - probably due to no neighbours within radius')
        fpr, tpr, auc, probs, Y_test = (None, None, None, -9999, None)

    # return fpr, tpr, auc
    return probs
Example No. 15
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max,
                                                           h))  # build a 2-D mesh grid
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])  # colors for the decision regions
cmap_bold = ListedColormap(['#FF0000', '#003300', '#0000FF'])  # colors for points of each class
# plot the predicted class regions in the plane
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
# also plot all the training data
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

plt.show()
clf1 = neighbors.RadiusNeighborsClassifier(radius=10.0, weights='distance')
clf1.fit(X, y)
Z1 = clf1.predict(np.c_[xx.ravel(), yy.ravel()])
Z1 = Z1.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z1, cmap=cmap_light)
# also plot all the training data
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

plt.show()

# from the figures: the smaller the sparsity coefficient, the better the representation result
Example No. 16
def classify(dataSet, labels):
    # obtain the radius-based k-NN classifier
    knn = neighbors.RadiusNeighborsClassifier(radius=100.0)
    knn.fit(dataSet, labels)
    return knn
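A hedged usage sketch for classify; dataSet, labels, and new_samples are hypothetical arrays:

# Hypothetical usage of the helper above; with radius=100.0 every query
# point must have at least one training sample within that distance.
knn = classify(dataSet, labels)
predictions = knn.predict(new_samples)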
Example No. 17
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# build a mesh grid to make plotting easier
h = .02
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# we create an instance of Neighbours Classifier and fit the data.
# train the RadiusNN classifier
f = 5.0
# note: RadiusNeighborsClassifier does not accept n_neighbors
clf = neighbors.RadiusNeighborsClassifier(radius=f,
                                          weights='distance')
clf.fit(x_train, y_train)
# print the test results
answer = clf.predict(X)
# print ('X :', X)
print('answer :', answer.shape)
print('y :', y.shape)
print('avg:', np.mean(answer == y))
# precision and recall
# precision_recall_curve is used for binary classification problems
# precision, recall, thresholds = precision_recall_curve(y_train, clf.predict(x_train))
# answer = clf.predict_proba(X)[:,1]

print(classification_report(y, answer, target_names=['0', '1', '2']))
# color the classification results over the whole test space
Example No. 18
train_data = np.concatenate((data[0:10, ], data[50:60, ], data[100:110, ]),
                            axis=0)
train_label = np.concatenate(
    (labels[0:10, ], labels[50:60, ], labels[100:110, ]), axis=0)

# build the test data by vertically concatenating slices
test_data = np.concatenate((data[10:50, ], data[60:100, ], data[110:150, ]),
                           axis=0)
test_label_expected = np.concatenate(
    (labels[10:50, ], labels[60:100, ], labels[110:150, ]), axis=0)

# analyze the data
# create the classifier
#classifier = nb.KNeighborsClassifier(n_neighbors=3,weights='uniform',algorithm='auto')
# note: RadiusNeighborsClassifier takes a radius, not n_neighbors;
# radius=1.0 below is an assumed placeholder value
classifier = nb.RadiusNeighborsClassifier(radius=1.0,
                                          weights='uniform',
                                          algorithm='auto')

# train the classifier
classifier.fit(train_data, train_label)

# predict
test_label_predicted = classifier.predict(test_data)

# compare the results
size = len(test_label_predicted)
outer = np.zeros((size), dtype=int)
for i in range(size):
    if test_label_expected[i] != test_label_predicted[i]:
        outer[i] = 1
result = np.vstack((test_label_expected, test_label_predicted, outer))
Example No. 19
    def rnc_threshold(Train, Valid, Test, radius, leaf_size,
                      sampling=None, label='FRAUDE', beta=2):

        # With beta = 2, recall is weighted more heavily than precision
        # (beta = 1 would weight them equally)

        yTrain = Train[label]
        xTrain = Train
        del xTrain[label]

        names = Train.columns.values.tolist()
        fileNames = numpy.array(names)

        if sampling is None:
            pass
        elif sampling == 'ALLKNN':
            xTrain, yTrain = under_sampling(xTrain, yTrain)
        else:
            xTrain, yTrain = over_sampling(xTrain, yTrain, model=sampling)

        min_sample_leaf = round((len(xTrain.index)) * 0.005)  # note: unused below

        fileModel = neighbors.RadiusNeighborsClassifier(radius=radius,
                                                        weights='distance',
                                                        algorithm='auto',
                                                        leaf_size=leaf_size,
                                                        p=2,
                                                        outlier_label=-1)

        fileModel.fit(xTrain.values, yTrain.values)

        print(np.median(fileModel.predict_proba(Valid[Valid[label] == 0].drop(label, axis=1).values)))
        print(np.median(fileModel.predict_proba(Valid[Valid[label] == 1].drop(label, axis=1).values)))

        thresholds = np.linspace(0.1, 1.0, 200)

        scores = []

        y_pred_score = fileModel.predict_proba(Valid.drop(label, axis=1).values)
        y_pred_score = np.delete(y_pred_score, 0, axis=1)

        print('min', y_pred_score.min())
        print('max', y_pred_score.max())

        for threshold in thresholds:
            y_hat = (y_pred_score > threshold).astype(int)
            y_hat = y_hat.tolist()
            y_hat = [item for sublist in y_hat for item in sublist]

            scores.append([
                recall_score(y_pred=y_hat, y_true=Valid[label].values),
                precision_score(y_pred=y_hat, y_true=Valid[label].values),
                fbeta_score(y_pred=y_hat, y_true=Valid[label].values,
                            beta=beta)])

        scores = np.array(scores)
        print('max_scores', scores[:, 2].max(), scores[:, 2].argmax())

        plot.plot(thresholds, scores[:, 0], label='$Recall$')
        plot.plot(thresholds, scores[:, 1], label='$Precision$')
        plot.plot(thresholds, scores[:, 2], label='$F_2$')
        plot.ylabel('Score')
        plot.xlabel('Threshold')
        plot.legend(loc='best')
        plot.show()

        final_thresh = thresholds[scores[:, 2].argmax()]

        y_hat_test = fileModel.predict_proba(Test.drop(label, axis=1).values)
        y_hat_test = np.delete(y_hat_test, 0, axis=1)

        y_hat_test = (y_hat_test > final_thresh).astype(int)
        y_hat_test = y_hat_test.tolist()
        y_hat_test = [item for sublist in y_hat_test for item in sublist]

        print('Final threshold: %.3f' % final_thresh)
        print('Test Recall Score: %.3f' % recall_score(y_pred=y_hat_test, y_true=Test[label].values))
        print('Test Precision Score: %.3f' % precision_score(y_pred=y_hat_test, y_true=Test[label].values))
        print('Test F2 Score: %.3f' % fbeta_score(y_pred=y_hat_test, y_true=Test[label].values, beta=beta))

        cnf_matrix = confusion_matrix(Test[label].values, y_hat_test)
        plot_confusion_matrix(cnf_matrix, classes=['Normal', 'Anormal'], title='Confusion matrix')

        # RadiusNeighborsClassifier exposes no feature_importances_; only
        # plot variable importance when the model provides it (tree models do)
        if hasattr(fileModel, 'feature_importances_'):
            featureImportance = fileModel.feature_importances_
            featureImportance = featureImportance / featureImportance.max()

            sorted_idx = numpy.argsort(featureImportance)
            barPos = numpy.arange(sorted_idx.shape[0]) + 0.5
            plot.barh(barPos, featureImportance[sorted_idx], align='center')
            plot.yticks(barPos, fileNames[sorted_idx])
            plot.xlabel('Variable Importance')
            plot.show()

        return final_thresh
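A side note on the probability handling above: predict_proba followed by np.delete(..., 0, axis=1) keeps the positive-class column; a more idiomatic equivalent, assuming classes_ is [0, 1]:

# Sketch: column 1 of predict_proba is P(class == 1) when classes_ == [0, 1].
y_pred_score = fileModel.predict_proba(Valid.drop(label, axis=1).values)[:, 1]
y_hat = (y_pred_score > final_thresh).astype(int).tolist()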
Example No. 20
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features; we could
                      # avoid this ugly slicing by using a two-dim dataset
y = iris.target

h = .02  # step size in the mesh

# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

for weights in ['uniform', 'distance']:
    # we create an instance of Neighbours Classifier and fit the data.
    # note: RadiusNeighborsClassifier takes a radius, not n_neighbors;
    # radius=1.0 below is an assumed placeholder value
    clf = neighbors.RadiusNeighborsClassifier(radius=1.0,
                                              weights=weights)
    clf.fit(X, y)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)