def _nearestneighbors(*,
                      train,
                      test,
                      x_predict=None,
                      metrics,
                      n_neighbors=5,
                      algorithm='auto',
                      leaf_size=30,
                      metric='minkowski',
                      p=2,
                      metric_params=None,
                      n_jobs=None):
    """
    Fits a k-nearest-neighbors classifier and scores it with the chosen metric.

    Note: sklearn's NearestNeighbors class is unsupervised and has no
    predict() method, so the supervised KNeighborsClassifier is used here
    (it also takes no `radius` parameter). For more info visit:
    https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
    """

    model = KNeighborsClassifier(n_neighbors=n_neighbors,
                                 algorithm=algorithm,
                                 leaf_size=leaf_size,
                                 metric=metric,
                                 p=p,
                                 metric_params=metric_params,
                                 n_jobs=n_jobs)
    model.fit(train[0], train[1])
    model_name = 'Nearest Neighbors'
    y_hat = model.predict(test[0])

    if metrics == 'accuracy':
        accuracy = accuracy_score(test[1], y_hat)
    elif metrics == 'f1':
        accuracy = f1_score(test[1], y_hat)
    elif metrics == 'jaccard':
        accuracy = jaccard_score(test[1], y_hat)
    else:
        raise ValueError("metrics must be 'accuracy', 'f1' or 'jaccard'")

    if x_predict is None:
        return (model_name, accuracy, None)

    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
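A minimal usage sketch for the wrapper above. It assumes load_iris and train_test_split are imported from sklearn alongside the metric functions the wrapper uses; the (X, y) tuple layout for `train` and `test` follows the indexing in the function body and is not a library convention.

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
name, acc, _ = _nearestneighbors(train=(X_train, y_train),
                                 test=(X_test, y_test),
                                 metrics='accuracy')
print(name, acc)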
Example #2
def wine_cross():
    wine = datasets.load_wine()
    x = wine.data
    y = wine.target
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=42)

    # The unsupervised NearestNeighbors class cannot classify, and comparing
    # KMeans cluster ids against class labels is not an accuracy measure;
    # KNeighborsClassifier makes the accuracy_score below meaningful.
    clf = KNeighborsClassifier(n_neighbors=5)
    clf.fit(x_train, y_train)

    y_pred = clf.predict(x_test)
    print('accuracy: ', accuracy_score(y_test, y_pred))
Example #3
from sklearn.neighbors import LocalOutlierFactor

def fixed_outlier_detector_by_LOF(feature, outlier_fraction):
	"""
	Takes training data `feature` and returns the indices of the outliers
	in it, assuming a contamination (outlier) fraction of `outlier_fraction`.
	"""
	# NearestNeighbors takes no `contamination` argument and has no predict();
	# LocalOutlierFactor is the LOF estimator the function name refers to.
	model = LocalOutlierFactor(contamination=outlier_fraction)
	y_predict = model.fit_predict(feature)  # -1 marks outliers, 1 marks inliers
	return [i for i, y in enumerate(y_predict) if y == -1]
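A short usage sketch for the detector above, assuming numpy is imported as np; the shifted points are purely illustrative:

rng = np.random.RandomState(0)
X = rng.randn(100, 2)
X[:3] += 6  # move three points far from the cloud so they become outliers
print(fixed_outlier_detector_by_LOF(X, outlier_fraction=0.03))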
Example #4
    def predict(self):
        """
        Trains a classifier from the scikit-learn machine learning library
        (https://scikit-learn.org), passes the trained model the feature set,
        and obtains the predicted y values from it.

        It then compares the predicted values against the y_test values
        passed in and returns the accuracy.
        """
        # KNeighborsClassifier, not the unsupervised NearestNeighbors,
        # supports fit(X, y) followed by predict().
        algorithm = KNeighborsClassifier(n_neighbors=2)
        algorithm.fit(self.X_train, self.y_train)
        y_pred = list(algorithm.predict(self.X_test))
        self.acc = OneHotPredictor.get_accuracy(y_pred, self.y_test)
        return self.acc
Example #5
    def knn(self, partitions, predictors, outcome):
        # Sweep k to find the optimal number of neighbors on the
        # validation partition. (The unused NearestNeighbors warm-up
        # model and test individual from the original were dead code.)
        results = []
        for k in range(1, 40):
            knn = KNeighborsClassifier(n_neighbors=k).fit(
                partitions['train_X'][predictors], partitions['train_y'])
            results.append({
                'k': k,
                # the validation data lives in `partitions`, not `predictors`
                'accuracy': accuracy_score(
                    partitions['valid_y'],
                    knn.predict(partitions['valid_X'][predictors])),
            })
        return results
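Picking the best k from the accuracy sweep above is then one line; a sketch, assuming `results` is the list the method returns:

best = max(results, key=lambda r: r['accuracy'])
print('best k:', best['k'], 'validation accuracy:', best['accuracy'])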
Example #6
# K-NN
import numpy as np

from sklearn import datasets
from sklearn.model_selection import train_test_split

irisDataset = datasets.load_iris()

irisFeatures = irisDataset.data
irisTarget = irisDataset.target

xTrain, xTest, yTrain, yTest = train_test_split(irisFeatures,
                                                irisTarget,
                                                test_size=0.2)

from sklearn.neighbors import KNeighborsClassifier

# 'ballTree' is not a valid algorithm name, and NearestNeighbors cannot
# classify; use KNeighborsClassifier with algorithm='ball_tree'.
knn = KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree')
knn.fit(xTrain, yTrain)
yPred = knn.predict(xTest)

from sklearn.metrics import confusion_matrix, f1_score

# don't shadow the imported functions with the result variables
cm = confusion_matrix(yTest, yPred)
f1 = f1_score(yTest, yPred, average='weighted')

print("Confusion Matrix: \n", cm)
print("F1-Score: ", f1)
Example #7
bumpy_slow = [
    features_train[ii][1] for ii in range(0, len(features_train))
    if labels_train[ii] == 1
]

#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
plt.scatter(bumpy_slow, grade_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
################################################################################

### your code here!  name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary
from sklearn.neighbors import KNeighborsClassifier
# NearestNeighbors has no predict(); the exercise needs the supervised
# KNeighborsClassifier.
clf = KNeighborsClassifier(n_neighbors=2)
clf = clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

from sklearn.metrics import accuracy_score
acc = accuracy_score(labels_test, pred)
print(acc)

try:
    prettyPicture(clf, features_test, labels_test)
except NameError:
    pass
Example #8
def run(tr, ts):
    Xtr = tr[['lat', 'lon']].to_numpy()  # DataFrame.as_matrix() was removed in pandas 1.0
    Xts = ts[['lat', 'lon']].to_numpy()

    print('check outliers...')
    m = NearestNeighbors(n_neighbors=10).fit(Xtr)

    # mean distance to the nearest neighbors; for the training set the first
    # column is the point itself (distance 0), so it is skipped
    dtr, _ = m.kneighbors(Xtr)
    dtr = np.mean(dtr[:, 1:], 1)

    dts, _ = m.kneighbors(Xts)
    dts = np.mean(dts[:, :-1], 1)

    tr_inliers = dtr < 0.02
    ts_inliers = dts < 0.02

    print('clustering all points...')
    k_all = 10
    m = KMeans(k_all)
    _Ctr = m.fit_predict(Xtr[tr_inliers])
    _Cts = m.predict(Xts[ts_inliers])

    # outliers = cluster 0
    _Ctr += 1
    Ctr = np.zeros(len(Xtr), int)
    Ctr[tr_inliers] = _Ctr

    _Cts += 1
    Cts = np.zeros(len(Xts), int)
    Cts[ts_inliers] = _Cts

    Dtr = m.transform(Xtr)
    Dts = m.transform(Xts)

    # one hot encoding
    Ctr = np.asarray([[int(c == i) for c in Ctr] for i in range(k_all + 1)]).T
    Cts = np.asarray([[int(c == i) for c in Cts] for i in range(k_all + 1)]).T

    Xtr_ = np.c_[Ctr, Dtr]
    Xts_ = np.c_[Cts, Dts]

    print('clustering across revenue classes...')
    k_across = 3
    y = tr['y'].to_numpy()
    Dtrs = []
    Dtss = []
    for klass in range(1, 6):
        m = KMeans(k_across)
        m.fit(Xtr[np.logical_and(tr_inliers, y == klass)])
        Dtrs.append(np.amin(m.transform(Xtr), 1))
        Dtss.append(np.amin(m.transform(Xts), 1))

    Dtrs = np.asarray(Dtrs).T
    Dtss = np.asarray(Dtss).T

    Xtr_ = np.c_[Xtr_, Dtrs]
    Xts_ = np.c_[Xts_, Dtss]

    names = ['cluster-%d' % i for i in range(k_all+1)] + \
        ['cluster-dist-%d' % i for i in range(k_all)] + \
        ['cluster-class-dist-%d' % i for i in range(1, 6)]
    return pd.DataFrame(Xtr_, columns=names), pd.DataFrame(Xts_, columns=names)
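The mean-kNN-distance outlier check used in run() works on any point set; a standalone sketch with an illustrative threshold and synthetic data:

import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
pts = np.r_[rng.randn(200, 2) * 0.01, [[5.0, 5.0]]]  # one far-away point

nn = NearestNeighbors(n_neighbors=10).fit(pts)
dist, _ = nn.kneighbors(pts)
mean_dist = dist[:, 1:].mean(axis=1)  # drop the zero self-distance column
print('flagged outliers:', np.where(mean_dist >= 0.02)[0])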
Example #9
def fast_knn(X,
             n_clusters=5,
             n_neighbors=None,
             graph_mode='distance',
             cluster_mode='spectral',
             algorithm='brute',
             n_jobs=1,
             random_state=1234,
             force_sklearn=False):
    r"""
  Arguments:
    X : `ndarray` or tuple of (X, y)
    n_neighbors: int (default = 5)
      The top K closest datapoints you want the algorithm to return.
      Currently, this value must be < 1024.
    graph_mode : {'distance', 'connectivity'}, default='distance'
      This mode decides which values `kneighbors_graph` will return:
        - 'connectivity' : will return the connectivity matrix with ones and
          zeros (for 'SpectralClustering').
        - 'distance' : will return the distances between neighbors according
          to the given metric (for 'DBSCAN').
    cluster_mode: {'vote', 'spectral', 'isomap'}, default='vote'
        This mode decides how to generate cluster prediction from the
        neighbors graph:
        - 'dbscan' :
        - 'spectral' :
        - 'isomap' :
        - 'kmeans' :
    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors:
        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.
        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.
  """
    kwargs = dict(locals())
    X = kwargs.pop('X')
    force_sklearn = kwargs.pop('force_sklearn')
    random_state = kwargs.pop('random_state')
    n_clusters = int(kwargs.pop('n_clusters'))
    if n_neighbors is None:
        kwargs['n_neighbors'] = n_clusters
        n_neighbors = n_clusters
    ## graph mode
    graph_mode = str(kwargs.pop('graph_mode')).strip().lower()
    assert graph_mode in ('distance', 'connectivity')
    ## cluster mode
    cluster_mode = str(kwargs.pop('cluster_mode')).strip().lower()
    ## fine-tuning the kwargs
    use_cuml = _check_cuml(force_sklearn)
    if use_cuml:
        from cuml.neighbors import NearestNeighbors
        kwargs['n_gpus'] = kwargs['n_jobs']
        kwargs.pop('n_jobs')
        kwargs.pop('algorithm')
    else:
        from sklearn.neighbors import NearestNeighbors
    ## fitting
    knn = NearestNeighbors(**kwargs)
    knn.fit(X)
    knn._fitid = id(X)
    ## Transform mode
    knn._random_state = random_state
    knn._n_clusters = n_clusters
    knn._graph_mode = graph_mode
    knn._cluster_mode = cluster_mode
    if use_cuml:
        knn.n_samples_fit_ = X.shape[0]
    knn.kneighbors_graph = types.MethodType(nn_kneighbors_graph, knn)
    knn.transform = types.MethodType(nn_transform, knn)
    knn.fit_transform = types.MethodType(nn_fit_transform, knn)
    knn.predict = types.MethodType(nn_predict, knn)
    return knn
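A hedged usage sketch for fast_knn: the attached methods come from the nn_* helpers defined alongside it (and from module-level `import types`), so their exact signatures are assumptions here.

import numpy as np

X = np.random.RandomState(1234).randn(500, 8).astype('float32')
knn = fast_knn(X, n_clusters=10, graph_mode='distance', cluster_mode='spectral')
labels = knn.predict(X)  # assumed: the attached nn_predict takes the data and returns cluster ids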
Example #10
# 5. Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

params = {'n_neighbors': range(1, 10)}
mdl = KNeighborsClassifier()
grid = GridSearchCV(mdl, param_grid=params)
grid.fit(X, y)

print('Best parameters:', grid.best_params_)
print('Best score:', grid.best_score_)

mdl = grid.best_estimator_

# 6. Evaluate the model
y_pred = mdl.predict(X)
displayClassifierMetrics(y, y_pred, mdl.classes_)

y_prob = mdl.predict_proba(X)
displayROCurve(y, y_prob, mdl.classes_)

# Related class
# KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto',
#               leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=1, **kwargs)
# n_neighbors: int, optional (default = 5)
# weights: str or callable, optional (default = 'uniform')
#     Weight function used in prediction. Possible values:
#     - 'uniform' : uniform weights; all points in each neighborhood are weighted equally.
#     - 'distance' : weight points by the inverse of their distance, so closer
#       neighbors have a greater influence on the prediction.
#     - [callable] : a user-defined function that accepts an array of distances
#       and returns an array of the same shape containing the weights.
# algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional (default = 'auto')
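A small sketch contrasting the two built-in weighting schemes described above; the dataset and split are only illustrative:

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X, y = load_wine(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

for w in ('uniform', 'distance'):
    clf = KNeighborsClassifier(n_neighbors=5, weights=w).fit(X_tr, y_tr)
    print(w, clf.score(X_te, y_te))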
Example #11

#accuracy

train_X = trainNorm[['zinventorygrowth', 'zpopulationgrowth']]
train_y = trainNorm['yoygtenp']
valid_X = validNorm[['zinventorygrowth', 'zpopulationgrowth']]
valid_y = validNorm['yoygtenp']

# Train a classifier for different values of k
results = []
for k in range(1, 12):
    knn = KNeighborsClassifier(n_neighbors=k).fit(train_X, train_y)
    results.append({
        'k': k,
        'accuracy': accuracy_score(valid_y, knn.predict(valid_X))
    })

# Convert results to a pandas data frame
results = pd.DataFrame(results)
print(results)

# Retrain with full dataset---KNN
retail_X = retailNorm[['zinventorygrowth', 'zpopulationgrowth']]
retail_y = retailNorm['yoygtenp']
knn = KNeighborsClassifier(n_neighbors=4).fit(retail_X, retail_y)
distances, indices = knn.kneighbors(newretailNorm)
print(knn.predict(newretailNorm))
print('Distances', distances)
print('Indices', indices)
print(retailNorm.iloc[indices[0], :])
Example #12
def knn_predictor(x_train, y_train, x_test, y_test):
	# KNeighborsClassifier (not the unsupervised NearestNeighbors)
	# provides fit(X, y), score() and predict()
	clf = KNeighborsClassifier(n_neighbors=5)
	clf.fit(x_train, y_train)
	accuracy = clf.score(x_test, y_test)
	f1 = precision_recall_fscore_support(y_test, clf.predict(x_test), average='weighted')[2]
	print(accuracy, f1)
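A quick usage sketch, assuming KNeighborsClassifier and precision_recall_fscore_support are imported as the function body requires; the dataset choice is illustrative:

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)
knn_predictor(x_train, y_train, x_test, y_test)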