def run(tr, ts):
    """Build cluster-based location features for the train/test frames.

    Arguments:
        tr: training DataFrame with 'lat', 'lon' and revenue-class column 'y'
            (integer classes 1..5 — TODO confirm against caller).
        ts: test DataFrame with 'lat' and 'lon' columns.

    Returns:
        (train_features, test_features) — two DataFrames with identical
        columns: one-hot cluster membership (cluster 0 reserved for
        outliers), distance to each of the k_all global cluster centres,
        and per-revenue-class minimum cluster distance.
    """
    # .as_matrix() was removed in pandas 1.0 — .to_numpy() is the supported
    # equivalent and returns the same ndarray.
    Xtr = tr[['lat', 'lon']].to_numpy()
    Xts = ts[['lat', 'lon']].to_numpy()

    print('check outliers...')
    # Outlier test: mean distance to nearest neighbours above 0.02 (in the
    # lat/lon unit of the data) marks a point as an outlier.
    m = NearestNeighbors(n_neighbors=10).fit(Xtr)
    dtr, _ = m.kneighbors(Xtr)
    # For fitted points the closest "neighbour" is the point itself — drop it.
    dtr = np.mean(dtr[:, 1:], 1)
    dts, _ = m.kneighbors(Xts)
    # Test points are not in the fitted set, so there is no self-match; keep
    # the first 9 neighbours so both sides average the same count.
    dts = np.mean(dts[:, :-1], 1)
    tr_inliers = dtr < 0.02
    ts_inliers = dts < 0.02

    print('clustering all points...')
    k_all = 10
    m = KMeans(n_clusters=k_all)
    _Ctr = m.fit_predict(Xtr[tr_inliers])
    _Cts = m.predict(Xts[ts_inliers])
    # Shift labels by one so that label 0 is reserved for outliers.
    _Ctr += 1
    Ctr = np.zeros(len(Xtr), int)
    Ctr[tr_inliers] = _Ctr
    _Cts += 1
    Cts = np.zeros(len(Xts), int)
    Cts[ts_inliers] = _Cts
    Dtr = m.transform(Xtr)
    Dts = m.transform(Xts)

    # One-hot encode the labels. Broadcast comparison replaces the original
    # O(n*k) Python double loop with a single vectorized expression.
    Ctr = (Ctr[:, None] == np.arange(k_all + 1)).astype(int)
    Cts = (Cts[:, None] == np.arange(k_all + 1)).astype(int)
    Xtr_ = np.c_[Ctr, Dtr]
    Xts_ = np.c_[Cts, Dts]

    print('clustering across revenue classes...')
    k_across = 3
    y = tr[['y']].to_numpy()[:, 0]
    Dtrs = []
    Dtss = []
    for klass in range(1, 6):
        # Cluster only the inlier training points of this revenue class,
        # then record every point's distance to its closest class centre.
        # (A no-op statement `Xtr[y == klass]` from the original was removed.)
        m = KMeans(n_clusters=k_across)
        m.fit(Xtr[np.logical_and(tr_inliers, y == klass)])
        Dtrs.append(np.amin(m.transform(Xtr), 1))
        Dtss.append(np.amin(m.transform(Xts), 1))
    Dtrs = np.asarray(Dtrs).T
    Dtss = np.asarray(Dtss).T
    Xtr_ = np.c_[Xtr_, Dtrs]
    Xts_ = np.c_[Xts_, Dtss]

    names = ['cluster-%d' % i for i in range(k_all + 1)] + \
            ['cluster-dist-%d' % i for i in range(k_all)] + \
            ['cluster-class-dist-%d' % i for i in range(1, 6)]
    return pd.DataFrame(Xtr_, columns=names), pd.DataFrame(Xts_, columns=names)
def fast_knn(X, n_clusters=5, n_neighbors=None, graph_mode='distance',
             cluster_mode='spectral', algorithm='brute', n_jobs=1,
             random_state=1234, force_sklearn=False):
    r"""Fit a NearestNeighbors model (cuML when available, else sklearn)
    and attach clustering/transform helpers to the fitted estimator.

    Arguments:
      X : `ndarray` or tuple of (X, y)
      n_clusters : int (default = 5)
        Number of clusters used by the attached clustering helpers; also
        the fallback value for `n_neighbors`.
      n_neighbors : int, optional (default = `n_clusters`)
        The top K closest datapoints you want the algorithm to return.
        Currently, this value must be < 1024.
      graph_mode : {'distance', 'connectivity'}, default='distance'
        This mode decides which values `kneighbors_graph` will return:
          - 'connectivity' : will return the connectivity matrix with ones
            and zeros (for 'SpectralClustering').
          - 'distance' : will return the distances between neighbors
            according to the given metric (for 'DBSCAN').
      cluster_mode : {'dbscan', 'spectral', 'isomap', 'kmeans', 'vote'},
        default='spectral'
        This mode decides how to generate cluster prediction from the
        neighbors graph:
          - 'dbscan' :
          - 'spectral' :
          - 'isomap' :
          - 'kmeans' :
      algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors:
          - 'ball_tree' will use :class:`BallTree`
          - 'kd_tree' will use :class:`KDTree`
          - 'brute' will use a brute-force search.
          - 'auto' will attempt to decide the most appropriate algorithm
            based on the values passed to :meth:`fit` method.
        Note: fitting on sparse input will override the setting of this
        parameter, using brute force.
    """
    # MUST be the first statement: locals() here contains exactly the
    # function parameters, which become the estimator's kwargs.
    kwargs = dict(locals())
    X = kwargs.pop('X')
    force_sklearn = kwargs.pop('force_sklearn')
    random_state = kwargs.pop('random_state')
    n_clusters = int(kwargs.pop('n_clusters'))
    if n_neighbors is None:
        # Default the neighbourhood size to the number of clusters.
        kwargs['n_neighbors'] = n_clusters
        n_neighbors = n_clusters
    ## graph mode -- validate with an exception, not `assert`, so the check
    ## survives `python -O`.
    graph_mode = str(kwargs.pop('graph_mode')).strip().lower()
    if graph_mode not in ('distance', 'connectivity'):
        raise ValueError(
            "graph_mode must be 'distance' or 'connectivity', got %r" % graph_mode)
    ## cluster mode
    cluster_mode = str(kwargs.pop('cluster_mode')).strip().lower()
    ## fine-tuning the kwargs for the chosen backend
    use_cuml = _check_cuml(force_sklearn)
    if use_cuml:
        from cuml.neighbors import NearestNeighbors
        # cuML uses n_gpus instead of n_jobs and picks its own algorithm.
        kwargs['n_gpus'] = kwargs['n_jobs']
        kwargs.pop('n_jobs')
        kwargs.pop('algorithm')
    else:
        from sklearn.neighbors import NearestNeighbors
    ## fitting
    knn = NearestNeighbors(**kwargs)
    knn.fit(X)
    # NOTE(review): id(X) may be reused after X is garbage-collected --
    # this identity tag is only valid while the caller keeps X alive.
    knn._fitid = id(X)
    ## Transform mode: stash configuration for the attached helpers.
    knn._random_state = random_state
    knn._n_clusters = n_clusters
    knn._graph_mode = graph_mode
    knn._cluster_mode = cluster_mode
    if use_cuml:
        # sklearn sets this attribute during fit; mirror it for cuML so the
        # helper methods can rely on it.
        knn.n_samples_fit_ = X.shape[0]
        knn.kneighbors_graph = types.MethodType(nn_kneighbors_graph, knn)
    knn.transform = types.MethodType(nn_transform, knn)
    knn.fit_transform = types.MethodType(nn_fit_transform, knn)
    knn.predict = types.MethodType(nn_predict, knn)
    return knn