Example #1
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors


def run(tr, ts):
    Xtr = tr[['lat', 'lon']].to_numpy()
    Xts = ts[['lat', 'lon']].to_numpy()

    print('check outliers...')
    m = NearestNeighbors(n_neighbors=10).fit(Xtr)

    # mean distance to the nearest training points; column 0 is the self-distance
    dtr, _ = m.kneighbors(Xtr)
    dtr = np.mean(dtr[:, 1:], 1)

    # test points are not in the index, so keep the closest 9 for comparability
    dts, _ = m.kneighbors(Xts)
    dts = np.mean(dts[:, :-1], 1)

    # points whose mean neighbor distance exceeds 0.02 degrees are treated as outliers
    tr_inliers = dtr < 0.02
    ts_inliers = dts < 0.02

    print('clustering all points...')
    k_all = 10
    m = KMeans(n_clusters=k_all)
    _Ctr = m.fit_predict(Xtr[tr_inliers])
    _Cts = m.predict(Xts[ts_inliers])

    # outliers = cluster 0
    _Ctr += 1
    Ctr = np.zeros(len(Xtr), int)
    Ctr[tr_inliers] = _Ctr

    _Cts += 1
    Cts = np.zeros(len(Xts), int)
    Cts[ts_inliers] = _Cts

    # distance from every point (inliers and outliers) to each of the k_all centers
    Dtr = m.transform(Xtr)
    Dts = m.transform(Xts)

    # one-hot encode the cluster labels (column 0 = outlier cluster)
    Ctr = np.eye(k_all + 1, dtype=int)[Ctr]
    Cts = np.eye(k_all + 1, dtype=int)[Cts]

    Xtr_ = np.c_[Ctr, Dtr]
    Xts_ = np.c_[Cts, Dts]

    print('clustering across revenue classes...')
    k_across = 3
    y = tr['y'].to_numpy()
    Dtrs = []
    Dtss = []
    for klass in range(1, 6):
        # fit k_across centers on the inlier points of this revenue class and
        # record each point's distance to the closest of those centers
        m = KMeans(n_clusters=k_across)
        m.fit(Xtr[np.logical_and(tr_inliers, y == klass)])
        Dtrs.append(np.amin(m.transform(Xtr), 1))
        Dtss.append(np.amin(m.transform(Xts), 1))

    Dtrs = np.asarray(Dtrs).T
    Dtss = np.asarray(Dtss).T

    Xtr_ = np.c_[Xtr_, Dtrs]
    Xts_ = np.c_[Xts_, Dtss]

    names = ['cluster-%d' % i for i in range(k_all+1)] + \
        ['cluster-dist-%d' % i for i in range(k_all)] + \
        ['cluster-class-dist-%d' % i for i in range(1, 6)]
    return pd.DataFrame(Xtr_, columns=names), pd.DataFrame(Xts_, columns=names)
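A minimal, hypothetical driver for run() above, using randomly generated coordinates purely for illustration (the column names 'lat', 'lon', 'y' and the revenue classes 1-5 come from the code itself):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)

# toy train/test frames with the columns run() expects
tr = pd.DataFrame({'lat': rng.normal(40.75, 0.01, 500),
                   'lon': rng.normal(-73.99, 0.01, 500),
                   'y': rng.integers(1, 6, 500)})
ts = pd.DataFrame({'lat': rng.normal(40.75, 0.01, 200),
                   'lon': rng.normal(-73.99, 0.01, 200)})

Ftr, Fts = run(tr, ts)
print(Ftr.shape)  # (500, 26): 11 one-hot cluster columns + 10 distances + 5 class distances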
Example #2
import types


# `_check_cuml` and the `nn_*` functions referenced below are helpers defined
# elsewhere in the same module.
def fast_knn(X,
             n_clusters=5,
             n_neighbors=None,
             graph_mode='distance',
             cluster_mode='spectral',
             algorithm='brute',
             n_jobs=1,
             random_state=1234,
             force_sklearn=False):
    r"""
  Arguments:
    X : `ndarray` or tuple of (X, y)
    n_clusters : int (default = 5)
      Number of clusters; also used as `n_neighbors` when that is not given.
    n_neighbors : int (default = None)
      The top K closest datapoints you want the algorithm to return.
      If None, `n_clusters` is used. Currently, this value must be < 1024.
    graph_mode : {'distance', 'connectivity'}, default='distance'
      This mode decides which values `kneighbors_graph` will return:
        - 'connectivity' : will return the connectivity matrix with ones and
          zeros (for 'SpectralClustering').
        - 'distance' : will return the distances between neighbors according
          to the given metric (for 'DBSCAN').
    cluster_mode : {'dbscan', 'spectral', 'isomap', 'kmeans'}, default='spectral'
        This mode decides how cluster predictions are generated from the
        neighbors graph.
    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors:
        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.
        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.
  """
    kwargs = dict(locals())
    X = kwargs.pop('X')
    force_sklearn = kwargs.pop('force_sklearn')
    random_state = kwargs.pop('random_state')
    n_clusters = int(kwargs.pop('n_clusters'))
    if n_neighbors is None:
        kwargs['n_neighbors'] = n_clusters
        n_neighbors = n_clusters
    ## graph mode
    graph_mode = str(kwargs.pop('graph_mode')).strip().lower()
    assert graph_mode in ('distance', 'connectivity')
    ## cluster mode
    cluster_mode = str(kwargs.pop('cluster_mode')).strip().lower()
    ## fine-tuning the kwargs
    use_cuml = _check_cuml(force_sklearn)
    if use_cuml:
        from cuml.neighbors import NearestNeighbors
        kwargs['n_gpus'] = kwargs['n_jobs']
        kwargs.pop('n_jobs')
        kwargs.pop('algorithm')
    else:
        from sklearn.neighbors import NearestNeighbors
    ## fitting
    knn = NearestNeighbors(**kwargs)
    knn.fit(X)
    knn._fitid = id(X)
    ## stash the clustering settings on the estimator for the attached helpers
    knn._random_state = random_state
    knn._n_clusters = n_clusters
    knn._graph_mode = graph_mode
    knn._cluster_mode = cluster_mode
    if use_cuml:
        knn.n_samples_fit_ = X.shape[0]
    ## bind the module-level graph/transform/predict implementations as methods
    knn.kneighbors_graph = types.MethodType(nn_kneighbors_graph, knn)
    knn.transform = types.MethodType(nn_transform, knn)
    knn.fit_transform = types.MethodType(nn_fit_transform, knn)
    knn.predict = types.MethodType(nn_predict, knn)
    return knn
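A hypothetical usage sketch, on random data and assuming fast_knn is imported from its module (so that `_check_cuml` and the `nn_*` helpers are available):

import numpy as np

X = np.random.RandomState(0).rand(200, 8).astype('float32')

# build the estimator; with n_neighbors unset it falls back to n_clusters (5 here)
knn = fast_knn(X, n_clusters=5, graph_mode='distance', cluster_mode='kmeans')

# the underlying NearestNeighbors API is available as usual
dist, idx = knn.kneighbors(X[:3])
print(dist.shape, idx.shape)  # (3, 5) (3, 5)

# knn.kneighbors_graph / knn.transform / knn.predict are the bound nn_* helpers
# from the same module; their behavior depends on graph_mode and cluster_mode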