def fast_knn(
    X,
    *,
    n_clusters: int = 5,
    n_neighbors: Optional[int] = None,
    graph_mode='distance',
    cluster_mode='spectral',
    algorithm='brute',
    n_jobs: Optional[int] = None,
    random_state: int = 1,
    framework: Literal['auto', 'cuml', 'sklearn'] = 'auto',
) -> NearestNeighbors:
    """Fit a nearest-neighbors model (cuML on GPU when available, otherwise
    scikit-learn) and patch it with transform/predict helpers so it can act
    as a clustering / embedding estimator.

    Parameters
    ----------
    X : `ndarray` or tuple of (X, y)
        Data to fit the nearest-neighbors index on.
    n_clusters : int, default=5
        Number of clusters used by the patched prediction helpers.
    n_neighbors : int, optional
        The top K closest datapoints you want the algorithm to return.
        Currently, this value must be < 1024.  If ``None`` (the default),
        it falls back to ``n_clusters``.
    graph_mode : {'distance', 'connectivity'}, default='distance'
        This mode decides which values `kneighbors_graph` will return:
            - 'connectivity' : will return the connectivity matrix with ones and
                zeros (for 'SpectralClustering').
            - 'distance' : will return the distances between neighbors according
                to the given metric (for 'DBSCAN').
    cluster_mode : {'dbscan', 'spectral', 'isomap', 'kmeans'}, default='spectral'
        This mode decides how to generate cluster prediction from the
        neighbors graph.  NOTE(review): the value is normalized (strip/lower)
        but not validated here — presumably checked downstream in
        ``nn_predict``/``nn_transform``; confirm.
    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors:
            - 'ball_tree' will use :class:`BallTree`
            - 'kd_tree' will use :class:`KDTree`
            - 'brute' will use a brute-force search.
            - 'auto' will attempt to decide the most appropriate algorithm based
                on the values passed to :meth:`fit` method.
        Note: fitting on sparse input will override the setting of this
        parameter, using brute force.  Ignored when the cuML backend is used.
    n_jobs : int, optional
        Number of parallel jobs (scikit-learn backend only; dropped for cuML).
    random_state : int, default=1
        Seed stored on the estimator for the patched helpers.
    framework : {'auto', 'cuml', 'sklearn'}, default='auto'
        Backend selector; resolved by ``_check_cuml``.

    Returns
    -------
    NearestNeighbors
        A fitted estimator with extra attributes (``_random_state``,
        ``_n_clusters``, ``_graph_mode``, ``_cluster_mode``, ``_fitid``) and
        bound methods ``transform``, ``fit_transform`` and ``predict``.

    Raises
    ------
    AssertionError
        If ``graph_mode`` is not 'distance' or 'connectivity'.
    """
    # Snapshot every keyword argument; entries not meant for the KNN
    # constructor are popped off below, leaving only constructor kwargs.
    kwargs = dict(locals())
    X = kwargs.pop('X')
    framework = kwargs.pop('framework')
    random_state = kwargs.pop('random_state')
    n_clusters = int(kwargs.pop('n_clusters'))
    # Default n_neighbors to n_clusters when not given explicitly.
    if n_neighbors is None:
        kwargs['n_neighbors'] = n_clusters
        n_neighbors = n_clusters
    ## graph mode
    graph_mode = str(kwargs.pop('graph_mode')).strip().lower()
    assert graph_mode in ('distance', 'connectivity')
    ## cluster mode
    cluster_mode = str(kwargs.pop('cluster_mode')).strip().lower()
    ## fine-tuning the kwargs
    use_cuml = _check_cuml(framework)
    if use_cuml:
        # cuML's NearestNeighbors does not accept these sklearn-only kwargs.
        from cuml.neighbors import NearestNeighbors as KNN
        kwargs.pop('n_jobs')
        kwargs.pop('algorithm')
    else:
        KNN = NearestNeighbors
    ## fitting
    knn = KNN(**kwargs)
    knn.fit(X)
    # Remember which array object was fitted (identity check, not content).
    knn._fitid = id(X)
    ## Transform mode
    knn._random_state = random_state
    knn._n_clusters = n_clusters
    knn._graph_mode = graph_mode
    knn._cluster_mode = cluster_mode
    if use_cuml:
        # cuML does not expose these sklearn-compatible members; emulate them
        # so downstream code can treat both backends uniformly.
        knn.n_samples_fit_ = X.shape[0]
        knn.kneighbors_graph = types.MethodType(nn_kneighbors_graph, knn)
    # Bind the module-level helpers as instance methods for both backends.
    knn.transform = types.MethodType(nn_transform, knn)
    knn.fit_transform = types.MethodType(nn_fit_transform, knn)
    knn.predict = types.MethodType(nn_predict, knn)
    return knn
# In[14]: train['oof_cnn'] = preds if COMPUTE_CV: train['f1'] = train.apply(getMetric('oof_cnn'), axis=1) print('CV score for baseline =', train.f1.mean()) # # title TFIDF # In[15]: # from sklearn.feature_extraction.text import TfidfVectorizer model = TfidfVectorizer(stop_words=None, binary=True, max_features=25000) text_embeddings = model.fit_transform(train_gf.title).toarray() print('text embeddings shape', text_embeddings.shape) # In[16]: preds = [] CHUNK = 1024 * 4 print('Finding similar titles...') CTS = len(train) // CHUNK if len(train) % CHUNK != 0: CTS += 1 for j in range(CTS): a = j * CHUNK b = (j + 1) * CHUNK b = min(b, len(train))