from sklearn.neighbors import KNeighborsTransformer


class Model:
    def __init__(self):
        self.knn = KNeighborsTransformer(n_neighbors=5, n_jobs=-1)

    def fit(self, dtm):
        # KNeighborsTransformer ignores y; the index is passed only to keep
        # the call signature consistent with other estimators.
        self.knn.fit(dtm, dtm.index.tolist())

    def predict(self):
        pass
def test_transformers():
    """Test that AnnoyTransformer and KNeighborsTransformer give same results"""
    X = np.random.RandomState(42).randn(10, 2)

    knn = KNeighborsTransformer()
    Xt0 = knn.fit_transform(X)

    ann = AnnoyTransformer()
    Xt1 = ann.fit_transform(X)

    nms = NMSlibTransformer()
    Xt2 = nms.fit_transform(X)

    assert_array_almost_equal(Xt0.toarray(), Xt1.toarray(), decimal=5)
    assert_array_almost_equal(Xt0.toarray(), Xt2.toarray(), decimal=5)
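# The AnnoyTransformer and NMSlibTransformer exercised above are custom
# wrappers, not part of scikit-learn. Below is a minimal sketch of an
# Annoy-backed drop-in, modeled on the scikit-learn "approximate nearest
# neighbors" example; the class body, parameter names, and defaults are
# assumptions of this sketch, not necessarily the exact classes the test
# imports.
from annoy import AnnoyIndex
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin


class AnnoyTransformer(TransformerMixin, BaseEstimator):
    """Wrapper for using annoy.AnnoyIndex like a KNeighborsTransformer."""

    def __init__(self, n_neighbors=5, metric="euclidean", n_trees=10):
        self.n_neighbors = n_neighbors
        self.metric = metric
        self.n_trees = n_trees

    def fit(self, X, y=None):
        self.n_samples_fit_ = X.shape[0]
        self.annoy_ = AnnoyIndex(X.shape[1], metric=self.metric)
        for i, x in enumerate(X):
            self.annoy_.add_item(i, x.tolist())
        self.annoy_.build(self.n_trees)
        return self

    def transform(self, X):
        n_samples_transform = X.shape[0]
        # Include each sample as its own neighbor, as KNeighborsTransformer
        # does in mode='distance'.
        n_neighbors = self.n_neighbors + 1
        indices = np.empty((n_samples_transform, n_neighbors), dtype=int)
        distances = np.empty((n_samples_transform, n_neighbors))
        for i, x in enumerate(X):
            indices[i], distances[i] = self.annoy_.get_nns_by_vector(
                x.tolist(), n_neighbors, include_distances=True)
        indptr = np.arange(0, n_samples_transform * n_neighbors + 1,
                           n_neighbors)
        return csr_matrix(
            (distances.ravel(), indices.ravel(), indptr),
            shape=(n_samples_transform, self.n_samples_fit_))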
def test_isomap():
    # Test chaining KNeighborsTransformer and Isomap with
    # neighbors_algorithm='precomputed'
    algorithm = 'auto'
    n_neighbors = 10

    X, _ = make_blobs(random_state=0)
    X2, _ = make_blobs(random_state=1)

    # compare the chained version and the compact version
    est_chain = make_pipeline(
        KNeighborsTransformer(n_neighbors=n_neighbors, algorithm=algorithm,
                              mode='distance'),
        Isomap(n_neighbors=n_neighbors, metric='precomputed'))
    est_compact = Isomap(n_neighbors=n_neighbors,
                         neighbors_algorithm=algorithm)

    Xt_chain = est_chain.fit_transform(X)
    Xt_compact = est_compact.fit_transform(X)
    assert_array_almost_equal(Xt_chain, Xt_compact)

    Xt_chain = est_chain.transform(X2)
    Xt_compact = est_compact.transform(X2)
    assert_array_almost_equal(Xt_chain, Xt_compact)
def test_lof_novelty_true():
    # Test chaining KNeighborsTransformer and LocalOutlierFactor
    n_neighbors = 4

    rng = np.random.RandomState(0)
    X1 = rng.randn(40, 2)
    X2 = rng.randn(40, 2)

    # compare the chained version and the compact version
    est_chain = make_pipeline(
        KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance"),
        LocalOutlierFactor(
            metric="precomputed",
            n_neighbors=n_neighbors,
            novelty=True,
            contamination="auto",
        ),
    )
    est_compact = LocalOutlierFactor(
        n_neighbors=n_neighbors, novelty=True, contamination="auto"
    )

    pred_chain = est_chain.fit(X1).predict(X2)
    pred_compact = est_compact.fit(X1).predict(X2)
    assert_array_almost_equal(pred_chain, pred_compact)
def test_spectral_embedding():
    # Test chaining KNeighborsTransformer and SpectralEmbedding
    n_neighbors = 5
    n_samples = 1000
    centers = np.array([
        [0.0, 5.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 4.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 1.0],
    ])
    S, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                cluster_std=1., random_state=42)

    # compare the chained version and the compact version
    est_chain = make_pipeline(
        KNeighborsTransformer(n_neighbors=n_neighbors, mode='connectivity'),
        SpectralEmbedding(n_neighbors=n_neighbors, affinity='precomputed',
                          random_state=42))
    est_compact = SpectralEmbedding(n_neighbors=n_neighbors,
                                    affinity='nearest_neighbors',
                                    random_state=42)
    St_compact = est_compact.fit_transform(S)
    St_chain = est_chain.fit_transform(S)
    assert_array_almost_equal(St_chain, St_compact)
def test_tsne():
    # Test chaining KNeighborsTransformer and TSNE
    n_iter = 250
    perplexity = 5
    n_neighbors = int(3. * perplexity + 1)

    rng = np.random.RandomState(0)
    X = rng.randn(20, 2)

    for metric in ['minkowski', 'sqeuclidean']:
        # compare the chained version and the compact version
        est_chain = make_pipeline(
            KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance',
                                  metric=metric),
            TSNE(metric='precomputed', perplexity=perplexity,
                 method="barnes_hut", random_state=42, n_iter=n_iter))
        est_compact = TSNE(metric=metric, perplexity=perplexity,
                           n_iter=n_iter, method="barnes_hut",
                           random_state=42)

        Xt_chain = est_chain.fit_transform(X)
        Xt_compact = est_compact.fit_transform(X)
        assert_array_almost_equal(Xt_chain, Xt_compact)
def get_kNN_score_torch(pairwise_distances, matching_matrix, n_neighbours=5):
    """Score how well the collections of persistent landscapes for each label
    are separated from each other, in the sense of the L2 distance in the
    Hilbert space of persistent landscapes.

    pairwise_distances: torch tensor of shape (n_samples, n_samples).
    matching_matrix: numpy array of shape (n_samples, n_samples);
        1 if two samples share the same label, 0 otherwise.
    n_neighbours: integer, number of nearest neighbours used to compute
        the score.

    Returns kNN_score, a real number between 0 and 1.
    """
    n_samples = pairwise_distances.size()[0]
    kNN_transformer = KNeighborsTransformer(mode='connectivity',
                                            metric='precomputed',
                                            n_neighbors=n_neighbours)
    connectivity_matrix = kNN_transformer.fit_transform(
        pairwise_distances.numpy()).toarray()
    # Each sample counts as its own neighbour, so the diagonal contributes
    # n_samples to both sums and is subtracted out.
    kNN_score = ((np.sum(matching_matrix * connectivity_matrix) - n_samples)
                 / (np.sum(connectivity_matrix) - n_samples))
    return kNN_score
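# A quick usage sketch for get_kNN_score_torch; the data below is
# hypothetical. Two well-separated clusters with matching labels should
# score 1.0.
import numpy as np
import torch

rng = np.random.RandomState(0)
points = np.vstack([rng.randn(10, 3), rng.randn(10, 3) + 10.0])
labels = np.repeat([0, 1], 10)

# Symmetric pairwise Euclidean distances as a torch tensor.
diff = points[:, None, :] - points[None, :, :]
pairwise = torch.from_numpy(np.sqrt((diff ** 2).sum(-1)))

# 1 where two samples share a label, 0 otherwise.
matching = (labels[:, None] == labels[None, :]).astype(float)

print(get_kNN_score_torch(pairwise, matching, n_neighbours=5))  # 1.0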
def test_kneighbors_regressor():
    # Test chaining KNeighborsTransformer and classifiers/regressors
    rng = np.random.RandomState(0)
    X = 2 * rng.rand(40, 5) - 1
    X2 = 2 * rng.rand(40, 5) - 1
    y = rng.rand(40, 1)

    n_neighbors = 12
    radius = 1.5
    # We precompute more neighbors than necessary, to have equivalence between
    # k-neighbors estimator after radius-neighbors transformer, and vice-versa.
    factor = 2

    k_trans = KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance')
    k_trans_factor = KNeighborsTransformer(
        n_neighbors=int(n_neighbors * factor), mode='distance')

    r_trans = RadiusNeighborsTransformer(radius=radius, mode='distance')
    # Note: do not truncate the enlarged radius to int; the radius is a float.
    r_trans_factor = RadiusNeighborsTransformer(radius=radius * factor,
                                                mode='distance')

    k_reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    r_reg = RadiusNeighborsRegressor(radius=radius)

    test_list = [
        (k_trans, k_reg),
        (k_trans_factor, r_reg),
        (r_trans, r_reg),
        (r_trans_factor, k_reg),
    ]

    for trans, reg in test_list:
        # compare the chained version and the compact version
        reg_compact = clone(reg)
        reg_precomp = clone(reg)
        reg_precomp.set_params(metric='precomputed')

        reg_chain = make_pipeline(clone(trans), reg_precomp)

        y_pred_chain = reg_chain.fit(X, y).predict(X2)
        y_pred_compact = reg_compact.fit(X, y).predict(X2)
        assert_array_almost_equal(y_pred_chain, y_pred_compact)
def test_explicit_diagonal():
    # Test that the diagonal is explicitly stored in the sparse graph
    n_neighbors = 5
    n_samples_fit, n_samples_transform, n_features = 20, 18, 10
    rng = np.random.RandomState(42)
    X = rng.randn(n_samples_fit, n_features)
    X2 = rng.randn(n_samples_transform, n_features)

    nnt = KNeighborsTransformer(n_neighbors=n_neighbors)
    Xt = nnt.fit_transform(X)
    assert _has_explicit_diagonal(Xt)
    assert np.all(Xt.data.reshape(n_samples_fit, n_neighbors + 1)[:, 0] == 0)

    Xt = nnt.transform(X)
    assert _has_explicit_diagonal(Xt)
    assert np.all(Xt.data.reshape(n_samples_fit, n_neighbors + 1)[:, 0] == 0)

    # Using transform on new data should not always have zero diagonal
    X2t = nnt.transform(X2)
    assert not _has_explicit_diagonal(X2t)
def _calculate_pairwise_distances(X, Y=None, metric='precomputed',
                                  n_neighbors=None):
    if metric in ('precomputed', 'ignore'):
        return X

    if n_neighbors is None:
        if metric == 'euclidean':
            X_pairwise = pairwise_distances(X, Y=Y, metric=metric,
                                            squared=True)
        elif metric == 'correlation' or metric == 'cosine':
            # An in-place version of:
            # X_pairwise = 1 - (1 - pairwise_distances(X, metric=metric)) ** 2
            X_pairwise = pairwise_distances(X, Y=Y, metric=metric)
            X_pairwise = numpy.subtract(1, X_pairwise, out=X_pairwise)
            X_pairwise = numpy.square(X_pairwise, out=X_pairwise)
            X_pairwise = numpy.subtract(1, X_pairwise, out=X_pairwise)
        else:
            X_pairwise = pairwise_distances(X, Y=Y, metric=metric)
    else:
        if metric == 'correlation' or metric == 'cosine':
            # An in-place version of:
            # X = 1 - (1 - pairwise_distances(X, metric=metric)) ** 2
            X = pairwise_distances(X, Y=Y, metric=metric)
            X = numpy.subtract(1, X, out=X)
            X = numpy.square(X, out=X)
            X = numpy.subtract(1, X, out=X)
            metric = 'precomputed'

        if isinstance(n_neighbors, int):
            X_pairwise = KNeighborsTransformer(
                n_neighbors=n_neighbors, metric=metric).fit_transform(X)
        elif isinstance(n_neighbors, KNeighborsTransformer):
            X_pairwise = n_neighbors.fit_transform(X)

        if metric == 'correlation' or metric == 'cosine':
            if isinstance(X_pairwise, csr_matrix):
                X_pairwise.data = numpy.subtract(1, X_pairwise.data,
                                                 out=X_pairwise.data)
            else:
                X_pairwise = numpy.subtract(1, X_pairwise, out=X_pairwise)
        else:
            if isinstance(X_pairwise, csr_matrix):
                X_pairwise.data = numpy.subtract(X_pairwise.max(),
                                                 X_pairwise.data,
                                                 out=X_pairwise.data)
            else:
                X_pairwise = numpy.subtract(X_pairwise.max(), X_pairwise,
                                            out=X_pairwise)

    return X_pairwise
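# An illustrative check (not part of the original module) that the in-place
# subtract/square/subtract sequence above really computes 1 - (1 - d)**2
# without allocating intermediate arrays.
import numpy
from sklearn.metrics import pairwise_distances

X = numpy.random.RandomState(0).randn(6, 4)
d = pairwise_distances(X, metric='cosine')

expected = 1 - (1 - d) ** 2

out = d.copy()
out = numpy.subtract(1, out, out=out)   # 1 - d
out = numpy.square(out, out=out)        # (1 - d)**2
out = numpy.subtract(1, out, out=out)   # 1 - (1 - d)**2

assert numpy.allclose(out, expected)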
def test_sklearn_k_neighbours_transformer_connectivity(self):
    model, X_test = fit_classification_model(
        KNeighborsTransformer(n_neighbors=3, mode='connectivity'), 3)
    model_onnx = convert_sklearn(
        model, "KNN transformer",
        [("input", FloatTensorType((None, X_test.shape[1])))],
        target_opset=TARGET_OPSET)
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        X_test, model, model_onnx,
        basename="SklearnKNNTransformerConnectivity")
def test_sklearn_k_neighbours_transformer_distance(self):
    model, X_test = fit_classification_model(
        KNeighborsTransformer(n_neighbors=4, mode='distance'), 2)
    model_onnx = convert_sklearn(
        model, "KNN transformer",
        [("input", FloatTensorType((None, X_test.shape[1])))],
    )
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        X_test, model, model_onnx,
        basename="SklearnKNNTransformerDistance",
    )
def test_spectral_clustering():
    # Test chaining KNeighborsTransformer and SpectralClustering
    n_neighbors = 5
    X, _ = make_blobs(random_state=0)

    # compare the chained version and the compact version
    est_chain = make_pipeline(
        KNeighborsTransformer(n_neighbors=n_neighbors, mode='connectivity'),
        SpectralClustering(n_neighbors=n_neighbors, affinity='precomputed',
                           random_state=42))
    est_compact = SpectralClustering(
        n_neighbors=n_neighbors, affinity='nearest_neighbors',
        random_state=42)
    labels_compact = est_compact.fit_predict(X)
    labels_chain = est_chain.fit_predict(X)
    assert_array_almost_equal(labels_chain, labels_compact)
def test_lof():
    # Test chaining KNeighborsTransformer and LocalOutlierFactor
    n_neighbors = 4

    rng = np.random.RandomState(0)
    X = rng.randn(40, 2)

    # compare the chained version and the compact version
    est_chain = make_pipeline(
        KNeighborsTransformer(n_neighbors=n_neighbors + 1, mode='distance'),
        LocalOutlierFactor(metric='precomputed', n_neighbors=n_neighbors))
    est_compact = LocalOutlierFactor(n_neighbors=n_neighbors)

    pred_chain = est_chain.fit_predict(X)
    pred_compact = est_compact.fit_predict(X)
    assert_array_almost_equal(pred_chain, pred_compact)
def train_knn():
    logging.info("Training KNN")
    latent_codes = []
    to_tensor = torchvision.transforms.ToTensor()
    for image in dataset:
        image = to_tensor(image).float()
        image = image.unsqueeze(dim=0)
        latent = ae_model.encode(image)
        latent = latent.detach().cpu()
        latent_codes.append(latent)
    latent_codes = np.vstack(latent_codes)

    global knn
    knn = KNeighborsTransformer().fit(latent_codes)
def test_transformer_result():
    # Test the number of neighbors returned
    n_neighbors = 5
    n_samples_fit = 20
    n_queries = 18
    n_features = 10

    rng = np.random.RandomState(42)
    X = rng.randn(n_samples_fit, n_features)
    X2 = rng.randn(n_queries, n_features)
    radius = np.percentile(euclidean_distances(X), 10)

    # with n_neighbors
    for mode in ["distance", "connectivity"]:
        add_one = mode == "distance"
        nnt = KNeighborsTransformer(n_neighbors=n_neighbors, mode=mode)
        Xt = nnt.fit_transform(X)
        assert Xt.shape == (n_samples_fit, n_samples_fit)
        assert Xt.data.shape == (n_samples_fit * (n_neighbors + add_one),)
        assert Xt.format == "csr"
        assert _is_sorted_by_data(Xt)

        X2t = nnt.transform(X2)
        assert X2t.shape == (n_queries, n_samples_fit)
        assert X2t.data.shape == (n_queries * (n_neighbors + add_one),)
        assert X2t.format == "csr"
        assert _is_sorted_by_data(X2t)

    # with radius
    for mode in ["distance", "connectivity"]:
        add_one = mode == "distance"
        nnt = RadiusNeighborsTransformer(radius=radius, mode=mode)
        Xt = nnt.fit_transform(X)
        assert Xt.shape == (n_samples_fit, n_samples_fit)
        assert Xt.data.shape != (n_samples_fit * (n_neighbors + add_one),)
        assert Xt.format == "csr"
        assert _is_sorted_by_data(Xt)

        X2t = nnt.transform(X2)
        assert X2t.shape == (n_queries, n_samples_fit)
        assert X2t.data.shape != (n_queries * (n_neighbors + add_one),)
        assert X2t.format == "csr"
        assert _is_sorted_by_data(X2t)
def convert2graph(components):
    knn = KNeighborsTransformer(n_neighbors=10, n_jobs=-1)
    # fit_transform returns a sparse CSR matrix of neighbor distances,
    # which networkx interprets as a weighted adjacency matrix.
    graph = knn.fit_transform(components)
    G = nx.Graph(graph)
    return G
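# Usage sketch with hypothetical data: turn a feature matrix into a k-NN
# graph and inspect its connectivity. Assumes a networkx version that
# accepts SciPy sparse input directly in the nx.Graph constructor.
import numpy as np
import networkx as nx
from sklearn.neighbors import KNeighborsTransformer

components = np.random.RandomState(0).randn(100, 8)
G = convert2graph(components)
print(G.number_of_nodes(), G.number_of_edges())
print(nx.number_connected_components(G))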
def run_benchmark():
    datasets = [
        ("MNIST_2000", load_mnist(n_samples=2000)),
        ("MNIST_10000", load_mnist(n_samples=10000)),
    ]

    n_iter = 500
    perplexity = 30
    metric = "euclidean"
    # TSNE requires a certain number of neighbors which depends on the
    # perplexity parameter.
    # Add one since we include each sample as its own neighbor.
    n_neighbors = int(3.0 * perplexity + 1) + 1

    tsne_params = dict(
        perplexity=perplexity,
        method="barnes_hut",
        random_state=42,
        n_iter=n_iter,
        square_distances=True,
    )

    transformers = [
        ("AnnoyTransformer", AnnoyTransformer(n_neighbors=n_neighbors, metric=metric)),
        (
            "NMSlibTransformer",
            NMSlibTransformer(n_neighbors=n_neighbors, metric=metric),
        ),
        (
            "KNeighborsTransformer",
            KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance", metric=metric),
        ),
        (
            "TSNE with AnnoyTransformer",
            make_pipeline(
                AnnoyTransformer(n_neighbors=n_neighbors, metric=metric),
                TSNE(metric="precomputed", **tsne_params),
            ),
        ),
        (
            "TSNE with NMSlibTransformer",
            make_pipeline(
                NMSlibTransformer(n_neighbors=n_neighbors, metric=metric),
                TSNE(metric="precomputed", **tsne_params),
            ),
        ),
        (
            "TSNE with KNeighborsTransformer",
            make_pipeline(
                KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance", metric=metric),
                TSNE(metric="precomputed", **tsne_params),
            ),
        ),
        ("TSNE with internal NearestNeighbors", TSNE(metric=metric, **tsne_params)),
    ]

    # init the plot
    nrows = len(datasets)
    ncols = np.sum([1 for name, model in transformers if "TSNE" in name])
    fig, axes = plt.subplots(
        nrows=nrows, ncols=ncols, squeeze=False, figsize=(5 * ncols, 4 * nrows)
    )
    axes = axes.ravel()
    i_ax = 0

    for dataset_name, (X, y) in datasets:

        msg = "Benchmarking on %s:" % dataset_name
        print("\n%s\n%s" % (msg, "-" * len(msg)))

        for transformer_name, transformer in transformers:
            start = time.time()
            Xt = transformer.fit_transform(X)
            duration = time.time() - start

            # print the duration report
            longest = np.max([len(name) for name, model in transformers])
            whitespaces = " " * (longest - len(transformer_name))
            print("%s: %s%.3f sec" % (transformer_name, whitespaces, duration))

            # plot TSNE embedding which should be very similar across methods
            if "TSNE" in transformer_name:
                axes[i_ax].set_title(transformer_name + "\non " + dataset_name)
                axes[i_ax].scatter(
                    Xt[:, 0],
                    Xt[:, 1],
                    c=y.astype(np.int32),
                    alpha=0.2,
                    cmap=plt.cm.viridis,
                )
                axes[i_ax].xaxis.set_major_formatter(NullFormatter())
                axes[i_ax].yaxis.set_major_formatter(NullFormatter())
                axes[i_ax].axis("tight")
                i_ax += 1

    fig.tight_layout()
    plt.show()
# To use this feature in a pipeline, one can use the `memory` parameter, along
# with one of the two new transformers,
# :class:`neighbors.KNeighborsTransformer` and
# :class:`neighbors.RadiusNeighborsTransformer`. The precomputation
# can also be performed by custom estimators to use alternative
# implementations, such as approximate nearest neighbors methods.
# See more details in the :ref:`User Guide <neighbors_transformer>`.
from tempfile import TemporaryDirectory

from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsTransformer
from sklearn.manifold import Isomap
from sklearn.pipeline import make_pipeline

X, y = make_classification(random_state=0)

with TemporaryDirectory(prefix="sklearn_cache_") as tmpdir:
    estimator = make_pipeline(
        KNeighborsTransformer(n_neighbors=10, mode="distance"),
        Isomap(n_neighbors=10, metric="precomputed"),
        memory=tmpdir,
    )
    estimator.fit(X)

    # We can decrease the number of neighbors and the graph will not be
    # recomputed.
    estimator.set_params(isomap__n_neighbors=5)
    estimator.fit(X)

# %%
# KNN Based Imputation
# ------------------------------------
# We now support imputation for completing missing values using k-Nearest
# Neighbors.
def __init__(self):
    self.knn = KNeighborsTransformer(n_neighbors=5, n_jobs=-1)
def run_benchmark():
    datasets = [
        ('MNIST_2000', load_mnist(n_samples=2000)),
        ('MNIST_10000', load_mnist(n_samples=10000)),
    ]

    n_iter = 500
    perplexity = 30
    # TSNE requires a certain number of neighbors which depends on the
    # perplexity parameter.
    # Add one since we include each sample as its own neighbor.
    n_neighbors = int(3. * perplexity + 1) + 1

    transformers = [
        ('AnnoyTransformer', AnnoyTransformer(n_neighbors=n_neighbors,
                                              metric='sqeuclidean')),
        ('NMSlibTransformer', NMSlibTransformer(n_neighbors=n_neighbors,
                                                metric='sqeuclidean')),
        ('KNeighborsTransformer', KNeighborsTransformer(
            n_neighbors=n_neighbors, mode='distance', metric='sqeuclidean')),
        ('TSNE with AnnoyTransformer', make_pipeline(
            AnnoyTransformer(n_neighbors=n_neighbors, metric='sqeuclidean'),
            TSNE(metric='precomputed', perplexity=perplexity,
                 method="barnes_hut", random_state=42, n_iter=n_iter),
        )),
        ('TSNE with NMSlibTransformer', make_pipeline(
            NMSlibTransformer(n_neighbors=n_neighbors, metric='sqeuclidean'),
            TSNE(metric='precomputed', perplexity=perplexity,
                 method="barnes_hut", random_state=42, n_iter=n_iter),
        )),
        ('TSNE with KNeighborsTransformer', make_pipeline(
            KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance',
                                  metric='sqeuclidean'),
            TSNE(metric='precomputed', perplexity=perplexity,
                 method="barnes_hut", random_state=42, n_iter=n_iter),
        )),
        ('TSNE with internal NearestNeighbors',
         TSNE(metric='sqeuclidean', perplexity=perplexity,
              method="barnes_hut", random_state=42, n_iter=n_iter)),
    ]

    # init the plot
    nrows = len(datasets)
    ncols = np.sum([1 for name, model in transformers if 'TSNE' in name])
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False,
                             figsize=(5 * ncols, 4 * nrows))
    axes = axes.ravel()
    i_ax = 0

    for dataset_name, (X, y) in datasets:

        msg = 'Benchmarking on %s:' % dataset_name
        print('\n%s\n%s' % (msg, '-' * len(msg)))

        for transformer_name, transformer in transformers:
            start = time.time()
            Xt = transformer.fit_transform(X)
            duration = time.time() - start

            # print the duration report
            longest = np.max([len(name) for name, model in transformers])
            whitespaces = ' ' * (longest - len(transformer_name))
            print('%s: %s%.3f sec' % (transformer_name, whitespaces,
                                      duration))

            # plot TSNE embedding which should be very similar across methods
            if 'TSNE' in transformer_name:
                axes[i_ax].set_title(transformer_name + '\non ' +
                                     dataset_name)
                axes[i_ax].scatter(Xt[:, 0], Xt[:, 1], c=y, alpha=0.2,
                                   cmap=plt.cm.viridis)
                axes[i_ax].xaxis.set_major_formatter(NullFormatter())
                axes[i_ax].yaxis.set_major_formatter(NullFormatter())
                axes[i_ax].axis('tight')
                i_ax += 1

    fig.tight_layout()
    plt.show()
from tempfile import TemporaryDirectory

import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsTransformer, KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_digits
from sklearn.pipeline import Pipeline

print(__doc__)

X, y = load_digits(return_X_y=True)
n_neighbors_list = [1, 2, 3, 4, 5, 6, 7, 8, 9]

# The transformer computes the nearest neighbors graph using the maximum
# number of neighbors necessary in the grid search. The classifier model
# filters the nearest neighbors graph as required by its own n_neighbors
# parameter.
graph_model = KNeighborsTransformer(n_neighbors=max(n_neighbors_list),
                                    mode="distance")
classifier_model = KNeighborsClassifier(metric="precomputed")

# Note that we give `memory` a directory to cache the graph computation
# that will be used several times when tuning the hyperparameters of the
# classifier.
with TemporaryDirectory(prefix="sklearn_graph_cache_") as tmpdir:
    full_model = Pipeline(
        steps=[("graph", graph_model), ("classifier", classifier_model)],
        memory=tmpdir
    )
    param_grid = {"classifier__n_neighbors": n_neighbors_list}
    grid_model = GridSearchCV(full_model, param_grid)
    grid_model.fit(X, y)

# Plot the results of the grid search.
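# A sketch of the plotting step referenced above; the choice of what to plot
# (mean cross-validated accuracy per n_neighbors) and the labels are
# assumptions of this sketch, not prescribed by the original snippet.
fig, ax = plt.subplots()
ax.plot(n_neighbors_list, grid_model.cv_results_["mean_test_score"],
        marker="o")
ax.set_xlabel("n_neighbors")
ax.set_ylabel("Mean cross-validated accuracy")
ax.set_title("Tuning n_neighbors on a cached k-NN graph")
plt.show()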
"""A function to take a feature and tokenize then return a tfidf df of that input """ self.tokenizer.fit_on_texts(feature) a = self.tokenizer.texts_to_matrix(feature, mode='tfidf') config = self.tokenizer.get_config() feature_names = json_normalize(loads( config['word_index'])).columns.tolist() dtm = pd.DataFrame(a) return dtm if __name__ == "__main__": tr = Transformer() negative = ['negative'] ignore = [] user_transformed, y = tr.transform( pd.DataFrame({ 'name': "blue berry kush", 'race': 'sativa', 'flavors': ['blueberry', 'sweet'], 'negative': ['dry mouth', 'dry eyes'], 'positive': ['creativity', 'stress'], 'medical': ['ptsd', 'stress'], 'description': "blueberry kush my dude blueberry_kush:10, whitewhidow:10 ", }), negative, ignore) model = KNeighborsTransformer() model.fit()
# To use this feature in a pipeline, one can use the `memory` parameter, along
# with one of the two new transformers,
# :class:`neighbors.KNeighborsTransformer` and
# :class:`neighbors.RadiusNeighborsTransformer`. The precomputation
# can also be performed by custom estimators to use alternative
# implementations, such as approximate nearest neighbors methods.
# See more details in the :ref:`User Guide <neighbors_transformer>`.
from tempfile import TemporaryDirectory

from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsTransformer
from sklearn.manifold import Isomap
from sklearn.pipeline import make_pipeline

X, y = make_classification(random_state=0)

with TemporaryDirectory(prefix="sklearn_cache_") as tmpdir:
    estimator = make_pipeline(
        KNeighborsTransformer(n_neighbors=10, mode='distance'),
        Isomap(n_neighbors=10, metric='precomputed'),
        memory=tmpdir)
    estimator.fit(X)

    # We can decrease the number of neighbors and the graph will not be
    # recomputed.
    estimator.set_params(isomap__n_neighbors=5)
    estimator.fit(X)

############################################################################
# Stacking Classifier and Regressor
# ---------------------------------
# :class:`~ensemble.StackingClassifier` and
# :class:`~ensemble.StackingRegressor`
# allow you to have a stack of estimators with a final classifier or
# a regressor.
def weighted_knn(train_adata, valid_adata, label_key, n_neighbors=50,
                 threshold=0.5, pred_unknown=True):
    """Annotates ``valid_adata`` cells with a weighted KNN classifier trained
    on ``train_adata``.

    Parameters
    ----------
    train_adata: :class:`~anndata.AnnData`
        Annotated dataset used to train the KNN classifier with ``label_key``
        as the target variable.
    valid_adata: :class:`~anndata.AnnData`
        Annotated dataset used to validate the KNN classifier.
    label_key: str
        Name of the column used as the target variable (e.g. cell_type) in
        ``train_adata`` and ``valid_adata``.
    n_neighbors: int
        Number of nearest neighbors in the KNN classifier.
    threshold: float
        Probability threshold used when annotating cells as "Unknown"; cells
        whose best label weight falls below this value are annotated as
        "Unknown".
    pred_unknown: bool
        ``True`` by default. Whether to annotate any cell as "Unknown". If
        ``False``, ``threshold`` is ignored and each cell is annotated with
        the label carrying the highest total weight among its ``n_neighbors``
        nearest cells.
    """
    print(f'Weighted KNN with n_neighbors = {n_neighbors} and '
          f'threshold = {threshold} ... ', end='')
    k_neighbors_transformer = KNeighborsTransformer(n_neighbors=n_neighbors,
                                                    mode='distance',
                                                    algorithm='brute',
                                                    metric='euclidean',
                                                    n_jobs=-1)
    k_neighbors_transformer.fit(train_adata.X)

    y_train_labels = train_adata.obs[label_key].values
    y_valid_labels = valid_adata.obs[label_key].values

    top_k_distances, top_k_indices = k_neighbors_transformer.kneighbors(
        X=valid_adata.X)

    stds = np.std(top_k_distances, axis=1)
    stds = (2. / stds) ** 2
    stds = stds.reshape(-1, 1)

    top_k_distances_tilda = np.exp(-np.true_divide(top_k_distances, stds))

    weights = top_k_distances_tilda / np.sum(top_k_distances_tilda,
                                             axis=1, keepdims=True)

    uncertainties = []
    pred_labels = []
    for i in range(len(weights)):
        unique_labels = np.unique(y_train_labels[top_k_indices[i]])
        best_label, best_prob = None, 0.0
        for candidate_label in unique_labels:
            candidate_prob = weights[
                i, y_train_labels[top_k_indices[i]] == candidate_label].sum()
            if best_prob < candidate_prob:
                best_prob = candidate_prob
                best_label = candidate_label

        if pred_unknown:
            if best_prob >= threshold:
                pred_label = best_label
            else:
                pred_label = 'Unknown'
        else:
            pred_label = best_label

        if pred_label == y_valid_labels[i]:
            uncertainties.append(max(1 - best_prob, 0))
        else:
            true_prob = weights[
                i, y_train_labels[top_k_indices[i]] == y_valid_labels[i]].sum()
            uncertainties.append(max(1 - true_prob, 0))

        pred_labels.append(pred_label)

    pred_labels = np.array(pred_labels).reshape(-1,)
    uncertainties = np.array(uncertainties).reshape(-1,)

    labels_eval = pred_labels == y_valid_labels
    labels_eval = labels_eval.astype(object)

    n_correct = len(labels_eval[labels_eval == True])
    n_incorrect = (len(labels_eval[labels_eval == False])
                   - len(labels_eval[pred_labels == 'Unknown']))
    n_unknown = len(labels_eval[pred_labels == 'Unknown'])

    labels_eval[labels_eval == True] = 'Correct'
    labels_eval[labels_eval == False] = 'InCorrect'
    labels_eval[pred_labels == 'Unknown'] = 'Unknown'

    valid_adata.obs['uncertainty'] = uncertainties
    valid_adata.obs[f'pred_{label_key}'] = pred_labels
    valid_adata.obs['evaluation'] = labels_eval

    print('finished!')
    print(f"Number of correctly classified samples: {n_correct}")
    print(f"Number of misclassified samples: {n_incorrect}")
    print(f"Number of samples classified as unknown: {n_unknown}")
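# Usage sketch (hypothetical data): build two small AnnData objects with a
# shared "cell_type" column and run the weighted KNN annotation. Assumes the
# `anndata` package is available.
import numpy as np
import pandas as pd
import anndata

rng = np.random.RandomState(0)
X_train = np.vstack([rng.randn(50, 5), rng.randn(50, 5) + 5.0])
X_valid = np.vstack([rng.randn(20, 5), rng.randn(20, 5) + 5.0])

train = anndata.AnnData(
    X_train, obs=pd.DataFrame({'cell_type': ['A'] * 50 + ['B'] * 50}))
valid = anndata.AnnData(
    X_valid, obs=pd.DataFrame({'cell_type': ['A'] * 20 + ['B'] * 20}))

weighted_knn(train, valid, label_key='cell_type', n_neighbors=15)
print(valid.obs[['pred_cell_type', 'uncertainty', 'evaluation']].head())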
def weighted_knn(train_adata, valid_adata, label_key, n_neighbors=50,
                 threshold=0.5, pred_unknown=True, return_uncertainty=True):
    """Taken from scNet:
    https://github.com/theislab/scarches/blob/e84cfa5cf361bb22fd70865cb1f398af72248684/scnet/utils.py
    """
    print(f'Weighted KNN with n_neighbors = {n_neighbors} and '
          f'threshold = {threshold} ... ', end='')
    k_neighbors_transformer = KNeighborsTransformer(n_neighbors=n_neighbors,
                                                    mode='distance',
                                                    algorithm='brute',
                                                    metric='euclidean',
                                                    n_jobs=-1)
    train_adata = remove_sparsity(train_adata)
    valid_adata = remove_sparsity(valid_adata)

    k_neighbors_transformer.fit(train_adata.X)

    y_train_labels = train_adata.obs[label_key].values
    y_valid_labels = valid_adata.obs[label_key].values

    top_k_distances, top_k_indices = k_neighbors_transformer.kneighbors(
        X=valid_adata.X)

    stds = np.std(top_k_distances, axis=1)
    stds = (2. / stds) ** 2
    stds = stds.reshape(-1, 1)

    top_k_distances_tilda = np.exp(-np.true_divide(top_k_distances, stds))

    weights = top_k_distances_tilda / np.sum(top_k_distances_tilda,
                                             axis=1, keepdims=True)

    uncertainties = []
    pred_labels = []
    for i in range(len(weights)):
        most_common_label, _ = Counter(
            y_train_labels[top_k_indices[i]]).most_common(n=1)[0]
        most_prob = weights[
            i, y_train_labels[top_k_indices[i]] == most_common_label].sum()

        if pred_unknown:
            if most_prob >= threshold:
                pred_label = most_common_label
            else:
                pred_label = 'Unknown'
        else:
            pred_label = most_common_label

        if pred_label == y_valid_labels[i]:
            uncertainties.append(1 - most_prob)
        else:
            true_prob = weights[
                i, y_train_labels[top_k_indices[i]] == y_valid_labels[i]].sum()
            uncertainties.append(1 - true_prob)

        pred_labels.append(pred_label)

    pred_labels = np.array(pred_labels).reshape(-1, 1)
    uncertainties = np.array(uncertainties).reshape(-1, 1)

    print('finished!')
    if return_uncertainty:
        return pred_labels, uncertainties
    else:
        return pred_labels