def get_best_thread(self, question, tag_name):
    """Return the id of the thread most similar to ``question``.

    The search is restricted to the threads stored under ``tag_name``.
    The thread embeddings are scanned in two halves and the candidate with
    the smaller cosine distance wins; on a tie the first half is preferred.

    Raises
    ------
    ValueError
        If no thread embeddings are available for ``tag_name``.
    """
    thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)
    question_vec = question_to_vec(
        question, self.word_embeddings, self.embeddings_dim
    ).reshape((1, self.embeddings_dim))

    half = len(thread_ids) // 2
    best_thread, best_dist = None, None
    for start, stop in ((0, half), (half, None)):
        chunk = thread_embeddings[start:stop, :]
        if chunk.shape[0] == 0:
            # A tag with a single thread leaves the first half empty;
            # the distance routine rejects empty inputs, so skip it.
            continue
        idx, dist = pairwise_distances_argmin_min(
            question_vec, chunk, metric='cosine')
        # ``not best <= new`` keeps the earlier chunk on ties.
        if best_dist is None or not best_dist <= dist[0]:
            best_thread, best_dist = idx[0] + start, dist[0]

    if best_thread is None:
        raise ValueError(
            "No thread embeddings available for tag {!r}".format(tag_name))
    return thread_ids[best_thread]
def _labels_inertia_precompute_dense(norm, X, sample_weight, centers, distances):
    """
    Assign every sample to its nearest center and compute the inertia.

    The *distances* array is overwritten in place when its length matches
    the number of samples.

    Parameters
    ----------
    norm : 'l1' or 'l2'
        'l2' uses squared Euclidean distances, 'l1' Manhattan distances.
    X : numpy array, shape (n_sample, n_features)
        Input data.
    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.
    centers : numpy array, shape (n_clusters, n_features)
        Cluster centers which data is assigned to.
    distances : numpy array, shape (n_samples,)
        Pre-allocated array in which distances are stored.

    Returns
    -------
    labels : numpy array, dtype=numpy.int32, shape (n_samples,)
        Indices of clusters that samples are assigned to.
    inertia : float
        Weighted sum of the per-sample minimum distances.
    """
    n_samples = X.shape[0]

    if norm == 'l2':
        search_options = {'metric': 'euclidean',
                          'metric_kwargs': {'squared': True}}
    elif norm == 'l1':
        search_options = {'metric': 'manhattan'}
    else:  # pragma no cover
        raise NotImplementedError(
            "Not implemented for norm '{}'.".format(norm))

    nearest, nearest_dist = pairwise_distances_argmin_min(
        X=X, Y=centers, **search_options)

    # cython k-means code assumes int32 inputs
    nearest = nearest.astype(numpy.int32, copy=False)

    if distances.shape[0] == n_samples:
        # the caller's buffer is updated in place
        distances[:] = nearest_dist

    weighted = nearest_dist * sample_weight
    return nearest, weighted.sum()
def test_pairwise_distances_argmin_min():
    """Check pairwise minimum distances computation for any metric."""
    X = [[0], [1]]
    Y = [[-1], [2]]
    expected_idx = [0, 1]
    expected_dist = [1., 1.]

    # Built-in metrics (euclidean and a non-euclidean sklearn metric):
    # check the argmin+min variant and the argmin-only variant.
    for metric in ("euclidean", "manhattan"):
        idx, dists = pairwise_distances_argmin_min(X, Y, metric=metric)
        idx_only = pairwise_distances_argmin(X, Y, metric=metric)
        assert_array_almost_equal(idx, expected_idx)
        assert_array_almost_equal(idx_only, expected_idx)
        assert_array_almost_equal(dists, expected_dist)

    # Non-euclidean Scipy distance, first as callable then as string
    for metric in (minkowski, "minkowski"):
        idx, dists = pairwise_distances_argmin_min(
            X, Y, metric=metric, metric_kwargs={"p": 2})
        assert_array_almost_equal(idx, expected_idx)
        assert_array_almost_equal(dists, expected_dist)

    # Compare the chunked computation with a naive full distance matrix
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    full = pairwise_distances(X, Y, metric="manhattan")
    naive_idx = full.argmin(axis=0)
    naive_val = full[naive_idx, range(len(naive_idx))]

    chunked_idx, chunked_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan", batch_size=50)
    np.testing.assert_almost_equal(naive_idx, chunked_idx, decimal=7)
    np.testing.assert_almost_equal(naive_val, chunked_val, decimal=7)
def predict(self, tracks):
    """
    Predict the closest cluster each trajectory in tracks belongs to,
    separately for each descriptor type.

    In the vector quantization literature, centroids is called the code
    book and each value returned by predict is the index of the closest
    code in the code book.

    Parameters
    ----------
    tracks : structured ndarray
        Trajectories to predict; indexed by descriptor name.

    Returns
    -------
    labels : dict of array
        For each descriptor, the index of the cluster each trajectory
        belongs to. Descriptors whose codebook is empty get an array
        filled with -1.
    """
    labels = {}
    for desc in self.descriptors:
        codebook = self.codebooks[desc]
        if len(codebook) > 0:
            cb_indices, _ = pairwise_distances_argmin_min(
                tracks[desc], codebook['cluster_centers'],
                metric='euclidean')
        else:
            # ``np.int`` was only an alias of the builtin ``int`` and was
            # removed in NumPy 1.24; the builtin gives the same dtype.
            cb_indices = np.full(len(tracks), fill_value=-1, dtype=int)
        labels[desc] = cb_indices
    return labels
def kmeans_plot(X, y, cluster_centers, ax=None):
    """Scatter-plot clustered samples and number each cluster centre.

    Parameters
    ----------
    X : array-like
        Samples to plot; each row is unpacked into scatter coordinates.
    y : array
        Cluster label of each sample, used to pick its colour.
    cluster_centers : sequence of arrays
        Cluster centres; centre ``i`` is annotated with ``i + 1``.
    ax : matplotlib axes, optional
        Axes to draw on. Defaults to the current axes.

    Returns
    -------
    ax : matplotlib axes
        The axes that were drawn on.
    """
    import matplotlib.patheffects as path_effects
    from sklearn.metrics.pairwise import pairwise_distances_argmin_min

    if ax is None:
        ax = plt.gca()

    # ``spectral`` was renamed ``nipy_spectral`` and later removed from
    # matplotlib; prefer the new name and fall back to the old one.
    colormap = getattr(cm, 'nipy_spectral', None) or cm.spectral
    colors = colormap(y.astype(float) / len(cluster_centers))
    ax.scatter(*list(zip(*X)), lw=0, c=colors, s=30)

    # Shift the labels right by 20% of the largest first coordinate.
    offset = max(list(zip(*cluster_centers))[0]) * 0.2

    for i, cluster in enumerate(cluster_centers):
        # Colour the label like the sample closest to this centre.
        index, _ = pairwise_distances_argmin_min(cluster.reshape(1, -1), Y=X)
        cluster_color = colorConverter.to_rgb(colors[index[0]])

        if is_luminous(cluster_color) is False:
            cluster_color = darken_rgb(cluster_color, 0.35)

        label = ax.text(x=cluster[0] + offset,
                        y=cluster[1],
                        s='{:d}'.format(i + 1),
                        color=cluster_color)
        label.set_path_effects([path_effects.Stroke(lw=2, foreground='white'),
                                path_effects.Normal()])

    # Use one common upper limit covering both axes (the original read the
    # x-limits twice and ignored the y-limits).
    limit = max(*ax.get_xlim(), *ax.get_ylim())

    ax.set_xlim(0, limit)
    ax.set_ylim(0, limit)

    ax.set_xlabel("Feature space for the 1st feature")
    ax.set_ylabel("Feature space for the 2nd feature")
    return ax
def get_best_thread(self, question, tag_name):
    """Return the id of the thread most similar to ``question``.

    The search is restricted to the threads stored under ``tag_name`` and
    uses cosine distance between the question vector and the thread
    embeddings. The embeddings are scanned in ``k`` equally sized chunks to
    keep peak memory low.
    """
    thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)
    question_vec = question_to_vec(
        question, self.word_embeddings, dim=self.embeddings_dim
    ).reshape((1, self.embeddings_dim))

    k = 10
    n = len(thread_ids) // k
    if n == 0:
        # Fewer threads than chunks: every slice would be empty and the
        # distance routine would raise, so search the small set in one go.
        chunk_bounds = [(0, len(thread_ids))]
    else:
        # NOTE: the last chunk (and any remainder) is deliberately left
        # out, as in the original memory workaround for a 1GB host, so
        # threads at index >= (k - 1) * n are never returned.
        chunk_bounds = [(i * n, (i + 1) * n) for i in range(k - 1)]

    best_thread, best_dist = None, None
    for start, stop in chunk_bounds:
        idx, dist = pairwise_distances_argmin_min(
            question_vec, thread_embeddings[start:stop, :], metric='cosine')
        # Strict comparison keeps the earliest chunk on ties.
        if best_dist is None or dist[0] < best_dist:
            best_thread, best_dist = start + int(idx[0]), dist[0]

    return thread_ids[best_thread]
def project_cells_to_epg(adata):
    """Project every cell onto the principal graph stored in ``adata``.

    Reads ``adata.obsm['X_dr']``, ``adata.uns['epg']`` and
    ``adata.uns['flat_tree']`` and writes the per-cell columns ``node``,
    ``branch_id``, ``branch_id_alias``, ``branch_lam`` and ``branch_dist``
    into ``adata.obs``. Returns ``None``; ``adata`` is modified in place.
    """
    input_data = adata.obsm['X_dr']
    epg = adata.uns['epg']
    dict_nodes_pos = nx.get_node_attributes(epg, 'pos')
    # Stack node positions row by row and collect the node keys in the same
    # order (np.append without an axis flattens ``nodes`` to 1-D).
    nodes_pos = np.empty((0, input_data.shape[1]))
    nodes = np.empty((0, 1), dtype=int)
    for key in dict_nodes_pos.keys():
        nodes_pos = np.vstack((nodes_pos, dict_nodes_pos[key]))
        nodes = np.append(nodes, key)
    # Nearest graph node (Euclidean) for every cell.
    indices = pairwise_distances_argmin_min(input_data, nodes_pos, axis=1, metric='euclidean')[0]
    x_node = nodes[indices]
    adata.obs['node'] = x_node
    # update the projection info for each cell
    flat_tree = adata.uns['flat_tree']
    dict_branches_nodes = nx.get_edge_attributes(flat_tree, 'nodes')
    # NOTE(review): dict_branches_id is not read anywhere below.
    dict_branches_id = nx.get_edge_attributes(flat_tree, 'id')
    dict_node_state = nx.get_node_attributes(flat_tree, 'label')
    list_x_br_id = list()
    list_x_br_id_alias = list()
    list_x_lam = list()
    list_x_dist = list()
    for ix, xp in enumerate(input_data):
        # Candidate branches: those whose 'nodes' attribute contains the
        # cell's nearest node.
        list_br_id = [
            flat_tree.edges[br_key]['id']
            for br_key, br_value in dict_branches_nodes.items()
            if x_node[ix] in br_value
        ]
        # Positions of the nodes along each candidate branch.
        dict_br_matrix = dict()
        for br_id in list_br_id:
            dict_br_matrix[br_id] = np.array(
                [dict_nodes_pos[i] for i in flat_tree.edges[br_id]['nodes']])
        dict_results = dict()
        list_dist_xp = list()
        for br_id in list_br_id:
            dict_results[br_id] = project_point_to_line_segment_matrix(
                dict_br_matrix[br_id], xp)
            # element 2 of the result is used as the distance to the branch
            list_dist_xp.append(dict_results[br_id][2])
        # Keep the candidate branch with the smallest distance.
        x_br_id = list_br_id[np.argmin(list_dist_xp)]
        # The branch id is indexed as a pair of nodes; the alias is the
        # pair of their 'label' attributes.
        x_br_id_alias = dict_node_state[x_br_id[0]], dict_node_state[
            x_br_id[1]]
        br_len = flat_tree.edges[x_br_id]['len']
        results = dict_results[x_br_id]
        x_dist = results[2]
        # presumably the position along the branch -- TODO confirm against
        # project_point_to_line_segment_matrix
        x_lam = results[3]
        # Clamp to the branch length.
        if (x_lam > br_len):
            x_lam = br_len
        list_x_br_id.append(x_br_id)
        list_x_br_id_alias.append(x_br_id_alias)
        list_x_lam.append(x_lam)
        list_x_dist.append(x_dist)
    adata.obs['branch_id'] = list_x_br_id
    adata.obs['branch_id_alias'] = list_x_br_id_alias
    # adata.uns['branch_id'] = list(set(adata.obs['branch_id'].tolist()))
    adata.obs['branch_lam'] = list_x_lam
    adata.obs['branch_dist'] = list_x_dist
    return None
def select_instance(
        X_training: modALinput,
        X_pool: modALinput,
        X_uncertainty: np.ndarray,
        mask: np.ndarray,
        metric: Union[str, Callable],
        n_jobs: Union[int, None]
) -> Tuple[np.ndarray, modALinput, np.ndarray]:
    """
    Core iteration strategy for selecting another record from our unlabeled records.

    Given a set of labeled records (X_training) and unlabeled records (X_pool) with uncertainty scores
    (X_uncertainty), we'd like to identify the best instance in X_pool that best balances uncertainty
    and dissimilarity.

    Refer to Cardoso et al.'s "Ranked batch-mode active learning":
        https://www.sciencedirect.com/science/article/pii/S0020025516313949

    TODO:
        - Add notebook for Active Learning bake-off (passive vs interactive vs batch vs ranked batch)

    Args:
        X_training: Mix of both labeled and unlabeled records.
        X_pool: Unlabeled records to be selected for labeling.
        X_uncertainty: Uncertainty scores for unlabeled records to be selected for labeling.
        mask: Mask to exclude previously selected instances from the pool. It is updated in place:
            the entry of the selected instance is cleared. Assumed to be a boolean array with one
            entry per record of X_pool.
        metric: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.
        n_jobs: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.

    Returns:
        A tuple of the index (into X_pool) of the instance chosen to be labelled, that record
        reshaped to a single row, and the updated mask.
    """
    unlabeled_records = X_pool[mask]

    # Extract the number of labeled and unlabeled records.
    n_labeled_records, _ = X_training.shape
    n_unlabeled, _ = unlabeled_records.shape

    # Determine our alpha parameter as |U| / (|U| + |D|). Note that because we
    # append to X_training and remove from X_pool within `ranked_batch`,
    # :alpha: is not fixed throughout our model's lifetime.
    alpha = n_unlabeled / (n_unlabeled + n_labeled_records)

    # Distance from every still-unlabeled record to its closest record in X_training.
    # The result is an array of shape (n_unlabeled, ).
    if n_jobs == 1 or n_jobs is None:
        _, distance_scores = pairwise_distances_argmin_min(
            unlabeled_records, X_training, metric=metric)
    else:
        distance_scores = pairwise_distances(
            unlabeled_records, X_training, metric=metric, n_jobs=n_jobs).min(axis=1)

    similarity_scores = 1 / (1 + distance_scores)

    # Final scores balance dissimilarity to X_training against class uncertainty.
    scores = alpha * (1 - similarity_scores) + (1 - alpha) * X_uncertainty[mask]

    # `scores` only covers the masked subset, so its argmax is a position within that
    # subset. Map it back to an index into the full pool before touching mask/X_pool.
    best_in_unlabeled = np.argmax(scores)
    best_instance_index = np.flatnonzero(mask)[best_in_unlabeled]
    mask[best_instance_index] = 0
    return best_instance_index, X_pool[best_instance_index].reshape(1, -1), mask
def weights(self, split):
    """Compute a weight for every held-out (``split == 0``) sample.

    For each held-out active (label 1) the weight is the distance to its
    nearest training active divided by the distance to its nearest training
    decoy (label 0); for each held-out decoy it is the nearest training
    decoy distance divided by the nearest training active distance.
    Distances come from ``self.distanceMatrix`` when it is set, otherwise
    they are Jaccard distances computed on the boolean-cast features.

    Parameters
    ----------
    split : array
        Per-sample split assignment; 0 marks held-out, 1 marks training.

    Returns
    -------
    holdWeights : ndarray, shape (self.size,)
        Weights of the held-out samples; all other entries are 0.
    """
    held_out, training = (split == 0), (split == 1)
    active, decoy = (self.labels == 1), (self.labels == 0)
    valid_active_mask = held_out & active
    valid_decoy_mask = held_out & decoy
    train_active_mask = training & active
    train_decoy_mask = training & decoy

    if self.distanceMatrix is None:
        validActive = self.features[valid_active_mask].astype(bool)
        validDecoy = self.features[valid_decoy_mask].astype(bool)
        trainActive = self.features[train_active_mask].astype(bool)
        trainDecoy = self.features[train_decoy_mask].astype(bool)

        def nearest(queries, references):
            # The call returns (argmin, min distance); only the distances
            # are wanted here -- dividing the raw tuples raises TypeError.
            return pairwise_distances_argmin_min(
                queries, references, metric='jaccard')[1]

        actActDistances = nearest(validActive, trainActive)
        actDecDistances = nearest(validActive, trainDecoy)
        decActDistances = nearest(validDecoy, trainActive)
        decDecDistances = nearest(validDecoy, trainDecoy)
    else:
        def nearest(rows, cols):
            return np.amin(self.distanceMatrix[rows, :][:, cols], axis=1)

        actActDistances = nearest(valid_active_mask, train_active_mask)
        actDecDistances = nearest(valid_active_mask, train_decoy_mask)
        decActDistances = nearest(valid_decoy_mask, train_active_mask)
        decDecDistances = nearest(valid_decoy_mask, train_decoy_mask)

    decWeights = decDecDistances / decActDistances
    actWeights = actActDistances / actDecDistances

    holdWeights = np.zeros(self.size)
    holdWeights[np.flatnonzero(valid_active_mask)] = actWeights
    holdWeights[np.flatnonzero(valid_decoy_mask)] = decWeights
    return holdWeights
def test_pairwise_distances_argmin_min():
    # Check pairwise minimum distances computation for any metric
    X = [[0], [1]]
    Y = [[-1], [2]]

    Xsp = dok_matrix(X)
    Ysp = csr_matrix(Y, dtype=np.float32)

    # euclidean metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="euclidean")
    D2 = pairwise_distances_argmin(X, Y, metric="euclidean")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # sparse matrix case
    Dsp, Esp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean")
    assert_array_equal(Dsp, D)
    assert_array_equal(Esp, E)
    # We don't want np.matrix here
    assert_equal(type(Dsp), np.ndarray)
    assert_equal(type(Esp), np.ndarray)

    # Non-euclidean sklearn metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="manhattan")
    D2 = pairwise_distances_argmin(X, Y, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Same metric on sparse input; the argmin-only result is checked too
    # (it was previously computed but never asserted).
    D, E = pairwise_distances_argmin_min(Xsp, Ysp, metric="manhattan")
    D2 = pairwise_distances_argmin(Xsp, Ysp, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (callable)
    D, E = pairwise_distances_argmin_min(X, Y, metric=minkowski,
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (string)
    D, E = pairwise_distances_argmin_min(X, Y, metric="minkowski",
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    dist = pairwise_distances(X, Y, metric="manhattan")
    dist_orig_ind = dist.argmin(axis=0)
    dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))]

    dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan", batch_size=50)
    np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7)
    np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7)
def test_pairwise_distances_argmin_min():
    """Check pairwise minimum distances computation for any metric."""
    X = [[0], [1]]
    Y = [[-1], [2]]
    want_idx, want_dist = [0, 1], [1., 1.]

    def check_argmin_min(metric, **extra):
        # Nearest-index and nearest-distance results for one metric.
        got_idx, got_dist = pairwise_distances_argmin_min(
            X, Y, metric=metric, **extra)
        assert_array_almost_equal(got_idx, want_idx)
        assert_array_almost_equal(got_dist, want_dist)

    # euclidean metric
    check_argmin_min("euclidean")
    assert_array_almost_equal(
        pairwise_distances_argmin(X, Y, metric="euclidean"), want_idx)

    # Non-euclidean sklearn metric
    check_argmin_min("manhattan")
    assert_array_almost_equal(
        pairwise_distances_argmin(X, Y, metric="manhattan"), want_idx)

    # Non-euclidean Scipy distance (callable)
    check_argmin_min(minkowski, metric_kwargs={"p": 2})

    # Non-euclidean Scipy distance (string)
    check_argmin_min("minkowski", metric_kwargs={"p": 2})

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    reference = pairwise_distances(X, Y, metric="manhattan")
    reference_idx = reference.argmin(axis=0)
    reference_val = reference[reference_idx, range(len(reference_idx))]

    batched_idx, batched_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan", batch_size=50)
    np.testing.assert_almost_equal(reference_idx, batched_idx, decimal=7)
    np.testing.assert_almost_equal(reference_val, batched_val, decimal=7)
def test_pairwise_distances_argmin_min():
    # Check pairwise minimum distances computation for any metric
    X = [[0], [1]]
    Y = [[-1], [2]]

    Xsp = dok_matrix(X)
    Ysp = csr_matrix(Y, dtype=np.float32)

    expected_indices = [0, 1]
    expected_distances = [1., 1.]

    # euclidean metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="euclidean")
    D2 = pairwise_distances_argmin(X, Y, metric="euclidean")
    assert_array_almost_equal(D, expected_indices)
    assert_array_almost_equal(D2, expected_indices)
    assert_array_almost_equal(E, expected_distances)

    # sparse matrix case
    Dsp, Esp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean")
    assert_array_equal(Dsp, D)
    assert_array_equal(Esp, E)
    # We don't want np.matrix here
    assert_equal(type(Dsp), np.ndarray)
    assert_equal(type(Esp), np.ndarray)

    # Non-euclidean scikit-learn metric, dense then sparse input. The
    # argmin-only result is asserted for both (the sparse one used to be
    # computed and then ignored).
    for left, right in ((X, Y), (Xsp, Ysp)):
        D, E = pairwise_distances_argmin_min(left, right, metric="manhattan")
        D2 = pairwise_distances_argmin(left, right, metric="manhattan")
        assert_array_almost_equal(D, expected_indices)
        assert_array_almost_equal(D2, expected_indices)
        assert_array_almost_equal(E, expected_distances)

    # Non-euclidean Scipy distance (callable)
    D, E = pairwise_distances_argmin_min(X, Y, metric=minkowski,
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, expected_indices)
    assert_array_almost_equal(E, expected_distances)

    # Non-euclidean Scipy distance (string)
    D, E = pairwise_distances_argmin_min(X, Y, metric="minkowski",
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, expected_indices)
    assert_array_almost_equal(E, expected_distances)

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    dist = pairwise_distances(X, Y, metric="manhattan")
    dist_orig_ind = dist.argmin(axis=0)
    dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))]

    dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan", batch_size=50)
    np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7)
    np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7)
def gabriel_graph(X, metric='euclidean', weighted=False):
    """Build a graph over the rows of X from a midpoint test on all pairs.

    A pair (a, b) becomes an edge when the distance between a and b minus
    twice the distance from their midpoint to the nearest row of X is below
    1e-10. With ``weighted=True`` the edge lengths are passed as weights.
    """
    num_points = X.shape[0]
    src, dst = np.triu_indices(num_points, k=1)

    # Distance from each pair's midpoint to its closest data point.
    centers = (X[src] + X[dst]) / 2
    nearest_to_center = pairwise_distances_argmin_min(
        centers, X, metric=metric)[1]

    edge_length = paired_distances(X[src], X[dst], metric=metric)
    keep = (edge_length - nearest_to_center * 2) < 1e-10

    edges = np.column_stack((src[keep], dst[keep]))
    edge_weights = edge_length[keep] if weighted else None
    return Graph.from_edge_pairs(edges, num_vertices=num_points,
                                 symmetric=True, weights=edge_weights)
def computeScore(self, split):
    """Score a split from nearest-neighbour distances between its groups.

    Returns a 1-tuple. An invalid split (per ``self.validSplit``) scores
    2.0. Otherwise, for held-out (``split == 0``) actives and decoys, the
    mean difference between the nearest other-class and nearest same-class
    training (``split == 1``) distances is computed per class; the two
    means are summed when ``self.AVE`` is truthy and combined as a
    Euclidean norm otherwise.
    """
    if not self.validSplit(split):
        return 2.0,

    held_out, training = (split == 0), (split == 1)
    is_active, is_decoy = (self.labels == 1), (self.labels == 0)

    if self.distanceMatrix is None:
        # No precomputed matrix: Jaccard distances on the raw features.
        def nearest(rows, cols):
            return pairwise_distances_argmin_min(
                self.features[rows], self.features[cols],
                metric='jaccard')[1]
    else:
        def nearest(rows, cols):
            block = self.distanceMatrix[rows, :][:, cols]
            return np.amin(block, axis=1)

    act_to_act = nearest(held_out & is_active, training & is_active)
    act_to_decoy = nearest(held_out & is_active, training & is_decoy)
    decoy_to_act = nearest(held_out & is_decoy, training & is_active)
    decoy_to_decoy = nearest(held_out & is_decoy, training & is_decoy)

    active_mean = np.mean(act_to_decoy - act_to_act)
    decoy_mean = np.mean(decoy_to_act - decoy_to_decoy)

    if self.AVE:
        return active_mean + decoy_mean,
    return np.sqrt(active_mean**2 + decoy_mean**2),
def _centroids(n_clusters: int, points: List[List[float]]) -> List[List[float]]:
    """Fit k-means with n_clusters and return, per cluster center, the input point closest to it."""
    model = KMeans(n_clusters=n_clusters)
    model.fit(points)
    # Index of the nearest input point for every fitted center.
    nearest_idx = pairwise_distances_argmin_min(
        model.cluster_centers_, points)[0]
    representatives = np.array(points)[nearest_idx.tolist()]
    return [list(row) for row in representatives]
def gabriel_graph(X, metric='euclidean', weighted=False):
    """Connect pairs of rows of X that pass a midpoint-distance test.

    For every unordered pair the edge length is compared with twice the
    distance from the pair's midpoint to the closest row of X; the pair is
    kept when the difference is below 1e-10. Edge lengths are used as
    weights only when ``weighted`` is true.
    """
    n_vertices = X.shape[0]
    first, second = np.triu_indices(n_vertices, k=1)
    endpoints_a, endpoints_b = X[first], X[second]

    lengths = paired_distances(endpoints_a, endpoints_b, metric=metric)
    _, midpoint_gap = pairwise_distances_argmin_min(
        (endpoints_a + endpoints_b) / 2, X, metric=metric)

    selected = (lengths - midpoint_gap * 2) < 1e-10
    if weighted:
        weights = lengths[selected]
    else:
        weights = None
    pair_list = np.column_stack((first[selected], second[selected]))
    return Graph.from_edge_pairs(pair_list, num_vertices=n_vertices,
                                 symmetric=True, weights=weights)
def score(self, X):
    """
    mean squared Euclidean distance between X and the nearest cluster mean
    :param X: array-like or sparse matrix of shape = [n_samples, n_features]
    :return: mean over samples of the squared distance to the closest
        cluster mean
    """
    # Columns of the fitted model that hold the cluster means.
    mean_columns = [1, 3, 5] if self.encode_type == 2 else [1, 3]
    centers = self.model[mean_columns].values
    _, nearest_sq_dist = pairwise_distances_argmin_min(
        X=X, Y=centers, metric='euclidean',
        metric_kwargs={'squared': True})
    return np.array(nearest_sq_dist).mean()
def _predict_l1(self, X, sample_weight=None, return_distances=False):
    """
    Assigns each point in *X* to its closest fitted cluster center
    using the Manhattan distance.

    :param X: features
    :param sample_weight: (unused)
    :param return_distances: also return the distance to the closest center
    :return: labels (int32) or `labels, distances`
    """
    nearest, nearest_dist = pairwise_distances_argmin_min(
        X=X, Y=self.cluster_centers_, metric='manhattan')
    nearest = nearest.astype(numpy.int32, copy=False)
    if not return_distances:
        return nearest
    return nearest, nearest_dist
def _labels_inertia_precompute_dense(X, sample_weight, x_squared_norms,
                                     centers, distances):
    """Assign samples to their nearest center and compute the inertia.

    The 'distances' array is overwritten in-place when its length equals
    the number of samples.

    Parameters
    ----------
    X : numpy array, shape (n_sample, n_features)
        Input data.
    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.
    x_squared_norms : numpy array, shape (n_samples,)
        Precomputed squared norms of X (not used by this function).
    centers : numpy array, shape (n_clusters, n_features)
        Cluster centers which data is assigned to.
    distances : numpy array, shape (n_samples,)
        Pre-allocated array in which distances are stored.

    Returns
    -------
    labels : numpy array, dtype=np.int32, shape (n_samples,)
        Indices of clusters that samples are assigned to.
    inertia : float
        Weighted sum of squared distances of samples to their closest
        cluster center.
    """
    sample_count = X.shape[0]

    # TODO: Once PR #7383 is merged use check_inputs=False in metric_kwargs.
    nearest, sq_dist = pairwise_distances_argmin_min(
        X=X, Y=centers, metric='euclidean',
        metric_kwargs={'squared': True})

    # cython k-means code assumes int32 inputs
    nearest = nearest.astype(np.int32, copy=False)

    if distances.shape[0] == sample_count:
        # write through to the caller's buffer
        distances[:] = sq_dist

    weighted_sq_dist = sq_dist * sample_weight
    return nearest, weighted_sq_dist.sum()
def Subspace_iter(X, n_clusters, init='k-means++', max_iter=300, tol=1e-4, tol_eig=-1e-10, x_squared_norms=None, random_state=None):
    """Run one k-means-style clustering in a projected subspace of X.

    Each iteration projects the data and the centers with ``V_val`` and
    ``P_Cluster``, assigns labels by squared Euclidean distance in that
    projection, recomputes the centers in the full space and re-derives
    the subspace size from the eigenvalues of a scatter-difference matrix.

    Returns ``(labels, inertia, centers, n_iter)`` for the iteration with
    the lowest inertia seen, where ``n_iter`` is the number of iterations
    run.

    NOTE(review): the source had lost its indentation; the nesting below
    (in particular the final ``center_shift_total > 0`` check being placed
    after the loop) is a reconstruction -- confirm against the upstream.
    """
    random_state = check_random_state(random_state)
    centers = _init_centroids(X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms)
    new_labels, new_inertia, new_centers = None, None, None
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)
    d_shape = X.shape[1]
    # Orthonormal matrix from the QR decomposition of a random square matrix.
    randomval = random_state.random_sample(d_shape ** 2).reshape(d_shape, d_shape)
    V_val, _ = np.linalg.qr(randomval, mode='complete')
    # Start with the first half of the rotated dimensions.
    m_val = d_shape // 2
    S_D = np.dot(X.T, X)
    # (d_shape, m_val) selector keeping the first m_val rotated dimensions.
    P_Cluster = np.eye(m_val, M=d_shape).T
    for i in range(max_iter):
        centers_old = centers.copy()
        # Project samples and centers, then assign labels in the projection.
        X_values = np.dot(np.dot(X, V_val), P_Cluster)
        centers_c = np.dot(np.dot(centers, V_val), P_Cluster)
        labels, _ = pairwise_distances_argmin_min(X = X_values, Y = centers_c, metric='euclidean',metric_kwargs={'squared': True})
        labels = labels.astype(np.int32)
        centers = _k_means._centers_dense(X, labels, n_clusters, distances)
        # Sum of per-cluster scatter matrices around the new centers.
        S = np.zeros((d_shape, d_shape))
        for it in range(n_clusters):
            X_it = X[:][labels == it] - centers[:][it]
            S += np.dot(X_it.T, X_it)
        Sigma = S - S_D
        # NOTE(review): only the eigenvalues are used; the eigenvectors are
        # discarded and V_val is never updated inside the loop -- confirm
        # this is intended.
        EV, _ = np.linalg.eigh(Sigma)
        # New subspace size: number of eigenvalues below tol_eig.
        m = len(np.where(EV < tol_eig)[0])
        P_Cluster = np.eye(m, M=d_shape).T
        # Inertia in the full space: squared distances to assigned centers.
        inertia = 0.0
        for j in range(n_clusters):
            inertia += row_norms( X[:][labels == j] - centers[:][j],squared=True).sum()
        # Remember the best (lowest inertia) state seen so far.
        if new_inertia is None or inertia < new_inertia:
            new_labels = labels.copy()
            new_centers = centers.copy()
            new_inertia = inertia
        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            break
    # Centers still moved in the last iteration: recompute labels and
    # inertia for the best centers.
    if center_shift_total > 0:
        new_labels, new_inertia = _labels_inertia(X, x_squared_norms, new_centers, precompute_distances=False, distances=distances)
    return new_labels, new_inertia, new_centers, i + 1
def score(self, X, y=None):
    """Opposite of the value of X on the K-centers objective.

    The objective is taken here as the largest distance from any sample
    in X to its nearest cluster center (NOTE(review): confirm this matches
    the objective used during fitting).

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        New data.
    y : ignored
        Present for API compatibility only.

    Returns
    -------
    score : float
        Opposite of the value of X on the K-centers objective.
    """
    check_is_fitted(self, 'cluster_centers_')

    X = self._check_test_data(X)
    # Samples are the queries and centers the candidates; element [1] of
    # the result holds the distances (element [0] holds indices, which is
    # what used to be negated and returned).
    _, nearest_dist = pairwise_distances_argmin_min(
        X, self.cluster_centers_,
        metric=self.metric, metric_kwargs=self.metric_kw)
    return -nearest_dist.max()
def _assign_labels(X, V, centers_subspace, P_subspace):
    """
    Label every point with the index of its nearest cluster center,
    measured inside one subspace.

    :param X: input data
    :param V: orthogonal rotation matrix
    :param centers_subspace: cluster centers of the subspace
    :param P_subspace: projections of the subspace (column indices into V)
    :return: int32 array with cluster assignments
    """
    # Rotate and crop both the data and the centers to the subspace.
    subspace_basis = V[:, P_subspace]
    projected_points = np.matmul(X, subspace_basis)
    projected_centers = np.matmul(centers_subspace, subspace_basis)

    nearest = pairwise_distances_argmin_min(
        X=projected_points, Y=projected_centers,
        metric='euclidean', metric_kwargs={'squared': True})[0]

    # cython k-means code assumes int32 inputs
    return nearest.astype(np.int32)
def predict(self, data, std_threshold=2.0, min_cluster_size=3):
    """
    flag each row of data as normal or abnormal
    :param data: array-like or sparse matrix of shape = [n_samples, n_features]
    :param std_threshold: multiplier applied to the per-cluster spread when
        comparing a sample against its cluster mean
    :param min_cluster_size: min cluster size, if less then size whole
        group will be abnormal
    :return:
        - predict_list : predict abnormal state
            - -1 : abnormal data
            -  1 : normal data
        - labels: group id of data
    """
    assert len(data) > 0, 'X is empty'
    assert isinstance(self.model, pd.DataFrame), 'not fit'

    mean_columns = [1, 3, 5] if self.encode_type == 2 else [1, 3]
    centers = self.model[mean_columns].values

    labels, _ = pairwise_distances_argmin_min(
        X=data, Y=centers, metric='euclidean',
        metric_kwargs={'squared': True})
    # Row of self.normal_model matched to each sample's cluster.
    matched_stats = self.normal_model.iloc[labels]

    def is_abnormal(position):
        stats = matched_stats.iloc[position]
        if stats[0] + 1 <= min_cluster_size:
            return True
        sample = data[position]
        # Abnormal only when both of the first two features deviate.
        if not math.fabs(sample[0] - stats[1]) > stats[2] * std_threshold:
            return False
        return math.fabs(sample[1] - stats[3]) > stats[4] * std_threshold

    predict_list = np.array(
        [-1 if is_abnormal(position) else 1
         for position in range(len(data))])
    return predict_list, labels
def predict(self, X):
    """Predict the closest cluster each sample in X belongs to.

    In the vector quantization literature, `cluster_centers_` is called
    the code book and each value returned by `predict` is the index of
    the closest code in the code book.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        New data to predict.

    Returns
    -------
    labels : array, shape [n_samples,]
        Index of the cluster each sample belongs to.
    """
    check_is_fitted(self, 'cluster_centers_')

    X = self._check_test_data(X)
    # The samples must be the first argument: the result has one entry
    # per row of the first array, pointing into the second. With the
    # arguments swapped this returned one sample index per center.
    labels, _ = pairwise_distances_argmin_min(
        X, self.cluster_centers_,
        metric=self.metric, metric_kwargs=self.metric_kw)
    return labels
def hsv_method(img, n_clusters, n_colors):
    """Return the n_colors dominant colors of img, clustered in HSV space.

    The image is converted to HSV, its pixels are clustered with
    mini-batch k-means, and for each cluster the pixel closest to the
    center is taken as its color. Colors are ordered by cluster size
    (largest first) and converted back to RGB.
    """
    # one row per pixel, three HSV channels
    hsv_pixels = rgb2hsv(img).reshape(-1, 3)

    # never use fewer clusters than requested colors
    if not n_clusters > n_colors:
        n_clusters = n_colors

    km = MiniBatchKMeans(n_clusters=n_clusters, init='k-means++',
                         n_init=3, random_state=0)
    labels = km.fit_predict(hsv_pixels)
    cluster_sizes = np.bincount(labels)

    # index of the pixel closest to each center
    closest = pairwise_distances_argmin_min(
        km.cluster_centers_, hsv_pixels)[0]

    # most populated clusters first
    by_frequency = np.argsort(cluster_sizes, axis=0)[::-1]
    dominants = 1. * hsv_pixels[closest][by_frequency][:n_colors]

    return [hsv2rgb([[pixel]])[0][0] for pixel in dominants]
def _labels_inertia(X, sample_weight, x_squared_norms, centers, distances,
                    same_cluster_size=False):
    """E step of the K-means EM algorithm.

    Compute the labels and the inertia of the given samples and centers.
    This will compute the distances in-place.

    Parameters
    ----------
    X : float64 array-like or CSR sparse matrix, shape (n_samples, n_features)
        The input samples to assign to the labels.

    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.

    x_squared_norms : array, shape (n_samples,)
        Precomputed squared euclidean norm of each data point. Accepted for
        signature compatibility; this implementation does not read it.

    centers : float array, shape (k, n_features)
        The cluster centers.

    distances : float array, shape (n_samples,)
        Pre-allocated array to be filled in with each sample's squared
        distance to its assigned center. Only written when its length
        equals ``n_samples``.

    same_cluster_size : bool, default False
        When True, samples are assigned greedily so that no cluster receives
        more than ``ceil(n_samples / n_clusters)`` samples. See
        http://jmonlong.github.io/Hippocamplus/2018/06/09/cluster-same-size/#same-size-k-means-variation

    Returns
    -------
    labels : int array of shape(n)
        The resulting assignment

    inertia : float
        Sum of squared distances of samples to their assigned cluster
        center, weighted by ``sample_weight``.
    """
    sample_weight = _check_sample_weight(X, sample_weight)
    n_samples = X.shape[0]
    n_clusters = centers.shape[0]

    if same_cluster_size:
        # Ceiling division: total capacity must be >= n_samples. With floor
        # division the leftover samples (n_samples % n_clusters, or all of
        # them when n_clusters > n_samples) could never be placed and the
        # assignment loop would never terminate.
        cluster_size = -(-n_samples // n_clusters)

        labels = np.zeros(n_samples, dtype=np.int32)
        mindist = np.zeros(n_samples, dtype=np.float32)
        # number of samples already assigned to each cluster
        counters = np.zeros(n_clusters, dtype=np.int32)

        # dist: (n_samples, n_clusters)
        dist = euclidean_distances(X, centers, squared=False)

        # Samples with the largest gap between best and worst center are
        # served first (most negative closeness).
        closeness = dist.min(axis=-1) - dist.max(axis=-1)
        for r in np.argsort(closeness):
            row = dist[r]
            # Full clusters are no longer eligible for this sample.
            row[counters >= cluster_size] = np.inf
            label = row.argmin()
            labels[r] = label
            counters[label] += 1
            # squared distances are used for inertia in this function
            mindist[r] = row[label] ** 2
    else:
        # Breakup nearest neighbor distance computation into batches to
        # prevent memory blowup in the case of a large number of samples and
        # clusters.
        # TODO: Once PR #7383 is merged use check_inputs=False in
        # metric_kwargs.
        labels, mindist = pairwise_distances_argmin_min(
            X=X, Y=centers, metric='euclidean',
            metric_kwargs={'squared': True})

    # cython k-means code assumes int32 inputs
    labels = labels.astype(np.int32, copy=False)
    if n_samples == distances.shape[0]:
        # distances will be changed in-place
        distances[:] = mindist
    inertia = (mindist * sample_weight).sum()
    return labels, inertia
def test_pairwise_distances_argmin_min():
    """Check argmin / min pairwise-distance helpers for several metrics."""
    # Tiny 1-D fixture: X[0] is nearest to Y[0], X[1] is nearest to Y[1].
    X = [[0], [1]]
    Y = [[-2], [3]]
    X_sparse = dok_matrix(X)
    Y_sparse = csr_matrix(Y, dtype=np.float32)

    want_idx = [0, 1]
    want_dist = [2, 2]
    want_sq_dist = [4, 4]

    # Euclidean, dense inputs
    got_idx, got_dist = pairwise_distances_argmin_min(X, Y,
                                                      metric="euclidean")
    got_idx_only = pairwise_distances_argmin(X, Y, metric="euclidean")
    assert_array_almost_equal(got_idx, want_idx)
    assert_array_almost_equal(got_idx_only, want_idx)
    assert_array_almost_equal(got_dist, want_dist)

    # Euclidean, sparse inputs
    sp_idx, sp_dist = pairwise_distances_argmin_min(X_sparse, Y_sparse,
                                                    metric="euclidean")
    assert_array_almost_equal(sp_idx, want_idx)
    assert_array_almost_equal(sp_dist, want_dist)
    # The results must be plain ndarrays, never np.matrix
    assert_equal(type(sp_idx), np.ndarray)
    assert_equal(type(sp_dist), np.ndarray)

    # Squared euclidean
    got_idx, got_dist = pairwise_distances_argmin_min(
        X, Y, metric="euclidean", metric_kwargs={"squared": True})
    assert_array_almost_equal(got_idx, want_idx)
    assert_array_almost_equal(got_dist, want_sq_dist)

    # A non-euclidean scikit-learn metric, dense inputs
    got_idx, got_dist = pairwise_distances_argmin_min(X, Y,
                                                      metric="manhattan")
    got_idx_only = pairwise_distances_argmin(X, Y, metric="manhattan")
    assert_array_almost_equal(got_idx, want_idx)
    assert_array_almost_equal(got_idx_only, want_idx)
    assert_array_almost_equal(got_dist, want_dist)

    # ... and sparse inputs
    sp_idx, sp_dist = pairwise_distances_argmin_min(X_sparse, Y_sparse,
                                                    metric="manhattan")
    assert_array_almost_equal(sp_idx, want_idx)
    assert_array_almost_equal(sp_dist, want_dist)

    # Scipy minkowski distance: first as a callable, then by name
    for scipy_metric in (minkowski, "minkowski"):
        got_idx, got_dist = pairwise_distances_argmin_min(
            X, Y, metric=scipy_metric, metric_kwargs={"p": 2})
        assert_array_almost_equal(got_idx, want_idx)
        assert_array_almost_equal(got_dist, want_dist)

    # Compare the chunked implementation with a naive full-matrix one
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    full = pairwise_distances(X, Y, metric="manhattan")
    naive_idx = full.argmin(axis=0)
    naive_dist = full[naive_idx, np.arange(len(naive_idx))]

    chunked_idx, chunked_dist = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan")
    np.testing.assert_almost_equal(naive_idx, chunked_idx, decimal=7)
    np.testing.assert_almost_equal(naive_dist, chunked_dist, decimal=7)

    # Passing batch_size must emit the deprecation warning
    assert_warns_message(DeprecationWarning, "version 0.22",
                         pairwise_distances_argmin_min, X, Y,
                         batch_size=500, metric='euclidean')
def test_pairwise_distances_argmin_min():
    """Check argmin / min pairwise-distance helpers for several metrics."""
    # Tiny 1-D fixture: X[0] is nearest to Y[0], X[1] is nearest to Y[1].
    X = [[0], [1]]
    Y = [[-2], [3]]
    X_sparse = dok_matrix(X)
    Y_sparse = csr_matrix(Y, dtype=np.float32)

    want_idx = [0, 1]
    want_dist = [2, 2]
    want_sq_dist = [4, 4]

    # Euclidean, dense inputs
    got_idx, got_dist = pairwise_distances_argmin_min(X, Y,
                                                      metric="euclidean")
    got_idx_only = pairwise_distances_argmin(X, Y, metric="euclidean")
    assert_array_almost_equal(got_idx, want_idx)
    assert_array_almost_equal(got_idx_only, want_idx)
    assert_array_almost_equal(got_dist, want_dist)

    # Euclidean, sparse inputs
    sp_idx, sp_dist = pairwise_distances_argmin_min(X_sparse, Y_sparse,
                                                    metric="euclidean")
    assert_array_almost_equal(sp_idx, want_idx)
    assert_array_almost_equal(sp_dist, want_dist)
    # The results must be plain ndarrays, never np.matrix
    assert_equal(type(sp_idx), np.ndarray)
    assert_equal(type(sp_dist), np.ndarray)

    # Squared euclidean
    got_idx, got_dist = pairwise_distances_argmin_min(
        X, Y, metric="euclidean", metric_kwargs={"squared": True})
    assert_array_almost_equal(got_idx, want_idx)
    assert_array_almost_equal(got_dist, want_sq_dist)

    # A non-euclidean scikit-learn metric, dense inputs
    got_idx, got_dist = pairwise_distances_argmin_min(X, Y,
                                                      metric="manhattan")
    got_idx_only = pairwise_distances_argmin(X, Y, metric="manhattan")
    assert_array_almost_equal(got_idx, want_idx)
    assert_array_almost_equal(got_idx_only, want_idx)
    assert_array_almost_equal(got_dist, want_dist)

    # ... and sparse inputs
    sp_idx, sp_dist = pairwise_distances_argmin_min(X_sparse, Y_sparse,
                                                    metric="manhattan")
    assert_array_almost_equal(sp_idx, want_idx)
    assert_array_almost_equal(sp_dist, want_dist)

    # Scipy minkowski distance: first as a callable, then by name
    for scipy_metric in (minkowski, "minkowski"):
        got_idx, got_dist = pairwise_distances_argmin_min(
            X, Y, metric=scipy_metric, metric_kwargs={"p": 2})
        assert_array_almost_equal(got_idx, want_idx)
        assert_array_almost_equal(got_dist, want_dist)

    # Compare the chunked implementation with a naive full-matrix one
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    full = pairwise_distances(X, Y, metric="manhattan")
    naive_idx = full.argmin(axis=0)
    naive_dist = full[naive_idx, np.arange(len(naive_idx))]

    chunked_idx, chunked_dist = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan")
    np.testing.assert_almost_equal(naive_idx, chunked_idx, decimal=7)
    np.testing.assert_almost_equal(naive_dist, chunked_dist, decimal=7)
def subspace_kmeans_single(X, sample_weight, n_clusters, init='k-means++',
                           max_iter=300, tol=1e-4, tol_eig=-1e-10,
                           verbose=False, x_squared_norms=None,
                           random_state=None):
    """Run a single subspace k-means fit.

    Alternates between assigning samples to the nearest center inside a
    rotated, truncated ("clustered") subspace, and re-estimating the
    centers, the rotation ``V`` and the subspace dimensionality ``m``.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        Training data.
    sample_weight : array-like, shape (n_samples,) or None
        Passed through ``_check_sample_weight``.
    n_clusters : int
        Number of clusters.
    init : passed to ``_init_centroids``.
    max_iter : int
        Maximum number of EM iterations.
    tol : float
        Stop when the squared norm of the center shift is <= ``tol``.
    tol_eig : float
        Eigenvalues below this (negative) threshold define the clustered
        subspace; their count becomes ``m``.
    verbose : bool
        Print progress information.
    x_squared_norms : forwarded to ``_init_centroids`` / ``_labels_inertia``.
    random_state : seed or RandomState, normalised by ``check_random_state``.

    Returns
    -------
    best_labels : labels of the lowest-inertia iteration (recomputed in the
        full space when the run did not converge exactly).
    best_inertia : inertia matching ``best_labels``.
    best_centers : centers of the lowest-inertia iteration.
    n_iter : int
        Number of EM iterations actually run.

    Raises
    ------
    ValueError
        If no eigenvalue falls below ``tol_eig``, i.e. the clustered
        subspace would be 0-dimensional.
    """
    random_state = check_random_state(random_state)
    sample_weight = _check_sample_weight(X, sample_weight)

    best_labels, best_inertia, best_centers = None, None, None

    # init
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0], ), dtype=X.dtype)

    # === Beginning of original implementation of initialization ===
    # Dimensionality of original space
    d = X.shape[1]
    # Set initial V as QR-decomposed Q of random matrix
    rand_vals = random_state.random_sample(d**2).reshape(d, d)
    V, _ = np.linalg.qr(rand_vals, mode='complete')
    # Set initial m as d/2
    m = d // 2
    # Scatter matrix of the dataset in the original space
    S_D = np.dot(X.T, X)
    # Projection onto the first m attributes
    P_C = np.eye(m, M=d).T
    # === End of original implementation of initialization ===

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # === Beginning of original implementation of E-step of EM ===
        X_C = np.dot(np.dot(X, V), P_C)
        mu_C = np.dot(np.dot(centers, V), P_C)
        labels, _ = pairwise_distances_argmin_min(
            X=X_C, Y=mu_C, metric='euclidean',
            metric_kwargs={'squared': True})
        labels = labels.astype(np.int32)
        # === End of original implementation of E-step of EM ===

        # computation of the means is also called the M-step of EM
        centers = _k_means._centers_dense(X, sample_weight, labels,
                                          n_clusters, distances)

        # === Beginning of original implementation of M-step of EM ===
        # The cluster index is named ``k`` on purpose: reusing ``i`` here
        # would overwrite the iteration counter that is printed below and
        # returned to the caller.
        S = np.zeros((d, d))
        inertia = 0.0
        for k in range(n_clusters):
            X_k = X[labels == k] - centers[k]
            S += np.dot(X_k.T, X_k)
            inertia += row_norms(X_k, squared=True).sum()

        Sigma = S - S_D
        evals, evecs = np.linalg.eigh(Sigma)
        # Eigenvectors ordered by ascending eigenvalue
        idx = np.argsort(evals)
        V = evecs[:, idx]
        m = len(np.where(evals < tol_eig)[0])
        if m == 0:
            raise ValueError(
                'Dimensionality of clustered space is 0. '
                'The dataset is better explained by a single cluster.')
        P_C = np.eye(m, M=d).T
        # === End of original implementation of M-step of EM ===

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms, best_centers,
                            precompute_distances=False,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
np.argmax(cs_1) ### 0 from sklearn.metrics.pairwise import _argmin_min_reduce cs5 = _argmin_min_reduce(x.values.reshape(1, -1), df1) cs6 = _argmin_min_reduce(x.values.reshape(1, -1), df2) cs5[0][0] = 0 print(cs5) print(cs6) print(np.argmax(cs5)) print(np.argmax(cs6)) r = [0, 0] cs_2 = [cs5[0][0], cs6[0][0]] print(cs_2) ################### [0,15] np.argmax(cs_2) ### 1 from sklearn.metrics.pairwise import pairwise_distances_argmin_min cs7 = pairwise_distances_argmin_min(x.values.reshape(1, -1), df1, metric="euclidean") cs8 = pairwise_distances_argmin_min(x.values.reshape(1, -1), df2, metric="euclidean") cs7[0][0] = 1 print(cs7) print(cs8) print(np.argmax(cs7)) print(np.argmax(cs8)) r = [0, 0] cs_3 = [cs7[0][0], cs8[0][0]] print(cs_3) ################### [1,85] np.argmax(cs_3) ### 1 cs9 = pairwise_distances_argmin_min(x.values.reshape(1, -1), df1,
def _labels_inertia_precompute_dense(X, sample_weight, x_squared_norms,
                                     centers, distances, group=None):
    """Compute labels and inertia using a full distance matrix.

    This will overwrite the 'distances' array in-place.

    Parameters
    ----------
    X : numpy array, shape (n_sample, n_features)
        Input data.

    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.

    x_squared_norms : numpy array, shape (n_samples,)
        Precomputed squared norms of X. Not read by this implementation.

    centers : numpy array, shape (n_clusters, n_features)
        Cluster centers which data is assigned to.

    distances : numpy array, shape (n_samples,)
        Pre-allocated array in which distances are stored. Only written
        when its length equals the number of samples.

    group : None, int or array, optional
        When None, every sample is assigned independently. Otherwise the
        full distance matrix is passed through
        ``mix_utils.const_grouped_mean`` (int) or ``_utils.grouped_mean``
        (array with one entry per sample) before the argmin.
        NOTE(review): presumably this averages distances within each group
        so grouped samples share a label -- confirm in mix_utils.

    Returns
    -------
    labels : numpy array, dtype=np.int, shape (n_samples,)
        Indices of clusters that samples are assigned to.

    inertia : float
        Weighted sum of the per-sample minimum distances. These are squared
        euclidean distances when ``group`` is None.
        NOTE(review): the grouped branch uses metric 'l2' without squaring
        -- confirm this difference is intended.
    """
    n_samples = X.shape[0]

    if group is not None:
        pairwise = pairwise_distances(X=X, Y=centers, metric='l2')
        if isinstance(group, int):
            pairwise = mix_utils.const_grouped_mean(pairwise, group)
        else:
            import mix_utils.mixture.mix_utils._utils as _utils
            assert X.shape[0] == group.shape[0]
            pairwise = _utils.grouped_mean(pairwise, group)
        labels = pairwise.argmin(axis=1)
        row_index = np.arange(pairwise.shape[0])
        mindist = pairwise[row_index, labels]
    else:
        # Breakup nearest neighbor distance computation into batches to
        # prevent memory blowup in the case of a large number of samples
        # and clusters.
        # TODO: Once PR #7383 is merged use check_inputs=False in
        # metric_kwargs.
        labels, mindist = pairwise_distances_argmin_min(
            X=X, Y=centers, metric='euclidean',
            metric_kwargs={'squared': True})

    # cython k-means code assumes int32 inputs
    labels = labels.astype(np.int32)

    if distances.shape[0] == n_samples:
        # distances will be changed in-place
        distances[:] = mindist

    return labels, (mindist * sample_weight).sum()
def _labels_inertia(X, x_squared_norms, centers):
    """Assign every sample to its nearest center and total the cost.

    ``x_squared_norms`` is accepted for signature compatibility but is not
    read here.

    Returns
    -------
    labels : int32 array with the index of the closest center per sample.
    inertia : sum of squared euclidean distances to those centers.
    """
    nearest, squared_dist = pairwise_distances_argmin_min(
        X=X,
        Y=centers,
        metric='euclidean',
        metric_kwargs={'squared': True},
    )
    return nearest.astype(np.int32, copy=False), squared_dist.sum()
def mcbc(data, n_clusters, m, max_iter=100, tol=1e-4):
    """Cluster ``data`` so that every cluster ends up with at least ``m`` members.

    Each iteration assigns samples to their nearest center, pins ("unfree")
    the ``m`` members closest to each over-full cluster's center plus all
    members of under-full clusters, then repeatedly moves the cheapest free
    sample into an under-full cluster until no cluster is short. Centers are
    then recomputed as the mean of their members.

    Parameters
    ----------
    data : numpy array, shape (n_samples, n_features)
    n_clusters : int
        Number of clusters.
    m : int
        Minimum number of members per cluster.
    max_iter : int
        Maximum number of assign/update iterations.
    tol : float
        Stop when the squared total center shift drops below this value.

    Returns
    -------
    numpy array of unsigned ints, shape (n_samples,)
        Cluster index of every sample from the last iteration run.

    Raises
    ------
    ValueError
        If ``n_samples < n_clusters * m``: the constraint cannot be met and
        the rebalancing loop would never terminate.
    """
    n_samples, _ = data.shape
    if n_samples < n_clusters * m:
        raise ValueError(
            "Cannot give %d clusters at least %d members each with only "
            "%d samples" % (n_clusters, m, n_samples))

    km = KMeans(n_clusters=n_clusters, max_iter=1)
    centers = km.fit(data).cluster_centers_

    for iteration in range(max_iter):
        # Assign points to clusters
        nearest_center, nearest_dist = pairwise_distances_argmin_min(
            data, centers)

        is_unfree = np.zeros([n_samples], dtype=bool)
        # mh[c]: how many more members cluster c still needs
        mh = np.zeros([n_clusters])
        iter_cluster_assignments = np.zeros([n_samples], dtype=np.uint)
        unsatisified = 0

        for cluster in range(n_clusters):
            members = np.where(nearest_center == cluster)[0]
            if len(members) > m:
                # Pin the m members closest to the center. ``nearest_dist``
                # already holds each member's distance to this center, so
                # no tree query is needed (the earlier KD-tree query could
                # return the inserted center at a position other than the
                # first and yield an index of -1).
                closest = members[np.argsort(nearest_dist[members])[:m]]
                is_unfree[closest] = True
                mh[cluster] = 0
            else:
                is_unfree[members] = True
                mh[cluster] = m - len(members)
            unsatisified = unsatisified + mh[cluster]
            iter_cluster_assignments[members] = cluster

        hsupporti = np.zeros([n_samples], dtype=np.uint)
        dists = euclidean_distances(data, centers)
        print("Iteration %d, unsatisfied: %d" % (iteration, unsatisified))

        while unsatisified > 0:
            # Clusters that still need members; fixed for this pass.
            mh_candidates_ind = mh.nonzero()[0]
            xi0 = -1
            min_xi0 = np.inf
            for sample in range(n_samples):
                if is_unfree[sample]:
                    continue
                sample_dists = dists[sample]
                dneari = np.min(sample_dists)
                candidate_dists = sample_dists[mh_candidates_ind]
                mh_min_ind = np.argmin(candidate_dists)
                dsupporti = candidate_dists[mh_min_ind]
                hsupporti[sample] = mh_candidates_ind[mh_min_ind]
                # Extra squared distance paid by moving this sample
                diff = dsupporti**2 - dneari**2
                if diff < min_xi0:
                    min_xi0 = diff
                    xi0 = sample

            is_unfree[xi0] = True
            iter_cluster_assignments[xi0] = hsupporti[xi0]
            if mh[hsupporti[xi0]] > 0:
                mh[hsupporti[xi0]] = mh[hsupporti[xi0]] - 1
                unsatisified = unsatisified - 1

        # Update centers
        new_centers = np.array([
            np.mean(data[iter_cluster_assignments == cluster], axis=0)
            for cluster in range(n_clusters)
        ])

        center_shift = np.sqrt(np.sum((new_centers - centers)**2, axis=0))
        center_shift_total = np.sum(center_shift)
        print(center_shift_total)
        if center_shift_total**2 < tol:
            print("center shift %e within tolerance %e" %
                  (center_shift_total, tol))
            break
        centers = new_centers

    return iter_cluster_assignments
def select_instance(X_training, X_pool, X_training_feat, X_pool_feat,
                    n_annotated,
                    X_uncertainty: np.ndarray,
                    mask: np.ndarray,
                    metric: Union[str, Callable],
                    n_jobs: Union[int, None] = -1):
    """
    Pick the next unlabeled record, balancing uncertainty and dissimilarity.

    Implements one step of ranked batch-mode active learning (Cardoso et
    al., https://www.sciencedirect.com/science/article/pii/S0020025516313949):
    each still-available pool record is scored as
    ``alpha * (1 - similarity) + (1 - alpha) * uncertainty`` and the record
    with the highest score is returned.

    TODO:
        - Add notebook for Active Learning bake-off (passive vs interactive
          vs batch vs ranked batch)

    Args:
        X_training: Mix of both labeled and unlabeled records.
        X_pool: Unlabeled records to be selected for labeling.
        X_training_feat: Feature vectors for the training data, or None to
            use the flattened ``X_training`` instead.
        X_pool_feat: Feature vectors for the unlabeled data, or None to use
            the flattened ``X_pool`` instead.
        n_annotated: Count added to ``X_training.shape[0]`` when computing
            the number of labeled records.
        X_uncertainty: Uncertainty scores for the records in ``X_pool``.
        mask: Mask that excludes previously selected instances from the
            pool. It is modified in place: the chosen entry is set to 0.
        metric: Distance metric, passed to ``pairwise_distances_argmin_min``
            or ``cdist`` depending on ``n_jobs``.
        n_jobs: ``1`` or ``None`` selects ``pairwise_distances_argmin_min``;
            any other value selects ``cdist``.

    Returns:
        Tuple of (index into ``X_pool`` of the chosen record, that record
        with a leading axis of length 1, the updated mask).
    """
    available_pool = X_pool[mask]

    # Sizes of the labeled and still-unlabeled sets.
    n_labeled_records = X_training.shape[0] + n_annotated
    n_unlabeled = available_pool.shape[0]

    # alpha = |U| / (|U| + |D|). It changes between calls because records
    # move from the pool to the labeled set as the batch is built.
    alpha = n_unlabeled / (n_unlabeled + n_labeled_records)

    # TODO: replace this part with a better similarity computation.
    # Use the supplied feature vectors when both are given; otherwise fall
    # back to the raw records flattened to one row each.
    if X_pool_feat is not None and X_training_feat is not None:
        pool_features = X_pool_feat[mask]
        training_features = X_training_feat
    else:
        pool_features = available_pool.reshape((len(available_pool), -1))
        training_features = X_training.reshape((len(X_training), -1))

    # Distance from every available pool record to its closest training
    # record; the result has shape (n_unlabeled,).
    if n_jobs is None or n_jobs == 1:
        _, distance_scores = pairwise_distances_argmin_min(
            pool_features, training_features, metric=metric)
    else:
        all_pairs = cdist(pool_features, training_features, metric=metric)
        distance_scores = all_pairs.min(axis=1)

    similarity_scores = 1 / (1 + distance_scores)

    # Blend dissimilarity to the training set with model uncertainty.
    dissimilarity_term = alpha * (1 - similarity_scores)
    uncertainty_term = (1 - alpha) * X_uncertainty[mask]
    scores = dissimilarity_term + uncertainty_term

    # Translate the winner's position among available records back to its
    # index in the full pool.
    winner_position = np.argmax(scores)
    best_instance_index = int(np.flatnonzero(mask)[winner_position])

    mask[best_instance_index] = 0
    chosen_record = np.expand_dims(X_pool[best_instance_index], axis=0)
    return best_instance_index, chosen_record, mask