def get_metric_function(metric=None):
    """
    Parameters
    ----------
    metric : str or function or None, default: None
        Using None is equivalent to using "manhattan".

        If str, then this string specifies the distance metric (from
        scikit-learn) to use for calculating the objective function.
        Possible values are:

        * "cityblock" for sklearn.metrics.pairwise.manhattan_distances
        * "cosine" for sklearn.metrics.pairwise.cosine_distances
        * "euclidean" for sklearn.metrics.pairwise.euclidean_distances
        * "l1" for sklearn.metrics.pairwise.manhattan_distances
        * "l2" for sklearn.metrics.pairwise.euclidean_distances
        * "manhattan" for sklearn.metrics.pairwise.manhattan_distances

        If function, then this function should take two arguments and return
        a scalar value. Furthermore, the following conditions must be
        fulfilled:

        1. d(a, b) >= 0, for all a and b
        2. d(a, b) == 0, if and only if a = b, positive definiteness
        3. d(a, b) == d(b, a), symmetry
        4. d(a, c) <= d(a, b) + d(b, c), the triangle inequality

    Returns
    -------
    metric_func : function
        If the `metric` argument is a function, it is returned.
        If the `metric` argument is a string, then the corresponding distance
        metric function from `sklearn.metrics.pairwise` is returned.
    """
    if metric is None:
        metric = "manhattan"
    if isinstance(metric, str):
        try:
            return distance_metrics()[metric]
        except KeyError:
            raise ValueError(
                "{} is not a known metric. Please use one of the "
                "following metrics: {}".format(
                    metric,
                    tuple(
                        name
                        for name in distance_metrics().keys()
                        if name != "precomputed"
                    ),
                )
            )
    elif callable(metric):
        return metric
    else:
        raise ValueError(
            "A {} was passed as `metric` argument. "
            "Please pass a string or a function "
            "instead.".format(type(metric))
        )
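# Usage sketch for get_metric_function (assumes the function above is in
# scope together with its distance_metrics import; values shown are what
# the sklearn metrics return for these inputs):
import numpy as np
from sklearn.metrics.pairwise import distance_metrics

a = np.array([[0.0, 0.0]])
b = np.array([[3.0, 4.0]])

# A string is looked up in sklearn's registry of pairwise distance functions.
dist = get_metric_function("euclidean")
print(dist(a, b))  # [[5.]]

# A callable is passed through unchanged.
custom = get_metric_function(lambda u, v: np.abs(u - v).sum())
print(custom(a, b))  # 7.0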
def run_embedding_lookup_distance(self, querys, metric):
    """
    Calculate the embedding distance of all queries against the lookup
    database.

    :param querys: queries for which distances should be calculated
    :param metric: metric to use to calculate distances
    :return: distances, query ids
    """
    if metric in pairwise.distance_metrics():
        if isinstance(querys, dict):
            query_ids, raw_data_query = zip(*querys.items())
        else:
            raw_data_query = querys
            query_ids = range(0, numpy.shape(querys)[0])
        raw_data_query = numpy.array(raw_data_query).squeeze()
        if len(query_ids) == 1:
            raw_data_query = raw_data_query.reshape(1, -1)
        distances = pairwise_distances(raw_data_query, self.raw_data,
                                       metric=metric)
    else:
        sys.exit("{} is not a correct distance metric\n"
                 "See <sklearn.metrics.pairwise.distance_metrics()> "
                 "for all possible distance metrics".format(metric))
    return distances, query_ids
def get_adj(train_data, test_data, k, alpha, kappa):
    eps = np.finfo(float).eps
    emb_all = np.append(train_data, test_data, axis=0)
    N = emb_all.shape[0]
    # Cosine similarity = 1 - cosine distance.
    metric = distance_metrics()['cosine']
    S = 1 - metric(emb_all, emb_all)
    S = torch.tensor(S)
    S = S - torch.eye(S.shape[0])
    if k > 0:
        # Keep only the k strongest similarities per row, symmetrized.
        topk, indices = torch.topk(S, k)
        mask = torch.zeros_like(S)
        mask = mask.scatter(1, indices, 1)
        mask = ((mask + torch.t(mask)) > 0).type(torch.float32)
        S = S * mask
    # Symmetrically normalized adjacency, then kappa-step propagation.
    D = S.sum(0)
    Dnorm = torch.diag(torch.pow(D, -0.5))
    E = torch.matmul(Dnorm, torch.matmul(S, Dnorm))
    E = alpha * torch.eye(E.shape[0]) + E
    E = torch.matrix_power(E, kappa)
    E = E.cuda()
    train_data = train_data - train_data.mean(0)
    train_data_norm = train_data / LA.norm(train_data, 2, 1)[:, None]
    test_data = test_data - test_data.mean(0)
    test_data_norm = test_data / LA.norm(test_data, 2, 1)[:, None]
    features = np.append(train_data_norm, test_data_norm, axis=0)
    features = torch.tensor(features).cuda()
    return E, features
def _localReadMoreXML(self, xmlNode):
    """
    Method that reads the portion of the xml input that belongs to this
    specialized class and initializes internal parameters
    @ In, xmlNode, xml.etree.Element, Xml element node
    @ Out, None
    """
    self.distParams = {}
    for child in xmlNode:
        if child.tag == 'metricType':
            self.metricType = child.text
        else:
            self.distParams[str(child.tag)] = utils.tryParse(child.text)
    # dict.keys() views cannot be concatenated with "+" in Python 3, so
    # convert them to lists first.
    availableMetrics = (list(pairwise.kernel_metrics().keys())
                        + list(pairwise.distance_metrics().keys())
                        + list(scores.keys()))
    if self.metricType not in availableMetrics:
        metricList = ', '.join(availableMetrics[:-1]) + ', or ' + availableMetrics[-1]
        self.raiseAnError(IOError,
                          'Metric SKL error: metricType ' + str(self.metricType)
                          + ' is not available. Available metrics are: '
                          + metricList + '.')
    for key, value in self.distParams.items():
        try:
            newValue = ast.literal_eval(value)
            if type(newValue) == list:
                newValue = np.asarray(newValue)
            self.distParams[key] = newValue
        except (ValueError, SyntaxError, TypeError):
            self.distParams[key] = value
def kNN(X, k, measure='euclidean'):
    """
    Construct pairwise weights by finding the k nearest neighbors to
    each point and assigning a Gaussian-based distance.

    Parameters
    ----------
    X : [n_samples, n_dim] array
    k : int
        number of neighbors for each sample in X
    """
    from scipy.spatial import distance

    weights = []
    # sklearn metrics can be computed in parallel; fall back to
    # scipy.spatial.distance.cdist for anything else.
    parallelized_metrics = list(distance_metrics().keys())
    if measure in parallelized_metrics:
        w = pairwise_distances(X=X, Y=X, metric=measure, n_jobs=-1)
    else:
        w = distance.cdist(X, X, measure)
    y = np.argsort(w, axis=1)
    for i, x in enumerate(X):
        # Skip column 0 (self-distance); take the k nearest neighbors.
        distances, indices = w[i, y[i, 1:k + 1]], y[i, 1:k + 1]
        for (d, j) in zip(distances, indices):
            if i < j:
                weights.append((i, j, d * d))
            else:
                weights.append((j, i, d * d))
    weights = sorted(weights, key=lambda r: (r[0], r[1]))
    return np.unique(np.asarray(weights), axis=0)
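# Toy usage sketch for kNN above (imports restated so the snippet is
# self-contained; the edge list layout is (i, j, squared_distance) with i < j):
import numpy as np
from sklearn.metrics.pairwise import distance_metrics, pairwise_distances

rng = np.random.default_rng(0)
X = rng.normal(size=(10, 3))

edges = kNN(X, k=3)
print(edges.shape)  # at most (10 * 3, 3); mutual pairs are deduplicated
print(edges[:3])    # first few (i, j, d^2) rows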
def __init__(self,
             sourcedataset: Dataset,
             alpha: Optional[float] = .5,
             length: Optional[str] = "auto",
             batchsize: Optional[int] = 5000,
             store: Optional[bool] = True,
             seed: Optional[int] = 0,
             averageneighbors: Optional[int] = 2,
             maxneighbors: Optional[int] = 100,
             distmetric: Optional[str] = "l2",
             transform: Optional[Callable] = None,
             limitdata: Optional[int] = None):
    """
    Args:
        sourcedataset: an iterable Dataset
        alpha (float): concentration parameter for the Dirichlet distribution
        length (integer or "auto"): length of this Dataset; "auto" will create
            a Dataset of size averageneighbors * (length of input dataset)
        batchsize (integer): number of points per batch to use in computing
            the interpoint distances
        store (binary): whether to store the data or not
        seed (integer): seed used to generate this Dataset
        averageneighbors (integer): number of neighbors to average over for
            each sample
        maxneighbors (integer): maximum number of nearest neighbors to store
            for each point
        distmetric (string): pairwise distance function used to select nearby
            points; must be a key in sklearn.metrics.pairwise.distance_metrics()
        transform (function): transform for image data sets
        limitdata (integer): only work with this initial portion of the
            source dataset
    """
    self.sourcedataset = sourcedataset
    if limitdata is None:
        self.sourcelen = len(self.sourcedataset)
    else:
        self.sourcelen = limitdata
    self.alpha = alpha
    if length == "auto":
        self.len = averageneighbors * self.sourcelen
    else:
        self.len = length
    self.batchsize = batchsize
    self.store = store
    self.seed = seed
    self.averageneighbors = averageneighbors
    self.maxneighbors = maxneighbors
    self.distmetric = distance_metrics()[distmetric]
    self.transform = transform
    self.testpoint = sourcedataset[0][0]
    self.testtarget = sourcedataset[0][1]
    self.computedistances()
    self.generator = torch.manual_seed(self.seed)
    self.computesamples(self.store)
    # copied from the VisionDataset class to handle transforms
    has_separate_transform = transform is not None
def optimal_fuzzifier(data):
    """
    Determines the optimal value of the fuzzifier for the fuzzy c-means
    algorithm, employing the method described in Dembele & Kastner 2003.

    Reference:
        Dembele, D., & Kastner, P. (2003). Fuzzy C-means method for
        clustering microarray data. Bioinformatics, 19(8), 973-980.

    Params:
        data: ndarray of shape (n_samples, n_features)
            - original data which needs to be clustered by Fuzzy C-Means

    Returns:
        mOpt: float
    """
    p = data.shape[1]  # number of dimensions
    min_m = 1.0001     # Minimum possible value of m
    max_m = 50         # Realistic value corresponding to m -> np.inf
    m = 2              # Initialize m for numerical approximation
    delta = 0.0001     # Acceptable error range for the coefficient of variation of Ym

    euclidean = distance_metrics()['euclidean']

    def compute_cv(m):
        # Coefficient of variation of Ym = d(x_i, x_j)^(1/(m-1)) over all pairs.
        Ym = []
        for i in range(data.shape[0]):
            for j in range(i + 1, data.shape[0]):
                temp = euclidean(data[i].reshape(1, -1),
                                 data[j].reshape(1, -1))[0][0]
                Ym.append(pow(temp, 1 / (m - 1)))
        return stdev(Ym) / mean(Ym)

    # Bisection search for m such that cv(Ym) is within delta of 0.03 * p.
    cv_Ym = compute_cv(m)
    while not ((0.03 * p) - delta < cv_Ym < (0.03 * p) + delta):
        if cv_Ym <= (0.03 * p) - delta:
            max_m = m
        else:
            min_m = m
        m = (min_m + max_m) / 2
        cv_Ym = compute_cv(m)

    if m > 10:
        mOpt = 2
    else:
        mOpt = 1 + (m / 10)
    return mOpt
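# The nested loops above are O(n^2) in Python; the same coefficient of
# variation can be computed in a few vectorized lines. A sketch using
# scipy.spatial.distance.pdist (an equivalent alternative, not the
# reference implementation from the paper):
import numpy as np
from scipy.spatial.distance import pdist

def cv_ym_vectorized(data, m):
    # pdist returns the condensed vector of all pairwise Euclidean
    # distances, exactly the values the nested loops collect into Ym.
    ym = pdist(data, metric='euclidean') ** (1.0 / (m - 1.0))
    return ym.std(ddof=1) / ym.mean()  # ddof=1 matches statistics.stdev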
def silhouette_samples(X, labels, metric='euclidean'):
    metric = distance_metrics()[metric]
    n = labels.shape[0]
    A = np.array(
        [intra_cluster_distance(X, labels, metric, i) for i in range(n)])
    B = np.array(
        [nearest_cluster_distance(X, labels, metric, i) for i in range(n)])
    sil_samples = (B - A) / np.maximum(A, B)
    # nan values are for clusters of size 1, and should be 0
    return np.nan_to_num(sil_samples)
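# The helpers intra_cluster_distance / nearest_cluster_distance are defined
# elsewhere; scikit-learn ships the same computation, which is handy as a
# reference to validate the hand-rolled version above against:
import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 0.3, (25, 2)), rng.normal(3, 0.3, (25, 2))])
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)

ref = metrics.silhouette_samples(X, labels, metric='euclidean')
print(ref.mean())  # close to 1 for well-separated blobs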
def distance(self, x, y=None, **kwargs):
    """
    This method returns the distance between two points x and y. If y is
    not provided then x is a pointSet and a distance matrix is returned
    @ In, x, numpy.ndarray, array containing data of x
    @ In, y, numpy.ndarray, array containing data of y
    @ Out, value, float or numpy.ndarray, distance between x and y (if y
      is provided) or a square distance matrix if y is None
    """
    if y is not None:
        if isinstance(x, np.ndarray) and isinstance(y, np.ndarray):
            dictTemp = utils.mergeDictionaries(kwargs, self.distParams)
            if self.metricType in pairwise.kernel_metrics().keys():
                # kernel_metrics() is only the name->function registry;
                # pairwise_kernels is the function that evaluates a kernel.
                value = pairwise.pairwise_kernels(X=x, Y=y,
                                                  metric=self.metricType,
                                                  **dictTemp)
            elif self.metricType in pairwise.distance_metrics():
                value = pairwise.pairwise_distances(X=x, Y=y,
                                                    metric=self.metricType,
                                                    **dictTemp)
            return value
        else:
            self.raiseAnError(IOError,
                              'Metric SKL error: SKL metrics support only '
                              'PointSets and not HistorySets')
    else:
        if self.metricType == 'mahalanobis':
            covMatrix = np.cov(x.T)
            kwargs['VI'] = np.linalg.inv(covMatrix)
        dictTemp = utils.mergeDictionaries(kwargs, self.distParams)
        if self.metricType in pairwise.kernel_metrics().keys():
            value = pairwise.pairwise_kernels(X=x, metric=self.metricType,
                                              **dictTemp)
        elif self.metricType in pairwise.distance_metrics().keys():
            value = pairwise.pairwise_distances(X=x, metric=self.metricType,
                                                **dictTemp)
        return value
def distance(self, x, y=None, **kwargs):
    """
    This method returns the distance between two points x and y. If y is
    not provided then x is a pointSet and a distance matrix is returned
    @ In, x, numpy.ndarray, array containing data of x; if a 1D array is
      provided, it will be reshaped via x.reshape(1,-1)
    @ In, y, numpy.ndarray, array containing data of y; if a 1D array is
      provided, it will be reshaped via y.reshape(1,-1)
    @ Out, value, numpy.ndarray, distance between x and y (if y is
      provided) or a square distance matrix if y is None
    """
    if y is not None:
        if isinstance(x, np.ndarray) and isinstance(y, np.ndarray):
            if len(x.shape) == 1:
                # self.raiseAWarning(self, "1D array is provided. For consistency, this array is reshaped via x.reshape(1,-1)")
                x = x.reshape(1, -1)
            if len(y.shape) == 1:
                # self.raiseAWarning(self, "1D array is provided. For consistency, this array is reshaped via y.reshape(1,-1)")
                y = y.reshape(1, -1)
            dictTemp = utils.mergeDictionaries(kwargs, self.distParams)
            if self.metricType in pairwise.kernel_metrics().keys():
                value = pairwise.pairwise_kernels(X=x, Y=y, metric=self.metricType, **dictTemp)
            elif self.metricType in pairwise.distance_metrics():
                value = pairwise.pairwise_distances(X=x, Y=y, metric=self.metricType, **dictTemp)
            if value.shape == (1, 1):
                return value[0]
            else:
                return value
        else:
            self.raiseAnError(IOError, 'Metric SKL error: SKL metrics support only PointSets and not HistorySets')
    else:
        if self.metricType == 'mahalanobis':
            covMatrix = np.cov(x.T)
            kwargs['VI'] = np.linalg.inv(covMatrix)
        dictTemp = utils.mergeDictionaries(kwargs, self.distParams)
        if self.metricType in pairwise.kernel_metrics().keys():
            value = pairwise.pairwise_kernels(X=x, metric=self.metricType, **dictTemp)
        elif self.metricType in pairwise.distance_metrics().keys():
            value = pairwise.pairwise_distances(X=x, metric=self.metricType, **dictTemp)
        if value.shape == (1, 1):
            return value[0]
        else:
            return value
def edge_weight(x, y, mode='rbf', gamma=0.5):
    dists = distance_metrics()
    kernels = kernel_metrics()
    # Register two custom kernels alongside the sklearn ones.
    kernels['bhattacharya'] = bhattacharya
    kernels['intersection'] = intersection
    if mode in dists:
        diff = dists[mode](x, y)
    elif mode in kernels:
        diff = kernels[mode](x, y, gamma=gamma)
    else:
        raise Exception('Mode not recognised')
    return np.float64(diff)
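# Usage sketch for edge_weight. The custom bhattacharya / intersection
# kernels are defined elsewhere in the project; the placeholders below are
# purely hypothetical stand-ins so this snippet runs:
import numpy as np
from sklearn.metrics.pairwise import distance_metrics, kernel_metrics

def bhattacharya(x, y, gamma=None):
    # Placeholder: Bhattacharyya coefficient for non-negative histograms.
    return np.sum(np.sqrt(x * y))

def intersection(x, y, gamma=None):
    # Placeholder: histogram intersection.
    return np.sum(np.minimum(x, y))

a = np.array([[1.0, 2.0, 3.0]])
b = np.array([[2.0, 2.0, 2.0]])
print(edge_weight(a, b, mode='euclidean'))       # distance branch
print(edge_weight(a, b, mode='rbf', gamma=0.5))  # kernel branch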
def make_disk_graph(X, radius, metric='euclidean'):
    """Make a generalized disk graph, in which points whose distance is
    less than a certain radius are considered adjacent.

    Params:
        X: a 2D numpy array of shape (n_observations, n_features).
        radius: the radius of disks for adjacency.
        metric: string, representing which metric. Options are given by
            sklearn.metrics.pairwise.distance_metrics. Default is
            'euclidean'.

    Returns:
        a networkx simple Graph
    """
    metric = distance_metrics()[metric]
    dist = metric(X)
    # np.float was removed in NumPy 1.24; use the builtin float instead.
    adj = np.asarray(dist < radius, dtype=float)
    return from_numpy_matrix(adj, create_using=Graph)
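# Usage sketch (networkx < 3.0, where from_numpy_matrix still exists; note
# that the zero diagonal satisfies dist < radius, so each node also gets a
# self-loop):
import numpy as np
from networkx import Graph, from_numpy_matrix
from sklearn.metrics.pairwise import distance_metrics

rng = np.random.default_rng(42)
X = rng.uniform(size=(20, 2))
G = make_disk_graph(X, radius=0.3)
print(G.number_of_nodes(), G.number_of_edges())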
def agglomerative(X, affinity, thres, n_clusters=None, p=-3):
    '''
    X: A list of n data features. Each entry is an array of data of size
        n_samples * n_dimension. A different affinity is applied to each
        feature.
    thres: The distance threshold to separate two clusters.
    affinity: A list of n affinities. The distance metric to separate
        clusters. Could be 'p_norm', 'euclidean', 'cosine'.
    n_clusters: The resulting number of clusters.
    p: the p value if p-norm is used as affinity
    return: The estimated labels for each datapoint.
    '''
    if len(set(affinity)) == 1:
        # Apply the same affinity to all features.
        affinity = affinity[0]
        if isinstance(X, list):
            X = np.concatenate(X, axis=1)
        if affinity in distance_metrics():
            ac = AgglomerativeClustering(n_clusters=n_clusters,
                                         affinity=affinity,
                                         linkage='average',
                                         distance_threshold=thres)
            estimated_labels = ac.fit_predict(X)
        elif affinity == 'p_norm':
            ac = AgglomerativeClustering(n_clusters=n_clusters,
                                         affinity='precomputed',
                                         linkage='average',
                                         distance_threshold=thres)
            distances = p_norm_distance(X, p=p)
            estimated_labels = ac.fit_predict(distances)
    elif len(set(affinity)) != 1:
        # Apply different affinities to different features.
        ac = AgglomerativeClustering(n_clusters=n_clusters,
                                     affinity='precomputed',
                                     linkage='average',
                                     distance_threshold=thres)
        n_data = X[0].shape[0]
        distances = np.zeros((n_data, n_data))
        for i, data in enumerate(X):
            if affinity[i] == 'p_norm':
                distances += p_norm_distance(X[i], p=p)
            elif affinity[i] == 'euclidean':
                distances += euclidean_distance(X[i])
            elif affinity[i] == 'cosine':
                distances += cos_distance(X[i])
        estimated_labels = ac.fit_predict(distances)
    return estimated_labels
def plot_graph_layout(embedding_set, kind="cosine", **kwargs):
    """
    Handles the plotting of a layout graph using the embeddings in an
    embedding set as input.

    **Input**
    - embedding_set: a set of `whatlies.Embedding` objects to plot
    - kind: distance metric, options: 'cityblock', 'cosine', 'euclidean',
      'l2', 'l1', 'manhattan'
    """
    vectors = [token.vector for k, token in embedding_set.items()]
    label_dict = {i: w for i, (w, _) in enumerate(embedding_set.items())}
    dist_fnc = distance_metrics()[kind]
    dist = dist_fnc(np.array(vectors), np.array(vectors))
    # Create graph
    graph = nx.from_numpy_matrix(dist)
    distance = pd.DataFrame(dist).to_dict()
    # Change layout positions of the graph
    pos = nx.kamada_kawai_layout(graph, dist=distance)
    # Draw nodes and labels
    nx.draw_networkx_nodes(graph, pos, node_color="b", alpha=0.5)
    nx.draw_networkx_labels(graph, pos, labels=label_dict, **kwargs)
def agglomerative(X, affinity, thres, n_clusters=None, p=-3):
    '''
    X: An array of data of size n_samples * n_dimension
    thres: The distance threshold to separate two clusters.
    affinity: The distance metric to separate clusters. Could be 'p_norm',
        'euclidean', 'cosine'.
    n_clusters: The resulting number of clusters.
    p: the p value if p-norm is used as affinity
    return: The estimated labels for each datapoint.
    '''
    if affinity in distance_metrics():
        ac = AgglomerativeClustering(n_clusters=n_clusters,
                                     affinity=affinity,
                                     linkage='average',
                                     distance_threshold=thres)
        estimated_labels = ac.fit_predict(X)
    elif affinity == 'p_norm':
        ac = AgglomerativeClustering(n_clusters=n_clusters,
                                     affinity='precomputed',
                                     linkage='average',
                                     distance_threshold=thres)
        distances = p_norm_distance(X, p=p)
        estimated_labels = ac.fit_predict(distances)
    return estimated_labels
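# Usage sketch for the single-affinity variant. Note that
# AgglomerativeClustering requires n_clusters=None when distance_threshold
# is set, and the affinity keyword was renamed to metric in newer
# scikit-learn releases:
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import distance_metrics

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 0.2, size=(20, 2)),
               rng.normal(3, 0.2, size=(20, 2))])

labels = agglomerative(X, affinity='euclidean', thres=1.0)
print(np.unique(labels))  # two well-separated blobs -> [0 1]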
def cluster_diameter(data, labelsMat, mode='max', dist_metric='euclidean'):
    """
    Computes the diameter for each cluster from the given dataset and
    clustering labels.

    Params:
        data: ndarray of shape (n_samples, n_features)
            - original data elements which were clustered
        labelsMat: ndarray of shape (n_samples, n_clusteringAlgorithms)
            - labels assigned by each clustering algorithm, stored in columns
            - assigned labels are in the range [0, n_clusters]
        mode: Either of the four defined below
            - 'max' :: Maximum distance between any 2 points of a cluster
            - 'avg' :: Mean distance between all pairs within the cluster
            - 'avg_centroid' :: Twice the mean distance of every point from
              the cluster centroid
            - 'far_centroid' :: Twice the distance between the centroid and
              the farthest point from it within the same cluster
        dist_metric: string, can be one of the following:
            - 'manhattan' or 'l1'
            - 'euclidean' or 'l2'
            - 'cosine'
            - 'haversine'

    Returns:
        diamCluster: list[ndarray (n_clusters,)] of length n_clusteringAlgorithms
            - every element in the list is a numpy array containing the
              diameters of the clusters defined by one clustering algorithm
            - list elements arranged according to labelsMat
    """
    # Change labelsMat shape if only one clustering algorithm is used
    if len(labelsMat.shape) == 1:
        labelsMat = np.expand_dims(labelsMat, 1)
    if mode not in ('max', 'avg', 'avg_centroid', 'far_centroid'):
        raise Exception("Unsupported MODE to calculate cluster diameter")
    # Look up the requested metric once instead of hardcoding 'euclidean'.
    dist = distance_metrics()[dist_metric]
    if mode in ('avg_centroid', 'far_centroid'):
        centersMat = calc_cluster_centroids(data, labelsMat)
    diamCluster = []
    for j in range(labelsMat.shape[1]):  # Iterate over n_clusteringAlgorithms
        # Find the number of clusters corresponding to each algorithm
        nClusters = np.max(labelsMat[:, j]) + 1
        clusterDiameter = np.zeros((nClusters))
        # Dictionary of clusters with labels as keys and lists of sample
        # vectors (rows corresponding to that label) as values
        clusters = dict()
        for i in range(data.shape[0]):  # Iterate over n_samples
            clusters.setdefault(labelsMat[i, j], []).append(data[i, :])
        for k in range(nClusters):
            if len(clusters[k]) == 1:
                clusterDiameter[k] = 0
            elif mode == 'max':
                clusterDiameter[k] = np.max(dist(clusters[k]))
            elif mode == 'avg':
                clusterDiameter[k] = np.sum(dist(clusters[k])) / (2 * len(clusters[k]))
            elif mode == 'avg_centroid':
                clusterDiameter[k] = 2 * np.sum(
                    dist(np.array(clusters[k]),
                         centersMat[j][k].reshape(1, -1))) / len(clusters[k])
            elif mode == 'far_centroid':
                clusterDiameter[k] = 2 * np.max(
                    dist(np.array(clusters[k]),
                         centersMat[j][k].reshape(1, -1)))
        diamCluster.append(clusterDiameter)
    return diamCluster
def xie_beni_score(data, labels=None, fuzzyMembershipMat=None, fuzzifier=2):
    """
    Computes the Xie-Beni index score for the clustering algorithm whose
    corresponding labels are defined in the vector 'labels' or whose
    per-sample memberships are defined in the fuzzy membership matrix
    (i.e. 'fuzzyMembershipMat'). If 'fuzzyMembershipMat' is 'None' upon
    call to this function, the 'labels' vector is used to define the
    membership of each sample to its labelled cluster as 1 and to all
    others as 0. In that case, each cluster center is also computed as the
    mean of all the points belonging to that cluster.

    Params:
        data: ndarray of shape (n_samples, n_features)
            - original data elements which were clustered
        labels: ndarray of shape (n_samples,)
            - labels assigned by the clustering algorithm
            - 'None' signifies that the provided algorithm is Fuzzy C-Means
        fuzzyMembershipMat: ndarray of shape (n_clusters, n_samples)
            - matrix defining the fuzzy membership of each sample point to
              the generated clusters
            - 'None' signifies that the provided algorithm is NOT Fuzzy C-Means
        fuzzifier: float in range (1,2]
            - fuzzifier used in the Fuzzy C-Means algorithm
            - only considered if 'fuzzyMembershipMat' is not None

    Returns:
        score: float
            - The resulting Xie-Beni index score corresponding to the given
              clustering algorithm.
    """
    # Check if all input parameters are specified correctly
    if fuzzyMembershipMat is None:
        if labels is None:
            raise Exception("Neither fuzzy membership matrix, nor fixed labels provided.")
        # Create fuzzyMembershipMat based on labels
        nClusters = np.max(labels) + 1  # Find number of clusters
        fuzzyMembershipMat = np.zeros((nClusters, data.shape[0]))
        for i in range(data.shape[0]):
            fuzzyMembershipMat[labels[i], i] = 1
    else:
        nClusters = fuzzyMembershipMat.shape[0]  # Find number of clusters
        if labels is not None:
            warn("Since both labels and fuzzy membership matrix are provided, the latter is used.")
    if fuzzifier <= 1 or fuzzifier > 2:
        raise Exception("Value of fuzzifier must lie in range: (1,2]")

    euclidean = distance_metrics()['euclidean']

    # Compute fuzzy centroids (n_clusters, n_features)
    fuzzyCentroids = np.zeros((nClusters, data.shape[1]))
    for i in range(nClusters):
        fuzzyCentroids[i] = np.matmul(
            np.power(fuzzyMembershipMat[i], fuzzifier), data) / np.sum(
                np.power(fuzzyMembershipMat[i], fuzzifier))

    # Compute separation: squared minimum distance between two centroids
    interClusterDist = euclidean(fuzzyCentroids)
    np.fill_diagonal(interClusterDist, np.inf)
    separation = np.power(np.min(interClusterDist), 2)

    # Compute total variance
    sigma = 0
    for i in range(nClusters):
        sigma += np.sum(
            np.multiply(
                np.power(fuzzyMembershipMat[i], fuzzifier),
                np.power(euclidean(np.expand_dims(fuzzyCentroids[i], axis=0),
                                   data), 2)))

    # Compute compactness
    compactness = sigma / data.shape[0]

    # Compute score
    score = compactness / separation
    return score
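# Usage sketch with hard labels from KMeans; the one-hot memberships make
# the fuzzifier exponent inert, so this reduces to the crisp Xie-Beni index:
import numpy as np
from warnings import warn
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import distance_metrics

rng = np.random.default_rng(1)
data = np.vstack([rng.normal(0, 0.3, (30, 2)), rng.normal(4, 0.3, (30, 2))])
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(data)

print(xie_beni_score(data, labels=labels))  # small => compact, well separated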
def intercluster_dist(data, labelsMat, mode='min', dist_metric='euclidean'):
    """
    Computes the intercluster distance between each pair of clusters from
    the given dataset and clustering labels.

    Params:
        data: ndarray of shape (n_samples, n_features)
            - original data elements which were clustered
        labelsMat: ndarray of shape (n_samples, n_clusteringAlgorithms)
            - labels assigned by each clustering algorithm, stored in columns
            - assigned labels are in the range [0, n_clusters]
        mode: Either of the three defined below
            - 'min' :: Minimum distance between any pair of points (1 from each cluster)
            - 'max' :: Maximum distance between any pair of points (1 from each cluster)
            - 'centroid' :: Distance between centroids of the 2 clusters
        dist_metric: string, can be one of the following:
            - 'manhattan' or 'l1'
            - 'euclidean' or 'l2'
            - 'cosine'
            - 'haversine'

    Returns:
        interClusterDist: list[ndarray (n_clusters, n_clusters)] of length
        n_clusteringAlgorithms
            - every element in the list is a numpy array containing the
              intercluster distance matrix for the clusters defined by one
              clustering algorithm
            - list elements arranged according to labelsMat
    """
    # Change labelsMat shape if only one clustering algorithm is used
    if len(labelsMat.shape) == 1:
        labelsMat = np.expand_dims(labelsMat, 1)
    if mode not in ('min', 'max', 'centroid'):
        raise Exception("Unsupported MODE to calculate intercluster distance")
    dist = distance_metrics()[dist_metric]
    interClusterDist = []
    if mode == 'centroid':
        centersMat = calc_cluster_centroids(data, labelsMat)
    for j in range(labelsMat.shape[1]):  # Iterate over n_clusteringAlgorithms
        # Find the number of clusters corresponding to each algorithm
        nClusters = np.max(labelsMat[:, j]) + 1
        clusterDistMat = np.zeros((nClusters, nClusters))
        if mode == 'min' or mode == 'max':
            clusters = dict()
            for i in range(data.shape[0]):  # Iterate over n_samples
                clusters.setdefault(labelsMat[i, j], []).append(data[i, :])
        for m in range(nClusters):
            for n in range(m + 1, nClusters):
                if mode == 'min':
                    clusterDistMat[m][n] = np.min(dist(clusters[m], clusters[n]))
                elif mode == 'max':
                    clusterDistMat[m][n] = np.max(dist(clusters[m], clusters[n]))
                elif mode == 'centroid':
                    # Pairwise distances of the two centroids form a 2x2
                    # matrix; the max is the off-diagonal centroid distance.
                    clusterDistMat[m][n] = np.max(
                        dist((centersMat[j][m], centersMat[j][n])))
                clusterDistMat[n][m] = clusterDistMat[m][n]
        interClusterDist.append(clusterDistMat)
    return interClusterDist
def is_distance(mode):
    return mode in distance_metrics()
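# Quick check sketch, pairing the distance test with the analogous kernel
# registry from the same module:
from sklearn.metrics.pairwise import distance_metrics, kernel_metrics

print(is_distance('cosine'))      # True: cosine distance is registered
print(is_distance('rbf'))         # False: 'rbf' lives in kernel_metrics()
print('rbf' in kernel_metrics())  # True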
#  'laplacian': sklearn.metrics.pairwise.laplacian_kernel,
#  'sigmoid': sklearn.metrics.pairwise.sigmoid_kernel,
#  'cosine': sklearn.metrics.pairwise.cosine_similarity}
# (Last Updated: sklearn.__version__ == 0.19.1)
_METRICS_MISC_PAIRWISE.update(sk_pairwise.kernel_metrics())

# Update with dict of distance names and functions.
# >>> distance_metrics()
# {'cityblock': sklearn.metrics.pairwise.manhattan_distances,  # \/
#  'cosine': sklearn.metrics.pairwise.cosine_distances,
#  'euclidean': sklearn.metrics.pairwise.euclidean_distances,  # \/
#  'l2': sklearn.metrics.pairwise.euclidean_distances,         # /\
#  'l1': sklearn.metrics.pairwise.manhattan_distances,         # \/
#  'manhattan': sklearn.metrics.pairwise.manhattan_distances,  # /\
#  'precomputed': None}
# (Last Updated: sklearn.__version__ == 0.19.1)
_METRICS_MISC_PAIRWISE.update(sk_pairwise.distance_metrics())

# Update with paired distance names (prepend "paired_") and functions.
# >>> {'paired_' + k: v for k, v in
# ...  iteritems(sk_pairwise.PAIRED_DISTANCES.copy())}
# {'paired_cosine': sklearn.metrics.pairwise.paired_cosine_distances,
#  'paired_euclidean': sklearn.metrics.pairwise.paired_euclidean_distances,
#  'paired_l2': sklearn.metrics.pairwise.paired_euclidean_distances,
#  'paired_l1': sklearn.metrics.pairwise.paired_manhattan_distances,
#  'paired_manhattan': sklearn.metrics.pairwise.paired_manhattan_distances,
#  'paired_cityblock': sklearn.metrics.pairwise.paired_manhattan_distances}
# (Last Updated: sklearn.__version__ == 0.19.1)
_METRICS_MISC_PAIRWISE.update({
    'paired_' + k: v
    for k, v in iteritems(sk_pairwise.PAIRED_DISTANCES.copy())
})
# x is defined earlier in the original script; only its later uses appear here.
df1 = ratings.iloc[1, :150]
x1 = ratings.iloc[100:175, 2]
df2 = ratings.iloc[2, :75]

cs111 = laplacian_kernel(x.values.reshape(1, -1), [df1])
cs112 = laplacian_kernel(x1.values.reshape(1, -1), [df2])
cs111[0][0] = 1
print(cs111)             # [[1.]]
print(cs112)             # [[0.10699213]]
print(np.argmax(cs111))  # 0
print(np.argmax(cs112))  # 0

r = [0, 0]
cs_55 = [cs111[0][0], cs112[0][0]]
print(cs_55)             # [1.0, 0.10699212985311443]
np.argmax(cs_55)         # 0

from sklearn.metrics.pairwise import distance_metrics

# Note: distance_metrics() takes no arguments and returns the dict mapping
# metric names to functions; it does not compute distances itself.
cs113 = distance_metrics()
# cs114 = distance_metrics(x1.values.reshape(1,-1), [df2])
print(np.argmax(cs113))  # 0 -- meaningless: cs113 is a dict of functions
print(cs113)

from sklearn.metrics.pairwise import _parallel_pairwise

# _parallel_pairwise is a private sklearn helper; pairwise_distances is the
# public equivalent.
cs114 = _parallel_pairwise(x.values.reshape(1, -1), [df1],
                           n_jobs=1, func=euclidean_distances)
cs115 = _parallel_pairwise(x1.values.reshape(1, -1), [df2],
                           n_jobs=2, func=euclidean_distances)
cs114[0][0] = 1
print(cs114)             # [[1.]]
print(cs115)             # [[28.53037718]]
print(np.argmax(cs114))  # 0
print(np.argmax(cs115))  # 0
def mkNN(X, k, measure='euclidean'):
    """
    Construct mutual kNN for a large-scale dataset.

    If j is one of i's closest neighbors and i is also one of j's closest
    members, the edge will appear once with (i, j) where i < j.

    Parameters
    ----------
    X : [n_samples, n_dim] array
    k : int
        number of neighbors for each sample in X
    """
    from scipy.spatial import distance
    from scipy.sparse import csr_matrix, triu, find
    from scipy.sparse.csgraph import minimum_spanning_tree

    samples = X.shape[0]
    batchsize = 10000
    b = np.arange(k + 1)
    b = tuple(b[1:].ravel())
    z = np.zeros((samples, k))
    weigh = np.zeros_like(z)
    # This loop speeds up the computation by operating in batches.
    # It could be parallelized to further utilize CPU/GPU resources.
    for x in np.arange(0, samples, batchsize):
        start = x
        end = min(x + batchsize, samples)
        parallelized_metrics = list(distance_metrics().keys())
        if measure in parallelized_metrics:
            w = pairwise_distances(X=X[start:end], Y=X, metric=measure,
                                   n_jobs=-1)
        else:
            w = distance.cdist(X[start:end], X, measure)
        y = np.argpartition(w, b, axis=1)
        z[start:end, :] = y[:, 1:k + 1]
        weigh[start:end, :] = np.reshape(
            w[tuple(np.repeat(np.arange(end - start), k)),
              tuple(y[:, 1:k + 1].ravel())], (end - start, k))
        del w
    ind = np.repeat(np.arange(samples), k)
    P = csr_matrix((np.ones((samples * k)), (ind.ravel(), z.ravel())),
                   shape=(samples, samples))
    Q = csr_matrix((weigh.ravel(), (ind.ravel(), z.ravel())),
                   shape=(samples, samples))
    # Ensure connectivity via the MST, then keep only mutual kNN edges.
    Tcsr = minimum_spanning_tree(Q)
    P = P.minimum(P.transpose()) + Tcsr.maximum(Tcsr.transpose())
    P = triu(P, k=1)
    return np.asarray(find(P)).T
def distance(self, x, y=None, **kwargs):
    """
    This method returns the distance between two points x and y. If y is
    not provided then x is a pointSet and a distance matrix is returned
    @ In, x, numpy.ndarray, array containing data of x; if a 1D array is
      provided, it will be reshaped via x.reshape(1,-1)
    @ In, y, numpy.ndarray, array containing data of y; if a 1D array is
      provided, it will be reshaped via y.reshape(1,-1)
    @ Out, value, numpy.ndarray, distance between x and y (if y is
      provided) or a square distance matrix if y is None
    """
    if y is not None:
        if isinstance(x, np.ndarray) and isinstance(y, np.ndarray):
            if len(x.shape) == 1 and self.metricType not in scores.keys():
                # self.raiseAWarning(self, "1D array is provided. For consistency, this array is reshaped via x.reshape(1,-1)")
                x = x.reshape(1, -1)
            if len(y.shape) == 1 and self.metricType not in scores.keys():
                # self.raiseAWarning(self, "1D array is provided. For consistency, this array is reshaped via y.reshape(1,-1)")
                y = y.reshape(1, -1)
            dictTemp = utils.mergeDictionaries(kwargs, self.distParams)
            try:
                if self.metricType in pairwise.kernel_metrics().keys():
                    value = pairwise.pairwise_kernels(X=x, Y=y, metric=self.metricType, **dictTemp)
                elif self.metricType in pairwise.distance_metrics():
                    value = pairwise.pairwise_distances(X=x, Y=y, metric=self.metricType, **dictTemp)
                elif self.metricType in scores.keys():
                    value = np.zeros((1, 1))
                    value[:, :] = scores[self.metricType](x, y, **dictTemp)
            except TypeError as e:
                self.raiseAWarning('There are some unexpected keyword arguments found in Metric with type "', self.metricType, '"!')
                self.raiseAnError(TypeError, 'Input parameters error:\n', str(e), '\n')
            if value.shape == (1, 1):
                return value[0]
            else:
                return value
        else:
            self.raiseAnError(IOError, 'Metric SKL error: SKL metrics support only PointSets and not HistorySets')
    else:
        if self.metricType == 'mahalanobis':
            covMatrix = np.cov(x.T)
            kwargs['VI'] = np.linalg.inv(covMatrix)
        dictTemp = utils.mergeDictionaries(kwargs, self.distParams)
        try:
            if self.metricType in pairwise.kernel_metrics().keys():
                value = pairwise.pairwise_kernels(X=x, metric=self.metricType, **dictTemp)
            elif self.metricType in pairwise.distance_metrics().keys():
                value = pairwise.pairwise_distances(X=x, metric=self.metricType, **dictTemp)
        except TypeError as e:
            self.raiseAWarning('There are some unexpected keyword arguments found in Metric with type "', self.metricType, '"!')
            self.raiseAnError(TypeError, 'Input parameters error:\n', str(e), '\n')
        if value.shape == (1, 1):
            return value[0]
        else:
            return value
def silhouette_samples_slow(X, labels, metric='euclidean', **kwds):
    """Compute the Silhouette Coefficient for each sample.

    The Silhouette Coefficient is a measure of how well samples are
    clustered with samples that are similar to themselves. Clustering
    models with a high Silhouette Coefficient are said to be dense, where
    samples in the same cluster are similar to each other, and well
    separated, where samples in different clusters are not very similar
    to each other.

    The Silhouette Coefficient is calculated using the mean intra-cluster
    distance (a) and the mean nearest-cluster distance (b) for each
    sample. The Silhouette Coefficient for a sample is
    ``(b - a) / max(a, b)``.

    This function returns the Silhouette Coefficient for each sample.

    The best value is 1 and the worst value is -1. Values near 0 indicate
    overlapping clusters.

    Parameters
    ----------
    X : array [n_samples_a, n_features]
        Feature array.
    labels : array, shape = [n_samples]
        label values for each sample
    metric : string, or callable
        The metric to use when calculating distance between instances in
        a feature array. If metric is a string, it must be one of the
        options allowed by metrics.pairwise.pairwise_distances. If X is
        the distance array itself, use "precomputed" as the metric.
    `**kwds` : optional keyword parameters
        Any further parameters are passed directly to the distance
        function. If using a scipy.spatial.distance metric, the
        parameters are still metric dependent. See the scipy docs for
        usage examples.

    Returns
    -------
    silhouette : array, shape = [n_samples]
        Silhouette Coefficient for each sample.

    References
    ----------
    Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
    Interpretation and Validation of Cluster Analysis". Computational and
    Applied Mathematics 20: 53-65. doi:10.1016/0377-0427(87)90125-7.
    http://en.wikipedia.org/wiki/Silhouette_(clustering)
    """
    metric = distance_metrics()[metric]
    n = labels.shape[0]
    A = np.array([_intra_cluster_distance_slow(X, labels, metric, i)
                  for i in range(n)])
    B = np.array([_nearest_cluster_distance_slow(X, labels, metric, i)
                  for i in range(n)])
    sil_samples = (B - A) / np.maximum(A, B)
    # nan values are for clusters of size 1, and should be 0
    return np.nan_to_num(sil_samples)
def GetSklearnPairwiseMetrics():
    while True:
        sklearn_dict = distance_metrics()
        yield pd.Series(data=itemgetter(*SKLEARN_FUNC_NAMES)(sklearn_dict),
                        index=SKLEARN_FUNC_NAMES,
                        name='sklearn_funcs')
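# Usage sketch. SKLEARN_FUNC_NAMES is defined elsewhere in the module; the
# tuple below is a hypothetical stand-in containing keys present in
# distance_metrics():
import pandas as pd
from operator import itemgetter
from sklearn.metrics.pairwise import distance_metrics

SKLEARN_FUNC_NAMES = ('euclidean', 'manhattan', 'cosine')  # hypothetical

gen = GetSklearnPairwiseMetrics()
series = next(gen)
print(series.index.tolist())  # ['euclidean', 'manhattan', 'cosine']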
def display_word_similarity(
        pipe,  # nlu component_list
        default_texts: Tuple[str, str] = ("Donald Trump likes to party!",
                                          "Angela Merkel likes to party!"),
        threshold: float = 0.5,
        title: Optional[str] = "Embeddings Similarity Matrix & Visualizations ",
        sub_tile: Optional[str] = "Visualize `word-wise similarity matrix` and calculate `similarity scores` for `2 texts` and every `word embedding` loaded",
        write_raw_pandas: bool = False,
        display_embed_information: bool = True,
        similarity_matrix=True,
        show_algo_select: bool = True,
        dist_metrics: List[str] = ('cosine',),  # ('cosine') is just a string
        set_wide_layout_CSS: bool = True,
        generate_code_sample: bool = False,
        key: str = "NLU_streamlit",
        num_cols: int = 2,
        display_scalar_similarities: bool = False,
        display_similarity_summary: bool = False,
        model_select_position: str = 'side',  # main or side
        show_infos: bool = True,
        show_logo: bool = True,
):
    """We visualize the following cases:
    1. Similarity between 2 words -> sim(word_emb1, word_emb2)
    2. Similarity between 2 sentences -> let weTS denote the word embedding
       of token T in sentence S
       2.1 Raw token level with merged embeddings -> sim([we11,we21,weT1], [we12,we22,weT2])
       2.2 Autogenerated sentence embedding; basically does 2.1 in the Spark NLP backend
       2.3 Already using a sentence_embedder model -> sim(se1, se2)
    3. Similarity between token and sentence -> sim([we11,we21,weT1], se2)
    4. Mirrored 3
    """
    # https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.pairwise
    StreamlitVizTracker.footer_displayed = False
    try:
        import plotly.express as px
        from sklearn.metrics.pairwise import distance_metrics
    except ImportError:
        st.error(
            "You need the sklearn and plotly packages installed in your "
            "Python environment for similarity visualizations. "
            "Run <pip install scikit-learn plotly>")
    if set_wide_layout_CSS:
        _set_block_container_style()
    if title:
        st.header(title)
    if show_logo:
        StreamlitVizTracker.show_logo()
    if sub_tile:
        st.subheader(sub_tile)

    StreamlitVizTracker.loaded_word_embeding_pipes = []
    dist_metric_algos = distance_metrics()
    dist_algos = list(dist_metric_algos.keys())
    if 'haversine' in dist_algos:
        dist_algos.remove('haversine')  # not applicable in >2D
    if 'precomputed' in dist_algos:
        dist_algos.remove('precomputed')  # not a distance
    cols = st.columns(2)
    text1 = cols[0].text_input("Text or word1", default_texts[0],
                               key=key + 'field_1')
    text2 = cols[1].text_input("Text or word2", default_texts[1],
                               key=key + 'field_2') \
        if len(default_texts) > 1 \
        else cols[1].text_input("Text or word2", 'Please enter second string',
                                key=key)
    # exp = st.sidebar.beta_expander("Select additional Embedding Models and distance metric to compare ")
    e_coms = StreamlitUtilsOS.find_all_embed_components(pipe)
    embed_algos_to_load = []
    embed_pipes = [pipe]
    dist_algo_selection = dist_metrics
    if show_algo_select:
        # emb_components_usable = Discoverer.get_components('embed')
        emb_components_usable = [
            e for e in Discoverer.get_components('embed', True,
                                                 include_aliases=True)
            if 'chunk' not in e and 'sentence' not in e
        ]
        loaded_embed_nlu_refs = []
        loaded_storage_refs = []
        loaded_embed_nlu_refs = list(set(loaded_embed_nlu_refs))
        for c in e_coms:
            r = c.nlu_ref
            if 'en.' not in r and 'embed.' not in r and 'ner' not in r:
                loaded_embed_nlu_refs.append('en.embed.' + r)
            elif 'en.' in r and 'embed.' not in r and 'ner' not in r:
                r = r.split('en.')[0]
                loaded_embed_nlu_refs.append('en.embed.' + r)
            else:
                loaded_embed_nlu_refs.append(
                    StorageRefUtils.extract_storage_ref(c))
            loaded_storage_refs.append(StorageRefUtils.extract_storage_ref(c))
        for p in StreamlitVizTracker.loaded_word_embeding_pipes:
            if p != pipe:
                loaded_embed_nlu_refs.append(p.nlu_ref)
        for l in loaded_embed_nlu_refs:
            if l not in emb_components_usable:
                emb_components_usable.append(l)
        # embed_algo_selection = exp.multiselect("Click to pick additional Embedding Algorithm", options=emb_components_usable, default=loaded_embed_nlu_refs, key=key)
        # dist_algo_selection = exp.multiselect("Click to pick additional Distance Metric", options=dist_algos, default=dist_metrics, key=key)
        emb_components_usable.sort()
        loaded_embed_nlu_refs.sort()
        dist_algos.sort()
        if model_select_position == 'side':
            embed_algo_selection = st.sidebar.multiselect(
                "Pick additional Word Embeddings for the Similarity Matrix",
                options=emb_components_usable,
                default=loaded_embed_nlu_refs, key=key)
            dist_algo_selection = st.sidebar.multiselect(
                "Pick additional Similarity Metrics ",
                options=dist_algos, default=dist_metrics, key=key)
        else:
            exp = st.expander(
                "Pick additional Word Embeddings and Similarity Metrics")
            embed_algo_selection = exp.multiselect(
                "Pick additional Word Embeddings for the Similarity Matrix",
                options=emb_components_usable,
                default=loaded_embed_nlu_refs, key=key)
            dist_algo_selection = exp.multiselect(
                "Pick additional Similarity Metrics ",
                options=dist_algos, default=dist_metrics, key=key)
        embed_algos_to_load = list(
            set(embed_algo_selection) - set(loaded_embed_nlu_refs))
    for embedder in embed_algos_to_load:
        embed_pipes.append(nlu.load(embedder))
    if generate_code_sample:
        st.code(get_code_for_viz(
            'SIMILARITY',
            [StreamlitUtilsOS.extract_name(p) for p in embed_pipes],
            default_texts))
    StreamlitVizTracker.loaded_word_embeding_pipes += embed_pipes
    similarity_metrics = {}
    embed_vector_info = {}
    cols_full = True
    col_index = 0
    # for p in embed_pipes:
    for p in StreamlitVizTracker.loaded_word_embeding_pipes:
        data1 = p.predict(text1, output_level='token',
                          get_embeddings=True).dropna()
        data2 = p.predict(text2, output_level='token',
                          get_embeddings=True).dropna()
        e_coms = StreamlitUtilsOS.find_all_embed_components(p)
        modelhub_links = [ModelHubUtils.get_url_by_nlu_refrence(c.nlu_ref)
                          for c in e_coms]
        e_cols = StreamlitUtilsOS.get_embed_cols(p)
        for num_emb, e_col in enumerate(e_cols):
            if col_index == num_cols - 1:
                cols_full = True
            if cols_full:
                cols = st.columns(num_cols)
                col_index = 0
                cols_full = False
            else:
                col_index += 1
            tok1 = data1['token']
            tok2 = data2['token']
            emb1 = data1[e_col]
            emb2 = data2[e_col]

            def normalize_matrix(m):
                return np.nan_to_num(
                    m / np.linalg.norm(m, axis=1, keepdims=True))

            embed_mat1 = normalize_matrix(np.array([x for x in emb1]))
            embed_mat2 = normalize_matrix(np.array([x for x in emb2]))
            # e_name = e_col.split('word_embedding_')[-1]
            e_name = e_coms[num_emb].nlu_ref
            e_name = e_name.split('embed.')[-1] if 'en.' in e_name else e_name
            if 'ner' in e_name:
                e_name = loaded_storage_refs[num_emb]
            embed_vector_info[e_name] = {
                "Vector Dimension ": embed_mat1.shape[1],
                # likely intended: vectors from both texts
                "Num Vectors": embed_mat1.shape[0] + embed_mat2.shape[0],
                "NLU_reference": e_coms[num_emb].nlu_ref,
                "Spark_NLP_reference":
                    ModelHubUtils.NLU_ref_to_NLP_ref(e_coms[num_emb].nlu_ref),
                "Storage Reference": loaded_storage_refs[num_emb],
                'Modelhub info': modelhub_links[num_emb],
            }
            for dist_algo in dist_algo_selection:
                # scalar_similarities[e_col][dist_algo] = {}
                # Similarity = 1 - distance.
                sim_score = ((dist_metric_algos[dist_algo](embed_mat1,
                                                           embed_mat2) - 1) * -1)
                sim_score = pd.DataFrame(sim_score)
                sim_score.index = tok1.values
                sim_score.columns = tok2.values
                sim_score.columns = StreamlitVizTracker.pad_duplicate_tokens(
                    list(sim_score.columns))
                sim_score.index = StreamlitVizTracker.pad_duplicate_tokens(
                    list(sim_score.index))
                if write_raw_pandas:
                    st.write(sim_score, key=key)
                if sim_score.shape == (1, 1):
                    sim_score = sim_score.iloc[0][0]
                    sim_score = round(sim_score, 2)
                    if sim_score > threshold:
                        st.success(sim_score)
                        st.success(f'Scalar Similarity={sim_score} for distance metric={dist_algo}')
                        st.error('No similarity matrix for only 2 tokens. Try entering at least 1 sentence in a field')
                    else:
                        st.error(f'Scalar Similarity={sim_score} for distance metric={dist_algo}')
                else:
                    plotly_available = True
                    # For token embeddings, sum the rows and normalize by the
                    # row count, then sum the columns and normalize by the
                    # column count to reduce the matrix to a scalar.
                    scalar_sim_score = np.sum(
                        (np.sum(sim_score, axis=0)
                         / sim_score.shape[0])) / sim_score.shape[1]
                    scalar_sim_score = round(scalar_sim_score, 2)
                    if display_scalar_similarities:
                        if scalar_sim_score > threshold:
                            st.success(f'Scalar Similarity :{scalar_sim_score} for distance metric={dist_algo}')
                        else:
                            st.error(f'Scalar Similarity :{scalar_sim_score} for embedder={e_col} distance metric={dist_algo}')
                    if similarity_matrix:
                        if plotly_available:
                            fig = px.imshow(sim_score,
                                            labels=dict(color="similarity"))
                            similarity_metrics[f'{e_name}_{dist_algo}_similarity'] = {
                                'scalar_similarity': scalar_sim_score,
                                'dist_metric': dist_algo,
                                'embedding_model': e_name,
                                'modelhub_info': modelhub_links[num_emb],
                            }
                            subh = f"""Embedding-Model=`{e_name}`, Similarity-Score=`{scalar_sim_score}`, distance metric=`{dist_algo}`"""
                            cols[col_index].markdown(subh)
                            cols[col_index].write(fig, key=key)
                        else:
                            pass  # todo fallback plots
    if display_similarity_summary:
        exp = st.expander("Similarity summary")
        exp.write(similarity_metrics)
    if display_embed_information:
        exp = st.expander("Embedding vector information")
        exp.write(embed_vector_info)
    if show_infos:
        # VizUtilsStreamlitOS.display_infos()
        StreamlitVizTracker.display_model_info(pipe.nlu_ref, pipes=[pipe])
        StreamlitVizTracker.display_footer()
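# Minimal invocation sketch (assumes a running Streamlit script and the nlu
# package; the model reference is illustrative, not prescribed by this code):
import nlu

pipe = nlu.load('bert')  # hypothetical word-embedding pipeline
display_word_similarity(pipe,
                        default_texts=("dog", "cat"),
                        dist_metrics=('cosine',))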