def run(self, data_y, q_max=10, show=False, tol=1e-4):
    """ Method to reduce dimension. Every iteration runs over all points.
    The new data is stored in the attribute 'data_x'.

    Parameters
    ----------
    data_y : numpy.array
        Array with the original data.
    q_max : int (default = 10)
        Number of iterations. Each iteration runs over all points in 'data_y'.
    show : boolean (default = False)
        If True, shows the stress curve over the iterations.
    tol : float (default = 1e-4)
        Tolerance for the stopping criterion.

    Returns
    -------
    data_x : numpy.array
        New data representation.
    """
    self.data_y = data_y
    n = len(data_y)
    triu = np.triu_indices(n, 1)
    # square distance matrix so that rows can be indexed per point
    # (requires scipy.spatial.distance.pdist, cdist and squareform)
    dist_y = squareform(pdist(data_y))
    data_x = PCA(self.p).fit_transform(data_y)
    stress = np.zeros(q_max)
    print("Progress: 0.0%", end='\r')
    for q in range(q_max):
        alpha = max(0.001, self.alpha / (1 + q))
        lmbda = max(0.1, self.lmbd / (1 + q))
        for i in range(n):
            dist_x = cdist(data_x[i].reshape(1, -1), data_x)
            dy = np.delete(dist_y[i], i, 0)
            dx = np.delete(dist_x, i, 1)
            delta_x = (alpha * (lmbda > dx) * (dy - dx) / dx).reshape(
                (-1, 1)) * (data_x[i] - np.delete(data_x, i, 0))
            delta_x = np.insert(delta_x, i, 0, axis=0)
            data_x -= delta_x
        dist_x = squareform(pdist(data_x))
        stress[q] = self._stress(dist_y[triu], dist_x[triu], lmbda)
        if stress[q] < tol:
            print("Progress: 100.00%")
            print(f"Tol achieved in iteration {q}")
            break
        print(f"Progress: {round((q+1)*100/q_max,2)}%  ", end='\r')
    if show:
        plt.plot(np.arange(q_max), stress, marker='.', c='black')
        plt.xlabel("Iteration")
        plt.ylabel("Stress")
        plt.show()
    print()
    self.data_x = data_x
    return data_x
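# Minimal usage sketch for the method above. The class name `SammonMapping`
# and its constructor arguments are illustrative assumptions, not taken from
# the original source. The snippet also shows the pdist/squareform/triu
# relationship the method relies on.
import numpy as np
from scipy.spatial.distance import pdist, squareform

rng = np.random.default_rng(0)
Y = rng.normal(size=(100, 5))          # original high-dimensional data
D_condensed = pdist(Y)                 # condensed vector of length n*(n-1)/2
D_square = squareform(D_condensed)     # full n x n symmetric distance matrix
triu = np.triu_indices(len(Y), 1)
assert np.allclose(D_square[triu], D_condensed)  # upper triangle == condensed form

# reducer = SammonMapping(p=2, alpha=0.5, lmbd=1.0)   # hypothetical constructor
# X = reducer.run(Y, q_max=20, show=True)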
def estimate_med_dist(features, num_slices=5000, percentile=50):
    pdists = pdist(features.values[:num_slices, :],
                   metric='euclidean').reshape(-1, 1)
    # drop (near-)zero distances to handle sparse data
    med_dist = np.percentile(pdists[pdists > np.finfo(float).eps * 10],
                             percentile)
    med_dist = np.max((0.05, med_dist))
    return med_dist
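# Illustrative call: estimate a kernel bandwidth from a random feature table
# using estimate_med_dist defined above (pdist there is scipy's
# scipy.spatial.distance.pdist).
import numpy as np
import pandas as pd

features = pd.DataFrame(np.random.rand(1000, 8))
sigma = estimate_med_dist(features, num_slices=500, percentile=50)
print(sigma)   # median pairwise euclidean distance, floored at 0.05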
def get_IBM_from_pairwise_dist(teX_mag, trX_mag, IBM, K, metric, P=None):
    teX_feed = teX_mag
    trX_feed = trX_mag
    if P is not None and metric == 'hamming':
        teX_feed, _ = WTA(teX_mag, P)
        trX_feed, _ = WTA(trX_mag, P)
    # cross-distances between test and training frames (frames are columns),
    # which requires cdist rather than pdist
    D = cdist(teX_feed.T, trX_feed.T, metric=metric)
    F, T = teX_mag.shape
    return get_IBM_med_mean(D, IBM, K, T)
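# Illustration of the cross-distance step only; WTA and get_IBM_med_mean are
# project-specific helpers not reproduced here. Frames are columns, so both
# magnitude spectrograms are transposed before computing test-vs-train
# distances. The shapes are arbitrary example values.
import numpy as np
from scipy.spatial.distance import cdist

F, T_te, T_tr = 513, 40, 200
teX_mag = np.abs(np.random.randn(F, T_te))
trX_mag = np.abs(np.random.randn(F, T_tr))
D = cdist(teX_mag.T, trX_mag.T, metric='euclidean')  # shape (T_te, T_tr)
nearest = np.argsort(D, axis=1)[:, :10]              # 10 nearest training frames per test frame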
def get_sim_matrix(trX_mag, metric, errmetric, P=None):
    trX_feed = trX_mag
    if metric == 'hamming' and P is not None:
        trX_Pidx, _ = WTA(trX_mag, P)
        trX_feed = trX_Pidx
    sim = pdist(trX_feed.T, metric=metric)
    if errmetric == 'xent':
        return 1 - sim
    return sim
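# Illustrative call: pairwise distances between training frames with a
# non-hamming metric, so the WTA branch above is skipped; the result is the
# condensed vector returned by scipy's pdist. Input shape is an example value.
import numpy as np

trX_mag = np.abs(np.random.randn(513, 200))
d = get_sim_matrix(trX_mag, metric='cosine', errmetric='mse')
print(d.shape)   # (200 * 199 / 2,) condensed pairwise distances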
def cluster_2point0(df):
    '''
    Compute the median of each cluster and merge two clusters whenever the
    distance between their medians is less than thr km.

    PARAMETERS:
        df -> dataframe holding all the points, with columns X, Y, Z, dbz
        eps, min_pts are the DBSCAN parameters for the X, Y, Z values
        eps2, min_pts2 are the parameters for the dbz level of clustering
        thr is the threshold distance below which cloud clusters are merged
    '''
    df = plot_dbscan(df, eps, min_pts, eps2=eps2, min_pts2=min_pts2)
    # df now has 2 additional columns: 'label level 0' and 'label level 1'

    # ignore the noise points of label level 1 and compute each cluster's median
    df_median = df[~df['label level 1'].str.contains('-1_')].groupby(
        'label level 1').median()[['X', 'Y', 'Z']]
    # square distance matrix between cluster medians, diagonal excluded
    pdistance = squareform(pdist(df_median))
    np.fill_diagonal(pdistance, np.inf)

    while np.amin(pdistance) <= thr:
        # the closest pair of medians; the index values of df_median are the
        # 'label level 1' labels of the two clusters
        idx = np.argwhere(pdistance == np.amin(pdistance))[0]
        label_keep, label_merge = df_median.index[idx[0]], df_median.index[idx[1]]
        # replace the second cluster's name with the first cluster's name
        df['label level 1'].replace({label_merge: label_keep}, inplace=True)
        # recompute the medians and their distances after the merge
        df_median = df.groupby('label level 1').median()[['X', 'Y', 'Z']]
        pdistance = squareform(pdist(df_median))
        np.fill_diagonal(pdistance, np.inf)

    return df  # COLUMNS: X, Y, Z, dbz, label level 0, label level 1
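# Toy illustration of the merge criterion used above: compute median-to-median
# distances, ignore the diagonal, and locate the closest pair to merge while it
# is under a threshold. plot_dbscan and the eps/min_pts/thr globals come from
# the surrounding module and are not reproduced here.
import numpy as np
from scipy.spatial.distance import pdist, squareform

medians = np.array([[0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [50.0, 0.0, 0.0]])
D = squareform(pdist(medians))
np.fill_diagonal(D, np.inf)
i, j = np.unravel_index(np.argmin(D), D.shape)
print(i, j, D[i, j])   # closest pair of cluster medians and their distance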
def getdist(f):
    fout = f.split("/")[-1].replace(".txt", ".meg")
    mat = pd.read_table(f, index_col=0)
    # pdist here is assumed to be an alias for sklearn.metrics.pairwise_distances:
    # it is called with n_jobs and its result is indexed as a square matrix
    dm = pdist(mat.T, metric='euclidean', n_jobs=10)
    dm = pd.DataFrame(dm, index=mat.columns, columns=mat.columns)
    # keep only the strictly lower triangle, as expected by the MEGA format
    dm = dm.mask(np.triu(np.ones(dm.shape, dtype=bool)))
    dm.to_csv(fout, sep="\t", index=False, header=False)
    s = open(fout).read()
    cs = ["#%s" % c for c in mat.columns]
    cs = "\n".join(cs)
    ss = ("#mega\n!TITLE Genetic distance data;\n!Format DataType=distance;\n"
          "!Description\n CYQ try;\n")
    ss = ss + cs + s
    with open(fout, "w") as f2:
        f2.write(ss)
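# Standalone sketch of the lower-triangle masking used above, assuming the
# pdist alias is sklearn.metrics.pairwise_distances; the column names and
# matrix size are example values.
import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances

mat = pd.DataFrame(np.random.rand(100, 6), columns=list("ABCDEF"))
dm = pd.DataFrame(pairwise_distances(mat.T.values, metric='euclidean'),
                  index=mat.columns, columns=mat.columns)
dm = dm.mask(np.triu(np.ones(dm.shape, dtype=bool)))   # strictly lower triangle only
print(dm)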
def calc_loss(self, gap, feature):
    loss = torch.zeros(1)
    if self.flag_calc_loss:
        # calculate median distance between all pairs of points
        med_dist = np.median(
            pdist(gap.detach().cpu().numpy(), metric='euclidean').reshape(-1, 1))
        # current kernel bandwidth: moving average of the previous sigma and
        # the current median distance, floored at 0.005
        sigma_gap = np.maximum(
            self.decay_factor * self.sigma_gap +
            (1 - self.decay_factor) * med_dist, 0.005)
        # penalize only the latent representation, not the external features
        gap = gap[:, :self.activation_size]
        hsic_features = HSIC(gap, feature,
                             kernelX='Gaussian', kernelY='Gaussian',
                             sigmaX=sigma_gap, sigmaY=self.external_feature_std,
                             device=self.device)
        loss = self.lambda_hsic * hsic_features
    return loss
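# Standalone sketch of the median-distance ("median heuristic") bandwidth
# update performed above, without the HSIC call or the class state; the batch
# shape and the decay/previous-sigma values are illustrative.
import numpy as np
from scipy.spatial.distance import pdist

gap = np.random.randn(64, 128)                      # batch of pooled activations
med_dist = np.median(pdist(gap, metric='euclidean'))
decay_factor, prev_sigma = 0.9, 1.0                 # illustrative values
sigma_gap = max(decay_factor * prev_sigma + (1 - decay_factor) * med_dist, 0.005)
print(sigma_gap)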
def gen_inits_WH(self, init='random', seed=1, H_ortho=True):
    '''
    Initialize the factors W, H for nonnegative matrix factorization.

    There are several options:
        1. random   ---- generate W, H randomly
        2. kmeans   ---- generate H from the cluster assignments obtained by
                         KMeans, then W = data_mat * H (since H is orthogonal)
        3. nmf      ---- run sklearn's NMF on the data matrix first and use the
                         resulting factors as the initialization
        4. kmeans++ ---- use the kmeans++ heuristic to get a cluster assignment,
                         which is used for H, and W = data_mat * H

    Args:
        init (string): name of the method used to generate the initializations
        seed (int): seed for the random generator
        H_ortho (bool): whether H should be (approximately) orthogonal

    Returns:
        numpy matrices W and H
    '''
    ortho = 'ortho' if H_ortho else ''
    data_name = self.data_kind + str(self.data_num)
    initW_path = os.path.join(self.root_dir, 'inits', data_name,
                              'W' + str(seed) + '.csv')
    initH_path = os.path.join(self.root_dir, 'inits', data_name,
                              'H' + '_' + ortho + str(seed) + '.csv')
    if os.path.exists(initW_path) and os.path.exists(initH_path):
        if seed < 100:
            W_init = self.read_data_from_csvfile(initW_path)
            H_init = self.read_data_from_csvfile(initH_path)
    else:
        # size of the data matrix to be decomposed
        (m, n) = self.data_mat.shape
        np.random.seed(seed)
        if init == 'random':
            abs_mat = np.absolute(self.data_mat)
            #print(np.any(abs_mat < 0))
            avg = np.sqrt(abs_mat.mean() / self.num_of_cls)
            print('mean: ' + str(abs_mat.mean()))
            print('rank: ' + str(self.num_of_cls))
            print('avg: ' + str(avg))
            W_init = np.asmatrix(avg * np.random.random((m, self.num_of_cls)))
            H_init = np.asmatrix(avg * np.random.random((n, self.num_of_cls)))
        elif init == 'kmeans':
            km = sklearn_KMeans(n_clusters=self.num_of_cls).fit(
                self.data_mat.transpose())
            clusters = km.predict(self.data_mat.transpose())
            H_init = np.asmatrix(np.zeros((n, self.num_of_cls)))
            for i in range(len(clusters)):
                H_init[i, clusters[i]] = 1
            H_init = H_init * np.diag(
                np.diag(H_init.transpose() * H_init)**(-0.5))
            W_init = self.data_mat * H_init
        elif init == 'nmf':
            model = sklearn_NMF(n_components=self.num_of_cls,
                                init='nndsvd', random_state=0)
            W = model.fit_transform(self.data_mat.transpose())
            H = model.components_
            H_init = np.asmatrix(W)
            W_init = np.asmatrix(H).transpose()
        elif init == 'kmeans++':
            print('using k++ initialization....')
            data_mat = self.data_mat.transpose()
            initial_centroids = np.ones((self.num_of_cls, m)) * (-1)
            ind_list = []
            idx = np.random.choice(n)
            ind_list.append(idx)
            initial_centroids[0, :] = data_mat[idx, :]
            while len(ind_list) < self.rank:
                cent = initial_centroids[0:len(ind_list), :]
                # squared distance from each point to its nearest chosen centre
                D2 = np.array([
                    min([LA.norm(x - c)**2 for c in cent]) for x in data_mat
                ])
                probs = D2 / D2.sum()
                cumprobs = probs.cumsum()
                r = np.random.random()
                idx = np.where(cumprobs >= r)[0][0]
                ind_list.append(idx)
                initial_centroids[len(ind_list) - 1, :] = data_mat[idx, :]
            print(ind_list)
            W_init = np.asmatrix(initial_centroids).transpose()
            # one row per data point, one column per centroid
            distances = np.ones((n, self.num_of_cls)) * (-1)
            for centroid_idx in range(self.num_of_cls):
                for data_idx in range(n):
                    distances[data_idx, centroid_idx] = LA.norm(
                        data_mat[data_idx, :] -
                        initial_centroids[centroid_idx, :])
            cluster_assignments = np.argmin(distances, axis=1)
            temp_H = np.asmatrix(np.zeros((n, self.num_of_cls)))
            for j in range(n):
                temp_H[j, cluster_assignments[j]] = 1
            #temp_H = np.diag(np.diag(temp_H * temp_H.transpose()) ** (-0.5)) * temp_H
            H_init = np.asmatrix(temp_H)
        else:
            raise ValueError(
                'Error: invalid parameter - init (random, kmeans, nmf, kmeans++)!!')
        H_init = np.asmatrix(H_init.transpose())
        if H_ortho:
            (ha, hb) = H_init.shape
            # Frobenius distance of H * H^T from the identity
            ortho = LA.norm(
                H_init * H_init.transpose() - np.asmatrix(np.eye(ha)), 'fro')
            print(H_init * H_init.transpose())
            if ortho > 1e-6:
                # project H onto a cluster-indicator matrix and rescale the
                # rows so that H * H^T = I
                H = np.zeros((ha, hb))
                ind = np.asarray(np.argmax(H_init, 0))[0, :]
                for j in range(hb):
                    H[ind[j], j] = 1
                H = np.asmatrix(H)
                temp = np.diag(H * H.transpose())
                if np.any(temp == 0):
                    print(temp)
                    raise ValueError("some rows of H are zeros!!!")
                H = np.asmatrix(np.diag(temp**(-0.5))) * H
                H_init = H
        if seed >= 100:
            np.random.seed(seed)
            (m, n) = self.data_mat.shape
            # find centers from the smallest clusters
            cls_idx, cls_sizes = np.unique(self.true_labels, return_counts=True)
            s_id = cls_idx[np.argmax(cls_sizes)]
            id_list = np.where(self.true_labels == s_id)[0]
            print(s_id)
            print(id_list)
            dis_mat = pdist(self.data_mat.transpose())
            print(np.argmin(dis_mat))
            print(np.unravel_index(dis_mat.argmin(), dis_mat.shape))
            print(np.where(dis_mat == np.min(dis_mat[np.nonzero(dis_mat)])))
            print('select initial points -----')
            select_idx = [997, 998, 999]
            print(select_idx)
            #select_idx = np.random.choice(id_list, self.num_of_cls, replace=False)
            W_init = self.data_mat[:, select_idx]
            W_init = np.asmatrix(W_init)
            print(W_init.shape)
        # save the generated initializations
        f_manager = FileManager(self.root_dir)
        f_manager.add_file(initW_path)
        np.savetxt(initW_path, np.asmatrix(W_init), delimiter=',')
        f_manager.add_file(initH_path)
        np.savetxt(initH_path, np.asmatrix(H_init), delimiter=',')
    return np.asmatrix(W_init), np.asmatrix(H_init)
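# Standalone sketch of the D^2 (kmeans++) seeding loop used in the 'kmeans++'
# branch above: each new centre is drawn with probability proportional to its
# squared distance from the nearest centre already chosen. The data size and
# number of centres are example values.
import numpy as np
from numpy import linalg as LA

rng = np.random.default_rng(1)
data_mat = rng.random((500, 10))        # samples in rows
k = 4
idx = [int(rng.integers(len(data_mat)))]
while len(idx) < k:
    cent = data_mat[idx]
    D2 = np.array([min(LA.norm(x - c) ** 2 for c in cent) for x in data_mat])
    cumprobs = (D2 / D2.sum()).cumsum()
    idx.append(int(np.searchsorted(cumprobs, rng.random())))
centroids = data_mat[idx]
print(idx)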
def generate_metrics(hashes, labels, hamming_N=500, hamming_R=2):
    # hamming distances in bits (pdist returns the fraction of differing bits,
    # so multiply by the code length)
    dists = pdist(hashes, metric="hamming") * hashes.shape[1]
    mAP = mean_average_precision(dists, labels)
    precision_at_N = precision_at_sample(dists, labels, hamming_N)
    hamming_rank = hamming_radius(dists, labels, hamming_R)
    return mAP, precision_at_N, hamming_rank
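# Example call (illustrative): rows of `hashes` are binary codes and `labels`
# are the matching class ids; mean_average_precision, precision_at_sample and
# hamming_radius are the project's own helpers and are not reproduced here,
# so the call itself is left commented.
import numpy as np

hashes = (np.random.rand(200, 48) > 0.5).astype(np.uint8)
labels = np.random.randint(0, 10, size=200)
# mAP, p_at_N, ham_rank = generate_metrics(hashes, labels, hamming_N=50, hamming_R=2)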