def __init__(self, spts, raw_data_spts):
    # Per-label volume and intensity statistics for the detected spots.
    rgp = regionprops_table(
        spts, raw_data_spts,
        properties=["label", "intensity_image", "area"])

    lbl_vol_ints_avints = []
    for k in range(len(rgp["label"])):
        lbl_vol_ints_avints.append([
            rgp["label"][k],
            rgp["area"][k],
            rgp["intensity_image"][k].sum(),
            rgp["intensity_image"][k].sum() / rgp["area"][k]
        ])
    lbl_vol_ints_avints = np.asarray(lbl_vol_ints_avints)

    # Cluster volume, total intensity and average intensity into 3 groups each.
    clusters_vol, ctrds = kmeans1d.cluster(lbl_vol_ints_avints[:, 1], 3)
    clusters_ints, ctrds = kmeans1d.cluster(lbl_vol_ints_avints[:, 2], 3)
    clusters_avints, ctrds = kmeans1d.cluster(lbl_vol_ints_avints[:, 3], 3)

    clusters_vol = np.asarray(clusters_vol)
    clusters_ints = np.asarray(clusters_ints)
    clusters_avints = np.asarray(clusters_avints)

    # Keep only labels assigned to the top cluster (index 2) for all three features.
    idxs = np.where(clusters_vol + clusters_ints + clusters_avints == 6)[0]

    ts_mtx = np.zeros(spts.shape, dtype=np.int32)
    for k in idxs:
        ts_mtx += int(lbl_vol_ints_avints[k, 0]) * (spts == int(
            lbl_vol_ints_avints[k, 0]))

    self.ts_mtx = ts_mtx
def Clustering(Fasta, Coverage, Size, cpu):
    Bin = []
    Fa_Size = {}
    with open(Fasta, 'r') as in_handle:
        for title, seq in SimpleFastaParser(in_handle):
            Bin.append(title.split()[0])
            Fa_Size[title.split()[0]] = len(seq)

    df = pd.read_table(Coverage, delimiter='\t', index_col=0)
    df = df.loc[Bin, :]
    if int(Size) > len(df.index):
        return {}, 0

    # start clustering
    if len(df.columns) == 1:
        # 1-d kmeans
        df1 = df.iloc[:, 0].to_list()
        df_name = df.index
        labels, centroids = kmeans1d.cluster(df1, int(Size))
    else:
        # kmeans
        df1 = df.values
        df_name = df.index
        # Number of clusters
        kmeans = KMeans(n_clusters=int(Size), n_init=30, n_jobs=cpu)
        # Fitting the input data
        kmeans = kmeans.fit(df1)
        # Getting the cluster labels
        labels = kmeans.predict(df1)
        # Centroid values
        centroids = kmeans.cluster_centers_

    Sep_fa = {}
    for i, j in enumerate(df_name):
        Sep_fa.setdefault(str(labels[i]), []).append(j)
    return Sep_fa, Fa_Size
def set_mask_matrix(self):
    # torch.set_printoptions(threshold=500000)
    self.var_matrix = self.var_matrix / self.count_var_cov
    var_flatten = torch.flatten(self.var_matrix)

    if self.margin == 0:    # kmeans1d clustering setting for ISW
        clusters, centroids = kmeans1d.cluster(
            var_flatten, self.clusters)  # 50 clusters
        num_sensitive = var_flatten.size()[0] - clusters.count(
            0)  # 1: Insensitive Cov, 2~50: Sensitive Cov
        print("num_sensitive, centroids =", num_sensitive, centroids)
        _, indices = torch.topk(var_flatten, k=int(num_sensitive))
    else:   # do not use
        num_sensitive = self.num_off_diagonal - self.margin
        print("num_sensitive = ", num_sensitive)
        _, indices = torch.topk(var_flatten, k=int(num_sensitive))

    mask_matrix = torch.flatten(torch.zeros(self.dim, self.dim).cuda())
    mask_matrix[indices] = 1

    if self.mask_matrix is not None:
        self.mask_matrix = (self.mask_matrix.int() & mask_matrix.view(
            self.dim, self.dim).int()).float()
    else:
        self.mask_matrix = mask_matrix.view(self.dim, self.dim)
    self.num_sensitive = torch.sum(self.mask_matrix)
    print("Check whether two ints are same", num_sensitive,
          self.num_sensitive)

    self.var_matrix = None
    self.count_var_cov = 0

    if torch.cuda.current_device() == 0:
        print("Covariance Info: (CXC Shape, Num_Off_Diagonal)",
              self.mask_matrix.shape, self.num_off_diagonal)
        print("Selective (Sensitive Covariance)", self.num_sensitive)
def pca_init(self):
    from sklearn.decomposition import PCA
    import kmeans1d

    # Get the codebook inputs for the init_dataset:
    codebook_layer_in = self.codebook_layer.input
    n_batches = self.estimation_points // self.data.batch_size
    pred_fn = tf.keras.backend.function(inputs=self.model.inputs,
                                        outputs=[codebook_layer_in])
    encoded_data = [
        pred_fn(self.init_dataset.__getitem__(i)[0])[0]
        for i in range(n_batches)
    ]
    encoded_data = np.concatenate(encoded_data)
    encoded_data = np.reshape(encoded_data, (-1, encoded_data.shape[-1]))

    # Apply PCA:
    pca_model = PCA(n_components=self.codebook_layer.groups)
    encoded_low = pca_model.fit_transform(encoded_data)

    # K-means over each principal component
    groups_centroids = []
    for component_values in encoded_low.T:
        clusters, centroids = kmeans1d.cluster(
            component_values, self.codebook_layer.codes_per_group)
        groups_centroids.append(centroids)

    groups_centroids = np.array(groups_centroids)[:, :, np.newaxis]
    codebook = groups_centroids * np.transpose(
        pca_model.components_[:, :, np.newaxis], (0, 2, 1))
    return codebook
def groupBlocks(blocks_in, prev_word, pg_num):
    # Cluster blocks horizontally by the following 5 categories:
    # left word, left cont., center, right word, right cont.
    clusters, centroids = kmeans1d.cluster(
        [block['bbox'][0] for block in blocks_in], 5)

    # add the cluster component to the blocks
    blocks = [{
        **block, 'cluster': cluster
    } for block, cluster in zip(blocks_in, clusters)]

    # sort in vertical direction
    blocks.sort(key=lambda block: (block['bbox'][1] + block['bbox'][3]) / 2 +
                (1e5 if block['bbox'][0] > centroids[2] else 0))

    left_words = []     # words on left side of page
    right_words = []    # words on right side of page
    split_words = []    # words that are split across sides/pages
    left_word = []      # holds data for left-side words
    right_word = []     # holds data for right-side words
    insert_word = None  # used to hold data from a split word

    for block in blocks:
        cluster = block['cluster']
        if cluster == 0:    # start of phrase (left side)
            if prev_word:
                insert_word = prev_word
                prev_word = False
            if left_word:
                add_word(left_words, left_word)
            left_word = add_group({}, add_block({}, block))
        elif cluster == 1:  # continuation of phrase (left side)
            if prev_word:
                split_word = add_group(prev_word, add_block({}, block))
                add_word(split_words, split_word)
                prev_word = None
                continue
            assert (left_word)
            add_block(left_word['groups'], block)
        elif cluster == 3:  # start of phrase (right side)
            if right_word:
                add_word(right_words, right_word)
            elif left_word:
                add_word(left_words, left_word)
                left_word = None
            right_word = add_group({}, add_block({}, block))
        elif cluster == 4:  # continuation of phrase (right side)
            if right_word:
                add_block(right_word['groups'], block)
            else:
                assert (left_word)
                split_word = add_group(left_word, add_block({}, block))
                add_word(split_words, split_word)
                left_word = None

    # make sure we terminate with the last item on either side
    last_word = right_word if right_word else left_word
    return left_words + right_words, split_words, last_word, insert_word
def shrink_categorical(cat, n=4):
    '''
    Reduces the categorical distribution to a distribution of size n using k-means clustering
    :param cat: categorical distribution
    :param n: number of bins/clusters to be reduced to
    :return:
    '''
    if (not hasattr(cat, "sample")) or (len(cat.vals) < n):
        return cat
    clusters, centroids = kmeans1d.cluster(cat.vals, n)
    probs = [0 for _ in range(n)]
    for cluster, prob in zip(clusters, cat.probs):
        probs[cluster] += prob
    return Categorical(centroids, probs=probs)
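# A minimal, self-contained sketch of the merge step above, using plain lists
# instead of this project's Categorical class; the values and probabilities
# below are made up purely for illustration.
import kmeans1d

vals = [0.10, 0.12, 0.50, 0.52, 0.90, 0.95]
probs = [0.20, 0.10, 0.30, 0.10, 0.20, 0.10]
clusters, centroids = kmeans1d.cluster(vals, 3)  # clusters == [0, 0, 1, 1, 2, 2]

merged_probs = [0.0] * 3
for c, p in zip(clusters, probs):
    merged_probs[c] += p  # probability mass of merged values adds up
# centroids are the new support points, merged_probs their summed probabilities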
def save_quantized_bin(self, path):
    import kmeans1d  # Not required for general pastiche usage, just for generating quantized model.
    k = 2 ** 8
    q_state = {}  # quantized state
    layer_names = [layer_name for layer_name in VGG19.LAYER_NAMES
                   if re.match(r'^block\d+_conv\d+$', layer_name)]
    for layer_name in layer_names:
        layer = getattr(self, layer_name)
        bias = layer.bias
        shape = layer.weight.shape
        weight = layer.weight.flatten()
        clusters, centroids = kmeans1d.cluster(weight, k)
        q_state[layer_name + '_W_q'] = torch.tensor(clusters, dtype=torch.uint8).reshape(shape)
        q_state[layer_name + '_W_table'] = torch.tensor(centroids, dtype=torch.float32)
        q_state[layer_name + '_b'] = bias.detach().to('cpu', copy=True)
    torch.save(q_state, path)
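# A hedged sketch of reading such a file back. The key names mirror the save
# routine above, but the path and layer name are placeholders, and how the
# reconstructed tensors are consumed by the surrounding project is assumed.
import torch

q_state = torch.load('vgg19_quantized.bin')    # hypothetical path
layer_name = 'block1_conv1'
codes = q_state[layer_name + '_W_q'].long()    # uint8 cluster indices, original weight shape
table = q_state[layer_name + '_W_table']       # 256 float32 centroids
weight = table[codes]                          # look up each index -> dequantized weights
bias = q_state[layer_name + '_b']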
def cluster(bins, flattened_val, val, inertia=None):
    if USE_KMEANS_CUDA and kmeans_cuda:
        invalids = None
        int_bins = bins
        while invalids is None or int_bins - invalids < bins:
            if invalids:
                int_bins = bins + invalids
            codebook, _ = kmeans_cuda(flattened_val.reshape((-1, 1)),
                                      int_bins, device=1)
            invalids = np.count_nonzero(
                np.isnan(codebook).any(axis=1)) + np.count_nonzero(
                    np.isneginf(codebook).any(axis=1)) + np.count_nonzero(
                        np.isposinf(codebook).any(axis=1))

        # this is not a good solution since it will possibly end up with the
        # wrong number of bins; however kmeans cuda is not easy to install,
        # so probably not right anyway
        codebook = codebook[~np.isnan(codebook).any(axis=1)]
        codebook = codebook[~np.isneginf(codebook).any(axis=1)]
        codebook = codebook[~np.isposinf(codebook).any(axis=1)]
        last_inertia = None
    elif USE_KMEANS_1D and kmeans1d:
        clustered = kmeans1d.cluster(flattened_val, bins)
        codebook = np.array(clustered.centroids)
        encoded = codebook[clustered.clusters]
        last_inertia = np.sum(
            np.power(flattened_val, 2) - np.power(encoded, 2))
    else:
        # scipy is horribly slow
        kmeans = KMeans(n_clusters=bins)
        kmeans.fit(flattened_val.reshape((-1, 1)))
        codebook = kmeans.cluster_centers_
        last_inertia = kmeans.inertia_

    codebook = codebook.astype(val.dtype).flatten()
    compressed_val, codes = codes_and_compressed(flattened_val, codebook,
                                                 val.shape)
    if last_inertia is not None and inertia is not None:
        inertia.append(last_inertia)
    return compressed_val, codes, codebook
l_m[1, 0] = 53.29799633
l_m[2, 0] = 22.6235733

df1 = pd.read_csv("kine_features1.csv")
print(df1.head())
data_12 = df1.values
norm = summarize_cls(data_12)
#print(data_12[1])
#print(class_separation(data_12))

d = list()
df = pd.read_csv("kine_class_1.csv")
#print(df.head())
data_1 = df["dist"]
data_2 = df.values

## creating clusters from workspace for sampling the theta_initial
clusters, centroids = kmeans1d.cluster(data_1, 5)
# assign each sample to a cluster
#km.fit(x.reshape(-1,1))
for i in centroids:
    d.append(i)

##def Reverse(Ist):
#    Ist.reverse()
#    return Ist
# Driver Code
#lst = [10, 11, 12, 13, 14, 15]
#print(Reverse(d))

d_1 = d[::-1]
f1 = {0: d_1[0], 1: d_1[1], 2: d_1[2], 3: d_1[3], 4: d_1[4]}
l_m_1 = sqrt(pow(l_m[0, 0], 2) + pow(l_m[1, 0], 2) + pow(l_m[2, 0], 2))
d2 = []
sentiment_list = []
objectivity_list = []
retweet_list = []

for tweet in tweets:
    favorites_list.append(tweet['favorites'])
    listed_list.append(tweet['listed'])
    followers_list.append(tweet['followers'])
    friends_list.append(tweet['friends'])
    statuses_list.append(tweet['statuses'])
    time_list.append(tweet['created_at'])
    sentiment_list.append(tweet['sentiment'])
    objectivity_list.append(tweet['objectivity'])
    retweet_list.append(tweet['retweets'])

favorites_clusters, favorites_centroids = kmeans1d.cluster(favorites_list, 16)
listed_clusters, listed_centroids = kmeans1d.cluster(listed_list, 7)
followers_clusters, followers_centroids = kmeans1d.cluster(followers_list, 11)
friends_clusters, friends_centroids = kmeans1d.cluster(friends_list, 16)
statuses_clusters, statuses_centroids = kmeans1d.cluster(statuses_list, 11)
time_clusters, time_centroids = kmeans1d.cluster(statuses_list, 24)
sentiment_clusters, sentiment_centroids = kmeans1d.cluster(sentiment_list, 5)
objectivity_clusters, objectivity_centroids = kmeans1d.cluster(
    objectivity_list, 5)

print(time_list)
print(time_clusters)
print(time_centroids)

x = []
def sbl(A, y, eps=1e-3, thresholding_method='tau'):
    '''
    Refer to Slide 31 of http://math.iisc.ernet.in/~nmi/Chandra%20Murthy.pdf for algorithm

    Inputs:
    A = measurement matrix
    y = measurements
    sigval = standard deviation of noise in measurement
    tau = threshold on signal
    '''
    # Doing this preprocessing inside sbl() function itself
    tau = 0.  #0.01 * np.min(y/np.sum(A, axis=-1))
    y_max = np.max(y)
    assert y_max >= 0
    if y_max > 0:
        A = A / y_max
        y = y / y_max
        pos_y = y[y > 0.]
        pos_A = A[y > 0.]
        sigval = np.std(pos_y / np.sum(pos_A, axis=-1))
    else:
        # sigma should be 0 but this will mess with the algo. Set it to some
        # small value
        sigval = 0.1

    y = np.array(y, dtype=np.float64)
    A = np.array(A, dtype=np.float64)
    [m, n] = A.shape

    # Pre-Processing
    # As all A_ij and x_j are positive, any y_i = 0 implies that for all j s.t.
    # A_ij = 1, x_j = 0. This reduces the problem dimension.
    #nz_index = (np.where(y != 0))[0]
    #z_index = (np.where(y == 0))[0]
    #red_y = y[nz_index]
    #[r,c] = np.where(A[z_index,:] != 0)
    #ind_zero_x = np.unique(c)
    #ind = np.arange(0, n)
    #ind_nonzero_x = np.setxor1d(ind,ind_zero_x)
    #red_x = x[ind_nonzero_x]
    #red_A = (A[nz_index,:])[:,ind_nonzero_x]
    #red_n = (ind_nonzero_x.shape)[0]

    red_A = A
    red_n = n
    red_y = y
    ind_nonzero_x = np.arange(n)

    # Sparse Bayesian Learning
    # Corner cases are when there are 0 samples or all ys are 0
    if red_n == 0 or np.all(red_y == 0):
        x_est = np.zeros(n)
    else:
        # E-step
        # mu is the estimated mean of the posterior distribution x|y, and is the
        # estimated red_x computed iteratively
        # Sigma is the variance of the posterior distribution of x|y
        # M-step
        # Gamma is the prior variance of x; inv_Gamma is saved since the E-step
        # only requires the inverse
        inv_Gamma = np.identity(red_n)
        gamma = np.ones(n)
        mu_old = np.ones(red_n)
        mu = np.zeros(red_n)
        variance = sigval * sigval
        #print('inside else')
        while np.sum(mu) == 0 or np.linalg.norm(
                mu_old - mu, 2) / np.linalg.norm(mu, 2) > eps:
            #print('inside loop')
            mu_old = mu
            inv_Sigma = np.matmul(np.transpose(red_A),
                                  red_A) / (variance) + inv_Gamma
            Sigma = np.linalg.inv(inv_Sigma)
            mu = np.matmul(Sigma, np.matmul(np.transpose(red_A),
                                            red_y)) / (variance)
            #mu[mu<0] = 0
            err = red_y - np.dot(red_A, mu)
            variance = (np.sum(err * err) + variance *
                        np.sum(1.0 - (1.0 / gamma) * np.diag(Sigma))) / m
            gamma = np.square(mu) + np.diag(Sigma)
            inv_Gamma = np.diag(1 / gamma)

        #pos_mu = mu[mu>0]
        #min_pos_mu = np.min(pos_mu)
        #mu[mu<=0] = min_pos_mu
        #log_mu = np.log(mu)
        #clusters, centroids = kmeans1d.cluster(log_mu, 2)
        #lower_centroid = centroids[0]
        #clusters = np.array(clusters)
        #lower_cluster = mu[clusters == 0]
        #lower_cluster_std = np.std(lower_cluster)
        #tau = lower_centroid - lower_cluster_std
        #log_tau = lower_centroid - lower_cluster_std
        #mu[log_mu < log_tau] = 0

        #print('\nmu = ', mu)
        #print('\nlog_mu = ', np.log(mu))
        #print('clusters = ', clusters)
        #print('centroids = ', centroids)

        assert thresholding_method in ['tau', 'cluster']
        if thresholding_method == 'cluster':
            # Cluster with mu_square instead of with mu to get even more precise
            # results! Still catch 40% of positives on avg
            mu1 = np.array(mu)
            mu1[mu1 < 0] = 0
            mu_square = mu1 * mu1
            #mu_square = mu
            clusters, centroids = kmeans1d.cluster(mu_square, 2)
            mu = mu * np.array(clusters)

        x_est = np.zeros(n)
        x_est[ind_nonzero_x] = mu
        if thresholding_method == 'tau':
            x_est[x_est < tau] = 0
    #print(x_est)
    return x_est
def test_cluster(self):
    x = [4.0, 4.1, 4.2, -50, 200.2, 200.4, 200.9, 80, 100, 102]
    k = 4
    clusters, centroids = cluster(x, k)
    self.assertEqual(clusters, [1, 1, 1, 0, 3, 3, 3, 2, 2, 2])
    self.assertEqual(centroids, [-50.0, 4.1, 94.0, 200.5])
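# The test above unpacks the result as a (clusters, centroids) pair; as the
# compression example earlier (clustered.clusters / clustered.centroids) also
# shows, the return value can instead be kept whole and read by attribute.
# A small sketch with made-up data:
import kmeans1d

result = kmeans1d.cluster([4.0, 4.1, 4.2, -50], 2)
assert result.clusters == [1, 1, 1, 0]  # labels in input order; cluster 0 has the smallest centroid
print(result.centroids)                 # two centroids, sorted in ascending order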
Data_I_Arr = pd.Series(Data_I).array
Data_ID_Arr = pd.Series(Data_ID).array
Data_PR_Arr = pd.Series(Data_PR).array
Data_PRD_Arr = pd.Series(Data_PRD).array
Data_BR_Arr = pd.Series(Data_BR).array
Data_ER_Arr = pd.Series(Data_ER).array
Data_PV_Arr = pd.Series(Data_PV).array
Data_SD_Arr = pd.Series(Data_SD).array
Data_OS_Arr = pd.Series(Data_OS).array
Data_B_Arr = pd.Series(Data_B).array
Data_R_Arr = pd.Series(Data_R).array
Data_TT_Arr = pd.Series(Data_TT).array
Data_W_Arr = pd.Series(Data_W).array
Data_Label_Arr = pd.Series(Data_Label).array

clusters, centroids_A = kmeans1d.cluster(Data_A_Arr, 5)
clusters, centroids_AD = kmeans1d.cluster(Data_AD_Arr, 5)
clusters, centroids_I = kmeans1d.cluster(Data_I_Arr, 5)
clusters, centroids_ID = kmeans1d.cluster(Data_ID_Arr, 5)
clusters, centroids_PR = kmeans1d.cluster(Data_PR_Arr, 5)
clusters, centroids_PRD = kmeans1d.cluster(Data_PRD_Arr, 5)
clusters, centroids_BR = kmeans1d.cluster(Data_BR_Arr, 5)
clusters, centroids_ER = kmeans1d.cluster(Data_ER_Arr, 5)
clusters, centroids_PV = kmeans1d.cluster(Data_PV_Arr, 5)
clusters, centroids_SD = kmeans1d.cluster(Data_SD_Arr, 5)


def find_Centroid(arr):
    Out = np.array([0.0, 0.0, 0.0, 0.0])
    Out[0] = (arr[0] + arr[1]) / 2
    Out[1] = (arr[1] + arr[2]) / 2