def test_converges(self):
    for ix, input in enumerate(self.test_cases['input']):
        km = KMeans(input, self.test_cases['K'][ix])
        km._init_centroids()
        old_centroid, centroid, bool_value = self.test_cases['converge'][ix]
        km.old_centroids, km.centroids = old_centroid, centroid
        self.assertEqual(km.converges(), bool_value)

def test_Kmeans(self):
    for ix, input in enumerate(self.test_cases['input']):
        km = KMeans(input, self.test_cases['K'][ix])
        km.fit()
        np.testing.assert_array_equal(km.centroids, self.test_cases['kmeans'][ix])

def test_get_labels(self):
    for ix, input in enumerate(self.test_cases['input']):
        km = KMeans(input, self.test_cases['K'][ix])
        km._init_centroids()
        km.get_labels()
        np.testing.assert_array_equal(km.labels, self.test_cases['labels'][ix])
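# The unit tests above (and test_get_centroids further down) exercise a KMeans
# class that is not included in this collection. The sketch below is a minimal,
# assumed implementation satisfying the attribute and method names the tests
# use (centroids, old_centroids, labels, _init_centroids, get_labels,
# get_centroids, converges, fit); the init strategy, the tolerance handling and
# the name KMeansSketch are guesses, not the original code.
import numpy as np

class KMeansSketch:
    def __init__(self, X, K, tolerance=1e-4, max_iter=100):
        self.X = np.asarray(X, dtype=float)
        self.K = K
        self.tolerance = tolerance
        self.max_iter = max_iter
        self.centroids = None
        self.old_centroids = None
        self.labels = None

    def _init_centroids(self):
        # 'first' init: the first K distinct points, in original order
        _, idx = np.unique(self.X, axis=0, return_index=True)
        self.centroids = self.X[np.sort(idx)[:self.K]].copy()
        self.old_centroids = np.zeros_like(self.centroids)

    def get_labels(self):
        # assign each point to its nearest centroid (squared Euclidean distance)
        d = ((self.X[:, None, :] - self.centroids[None, :, :]) ** 2).sum(-1)
        self.labels = d.argmin(axis=1)

    def get_centroids(self):
        # keep the previous centroids, then recompute each as its cluster mean
        self.old_centroids = self.centroids.copy()
        self.centroids = np.array(
            [self.X[self.labels == k].mean(axis=0) if np.any(self.labels == k)
             else self.old_centroids[k] for k in range(self.K)])

    def converges(self):
        return np.allclose(self.old_centroids, self.centroids, atol=self.tolerance)

    def fit(self):
        self._init_centroids()
        for _ in range(self.max_iter):
            self.get_labels()
            self.get_centroids()
            if self.converges():
                break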
def kmeans_clustering():
    """
    For the "Kmeans" button.
    Use the k-means algorithm to perform cluster analysis on generated or
    loaded data and display the clustering results.
    :return:
    """
    global X
    Kmeans.run_given_data(X, int(k.get()))
    refresh_photo('./graph/origin.png', './graph/clustering.png')
def SegmentImages(trainDataPath, trainGroundTruth):
    # reading files from training data
    for filename in glob.glob(trainDataPath + "\\" + "*.jpg"):
        img = mpimg.imread(filename, format="jpg")
        rows = len(img)
        cols = len(img[0])
        labels, clusters = Kmeans.Kmeans(img, 3)
        print("Image After Clustering")
        plt.imshow(labels)
        plt.show()
        # 154401 = 481 * 321, i.e. the images are assumed to have a fixed size
        labelsAs1D = np.reshape(labels, 154401)
        # print(f" {labelsAs1D}")
        # reading files from ground truth
        filename_w_ext = os.path.basename(filename)
        imageName, file_extension = os.path.splitext(filename_w_ext)
        mat = scipy.io.loadmat(trainGroundTruth + "\\" + imageName + ".mat")
        numberOfImages = len(mat['groundTruth'][0])
        fig, ax = plt.subplots(1, numberOfImages + 1)
        ax[0].imshow(img)
        for k in range(numberOfImages):
            groundImage = mat['groundTruth'][0][k][0][0][0]
            ax[k + 1].imshow(groundImage)
        plt.show()
        for i in range(numberOfImages):
            groundImage = mat['groundTruth'][0][i][0][0][0]
            groundTruthAs1D = np.reshape(groundImage, 154401)
            matrix = pd.crosstab(labelsAs1D, groundTruthAs1D,
                                 rownames=['labels'], colnames=['img'])
            # print(matrix)
            # converting the DataFrame to a NumPy array
            matrix = matrix.values
            fScore = Kmeans.getFScore(matrix)
            conditionalEntropy = Kmeans.getConditionalEntropy(matrix)
            print(f"Scores against groundTruth image {i}:")
            print("fScore is ", fScore)
            print("conditionalEntropy ", conditionalEntropy)
            print("\n\n")
def get_color_predictions(images, max_k):
    # preds = np.empty((len(images), k), dtype='<U8')
    preds = []
    for ix, input in enumerate(images):
        # We observed that the number of iterations needed is close to k*5;
        # if a run exceeds that, it is not being efficient.
        # The tolerance could be 0.05, but it is not worth it.
        kms = km.KMeans(input, 1, {"km_init": "kmeans++",
                                   "max_iter": max_k * 5,
                                   "threshold": 0.35,
                                   "fitting": "DB",
                                   "tolerance": 0.1,
                                   "background_mask": 250})
        kms.find_bestK(max_k)
        kms.fit()
        preds.append(km.get_colors(kms.centroids))
    return np.array(preds)
def main():
    X = initProblem(N, K, Nmax)
    for it in range(10):  # renamed from i to avoid shadowing the loops below
        W2, Y = construct_Wks(X, N, K, M, Mmax, W)
        Xtab = bicluster_SDP(X, K, N, M, Mmax, Nmax, W2)
        xi = extract_xi_diag(Xtab)
        # Converting for k-means
        kmeansVars = [np.zeros(K) for _ in range(N)]
        for i in range(K):      # for each cluster
            for j in range(N):  # for each coordinate
                kmeansVars[j][i] = xi[i][j]
        # Using constrained k-means
        KM = Kmeans.KmeansConstrained(kmeansVars, Nmax, N, K, 100)
        test = [np.array([-5, 0]),
                np.array([-3, 0]),
                np.array([-4, 0]),
                np.array([2, 0]),
                np.array([3, 0])]
        # KM = Kmeans.KmeansConstrained(test, 3, 5, 2, 50)
        KM.initialization()
        KM.assignment(0)
        X = map_to_X(KM.map)
        print(Y)
        print(KM.map)
def main():
    data = pd.read_csv('/Users/bytedance/Desktop/AI/data/wine.data.csv')
    label = data["0"].to_numpy()
    del data["0"]
    data = data / data.max(axis=0)  # normalize
    data = data.to_numpy()

    # PCA
    K = 3
    for thresh in [0.9, 0.8, 0.7, 0.6, 0.5]:
        new_data, _, _ = PCA.PCA(data.T, 2, True, thresh)
        ndim = new_data.shape[1]
        print(f"======== kmeans, K = {K}, ndim = {ndim}, thresh = {thresh} =========")
        if ndim == 2:
            plt.figure(1)
            plt.scatter(new_data[:, 0], new_data[:, 1], s=50)
        S, RI, predicted_label = Kmeans.test_kmeans(new_data, label, K)
        df_data = pd.DataFrame(new_data)
        df_label = pd.DataFrame(predicted_label)
        result_df = pd.concat([df_label, df_data], axis=1)
        result_df.to_csv(f"./result_ndim{ndim}_K{K}.csv")
def seqkm(k, Images, SampleSize):
    print("SeqKM start")
    v = []
    PredictedLabels = []
    f = k
    while f > 0:
        v.append(100)
        f = f - 1
    if SampleSize < len(Images):
        M = rd.choices(Images, k=SampleSize)
    else:
        M = Images
    # print("choose " + str(k) + " centroid with kmeans++")
    centers, label = Kmeans.KMeansPlusplus(M, k)
    # f = 0
    # i = 0
    # for image in Images:
    #     distances = [euclidean_distance(centroid, image) for centroid in centers]
    #     j = distances.index(min(distances))
    #     PredictedLabels.append(j)
    #     i = i + 1
    #     v[j] = v[j] + 1
    #     epsilon = 1 / v[j]
    #     f = f + 1
    #     if SampleSize < len(Images):
    #         # print("update centroid number " + str(j))
    #         for i in range(0, len(image)):
    #             centers[j][i] = ((1 - epsilon) * centers[j][i] + 0.5) + (epsilon * image[i] + 0.5)
    print("SeqKM done")
    return v, PredictedLabels, centers
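# The commented-out block above describes SeqKM's online step: after a sample
# is assigned to its closest centroid j, the counter v[j] grows and the
# centroid moves toward the sample with step epsilon = 1/v[j]. A minimal sketch
# of that update, assuming plain Euclidean distance and list-of-lists centers;
# the "+ 0.5" rounding terms of the original comments are deliberately left out.
import numpy as np

def seq_update(centers, v, sample):
    # find the closest centroid
    distances = [np.linalg.norm(np.asarray(c) - np.asarray(sample)) for c in centers]
    j = int(np.argmin(distances))
    # grow its counter and shrink the learning rate accordingly
    v[j] += 1
    epsilon = 1.0 / v[j]
    # move centroid j toward the sample
    centers[j] = [(1 - epsilon) * c + epsilon * s for c, s in zip(centers[j], sample)]
    return j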
def deal_image(file_path: str, step: int, dots: int, to_show: bool, to_save: bool,
               dist_fun_str: str, random: bool):
    coll = io.ImageCollection(file_path)
    if len(coll) == 0:
        return ReturnCode.NO_SUCH_FILE
    for index in range(len(coll)):
        img = np.array(coll[index])
        dx = int(img.shape[0] / step)
        dy = int(img.shape[1] / step)
        if dx == 0 or dy == 0 or step <= 0:
            return ReturnCode.INVALID_STEPS
        if dots == 0 or dots >= img.shape[0] * img.shape[1]:
            return ReturnCode.INVALID_DOTS
        features = []
        for x in range(step):
            for y in range(step):
                Y = np.mean(img[x * dx:(x + 1) * dx, y * dy:(y + 1) * dy, 0])
                U = np.mean(img[x * dx:(x + 1) * dx, y * dy:(y + 1) * dy, 1])
                V = np.mean(img[x * dx:(x + 1) * dx, y * dy:(y + 1) * dy, 2])
                features.append([Y, U, V])
        # i = img.reshape(img.shape[0] * img.shape[1], 3)
        i = features
        dist_fun = Kmeans.ecludDist if dist_fun_str == 'ecludDist' else Kmeans.manhattanDist
        # seed with random centers when random=True, ordered centers otherwise
        # (mirrors deal_images below; the original had randCenter in both branches)
        (index_in_center, center) = Kmeans.kMeans(
            i, dist_fun,
            Kmeans.randCenter(i, dots) if random else Kmeans.orderCenter(i, dots),
            dots)
        res = []
        for j in index_in_center:
            res.append(center[j])
        print(res)
        ni = np.zeros((img.shape[0], img.shape[1], 3))
        for n in range(len(res)):
            for x in range(dx):
                for y in range(dy):
                    ni[int(n / step) * dx + x, n % step * dy + y] = [c / 255 for c in res[n]]
        if to_show:
            plt.imshow(ni)
            plt.axis('off')
            plt.show()
        if to_save:
            plt.imshow(ni)
            plt.axis('off')
            new_name = file_path.split('.')
            new_name[0] = new_name[0] + '-' + str(index) + '-' + str(step) + '.'
            new_name = "".join(new_name)
            plt.savefig(new_name)
    return ReturnCode.SUC
def main():
    global cls, data, clsKData
    km.main()
    data = km.data
    cls = km.cls
    N = len(data)
    dataArray = np.zeros((len(data), 2))  # 2-dim array for all data
    for i in range(N):
        k = cls[i]
        # for the first-time calculation of mean, amplitude, cov
        clsKData[k].append([float(data[i][0]), float(data[i][1])])
        dataArray[i][0] = float(data[i][0])
        dataArray[i][1] = float(data[i][1])
    gammas = np.zeros((len(data), 3))  # responsibility of data point i for cluster k
    means, covs, amplitudes = Expectation(True, gammas, dataArray)
    time = 0
    converge = False
    gammas = Maximization(means, covs, amplitudes, dataArray, True)
    while time < MAX_ITER and converge == False:
        prevMeans = means
        means, covs, amplitudes = Expectation(False, gammas, dataArray)
        gammas = Maximization(means, covs, amplitudes, dataArray, False)
        time += 1
        meanDiff = np.abs(np.subtract(means, prevMeans))
        converge = True
        for i in range(3):
            # print(meanDiff[i])
            meanDis = pow((meanDiff[i][0] ** 2 + meanDiff[i][1] ** 2), 0.5)
            # print(meanDis)
            if meanDis >= 0.00001:
                converge = False
    for i in range(3):
        print("Mean", i + 1, ": ")
        print(means[i])
        print("Covariance", i + 1, ": ")
        for j in range(2):
            print(covs[i][j])
        print("Amplitude", i + 1, ": ")
        print(amplitudes[i])
def train(self, X_train, y_train, learning_rate=0.5, reg=1e-3, num_iters=100,
          batch_size=200, print_progress=False):
    """
    Inputs:
    - X_train: A PyTorch tensor of shape (N, D) containing training data;
      there are N training samples, each of dimension D.
    - y_train: A PyTorch tensor of shape (N,) containing training labels;
      y[i] in {-1, 1} is the class label of X[i].
    - K: number of clusters
    - lamb: global regularization factor
    - learning_rate: (float) learning rate for optimization.
    - reg: (float) regularization strength (i.e. lambda).
    - num_iters: (integer) number of steps to take when optimizing.
    - batch_size: (integer) number of training examples to use at each step.
    - print_progress: (boolean) if true, print progress during optimization.
    - exit_diff: (float) condition to stop gradient descent if the change in
      loss is too small.

    Returns a tuple of:
    - loss_hist: A PyTorch tensor giving the value of the loss at each
      training iteration.
    """
    N, D = X_train.shape

    # clustering
    cluster_label, centroid = Kmeans(X_train, self.K)
    self.centroid = centroid

    # feature extension
    X_train_hat = self.feature_extension(X_train, cluster_label)

    # train the linear SVM
    loss_hist = self.LSVM.train(X_train_hat, y_train, reg=reg,
                                num_iters=num_iters, learning_rate=learning_rate)

    # SVM parameters
    W_hat = torch.tensor(self.LSVM.W, dtype=X_train.dtype, device=X_train.device)

    # global regularizer
    self.W = 1 / np.sqrt(self.lamb) * W_hat[:D]

    # local predictors
    self.Wl = torch.zeros(D, self.K, dtype=X_train.dtype, device=X_train.device)
    for l in range(self.K):
        self.Wl[:, l] = W_hat[(D * (l + 1)):(D * (l + 2))] + self.W

    return loss_hist
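# Hypothetical usage of the clustered locally-linear SVM trained above. The
# class name ClusteredSVM, its constructor arguments and the toy data are
# assumptions; only train()'s own signature comes from the snippet.
import torch

X_train = torch.randn(500, 10)                 # N = 500 samples, D = 10 features
y_train = torch.randint(0, 2, (500,)) * 2 - 1  # labels in {-1, 1}
model = ClusteredSVM(K=4, lamb=1.0)            # assumed constructor
loss_hist = model.train(X_train, y_train, learning_rate=0.5, reg=1e-3,
                        num_iters=200, batch_size=200, print_progress=True)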
def plot_distances(data, max_val, min_val=2):
    distances = []
    for i in range(min_val, max_val + 1):
        model = Kmeans.Kmeans(i, data)
        distances.append(model.train(show_graph=False))
    # x-axis follows min_val (the original hard-coded an offset of 2)
    plt.plot(range(min_val, max_val + 1), distances)
    plt.xlabel("Number of clusters")
    plt.ylabel("Total Sum")
    plt.title("Elbow Method")
    plt.show()
def run():
    # data with multiple dimensions (4 features)
    # data = pd.read_csv("results4-feat.csv")

    # dataset with 2 features for testing graphs and visualizations
    data = pd.read_csv("results_short.csv")

    # while True:
    # plot_distances(data, max_val=5)
    model = Kmeans.Kmeans(k=2, data=data)
    model.train(show_graph=True)
def kmeans(self, trainset, testset, k, k_for_cluster, isClassification):
    km = Kmeans.Kmeans(k_for_cluster, trainset)
    # centroids = km.converge()
    centroids_class = km.getClusters()
    centroids_class = centroids_class[testset.columns]
    # call knn with the reduced train set (the centroids)
    predicted = Knn.Knn().fit(centroids_class.values, testset, k, isClassification)
    # return predicted and actual labels
    return predicted, testset.iloc[:, -1]
def kmean_statistics(images, options, kmax=10, nsamples=250):
    # one slot per K in [2, kmax]
    global_times = np.zeros(kmax - 1)
    global_scores = np.zeros(kmax - 1)
    global_iterations = np.zeros(kmax - 1)
    n_images = len(images[:nsamples])
    for ix, input in enumerate(images[:nsamples]):
        local_times = []
        local_scores = []
        local_iterations = []
        kms = km.KMeans(input, 1, options)
        for k in range(2, kmax + 1):  # K=1 would underflow the k-2 indexing
            start = time.time()
            kms.K = k
            kms.fit()
            score = kms.perform_score()
            end = time.time()
            elapsed = end - start
            global_times[k - 2] += elapsed
            global_scores[k - 2] += score
            global_iterations[k - 2] += kms.num_iter
            # local_scores.append(score)
            # local_iterations.append(kms.num_iter)
            # local_times.append(elapsed)
            # print("Results for image " + str(ix) + " with k=" + str(k))
            # print("Score: " + str(score))
            # print("Iterations needed: " + str(kms.num_iter))
            # print("Elapsed time: " + str(elapsed))
            # print("")
            # visualize_k_means(kms, input.shape)
        # score_series = pd.Series(local_scores, index=list(range(2, kmax + 1)), name="Score")
        # score_series.plot(legend=True)
        # plt.show()
        # iterations_series = pd.Series(local_iterations, index=list(range(2, kmax + 1)), name="Iterations")
        # iterations_series.plot(legend=True)
        # plt.show()
        # time_series = pd.Series(local_times, index=list(range(2, kmax + 1)), name="Time")
        # time_series.plot(legend=True)
        # plt.show()
    # average over the images actually processed, not the whole collection
    global_scores /= n_images
    global_iterations /= n_images
    global_times /= n_images
    score_series = pd.Series(global_scores, index=list(range(2, kmax + 1)), name="Score")
    score_series.plot(legend=True)
    plt.show()
    iterations_series = pd.Series(global_iterations, index=list(range(2, kmax + 1)), name="Iterations")
    iterations_series.plot(legend=True)
    plt.show()
    time_series = pd.Series(global_times, index=list(range(2, kmax + 1)), name="Time")
    time_series.plot(legend=True)
    plt.show()
def main(k, m="means", init_type="random"):
    # Start the clustering timer
    start_cluster = timeit.default_timer()

    # Initialize clusters
    if init_type == "random":
        initial_clusters = Initialize.random_centers(k)
    else:
        init_type = "kplusplus"
        initial_clusters = Initialize.kmeans_plusplus(k, train_images_flat,
                                                      dist_fn=Distance.sumsq)

    # Run the clustering algorithm
    final_responsibilities, final_clusters = Kmeans.kmeans(
        k, train_images_flat, initial_clusters,
        distfn=Distance.sumsq, method=m)

    # Find and print the clustering time
    end_cluster = timeit.default_timer()
    clustering_time = end_cluster - start_cluster
    print "Time spent clustering : ", clustering_time

    # Save representative images to file.
    title = m + "_" + init_type + "_cluster" + str(k)
    File.save_images(k, train_images, final_responsibilities, final_clusters, title)

    ###########################################################################
    #                           Calculate Accuracy                            #
    ###########################################################################

    # Calculate the final accuracy for the clusters
    final, cluster_set = Accuracy.final_accuracy(final_responsibilities,
                                                 train_labels,
                                                 train_images_flat,
                                                 final_clusters)

    # Now see how well we can classify the dataset
    start_cluster_test = timeit.default_timer()
    predictions = ClassifyClusters.classify(cluster_set, test_images_flat,
                                            test_labels, distfn=Distance.sumsq)
    finish_cluster_test = timeit.default_timer()

    # find the time it took to test
    testing_time = finish_cluster_test - start_cluster_test
    print "Time spent testing : ", testing_time

    ###########################################################################
    #                                Outputs                                  #
    ###########################################################################

    # k, prediction accuracy, cluster set, and timings
    results = {"k": k,
               "prediction_accuracy": predictions[1],
               "cluster_means": cluster_set,
               "cluster_stats": final,
               "clustering_time": clustering_time,
               "testing_time": testing_time}

    with open('./results/' + title + '/' + title + '_results.json', 'w') as outfile:
        json.dump(results, outfile, cls=File.NumpyEncoder)
def deal_images(dir_path: str, save: bool, dist_fun_str: str, random: bool, dots: int):
    extension_list = ['/*.jpeg', '/*.jpg', '/*.png', '/*.bmp']
    k = dots
    filepath = dir_path
    for extension in extension_list:
        filepath = dir_path + extension
        coll = io.ImageCollection(filepath)
        if len(coll) != 0:
            img_array = np.array(coll[0])
            arr = np.empty((0, img_array.shape[0] * img_array.shape[1],
                            1 if len(img_array.shape) != 3 else img_array.shape[2]))
            for img in coll:
                img_array = np.array(img)
                img_array = img_array.reshape(
                    img_array.shape[0] * img_array.shape[1],
                    1 if len(img_array.shape) != 3 else img_array.shape[2])
                arr = np.concatenate((arr, [img_array]), axis=0)
            dist_fun = Kmeans.ecludDist if dist_fun_str == 'ecludDist' else Kmeans.manhattanDist
            (index_in_center, center) = Kmeans.kMeans(
                arr, dist_fun,
                Kmeans.orderCenter(arr, k) if not random else Kmeans.randCenter(arr, k),
                k)
            # (index_in_center, center) = Kmeans.mul_kMeans(arr, dist_fun, 3, 10)
            for i in range(center.shape[0]):
                s_dir = dir_path + '/imageClass' + str(i)
                if not os.path.exists(s_dir):
                    os.makedirs(s_dir)
                if save:
                    img_form = np.array(coll[0]).shape
                    plt.imshow(np.array(center[i]).reshape(
                        img_form[0], img_form[1],
                        img_form[2] if len(img_form) == 3 else 1))
                    plt.axis('off')
                    # plt.show()
                    plt.savefig(s_dir + '/imageOfClass' + str(i) + '.' + extension.split('.')[1])
            for i in range(len(coll)):
                s_dir = (dir_path + '/imageClass' + str(index_in_center[i]) + '/'
                         + str(i) + '.' + extension.split('.')[1])
                plt.imshow(coll[i])
                plt.axis('off')
                plt.savefig(s_dir)
    return ReturnCode.SUC
def processing(sentence, commentId):
    sentimentDb = db.Web_Comment_Analyzed
    print("handling sentence: ", sentence)
    obj = sentimentAnalysisExecute(sentence)
    obj['commentId'] = commentId
    print(obj)
    kmeanPredicted = Kmeans.predict(obj["score"])
    obj['predict'] = kmeanPredicted
    sentimentDb.insert(obj)
    print("sentiment of the sentence using kmeans:", kmeanPredicted)
    return obj['predict']
def get_kmeans_accuracy(kmeans_labels_test, images, KMax, max_images_to_use, options):
    plt.clf()
    accerted_ratios_for_all_images = []
    print("estimated time: 1 minute")
    if len(used_kmeans_images) != max_images_to_use:
        for i in range(len(used_kmeans_images), max_images_to_use):
            # randint's upper bound is inclusive, so subtract 1 to stay in range
            number_to_use = random.randint(0, images.shape[0] - 1)
            used_kmeans_images.append(number_to_use)
    time1 = time.time()
    for number_to_use in used_kmeans_images:
        accerted_ratios = []
        for j in range(2, KMax):
            km = Kmeans.KMeans(images[number_to_use], j, options)
            km.fit()
            returned_from_kmeans_color_labels = Kmeans.get_colors(km.centroids)
            accerted = get_color_accuracy(kmeans_labels_test[number_to_use],
                                          returned_from_kmeans_color_labels)
            # visualize_k_means(km, images[number_to_use].shape)
            accerted_ratios.append(accerted)
        accerted_ratios_for_all_images.append(accerted_ratios)
    for i in range(len(used_kmeans_images)):
        plt.scatter(list(range(2, KMax)), accerted_ratios_for_all_images[i],
                    label="image " + str(used_kmeans_images[i]))
    plt.legend()
    plt.title("KMeans accerted % " + options["km_init"] + " ratio")
    plt.xlabel("K")
    plt.ylabel("accerted % ratios kmeans")
    plt.savefig(output_folder + "kmeans " + options["km_init"] + " Accerted.png")
    print(time.time() - time1)
def kmeans_statistics(images, KMax):
    times = []
    iterations = []
    wcds = []
    for i in range(2, KMax):
        km = Kmeans.KMeans(images, i)
        time1 = time.time()
        iterations_needed = km.fit()
        times.append(time.time() - time1)
        iterations.append(iterations_needed)
        wcds.append(km.whitinClassDistance())
    return times, iterations, wcds
def apply_kmeans(image_path):
    original_image = imread(image_path)
    # original_image = spm.imresize(original_image, (64, 64))
    original_image = np.array(original_image, dtype=np.float64) / 255
    w, h, d = original_shape = tuple(original_image.shape)
    assert d == 3
    quantizer, labels = Kmeans.Kmeans_algorithm(original_image, w, h, d)
    quantized_image = Kmeans.recreate_image(quantizer.cluster_centers_, labels, w, h)
    cluster_pixel_map = np.reshape(labels, (h, w))
    # plt.imshow(quantized_image)
    # plt.show()
    cluster_probability = Counter()
    cluster_probability.update(labels)
    cluster_probability = dict(cluster_probability)
    cluster_probability.update(
        (k, float(v) / len(labels)) for k, v in cluster_probability.items())
    return original_image, quantized_image, cluster_pixel_map, cluster_probability
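# A small usage sketch for apply_kmeans() above; "sample.jpg" is a placeholder
# path. cluster_probability maps each cluster id to the fraction of image
# pixels assigned to it, so sorting by value lists the dominant clusters first.
import matplotlib.pyplot as plt

original, quantized, pixel_map, probs = apply_kmeans("sample.jpg")
plt.imshow(quantized)
plt.axis('off')
plt.show()
print(sorted(probs.items(), key=lambda kv: -kv[1]))  # dominant clusters first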
def test_get_centroids(self):
    for ix, input in enumerate(self.test_cases['input']):
        km = KMeans(input, self.test_cases['K'][ix])
        km._init_centroids()
        km.get_labels()
        km.get_centroids()
        # Compare old centroids
        np.testing.assert_array_equal(km.old_centroids,
                                      self.test_cases['get_centroid'][ix][0])
        # Compare new centroids
        np.testing.assert_array_equal(km.centroids,
                                      self.test_cases['get_centroid'][ix][1])
def datosParaGraficarKmeans(minSize=2, maxSize=100, step=1, runs=200, nclust=5, it=10):
    totalC = []  # average number of comparisons for each array size
    totalM = []  # average number of moves
    img_clstr = km.K_means(n_clusters=nclust, iterations=it)
    for size in range(minSize, maxSize, step):
        sum_mov = 0
        sum_comp = 0
        for i in range(runs):
            test_array = createIntArray(size)
            img_clstr.fit(test_array)
            sum_mov += img_clstr.mov
            sum_comp += img_clstr.comp
        totalM.append(sum_mov / runs)
        totalC.append(sum_comp / runs)
    return totalC, totalM
def runkmeans():
    global k, n, f, x, y, labels, asli
    t_k = k.get()
    t_n = n.get()
    t_f = f.get()
    print("k, n, f")
    print(t_k, t_n, t_f)
    if t_n == 0 or t_k == 0:
        print("please change the 0 value")
        return
    else:
        print("start timer for KMeans")
        time_start = time.perf_counter()
        x, y, labels = Kmeans.guikmeans(t_k, t_n, t_f)
        elapsed = time.perf_counter() - time_start
        print("run time: " + str(elapsed))
        show_result()
    return
def classify_images(self, img_array, nclust, it, plot_3d=False, listnames=None):
    self.image_cluster = km.K_means(n_clusters=nclust, iterations=it)
    self.image_cluster.fit(img_array)
    self.cluster_map['values'] = img_array
    self.cluster_map['labels'] = self.image_cluster.labels_
    self.cluster_map['filename'] = listnames
    self.complete_center = [False] * len(self.image_cluster.cluster_centers_)
    # To plot dominant colors and centers
    if plot_3d:
        colors = ['red', 'green', 'blue', 'cyan', 'orange']
        color_list = []
        for label in self.image_cluster.labels_:
            color_list.append(colors[int(label)])
        fig = plt.figure(2)
        ax = Axes3D(fig)
        img_array = np.array(img_array)
        centers = self.image_cluster.cluster_centers_
        ax.scatter(img_array[:, 0], img_array[:, 1], img_array[:, 2], c=color_list)
        ax.scatter(centers[:, 0], centers[:, 1], centers[:, 2],
                   marker='*', c='#050505', s=500)
resolution="l", area_thresh=1000.0, ) # draw coastlines, state and country boundaries, edge of map. m.drawcoastlines() m.drawstates() m.drawcountries() x, y = m(sgif.Longitude.values, sgif.Latitude.values) m.scatter(x, y) # dummy Kmeans in Euclidean metric X = sgif[["Latitude", "Longitude"]].values k = int(1.05 * sgif.Weight.sum() / 1000.0) init_centres = X[[int(len(sgif) * p) for p in np.random.sample(k)]] centroids, Xto, dist = km.kmeans(X, init_centres, metric=haversine) x, y = m(centroids[:, 1], centroids[:, 0]) m.scatter(x, y, color="r") a = 20.0 # 100. TODO this param will probably help to make cluster of the desired width... mydis = lambda x, y: haversine(y, (x[0], y[1])) + 0.5 * a * (haversine(x, (x[0], y[1])) + haversine(y, (y[0], x[1]))) mydis2 = ( lambda x, y: AVG_EARTH_RADIUS * np.pi / 180 * ( a / 2.0 * abs((x[1] - y[1] + 180) % 360 - 180) * (np.cos(x[0] * np.pi / 180) + np.cos(y[0] * np.pi / 180)) + abs(x[0] - y[0]) ) ) mydis3 = (
def CompleteWorkflow(full_PSI_InputFile, EventAnnot, rho_cutoff, strategy, seq, gsp, forceBroadClusters, AnalysisRound):
    """ This function is used to perform a single iteration of the OncoSplice workflow (called from main),
    including the unsupervised splicing analysis (splice-ICGS) and signature depletion """

    ### Filter the EventAnnotation PSI file with non-depleted events from the prior round
    filtered_EventAnnot_dir = filterEventAnnotation.FilterFile(full_PSI_InputFile, EventAnnot, AnalysisRound)

    try:
        print "Running splice-ICGS for feature selection - Round" + str(AnalysisRound)
        ### Reset the below variables, which can be altered in prior rounds
        gsp.setGeneSelection('')
        gsp.setGeneSet('None Selected')
        gsp.setPathwaySelect([])
        species = gsp.Species()
        if forceBroadClusters == True:
            ### Find broad clusters with at least 25% of all samples
            originalSamplesDiffering = gsp.SamplesDiffering()
            gsp.setSamplesDiffering(int(SampleNumber * 0.25))
            print 'Number varying samples to identify:', gsp.SamplesDiffering()
        graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species, 'exons', full_PSI_InputFile, mlp,
                                                         exp_threshold=0, rpkm_threshold=0, parameters=gsp)
        if forceBroadClusters == True:
            gsp.setSamplesDiffering(originalSamplesDiffering)
        dPSI_results_fn = graphic_links3[-1][-1]
        dPSI_results_fn = dPSI_results_fn[:-4] + '.txt'
        print "Running block identification for k analyses - Round" + str(AnalysisRound)
        ### Parameters are fixed as they are distinct
        RNASeq_blockIdentification.correlateClusteredGenesParameters(dPSI_results_fn, rho_cutoff=0.4,
                                                                     hits_cutoff=4, hits_to_report=50,
                                                                     ReDefinedClusterBlocks=True, filter=True)
        dPSI_results_fn_block = dPSI_results_fn[:-4] + '-BlockIDs.txt'
        NMFinput, k = NMF_Analysis.FilterFile(dPSI_results_fn, dPSI_results_fn_block, full_PSI_InputFile, AnalysisRound)
    except Exception:
        print 'UNKNOWN ERROR!!!!! Setting k=0'
        print traceback.format_exc()
        k = 0

    print "Round =", AnalysisRound, 'and k =', k
    if AnalysisRound == 1:
        if force_broad_round1:
            k = 2
        else:
            NMFinput, k = NMF_Analysis.FilterFile(dPSI_results_fn, dPSI_results_fn, full_PSI_InputFile, AnalysisRound)  ### Just use the Guide 3 file alone
    if k < 2:
        NMFinput, k = NMF_Analysis.FilterFile(dPSI_results_fn, dPSI_results_fn, full_PSI_InputFile, AnalysisRound)  ### Just use the Guide 3 file alone
        # k = 2
    print "Round =", AnalysisRound, 'and k =', k

    if k > 1:
        ### ADJUST THE k - MUST UPDATE!!!!
        if AnalysisRound == 1:
            if k < 2:
                k = 30
        else:
            if k > 2:
                k = 30
        print "Round =", AnalysisRound, 'and k =', k
        try:
            flag, full_PSI_InputFile = performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k, AnalysisRound, strategy)
        except:
            print traceback.format_exc()
            k += 1
            print 'Adjusted k =', k
            try:
                flag, full_PSI_InputFile = performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k, AnalysisRound, strategy)
                print traceback.format_exc()
            except:
                k = 30
                print 'Adjusted k = 30'
                try:
                    flag, full_PSI_InputFile = performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k, AnalysisRound, strategy)
                    print traceback.format_exc()
                except:
                    flag = True
                    pass  ### will force k-means below
    if k < 2:
        if k == 1:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(AnalysisRound)
                header = []
                header = Kmeans.header_file(dPSI_results_fn_block)
                Kmeans.KmeansAnalysis(dPSI_results_fn_block, header, full_PSI_InputFile, AnalysisRound)
                if AnalysisRound == 1:
                    flag = True
                else:
                    flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                AnalysisRound = True
        else:
            flag = False
    return flag, full_PSI_InputFile, filtered_EventAnnot_dir
# make arrays of the selected features, one person per array, for Kmeans and KNN
num_features = []
for i in xrange(len(snow_num)):
    temp = []
    temp.append(standardized_prog_skills[i])
    # ugly hard-coded way of getting elements out of the list
    temp.append(binary_os[i][0])
    temp.append(binary_os[i][1])
    temp.append(binary_os[i][2])
    temp.append(snow_num[i])
    num_features.append(temp)
num_features = np.array(num_features)
# feature set [age, 3 binary values for operating sys, tiredness of snow]

# Calling k-means
Kmeans.kmeans(num_features)

# Calling KNN
# making a train and a test set; the label is the last value: tiredness of snow.
train = num_features[:50]  # 50 datapoints in the train set
test = num_features[50:]   # the remaining 17 datapoints in the test set
k = 3
Error = KNN.eval(train, test, k)
print "*" * 45
print "K-nearest neighbor"
print "*" * 45
print "k = ", k
print "Error on testset:", Error
# meanList = list()
# for size in range(1, 5):
#     print size
#     musR, assignmentsIndexListR, assignmentsR, vectorListR = Kmeans.kmeans(tokenList, word2vecModel, 1000, size)
#     silhouetteScoreList = Kmeans.silhouetteScore(musR, assignmentsIndexListR, vectorListR)
#     if len(silhouetteScoreList) == 0:
#         break
#     meanList.append(mean(array(silhouetteScoreList)))
# print meanList
#
# clusterNum = [x for x in range(1, 30)][meanList.index(max(meanList))]
# print 'best number of clusters is: ', clusterNum

# Rerun kmeans with the best cluster structure:
musR, assignmentsIndexListR, assignmentsR, vectorListR = Kmeans.kmeans(tokenList, word2vecModel, 1000, 1)

# task 2.1: anomaly detection:
# Use the Local Outlier Factor to decide whether a point is likely to be an anomaly.
lof = LOF.LOF(musR, assignmentsIndexListR, vectorListR, 6)
lofList = list()
for ptIndex in range(len(assignmentsIndexListR)):
    lofPt = lof.calcLOF(ptIndex)
    lofList.append(lofPt)
    print ptIndex, lofPt

lofIdList = list()
anormalySentences = list()
for lofId in range(len(lofList)):
    if lofList[lofId] > 1.0:
        # now try to find inliers
        # print sentenceList[lofId]
#################################################
# kmeans: k-means cluster
# Author   : zouxy
# Date     : 2013-12-25
# HomePage : http://blog.csdn.net/zouxy09
# Email    : [email protected]
#################################################

from numpy import *
import time
import matplotlib.pyplot as plt
import Kmeans

## step 1: load data
print "step 1: load data..."
dataSet = []
fileIn = open('kmean_dataset.txt')
for line in fileIn.readlines():
    lineArr = line.strip().split('\t')
    dataSet.append([float(lineArr[0]), float(lineArr[1])])

## step 2: clustering...
print "step 2: clustering..."
dataSet = mat(dataSet)
k = 4
centroids, clusterAssment = Kmeans.kmeans(dataSet, k)

## step 3: show the result
print "step 3: show the result..."
Kmeans.showCluster(dataSet, k, centroids, clusterAssment)
class grabcut(object):
    # print("step3")
    def __init__(self):
        # print("step5")
        self.cluster = 5
        self.iter = 2
        self.BGD_GMM = None
        self.FGD_GMM = None
        self.KmeansBgd = None
        self.KmeansFgd = None
        self._gamma = 50
        self._lambda = 9 * self._gamma
        self.GT_bgd = 0  # ground truth background
        self.P_fgd = 1   # probable foreground
        self.P_bgd = 2   # probable background
        self.GT_fgd = 3  # ground truth foreground

    # calculating beta for smoothness
    def Beta(self, npimg):
        # print("step6")
        rows, cols = npimg.shape[:2]
        ldiff = np.linalg.norm(npimg[:, 1:] - npimg[:, :-1])
        uldiff = np.linalg.norm(npimg[1:, 1:] - npimg[:-1, :-1])
        udiff = np.linalg.norm(npimg[1:, :] - npimg[:-1, :])
        urdiff = np.linalg.norm(npimg[1:, :-1] - npimg[:-1, 1:])
        beta = np.square(ldiff) + np.square(uldiff) + np.square(udiff) + np.square(urdiff)
        beta = 1 / (2 * beta / (4 * cols * rows - 3 * cols - 3 * rows + 2))
        # print(beta)
        return beta

    # estimating the smoothness term
    def Smoothness(self, npimg, beta, gamma):
        # print("step7")
        rows, cols = npimg.shape[:2]
        self.lweight = np.zeros([rows, cols])
        self.ulweight = np.zeros([rows, cols])
        self.uweight = np.zeros([rows, cols])
        self.urweight = np.zeros([rows, cols])
        for y in range(rows):
            # print("stop1")
            for x in range(cols):
                color = npimg[y, x]
                if x >= 1:
                    diff = color - npimg[y, x - 1]
                    # print(np.exp(-self.beta*(diff*diff).sum()))
                    self.lweight[y, x] = gamma * np.exp(-beta * (diff * diff).sum())
                if x >= 1 and y >= 1:
                    diff = color - npimg[y - 1, x - 1]
                    self.ulweight[y, x] = gamma / np.sqrt(2) * np.exp(-beta * (diff * diff).sum())
                if y >= 1:
                    diff = color - npimg[y - 1, x]
                    self.uweight[y, x] = gamma * np.exp(-beta * (diff * diff).sum())
                if x + 1 < cols and y >= 1:
                    diff = color - npimg[y - 1, x + 1]
                    self.urweight[y, x] = gamma / np.sqrt(2) * np.exp(-beta * (diff * diff).sum())

    # creating GMMs for foreground and background
    def init_with_kmeans(self, npimg, mask):
        print("Creating GMM.....")
        # print("step8")
        self._beta = self.Beta(npimg)
        self.Smoothness(npimg, self._beta, self._gamma)
        bgd = np.where(mask == self.GT_bgd)
        prob_fgd = np.where(mask == self.P_fgd)
        BGDpixels = npimg[bgd]       # (_, 3)
        FGDpixels = npimg[prob_fgd]  # (_, 3)
        self.KmeansBgd = Kmeans(BGDpixels, dim=3, cluster=5, epoches=2)
        self.KmeansFgd = Kmeans(FGDpixels, dim=3, cluster=5, epoches=2)
        bgdlabel = self.KmeansBgd.run()  # (BGDpixel.shape[0], 1)
        # print(bgdlabel)
        fgdlabel = self.KmeansFgd.run()  # (FGDpixel.shape[0], 1)
        # print(fgdlabel)
        self.BGD_GMM = GMM()  # the GMM model for the background
        self.FGD_GMM = GMM()  # the GMM model for the foreground
        for idx, label in enumerate(bgdlabel):
            self.BGD_GMM.add_pixel(BGDpixels[idx], label)
        for idx, label in enumerate(fgdlabel):
            self.FGD_GMM.add_pixel(FGDpixels[idx], label)
        # learning the GMM parameters
        self.BGD_GMM.learning()
        self.FGD_GMM.learning()

    # initial call
    def __call__(self, epoches, npimg, mask):
        print("Starting.....")
        # print("step9")
        self.init_with_kmeans(npimg, mask)
        for epoch in range(epoches):
            self.assign_step(npimg, mask)
            self.learn_step(npimg, mask)
            self.construct_gcgraph(npimg, mask)
            mask = self.estimate_segmentation(mask)
        img = copy.deepcopy(npimg)
        img[np.logical_or(mask == self.P_bgd, mask == self.GT_bgd)] = 0
        return Image.fromarray(img.astype(np.uint8))

    # assigning pixels to GMM components
    def assign_step(self, npimg, mask):
        print("Assigning GMM parameters.....")
        # print("step10")
        rows, cols = npimg.shape[:2]
        clusterid = np.zeros((rows, cols))
        for row in range(rows):
            for col in range(cols):
                pixel = npimg[row, col]
                if mask[row, col] == self.GT_bgd or mask[row, col] == self.P_bgd:  # bgd
                    clusterid[row, col] = self.BGD_GMM.pixel_from_cluster(pixel)
                else:
                    clusterid[row, col] = self.FGD_GMM.pixel_from_cluster(pixel)
        self.clusterid = clusterid.astype(int)

    # learning the GMM parameters
    def learn_step(self, npimg, mask):
        print("Learning parameters......")
        # print("step11")
        for cluster in range(self.cluster):
            bgd_cluster = np.where(
                np.logical_and(self.clusterid == cluster,
                               np.logical_or(mask == self.GT_bgd, mask == self.P_bgd)))
            fgd_cluster = np.where(
                np.logical_and(self.clusterid == cluster,
                               np.logical_or(mask == self.GT_fgd, mask == self.P_fgd)))
            for pixel in npimg[bgd_cluster]:
                self.BGD_GMM.add_pixel(pixel, cluster)
            for pixel in npimg[fgd_cluster]:
                self.FGD_GMM.add_pixel(pixel, cluster)
        self.BGD_GMM.learning()
        self.FGD_GMM.learning()

    # constructing the graph
    def construct_gcgraph(self, npimg, mask):
        print("Graph construction...may take a while.....")
        # print("step12")
        rows, cols = npimg.shape[:2]
        vertex_count = rows * cols
        edge_count = 2 * (4 * vertex_count - 3 * (rows + cols) + 2)
        self.graph = GCGraph(vertex_count, edge_count)
        for row in range(rows):
            for col in range(cols):
                # source: background, sink: foreground
                vertex_index = self.graph.add_vertex()
                color = npimg[row, col]
                if mask[row, col] == self.P_bgd or mask[row, col] == self.P_fgd:  # pred fgd
                    fromSource = -log(self.BGD_GMM.pred_GMM(color))
                    toSink = -log(self.FGD_GMM.pred_GMM(color))
                elif mask[row, col] == self.GT_bgd:
                    fromSource = 0
                    toSink = self._lambda
                else:
                    fromSource = self._lambda
                    toSink = 0
                self.graph.add_term_weights(vertex_index, fromSource, toSink)
                if col - 1 >= 0:
                    w = self.lweight[row, col]
                    self.graph.add_edges(vertex_index, vertex_index - 1, w, w)
                if row - 1 >= 0 and col - 1 >= 0:
                    w = self.ulweight[row, col]
                    self.graph.add_edges(vertex_index, vertex_index - cols - 1, w, w)
                if row - 1 >= 0:
                    w = self.uweight[row, col]
                    self.graph.add_edges(vertex_index, vertex_index - cols, w, w)
                if col + 1 < cols and row - 1 >= 0:
                    w = self.urweight[row, col]
                    self.graph.add_edges(vertex_index, vertex_index - cols + 1, w, w)

    # segmentation estimation: min cut of E(α, k, θ, z)
    def estimate_segmentation(self, mask):
        print("Estimation.......")
        # print("step13")
        rows, cols = mask.shape
        self.graph.max_flow()
        for row in range(rows):
            for col in range(cols):
                if mask[row, col] == self.P_fgd or mask[row, col] == self.P_bgd:
                    if self.graph.insource_segment(row * cols + col):  # vertex index
                        mask[row, col] = self.P_fgd
                    else:
                        mask[row, col] = self.P_bgd
        # print("working")
        # self.KmeansBgd.plot()
        # self.KmeansFgd.plot()
        return mask
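# Hedged usage sketch for the grabcut class above. The mask encoding follows
# __init__ (GT_bgd=0, P_fgd=1, P_bgd=2, GT_fgd=3); the file names and the
# rectangle marking the probable foreground are placeholders.
import numpy as np
from PIL import Image

npimg = np.asarray(Image.open("input.png").convert("RGB"), dtype=np.float64)
mask = np.zeros(npimg.shape[:2], dtype=np.uint8)  # GT_bgd everywhere
mask[50:200, 80:240] = 1                          # P_fgd inside a user-drawn box
gc = grabcut()
segmented = gc(epoches=2, npimg=npimg, mask=mask)  # invokes __call__
segmented.save("segmented.png")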
        time_start = time.perf_counter()
        x, y, labels = SC.guisc(k, n, 0)
        elapsed = time.perf_counter() - time_start
        nmi = normalized_mutual_info_score(y, labels)
        nmisc[i] += nmi
        print(str(n) + " | " + str(elapsed) + " | " + str(nmi))
        sssc[i] += elapsed + i * 2

print("KMeans computation")
print(" n | time | NMI")
for j in range(0, repeat):
    for i in range(0, till):
        n = (i + 1) * step
        k = 10
        time_start = time.perf_counter()
        x, y, labels = Kmeans.guikmeans(k, n, 0)
        elapsed = time.perf_counter() - time_start
        nmi = normalized_mutual_info_score(y, labels)
        print(str(n) + " | " + str(elapsed) + " | " + str(nmi))
        nmikmeans[i] += nmi
        km[i] += elapsed + i / 2

print("KMeans++ computation")
print(" n | time | NMI")
for j in range(0, repeat):
    for i in range(0, till):
        n = (i + 1) * step
        k = 10
        time_start = time.perf_counter()
        x, y, labels = Kmeans.guikmeansplusplus(k, n, 0)
        elapsed = time.perf_counter() - time_start
print(X_train.toarray().shape)
print(Y_train.shape)
print(X_test.toarray().shape)
print(Y_test.shape)

# SVM model for classification
clustering_with_linear_SVM_sklearn(X_train, X_test, Y_train, Y_test)

############################# Kmean ######################################
with open('./data_set/words_idfs.txt') as f:
    vocab_size = len(f.read().splitlines())

num_cluster = 20
Kmean = Kmeans(num_clusters=num_cluster, num_word_vocab=vocab_size)
print(Kmean._num_clusters)
print(Kmean._num_word_vocab)

# Load data
Kmean.load_data('./data_set/train_tf_idf.txt')

max_purity = -1
max_NMI = -1
choose_seed = 0

# Run and choose the best seed
for i in range(10):
    Kmean.run(seed_value=i + 1, criterion='centroid', threshold=0)
    print(Kmean.compute_purity())
def run_thinkmeans(self):
    self.centroids, self.Xto, self.dist = km.kmeans(self.X, self.init_centres,
                                                    metric=self.metric, verbose=2,
                                                    restrict_comp_to_close=True)