def _run_interface(self, runtime):
    fname = self.inputs.volume
    # load data, read lines 8 ~ penultimate
    datafile = open(fname, 'r')  # text mode so float() can parse the fields on Python 3
    data = [i.strip().split() for i in datafile.readlines()]
    stringmatrix = data[8:-1]
    datafile.close()
    if self.inputs.hemi == 'lh':
        chosenvertices = lhvertices
    if self.inputs.hemi == 'rh':
        chosenvertices = rhvertices
    corrmatrix = np.zeros((len(chosenvertices), len(chosenvertices)))
    for x, vertex in enumerate(chosenvertices):
        for i in range(len(chosenvertices)):  # xrange is Python 2 only
            corrmatrix[x][i] = abs(float(stringmatrix[vertex][i]))
    if self.inputs.cluster_type == 'spectral':
        labels = spectral(corrmatrix, n_clusters=self.inputs.n_clusters, mode='arpack')
    if self.inputs.cluster_type == 'hiercluster':
        labels = Ward(n_clusters=self.inputs.n_clusters).fit_predict(corrmatrix)
    if self.inputs.cluster_type == 'kmeans':
        labels = km(n_clusters=self.inputs.n_clusters).fit_predict(corrmatrix)
    if self.inputs.cluster_type == 'dbscan':
        labels = DBSCAN(eps=np.average(corrmatrix) + np.std(corrmatrix)).fit_predict(corrmatrix)
    sxfmout = self.inputs.sxfmout
    img = nb.load(sxfmout)
    outarray = -np.ones(shape=img.shape[0])
    for j, cluster in enumerate(labels):
        outarray[chosenvertices[j]] = cluster + 1
    new_img = nb.Nifti1Image(outarray, img.affine, img.header)  # get_affine()/get_header() were removed from nibabel
    _, base, _ = split_filename(fname)
    nb.save(new_img, os.path.abspath(base + '_clustered.nii'))
    return runtime
def _run_interface(self, runtime):
    # load data
    data = nb.load(self.inputs.in_File).get_data()
    corrmatrix = np.squeeze(data)
    if self.inputs.cluster_type == 'spectral':
        # threshold at 0 (spectral clustering uses non-negative values)
        positivecorrs = np.where(corrmatrix > 0, corrmatrix, 0)
        # spectral expects dtype=double values
        newmatrix = np.asarray(positivecorrs, dtype=np.double)
        labels = spectral(newmatrix,
                          n_clusters=self.inputs.n_clusters,
                          eigen_solver='arpack',
                          assign_labels='discretize')
    if self.inputs.cluster_type == 'hiercluster':
        labels = Ward(n_clusters=self.inputs.n_clusters).fit_predict(corrmatrix)
    if self.inputs.cluster_type == 'kmeans':
        labels = km(n_clusters=self.inputs.n_clusters).fit_predict(corrmatrix)
    if self.inputs.cluster_type == 'dbscan':
        labels = DBSCAN(eps=self.inputs.epsilon).fit_predict(corrmatrix)
    # +1 because cluster labels start at 0
    new_img = nb.Nifti1Image(labels + 1, None)
    _, base, _ = split_filename(self.inputs.in_File)
    nb.save(new_img,
            os.path.abspath(base + '_' + str(self.inputs.n_clusters) + '_' +
                            self.inputs.cluster_type + '_' + self.inputs.hemi + '.nii'))
    return runtime
def run(self, src):
    x = self.table(src)
    k = self.settings.k
    if k == "auto":
        raise RuntimeError("Have to implement auto parser")
    cl = km(n_clusters=k)
    y = cl.fit_predict(x)
    clusters = {}
    for x_i, y_i in zip(x, y):
        clusters[y_i] = clusters.get(y_i, []) + [x_i]
    return clusters
def retrain(clusters):
    with open("train_data.pickle", "rb") as pickle_file:
        arr = pickle.load(pickle_file)
    clt = km(n_clusters=clusters)
    clt.fit(arr)
    with open("clt.pickle", "wb") as pickle_file:
        pickle.dump(clt, pickle_file)
def __nk(self, nk=None):
    if self.Xm is None:
        self.nk = None
    elif nk is None:
        # self.nk = np.random.randint(0, self.k, self.n)
        self.nk = km(n_clusters=self.k).fit_predict(self.Xm)
    else:
        try:
            self.nk = nk
            self.nk.shape = (self.n, )
        except ValueError:
            print("nk must be of the same length as Xm")
    return self.nk
def cluster(nc):
    global list_u
    global dict_u
    clist = []
    for i in range(len(list_u)):
        clist.append(dict_u[list_u[i]])
    # Cluster the universities by their subject profiles into nc clusters; max 300 iterations
    result = km(n_clusters=nc, max_iter=300, n_init=40,
                init='k-means++').fit_predict(np.array(clist))
    ny = [[] for i in range(nc)]  # initialize the per-cluster lists
    for i in range(top_k):
        ny[result[i]].append(list_u[i])  # append each university name to its cluster's list
    for i in range(nc):  # print the university names in each cluster
        print(ny[i])
def genlist():
    print("fetching candidate replacements for " + self.pred)
    cdatasource = eval(self.classname).pred_candidates
    if not cdatasource.has(self.lnoun):
        print("can't find typical preds for ", self.noun)
        return None
    cands_max = cdatasource.get(self.lnoun).most_common()
    cands = set()
    syns = self.get_syns()
    syns.sort(key=lambda x: self.modifies_noun(x), reverse=True)
    strong_syns = list()
    mat = list()
    good = list()
    for cand in cands_max:
        c = cand[0]
        if c in vecs and abst.has(c) and abst.get(c) > abst.get(self.pred):
            if c != self.pred and c in syns:
                strong_syns.append(c)
            mat.append(vecs.get(c))
            good.append(cand)
    if len(strong_syns) > 0:
        self.strong_syns = strong_syns
    mat = np.array(mat)
    if len(cands_max) > 100 and False:  # clustering branch currently disabled
        k = km(n_clusters=round(len(cands_max) / 50), random_state=0).fit(mat)
        hotclust = k.predict(vecs.get(self.pred).reshape(1, -1))
        cands = [cand for cand in good
                 if k.predict(vecs.get(cand[0]).reshape(1, -1)) == hotclust]
        cands.sort(key=lambda x: x[1])
        # cands = squeeze(cands, 15)
    else:
        cands = good
    cands = [cand[0] for cand in cands[:50]]
    if len(strong_syns) > 1:
        cands = set(cands[:5]).union(set(strong_syns))
        cands.discard(self.pred)
        ret = list(cands)
    else:
        cands = set(cands[:15]).union(set(syns[:5]))
        cands.discard(self.pred)
        ret = squeeze(list(cands), 5)
    return ret
def input_generator(file_names, n_clust, n_sub, cluster_method, sub_length):
    '''
    Creates a matrix from the gaze data. Gaze data from each file is clustered
    into n_clust points using either HR (hierarchical) or KM (k-means) clustering.
    These n_clust points are flattened to 2 * n_clust values, which are then
    divided into n_sub subsequences of length sub_length. The columns of the
    matrix correspond to these gaze subsequences, each flattened into
    2 * sub_length values.

    :param file_names: Gaze file names as a list of strings.
    :param n_clust: Number of clusters as an integer.
    :param n_sub: Number of subsequences per image as an integer.
    :param cluster_method: Clustering method as a string; either 'HR' or 'KM'.
    :param sub_length: Integer length of the subsequence.
    :return: Input matrix with all the clustered gaze points organized into subsequences.
    '''
    mat = []
    for image in file_names:
        gaze = []
        with open(args.gaze_path + image, 'r') as f:
            reader = csv.reader(f, delimiter=',')
            count = 0
            for row in reader:
                gaze.append((int(row[0]), int(row[1]), count * 110))
                count += 1
        gaze = np.array(gaze)
        if cluster_method == 'HR':
            cluster_labels = hc(n_clusters=n_clust).fit_predict(gaze)
        elif cluster_method == 'KM':
            cluster_labels = km(n_clusters=n_clust).fit_predict(gaze)
        else:
            raise ValueError('Choose between "HR" or "KM" as the clustering method')
        result = {i: gaze[np.where(cluster_labels == i)] for i in range(n_clust)}
        centres = []
        for cluster in result:
            cluster_points = np.array(result[cluster])
            cluster_centre = np.mean(cluster_points, axis=0)
            centres.append(int(cluster_centre[0]))
            centres.append(int(cluster_centre[1]))
        # for i in range(0, len(centres) - (2 * sub_length), 2 * ((n_clust - sub_length) / n_sub)):
        for i in range(0, len(centres), (2 * n_clust) // n_sub):  # integer step for range()
            mat.append(centres[i:i + (2 * sub_length)])
    return np.transpose(np.array(mat))
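A minimal, hypothetical driver for input_generator above. The gaze directory, file names, and parameter values are all invented for illustration; it only assumes that hc and km are the sklearn AgglomerativeClustering/KMeans aliases used in the function and that each CSV row holds an integer x,y pair. The parameters are chosen so every slice has length 2 * sub_length.

# Illustrative only: args.gaze_path and the CSV file names are assumptions.
import argparse
import csv
import numpy as np
from sklearn.cluster import AgglomerativeClustering as hc
from sklearn.cluster import KMeans as km

args = argparse.Namespace(gaze_path='gaze_data/')   # assumed directory of gaze CSVs
file_names = ['image_01.csv', 'image_02.csv']       # assumed file names

X = input_generator(file_names, n_clust=20, n_sub=5,
                    cluster_method='KM', sub_length=4)
print(X.shape)  # one column per subsequence, 2 * sub_length rows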
def fit(self, original_image, segmented_image):
    self.original_image = original_image
    self.segmented_image = segmented_image
    self.groups = np.unique(self.segmented_image[self.segmented_image != -1])
    self.flat_segments = np.reshape(self.segmented_image, (-1))
    self.flat_image = np.reshape(self.original_image, (-1, 3))

    # Extracting objects from image
    objects = []
    for group in self.groups:
        obj = self.flat_image[self.flat_segments == group]
        objects.append(obj)

    # Constructing features (max, min, mean intensity) from objects
    features = np.zeros((len(objects), 3))
    for i, obj in enumerate(objects):
        features[i] = [np.max(obj), np.min(obj), np.mean(obj)]

    self.group_counts = np.array(self.n_clusters)
    kmeans = km(n_clusters=self.n_clusters, random_state=0).fit(features)
    labels = kmeans.labels_

    labelled_image = np.zeros(self.flat_segments.shape[0])
    for i, group in enumerate(self.groups):
        labelled_image[self.flat_segments == group] = labels[i]
    labelled_image = np.reshape(labelled_image, self.segmented_image.shape)

    if self.visualise:
        cv.imshow("Clustered image", labelled_image)
        cv.waitKey(0)
        cv.destroyWindow("Clustered image")
    return labelled_image
def kmeans(feature_matrix: pd.DataFrame, k: int = 2, feature_columns: list = []):
    """Wrapper around KMeans.

    Args:
        feature_matrix (pd.DataFrame): preprocessed feature matrix.
        k (int, optional): number of clusters. Defaults to 2.
        feature_columns (list, optional): columns of the features used for clustering. Defaults to [].

    Returns:
        sklearn.cluster.KMeans: the fitted sklearn KMeans clustering object.
    """
    if not feature_columns:
        # use all columns other than uid as features
        feature_columns = feature_matrix.columns
    feature_matrix = feature_matrix[feature_columns]
    # convert to an np array for input
    x = np.array(feature_matrix)
    # run KMeans clustering
    km_model = km(n_clusters=k, random_state=0).fit(x)
    return km_model
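A minimal usage sketch for the kmeans wrapper above; the column names and values are invented for illustration, with 'uid' standing in for the identifier column excluded via feature_columns.

# Hypothetical feature matrix for demonstration purposes only.
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans as km

feature_matrix = pd.DataFrame({
    'uid': [1, 2, 3, 4],
    'visits': [3, 25, 4, 30],
    'spend': [10.0, 210.5, 15.2, 199.9],
})

model = kmeans(feature_matrix, k=2, feature_columns=['visits', 'spend'])
print(model.labels_)           # cluster assignment per row
print(model.cluster_centers_)  # one centroid per cluster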
def silhoutte(name):
    df = pd.read_csv("../processing/" + name + "_dist.csv.virus")
    xs = list(df['x'])
    ys = list(df['y'])
    xs = [x - min(xs) for x in xs]
    ys = [y - min(ys) for y in ys]
    X = np.array(list(zip(xs, ys)))  # np.matrix(zip(...)) fails on Python 3; zip is an iterator
    stat = open('kstatistics.csv', 'w')
    ncluster = []
    distortion_set = []
    silh = []
    for nc in range(2, 13):
        kmeans = km(n_clusters=nc).fit(X)
        cluster_labels = kmeans.fit_predict(X)
        silhouette_avg = silhouette_score(X, cluster_labels)
        distortion = (sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'),
                                 axis=1)) / X.shape[0])
        ncluster.append(nc)
        distortion_set.append(distortion)
        silh.append(silhouette_avg)
        # print kmeans.cluster_centers_, sum(np.min(cdist(X,
        # kmeans.cluster_centers_, 'euclidean'), axis=1))/13
        print(nc, distortion, silhouette_avg)
        print(nc, distortion, silhouette_avg, file=stat)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(ncluster, distortion_set, '-o')
    ax.set_xlabel('No. of clusters')
    ax.set_ylabel('Distortion')
    ax.set_title("Selecting K with Elbow method", fontsize=10)
    plt.show(block=False)
    '''
    fig1 = plt.figure()
        a = exp_X_cent_dist / tf.reduce_sum(exp_X_cent_dist, axis=2,
                                            keep_dims=True)  # keep_dims is the TF 1.x spelling (keepdims in TF 2)
        print(a)
        return a

    def fit(self, X_train):
        c, l = self.sess.run(self.network, {self.X: X_train})
        return c, l


from sklearn.cluster import KMeans as km
from sklearn.datasets import make_blobs  # needed for the demo below

if __name__ == "__main__":
    nb_samples = 10000
    E = 2
    nb_clusters = 2
    X, y = make_blobs(n_samples=nb_samples, centers=nb_clusters, n_features=E)
    X_ = X[np.newaxis, :]
    y = y[np.newaxis, :]
    print(y)

    kmean = KMeans(nb_clusters)  # custom TensorFlow KMeans defined above
    kmean.init()
    centroids, labels = kmean.fit(X_)
    print(centroids)
    print(np.sum(labels))
    print(y)

    # reference result from scikit-learn
    kmeans = km(n_clusters=2, random_state=0).fit(X)
    print(kmeans.cluster_centers_)
# normalization
def fun(i):
    x = ((i - i.min()) / (i.max() - i.min()))
    return (x)


df_norm = fun(df.iloc[:, 1:])
df_norm.describe()

# In[157]:

# elbow curve
wss = []
k = list(range(10, 100, 5))
for i in k:
    kmeans = km(n_clusters=i)
    kmeans.fit(df_norm)
    wss.append(kmeans.inertia_)
wss

# In[158]:

plt.plot(k, wss, 'ro-')
plt.xlabel('number of clusters')
plt.ylabel('total within ss')

# In[159]:

model = km(n_clusters=40)
model.fit(df_norm)
model.labels_
print('Computing using: ' + method + ' breed method')
start = datetime.now()
GENERATION = copy.deepcopy(init_pop)
for i in range(gens):
    print('Generation no: ' + str(i + 1))
    GENERATION.select()
    survivors.append(GENERATION.population)
    top_scores.append((min(GENERATION.score())))
    fittest.append(GENERATION.population[GENERATION.sorted_scores[0]])
    GENERATION.mutate(0.001)
    GENERATION.breed(method=method)

GENERATION.population = fittest
GENERATION.sorted_scores = np.argsort(GENERATION.score())
fit_rank = GENERATION.sorted_scores
alpha = fittest[fit_rank[0]]
ga_means = km(n_clusters, alpha, 1).fit(X)  # positional args: n_clusters, init=alpha, n_init=1
cluster_list.append(ga_means)
end = datetime.now()
comp_duration.append(end - start)
Fittest.append(alpha)
# Survivors.append(survivors)

for i in range(len(cluster_list)):
    Distances.append(galuster.sum_distances(cluster_list[i], X))
    plt.figure(i)
    galuster.lolipop_plot(cluster_list[i], X)

# Iterate GA operations over number of generations
# ga_start = datetime.now()
#
def find_spread(X, cluster_size):
    X_kmeans = km(cluster_size, random_state=0).fit(X)
    X_kmeans_list = {i: X[np.where(X_kmeans.labels_ == i)]
                     for i in range(cluster_size)}
    X_spread = list(np.diag(np.diag(np.cov(X_kmeans_list[i].T)))
                    for i in range(cluster_size))
    return X_spread
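A small, hypothetical driver for find_spread, assuming X is an (n_samples, n_features) array; it shows that each entry of the returned list is a diagonal per-cluster covariance matrix, one row/column per feature. Random blobs stand in for real data.

# Illustrative only: make_blobs generates synthetic stand-in data.
import numpy as np
from sklearn.cluster import KMeans as km
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=4, n_features=2, random_state=0)
spreads = find_spread(X, cluster_size=4)
for i, spread in enumerate(spreads):
    print(i, spread.shape)   # (2, 2) diagonal covariance per cluster
    print(np.diag(spread))   # per-feature variance within the cluster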
ga_start = datetime.now()
for i in range(gens):
    print('Generation no: ' + str(i + 1))
    pop.select()
    # survivors.append(pop.population)
    top_scores.append((min(pop.score())))
    fittest.append(pop.population[pop.sorted_scores[0]])
    pop.mutate(0.001)
    pop.breed(method='hybrid')

# Cluster the data using the fittest seed
init_pop.population = fittest
init_pop.sorted_scores = np.argsort(init_pop.score())
fit_rank = init_pop.sorted_scores
ga_means = km(n_clusters, fittest[fit_rank[0]], 1).fit(X)
ga_end = datetime.now()
comp_duration.append(ga_end - ga_start)
cluster_list.append(ga_means)
sum_of_dist.append(galuster.sum_distances(ga_means, X))

# Plot GA cluster membership
plt.figure(0)
galuster.lolipop_plot(ga_means, X)

for i in range(no_kmeans):
    print('Starting kmeans algorithm no: ' + str(i + 1))
    start = datetime.now()  # Record starting time
    kmeans = km(n_clusters, n_init=n_seed).fit(X)  # compute kmeans
    return clusters


def organizeEnumeratedDictionary(dictionary):
    newDict = {}
    for i in range(0, len(dictionary)):
        newDict[i] = dictionary[i]
    return newDict


def emojiCodeToEmoji(clusterDict, emojiDict):
    for i in clusterDict:
        for x in range(0, len(clusterDict[i])):
            clusterDict[i][x] = emojiDict[clusterDict[i][x]]
    return clusterDict


# dictionary to translate emojis over
tsvToEmojiDict = arrayToDict(getFile('emoji_lookup.tsv'))
# pandas DataFrame holding the emoji embedding data
emojiDataFrame = pd.read_csv(StringIO(codecs.open('emojis.txt', 'r', encoding='utf8',
                                                  errors='ignore').read()), sep=r'\s+')
# gets dimensions in an array suitable for clustering
# (DataFrame.as_matrix() was removed in pandas 1.0; use to_numpy() instead)
dimensions = np.array(emojiDataFrame[emojiDataFrame.columns[1:]].to_numpy())

# auto random state
cluster = km(n_clusters=100, max_iter=10000000)  # k-means clustering
# cluster = ap(max_iter=1000000)                 # Affinity propagation clustering
# cluster = b(n_clusters=200)                    # Birch clustering
cluster.fit(dimensions)
codedCluster = addEmojiCode(dimensions, cluster, getFile('emoji_lookup.tsv'))
organizedClusters = organizeEnumeratedDictionary(codedCluster)
emojiClusters = emojiCodeToEmoji(organizedClusters, tsvToEmojiDict)
for z in emojiClusters:
    print("Clusters" + str(z))
    print(emojiClusters[z])
    print("\n")
""" import pandas as pd from sklearn.cluster import KMeans as km import numpy import matplotlib.pyplot as plt from sklearn.metrics import silhouette_score as ss data = pd.read_csv("sdm.csv", sep=";", header=None) data.columns = ["depth", "param_1", "param_2"] data_pr = data[["param_1", "param_2"]] plt.scatter(data_pr.param_1, data_pr.param_2) kmeans = km(init='k-means++', n_clusters=3, random_state=0).fit(data_pr.as_matrix()) # data_pr['labels'] =pd.Series(kmeans.labels_) # data_pr.plot.scatter(x='b',y='c',c='labels', colormap='viridis') data_pr1 = data_pr.copy() scores = [] for k in range(2, 10): kmeans = km(init='k-means++', n_clusters=k, random_state=0).fit(data_pr.as_matrix()) data_pr1['labels'] = pd.Series(kmeans.labels_) print(len(kmeans.labels_)) data_pr1.plot.scatter(x='b', y='c', c='labels', colormap='viridis') scores.append(ss(data_pr1[['b', 'c']], labels=data_pr1['labels'])) print(data_pr1) n = [i for i in range(2, 10)] plt.figure()
def run(self, clusters=3):
    self.y = km(n_clusters=clusters).fit_predict(self.X)
data.head()

# plotting values
f1 = data['duration'].values
f2 = data['imdb_score'].values
X = np.array(list(zip(f1, f2)))
print("This is X. The zipped array")
print(X)

# Finding optimal k
wcss = []
for i in range(1, 11):
    kmeans = km(n_clusters=i, init='k-means++', max_iter=300, n_init=10,
                random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

# Plotting the results onto a line graph, allowing us to observe 'The elbow'
plt.plot(range(1, 11), wcss)
plt.title('The elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')  # within-cluster sum of squares
plt.show()

k = 4
kmeans = km(n_clusters=k)
KMmodel = kmeans.fit(X)
for line in lines:
    if line.strip() == "":
        break
    name = line.split(":")[0].strip()
    location = line.split(":")[1].strip().split(",")
    top_yelp.append([name, float(location[0].strip()), float(location[1].strip())])

top_yelp_lat_long = []
for row in top_yelp:
    top_yelp_lat_long.append([row[1], row[2]])

t = 21
lat_long = []
for row in all_data[t]:
    lat_long.append([row[3], row[4]])

for k in [240]:
    kmeans = km(k, max_iter=1000, n_init=50, init="k-means++")
    kmeans.fit(lat_long)
    pred = kmeans.predict(top_yelp_lat_long)
    pred_cluster_centers = [kmeans.cluster_centers_[i] for i in pred]
    error = [vincenty(top_yelp_lat_long[i], pred_cluster_centers[i]).miles
             for i in range(len(top_yelp_lat_long))]
    print("For t = {t_val}:".format(t_val=t))
    print("Min Error\t\tMax Error\tAvg Error")
    print("{min_e}\t{max_e}\t{avg_e}".format(min_e=min(error), max_e=max(error),
                                             avg_e=sum(error) / len(error)))
def runROIDetector(p, k, min_dist, threshold, gammae, nue):
    k = k.astype(np.int64)
    p = p.astype(np.int64)
    threshold = threshold.astype(np.int64)
    resize = .3
    filesHtrain = []
    filesUtrain = []
    filesUtest = []
    filesHtest = []
    filesUCV = []
    filesHCV = []
    unhealthyTestPatient = [f for f in listdir('testROI/unhealthy')
                            if isdir(join('testROI/unhealthy', f))]
    healthyTestPatient = [f for f in listdir('testROI/healthy')
                          if isdir(join('testROI/healthy', f))]
    unhealthyTrainPatient = [f for f in listdir('trainROI/unhealthy')
                             if isdir(join('trainROI/unhealthy', f))]
    healthyTrainPatient = [f for f in listdir('trainROI/healthy')
                           if isdir(join('trainROI/healthy', f))]
    unhealthyCVPatient = [f for f in listdir('trainROI/unlabel')
                          if isdir(join('trainROI/unlabel', f))]
    healthyCVPatient = healthyTrainPatient[0:8]
    healthyTrainPatient = healthyTrainPatient[8:len(healthyTrainPatient) - 1]

    for i in healthyTrainPatient:
        dirs = listdir('trainROI/healthy/' + i)
        for j in dirs:
            if "._" not in j:
                filesHtrain.append('trainROI/healthy/' + i + '/' + j)
    for i in unhealthyTrainPatient:
        dirs = listdir('trainROI/unhealthy/' + i)
        for j in dirs:
            if "._" not in j:
                filesUtrain.append('trainROI/unhealthy/' + i + '/' + j)
    for i in unhealthyCVPatient:
        dirs = listdir('trainROI/unlabel/' + i)
        for j in dirs:
            if "._" not in j:
                filesUCV.append('trainROI/unlabel/' + i + '/' + j)
    for i in healthyTestPatient:
        dirs = listdir('testROI/healthy/' + i)
        for j in dirs:
            if "._" not in j:
                filesHtest.append('testROI/healthy/' + i + '/' + j)
    for i in unhealthyTestPatient:
        dirs = listdir('testROI/unhealthy/' + i)
        for j in dirs:
            if "._" not in j:
                filesUtest.append('testROI/unhealthy/' + i + '/' + j)

    init_words = []
    words = []
    final_words = []
    healthyPatientDict = {}

    # Part One: ---------- Extract initial words from training positive set ----------
    for image in filesUtrain:
        # read image from Unhealthy class (these are the seed examples)
        img = imread(image)
        [h, w, d] = img.shape
        img = rescale(img, resize)
        [h, w, d] = img.shape
        # Trim the image to a bucketable size
        F = img[0:int(p * math.floor(h / p)), 0:int(p * math.floor(w / p)), :]
        # Extract HOG features
        fd = hog(F[:, :, 1], orientations=8, pixels_per_cell=(p, p),
                 cells_per_block=(1, 1))
        # Reorganize features so that each index matches a sample
        # (integer division keeps the reshape target an int on Python 3)
        features = np.reshape(fd, ((F.shape[1] * F.shape[0]) // (p * p), 8))
        # Cluster the features
        KM = km(n_clusters=k).fit(features)
        # Add the cluster centers to the initial dictionary
        init_words.extend(KM.cluster_centers_)
    init_words = np.asarray(init_words)

    occurencVec = {}

    # Part Two: ---------------------- Filter out "bad words" ------------------------
    for patient in healthyTrainPatient:
        healthyPatientDict[patient] = {}
        for idx in range(len(init_words)):
            temp = healthyPatientDict[patient]
            temp[idx] = -1
            occurencVec[idx] = 0
            healthyPatientDict[patient] = temp

    for patient in healthyTrainPatient:
        imageList = listdir('trainROI/healthy/' + patient)
        HOGvec = []
        count = 0
        for image in imageList:
            if "._" not in image:
                # read image from Healthy class
                img = imread('trainROI/healthy/' + patient + '/' + image)
                [h, w, d] = img.shape
                img = rescale(img, resize, anti_aliasing=True)
                [h, w, d] = img.shape
                # Trim the image to a bucketable size
                F = img[0:int(p * math.floor(h / p)), 0:int(p * math.floor(w / p)), :]
                # Extract HOG features
                fd = hog(F[:, :, 1], orientations=8, pixels_per_cell=(p, p),
                         cells_per_block=(1, 1))
                # Reorganize features so that each index matches a sample
                features = np.reshape(fd, ((F.shape[1] * F.shape[0]) // (p * p), 8))
                if count == 0:
                    HOGvec = features
                else:
                    HOGvec = np.vstack((HOGvec, features))
                count = count + 1

        # For each of the words in the initial dictionary, calculate the
        # L2-distance to the features of the current image. If the distance
        # is too small too many times, remove the word from the dictionary.
        initWordsIdx = 0
        for rows in init_words:
            num_of_matches = 0
            iters = 0
            for n in HOGvec:
                iters = iters + 1
                r = np.linalg.norm(rows - n)
                if r < min_dist:
                    num_of_matches = num_of_matches + 1
                    temp = occurencVec[initWordsIdx]
                    occurencVec[initWordsIdx] = temp + 1
                iters = iters + 1
            final_words.append(rows)
            temp = healthyPatientDict[patient]
            temp[initWordsIdx] = num_of_matches
            initWordsIdx = initWordsIdx + 1

    averages = []
    for count in range(len(healthyPatientDict[healthyTrainPatient[0]])):
        numerator = 0
        for patient in healthyTrainPatient:
            temp = healthyPatientDict[patient]
            numerator = numerator + temp[count]
        average = numerator / len(healthyPatientDict)
        averages.append(average)

    idxs = np.where(np.asarray(averages) < threshold)
    idxs = idxs[0]

    featureMatrix = np.zeros(shape=(len(healthyPatientDict), len(idxs)))
    i = 0
    for patient in healthyPatientDict:
        j = 0
        for idx in idxs:
            temp = healthyPatientDict[patient]
            featureMatrix[i, j] = temp[idx]
            j = j + 1
        i = i + 1

    validationMatrix = np.zeros(shape=(len(unhealthyCVPatient) + len(healthyCVPatient),
                                       len(idxs)))
    i = 0
    for patient in healthyCVPatient:
        imageList = listdir('trainROI/healthy/' + patient)
        HOGvec = []
        count = 0
        j = 0
        for image in imageList:
            if "._" not in image:
                # read image from Healthy class
                img = imread('trainROI/healthy/' + patient + '/' + image)
                [h, w, d] = img.shape
                img = rescale(img, resize, anti_aliasing=True)
                [h, w, d] = img.shape
                # Trim the image to a bucketable size
                F = img[0:int(p * math.floor(h / p)), 0:int(p * math.floor(w / p)), :]
                # Extract HOG features
                fd = hog(F[:, :, 1], orientations=8, pixels_per_cell=(p, p),
                         cells_per_block=(1, 1))
                # Reorganize features so that each index matches a sample
                features = np.reshape(fd, ((F.shape[1] * F.shape[0]) // (p * p), 8))
                if count == 0:
                    HOGvec = features
                else:
                    HOGvec = np.vstack((HOGvec, features))
                count = count + 1

        # For each of the retained words, count how many HOG features of this
        # patient fall within min_dist of the word.
        initWordsIdx = 0
        for num in range(len(idxs)):
            num_of_matches = 0
            iters = 0
            for n in HOGvec:
                iters = iters + 1
                r = np.linalg.norm(init_words[num] - n)
                if r < min_dist:
                    num_of_matches = num_of_matches + 1
                iters = iters + 1
            validationMatrix[i, j] = num_of_matches
            j = j + 1
        i = i + 1

    for patient in unhealthyCVPatient:
        imageList = listdir('trainROI/unlabel/' + patient)
        HOGvec = []
        count = 0
        j = 0
        for image in imageList:
            if "._" not in image:
                # read image from the unlabelled class
                img = imread('trainROI/unlabel/' + patient + '/' + image)
                [h, w, d] = img.shape
                img = rescale(img, resize, anti_aliasing=True)
                [h, w, d] = img.shape
                # Trim the image to a bucketable size
                F = img[0:int(p * math.floor(h / p)), 0:int(p * math.floor(w / p)), :]
                # Extract HOG features
                fd = hog(F[:, :, 1], orientations=8, pixels_per_cell=(p, p),
                         cells_per_block=(1, 1))
                # Reorganize features so that each index matches a sample
                features = np.reshape(fd, ((F.shape[1] * F.shape[0]) // (p * p), 8))
                if count == 0:
                    HOGvec = features
                else:
                    HOGvec = np.vstack((HOGvec, features))
                count = count + 1

        # For each of the retained words, count how many HOG features of this
        # patient fall within min_dist of the word.
        initWordsIdx = 0
        for num in range(len(idxs)):
            num_of_matches = 0
            iters = 0
            for n in HOGvec:
                iters = iters + 1
                r = np.linalg.norm(init_words[num] - n)
                if r < min_dist:
                    num_of_matches = num_of_matches + 1
                iters = iters + 1
            validationMatrix[i, j] = num_of_matches
            j = j + 1
        i = i + 1

    normalizedXtrain = normalize(featureMatrix)
    normalizedXCV = normalize(validationMatrix)

    clf = svm.OneClassSVM(nu=nue, kernel="rbf", gamma=gammae)
    clf.fit(normalizedXtrain)

    y_CV = np.ones(shape=(len(unhealthyCVPatient) + len(healthyCVPatient), 1))
    y_CV[len(healthyCVPatient):len(unhealthyCVPatient) + len(healthyCVPatient)] = -1

    y_pred_train = clf.predict(normalizedXtrain)
    y_pred_CV = clf.predict(normalizedXCV)
    return [y_pred_train, y_pred_CV, y_CV]
</script>
<script async defer
    src="https://maps.googleapis.com/maps/api/js?key=AIzaSyDu4tAkj9-8cwEPTamK812YSbPnZ6xq9D8&signed_in=true&libraries=visualization&callback=initMap">
</script>
</body>
</html>
"""

all_data = pickle.load(open("../data/all_data_new.p", "rb"))

t = 0
lat_long = []
for row in all_data[t]:
    lat_long.append([row[3], row[4]])

kmeans = km(190, max_iter=1000, n_init=50, init='k-means++')
kmeans.fit(lat_long)

t = 0
lat_lon_values = ""
for c in kmeans.cluster_centers_:
    lat_lon_values += "new google.maps.LatLng(" + str(c[0]) + ", " + str(c[1]) + "),\n"
lat_lon_values = lat_lon_values[:-2]

file_ = open('../outputs/Google_Heatmap_{t_val}.html'.format(t_val=t), 'w')
file_.write(html_front + lat_lon_values + html_back)
file_.close()
print("How do I look?")
print(features.head())

# Normalizing the data and clustering
cols_to_norm = ['Duration', 'distance_start_stop', 'day_of_week', 'hours']
features[cols_to_norm] = features[cols_to_norm].apply(
    lambda x: (x - x.mean()) / (x.max() - x.min()))
print("Normalized")
print(features.head(2))

cluster_num = 11
model = km(n_clusters=cluster_num, n_init=5, max_iter=20)
model.fit_transform(features)
print("Model created")

features['labels'] = model.labels_
print("Got some labels.")
print(features.head(2))

# sampling my data to run the silhouette score
sample = features.sample(4000)
silhouette_score(sample.values, sample['labels'].values)
def choose_center(X):
    X_kmeans = km(n_clusters=10, random_state=0).fit(X)
    X_centers = X_kmeans.cluster_centers_
    return X_centers
import pandas as pd
from sklearn.cluster import KMeans as km
import numpy as np
import seaborn as sns

# read the data
df = pd.read_csv("Final-data.txt")

# choose k and build the model
k = int(input("k:"))
a = km(n_clusters=k).fit(df)

# Performance
TCSS = np.sum((df.values - np.sum(df.values, axis=0) / len(df))**2)
WCSS = np.zeros(k)
for index, i in enumerate(a.labels_):
    WCSS[i] += np.sum((df.values[index] - a.cluster_centers_[i])**2)**0.5
BCSS = TCSS - np.sum(WCSS)

dist = []
for i in a.cluster_centers_:
    for j in a.cluster_centers_:
        dist.append(np.sum((i - j)**2)**0.5)
DunnIndex = np.min(WCSS) / np.max(dist)

#####################################################
# write the results
c = np.zeros(k)
f = open("sonuc.txt", "w")
for i in range(len(a.labels_)):
    f.write("Kayit " + str(i) + ":\t" + "Kume " + str(a.labels_[i]) + "\n")
    c[a.labels_[i]] += 1
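For reference, scikit-learn already exposes a within-cluster sum of squares as the fitted model's inertia_ attribute. The sketch below (with random stand-in data replacing Final-data.txt) shows the relation to the hand-computed quantities above; note the script's WCSS accumulates Euclidean distances (square roots), while inertia_ sums squared distances, so the two agree only up to that choice of exponent.

# Illustrative cross-check with synthetic data.
import numpy as np
from sklearn.cluster import KMeans as km

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
model = km(n_clusters=4, random_state=0).fit(X)

# Squared-distance WCSS summed over all clusters equals model.inertia_.
wcss_sq = sum(np.sum((X[model.labels_ == i] - model.cluster_centers_[i]) ** 2)
              for i in range(4))
print(np.isclose(wcss_sq, model.inertia_))  # True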
b0 = []
b1 = []
for i in range(len(a)):
    if i % 2 == 0:
        b0.append(a[i])
    else:
        b1.append(a[i])
a0 = [float(x) for x in b0]
a1 = [float(x) for x in b1]

df = pd.DataFrame()
df['x'] = a0
df['y'] = a1

y_true = [0 for i in range(500)]
for i in range(len(a0) - 500):
    y_true.append(1)

model = km(n_clusters=3)
y = model.fit_predict(df)
plt.title("KMeans")
plt.scatter(df[y == 0]['x'], df[y == 0]['y'])
plt.scatter(df[y == 1]['x'], df[y == 1]['y'])
plt.show()
print("Purity score for KMeans: ", purity_score(y_true, y))

clustering = AgglomerativeClustering().fit(df)
y = clustering.labels_
plt.title("Agglo_Clustering")
plt.scatter(df[y == 0]['x'], df[y == 0]['y'])
plt.scatter(df[y == 1]['x'], df[y == 1]['y'])
plt.show()
print("Purity score for Agglomerative Clustering: ", purity_score(y_true, y))
def Prototyping(X, numP):
    from sklearn.cluster import KMeans as km
    kmeans = km(init='k-means++', n_clusters=numP)
    kmeans.fit(X)
    centers = kmeans.cluster_centers_
    return centers
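A hypothetical call to Prototyping, selecting a handful of k-means centroids as prototypes of a larger point cloud; random blobs are used purely for illustration.

# Illustrative only: synthetic data stands in for the real feature matrix X.
import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=500, centers=5, n_features=3, random_state=1)
prototypes = Prototyping(X, numP=5)
print(prototypes.shape)  # (5, 3): one prototype per cluster, one column per feature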
def scale(array):
    minimum = array.min()
    maximum = array.max()
    scaled_array = np.array([])
    for i in range(0, len(array)):
        scaled_array = np.append(scaled_array,
                                 (array[i] - minimum) / (maximum - minimum))
    return scaled_array

# In[]:

ofc_scaled = scale(df_full['ofc'])
mi_scaled = scale(df_full['transform'])
points = np.column_stack((ofc_scaled, mi_scaled))
# metrics = [df_full['ofc'], df_full['mi']]
# points = pd.concat(metrics, axis=1)

from sklearn.cluster import KMeans as km

kmeans = km(n_clusters=3)
# fit kmeans object to data
kmeans.fit(points)
# print location of clusters learned by kmeans object
print(kmeans.cluster_centers_)
# save new clusters for chart
y_km = kmeans.fit_predict(points)

# plt.scatter(points[y_km == 0, 0], points[y_km == 0, 1], s=100, c='red')
# plt.scatter(points[y_km == 1, 0], points[y_km == 1, 1], s=100, c='black')
import pandas as pd
from sklearn.cluster import KMeans as km
from sklearn.metrics import silhouette_score
import sys
from scipy.spatial.distance import cdist
import numpy as np

df = pd.read_csv("data_dist.csv.virus")
xs = list(df['x'])
ys = list(df['y'])
xs = [x - min(xs) for x in xs]  # lists don't support "list - scalar"
ys = [y - min(ys) for y in ys]
X = np.array(list(zip(xs, ys)))

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

stat = open('kstatistics.csv', 'w')
for nc in range(2, 13):
    kmeans = km(n_clusters=nc, random_state=10)
    cluster_labels = kmeans.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)
    # print("For n_clusters =", no_cluster, "The average silhouette_score is :", silhouette_avg)
    labels = kmeans.labels_
    dist = kmeans.transform(X)
    distortion = (sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1))
                  / X.shape[0])
    # print kmeans.cluster_centers_, sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1))/13
    print(nc, distortion, silhouette_avg)
    print(nc, distortion, silhouette_avg, file=stat)
im.show()
sys.exit()

labels = mat["data_labels"]
frqs = [['Actual/Predict', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 'Recall']] + \
       [[i] + [0 for j in range(10)] for i in range(10)] + [['Precision']]

# saving original pics
# for i in range(2000):
#     scipy.misc.imsave("Output/2.2c/Original_"+str(n_cluster)+"/"+str(i)+".bmp", data[i].reshape(28,28))

# modeling and predicting labels
pca = PCA(n_components=.9)
data = pca.fit_transform(data)
print(data[0])
sys.exit()
model = km(n_clusters=n_cluster, max_iter=2000, n_init=100, init='k-means++',
           tol=.00001, n_jobs=4)
model.fit(np.array(data))
print(1)
p_label = model.fit_predict(data)
print(2)

# finding mapping
for i in range(2000):
    result[p_label[i]].append(labels[i][0])
mapping = [max(set(i), key=i.count) for i in result]

# saving pics after applying PCA
for i in range(2000):
    scipy.misc.imsave("../Output/2.2c/After_PCA_" + str(n_cluster) + "/" + str(i) + ".bmp",
                      data[i][:81].reshape(9, 9))

# saving results
valores = data.values
escal = pre.MinMaxScaler()
x_esc = escal.fit_transform(valores)
x_normalizado = pd.DataFrame(x_esc)

pca = PCA(n_components=2)
reduced = pd.DataFrame(pca.fit_transform(x_normalizado))
reduced['x'] = reduced[0]
reduced['y'] = reduced[1]

from sklearn.cluster import KMeans as km
rede = km(n_clusters=4)
rede.fit(reduced)
lista = np.array([rede.labels_, nomes.to_numpy()])
reduced['cluster'] = rede.labels_.tolist()
reduced['nomes'] = nomes

import matplotlib.pyplot as plt
plt.scatter(reduced['x'], reduced['y'], c=reduced['cluster'], s=150)
[plt.text(reduced['x'][i], reduced['y'][i], nomes.to_numpy()[i])
 for i in range(len(nomes))]
cm.yaxis.set_ticklabels(cm.yaxis.get_ticklabels(), rotation=90)
cm.xaxis.set_ticklabels(cm.xaxis.get_ticklabels(), rotation=0)
plt.ylabel('True label')
plt.xlabel('Predicted label')


def convertCluster2Label(cluster_labels, original_labels, labels2convert):
    converted_labels = np.full(labels2convert.size, -1)
    for i in np.unique(cluster_labels):
        # majority vote: map cluster i to the most frequent true label among its members
        temp_original_labels = original_labels[cluster_labels == i]
        temp_label = np.bincount(temp_original_labels).argmax()
        converted_labels[labels2convert == i] = temp_label
    return converted_labels


iris = datasets.load_iris()
X = iris.data
y = iris.target
target_names = iris.target_names
X_train, X_test, y_train, y_true = train_test_split(X, y)

kmeans = km(n_clusters=3, n_init=2020)
kmeans.fit(X_train)
values = kmeans.cluster_centers_.squeeze()
trained_labels = kmeans.labels_
labels_predict = kmeans.predict(X_test)
print(labels_predict)

y_predict = convertCluster2Label(trained_labels, y_train, labels_predict)
print(y_predict)
confusionM(y_true, y_predict, target_names)