# -*- coding: utf-8 -*-
"""
Created on Sun Feb 14 23:29:50 2021

@author: 김도형
"""
import pandas as pd
import matplotlib.pyplot as plt

uci_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/\
00292/Wholesale%20customers%20data.csv'
df = pd.read_csv(uci_path, header=0)

X = df.iloc[:, :]

from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(X).transform(X)

from sklearn import cluster
kmeans = cluster.KMeans(init='k-means++', n_clusters=5, n_init=10)
kmeans.fit(X)

cluster_label = kmeans.labels_
df['Cluster'] = cluster_label
print(df)
# 1. Randomly assign k centroids.
# 2. For each data point, compute the distance to every centroid and
#    assign the point to the cluster of the nearest centroid.
# 3. Move each centroid to the mean of the points assigned to its cluster.
# 4. Repeat until the cluster assignments no longer change.
from sklearn import cluster
from sklearn import datasets
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
from sklearn import metrics

iris = datasets.load_iris()
X = iris.data[:, 0:2]
kmeans = cluster.KMeans(n_clusters=3, random_state=0).fit(X)
print("Clusters: ", kmeans.labels_)
# mean_squared_error was called unqualified but never imported; use the
# metrics module imported above. Note that cluster labels are an arbitrary
# permutation of the class labels, so this is only a rough sanity check.
print(metrics.mean_squared_error(kmeans.labels_, iris.target))

X = iris.data
Y = iris.target
print(X)
print()
print(Y)

estimator = [('k=8', cluster.KMeans(n_clusters=8)),
             ('k=3', cluster.KMeans(n_clusters=3)),
             ('k=3(r)', cluster.KMeans(n_clusters=3, n_init=1, init='random'))]
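# The four numbered steps above translate almost line-for-line into NumPy.
# A minimal from-scratch sketch for illustration only (not the sklearn
# implementation); `naive_kmeans` is a hypothetical helper name.
import numpy as np

def naive_kmeans(X, k, n_iter=100, seed=0):
    rng = np.random.default_rng(seed)
    # 1. pick k random points as the initial centroids
    centers = X[rng.choice(len(X), size=k, replace=False)]
    for _ in range(n_iter):
        # 2. assign each point to its nearest centroid
        dists = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # 3. move each centroid to the mean of its assigned points
        # (keeping the old centroid if a cluster ends up empty)
        new_centers = np.array([
            X[labels == j].mean(axis=0) if np.any(labels == j) else centers[j]
            for j in range(k)
        ])
        # 4. stop once the centroids (and hence assignments) stabilize
        if np.allclose(new_centers, centers):
            break
        centers = new_centers
    return labels, centers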
# Imports assumed by this fragment (an argparse parser `ap` with an
# "--image" option, GDAL, NumPy, matplotlib, sklearn).
import argparse
import numpy as np
import matplotlib.pyplot as plt
from osgeo import gdal, gdal_array
from sklearn import cluster

ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", required=True, help="path to input raster")
args = vars(ap.parse_args())

# Tell GDAL to throw Python exceptions, and register all drivers
gdal.UseExceptions()
gdal.AllRegister()

# Read in raster image
img_ds = gdal.Open(args["image"], gdal.GA_ReadOnly)

band = img_ds.GetRasterBand(4)
img = band.ReadAsArray()

X = img.reshape((-1, 1))

k_means = cluster.KMeans(n_clusters=8)
k_means.fit(X)

X_cluster = k_means.labels_
X_cluster = X_cluster.reshape(img.shape)

plt.figure(figsize=(20, 20))
plt.imshow(X_cluster, cmap="hsv")
plt.show()

# Read in raster image
img_ds = gdal.Open(args["image"], gdal.GA_ReadOnly)

img = np.zeros(
    (img_ds.RasterYSize, img_ds.RasterXSize, img_ds.RasterCount),
    gdal_array.GDALTypeCodeToNumericTypeCode(img_ds.GetRasterBand(1).DataType))
df['track'].apply(add_more_swim_data)

##

track = df.loc['7A96', 'track'].copy()
segs = track[:-1]

# variables to cluster on:
from sklearn import cluster

if 0:
    # standardize
    state = segs.loc[:, ['swim_hdg_rel', 'swim_speed', 'tnum_m']].values
    state = (state - state.mean(axis=0)) / state.std(axis=0)
    kmeans = cluster.KMeans(n_clusters=5).fit(state)
    labels = kmeans.labels_
if 1:
    # standardize
    state = segs.loc[:, ['swim_hdg_rel', 'swim_speed', 'tnum_m']].values
    state = (state - state.mean(axis=0)) / state.std(axis=0)
    spectral = cluster.SpectralClustering(n_clusters=6).fit(state)
    labels = spectral.labels_

num = 20
plt.figure(num).clf()
fig, (ax_geo, ax_swim) = plt.subplots(2, 1, num=num)

ax_geo.scatter(segs['x_m'], segs['y_m'], 20, labels, cmap='jet')
ax_swim.scatter(segs['swim_x'], segs['swim_y'], 20, labels, cmap='jet')
    for n in range(nrows - 1):
        y[n][l] = float(y[n][l].strip()) / float(max)
    # print(y)
    return y, first_col


if __name__ == '__main__':
    # Read the data
    print("Starting to read data")
    X, first_col = excel()
    # Cluster
    print("Starting clustering")
    n_clusters = 10
    km = cluster.KMeans(n_clusters=n_clusters, init='k-means++',
                        max_iter=1, n_init=1)
    km.fit(X)
    # Clustering done; optionally print the cluster label of each row
    # for i, j in enumerate(km.labels_):
    #     if (i % 100 == 0):
    #         print(i, j)
    # n_clusters groups in total
    data = []
    for i in range(n_clusters):
        text = ""
        for j, k in enumerate(km.labels_):
            if (i == k):
                text = text + str(first_col[j + 1])
csv = np.genfromtxt('output.csv', delimiter=",")[1:]
a = np.apply_along_axis(check_condition, 1, csv)
a = np.where(a == True)[0]
nonzero_rows = csv[a, :]
avg_synapse = np.mean(nonzero_rows[:, -1])
xyz_only = nonzero_rows[:, [0, 1, 2]]

if filter_less_than_avg:
    filter_avg_synapse = np.apply_along_axis(synapse_filt, 1, nonzero_rows,
                                             avg_synapse)
    a = np.where(filter_avg_synapse == True)[0]
    nonzero_filtered = nonzero_rows[a, :]
    xyz_only = nonzero_filtered[:, [0, 1, 2]]

kmeans_algo = cluster.KMeans(n_clusters=n_clusters)
clusters = kmeans_algo.fit_predict(xyz_only)
centers = kmeans_algo.cluster_centers_
print(centers)  # Python 3 print (original used a Python 2 print statement)

# randomly sample (range replaces the Python 2 xrange)
perm = np.random.permutation(range(1, len(xyz_only[:])))
xyz_only = xyz_only[perm[:samples]]
clusters = clusters[perm[:samples]]

# get range for graphing
x_min = np.amin(xyz_only[:, 0])
x_max = np.amax(xyz_only[:, 0])
y_max = np.amax(xyz_only[:, 1])
y_min = np.amin(xyz_only[:, 1])
z_min = np.amin(xyz_only[:, 2])
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from sklearn import cluster
from scipy.misc import face

face = face(gray=True)

n_clusters = 5
np.random.seed(0)

X = face.reshape((-1, 1))  # We need an (n_sample, n_feature) array
k_means = cluster.KMeans(n_clusters=n_clusters, n_init=4)
k_means.fit(X)
values = k_means.cluster_centers_.squeeze()
labels = k_means.labels_

# create an array from labels and values
face_compressed = np.choose(labels, values)
face_compressed.shape = face.shape

vmin = face.min()
vmax = face.max()

# original face
plt.figure(1, figsize=(3, 2.2))
plt.imshow(face, cmap='gray', vmin=vmin, vmax=256)
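# What np.choose does above is vector quantization: each pixel's cluster
# label indexes into the centroid gray levels, so the image is reduced to
# n_clusters distinct values. An equivalent fancy-indexing spelling
# (a sketch, reusing labels/values from the snippet above):
face_compressed = values[labels].reshape(face.shape)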
    for node in G:
        for neighbor in G.neighbors(node):
            edge_mat[images_list.index(node)][images_list.index(neighbor)] = 1
        edge_mat[images_list.index(node)][images_list.index(node)] = 1
    return edge_mat


img_img_graph = None
with open('pickles/cache/graph-k-10-20181123-165012.pkl', 'rb') as f:
    img_img_graph = pickle.load(f)

with open('pickles/pre-processed/images_list.pkl', 'rb') as f:
    images_list = pickle.load(f)

edge_matrix = graph_to_edge_matrix(img_img_graph, images_list)

k_clusters = 10
results = []
algorithms = {}

algorithms['kmeans'] = cluster.KMeans(n_clusters=k_clusters, n_init=1)

for model in algorithms.values():
    model.fit(edge_matrix)
    results.extend(model.labels_)

clusters = {}
for cluster_id in set(results):
    clusters[cluster_id] = [
        i for i, x in enumerate(results) if x == cluster_id
    ]  # [main_list[x] for x in indexes]

for cluster_id in clusters.keys():
    visualize_images("Cluster id " + str(cluster_id),
                     [images_list[x] for x in clusters[cluster_id]])
""" demo08_kmeans.py kmeans聚类 """ import numpy as np import sklearn.cluster as sc import matplotlib.pyplot as mp x = np.loadtxt('../ml_data/multiple3.txt', delimiter=',') # 构建聚类模型 model = sc.KMeans(n_clusters=4) model.fit(x) # 返回每个样本的聚类的类别标签: 0/1/2/3 pred_y = model.labels_ # 返回所有的聚类中心样本 centers = model.cluster_centers_ print(centers) # 绘制分类边界线 n = 500 l, r = x[:, 0].min() - 1, x[:, 0].max() + 1 b, t = x[:, 1].min() - 1, x[:, 1].max() + 1 grid_x = np.meshgrid(np.linspace(l, r, n), np.linspace(b, t, n)) flat_x = np.column_stack((grid_x[0].ravel(), grid_x[1].ravel())) flat_y = model.predict(flat_x) grid_y = flat_y.reshape(grid_x[0].shape) mp.figure('K-Means Cluster', facecolor='lightgray') mp.title('K-Means Cluster', fontsize=20) mp.xlabel('x', fontsize=14) mp.ylabel('y', fontsize=14) mp.tick_params(labelsize=10) mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
def perform_clustering():
    ##########################################################################
    ## Connect to DB and select data
    ##########################################################################

    # Connection string to connect to SQL Server named instance.
    conn_str = 'Driver=SQL Server;Server=VC5-SOPHIA;Database=tpcxbb_1gb;Trusted_Connection=True;'

    input_query = '''SELECT
        ss_customer_sk AS customer,
        ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio,
        ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio,
        ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio,
        COALESCE(returns_count, 0) AS frequency
    FROM
        (
            SELECT ss_customer_sk,
                -- return order ratio
                COUNT(distinct(ss_ticket_number)) AS orders_count,
                -- return ss_item_sk ratio
                COUNT(ss_item_sk) AS orders_items,
                -- return monetary amount ratio
                SUM( ss_net_paid ) AS orders_money
            FROM store_sales s
            GROUP BY ss_customer_sk
        ) orders
    LEFT OUTER JOIN
        (
            SELECT sr_customer_sk,
                -- return order ratio
                count(distinct(sr_ticket_number)) as returns_count,
                -- return ss_item_sk ratio
                COUNT(sr_item_sk) as returns_items,
                -- return monetary amount ratio
                SUM( sr_return_amt ) AS returns_money
            FROM store_returns
            GROUP BY sr_customer_sk
        ) returned ON ss_customer_sk=sr_customer_sk'''

    # Define the columns we wish to import. ("monetaryRatio" added here:
    # the query selects it and the clustering below uses it.)
    column_info = {
        "customer": {"type": "integer"},
        "orderRatio": {"type": "float"},
        "itemsRatio": {"type": "float"},
        "monetaryRatio": {"type": "float"},
        "frequency": {"type": "integer"}
    }

    data_source = revoscale.RxSqlServerData(sql_query=input_query,
                                            column_info=column_info,
                                            connection_string=conn_str)
    revoscale.RxInSqlServer(connection_string=conn_str, num_tasks=1,
                            auto_cleanup=False)
    # import data source and convert to pandas dataframe.
    customer_data = pd.DataFrame(revoscale.rx_import(data_source))
    # print("Data frame:", customer_data.head(n=5))
    cdata = customer_data

    n_clusters = 4
    means_cluster = sk_cluster.KMeans(n_clusters=n_clusters, random_state=111)
    columns = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"]
    est = means_cluster.fit(customer_data[columns])
    clusters = est.labels_
    customer_data['cluster'] = clusters

    # Print some data about the clusters:
    # For each cluster, count the members.
    for c in range(n_clusters):
        cluster_members = customer_data[customer_data['cluster'] == c][:]
        print('Cluster{}(n={}):'.format(c, len(cluster_members)))
        print('-' * 17)
    # Print mean values per cluster.
    print(customer_data.groupby(['cluster']).mean())
plot_dendrogram(dist)

##############################################################################
##############################################################################

# input() returns a string in Python 3, so cast to int before passing
# the value to KMeans.
num_clusters = int(input('How many clusters would you like? '))  # check dendrogram

##############################################################################
##############################################################################

#### K-means clustering ####

# num_clusters = len(sample['author'].unique())  # set number of clusters to number of unique authors

km = cluster.KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()  # list of clusters

# create doc attribute df including cluster assignment
revisions = {
    'time': sample.index,
    'author': sample['author'].tolist(),
    'text': texts,
    'cluster': clusters
}
frame = pd.DataFrame(revisions,
                     index=[clusters],
                     columns=['time', 'author', 'cluster', 'text'])

print('Cluster value counts:')
plt.scatter(x[:, 0], x[:, 1], c=y_pred, cmap='viridis', marker='^', s=200,
            edgecolor='k')
plt.axis([0, 12, 0, 15])
plt.show()

# #### Applying the K-means model to real data

# In[7]:

# p282
# cluster.KMeans and datasets.load_iris are used below, so import the
# modules rather than only KMeans.
from sklearn import cluster, datasets

iris = datasets.load_iris()
X_iris = iris.data
y_iris = iris.target

k_means = cluster.KMeans(n_clusters=3)
k_means.fit(X_iris)

print(k_means.labels_[::10])
print(y_iris[::10])

# In[8]:

from sklearn import cluster, datasets
import matplotlib.pyplot as plt

iris = datasets.load_iris()
X = iris.data[:, 2:]
y = iris.target
X1 = iris.data[[1, 50, 100], 2:]
y1 = iris.target[[1, 50, 100]]
    vectorizer.fit(text_corpora)
    vector = vectorizer.transform(text_corpora)
    return vector


# Imports assumed by this fragment: pandas, NLTK (corpus module as `nl`,
# top-level package as `nt`), a PorterStemmer instance `ps`, and
# sklearn.cluster as `km`.
import pandas as pd
import nltk.corpus as nl
import nltk as nt
from nltk.stem import PorterStemmer
import sklearn.cluster as km

ps = PorterStemmer()

stopword_list = list(set(nl.stopwords.words('english')))
r_df = pd.read_csv("python4.csv", encoding="ISO-8859-1")
print(r_df)

text_corpora = [s.translate(str.maketrans("", "", "0123456789"))
                for s in r_df.loc[:, "scraptweets"]]
words_data = [nt.word_tokenize(s.lower()) for s in text_corpora]
words_data = [[ps.stem(word) for word in sent if word not in stopword_list]
              for sent in words_data]
sent_data = [" ".join(sent) for sent in words_data]
vector = generate_tfidf(sent_data)

kmeans_obj = km.KMeans(n_clusters=5, max_iter=100)
clusters = kmeans_obj.fit(vector)
r_df["label"] = clusters.labels_

print("cluster 1")
print(r_df.loc[r_df["label"] == 0])  # the bare expression did nothing; print it
print(r_df.loc[r_df["label"] == 1])
r_df.to_csv("Clustered_tweet2.txt", index=False)

file = open('Clustered_tweet1.txt', encoding="utf8")
a = file.read()
stopword_list = list(set(nl.stopwords.words('english')))
wordCount = {}
for word in a.lower().split():
from sklearn import datasets
from sklearn import cluster
import matplotlib.pyplot as plt

data, label = datasets.make_blobs(n_samples=500, n_features=2, centers=5)
e = cluster.KMeans(n_clusters=5)
e.fit(data)
print(e.labels_)
print(e.cluster_centers_)

plt.scatter(data[:, 0], data[:, 1], marker="o", c=e.labels_, edgecolors="k")
plt.scatter(e.cluster_centers_[:, 0], e.cluster_centers_[:, 1], marker="x")
plt.show()
                      columns=df_cluster.columns)
# type(df_std)

# Creating clusters
from sklearn import cluster
from sklearn.metrics import silhouette_score

# Using the silhouette score to find the right number of clusters.
# The higher the silhouette score, the better, so usually the k that
# maximizes it should be selected. See the link below for interpretation:
# http://stackoverflow.com/questions/23687247/efficient-k-means-evaluation-with-silhouette-score-in-sklearn
# Use this only as a guideline, as the method has its limitations;
# business judgement takes precedence.
for k in range(2, 11):
    kmeans = cluster.KMeans(n_clusters=k)
    kmeans.fit(df_cluster_scaled)
    label = kmeans.labels_
    sil_coeff = silhouette_score(df_cluster_scaled, label, metric='euclidean')
    print("k={}, The Silhouette Coefficient is {}".format(k, sil_coeff))

# Number of clusters
k = 6
kmeans = cluster.KMeans(n_clusters=k)
kmeans.fit(df_cluster_scaled)  # fitting cluster

# Scoring and analyzing cluster
# Caution: ideally a training sample should be separated out for all this
# analysis. Not doing that here because there are only 800 customers;
# analyzing on the test sample instead.
# Assigning cluster to each row in the ORIGINAL data.
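# A common complement to the silhouette score is the elbow method: plot
# KMeans inertia_ (within-cluster sum of squared distances) against k and
# look for the bend. A minimal sketch, reusing df_cluster_scaled from above:
import matplotlib.pyplot as plt
from sklearn import cluster

inertias = []
ks = range(2, 11)
for k in ks:
    km_model = cluster.KMeans(n_clusters=k).fit(df_cluster_scaled)
    inertias.append(km_model.inertia_)  # sum of squared distances to centroids

plt.plot(ks, inertias, marker='o')
plt.xlabel('k')
plt.ylabel('inertia')
plt.title('Elbow method')
plt.show()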
    X_train, X_validation, Y_train, Y_validation = train_test_split(
        X, Y, test_size=0.3, random_state=seed[i])
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train, Y_train)
    predictions = knn.predict(X_validation)
    acc1 = accuracy_score(Y_validation, predictions)
    total.append(acc1)

print("variance: ", np.var(total))
print("accuracy: ", np.mean(total))

# ------------------------------- K-means -------------------------------
dataset = data.iloc[:, 0:4].values
kmeans = cluster.KMeans(n_clusters=3).fit_predict(dataset)

plt.scatter(dataset[kmeans == 0, 0],
            dataset[kmeans == 0, 1],
            s=100,
            c='red',
            label='Iris-setosa')
plt.scatter(dataset[kmeans == 1, 0],
            dataset[kmeans == 1, 1],
            s=100,
            c='blue',
            label='Iris-versicolour')
plt.scatter(dataset[kmeans == 2, 0],
            dataset[kmeans == 2, 1],
            s=100,
            c='green',
# Imports assumed by this fragment: sklearn datasets as `data`,
# sklearn.cluster as `skc`, plus train_test_split, f1_score and NumPy.
import numpy as np
from sklearn import datasets as data
import sklearn.cluster as skc
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm

dataset = data.load_iris()

# %%
# Define the parameters for training and testing
test_size = 0.5
ncenters = 3
n_testes = 1000

f1_list = []
for _ in tqdm(range(n_testes)):
    # Split the dataset into training and test sets
    data_train, data_test, _, label_test = train_test_split(
        dataset.data, dataset.target, test_size=test_size)

    # Fit k-means (the original comment said "Fuzzy C-means", but the
    # code actually uses KMeans)
    kmean_obj = skc.KMeans(n_clusters=ncenters, max_iter=10000)
    kmean_obj.fit(data_train)

    # Score the test set against the generated centers. Note that cluster
    # labels are an arbitrary permutation of the class labels, so a direct
    # f1_score comparison is only meaningful when the permutation matches.
    predicted = kmean_obj.predict(data_test)
    f1_list.append(f1_score(label_test, predicted, average='weighted'))

f1_mean = np.mean(f1_list)
f1_error = np.std(f1_list) / np.sqrt(len(f1_list))

print('\nIn {:d} runs:'
      '\nF1 Score = {:.04f} +/- {:.04f}'
      .format(n_testes, f1_mean, f1_error))
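# Because k-means labels are an arbitrary permutation of the classes, a
# fairer score first aligns clusters to classes. A minimal sketch using
# the Hungarian algorithm (scipy.optimize.linear_sum_assignment);
# `align_labels` is a hypothetical helper, not part of the snippet above.
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix

def align_labels(true_labels, cluster_labels):
    """Remap cluster labels to maximize agreement with the true labels."""
    cm = confusion_matrix(true_labels, cluster_labels)
    # find the cluster-to-class assignment with maximum total overlap
    rows, cols = linear_sum_assignment(-cm)
    mapping = {c: r for r, c in zip(rows, cols)}
    return np.array([mapping[c] for c in cluster_labels])

# usage: predicted = align_labels(label_test, kmean_obj.predict(data_test))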
def kmeans_clust(som, n_clusters=8):
    print("Performing K-means clustering to SOM trained data...")
    cl_labels = clust.KMeans(
        n_clusters=n_clusters,
        random_state=tfpinit.km_seed).fit_predict(som.codebook.matrix)
    return cl_labels
def PCA_graph(INPUT_FILE, DATASET_LABEL):
    """Uses the PCA that plink produces, generates a plot, and uses
    k-means clustering to determine outliers to remove."""

    def SuperPop(x):
        if x in ["GBR", "CEU", "TSI", "FIN", "IBS"]:
            return "EUR"
        elif x in ["CHB", "JPT", "CHS", "CDX", "KHV"]:
            return "EAS"
        elif x in ["YRI", "LWK", "GWD", "MSL", "ESN", "ASW", "ACB"]:
            return "AFR"
        elif x in ["MXL", "PUR", "CLM", "PEL"]:
            return "AMR"
        elif x in ["GIH", "PJL", "BEB", "STU", "ITU"]:
            return "SAS"
        else:
            return "Samples"

    ## Starting to handle big data so bringing in pandas
    raw = pd.read_csv(INPUT_FILE, sep=" ", header=None)

    ## put 1000g data into superpopulation groups and define dataset
    clean = (raw[list(raw.columns[:4])])
    clean.columns = ['FAM_ID', 'ID', 'C1', 'C2']
    clean.set_index(['FAM_ID'], inplace=True)

    ## setting up super population codes to map colours for graph
    clean["POP"] = clean.ID.apply(SuperPop)
    groups = clean.groupby('POP')

    ## Plotting
    fig, ax = plt.subplots()
    ax.margins(0.1)
    for name, group in groups:
        ax.plot(group.C1, group.C2, marker='o', linestyle='', ms=4, label=name)
    ax.legend(numpoints=1, loc='best')
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.suptitle("PCA on " + DATASET_LABEL, weight='bold')
    fig.savefig(DATASET_LABEL + ".PCA_results.pdf")
    plt.close()

    ## k-means clustering to find outliers
    find_out = clean[['C1', 'C2']].copy()
    k_means = cluster.KMeans(n_clusters=5)
    k_means.fit(find_out)
    centroids = k_means.cluster_centers_
    labels = k_means.labels_
    results = pd.DataFrame([clean.index, labels]).T
    results.columns = ["FAM_ID", "k_group"]
    results["ID"] = clean[["ID"]].copy()
    results.set_index(['FAM_ID'], inplace=True)
    output_label = (DATASET_LABEL + ".PCA_kmeans.txt")

    ## Display samples that are not Europeans in dataset
    merge_df = pd.merge(clean, results, right_index=True, left_index=True)
    merge_df['k_group'] = merge_df['k_group'].astype(int)
    test = merge_df.loc[merge_df['POP'] == "EUR", ['k_group']].apply(np.median)
    Euro_group = int(test)
    # print("European cluster is: " + str(Euro_group))
    your_samples = merge_df.loc[merge_df['POP'] == "Samples", ['k_group']]
    your_samples['check'] = np.where(your_samples['k_group'] == Euro_group,
                                     'good', 'bad')
    bad_ids = your_samples[your_samples['check'] == 'bad']
    after = (clean[~clean.index.isin(bad_ids.index)])
    count = len(bad_ids.index.get_level_values(0))
    # print(str(count) + " samples fall outside the European cluster")
    after_groups = after.groupby('POP')

    ### Plotting with outliers removed
    fig, ax = plt.subplots()
    ax.margins(0.1)
    for name, after_groups in after_groups:
        ax.plot(after_groups.C1, after_groups.C2, marker='o', linestyle='',
                ms=4, label=name)
    ax.legend(numpoints=1, loc='best')
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.suptitle("Outliers removed PCA on " + DATASET_LABEL + " - " +
                 str(count) + " Samples were removed", weight='bold')
    # print("Graph saved as " + DATASET_LABEL + ".PCA_results.pdf")
    # print("Outliers removed graph saved as " + DATASET_LABEL + ".outliers_removed_PCA_results.pdf")
    fig.savefig(DATASET_LABEL + ".outliers_removed_PCA_results.pdf")
    output_id = (DATASET_LABEL + ".outliers.txt")
    # print("bad IDs exported to text file: " + output_id)
    bad_ids.to_csv(output_id, sep="\t", header=None)
    plt.close()
def clusteringmap_category(ax, sm, n_clusters, dataset, colorcategory, labels,
                           savepath):
    """Output maps that print colors on dots based on their properties."""
    categories = dataset[colorcategory]  # colorcategory is one column of the dataset
    cmap = plt.get_cmap("tab20")  # cmap for background
    n_palette = 20  # number of different colors in this color palette
    color_list = [cmap((i % n_palette) / n_palette) for i in range(n_clusters)]
    msz = sm.codebook.mapsize
    proj = sm.project_data(sm.data_raw)
    coord = sm.bmu_ind_to_xy(proj)

    fig, ax = plt.subplots(1, 1, figsize=(30, 30))
    cl_labels = clust.KMeans(n_clusters=n_clusters,
                             random_state=555).fit_predict(sm.codebook.matrix)

    # fill each rectangular unit area with cluster color
    # and draw line segment to the border of cluster
    norm = mpl.colors.Normalize(vmin=0, vmax=n_palette, clip=True)
    # ax.pcolormesh(cl_labels.reshape(msz[0], msz[1]).T % n_palette,
    #               cmap=cmap, norm=norm, edgecolors='face',
    #               lw=0.5, alpha=0.5)  # make the background white

    # config for each grid
    ax.scatter(coord[:, 0] + 0.5, coord[:, 1] + 0.5, c='k', marker='o')
    ax.axis('off')

    categoryname = list(dataset.groupby(colorcategory).count().index)
    categories_int = categories.apply(categoryname.index)
    N = len(categoryname)
    cmap_labels = plt.cm.gist_ncar
    # extract all colors from the .jet map
    cmaplist = [cmap_labels(i) for i in range(cmap_labels.N)]
    # create the new map
    cmap_labels = cmap_labels.from_list('Custom cmap', cmaplist, cmap_labels.N)
    # define the bins and normalize
    bounds = np.linspace(0, N, N + 1)
    norm_labels = mpl.colors.BoundaryNorm(bounds, cmap_labels.N)
    scat = ax.scatter(coord[:, 0] + 0.5, coord[:, 1] + 0.5, c=categories_int,
                      s=30, cmap=cmap_labels,
                      norm=norm_labels)  # s is the size of the projection dot
    cbar = plt.colorbar(scat, spacing='proportional', ticks=bounds)
    cbar.ax.get_yaxis().set_ticks([])
    for j, lab in enumerate(categoryname):
        cbar.ax.text(1, (2 * j + 1) / (2 * (len(categoryname))), lab,
                     ha='left', va='center', fontsize=30)
    cbar.ax.get_yaxis().labelpad = 15
    # cbar.ax.set_ylabel('# of contacts', rotation=270)
    ax.axis('off')

    for label, x, y in zip(labels, coord[:, 0], coord[:, 1]):
        x += 0.2
        y += 0.2
        # "+ 0.1" shifts the label location in the upper-right direction;
        # randomize the location of the label so labels do not overlap
        x += 0.1 * np.random.randn()
        y += 0.3 * np.random.randn()
        # wrap of label for chemical compound
        # label = str_wrap(label)
        ax.text(x + 0.4, y + 0.4, label, horizontalalignment='left',
                verticalalignment='bottom', rotation=30, fontsize=12,
                weight='semibold')

    # cl_labels = som.cluster(n_clusters)
    cl_labels = clust.KMeans(n_clusters=n_clusters,
                             random_state=555).fit_predict(sm.codebook.matrix)
    for i in range(len(cl_labels)):
        rect_x = [i // msz[1], i // msz[1], i // msz[1] + 1, i // msz[1] + 1]
        rect_y = [i % msz[1], i % msz[1] + 1, i % msz[1] + 1, i % msz[1]]
        if i % msz[1] + 1 < msz[1]:  # top border
            if cl_labels[i] != cl_labels[i + 1]:
                ax.plot([rect_x[1], rect_x[2]], [rect_y[1], rect_y[2]],
                        'k-', lw=10)  # boundary line width, originally 2.5
        if i + msz[1] < len(cl_labels):  # right border
            if cl_labels[i] != cl_labels[i + msz[1]]:
                ax.plot([rect_x[2], rect_x[3]], [rect_y[2], rect_y[3]],
                        'k-', lw=10)  # originally 2.5
    plt.savefig(savepath)
    return cl_labels
def eval_batch(x_train, y_train, x_test, y_test, classifier, components,
               no_clusters, dimensionality):
    cluster_finder = cluster.KMeans(n_clusters=no_clusters)
    if classifier == 'mbk':
        cluster_finder = MiniBatchKMeans(init='k-means++',
                                         n_clusters=no_clusters,
                                         batch_size=32,
                                         max_no_improvement=10,
                                         verbose=0)
        cluster_finder.fit(x_train)  # was `x`, which is undefined here
        cddd = str(cluster_finder.score(x_train))  # score is a method, not an attribute
        clll = str(cluster_finder)
        log_str = str(components) + 'score' + cddd + 'algo=' + clll + \
            'comp=' + str(components) + dimensionality
        labels = cluster_finder.labels_
    else:
        cluster_finder = cluster.KMeans(n_clusters=no_clusters)
        cluster_finder.fit(x_train)
        cddd = str(cluster_finder.score(x_train))
        clll = str(cluster_finder)
        log_str = str(components) + 'score' + cddd + 'algo=' + clll + \
            'comp=' + str(components) + dimensionality
        labels = cluster_finder.labels_

    clustered_x = []
    clustered_y = []
    for c in range(0, no_clusters):
        clustered_x.append([])
        clustered_y.append([])

    for i, item in enumerate(x_train):
        item = item.reshape(1, -1)
        predicted = cluster_finder.predict(item)
        clustered_x[predicted[0]].append(item)
        clustered_y[predicted[0]].append(y_train[i])  # was `y`, undefined

    # here testing
    for q, qtem in enumerate(x_test):  # was `test_x`, undefined
        qtem = qtem.reshape(1, -1)
        predicted = cluster_finder.predict(qtem)[0]
        print('predicted=', predicted)
        # closest = find_closest(qtem, clustered_x[predicted], clustered_y[predicted], 'mutual_info_score')
        # closest = find_closest(qtem, clustered_x[predicted], clustered_y[predicted], 'euclidean')
        closest = find_closest(qtem, clustered_x[predicted],
                               clustered_y[predicted], 'cosine')
        print('closest=', closest)
        generated = decode_sequence(closest[1])
        print('generated=', generated)
        actual = decode_sequence(y_test[q])  # was `test_y`, undefined
        print('actual=', actual)

        import nltk
        BLEUscore = nltk.translate.bleu_score.sentence_bleu([generated], actual)
        log = log_str + 'bleu-4=' + str(BLEUscore)
        rouge = Rouge()
        scores = rouge.get_scores(generated, actual)
        log = log_str + str(scores)
        log_file.writelines(log + "\n")
        log_file.writelines("---------------------" + '\n')
def extract_fribbles(stimulus_directory):
    """Returns fribbles."""

    # load answer key
    answer_key = extract_answer_key(stimulus_directory)

    # return list of trial names
    extract_list = list(answer_key['oddity'])

    # set diameter of images (determined manually)
    d = int(200 / 2)

    # identify experiment folder
    task_folders = [i for i in os.listdir(stimulus_directory) if 'pdf' not in i]

    # import rotation keys
    rotation = import_rotation_keys(stimulus_directory)

    # stimulus information for loading
    i_folder = [i for i in task_folders if 'oddity' in i][0]

    # identify all files in directory
    files = os.listdir(os.path.join(stimulus_directory, i_folder))

    # initialize kmeans segmentation protocol -- 7 images per screen
    k_means = cluster.KMeans(n_clusters=7, n_init=4)

    oddity_images = {}

    for i_file in files:

        # create human readable filename
        i_number = i_file[i_file.find('_') + 1:len(i_file) - 4]

        if i_number in extract_list:

            # load image
            i_image = np.array(
                Image.open(os.path.join(stimulus_directory, i_folder, i_file)))

            ## kmeans segmentation protocol ##

            # binarize layer of image
            binary = i_image[:, :, 1] != 255

            # convert binarized image into a vector of all non-blank locations
            points = [[i, j] for i in range(binary.shape[0])
                      for j in range(binary.shape[1]) if binary[i, j]]

            # cluster non-blank locations to determine object centers of mass
            k_means.fit(points)

            # determine mapping between cluster order and image order
            order = np.array([0 for i in range(7)])

            # determine whether image is in the top row
            top_row = k_means.cluster_centers_[:, 0] < 400

            # extract all top row images
            order[top_row] = k_means.cluster_centers_[:, 1][top_row].argsort().argsort()

            # extract all bottom row images
            order[top_row == 0] = k_means.cluster_centers_[:, 1][top_row == 0].argsort().argsort() + 4

            # sort
            order = order.argsort()

            ## segmentation complete ##

            # iterate over all objects in image
            i_slide = []
            for i_segmented_object in range(len(order)):

                # identify center of mass for each object
                x, y = k_means.cluster_centers_[order[i_segmented_object], :]

                # select an area around the center of mass defined above
                i_object = i_image[int(x - d):int(x + d), int(y - d):int(y + d)]

                # add segmented image
                i_slide.append(i_object)

            # resize and append to ambiguity type
            oddity_images[i_number] = [
                imresize(i_slide[i], (224, 224)) for i in range(len(i_slide))
            ]

    return oddity_images
def _cluster(self, data, targets):
    clusterer = cluster.KMeans(
        n_clusters=len(set(targets.tolist()))).fit(data)
    return metrics.mutual_info_score(targets, clusterer.labels_)
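# Mutual information is invariant to permutations of the cluster labels,
# so no cluster-to-class alignment is needed before scoring. A minimal
# usage sketch with made-up blob data; the class name `Evaluator` is
# hypothetical, only the _cluster body above comes from the snippet.
from sklearn import cluster, datasets, metrics

class Evaluator:
    def _cluster(self, data, targets):
        clusterer = cluster.KMeans(
            n_clusters=len(set(targets.tolist()))).fit(data)
        return metrics.mutual_info_score(targets, clusterer.labels_)

X, y = datasets.make_blobs(n_samples=300, centers=3, random_state=0)
print(Evaluator()._cluster(X, y))  # higher means clusters track the targets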
def extract_and_rotate_novel_images(stimulus_directory):
    """Return novel images in their canonical orientation."""

    # load answer key
    answer_key = extract_answer_key(stimulus_directory)

    # identify task folders
    task_folders = [i for i in os.listdir(stimulus_directory) if 'pdf' not in i]

    # import rotation keys
    rotation = import_rotation_keys(stimulus_directory)

    # stimulus information for loading
    i_folder = [i for i in task_folders if 'novel' in i][0]

    # all files
    files = os.listdir(os.path.join(stimulus_directory, i_folder))

    # initialize image segmentation protocol -- four objects per stimulus screen
    k_means = cluster.KMeans(n_clusters=4, n_init=4)

    # initialize data storage
    novel_images = {'high': {}, 'low': {}}

    # set diameter of images (determined manually)
    d = int(350 / 2)

    for i_file in files:

        # determine ambiguity level of this trial
        if 'LOW' in i_file:
            amb = 'low'
        if 'HIG' in i_file:
            amb = 'high'

        # define human readable filename
        i_number = i_file[i_file.find('_') + 1:len(i_file) - 4]

        # only select those images that we have rotation keys for
        if (i_file in rotation['novel'][amb].keys()) * (i_number in answer_key['novel_%s' % amb].keys()):

            # load rotations necessary for image
            image_rotations = rotation['novel'][amb][i_file]

            # load image
            i_image = np.array(
                Image.open(os.path.join(stimulus_directory, i_folder, i_file)))

            ##### begin kmeans segmentation protocol #####

            # binarize layer of image
            io = i_image[:, :, 1] != 255

            # convert binarized image into a vector of all non-blank locations
            points = [[i, j] for i in range(io.shape[0])
                      for j in range(io.shape[1]) if io[i, j]]

            # cluster non-blank locations to determine object centers of mass
            k_means.fit(points)

            # determine mapping between cluster order and image order
            order = determine_order_of_clusters(k_means.cluster_centers_, i_image)

            ##### end segmentation protocol #####

            # iterate over all objects in image
            i_slide = []
            for i_segmented_object in range(len(order)):

                # identify centroids of clusters
                x, y = k_means.cluster_centers_[order[i_segmented_object], :]

                # select an area around the center of mass defined above
                i_object = i_image[int(x - d):int(x + d), int(y - d):int(y + d)]

                # rotate the selected area into its canonical orientation
                i_object_rotated = np.rot90(i_object,
                                            k=image_rotations[i_segmented_object])

                # add to list
                i_slide.append(i_object_rotated)

            # resize images
            i_slide = [imresize(i_slide[i], (224, 224))
                       for i in range(len(i_slide))]

            # append to ambiguity type
            novel_images[amb][i_number] = i_slide

    return novel_images
    correct = (mapped_preds == targets).sum()
    total = len(targets)
    acc = correct / (total + eps)
    cm = metrics.confusion_matrix(targets, mapped_preds)
    return loss, acc, cm


df = pd.read_csv("ATNTFaceImages.txt", header=None)
df_matrix = df.to_numpy()
inputs = df_matrix[1:].T
targets = df_matrix[0].T

k = 40
model = cluster.KMeans(k)
model.fit(inputs)

loss, acc, cm = kmeans_metric(model, inputs, targets)
print(f"Loss: {loss:.5f} Acc: {acc:.5f}")
print("confusion matrix:")
print(cm)

df = pd.read_csv("HandWrittenLetters.txt", header=None)
df_matrix = df.to_numpy()
inputs = df_matrix[1:].T
targets = df_matrix[0].T

k = 26
model = cluster.KMeans(k)
model.fit(inputs)

loss, acc, cm = kmeans_metric(model, inputs, targets)
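# The fragment above relies on `mapped_preds`, built earlier in
# kmeans_metric and not shown here. A minimal majority-vote sketch of how
# such a mapping is commonly built (an assumption, not the snippet's actual
# code): each cluster is relabeled with the most frequent true class among
# its members.
import numpy as np

def map_clusters_by_majority(cluster_labels, targets):
    mapped = np.empty_like(cluster_labels)
    for c in np.unique(cluster_labels):
        members = targets[cluster_labels == c]
        # relabel cluster c with its most common true class
        values, counts = np.unique(members, return_counts=True)
        mapped[cluster_labels == c] = values[counts.argmax()]
    return mapped

# usage: mapped_preds = map_clusters_by_majority(model.labels_, targets)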
def clasterize_n(data, n):
    kmean = cluster.KMeans(n_clusters=n, init='k-means++', random_state=241)
    # KMeans.fit expects a 2-D array, so wrap the single 'time' feature
    # as a column (a flat list would make fit raise a ValueError).
    X = preprocessing.scale([[el['time']] for el in data])
    kmean.fit(X)
    return kmean.labels_
        df.loc[county, 'MHI'] = liquor.loc[county, 'Median_Household_Income']
    return df


# Sets a custom color palette for clusters
colors = ['r', 'b', 'y', 'g', 'c', 'm']

def getColors(labels):
    return [colors[l] for l in labels]


# In[251]:

# Setup kmeans
klean = lean[['VODKA', 'WHISKEY']]
kaveraged = averaged[['VODKA', 'WHISKEY']]
kmeans = sk.KMeans(n_clusters=3, random_state=0).fit(klean)

# Plot
fig, ax = plt.subplots(1)
plt.scatter(klean['VODKA'], klean['WHISKEY'],
            color=getColors(kmeans.labels_), label=kmeans.labels_)
plt.title('Vodka and Whisky Percentage of Sales - Median by County, Filtered, 3 Clusters')
plt.xlabel('Vodka Percentage of Sales - Median')
plt.ylabel('Whisky Percentage of Sales - Median')

# Add rectangle
rect = patches.Rectangle((-0.02, -0.02), 0.04, 0.04, linewidth=1,
                         edgecolor='r', facecolor='none')
ax.add_patch(rect)

plt.show()
fig.savefig('plots/Lean3Clusters.png')

# In[252]:
def get_kmeans(clusters):
    return cluster.KMeans(n_clusters=clusters, init='k-means++',
                          random_state=241, tol=0.1)
def estim_class_model(features, nb_classes, estim_model='GMM', pca_coef=None,
                      use_scaler=True, max_iter=99):
    """ create pipeline (scaler, PCA, model) over several options how
    to cluster samples and fit it on data

    :param ndarray features:
    :param int nb_classes: number of expected classes
    :param float pca_coef: range (0, 1) or None
    :param bool use_scaler: whether use a scaler
    :param str estim_model: used model
    :param int max_iter:
    :return:

    >>> np.random.seed(0)
    >>> fts = np.row_stack([np.random.random((50, 3)) - 1,
    ...                     np.random.random((50, 3)) + 1])
    >>> mm = estim_class_model(fts, 2)
    >>> mm.predict_proba(fts).shape
    (100, 2)
    >>> mm = estim_class_model(fts, 2, estim_model='GMM_kmeans',
    ...                        pca_coef=0.95, max_iter=3)
    >>> mm.predict_proba(fts).shape
    (100, 2)
    >>> mm = estim_class_model(fts, 2, estim_model='GMM_Otsu', max_iter=3)
    >>> mm.predict_proba(fts).shape
    (100, 2)
    >>> mm = estim_class_model(fts, 2, estim_model='kmeans_quantiles',
    ...                        use_scaler=False, max_iter=3)
    >>> mm.predict_proba(fts).shape
    (100, 2)
    >>> mm = estim_class_model(fts, 2, estim_model='BGM', max_iter=3)
    >>> mm.predict_proba(fts).shape
    (100, 2)
    >>> mm = estim_class_model(fts, 2, estim_model='Otsu', max_iter=3)
    >>> mm.predict_proba(fts).shape
    (100, 2)
    """
    components = []
    if use_scaler:
        components += [('std_scaler', preprocessing.StandardScaler())]
    if pca_coef is not None:
        components += [('reduce_dim', decomposition.PCA(pca_coef))]

    nb_inits = max(1, int(np.sqrt(max_iter)))
    # http://scikit-learn.org/stable/modules/generated/sklearn.mixture.GMM.html
    mm = mixture.GaussianMixture(n_components=nb_classes,
                                 covariance_type='full',
                                 n_init=nb_inits,
                                 max_iter=max_iter)

    # split the model and the used initialisation
    if '_' in estim_model:
        init_type = estim_model.split('_')[-1]
        estim_model = estim_model.split('_')[0]
    else:
        init_type = ''

    y = None
    if estim_model == 'GMM':
        # model = estim_class_model_gmm(features, nb_classes)
        if init_type == 'kmeans':
            mm.set_params(n_init=1)
            # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
            kmeans = cluster.KMeans(n_clusters=nb_classes,
                                    init='k-means++',
                                    n_jobs=-1)
            y = kmeans.fit_predict(features)
        elif init_type == 'Otsu':
            mm.set_params(n_init=1)
            y = compute_multivarian_otsu(features)

    elif estim_model == 'kmeans':
        # http://scikit-learn.org/stable/modules/generated/sklearn.mixture.GMM.html
        mm.set_params(max_iter=1)
        init_type = 'quantiles' if init_type == 'quantiles' else 'k-means++'
        _, y = estim_class_model_kmeans(features, nb_classes,
                                        init_type=init_type,
                                        max_iter=max_iter)

        logging.info('compute probability of each feature for all components')

    elif estim_model == 'BGM':
        mm = mixture.BayesianGaussianMixture(n_components=nb_classes,
                                             covariance_type='full',
                                             n_init=nb_inits,
                                             max_iter=max_iter)

    elif estim_model == 'Otsu' and nb_classes == 2:
        mm.set_params(max_iter=1, n_init=1)
        y = compute_multivarian_otsu(features)

    components += [('model', mm)]
    # compose the pipeline
    model = pipeline.Pipeline(components)

    if y is not None:
        # fit with examples
        model.fit(features, y)
    else:
        # fit from scratch
        model.fit(features)
    return model
def cluster_weights(weights, n_clusters):
    from sklearn import cluster
    kmeans = cluster.KMeans(n_clusters=n_clusters).fit(
        weights.reshape((-1, 1)))
    return kmeans.labels_.reshape(weights.shape), np.around(
        kmeans.cluster_centers_).astype(np.int32)
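# A minimal usage sketch of cluster_weights (defined above) for weight
# quantization: every weight maps to its rounded cluster centroid, so at
# most n_clusters distinct values remain. The example array is made up.
import numpy as np

weights = np.random.randint(-128, 128, size=(4, 4)).astype(np.float64)
labels, centers = cluster_weights(weights, n_clusters=3)
quantized = centers[labels].squeeze(-1)  # look up each weight's centroid
print(quantized)  # at most 3 distinct integer values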