import numpy as np
from fcmeans import FCM


class FCmeansDaily:
    def __init__(self, training_data):
        self.training_data = training_data
        data_for_clustering = self.preprocess_daily_data_to_fit_fcmeans_library(training_data)
        self.fcmeans = FCM(n_clusters=5, random_state=0)
        self.fcmeans.fit(data_for_clustering)

    def preprocess_daily_data_to_fit_fcmeans_library(self, training_data):
        # collect the four candle features of each day into one row per sample
        list_to_pass_kmeans_function = []
        for candle in training_data:
            list_to_pass_kmeans_function.append([
                candle.upper_shadow_length,
                candle.lower_shadow_length,
                candle.body_length,
                candle.color,
            ])
        return np.array(list_to_pass_kmeans_function)

    def get_clusters(self):
        return self.fcmeans.centers

    def get_labels_list(self):
        X = self.preprocess_daily_data_to_fit_fcmeans_library(self.training_data)
        return self.fcmeans.predict(X)

    def get_labels_for_each_data_point(self, data_point_index):
        return self.get_labels_list()[data_point_index]
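# A minimal usage sketch for FCmeansDaily, assuming a hypothetical Candle record
# that carries the four numeric attributes the preprocessor reads; the random
# values below stand in for real daily candlestick data.
from collections import namedtuple
import numpy as np

Candle = namedtuple('Candle', 'upper_shadow_length lower_shadow_length body_length color')

rng = np.random.default_rng(0)
candles = [Candle(rng.random(), rng.random(), rng.random(), int(rng.integers(0, 2)))
           for _ in range(200)]

model = FCmeansDaily(candles)
print(model.get_clusters())                     # the 5 cluster centers
print(model.get_labels_for_each_data_point(0))  # hard label of the first candle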
def fcm_alg(dataset_num, num_of_samples=None, num_of_clusters=None,
            get_figure=False, calc_ami_score=True, plot_anomaly=False):
    # when plotting anomalies, cap the sample size at 6000
    if plot_anomaly and (num_of_samples is None or num_of_samples > 6000):
        num_of_samples = 6000
    data = d.get_data(dataset_num, n_samples=num_of_samples)
    # get the data frame produced by reducing the original data set to 2 dimensions with PCA
    df = d.get_df_to_cluster(data)
    tag = d.get_tag(data)
    # if the number of clusters isn't given, use the "real" number of clusters (according to the tag)
    if num_of_clusters is None:
        num_of_clusters = d.get_num_of_clusters(tag)
    # create an FCM object with the relevant number of clusters and fit it to the data frame
    fcm = FCM(n_clusters=num_of_clusters)
    fcm.fit(df)
    # store the labels resulting from the fit
    labels = fcm.predict(df)
    if get_figure or plot_anomaly:
        if plot_anomaly:
            # mark points with a negative silhouette value as anomalies
            silhouettes = silhouette_samples(df, labels)
            labels[silhouettes < 0] = -1
        # plot the clustered data and the centroid of each cluster
        plt.scatter(df['PC1'][labels != -1], df['PC2'][labels != -1], c=labels[labels != -1])
        plt.scatter(df['PC1'][labels == -1], df['PC2'][labels == -1],
                    c=['black'] * len(labels[labels == -1]), label='Anomaly')
        plt.scatter(fcm.centers["PC1"], fcm.centers["PC2"], marker="*", label='centroid', c='black')
        plt.legend()
        title = 'DS{} - Fuzzy C Means'.format(dataset_num)
        # fig_name = 'Images\Fuzzy C Means\\' + title
        plt.title(title)
        # save the figure
        # plt.savefig(fig_name)
        plt.show()
    # calculate the adjusted mutual info score of the clustering
    if calc_ami_score:
        labels_true = d.get_labels(tag)
        return adjusted_mutual_info_score(labels_true=labels_true, labels_pred=labels)
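# The plot_anomaly branch above flags every point whose silhouette value is
# negative, i.e. points that sit closer to another cluster than to their own.
# A self-contained sketch of that idea on synthetic blobs (the data and names
# here are illustrative, not from the snippet above):
import matplotlib.pyplot as plt
from fcmeans import FCM
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples

X, _ = make_blobs(n_samples=600, centers=3, cluster_std=1.5, random_state=0)
fcm = FCM(n_clusters=3)
fcm.fit(X)
labels = fcm.predict(X)

# points with a negative silhouette fit another cluster better than their own
labels[silhouette_samples(X, labels) < 0] = -1

plt.scatter(X[labels != -1, 0], X[labels != -1, 1], c=labels[labels != -1], s=7)
plt.scatter(X[labels == -1, 0], X[labels == -1, 1], c='black', s=7, label='Anomaly')
plt.legend()
plt.show()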
def fuzzy_cmeans_clustering(self, show_plt=True):
    fcm = FCM(n_clusters=self.k)
    fcm.fit(self.data)
    labels = fcm.predict(self.data)
    if show_plt:
        plt.title('Fuzzy C Means')
        plt.scatter(self.data[:, 0], self.data[:, 1], c=labels, s=7, cmap='rainbow')
        plt.show()
    return metrics.adjusted_mutual_info_score(self.tags, labels)
def fuzzy_cmeans_clustering(dataset, tags, k, show_plt=True):
    fcm = FCM(n_clusters=k)
    fcm.fit(dataset)
    labels = fcm.predict(dataset)
    if show_plt:
        plt.title('Fuzzy C Means')
        plt.scatter(dataset[:, 0], dataset[:, 1], c=labels, s=7, cmap='rainbow')
        plt.show()
    return metrics.adjusted_mutual_info_score(tags, labels)
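# A quick way to exercise the standalone variant above; the synthetic blobs and
# their ground-truth labels stand in for the real dataset/tags, and an AMI near
# 1.0 is expected because the blobs are well separated.
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=500, centers=4, random_state=1)
ami = fuzzy_cmeans_clustering(X, y, k=4, show_plt=False)
print(f'AMI: {ami:.3f}')  # close to 1.0 for well-separated blobs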
def decompose(
        path_to_features: str,
        path_to_images: str,
        path_to_decomposed_images_1: str,
        path_to_decomposed_images_2: str,
        class_name: str,
        fc: int
):
    """
    Decomposition of extracted features using fuzzy c-means clustering.

    params:
        <string> path_to_features
        <string> path_to_images
        <string> path_to_decomposed_images_1
        <string> path_to_decomposed_images_2
        <string> class_name
        <int> fc: number of clusters
    """
    # Load features
    features = np.load(path_to_features)

    # Fuzzy c-means clustering of the features
    fcm = FCM(n_clusters=fc)
    fcm.fit(features)
    idx = fcm.predict(features)  # cluster index of each image

    # Images list
    images = os.listdir(path_to_images)

    # Iterate through images
    progress_bar = tqdm(range(len(images)))
    progress_bar.set_description(f"Decomposing {class_name} images")
    for i in progress_bar:
        filename = os.path.join(path_to_images, images[i])
        # Read image
        I = plt.imread(filename)
        filename_1 = os.path.join(path_to_decomposed_images_1, images[i])
        filename_2 = os.path.join(path_to_decomposed_images_2, images[i])
        # Write images from cluster 1 to the first folder and all others to the second.
        if idx[i] == 1:
            plt.imsave(filename_1, I)
        else:
            plt.imsave(filename_2, I)
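# A usage sketch for decompose; the paths and the .npy feature file are
# hypothetical. Note the implicit assumption that the feature rows are stored
# in the same order that os.listdir() returns the image filenames; with fc=2
# the predicted index is 0 or 1, matching the two output folders.
decompose(
    path_to_features='features/benign.npy',
    path_to_images='images/benign',
    path_to_decomposed_images_1='images/benign_cluster_1',
    path_to_decomposed_images_2='images/benign_cluster_2',
    class_name='benign',
    fc=2,
)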
def fuuzyc(train_x1, train_y):
    # reduce the features to 2 dimensions with PCA
    pca = PCA(n_components=2)
    pca.fit(train_x1)
    train_x1 = pca.transform(train_x1)
    # cluster with fuzzy c-means
    fcm = FCM(n_clusters=7)
    fcm.fit(train_x1)
    m = fcm.predict(train_x1)
    ps = purity_score(train_y, m)
    ss = silhouette_score(train_x1, m)
    a = f1_score(train_y.flatten(), m, average='weighted')
    # plot_clusters(train_x1, m)
    return ps, a, ss
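# purity_score is not a scikit-learn function; the fuuzyc variants in this
# section presumably define it elsewhere. A common implementation, sketched
# from the usual definition (count each cluster as its majority true label,
# then normalize by the number of samples):
import numpy as np
from sklearn.metrics.cluster import contingency_matrix

def purity_score(y_true, y_pred):
    # rows are true labels, columns are predicted clusters
    contingency = contingency_matrix(y_true, y_pred)
    return np.sum(np.amax(contingency, axis=0)) / np.sum(contingency)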
def fcm_alg(dataset_num, num_of_samples=10000, num_of_clusters=None, get_figure=False, calc_ami_score=True):
    data = d.get_data(dataset_num, n_samples=num_of_samples)
    # get the data frame produced by reducing the original data set to 2 dimensions with PCA
    df = d.get_df_to_cluster(data)
    tag = d.get_tag(data)
    # if the number of clusters isn't given, use the "real" number of clusters (according to the tag)
    if num_of_clusters is None:
        num_of_clusters = d.get_num_of_clusters(tag)
    # create an FCM object with the relevant number of clusters and fit it to the data frame
    fcm = FCM(n_clusters=num_of_clusters)
    fcm.fit(df)
    # store the labels resulting from the fit
    labels = fcm.predict(df)
    if get_figure:
        # plot the clustered data and the centroid of each cluster
        plt.scatter(df["PC1"], df["PC2"], c=labels)
        plt.scatter(fcm.centers["PC1"], fcm.centers["PC2"], marker="*", label='centroid', c='black')
        plt.legend()
        title = 'DS{} - Fuzzy C Means'.format(dataset_num)
        fig_name = 'images/dataset {}/'.format(dataset_num) + title + " ({} clusters)".format(num_of_clusters)
        plt.title(title)
        # save the figure
        plt.savefig(fig_name)
        plt.show()
    # calculate the adjusted mutual info score of the clustering
    if calc_ami_score:
        labels_true = d.get_labels(tag)
        return adjusted_mutual_info_score(labels_true=labels_true, labels_pred=labels)
def fuuzyc(train_x1, tag, gender, race):
    # reduce the features to 2 dimensions with PCA
    pca = PCA(n_components=2)
    pca.fit(train_x1)
    train_x1 = pca.transform(train_x1)
    # cluster with fuzzy c-means
    fcm = FCM(n_clusters=9)
    fcm.fit(train_x1)
    m = fcm.predict(train_x1)
    # purity against each of the three label sets
    ps = [purity_score(tag, m), purity_score(gender, m), purity_score(race, m)]
    ss = silhouette_score(train_x1, m)
    # weighted F1 against each of the three label sets
    a = [
        f1_score(tag.flatten(), m, average='weighted'),
        f1_score(gender.flatten(), m, average='weighted'),
        f1_score(race.flatten(), m, average='weighted'),
    ]
    # plot_clusters(train_x1, m)
    return ps, a, ss
def fuuzyc(train_x1, vtype, weekend, rev, c):
    # reduce the features to 2 dimensions with PCA
    pca = PCA(n_components=2)
    pca.fit(train_x1)
    train_x1 = pca.transform(train_x1)
    # cluster with fuzzy c-means
    fcm = FCM(n_clusters=c)
    fcm.fit(train_x1)
    m = fcm.predict(train_x1)
    # purity against each of the three label sets
    ps = [purity_score(vtype, m), purity_score(weekend, m), purity_score(rev, m)]
    ss = silhouette_score(train_x1, m)
    print(ss)
    print(ps)
    # weighted F1 against each of the three label sets
    a = [
        f1_score(vtype.flatten(), m, average='weighted'),
        f1_score(weekend.flatten(), m, average='weighted'),
        f1_score(rev.flatten(), m, average='weighted'),
    ]
    print(a)
    plot_clusters(train_x1, m)
    return ps, a, ss
def merge_clusters(rgb_array_in, counts_from_cluster, clsuter_1D_method='Diff'):
    use_std = True  # the second stage below is reported as not working as expected; set to False to disable it
    # work on a copy so the input is not modified (a float dtype would avoid overflow)
    rgb_array = rgb_array_in.copy()
    rgb_array = np.expand_dims(rgb_array, axis=0)
    hsv = cv2.cvtColor(rgb_array, cv2.COLOR_RGB2HSV)  # how about COLOR_BGR2HLS?
    hsv = np.squeeze(hsv, axis=0)
    if clsuter_1D_method == 'Diff':
        # hsv[:, 0] is the hue component
        labels = differntial_1D_cluster(hsv[:, 0])
    elif clsuter_1D_method == 'MeanSift':
        # use the hue value for the 1-D clustering
        result, ms = cluster_1D(hsv[:, 0:1])
        labels = result['labels']
    elif clsuter_1D_method == '2nd_fcm':  # not so good
        clf = FCM(n_clusters=13)
        clf.fit(rgb_array_in)
        labels = clf.predict(rgb_array_in)
        rgb_array = clf.centers.round()
        rgb_array = np.expand_dims(rgb_array, axis=0)
        counts_from_cluster = Counter(labels)
    else:
        print('Incorrect choice of clsuter_1D_method')
        labels = 0
    if use_std and clsuter_1D_method != '2nd_fcm':
        # second stage: decompose similar hue(s)
        labels = decompose_hue(labels, rgb_array_in.copy())
    # now, average similar colors
    rgb_array = average_similar_colors_pix_cnt(rgb_array, counts_from_cluster, labels)
    return rgb_array, labels
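# differntial_1D_cluster, cluster_1D, decompose_hue and
# average_similar_colors_pix_cnt are defined elsewhere in that project. Purely
# as an illustration of what a differential 1-D clusterer can look like (my
# sketch, not the project's code): sort the hue values, start a new cluster
# wherever consecutive values differ by more than a threshold, and label runs.
import numpy as np

def differential_1d_cluster_sketch(values, gap=10):
    order = np.argsort(values)
    sorted_vals = values[order]
    # a jump larger than `gap` between neighbors starts a new cluster
    breaks = np.diff(sorted_vals) > gap
    sorted_labels = np.concatenate([[0], np.cumsum(breaks)])
    # map the labels back to the original ordering
    labels = np.empty(len(values), dtype=int)
    labels[order] = sorted_labels
    return labels

hues = np.array([3, 5, 4, 90, 92, 170, 171, 169])
print(differential_1d_cluster_sketch(hues))  # [0 0 0 1 1 2 2 2]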
import time

import numpy as np
from PIL import Image
from fcmeans import FCM
from matplotlib import pyplot as plt


def RGBXY(image):
    # build a 5-D feature per pixel: normalized R, G, B plus normalized row/column position
    shape = list(image.shape)
    shape[2] = 5
    img = np.zeros(shape)
    img[:, :, :3] = image / 255
    indexes = np.array([[i, j] for i in range(shape[0]) for j in range(shape[1])])
    img[:, :, 3:] = indexes.reshape((shape[0], shape[1], 2))
    img[:, :, 3] = img[:, :, 3] / shape[0]
    img[:, :, 4] = img[:, :, 4] / shape[1]
    return img.reshape([-1, 5]), indexes


c = int(input("Number of clusters (c): "))
img = np.asarray(Image.open('1558014721_E7jyWs_iiit_d.jpg')).copy()
t1 = time.time()
features, indexes = RGBXY(img)

fcm = FCM(n_clusters=c)
fcm.fit(features)
fcm_centers = fcm.centers
fcm_labels = fcm.predict(features)
print(fcm_centers, fcm_centers.shape)

# replace each pixel with the RGB part of its cluster center
img = fcm_centers[fcm_labels, :3].reshape((img.shape[0], img.shape[1], 3))
print("Time taken: %f" % (time.time() - t1))

plt.imshow(img)
plt.axis('off')
plt.show()
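# Appending normalized (x, y) coordinates makes the clusters spatially compact
# as well as color-coherent, much like superpixel methods. A small variation
# worth noting (my sketch, not part of the script above): scale the two
# positional columns to trade color fidelity against spatial compactness.
import numpy as np

def rgbxy_weighted(image, spatial_weight=0.5):
    # spatial_weight = 0 ignores position entirely; larger values
    # push FCM toward spatially compact clusters
    h, w = image.shape[:2]
    rgb = image.reshape(-1, 3) / 255
    ys, xs = np.mgrid[0:h, 0:w]
    xy = np.stack([ys.ravel() / h, xs.ravel() / w], axis=1) * spatial_weight
    return np.hstack([rgb, xy])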
import pandas as pd
import matplotlib.pyplot as plt
from fcmeans import FCM

# Getting the data set
dataset = pd.read_csv(
    'D://Visual Exercise//Python//New folder//Fuzzy-C-Means Clustering//Fuzzy-C-Means Clustering//Mall_Customers.csv'
)
X = dataset.iloc[:, [3, 4]].values
print(X)

# fit the fuzzy-c-means
fcm = FCM(n_clusters=3, max_iter=150, random_state=0)
fcm.fit(X)
y_pred = fcm.predict(X)
print(y_pred)

# outputs: predict() and u.argmax(axis=1) give the same hard labels
fcm_centers = fcm.centers
fcm_labels = fcm.u.argmax(axis=1)
print(fcm_labels)

# Visualising the clusters
plt.scatter(X[y_pred == 0, 0], X[y_pred == 0, 1], s=100, c='red', label='Cluster 1')
plt.scatter(X[y_pred == 1, 0], X[y_pred == 1, 1], s=100, c='blue', label='Cluster 2')
plt.scatter(X[y_pred == 2, 0], X[y_pred == 2, 1], s=100, c='green', label='Cluster 3')
plt.scatter(fcm_centers[:, 0], fcm_centers[:, 1], s=300, c='yellow', label='Centroids')
plt.legend()
plt.show()
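# The membership matrix fcm.u is what makes the clustering fuzzy: row i holds
# the degree to which sample i belongs to each cluster, and each row sums to 1.
# A short sketch of inspecting it (variable names below are mine):
import numpy as np

memberships = fcm.u[:5]          # first five customers vs. the 3 clusters
print(memberships)
print(memberships.sum(axis=1))   # each row sums to 1

# flag "borderline" customers whose strongest membership is weak
uncertain = np.where(fcm.u.max(axis=1) < 0.5)[0]
print(f'{len(uncertain)} samples have no membership above 0.5')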
# tail of a loop over repeated k-means runs (reconstructed below)
inertia = km.inertia_
cents = km.cluster_centers_
cents_list.append(cents)
inert_list.append(inertia)

# Get best centroids to use for full clustering
best_cents = cents_list[inert_list.index(min(inert_list))]

# fit the fuzzy-c-means, seeded with the best k-means centroids
fcm = FCM(n_clusters=2, first_center=best_cents, max_iter=500, random_state=42)
fcm.fit(X)
probability = fcm.predict(x_add)
probability_df = pd.DataFrame(data=probability[:, 2], columns=['predict'])
cluster = collections.Counter(probability[:, 2])
print(cluster)
# name the bigger cluster after the class and treat the smaller one as outliers
if cluster[0.0] >= cluster[1.0]:
    probability_df.loc[probability_df['predict'] == 0.0, 'predict'] = name
    probability_df.loc[probability_df['predict'] == 1.0, 'predict'] = 'outlier'
else:
    probability_df.loc[probability_df['predict'] == 1.0, 'predict'] = name
    probability_df.loc[probability_df['predict'] == 0.0, 'predict'] = 'outlier'

result_df = pd.DataFrame(data=x_add, columns=data.columns)
result_df['actual'] = list(label)
result_df['predict'] = probability_df['predict'].tolist()
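# The fragment above assumes an earlier loop that ran k-means several times and
# recorded each run's inertia and centroids; note also that first_center is not
# a parameter of the published fcmeans package, so this snippet presumably uses
# a modified FCM. A sketch of what the missing warm-start loop might look like
# (my reconstruction, not the original code):
from sklearn.cluster import KMeans

cents_list, inert_list = [], []
for seed in range(10):
    # several single-init k-means restarts; keep every run's results
    km = KMeans(n_clusters=2, n_init=1, random_state=seed).fit(X)
    cents_list.append(km.cluster_centers_)
    inert_list.append(km.inertia_)

# the centroids of the lowest-inertia run seed the fuzzy clustering
best_cents = cents_list[inert_list.index(min(inert_list))]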
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
from shapely.geometry import Point, Polygon
from sklearn.cluster import KMeans
from fcmeans import FCM

colnames = [
    'ozone', 'particullate_matter', 'carbon_monoxide', 'sulfure_dioxide',
    'nitrogen_dioxide', 'longitude', 'latitude', 'timestamp'
]
df = pd.read_csv('./pollution data/combined_dataset.csv', names=colnames)
# plt.scatter(x=df['longitude'], y=df['latitude'])
# plt.show()

numOfClusters = 4
fcm = FCM(n_clusters=numOfClusters)
fcm.fit(df[['longitude', 'latitude']])
y_fcmeans = fcm.predict(df[['longitude', 'latitude']])

plt.scatter(df['longitude'], df['latitude'], c=y_fcmeans, s=50, cmap='viridis')
df['cluster_label'] = y_fcmeans
centers = fcm.centers
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
plt.show()
# df.to_csv('clustered_data.csv', index=None, header=True)
import numpy as np
from fcmeans import FCM
from matplotlib import pyplot as plt
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D

n_samples = 3000

df = pd.read_excel('120_data.xlsx', header=None)
X = df.to_numpy()

fcm = FCM(n_clusters=3)
fcm.fit(X)

# outputs
fcm_centers = fcm.centers
fcm_labels = fcm.predict(X)

# plot result
fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=fcm_labels, cmap='Set1')
plt.show()
        # (continuation of the predefined initial cluster-centers array)
        13.051967221349628, 4.065577055191767, 29.922603051099713,
        7.58443241651873, 25286.235250015743, 17660.494008300826,
        21851.365482692607, 14800.013759329424, 69.73815264740048,
        5.022356752792227, 1233.9959969564013, 4.065577055191767,
        7.58443241651873, 13.311492579165701, 35.45257318447153,
        1053.2675415630874
]]

# fit the fuzzy-c-means, seeded with the predefined centers
fcm = FCM(n_clusters=2, first_center=centers, max_iter=100)
fcm.fit(X_train)

# outputs: the first center is the 'Benign' centroid, the second the 'attack' centroid
fcm_centers = fcm.centers
fcm_labels = fcm.u.argmax(axis=1)

probability = fcm.predict(X_test)
result_df = pd.DataFrame(data=probability, columns=[0, 1, 'pre_class'])
result_df['class'] = y_test

print(color.BOLD + "\nValidate records in cluster (find invalid records)" + color.END)
dif_data = check_different(result_df)
print(dif_data.shape)

# extract the records that satisfy the probability threshold (pt): 0.4 <= pt <= 0.6
small_num = [round(i * 0.01, 2) for i in range(1, 50)]
small_num.reverse()
big_num = [round(i * 0.01, 2) for i in range(51, 101)]
result_data = pd.DataFrame(index=range(0, len(big_num)),
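# The thresholds built above suggest filtering out records whose cluster
# membership is ambiguous (around 0.5). A minimal sketch of that idea using the
# published fcmeans API, where fcm.u holds the training-set membership matrix
# (names below are mine). With two clusters the strongest membership is always
# at least 0.5, so the ambiguous band is effectively [0.5, 0.6].
import numpy as np

memberships = fcm.u  # one row per training record, one column per cluster
strongest = memberships.max(axis=1)
ambiguous = (strongest >= 0.4) & (strongest <= 0.6)
print(f'{ambiguous.sum()} ambiguous records out of {len(memberships)}')

# keep only the confidently-clustered records for downstream validation
confident_idx = np.where(~ambiguous)[0]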
def compute_centers_PSC(language, labels, num_protos=10):
    encoding = torch.load(f'logs/train_Encodings_{language}.pt')
    # hate_usage = torch.load(f'logs/train_pred_{language}.pt')

    # average each example's token encodings into a single point
    points = np.zeros((encoding.shape[0], encoding.shape[-1]))
    for i in range(len(points)):
        well = 0
        for j in range(encoding.shape[1]):
            # if hate_usage[i][j] == labels[i]:
            points[i] += encoding[i][j]
            well += 1.0
        points[i] /= well

    # fuzzy c-means over the averaged encodings
    fcm = FCM(n_clusters=num_protos)
    fcm.fit(points)
    fcm_labels = fcm.predict(points)

    # plot result
    # plt.scatter(X[:, 0], X[:, 1], c=fcm_labels, alpha=.1)

    protos = []
    for i in range(num_protos):
        idx = list(np.where(fcm_labels == i)[0].reshape(-1))
        # check whether all points of the cluster share the same label
        homogeneus = True
        for j in idx:
            if labels[j] != labels[idx[0]]:
                homogeneus = False
                break
        if homogeneus:
            # homogeneous cluster: keep the point closest to the cluster mean
            midle = points[idx].sum(axis=0) / len(idx)
            protos.append(idx[0])
            closeness = None
            for j in range(len(idx)):
                d = cosine_similarity(midle.reshape(1, points.shape[1]),
                                      points[idx[j]].reshape(1, points.shape[1]))
                if closeness is None or closeness < d:
                    closeness = d
                    protos[-1] = idx[j]
        else:
            # mixed cluster: pick prototypes from both classes near the boundary
            Major_class = 0
            if labels[idx].sum() > len(idx) / 2:
                Major_class = 1
            for j in range(len(idx)):
                if labels[idx[j]] == Major_class:
                    new_p = None
                    closeness = None
                    for k in range(len(idx)):
                        if labels[idx[k]] != Major_class:
                            d = cosine_similarity(points[idx[j]].reshape(1, points.shape[1]),
                                                  points[idx[k]].reshape(1, points.shape[1]))
                            if closeness is None or closeness < d:
                                closeness = d
                                new_p = idx[k]
                    protos.append(new_p)
                    new_p = None
                    closeness = None
                    for k in range(len(idx)):
                        if labels[idx[k]] == Major_class:
                            d = cosine_similarity(points[protos[-1]].reshape(1, points.shape[1]),
                                                  points[idx[k]].reshape(1, points.shape[1]))
                            if closeness is None or closeness < d:
                                closeness = d
                                new_p = idx[k]
                    protos.append(new_p)
    protos = list(set(protos))

    # split the prototypes by class
    P_set = []
    N_set = []
    for i in protos:
        if labels[i] == 1:
            P_set.append(i)
        else:
            N_set.append(i)
    print(f'{bcolors.BOLD}Computed prototypes {language}:\t{len(protos)}\n'
          f'Negative: {len(N_set)} Positive: {len(P_set)}{bcolors.ENDC}')

    # 2-D t-SNE projection for visualization
    P_idx = list(np.argwhere(labels == 1).reshape(-1))
    N_idx = list(np.argwhere(labels == 0).reshape(-1))
    Z = TSNE(n_components=2).fit_transform(points)
    P = Z[P_idx]
    N = Z[N_idx]
    C = Z[P_set]
    F = Z[N_set]
    plt.scatter(P[:, 0], P[:, 1], c='c', label='Pos', alpha=.5)
    plt.scatter(N[:, 0], N[:, 1], c='r', label='Neg', alpha=.3)
    plt.scatter(C[:, 0], C[:, 1], c='0', label='Proto_Pos', alpha=.7)
    plt.scatter(F[:, 0], F[:, 1], c='#723a91', label='Proto_Neg', alpha=.7)
    plt.legend(loc=1)
    plt.savefig(f'logs/protos_{language}.png')
    plt.close()
    return P_set, N_set