def init_q_with_kmeans(self, data):
    """Initialise the EM responsibilities with a hard K-means assignment.

    Parameters
    ----------
    data : np.array of shape (nb_samples, nb_features)
        Samples on which EM will be run.

    Side effect: sets ``self.q_e_step`` to a one-hot (nb_samples, k) matrix
    where entry (i, c) is 1 iff K-means assigned sample i to cluster c.
    """
    n_samples = data.shape[0]
    km = KMeans(self.k)
    km.fit(data)
    labels = km.predict(data)
    # One-hot encode every sample's cluster in a single fancy-indexed write.
    self.q_e_step = np.zeros((n_samples, self.k))
    self.q_e_step[np.arange(n_samples), labels] = 1
def _init_parameters(self, data):
    """Initialise the GMM parameters (mu, pi, q, sigma) before running EM.

    Parameters
    ----------
    data : np.ndarray of shape (n_samples, n_features)
        Training samples.

    Side effects: sets ``self.mu`` (k, n_features), ``self.pi`` (list of k
    mixture weights), ``self.q`` (n_samples, k responsibilities) and
    ``self.sigma`` (k, n_features, n_features covariances), according to
    ``self.init`` ('random' or 'kmeans') and ``self.format_covariance``
    ('isotropic' or 'general').
    """
    n_samples, n_features = data.shape
    if self.init == 'random':
        # Means are k distinct random samples; weights and responsibilities uniform.
        self.mu = data[np.random.choice(n_samples, self.k, replace=False)]
        self.pi = [1 / self.k for _ in range(self.k)]
        self.q = 1 / self.k * np.ones((n_samples, self.k))
    elif self.init == 'kmeans':
        clf = KMeans(k=self.k, random_seed=self.random_seed, init='kmeans++')
        clf.fit(data)
        self.mu = clf.centers
        self.pi = [np.sum(clf.labels == j) / n_samples for j in range(self.k)]
        # Hard (one-hot) responsibilities from the K-means labels.
        self.q = np.zeros((n_samples, self.k))
        for index, label in np.ndenumerate(clf.labels):
            self.q[index, int(label)] = 1
    self.sigma = np.zeros((self.k, n_features, n_features))
    if self.format_covariance == 'isotropic':
        for j in range(self.k):
            # BUG FIX: the divisor was hard-coded to 2, which is only correct
            # for 2-D data. The weighted MLE of an isotropic variance is
            # sum_i q_ij * ||x_i - mu_j||^2 / (n_features * sum_i q_ij).
            sigma_squared = sum(
                self.q[i, j] * np.dot(x_i - self.mu[j, :], x_i - self.mu[j, :])
                for i, x_i in enumerate(data)
            ) / (n_features * np.sum(self.q[:, j]))
            self.sigma[j] = sigma_squared * np.identity(n_features)
    elif self.format_covariance == 'general':
        for j in range(self.k):
            mu_j = self.mu[j, :].reshape((-1, 1))
            # Weighted outer-product estimate of the full covariance.
            self.sigma[j] = sum(
                self.q[i, j]
                * (x_i.reshape((-1, 1)) - mu_j).dot(x_i.reshape((-1, 1)).T - mu_j.T)
                for i, x_i in enumerate(data)
            ) / np.sum(self.q[:, j])
import pandas as pd
from Kmeans import KMeans
from Plot import Plot

# Load the mall-customer data and keep columns 3 and 4 as the feature matrix
# (presumably annual income and spending score — confirm against the CSV).
dataset = pd.read_csv('data/Mall_Customers.csv')
samples = dataset.iloc[:, [3, 4]].values

# Cluster into 5 groups with the project's K-means implementation.
model = KMeans(k=5)
cluster_labels = model.predict(samples)

# Visualise the resulting assignment in 2-D.
plotter = Plot()
plotter.plot_in_2d(samples, cluster_labels, title="K-Means Clustering")
def parseDM(filepath="data.csv"):
    """Parse a document matrix CSV into word/topic/place vocabularies and rows.

    NOTE(review): the original chunk was missing this ``def`` header and the
    call site below is ``parseDM()`` with no argument, so the default path is
    a placeholder — confirm against the real data file.

    Returns a dict with:
      - "word_list":  header items containing no underscore (minus 'Article #')
      - "topic_list": 't_'-prefixed header items, prefix stripped
      - "place_list": 'p_'-prefixed header items, prefix stripped
      - "matrix":     data rows as [article_id] + integer counts
    """
    matrix = []
    word_list = []
    topic_list = []
    place_list = []
    dataMatrix = []  # BUG FIX: was appended to without ever being initialised
    # BUG FIX: csv files must be opened in text mode with newline='' under
    # Python 3; the original 'rb' mode is a Python-2 idiom.
    with open(filepath, 'r', newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=',', quotechar='"')
        for row in reader:
            dataMatrix.append(row)
    # The second row holds the column labels: plain words, 't_' topics, 'p_' places.
    for item in dataMatrix[1]:
        if "_" not in item:
            word_list.append(item)
        elif "t_" in item:
            topic_list.append(item[2:])
        elif "p_" in item:
            place_list.append(item[2:])
    word_list = word_list[1:]  # Remove 'Article #'
    for row in dataMatrix[2:]:
        # BUG FIX: in Python 3 `map` returns an iterator, so `list + map`
        # raises TypeError; materialise it with list().
        matrix.append([row[0]] + list(map(int, row[1:])))
    return {"topic_list": topic_list, "word_list": word_list,
            "place_list": place_list, "matrix": matrix}


# Guarded so importing this module does not read argv or run the clustering.
if __name__ == "__main__":
    data = parseDM()
    clusters = int(sys.argv[1])
    dist_type = int(sys.argv[2])
    kmeans = KMeans(clusters, dist_type, data)
    kmeans.get_clusters()
# NOTE(review): this chunk starts mid-function — the opening of the CSV
# parser (its def line, the list initialisations and the reader loop that
# fills dataMatrix/word_list) is cut off, so the elif/return below belong
# to that missing enclosing function.
        elif "t_" in item:
            # Topic columns are prefixed 't_'; keep the bare topic name.
            topic_list.append(item[2:])
        elif "p_" in item:
            # Place columns are prefixed 'p_'; keep the bare place name.
            place_list.append(item[2:])
    word_list = word_list[1:]  # Remove 'Article #'
    words_topics_size = len(topic_list) + len(word_list)  # NOTE(review): unused below
    for row in dataMatrix[2:]:
        # NOTE(review): Python-2 idiom — under Python 3 `map` returns an
        # iterator, so `list + map` raises TypeError here.
        matrix.append( [row[0]] + map(int, row[1:]) )
    return {"topic_list":topic_list, "word_list": word_list, "place_list":place_list, "matrix": matrix}


def write_clusters(clusters):
    """Write one cluster per line (space-separated members) to
    Kmeans_clusters_<number_of_clusters>.txt."""
    file_name = "Kmeans_clusters_" + str(len(clusters)) + ".txt"
    f = open(file_name, "w")  # NOTE(review): a `with` block would guarantee the close
    for cluster in clusters:
        f.write(' '.join(map(str,cluster)))
        f.write('\n')
    f.close()


''' Main '''
data = parseDM()
clusters = int(sys.argv[1])   # number of clusters, from the command line
dist_type = int(sys.argv[2])  # distance-metric selector, from the command line
kmeans = KMeans(clusters, dist_type, data)
clusters_ind = kmeans.get_clusters()
write_clusters(clusters_ind)
# Benchmark of the project's KMeans against scikit-learn's on a blob dataset.
from Kmeans import KMeans
import timeit

import numpy as np
# BUG FIX: make_blobs (and numpy) were used below without being imported,
# so the script raised NameError. scikit-learn is already the de-facto
# dependency (see the commented reference benchmark).
from sklearn.datasets import make_blobs

start = timeit.default_timer()

# scikit-learn reference (~0.08 s on the same data):
# km = KMeans()
# X, y = make_blobs(centers=3, n_samples=500, n_features=2, shuffle=True, random_state=17)
# print(X.shape)
# y_pred = km.fit_predict(X)

X, y = make_blobs(centers=3, n_samples=500, n_features=2, shuffle=True, random_state=17)
print(X.shape)

clusters = len(np.unique(y))
print(clusters)

k = KMeans(K=clusters, max_iters=150, plot_steps=False)
y_pred = k.fit_predict(X)
k.plot()
# Runs in ~1.8 s — slower than scikit-learn's C implementation, but acceptable.

stop = timeit.default_timer()
print('Time: ', stop - start)
# NOTE(review): this chunk begins mid-statement — the enclosing function's
# def line, the list initialisations and the `with open(...)` that produced
# `csv_file` are cut off from this view; the `return` below belongs to that
# missing function.
    reader = csv.reader(csv_file, delimiter=',', quotechar='"')
    for row in reader:
        dataMatrix.append(row)
    # The second row holds the column labels: plain words, 't_'-prefixed
    # topics and 'p_'-prefixed places.
    for item in dataMatrix[1]:
        if "_" not in item:
            word_list.append(item)
        elif "t_" in item:
            topic_list.append(item[2:])
        elif "p_" in item:
            place_list.append(item[2:])
    word_list = word_list[1:]  # Remove 'Article #'
    words_topics_size = len(topic_list) + len(word_list)  # NOTE(review): unused below
    for row in dataMatrix[2:]:
        # NOTE(review): Python-2 idiom — under Python 3 `map` returns an
        # iterator, so `list + map` raises TypeError here.
        matrix.append([row[0]] + map(int, row[1:]))
    return {
        "topic_list": topic_list,
        "word_list": word_list,
        "place_list": place_list,
        "matrix": matrix
    }


data = parseDM()
clusters = int(sys.argv[1])   # number of clusters, from the command line
dist_type = int(sys.argv[2])  # distance-metric selector, from the command line
kmeans = KMeans(clusters, dist_type, data)
kmeans.get_clusters()