def produce_clusterings(self):
    """Cluster the stored representativities, then save and plot the result.

    Builds a ``kmeans`` object from ``self.representativities``,
    ``self.graphlets_per_graph`` and ``self.nb_clusters``, runs it, writes
    its results under ``<self.Results>/Clusterings/<nb_clusters>_classes``
    and calls ``plot_kiviat()`` on a ``kiviat`` object built for that folder.
    """
    km = kmeans(self.representativities, self.graphlets_per_graph,
                self.nb_clusters)
    km.to_string()
    # The returned labels are not used here; the call is kept because
    # write_results() presumably depends on compute() having run.
    km.compute()

    folder = self.Results + '/Clusterings/%s_classes' % self.nb_clusters
    # makedirs(exist_ok=True) is idempotent and also creates missing parents,
    # so no listdir() pre-check is needed (it raised FileNotFoundError when
    # the 'Clusterings' directory did not exist yet).
    makedirs(folder, exist_ok=True)

    km.write_results(folder)
    radar = kiviat(folder)
    radar.plot_kiviat()
def _weighted_densities(xs, pis, mus, sigmas, k):
    """Return a (k, n) array with entry [j, i] = pis[j] * N(xs[i]; mus[j], sigmas[j])."""
    densities = np.zeros((k, xs.shape[0]))
    for j in range(len(mus)):
        densities[j] = pis[j] * multivariate_normal.pdf(xs, mus[j], sigmas[j])
    return densities


def kmix(xs, k, tolerance=0.01, max_iter=100, verbose=True, init='random'):
    """Fit a k-component Gaussian mixture to ``xs`` with EM and group the points.

    Parameters
    ----------
    xs : 2-D array of shape (n, p)
        n data points of dimension p.
    k : int
        Number of mixture components.
    tolerance : float
        Iteration stops once the log-likelihood changes by at most this
        amount between two consecutive iterations.
    max_iter : int
        Upper bound on the number of EM iterations.
    verbose : bool
        Print progress messages when true.
    init : str
        ``'kmeans'`` takes the initial means from the keys of
        ``kMeans.kmeans(xs, k, plus=True, ...)``; any other value uses
        ``random_points(xs, k)``.

    Returns
    -------
    dict
        Maps each component mean (as a tuple) to the list of rows of ``xs``
        whose weighted density is highest for that component. Components
        that win no point do not appear.
    """
    n, p = xs.shape  # n total data points, p is dimension
    if init == 'kmeans':
        if verbose:
            print('Initializing points with K-Means++.')
        mus = list(kMeans.kmeans(xs, k, plus=True, verbose=verbose).keys())
    else:
        if verbose:
            print('Initializing points randomly.')
        mus = random_points(xs, k)

    # Identity covariances sized from the data instead of a fixed 2x2 matrix.
    sigmas = [np.eye(p)] * k
    pis = [1 / k] * k
    ll_old = 0

    for iteration in range(max_iter):
        if verbose:
            print('Iteration {} | loglikelihood {}'.format(iteration, ll_old))

        # E-step: pij is k x n; each column is normalised to sum to one.
        pij = _weighted_densities(xs, pis, mus, sigmas, k)
        pij /= pij.sum(0)

        # M-step. Mixing weights: total responsibility per cluster over n.
        weights = pij.sum(axis=1)
        pis = weights / n

        # Means: responsibility-weighted average of the points.
        mus = np.dot(pij, xs) / weights[:, np.newaxis]

        # Covariances: responsibility-weighted average of outer products.
        sigmas = np.zeros((k, p, p))
        for j in range(k):
            centered = xs - mus[j]
            sigmas[j] = np.dot(pij[j] * centered.T, centered) / weights[j]

        # Log-likelihood of the data under the updated parameters.
        ll_new = np.log(
            _weighted_densities(xs, pis, mus, sigmas, k).sum(0)).sum()
        if np.abs(ll_new - ll_old) <= tolerance:
            break
        ll_old = ll_new

    # Hard assignment: each point goes to its highest weighted density
    # (argmax keeps the first maximum, like list.index(max(...))).
    labels = _weighted_densities(xs, pis, mus, sigmas, k).argmax(axis=0)
    return_dict = {}
    for point, label in zip(xs, labels):
        return_dict.setdefault(tuple(mus[label]), []).append(point)
    return return_dict
"""Demo script: load 2-D points from a text file, cluster them, show the result."""
__author__ = 'I322233'

from numpy import *
import time
import matplotlib.pyplot as plt
import kMeans as km

## step 1: load data
print("step 1: load data...")
dataSet = []
## read file
# The context manager closes the handle; the file was previously left open.
with open('kmeans_test_set.txt') as fileIn:
    for line in fileIn:
        # split() with no argument tolerates tabs and repeated spaces.
        lineArr = line.split()
        if not lineArr:
            continue  # skip blank lines instead of failing on float('')
        dataSet.append([float(lineArr[0]), float(lineArr[1])])

## step 2: clustering...
print("step 2: clustering...")
## change array to matrix
dataSet = mat(dataSet)
k = 4
centroids, clusterAssment = km.kmeans(dataSet, k)

## step 3: show the result
print("step 3: show the result...")
km.showCluster(dataSet, k, centroids, clusterAssment)
def _weighted_densities(xs, pis, mus, sigmas, k):
    """Return a (k, n) array with entry [j, i] = pis[j] * N(xs[i]; mus[j], sigmas[j])."""
    densities = np.zeros((k, xs.shape[0]))
    for j in range(len(mus)):
        densities[j] = pis[j] * multivariate_normal.pdf(xs, mus[j], sigmas[j])
    return densities


def kmix(xs, k, tolerance=0.01, max_iter=100, verbose=True, init='random'):
    """Fit a k-component Gaussian mixture to ``xs`` with EM and group the points.

    Parameters
    ----------
    xs : 2-D array of shape (n, p)
        n data points of dimension p.
    k : int
        Number of mixture components.
    tolerance : float
        Iteration stops once the log-likelihood changes by at most this
        amount between two consecutive iterations.
    max_iter : int
        Upper bound on the number of EM iterations.
    verbose : bool
        Print progress messages when true.
    init : str
        ``'kmeans'`` takes the initial means from the keys of
        ``kMeans.kmeans(xs, k, plus=True, ...)``; any other value uses
        ``random_points(xs, k)``.

    Returns
    -------
    dict
        Maps each component mean (as a tuple) to the list of rows of ``xs``
        whose weighted density is highest for that component. Components
        that win no point do not appear.
    """
    n, p = xs.shape  # n total data points, p is dimension
    if init == 'kmeans':
        if verbose:
            print('Initializing points with K-Means++.')
        mus = list(kMeans.kmeans(xs, k, plus=True, verbose=verbose).keys())
    else:
        if verbose:
            print('Initializing points randomly.')
        mus = random_points(xs, k)

    # Identity covariances sized from the data instead of a fixed 2x2 matrix.
    sigmas = [np.eye(p)] * k
    pis = [1 / k] * k
    ll_old = 0

    for iteration in range(max_iter):
        if verbose:
            print('Iteration {} | loglikelihood {}'.format(iteration, ll_old))

        # E-step: pij is k x n; each column is normalised to sum to one.
        pij = _weighted_densities(xs, pis, mus, sigmas, k)
        pij /= pij.sum(0)

        # M-step. Mixing weights: total responsibility per cluster over n.
        weights = pij.sum(axis=1)
        pis = weights / n

        # Means: responsibility-weighted average of the points.
        mus = np.dot(pij, xs) / weights[:, np.newaxis]

        # Covariances: responsibility-weighted average of outer products.
        sigmas = np.zeros((k, p, p))
        for j in range(k):
            centered = xs - mus[j]
            sigmas[j] = np.dot(pij[j] * centered.T, centered) / weights[j]

        # Log-likelihood of the data under the updated parameters.
        ll_new = np.log(
            _weighted_densities(xs, pis, mus, sigmas, k).sum(0)).sum()
        if np.abs(ll_new - ll_old) <= tolerance:
            break
        ll_old = ll_new

    # Hard assignment: each point goes to its highest weighted density
    # (argmax keeps the first maximum, like list.index(max(...))).
    labels = _weighted_densities(xs, pis, mus, sigmas, k).argmax(axis=0)
    return_dict = {}
    for point, label in zip(xs, labels):
        return_dict.setdefault(tuple(mus[label]), []).append(point)
    return return_dict
# NOTE(review): trailing statement of a definition that begins outside the
# visible text; kept unchanged.
plt.clf()


def graph_it(data, display=True, file_name=None, verbose=True):
    '''
    Plot clustered data, dispatching on the dimension of the points.

    The data should be in the form of a dictionary with centers as keys and
    values arrays
    file_name, if passed, will write the graph to the output file
    display, if True, will display the graph upon completion

    Returns whatever graph2d / graph3d returns for 2-D / 3-D points, and
    None for any other dimension (nothing is drawn in that case).
    '''
    # The dimension is read from the first point of the first cluster.
    first_cluster = list(data.values())[0]
    p = len(first_cluster[0])
    if p == 2:
        return graph2d(data, display, file_name, verbose)
    if p == 3:
        # Returned as well, so both supported dimensions behave alike.
        return graph3d(data, display, file_name, verbose)
    return None


#Examples
graph_it(kMeans.kmeans(toydata, 3, plus = False))
# graph_it(kMixture.kmix(toydata, 3, init = 'kmeans'), file_name = 'lol.png')
# graph_it(kMeans.kmeans(np.random.rand(1000,2), 8, plus = True))
# toydata = pd.read_csv("Data/3Ddata.txt", sep=r"\s+", header = None)
# print(toydata)
# toydata = np.array([toydata.ix[:,0],toydata.ix[:,1], toydata.ix[:,2]]).transpose()
# graph3d(kMeans.kmeans(toydata, 3, plus = True))


# NOTE(review): this definition appears to continue beyond the visible text;
# the visible part is kept unchanged.
def clusterer(n, clusters):
    per = n // clusters
    final = np.array([0,0])
    for i in range(clusters):
        center = np.random.rand(1,2)
        dists = np.random.rand(per, 1)
def main():
    """Build bag-of-centroids features from a Word2Vec model and classify reviews.

    Loads the saved Word2Vec model, clusters its word vectors with
    ``kMeans.kmeans``, turns every cleaned review into a vector of
    per-cluster word counts, passes the training and test vectors to
    ``randomForestClassifier.rfClassifer`` and writes the result to
    ``Doc2Vec_Clustering.csv``.
    """
    modelName = "Word2VectforNLPTraining"
    model = Word2Vec.load(modelName)
    # model.init_sims(replace=True)
    word_vectors = model.syn0
    # print(word_vectors[0])
    # One cluster per five vocabulary entries.
    num_clusters = word_vectors.shape[0] // 5
    # print("number of clusters: {}".format(num_clusters))
    # input("Press enter to continue:")

    print("Clustering...")
    startTime = time.time()
    cluster_index = kMeans.kmeans(num_clusters, word_vectors)
    endTime = time.time()
    print("Time taken for clustering: {} seconds".format(endTime - startTime))

    # create a word/index dictionary, mapping each vocabulary word to a cluster number
    index_word_map = dict(zip(model.index2word, cluster_index))

    def create_bag_of_centroids(reviewData):
        """
        Count, per cluster, how many words of the review fall into it.

        Words that are not in the vocabulary map are ignored.

        :param reviewData: iterable of the words of one review
        :return: numpy array of length num_clusters, one count per cluster
        """
        # Builtin float replaces the np.float alias removed in NumPy 1.24.
        featureVector = np.zeros(num_clusters, dtype=float)
        for word in reviewData:
            if word in index_word_map:
                featureVector[index_word_map[word]] += 1
        return featureVector

    train = pd.read_csv("/path/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
    test = pd.read_csv("/path/testData.tsv", header=0, delimiter="\t", quoting=3)

    trainingDataFV = np.zeros((train["review"].size, num_clusters), dtype=float)
    testDataFV = np.zeros((test["review"].size, num_clusters), dtype=float)

    print("Processing training data...")
    cleaned_training_data = processData.clean_data(train)
    for row, review in enumerate(cleaned_training_data):
        trainingDataFV[row] = create_bag_of_centroids(review)

    print("Processing test data...")
    cleaned_test_data = processData.clean_data(test)
    for row, review in enumerate(cleaned_test_data):
        testDataFV[row] = create_bag_of_centroids(review)

    n_estimators = 100
    result = randomForestClassifier.rfClassifer(
        n_estimators, trainingDataFV, train["sentiment"], testDataFV)

    output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
    output.to_csv("Doc2Vec_Clustering.csv", index=False, quoting=3)
# NOTE(review): the statements down to plt.clf() are the tail of a definition
# that begins outside the visible text; kept unchanged. The nesting shown
# here is reconstructed — confirm against the full file.
centers = [0] * len(key)
for key in data.keys():
    for i in range(len(centers)):
        centers[i] = key[i]
    ax.scatter(centers[0], centers[1], centers[2], marker='o', c='black')
if file_name != None:
    fig.savefig(file_name)  #, bbox_inches='tight')
if display == True:
    plt.show()
plt.clf()


def graph_it(data, display=True, file_name=None, verbose=True):
    '''
    Plot clustered data, dispatching on the dimension of the points.

    The data should be in the form of a dictionary with centers as keys and
    values arrays
    file_name, if passed, will write the graph to the output file
    display, if True, will display the graph upon completion

    Returns whatever graph2d / graph3d returns for 2-D / 3-D points, and
    None for any other dimension (nothing is drawn in that case).
    '''
    # The dimension is read from the first point of the first cluster.
    first_cluster = list(data.values())[0]
    p = len(first_cluster[0])
    if p == 2:
        return graph2d(data, display, file_name, verbose)
    if p == 3:
        # Returned as well, so both supported dimensions behave alike.
        return graph3d(data, display, file_name, verbose)
    return None


#Examples
graph_it(kMeans.kmeans(toydata, 3, plus=True))
# graph_it(kMixture.kmix(toydata, 3, init = 'kmeans'), file_name = 'lol.png')
# graph_it(kMeans.kmeans(np.random.rand(1000,3), 4, plus = True))
def test_kmeans(self):
    """Smoke-run kmeans with k=4 on testSet.txt and print the centroids."""
    # Hard to assert on exact values: the initial centroids are chosen
    # randomly, so this only checks that the call runs and unpacks.
    dataset = load_dataset("testSet.txt")
    centroids, cluster_assignment = kmeans(dataset, 4)
    # Function-call form works on both Python 2 and Python 3.
    print(centroids)