def new_centroids(data, centroids, clusters, clusters_id, num_of_iterations, num_of_clusters, iteration_count):
    """Recompute cluster centroids and either report convergence or recurse.

    Each new centroid is the mean of the points currently assigned to that
    cluster. If the centroids stopped moving, or the iteration budget is
    exhausted, prints Jaccard/Rand scores against the ground-truth labels in
    column 1 of `data` and scatter-plots a 2-D PCA projection; otherwise
    recurses into `kmeans` with the updated centroids.

    Parameters
    ----------
    data : ndarray — rows are samples; column 1 holds ground-truth labels,
        columns 2.. hold the features (TODO confirm layout against caller).
    centroids : array-like — current centroids, one row per cluster.
    clusters : list of point-lists, one entry per cluster.
    clusters_id : list of id-lists, parallel to `clusters`.
    num_of_iterations : int — maximum number of k-means iterations.
    num_of_clusters : int — k.
    iteration_count : int — current iteration number.
    """
    # Mean of each cluster's member points becomes the new centroid.
    new_centroid = []
    for i in range(len(clusters)):
        val = np.array(clusters[i])
        new_centroid.append(np.sum(val, 0) / len(clusters[i]))
    new_centroid = np.array(new_centroid)

    # Convergence test: total difference between old and new centroids.
    # NOTE(review): this is a SIGNED sum, so opposite movements can cancel;
    # np.sum(np.abs(...)) would be stricter. Kept as-is to preserve behavior.
    sums = np.sum(np.array(centroids) - np.array(new_centroid))

    d = dict()
    # Fix: was `iterations` (an unrelated global); use the parameter.
    if sums == 0 or iteration_count == num_of_iterations:
        print("Converged")
        # Map each point id to its (1-based) cluster label.
        for x in range(len(clusters_id)):
            for y in range(len(clusters_id[x])):
                d[clusters_id[x][y]] = x + 1
        # Predicted labels in ascending point-id order, aligned with data rows.
        vals = [d[x] for x in sorted(d.keys())]
        ground_truth = list(map(int, data[:, 1]))
        print("Jaccard")
        ja = helpers.jaccard(ground_truth, vals)
        print(ja)
        print("Rand index")
        rd = helpers.rand(ground_truth, vals)
        print(rd)
        unique_predicted = list(set(vals))
        # Project features (columns 2..) to 2-D for visualization.
        new_x = helpers.pca(data[:, 2:])
        helpers.scatter(new_x[:, 0], new_x[:, 1], vals, unique_predicted, "K-means Algorithm", "iyer.txt")
    else:
        # Fix: was `iterations` / `no_cluster` (undefined in this scope);
        # use the function's own parameters.
        kmeans(data, new_centroid, num_of_iterations, num_of_clusters, iteration_count)
def new_centroids(reducedSpace, centroids, clusters, clusters_id, num_of_iterations, num_clusters, iteration_count):
    """Recompute centroids in PCA-reduced space; report scores on convergence.

    Variant of the k-means update that operates on `reducedSpace` and scores
    against the module-level `Groundtruth` / `GeneExpressions` globals
    (defined elsewhere in the file — TODO confirm).

    Parameters
    ----------
    reducedSpace : ndarray — points in the (PCA-)reduced feature space.
    centroids : array-like — current centroids, one row per cluster.
    clusters : list of point-lists, one entry per cluster.
    clusters_id : list of id-lists, parallel to `clusters`.
    num_of_iterations : int — maximum number of iterations.
    num_clusters : int — k.
    iteration_count : int — current iteration number.
    """
    # New centroid = mean of the points assigned to each cluster.
    new_centroid = []
    for i in range(len(clusters)):
        val = np.array(clusters[i])
        new_centroid.append(np.sum(val, 0) / len(clusters[i]))
    new_centroid = np.array(new_centroid)

    # NOTE(review): signed sum — opposite centroid movements can cancel out;
    # np.sum(np.abs(...)) would be a stricter convergence test.
    sums = np.sum(np.array(centroids) - np.array(new_centroid))

    d = dict()
    # Fix: was the global `iterations`; use the parameter.
    if sums == 0 or iteration_count == num_of_iterations:
        print("Converged")
        # Map point ids to 1-based cluster labels, then order by point id.
        for x in range(len(clusters_id)):
            for y in range(len(clusters_id[x])):
                d[clusters_id[x][y]] = x + 1
        vals = [d[x] for x in sorted(d.keys())]
        vals = np.array(vals)
        print(vals)
        print(set(vals))
        print("Jaccard")
        ja = helpers.jaccard(Groundtruth, vals)
        print(ja)
        print("Rand index")
        rd = helpers.rand(Groundtruth, vals)
        print(rd)
        unique_predicted = list(set(vals))
        # Visualize on a 2-D PCA projection of the raw expression data.
        new_x = helpers.pca(GeneExpressions)
        helpers.scatter(new_x[:, 0], new_x[:, 1], vals, unique_predicted)
    else:
        # Fix: was the global `iterations`; pass the parameter through.
        kmeans(reducedSpace, new_centroid, num_of_iterations, num_clusters, iteration_count)
min_pts = float(input("Enter min_pts value: ")) # 3. Perform DBSCAN model = __dbscan.DBSCAN(X, epsilon, min_pts) predicted = model.fit() unique, counts = np.unique(predicted, return_counts=True) print("Counts by cluster:") for key, value in zip(unique, counts): print("{}: {}".format(key, value)) # 4. Find Rand index and Jaccard rand_score = helpers.rand(y, predicted) jaccard_score = helpers.jaccard(y, predicted) unique_predicted = list(set(predicted)) print(predicted) print(rand_score) print(jaccard_score) # print(adjusted_rand_score(y, predicted)) # print(jaccard_similarity_score(y, predicted)) # 5. Visualize using PCA new_X = X if X.shape[1] > 2: new_X = helpers.pca(X) helpers.scatter(new_X[:, 0], new_X[:, 1], predicted, unique_predicted, "DBSCAN", file_name)
# Testing data #distMatrix = [[0.00,0.71,5.66,3.61,4.24,3.20],[0.71,0.00,4.95,2.92,3.54,2.50],[5.66,4.95,0.00,2.24,1.41,2.50],[3.61,2.92,2.24,0.00,1.00,0.50],[4.24,3.54,1.41,1.00,0.00,1.12],[3.20,2.50,2.50,0.50,1.12,0.00]] #rowsnumbers = [[0],[1],[2],[3],[4],[5]] distMatrix = np.array(distMatrix) #print(distMatrix) while (len(distMatrix) >= 2): if (len(rowsnumbers) == k): break distMatrix, rowsnumbers = updateDistMatrix(distMatrix, rowsnumbers) print(rowsnumbers) clusterassignments = {} cluster = 1 for i in rowsnumbers: for j in i: clusterassignments[j + 1] = cluster cluster += 1 sorted(clusterassignments) sorted(Groundtruth) predicted = list(clusterassignments.values()) unique_predicted = list(set(predicted)) Ground = list(Groundtruth.values()) print(h.jaccard(Ground, predicted)) print(h.rand(Ground, predicted)) new_X = h.pca(GeneExpressions) h.scatter(new_X[:, 0], new_X[:, 1], predicted, unique_predicted)
# --- PCA demo (script fragment) ---
# Relies on names defined elsewhere in the file: X, plt, featureNormalize,
# pca, drawLine — TODO confirm against the surrounding file.
plt.gca().set_aspect('equal', adjustable='box')
plt.show(block=False)

input('Program paused. Press enter to continue.')

## =============== Part 2: Principal Component Analysis ===============
#  You should now implement PCA, a dimension reduction technique. You
#  should complete the code in pca.m
#
print('Running PCA on example dataset.\n')

# Before running PCA, it is important to first normalize X.
# featureNormalize also returns mu, the per-feature mean of the data.
X_norm, mu, _ = featureNormalize(X)

# Run PCA
U, S = pca(X_norm)

# Draw the eigenvectors centered at mean of data. These lines show the
# directions of maximum variations in the dataset.
# Fix: plt.hold() was deprecated in matplotlib 2.0 and removed in 3.0;
# hold-on is the default behavior now, so the calls are simply dropped.
drawLine(mu, mu + 1.5 * S[0, 0] * U[:, 0].T, c='k', linewidth=2)
drawLine(mu, mu + 1.5 * S[1, 1] * U[:, 1].T, c='k', linewidth=2)

print('Top eigenvector: \n')
print(' U(:,1) = {:f} {:f} \n'.format(U[0, 0], U[1, 0]))
print('(you should expect to see -0.707107 -0.707107)')

input('Program paused. Press enter to continue.')