import numpy as np

import helpers


def new_centroids(data, centroids, clusters, clusters_id, num_of_iterations, num_of_clusters, iteration_count):
    # Recompute each centroid as the mean of the points assigned to its cluster.
    new_centroid = []
    for i in range(len(clusters)):
        val = np.array(clusters[i])
        new_centroid.append(np.sum(val, 0) / len(clusters[i]))
    new_centroid = np.array(new_centroid)
    # Convergence check: total absolute centroid movement is zero.
    # (A signed sum could cancel to zero without convergence.)
    sums = np.sum(np.abs(np.array(centroids) - np.array(new_centroid)))
    d = dict()
    if sums == 0 or iteration_count == num_of_iterations:
        print("Converged")
        # Map each point id to its 1-based cluster label.
        for x in range(len(clusters_id)):
            for y in range(len(clusters_id[x])):
                d[clusters_id[x][y]] = x + 1
        vals = [d[x] for x in sorted(d.keys())]
        ground_truth = list(map(int, data[:, 1]))
        print("Jaccard")
        ja = helpers.jaccard(ground_truth, vals)
        print(ja)
        print("Rand index")
        rd = helpers.rand(ground_truth, vals)
        print(rd)
        unique_predicted = list(set(vals))
        # Project the feature columns to 2-D for plotting.
        new_x = helpers.pca(data[:, 2:])
        helpers.scatter(new_x[:, 0], new_x[:, 1], vals, unique_predicted, "K-means Algorithm", "iyer.txt")
    else:
        kmeans(data, new_centroid, num_of_iterations, num_of_clusters, iteration_count)
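# The helpers.jaccard and helpers.rand calls above compare two label vectors
# by pair counting. A minimal sketch, assuming the standard pair-counting
# definitions; the project's actual helpers module may differ. The names
# pairwise_jaccard and pairwise_rand are illustrative.
def pairwise_jaccard(truth, predicted):
    # Jaccard = SS / (SS + SD + DS): pairs together in both clusterings over
    # pairs together in at least one of them.
    ss = sd = ds = 0
    n = len(truth)
    for i in range(n):
        for j in range(i + 1, n):
            same_truth = truth[i] == truth[j]
            same_pred = predicted[i] == predicted[j]
            if same_truth and same_pred:
                ss += 1
            elif same_truth:
                sd += 1
            elif same_pred:
                ds += 1
    return ss / (ss + sd + ds)


def pairwise_rand(truth, predicted):
    # Rand index = (SS + DD) / all pairs: the fraction of point pairs on
    # which the two clusterings agree.
    n = len(truth)
    agree = sum(
        1
        for i in range(n)
        for j in range(i + 1, n)
        if (truth[i] == truth[j]) == (predicted[i] == predicted[j])
    )
    return agree / (n * (n - 1) / 2)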
def bruteforce(X):
    # Exact all-pairs baseline: for every item, compute the Jaccard
    # similarity to every other item and sort the neighbours by similarity,
    # most similar first.
    distances = {}
    for i, x in enumerate(X):
        for j, y in enumerate(X):
            distances.setdefault(i, []).append((j, jaccard(x, y)))
    for k in distances:
        distances[k].sort(key=lambda pair: -pair[1])
    return distances
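# Hypothetical usage of bruteforce with set-valued items; assumes the
# set-based jaccard below rather than the project's own definition.
def jaccard(a, b):
    # Jaccard similarity of two sets: |a & b| / |a | b|.
    return len(a & b) / len(a | b)


items = [{1, 2, 3}, {2, 3, 4}, {10, 11}]
neighbours = bruteforce(items)
# neighbours[0] lists (index, similarity) pairs, most similar first:
# [(0, 1.0), (1, 0.5), (2, 0.0)]
print(neighbours[0])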
def greene(step_communities, similarity=0.5, death=3):
    # Greene et al.-style tracking: seed one dynamic community per community
    # found at the first step, then match each later step against the fronts.
    dynamic = []
    for community in step_communities[0]:
        dynamic.append(DynamicCommunity(community))
    del step_communities[0]
    for i, step in enumerate(step_communities):
        to_add = []
        # match communities to fronts
        for d_idx, d in enumerate(dynamic):
            if d.is_dead() is None:
                for c in step:
                    if helpers.jaccard(d.get_front()[0], c) > similarity:
                        if d.get_front()[1] < i + 1:
                            # update the front
                            d.add_community(c, i + 1)
                        else:
                            # front already advanced this step: create a split community
                            split = DynamicCommunity(c, i + 1, d.get_timeline())
                            split.define_split(d_idx, i + 1)
                            to_add.append(split)
                        d.observed()
                # kill inactive communities
                if d.get_front()[1] < i + 1 and d.unobserved() > death:
                    d.kill(i + 1)
        # create dynamic communities for unmatched communities; a step
        # community counts as matched if it is now the front of some dynamic
        # community (Jaccard of 1 with itself)
        for c in step:
            matched = False
            for d in dynamic:
                if d.is_dead() is None and helpers.jaccard(d.get_front()[0], c) == 1:
                    matched = True
                    break
            if not matched:
                to_add.append(DynamicCommunity(c, i + 1))
        dynamic.extend(to_add)
    return dynamic
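# greene() relies on a DynamicCommunity class that is not shown here. The
# sketch below is one plausible reading of that interface, assuming
# observed() resets a consecutive-miss counter and unobserved() advances and
# returns it; the repo's real class may differ.
class DynamicCommunity:
    def __init__(self, community, step=0, timeline=None):
        # timeline maps step number -> front community at that step
        self.timeline = dict(timeline) if timeline else {}
        self.timeline[step] = community
        self.front = (community, step)
        self.missed = 0          # consecutive steps without a match
        self.dead_at = None      # step at which the community was killed
        self.split_from = None   # (parent index, step) when born by a split

    def get_front(self):
        return self.front

    def get_timeline(self):
        return self.timeline

    def add_community(self, community, step):
        # advance the front to the community matched at this step
        self.timeline[step] = community
        self.front = (community, step)

    def define_split(self, parent_idx, step):
        self.split_from = (parent_idx, step)

    def observed(self):
        # matched this step: reset the miss streak
        self.missed = 0

    def unobserved(self):
        # record one more step without a match and report the streak
        self.missed += 1
        return self.missed

    def is_dead(self):
        # None while alive, otherwise the step of death
        return self.dead_at

    def kill(self, step):
        self.dead_at = step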
def new_centroids(reducedSpace, centroids, clusters, clusters_id, num_of_iterations, num_clusters, iteration_count):
    # Variant of new_centroids used with the reduced feature space;
    # Groundtruth and GeneExpressions are module-level globals here.
    new_centroid = []
    for i in range(len(clusters)):
        val = np.array(clusters[i])
        new_centroid.append(np.sum(val, 0) / len(clusters[i]))
    new_centroid = np.array(new_centroid)
    sums = np.sum(np.abs(np.array(centroids) - np.array(new_centroid)))
    d = dict()
    if sums == 0 or iteration_count == num_of_iterations:
        print("Converged")
        for x in range(len(clusters_id)):
            for y in range(len(clusters_id[x])):
                d[clusters_id[x][y]] = x + 1
        vals = np.array([d[x] for x in sorted(d.keys())])
        print(vals)
        print(set(vals))
        print("Jaccard")
        ja = helpers.jaccard(Groundtruth, vals)
        print(ja)
        print("Rand index")
        rd = helpers.rand(Groundtruth, vals)
        print(rd)
        unique_predicted = list(set(vals))
        new_x = helpers.pca(GeneExpressions)
        helpers.scatter(new_x[:, 0], new_x[:, 1], vals, unique_predicted)
    else:
        kmeans(reducedSpace, new_centroid, num_of_iterations, num_clusters, iteration_count)
def experiment(hashes, estimation, X, other_data):
    # Yield (exact Jaccard, estimated Jaccard) pairs so the quality of the
    # hash-based estimate can be compared against the true similarity.
    X_hashes = [h.hash(X) for h in hashes]
    for Y in other_data:
        Y_hashes = [h.hash(Y) for h in hashes]
        yield (jaccard(X, Y), estimation(X_hashes, Y_hashes))
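# Hypothetical setup for experiment(): hashes as MinHash-style objects whose
# minima collide with probability equal to the Jaccard similarity, and
# estimation() as the fraction of agreeing minima. All names below are
# illustrative, not the project's actual API.
def jaccard(a, b):
    return len(a & b) / len(a | b)


class MinHash:
    def __init__(self, seed):
        self.seed = seed

    def hash(self, items):
        # minimum of a salted hash over the set's elements
        return min(hash((self.seed, item)) for item in items)


def estimation(x_hashes, y_hashes):
    # fraction of hash functions whose minima agree ~ Jaccard(X, Y)
    matches = sum(1 for a, b in zip(x_hashes, y_hashes) if a == b)
    return matches / len(x_hashes)


hashes = [MinHash(seed) for seed in range(128)]
X = {1, 2, 3, 4, 5}
others = [{1, 2, 3}, {4, 5, 6, 7}, {1, 2, 3, 4, 5}]
for exact, estimate in experiment(hashes, estimation, X, others):
    print(exact, estimate)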
epsilon = float(input("Enter epsilon value: "))
min_pts = int(input("Enter min_pts value: "))  # a point count, so read it as an int

# 3. Perform DBSCAN
model = __dbscan.DBSCAN(X, epsilon, min_pts)
predicted = model.fit()
unique, counts = np.unique(predicted, return_counts=True)
print("Counts by cluster:")
for key, value in zip(unique, counts):
    print("{}: {}".format(key, value))

# 4. Find Rand index and Jaccard
rand_score = helpers.rand(y, predicted)
jaccard_score = helpers.jaccard(y, predicted)
unique_predicted = list(set(predicted))
print(predicted)
print("Rand index: {}".format(rand_score))
print("Jaccard: {}".format(jaccard_score))

# 5. Visualize using PCA (only needed when the data has more than 2 features)
new_X = X
if X.shape[1] > 2:
    new_X = helpers.pca(X)
helpers.scatter(new_X[:, 0], new_X[:, 1], predicted, unique_predicted, "DBSCAN", file_name)
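# The __dbscan module is not shown. A minimal sketch of a DBSCAN class with
# the constructor and fit() interface the driver above expects, assuming
# Euclidean distance and -1 as the noise label; the project's implementation
# may differ.
import numpy as np


class DBSCAN:
    def __init__(self, X, epsilon, min_pts):
        self.X = np.asarray(X, dtype=float)
        self.epsilon = epsilon
        self.min_pts = min_pts

    def _neighbours(self, i):
        # indices of all points within epsilon of point i (including i)
        dists = np.linalg.norm(self.X - self.X[i], axis=1)
        return np.where(dists <= self.epsilon)[0]

    def fit(self):
        n = len(self.X)
        labels = np.full(n, -1)  # -1 marks noise / unvisited
        cluster = 0
        for i in range(n):
            if labels[i] != -1:
                continue
            seeds = self._neighbours(i)
            if len(seeds) < self.min_pts:
                continue  # not a core point; may still join a cluster later
            cluster += 1
            labels[i] = cluster
            queue = list(seeds)
            while queue:
                j = queue.pop()
                if labels[j] == -1:
                    labels[j] = cluster
                    j_neighbours = self._neighbours(j)
                    if len(j_neighbours) >= self.min_pts:
                        # j is also a core point: keep expanding from it
                        queue.extend(j_neighbours)
        return labels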
# Testing data
# distMatrix = [[0.00, 0.71, 5.66, 3.61, 4.24, 3.20],
#               [0.71, 0.00, 4.95, 2.92, 3.54, 2.50],
#               [5.66, 4.95, 0.00, 2.24, 1.41, 2.50],
#               [3.61, 2.92, 2.24, 0.00, 1.00, 0.50],
#               [4.24, 3.54, 1.41, 1.00, 0.00, 1.12],
#               [3.20, 2.50, 2.50, 0.50, 1.12, 0.00]]
# rowsnumbers = [[0], [1], [2], [3], [4], [5]]

distMatrix = np.array(distMatrix)

# Agglomerative loop: merge the two closest clusters until k clusters remain.
while len(distMatrix) >= 2:
    if len(rowsnumbers) == k:
        break
    distMatrix, rowsnumbers = updateDistMatrix(distMatrix, rowsnumbers)
print(rowsnumbers)

# Assign a 1-based cluster label to every point id.
clusterassignments = {}
cluster = 1
for i in rowsnumbers:
    for j in i:
        clusterassignments[j + 1] = cluster
    cluster += 1

# Order both label lists by point id so predicted and ground truth line up.
# (A bare sorted() call on a dict only returns a sorted key list and does not
# reorder the dict, so relying on .values() here would compare misaligned labels.)
predicted = [clusterassignments[key] for key in sorted(clusterassignments)]
unique_predicted = list(set(predicted))
Ground = [Groundtruth[key] for key in sorted(Groundtruth)]

print(h.jaccard(Ground, predicted))
print(h.rand(Ground, predicted))
new_X = h.pca(GeneExpressions)
h.scatter(new_X[:, 0], new_X[:, 1], predicted, unique_predicted)
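# updateDistMatrix is used above but not shown. A minimal sketch under a
# single-linkage assumption: merge the closest pair of clusters and take the
# elementwise minimum of their rows; the repo's version may use a different
# linkage.
import numpy as np


def updateDistMatrix(distMatrix, rowsnumbers):
    n = len(distMatrix)
    masked = distMatrix.astype(float).copy()
    np.fill_diagonal(masked, np.inf)
    # closest pair of clusters = smallest off-diagonal entry
    a, b = np.unravel_index(np.argmin(masked), masked.shape)
    a, b = min(a, b), max(a, b)

    merged_row = np.minimum(distMatrix[a], distMatrix[b])  # single linkage
    keep = [idx for idx in range(n) if idx not in (a, b)]

    # rebuild the matrix: kept clusters first, merged cluster last
    new_matrix = np.zeros((len(keep) + 1, len(keep) + 1))
    new_matrix[:-1, :-1] = distMatrix[np.ix_(keep, keep)]
    new_matrix[-1, :-1] = merged_row[keep]
    new_matrix[:-1, -1] = merged_row[keep]

    new_rows = [rowsnumbers[idx] for idx in keep]
    new_rows.append(rowsnumbers[a] + rowsnumbers[b])
    return new_matrix, new_rows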