# Assumed imports for the snippets in this section (the source relies on
# module-level imports that are not shown); `agg` is read throughout as an
# alias for scikit-learn's AgglomerativeClustering.
from sklearn.cluster import AgglomerativeClustering as agg
from sklearn.metrics import silhouette_score


def train(data):
    X, _ = HAC.preprocess(data)
    test_data = X
    time_constraint = data.time_constraint

    ### optimize by number of clusters
    # Each time-constraint level widens the search over cluster counts and,
    # from level 3 upward, over distance metrics.
    search_space = {
        1: ([2, 4, 8], ['euclidean']),
        2: (list(range(2, 11)), ['euclidean']),
        3: ([2, 4, 8], ['euclidean', 'manhattan']),
        4: (list(range(2, 9)), ['euclidean', 'manhattan']),
        5: (list(range(2, 9)), ['euclidean', 'manhattan', 'l1', 'l2', 'cosine']),
    }
    cluster_counts, affinities = search_space[time_constraint]

    best_score = -1.1  # silhouette scores lie in [-1, 1], so any result beats this
    best_results = None
    for number in cluster_counts:
        for func in affinities:
            # Ward (the default linkage) only supports euclidean distances, so
            # fall back to average linkage for the other metrics.
            linkage = 'ward' if func == 'euclidean' else 'average'
            labels = agg(n_clusters=number, affinity=func,
                         linkage=linkage).fit_predict(test_data)
            score = silhouette_score(test_data, labels)  # score once, reuse
            if score > best_score:
                best_score = score
                best_results = labels

    # Return the best labelling found, not the labels of the last fit
    # (the original returned `results` from the final iteration).
    return test_data, None, best_results, None, None
def single_link_cluster():
    #data = 'dataset1.csv'
    data = 'dataset2.csv'
    # Read in the data
    df = pd.read_csv(data)
    #x_val = df.iloc[:, [0, 1]].values
    x_val = df.iloc[:, [0, 1, 2]].values
    hac = agg(n_clusters=None, distance_threshold=1, linkage='single')
    hac.fit(x_val)
    # Formulate matrix of linkages for dendrogram plotting
    linkages = create_dendrogram(hac)
    dendrogram(linkages, truncate_mode='lastp')
    plt.title("Dendrogram for Single Linkage HAC on " + data)
    #plt.savefig("singleHAC2D_dendrogram.png")
    plt.savefig("singleHAC3D_dendrogram.png")
    plt.clf()
    #single_cluster2D(x_val)
    single_cluster3D(x_val)
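# create_dendrogram is referenced above but not shown (and `dendrogram` comes
# from scipy.cluster.hierarchy). A minimal sketch following the standard
# scikit-learn recipe for converting a fitted AgglomerativeClustering model
# into a SciPy linkage matrix; the function name and exact behaviour are
# assumptions:
import numpy as np

def create_dendrogram(model):
    # Count the samples under each internal node of the merge tree.
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count
    # Columns: child A, child B, merge distance, cluster size.
    # model.distances_ is available because the model was fitted with
    # distance_threshold set (or compute_distances=True).
    return np.column_stack([model.children_, model.distances_,
                            counts]).astype(float)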
def radviz_sort_features(matrix, reduce=4):
    Agg = agg(n_clusters=None, distance_threshold=0)
    #m.preprocess(500)
    #data = m.a.X.todense()
    # Cluster the feature columns; the merge order of the leaves gives a
    # similarity-based ordering of the features.
    Agg.fit(matrix.T)
    sorted_ft = [a for a in Agg.children_.flatten() if a < matrix.shape[1]]
    if reduce:
        # Keep one representative feature per group of `reduce` neighbours.
        sorted_ft = [g[0] for g in grouper(sorted_ft, reduce)]
    return matrix[:, sorted_ft]
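# grouper is not defined in this snippet; a minimal sketch of the classic
# itertools recipe it appears to rely on (collect items into fixed-size
# chunks, padding the last chunk):
from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)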
def single_cluster2D(x_val):
    hac = agg(n_clusters=28, affinity='euclidean', linkage='single')
    hac.fit_predict(x_val)
    plt.title("Cluster Map for Single Linkage HAC on dataset1.csv")
    plt.scatter(x_val[:, 0], x_val[:, 1], c=hac.labels_)
    plt.savefig("singleHAC2D_cluster.png")
    plt.clf()
def average_cluster3D(x_val):
    hac = agg(n_clusters=26, affinity='euclidean', linkage='average')
    hac.fit_predict(x_val)
    fig = plt.figure()
    # add_subplot(projection='3d') is the current API; constructing Axes3D(fig)
    # directly no longer attaches the axes to the figure in recent matplotlib.
    a = fig.add_subplot(projection='3d')
    # Title fixed: this function uses average linkage, not single linkage.
    a.set_title("Cluster Map for Average Linkage HAC on dataset2.csv")
    a.scatter(x_val[:, 0], x_val[:, 1], x_val[:, 2], c=hac.labels_)
    plt.savefig("averageHAC3D_cluster.png")
# Decorated with @ray.remote because prepare_bootstrap_trees_agg below calls
# it via .remote(); without the decorator that call would fail.
@ray.remote
def make_tree_parallel_agg(data: np.ndarray, names: np.ndarray) -> Tree:
    """
    Build one bootstrap tree: rows are sampled with replacement from the
    distance matrix before clustering.

    :param data: distance/feature matrix, one row per sample
    :param names: labels for the rows of ``data``
    :return: the tree induced by hierarchical clustering of the resample
    """
    # Reseed per task so parallel ray workers don't share a random state.
    np.random.seed(randint(0, 1000000))
    selected_ids = np.random.choice(np.arange(names.shape[0]),
                                    size=names.shape[0], replace=True)
    hc = agg()
    hc.fit(data[selected_ids])
    return Tree.from_sklearn(hc, names=names[selected_ids])
def cluster(M, num_cluster, acc_list, dist_func, smooth_alpha):
    """
    @param M: the matrix of data points to be clustered; each column is a market
    @param num_cluster: user-defined number of clusters
    @param acc_list: list of accuracies, the ith entry being the model accuracy
        for the ith market's model; this sets each cluster's weightage, so
        clusters with higher average accuracy get higher weightage
    @param dist_func: func(x, y) returning the distance between x and y
    @param smooth_alpha: smoothing window that makes the markets easier to
        cluster; e.g. smooth_alpha=5 takes weekly average prices instead of
        using daily prices directly
    """
    M = normalize(smooth(M, smooth_alpha))
    D = compute_dist_matrix(M, dist_func)
    clusters = agg(n_clusters=num_cluster, affinity='precomputed',
                   linkage='average').fit_predict(D)
    acc_list = np.asarray(acc_list)  # boolean-mask indexing below needs an array
    weightage = np.ones(num_cluster)
    for i in range(num_cluster):
        weightage[i] = np.mean(acc_list[clusters == i])
    weightage = weightage / np.sum(weightage)
    return clusters, weightage
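# The helpers used by cluster() are not shown in the source. A minimal sketch,
# assuming M has one column per market with rows ordered by time; names and
# exact semantics are assumptions:
import numpy as np

def smooth(M, alpha):
    # Average every `alpha` consecutive rows (e.g. alpha=5 turns daily prices
    # into weekly averages).
    n = (M.shape[0] // alpha) * alpha
    return M[:n].reshape(-1, alpha, M.shape[1]).mean(axis=1)

def normalize(M):
    # Scale each market's series to zero mean and unit variance.
    return (M - M.mean(axis=0)) / M.std(axis=0)

def compute_dist_matrix(M, dist_func):
    # Symmetric pairwise distances between market columns, suitable for
    # affinity='precomputed'.
    k = M.shape[1]
    D = np.zeros((k, k))
    for i in range(k):
        for j in range(i + 1, k):
            D[i, j] = D[j, i] = dist_func(M[:, i], M[:, j])
    return D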
def prepare_bootstrap_trees_agg(
        data_array: np.ndarray,
        names: Optional[List[str]] = None,
        iteration: int = 10,
        n_threads: int = 1,
        linkage: str = "average") -> Tuple[Tree, List[Tree]]:
    # Note: the original annotated names as [None, List[str]], which is not a
    # valid type hint; Optional[List[str]] (from typing) is the intended form.
    if names is None:
        names = [str(x) for x in range(data_array.shape[0])]

    # Reference tree on the full data.
    hc = agg(linkage=linkage)
    hc.fit(data_array)
    tree: Tree = Tree.from_sklearn(hc, names)

    # Bootstrap replicates, built in parallel ray tasks.
    names = np.array(names)
    ray.init(num_cpus=n_threads)
    names_ray = ray.put(names)
    data_ray = ray.put(data_array)
    other_trees: List[Tree] = ray.get([
        make_tree_parallel_agg.remote(data_ray, names_ray)
        for _ in range(iteration)
    ])
    ray.shutdown()
    return tree.root, other_trees
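# Hypothetical usage (variable names are illustrative): build a reference tree
# plus 100 bootstrap replicates across 4 worker processes.
features = np.random.rand(50, 20)            # 50 samples, 20 features
sample_names = [f"s{i}" for i in range(50)]  # one label per row
root, replicates = prepare_bootstrap_trees_agg(
    features, names=sample_names, iteration=100, n_threads=4, linkage="average")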
def purity_score(y_true, y_pred):
    # Purity: assign each predicted cluster its majority true class, then
    # measure the fraction of points assigned correctly. (The def line was
    # missing from the snippet; the body and later calls make it unambiguous.)
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix)


# Raw string avoids "\s" etc. being read as escape sequences in the Windows path.
df = pd.read_csv(r"D:\sem3\ds3\data_science_3\lab11\inLab\Iris.csv")
x = list(df["Species"])
df1 = df.iloc[:, 1:5]
pca = PCA(n_components=4).fit(df1)
reduced_data = PCA(n_components=2).fit_transform(df1)
X, Y = zip(*reduced_data)
plt.scatter(X, Y)

agg_clustering = agg(n_clusters=3).fit(reduced_data)
plt.scatter(X, Y, c=agg_clustering.labels_)
plt.title("Agglomerative clustering model")
plt.show()
print("Agglomerative clustering model purity score is",
      purity_score(x, agg_clustering.labels_))

print("DBSCAN clustering model")
EPS = [0.05, 0.5, 0.95]
MIN_SAMPLES = [1, 5, 10, 20]
for eps_ in EPS:
    for min_ in MIN_SAMPLES:
        dbscan_clustering = DBSCAN(eps=eps_, min_samples=min_).fit(reduced_data)
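        # The source snippet ends right after the fit; a plausible
        # continuation, mirroring the purity evaluation done above for the
        # agglomerative model (this line is an assumption, not in the source):
        print("eps =", eps_, "min_samples =", min_,
              "purity =", purity_score(x, dbscan_clustering.labels_))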