def test_construct_from_data(self):
    """
    Build an LST directly from the tabular dataset and run the viability
    and correctness checks on it.
    """
    lst = dcl.construct_tree(
        self.dataset,
        self.k,
        prune_threshold=self.gamma,
    )

    self._check_tree_viability(lst)
    self._check_tree_correctness(lst)
def test_load(self):
    """
    Check viability and correctness of an LST saved then loaded from file.

    A tree is built from ``self.dataset``, written to a temporary file and
    read back with ``dcl.load_tree``. Both checks are run against the
    reloaded tree, so the save/load round trip is what gets verified.
    """
    tree = dcl.construct_tree(self.dataset, self.k,
                              prune_threshold=self.gamma)

    # With the default ``delete=True`` the temporary file is removed when
    # the ``with`` block exits, so save and reload both happen inside it.
    with tempfile.NamedTemporaryFile() as f:
        tree.save(f.name)
        tree2 = dcl.load_tree(f.name)

    # Check the *loaded* tree. Checking ``tree`` here would only repeat the
    # direct-construction test and let a lossy save/load go unnoticed.
    self._check_tree_viability(tree2)
    self._check_tree_correctness(tree2)
def setUp(self):
    """
    Simulate a sorted, single-column sample from a three-component normal
    mixture and build the level set tree under test.

    Sets ``self.n`` (sample size), ``self.gamma`` (value passed as the
    third argument of ``dcl.construct_tree``) and ``self.tree``.
    """
    ## Data parameters
    np.random.seed(451)  # fixed seed keeps the simulated sample reproducible
    self.n = 1000
    mix = (0.3, 0.5, 0.2)
    mean = (-1, 0, 1)
    stdev = (0.3, 0.2, 0.1)

    ## Tree parameters
    k = 50
    self.gamma = 5

    ## Simulate data
    # Component sizes are drawn first, then each component is sampled in
    # order; this keeps the random stream identical to the original loop.
    membership = np.random.multinomial(self.n, pvals=mix)
    draws = [np.random.normal(loc=mu, scale=sigma, size=p)
             for (p, mu, sigma) in zip(membership, mean, stdev)]

    # A single concatenate replaces repeated ``np.append`` calls, and no
    # longer needs the ``np.float`` alias that NumPy 1.24 removed.
    dataset = np.concatenate(draws).astype(float)
    dataset = np.sort(dataset).reshape((self.n, 1))

    self.tree = dcl.construct_tree(dataset, k, self.gamma)
## Simulate the data: one multivariate normal draw per mixture component.
# Builtin `float`/`int` replace the `np.float`/`np.int` aliases, which were
# removed in NumPy 1.24; the resulting dtypes are the same.
X = np.zeros((n, p), dtype=float)
g = np.zeros((n, ), dtype=int)
b = np.cumsum((0, ) + tuple(membership))  # row boundaries of each component

for i, (size, mu, sigma) in enumerate(zip(membership, centers, sdev)):
    ix = range(b[i], b[i + 1])
    X[ix, :] = np.random.multivariate_normal(mu, sigma, size)
    g[ix] = i

# NOTE(review): `axis=0` sorts every column independently, so rows of X no
# longer correspond to the group labels in `g` -- confirm this is intended.
X = np.sort(X, axis=0)

## Estimate the level set tree.
k = int(0.02 * n)
gamma = int(0.05 * n)

tree = dcl.construct_tree(X, k, prune_threshold=gamma, verbose=True)
print(tree)

## Retrieve cluster assignments from the tree.
labels = tree.get_clusters(method='leaf')

## Labels returned from the `get_clusters` method match the index of the
# highest density node to which an observation belongs. Because these labels
# are usually non-consecutive, we can reindex to make many post-processing
# steps more natural.
new_labels = dcl.utils.reindex_cluster_labels(labels)

# A single pre-formatted string prints identically under Python 2 and 3
# (a two-argument `print(...)` would print a tuple on Python 2).
print("cluster counts: %s" % (np.bincount(new_labels[:, 1]),))

## Plot the level set tree as a dendrogram. The plot function returns a tuple
# containing 4 objects. The first item is a matplotlib figure, which can be
# shown and saved.
stay_points = list(stay_points) stay_points = [s.strip('\n').split(',') for s in stay_points] size = len(stay_points) for u in range(size): users_list.append(i) stay_points = np.asarray(stay_points, dtype='float32') total_stay_points.append(stay_points) total_stay_points = [item for sublist in total_stay_points for item in sublist] total_stay_points = np.asarray(total_stay_points) # write total_stay_points.txt total_stay_points_file = open("./Clustering/total_stay_points.txt", "w") for itr in total_stay_points: total_stay_points_file.write(str(itr) + "\n") # clustering tree = dcl.construct_tree(total_stay_points, k=50) labels = tree.get_clusters() # write the Label.txt labels_file = open("./Clustering/Label.txt", "w") for itr in labels: labels_file.write(str(itr) + "\n") # write TBHG to the TBHG.txt tree_file = open("./Clustering/TBHG.txt", "w") tree_str = str(tree) tree_file.write(tree_str) # determine the specific user in the each low-level cluster cluster_dict = {} for entry in labels:
## Fill X (allocated earlier) with one multivariate normal draw per
## mixture component and record each row's component index in g.
# Builtin `int` replaces the `np.int` alias, which was removed in
# NumPy 1.24; the resulting dtype is the same.
g = np.zeros((n,), dtype=int)
b = np.cumsum((0,) + tuple(membership))  # row boundaries of each component

for i, (size, mu, sigma) in enumerate(zip(membership, centers, sdev)):
    ix = range(b[i], b[i + 1])
    X[ix, :] = np.random.multivariate_normal(mu, sigma, size)
    g[ix] = i

# NOTE(review): `axis=0` sorts every column independently, so rows of X no
# longer correspond to the group labels in `g` -- confirm this is intended.
X = np.sort(X, axis=0)

## Estimate the level set tree.
k = int(0.02 * n)
gamma = int(0.05 * n)

tree = dcl.construct_tree(X, k, prune_threshold=gamma, verbose=True)
print(tree)

## Retrieve cluster assignments from the tree.
labels = tree.get_clusters(method="leaf")

## Labels returned from the `get_clusters` method match the index of the
# highest density node to which an observation belongs. Because these labels
# are usually non-consecutive, we can reindex to make many post-processing
# steps more natural.
new_labels = dcl.utils.reindex_cluster_labels(labels)

# A single pre-formatted string prints identically under Python 2 and 3
# (a two-argument `print(...)` would print a tuple on Python 2).
print("cluster counts: %s" % (np.bincount(new_labels[:, 1]),))
## Build the demo dataset by stacking the `make_circles` points on top of
## the `make_blobs` points (element 0 of each returned pair).
circles = make_circles(500, factor=0.5, noise=0.06, random_state=23)
blob = make_blobs(
    100,
    centers=1,
    center_box=(-1.7, 1.7),
    cluster_std=0.1,
    random_state=19,
)
X = np.vstack((circles[0], blob[0]))
print("Dataset shape:", X.shape)

## Scatter plot of the raw points, drawn with the 'ggplot' style.
with plt.style.context('ggplot'):
    fig, ax = plt.subplots(figsize=(6, 4.5))
    first_coord = X[:, 0]
    second_coord = X[:, 1]
    ax.scatter(first_coord, second_coord, c='black', s=50, alpha=0.5)
    fig.show()

import debacl as dcl

## Estimate the level set tree and show its plot; element 0 of the value
## returned by `plot()` is the object that gets shown.
tree = dcl.construct_tree(X, k=20)
print(tree)

plot = tree.plot()
plot[0].show()

## Prune with threshold 60, show the pruned tree, then pull cluster labels.
pruned_tree = tree.prune(60)
pruned_plot = pruned_tree.plot()
pruned_plot[0].show()

cluster_labels = pruned_tree.get_clusters()
print("Cluster labels shape:", cluster_labels.shape)