def do_stuff(dataset = None, metric = True, drtype = "mds", components = 2):
    """Cluster ``dataset`` hierarchically and optionally embed it in low dimensions.

    Args:
        dataset: Array-like input; converted with ``np.array`` and handed
            unchanged to every estimator.
        metric: Forwarded to ``manifold.MDS(metric=...)`` and echoed back in
            ``drInfo``. Only the "mds" branch uses it.
        drtype: "mds", "pca" or "tsne". Any falsy value skips the
            dimensionality reduction and leaves ``drInfo`` / ``embedding``
            as ``None``.
        components: Number of embedding dimensions for MDS. The PCA and
            t-SNE branches always use 2.

    Returns:
        A dict with the keys
        ``"drInfo"`` (``None`` or a dict with "type", "metric", "components",
        "stress1", "rawStress", "medianDistance"; the last three stay
        ``False`` unless ``drtype == "mds"``),
        ``"embedding"`` (``None`` or the embedding as nested lists) and
        ``"clustering"`` (``"tree"``: ``children_`` of the agglomerative fit,
        ``"labels"``: ``{n_clusters: labels}`` for n_clusters 10 down to 2).

    Raises:
        ValueError: If ``drtype`` is truthy but not a supported type.
    """
    import shutil  # local import: only needed to remove the cache directory

    # Fail fast: previously an unknown type surfaced as a NameError on
    # ``mds_result`` only after all clusterings had been computed.
    if drtype and drtype not in ("mds", "pca", "tsne"):
        raise ValueError("unsupported drtype: %r" % (drtype,))

    data_for_mds = np.array(dataset)

    mds_result = None
    if drtype == "mds":
        mds = manifold.MDS(
            n_components=components,
            n_init=10,
            max_iter=3000,
            dissimilarity="euclidean",
            n_jobs=1,
            metric=metric,
        )
        mds_result = mds.fit(data_for_mds)
    elif drtype == "pca":
        # NOTE(review): PCA is fitted on the pairwise-distance matrix but then
        # applied to the raw data, which only works when both have the same
        # number of columns -- confirm this is intended. ``components`` is
        # ignored here (always 2).
        pca = PCA(n_components=2)
        mds_result = pca.fit(euclidean_distances(data_for_mds)).transform(data_for_mds)
    elif drtype == "tsne":
        # NOTE(review): ``components`` is ignored here as well (always 2).
        model = manifold.TSNE(
            n_components=2,
            random_state=0,
            learning_rate=1000,
            early_exaggeration=10.0,
        )
        mds_result = model.fit_transform(data_for_mds)

    # One cache directory for all fits, removed afterwards. The original
    # created a fresh temporary directory per estimator and never deleted it.
    cache_dir = mkdtemp()
    try:
        clusterings = {}
        for i in range(10, 1, -1):
            clustering = ac(n_clusters=i, memory=cache_dir)
            clusterings[i] = clustering.fit(data_for_mds).labels_.tolist()

        # A final single-cluster fit provides the merge tree.
        clustering = ac(n_clusters=1, memory=cache_dir)
        clustering.fit(data_for_mds)
        tree = clustering.children_.tolist()
    finally:
        shutil.rmtree(cache_dir, ignore_errors=True)

    output = {
        "drInfo": None,
        "embedding": None,
        "clustering": {
            "tree": tree,
            "labels": clusterings
        }
    }

    if drtype:
        median_distance = False
        stress1 = False
        raw_stress = False
        if drtype == "mds":
            raw_stress = mds_result.stress_
            # Normalise the raw stress by the sum of squared input distances,
            # counting each pair once (upper triangle only).
            disparities = euclidean_distances(data_for_mds)
            disparityHalfMatrix = np.triu(disparities)
            sumSquaredDisparities = np.sum(np.square(disparityHalfMatrix))
            stress1 = math.sqrt(mds_result.stress_ / sumSquaredDisparities)
            median_distance = np.median(euclidean_distances(mds_result.embedding_))
            embedding = mds_result.embedding_.tolist()
            # Call form is valid on both Python 2 and Python 3.
            print(mds_result.stress_)
        else:
            embedding = mds_result.tolist()
        output["drInfo"] = {
            "type": drtype,
            "metric": metric,
            "components": components,
            "stress1": stress1,
            "rawStress": raw_stress,
            "medianDistance": median_distance
        }
        output["embedding"] = embedding

    return output
def do_stuff(dataset = None, metric = True, drtype = "mds", components = 2):
    """Cluster ``dataset`` hierarchically and optionally embed it in low dimensions.

    Args:
        dataset: Array-like input; converted with ``np.array`` and handed
            unchanged to every estimator.
        metric: Forwarded to ``manifold.MDS(metric=...)`` and echoed back in
            ``drInfo``. Only the "mds" branch uses it.
        drtype: "mds", "pca" or "tsne". Any falsy value skips the
            dimensionality reduction and leaves ``drInfo`` / ``embedding``
            as ``None``.
        components: Number of embedding dimensions for MDS. The PCA and
            t-SNE branches always use 2.

    Returns:
        A dict with the keys
        ``"drInfo"`` (``None`` or a dict with "type", "metric", "components",
        "stress1", "rawStress", "medianDistance"; the last three stay
        ``False`` unless ``drtype == "mds"``),
        ``"embedding"`` (``None`` or the embedding as nested lists) and
        ``"clustering"`` (``"tree"``: ``children_`` of the agglomerative fit,
        ``"labels"``: ``{n_clusters: labels}`` for n_clusters 10 down to 2).

    Raises:
        ValueError: If ``drtype`` is truthy but not a supported type.
    """
    import shutil  # local import: only needed to remove the cache directory

    supported = ("mds", "pca", "tsne")
    # Fail fast: previously an unknown type surfaced as a NameError on
    # ``mds_result`` only after all clusterings had been computed.
    if drtype and drtype not in supported:
        raise ValueError("unsupported drtype: %r" % (drtype,))

    data_for_mds = np.array(dataset)

    mds_result = None
    if drtype == "mds":
        mds = manifold.MDS(
            n_components=components,
            n_init=10,
            max_iter=3000,
            dissimilarity="euclidean",
            n_jobs=1,
            metric=metric,
        )
        mds_result = mds.fit(data_for_mds)
    elif drtype == "pca":
        # NOTE(review): PCA is fitted on the pairwise-distance matrix but then
        # applied to the raw data, which only works when both have the same
        # number of columns -- confirm this is intended. ``components`` is
        # ignored here (always 2).
        pca = PCA(n_components=2)
        mds_result = pca.fit(euclidean_distances(data_for_mds)).transform(data_for_mds)
    elif drtype == "tsne":
        # NOTE(review): ``components`` is ignored here as well (always 2).
        model = manifold.TSNE(
            n_components=2,
            random_state=0,
            learning_rate=1000,
            early_exaggeration=10.0,
        )
        mds_result = model.fit_transform(data_for_mds)

    # One cache directory for all fits, removed afterwards. The original
    # created a fresh temporary directory per estimator and never deleted it.
    cache_dir = mkdtemp()
    try:
        clusterings = {}
        for i in range(10, 1, -1):
            clustering = ac(n_clusters=i, memory=cache_dir)
            clusterings[i] = clustering.fit(data_for_mds).labels_.tolist()

        # A final single-cluster fit provides the merge tree.
        clustering = ac(n_clusters=1, memory=cache_dir)
        clustering.fit(data_for_mds)
        tree = clustering.children_.tolist()
    finally:
        shutil.rmtree(cache_dir, ignore_errors=True)

    output = {
        "drInfo": None,
        "embedding": None,
        "clustering": {
            "tree": tree,
            "labels": clusterings
        }
    }

    if drtype:
        median_distance = False
        stress1 = False
        raw_stress = False
        if drtype == "mds":
            raw_stress = mds_result.stress_
            # Normalise the raw stress by the sum of squared input distances,
            # counting each pair once (upper triangle only).
            disparities = euclidean_distances(data_for_mds)
            disparityHalfMatrix = np.triu(disparities)
            sumSquaredDisparities = np.sum(np.square(disparityHalfMatrix))
            stress1 = math.sqrt(mds_result.stress_ / sumSquaredDisparities)
            median_distance = np.median(euclidean_distances(mds_result.embedding_))
            embedding = mds_result.embedding_.tolist()
            print(mds_result.stress_)
        else:
            embedding = mds_result.tolist()
        output["drInfo"] = {
            "type": drtype,
            "metric": metric,
            "components": components,
            "stress1": stress1,
            "rawStress": raw_stress,
            "medianDistance": median_distance
        }
        output["embedding"] = embedding

    return output
def average_linkage(dataset):
    """Run average-linkage HAC on ``data/<dataset>.csv`` and save its plots.

    The first two CSV columns are used when ``dataset`` is 'dataset1',
    otherwise the first three. A dendrogram image is saved first, then the
    matching 2-D or 3-D cluster map is produced by the sibling plot helper.

    Args:
        dataset: Base name (without extension) of the CSV file under ``data/``.
    """
    frame = pd.read_csv("data/" + dataset + ".csv")

    is_2d = dataset == 'dataset1'
    d = '2D' if is_2d else '3D'
    columns = [0, 1] if is_2d else [0, 1, 2]
    x = frame.iloc[:, columns].values

    # No fixed cluster count: merging is cut at a distance threshold of 1.
    model = ac(n_clusters = None, distance_threshold = 1, linkage = 'average')
    model.fit(x)

    dendrogram(create_dendogram(model), truncate_mode = 'lastp')
    plt.title("Dendogram for Average Linkage HAC with " + dataset)
    plt.savefig("averageHAC" + d + "_dendogram")
    plt.clf()

    if is_2d:
        generate_2D_plot(x, 3, 'average')
    else:
        generate_3D_plot(x, 26, 'average')
def generate_2D_plot(x, clusters, linkage):
    """Cluster ``x`` with HAC and save a 2-D scatter plot coloured by label.

    Args:
        x: Array indexed as ``x[:, 0]`` / ``x[:, 1]`` for the two plot axes.
        clusters: Number of clusters passed to the estimator.
        linkage: Linkage name; also used in the plot title and file name.

    The figure is saved as ``<linkage>HAC2D_cluster`` and the current figure
    is cleared afterwards.
    """
    # Euclidean is the estimator's default distance, so it is left implicit:
    # the ``affinity`` keyword was renamed to ``metric`` and later removed
    # from scikit-learn, and omitting it works on old and new releases alike.
    hac = ac(n_clusters = clusters, linkage = linkage)
    labels = hac.fit_predict(x)

    plt.title("Cluster Map for " + linkage.capitalize() + " Linkage HAC with dataset1")
    plt.scatter(x[:, 0], x[:, 1], c = labels)
    filename = linkage + "HAC2D_cluster"
    plt.savefig(filename)
    plt.clf()
def generate_3D_plot(x, clusters, linkage):
    """Cluster ``x`` with HAC and save a 3-D scatter plot coloured by label.

    Args:
        x: Array indexed as ``x[:, 0]``, ``x[:, 1]``, ``x[:, 2]`` for the
            three plot axes.
        clusters: Number of clusters passed to the estimator.
        linkage: Linkage name; also used in the plot title and file name.

    The figure is saved as ``<linkage>HAC3D_cluster`` and then closed.
    """
    # Euclidean is the estimator's default distance, so it is left implicit:
    # the ``affinity`` keyword was renamed to ``metric`` and later removed
    # from scikit-learn, and omitting it works on old and new releases alike.
    hac = ac(n_clusters = clusters, linkage = linkage)
    labels = hac.fit_predict(x)

    fig = plt.figure()
    # Ask the figure for a 3-D subplot instead of calling ``Axes3D(fig)``:
    # recent matplotlib no longer attaches a directly constructed Axes3D to
    # the figure, which leaves the saved image empty.
    ax = fig.add_subplot(111, projection = '3d')
    ax.set_title("Cluster Map for " + linkage.capitalize() + " Linkage HAC with dataset2")
    ax.scatter(x[:, 0], x[:, 1], x[:, 2], c = labels)
    filename = linkage + "HAC3D_cluster"
    plt.savefig(filename)
    # Close the figure created above so repeated calls do not pile up
    # open figures.
    plt.close(fig)
# Colours used for cluster labels 0..3 in the scatter plots below.
_CLUSTER_COLORS = ('red', 'blue', 'black', 'cyan')


def _scatter_by_label(data, labels):
    """Scatter the first two columns of ``data``, one fixed colour per label 0-3."""
    for label, color in enumerate(_CLUSTER_COLORS):
        members = labels == label
        plt.scatter(data[members, 0], data[members, 1], c=color)


#fit the data points to the k means algorithm
# Fit once and reuse that fit for both the printed centres and the labels;
# fitting twice could print centres that belong to a different run than the
# labels being plotted.
y_kmeans = kmeans.fit_predict(points)
print(kmeans.cluster_centers_)

f1 = plt.figure()
plt.title('K-means clustering')
_scatter_by_label(points, y_kmeans)
plt.show()

#create dendogram
#dendogram = sch.dendrogram(sch.linkage(points,method='ward'))
# Euclidean is the estimator's default distance, so it is left implicit: the
# ``affinity`` keyword was renamed to ``metric`` and later removed from
# scikit-learn.
hc = ac(n_clusters=2, linkage='ward')
y_hc = hc.fit_predict(points)

f2 = plt.figure()
_scatter_by_label(points, y_hc)
plt.title('Heirarchical Clustering')
plt.show()

#Birch clustering
bir = Birch(n_clusters=2, threshold=0.8, branching_factor=200)
# A single fit_predict both fits the model and returns the labels.
y_bir = bir.fit_predict(points)
def run(self, clusters=3):
    """Cluster ``self.X`` agglomeratively and store the labels in ``self.y``.

    Args:
        clusters: Number of clusters requested from the estimator.
    """
    model = ac(n_clusters=clusters)
    labels = model.fit_predict(self.X)
    self.y = labels
print "Raw dataset:\n", dataset.head() x = dataset.iloc[:, [3, 4]].values #Taking columns 4 & 5 print "Independent variables:\n", x ## Using dendrogram to find the optimal number of clusters dendrogram = sch.dendrogram(sch.linkage( x, method='ward')) #ward method minimizes variance within each cluster plt.title('Dendrogram') plt.xlabel('Customers') plt.ylabel('Euclidean distances') plt.show() # Largest distance where we can make vertically without crossing any horizontal line: optimal clusters = 5 ## Fitting Hierarchical clustering to the mall dataset hc = ac( n_clusters=5, affinity='euclidean', linkage='ward' ) #affinity = distance to make the linkage, ward method minimizes variance within each cluster. Use the same linkage as the one used to build the dendrogram. y_hc = hc.fit_predict( x) #Fitting AgglomerativeClustering to data x to create vector y. print "Clusters:\n", y_hc #y_hc only shows the clusters. Join this with matrix x to analyse the behaviour of each clusters ##Visualising the clusters (Only for 2d clustering i.e. 2 columns of interest) plt.scatter(x[y_hc == 0, 0], x[y_hc == 0, 1], s=100, c='red', label='Cluster 1') plt.scatter(x[y_hc == 1, 0], x[y_hc == 1, 1], s=100, c='blue',