def _compare_clusters(**datasets):
    """Fit KMeans and MeanShift on every named dataset and plot the results.

    For each dataset: project it to 2-D with randomized PCA, draw one
    scatter plot per fitted KMeans instance, then draw a single pie chart
    comparing the cluster-size distributions of all KMeans and MeanShift
    fits.  Output files are named after the dataset.
    """
    for label, points in datasets.items():
        # 2-D projection used only for the scatter visualisations.
        projector = RandomizedPCA(2)
        projector.fit(points)
        projected = projector.transform(points)

        kmeans_models = _kmeans()
        for model in kmeans_models:
            model.fit(points)
            draw_cluster_2d(model, projected,
                            filename="%s-kmeans-%s.png" % (label, model.k))

        meanshift_models = _meanshift(points)
        for model in meanshift_models:
            model.fit(points)

        distributions = [_get_distribution(m) for m in kmeans_models]
        distributions += [_get_distribution(m) for m in meanshift_models]
        titles = ["KMeans(%s)" % m.k for m in kmeans_models]
        titles += ["MeanShift(%s)" % round(m.bandwidth) for m in meanshift_models]
        compare_pies(distributions, titles, filename="%s-pie.png" % label)
def _draw(dataset, filename, title):
    """Project *dataset* onto its first two principal components and plot it.

    The 2-D projection is handed to ``draw_2d`` together with the output
    *filename* and plot *title*.
    """
    reducer = RandomizedPCA(2)
    reducer.fit(dataset)
    projection = reducer.transform(dataset)
    draw_2d(projection, filename, title)
def lininit(self):
    """Linearly initialize the SOM codebook along the principal components.

    X = U sigma W^T, X^T X = W sigma^2 W^T, T = X W = U sigma.
    Lower ranks use only a few eigenvectors:
    T(2) = U(2) sigma(2) = X W(2), where 2 is the number of selected
    eigenvectors.  The map is initialized from the first one or two
    eigenvalues/eigenvectors; each node's codebook vector is the data mean
    plus a linear combination of the scaled eigenvectors, with coefficients
    spread over [-1, 1] along each direction of the SOM grid.

    Returns:
        The initialized codebook, shape (nnodes, dim), rounded to 6
        decimals; ``None`` for degenerate map sizes (as before, which
        returned ``None`` implicitly by falling through both branches).
    """
    msize = getattr(self, 'mapsize')
    cols = msize[1]
    nnodes = getattr(self, 'nnodes')

    if np.min(msize) > 1:
        # 2-D map: node grid coordinates (x = row, y = column).
        n_components = 2
        coord = np.zeros((nnodes, 2))
        for i in range(0, nnodes):
            coord[i, 0] = int(i / cols)  # x
            coord[i, 1] = int(i % cols)  # y
    elif np.min(msize) == 1:
        # 1-D map: a single grid coordinate per node.
        n_components = 1
        coord = np.zeros((nnodes, 1))
        for i in range(0, nnodes):
            coord[i, 0] = int(i % cols)  # y
    else:
        # Degenerate map size: preserve the original implicit-None result.
        return None

    # Rescale grid coordinates to [-1, 1] in each direction.
    mx = np.max(coord, axis=0)
    mn = np.min(coord, axis=0)
    coord = (coord - mn) / (mx - mn)
    coord = (coord - .5) * 2

    # Center the data; seed every codebook row with the data mean.
    data = getattr(self, 'data')
    me = np.mean(data, 0)
    data = (data - me)
    codebook = np.tile(me, (nnodes, 1))

    pca = RandomizedPCA(n_components=n_components)  # Randomized PCA is scalable
    pca.fit(data)
    eigvec = pca.components_
    eigval = pca.explained_variance_
    # Normalize each eigenvector, then scale it by its eigenvalue.
    norms = np.sqrt(np.einsum('ij, ij->i', eigvec, eigvec))
    eigvec = ((eigvec.T / norms) * eigval).T

    # codebook[j] = mean + sum_i coord[j, i] * eigvec[i].
    # BUG FIX: the 1-D branch previously used `&` (bitwise AND) instead of
    # `*`, which raises TypeError on float arrays; the 2-D branch already
    # used `*` correctly.
    for j in range(nnodes):
        for i in range(eigvec.shape[0]):
            codebook[j, :] = codebook[j, :] + coord[j, i] * eigvec[i, :]

    return np.around(codebook, decimals=6)