def test_kmeans_spec():
    features, targets = generate_cluster_data(n_samples=100,
                                              n_features=2,
                                              n_centers=2,
                                              cluster_stds=.1)
    model = KMeans(2)
    model.fit(features)
    assert hasattr(model, 'means')
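# The spec test above only checks that fit() creates a `means` attribute. Below is a
# minimal sketch of the KMeans interface these tests assume (the names fit, predict,
# and means come from the tests themselves; the Lloyd-style update loop is an
# illustrative assumption, not the graded implementation).
import numpy as np

class KMeansSketch:
    """Illustrative k-means with the same interface the tests exercise."""

    def __init__(self, n_clusters, max_iterations=100):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.means = None

    def fit(self, features):
        # Initialize means by sampling distinct points from the data.
        idx = np.random.choice(len(features), self.n_clusters, replace=False)
        self.means = features[idx].astype(float)
        for _ in range(self.max_iterations):
            labels = self.predict(features)
            new_means = np.array([
                features[labels == k].mean(axis=0) if np.any(labels == k)
                else self.means[k]
                for k in range(self.n_clusters)
            ])
            if np.allclose(new_means, self.means):
                break
            self.means = new_means

    def predict(self, features):
        # Assign each sample to its nearest mean (squared Euclidean distance).
        dists = ((features[:, None, :] - self.means[None, :, :]) ** 2).sum(-1)
        return dists.argmin(axis=1)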
def fit(self, features):
    """
    Fit a GMM to the given data using `self.n_clusters` Gaussians.
    Features can have more than 2 dimensions.

    Args:
        features (np.ndarray): array containing inputs of size
            (n_samples, n_features).
    Returns:
        None (saves model - means, covariances, and mixing weights - internally)
    """
    # 1. Use your KMeans implementation to initialize the means of the GMM.
    kmeans = KMeans(self.n_clusters)
    kmeans.fit(features)
    self.means = kmeans.means

    # 2. Initialize the covariance matrices.
    self.covariances = self._init_covariance(features.shape[-1])

    # 3. Initialize the mixing weights (random, normalized to sum to 1).
    self.mixing_weights = np.random.rand(self.n_clusters)
    self.mixing_weights /= np.sum(self.mixing_weights)

    # 4. Compute the log likelihood under the initial random covariances and
    #    KMeans means.
    prev_log_likelihood = -float('inf')
    log_likelihood = self._overall_log_likelihood(features)

    # 5. Run EM until the log likelihood stops increasing significantly or
    #    max_iterations is reached.
    n_iter = 0
    while (log_likelihood - prev_log_likelihood > 1e-4
           and n_iter < self.max_iterations):
        prev_log_likelihood = log_likelihood
        assignments = self._e_step(features)
        self.means, self.covariances, self.mixing_weights = self._m_step(
            features, assignments)
        log_likelihood = self._overall_log_likelihood(features)
        n_iter += 1

    # EM above works with full covariance matrices; reshape them at the end to
    # match the expected output shape for the chosen covariance type.
    if self.covariance_type == 'spherical':
        # Expected shape: a 1-D array with one variance per cluster.
        final_covariances = np.zeros(self.n_clusters)
        for index, covariance in enumerate(self.covariances):
            final_covariances[index] = np.mean(np.diagonal(covariance))
        self.covariances = final_covariances
    elif self.covariance_type == 'diagonal':
        # Expected shape: a 2-D array of per-feature variances per cluster.
        final_covariances = np.zeros((self.n_clusters, features.shape[-1]))
        for index, covariance in enumerate(self.covariances):
            final_covariances[index] = np.diagonal(covariance)
        self.covariances = final_covariances
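# The reshaping at the end of fit() treats `self.covariances` as one full
# (n_features, n_features) matrix per cluster, so `_init_covariance` is assumed to
# return that shape. A minimal sketch of such an initializer (the helper name comes
# from the call above; identity initialization is an illustrative assumption):
import numpy as np

def _init_covariance(self, n_features):
    # One full covariance matrix per cluster, initialized to the identity.
    # Shape: (n_clusters, n_features, n_features).
    return np.array([np.eye(n_features) for _ in range(self.n_clusters)])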
def test_kmeans_spec():
    features, targets = generate_cluster_data(n_samples=100,
                                              n_features=2,
                                              n_centers=2,
                                              cluster_stds=.1)
    model = KMeans(2)
    rng_state = np.random.get_state()
    model.fit(features)
    np.random.set_state(rng_state)
    assert hasattr(model, 'means')
def test_kmeans_on_generated():
    n_samples = [1000, 10000]
    n_centers = [2]
    stds = [.1]
    n_features = [1, 2, 4]

    for n in n_samples:
        for f in n_features:
            for c in n_centers:
                for s in stds:
                    features, targets = generate_cluster_data(n_samples=n,
                                                              n_features=f,
                                                              n_centers=c,
                                                              cluster_stds=s)
                    # make model and fit
                    model = KMeans(c)

                    # Depending on how many random() calls the student code
                    # makes, it can mess with the random state used to generate
                    # data for subsequent tests and lead to an "impossible"
                    # input distribution that can't achieve the desired
                    # performance. To avoid this, we save and restore the
                    # random state so the student code can't interfere with it.
                    rng_state = np.random.get_state()
                    model.fit(features)
                    np.random.set_state(rng_state)

                    means = model.means
                    orderings = permutations(means)
                    distance_to_true_means = []
                    actual_means = np.array([
                        features[targets == i, :].mean(axis=0)
                        for i in range(targets.max() + 1)
                    ])
                    for ordering in orderings:
                        _means = np.array(list(ordering))
                        distance_to_true_means.append(
                            np.abs(_means - actual_means).sum())
                    assert min(distance_to_true_means) < 1e-1

                    # predict and calculate adjusted mutual info
                    labels = model.predict(features)
                    acc = adjusted_mutual_info(targets, labels)
                    assert acc >= .9
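# The test scores the clustering with `adjusted_mutual_info`, a chance-corrected,
# permutation-invariant agreement measure between the true and predicted labels.
# If that helper is not available, scikit-learn provides an equivalent score
# (assuming the course helper matches sklearn's definition, which is not confirmed
# by this excerpt):
from sklearn.metrics import adjusted_mutual_info_score

def adjusted_mutual_info(targets, labels):
    # 1.0 means the two labelings describe the same partition up to relabeling;
    # values near 0 mean the clustering is no better than chance.
    return adjusted_mutual_info_score(targets, labels)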
def fit(self, features):
    """
    Fit a GMM to the given data using `self.n_clusters` Gaussians.
    Features can have more than 2 dimensions.

    Args:
        features (np.ndarray): array containing inputs of size
            (n_samples, n_features).
    Returns:
        None (saves model - means, covariances, and mixing weights - internally)
    """
    # 1. Use your KMeans implementation to initialize the means of the GMM.
    kmeans = KMeans(self.n_clusters)
    kmeans.fit(features)
    self.means = kmeans.means

    # 2. Initialize the covariance matrices.
    self.covariances = self._init_covariance(features.shape[-1])

    # 3. Initialize the mixing weights (random, normalized to sum to 1).
    self.mixing_weights = np.random.rand(self.n_clusters)
    self.mixing_weights /= np.sum(self.mixing_weights)

    # print("\nmixing_weights.shape = ", self.mixing_weights.shape)
    # print("\nmeans.shape = ", self.means.shape)
    # print("\ncovariances.shape = ", self.covariances.shape)

    # 4. Compute the log likelihood under the initial random covariances and
    #    KMeans means.
    prev_log_likelihood = -float('inf')
    log_likelihood = self._overall_log_likelihood(features)

    # 5. Run EM until the log likelihood stops changing significantly or
    #    max_iterations is reached.
    n_iter = 0
    while (abs(log_likelihood - prev_log_likelihood) > 1e-4
           and n_iter < self.max_iterations):
        prev_log_likelihood = log_likelihood
        assignments = self._e_step(features)
        self.means, self.covariances, self.mixing_weights = self._m_step(
            features, assignments)
        log_likelihood = self._overall_log_likelihood(features)
        n_iter += 1
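# fit() delegates the per-sample responsibilities to `_e_step`. A minimal sketch of
# what that step typically computes, assuming full covariance matrices and using
# scipy for the Gaussian densities (the helper name comes from the call above; the
# body is an illustrative assumption, not the graded implementation):
import numpy as np
from scipy.stats import multivariate_normal
from scipy.special import logsumexp

def _e_step(self, features):
    # r[n, k] = pi_k * N(x_n | mu_k, Sigma_k) / sum_j pi_j * N(x_n | mu_j, Sigma_j),
    # computed in log space for numerical stability.
    log_probs = np.stack([
        np.log(self.mixing_weights[k])
        + multivariate_normal.logpdf(features, self.means[k], self.covariances[k])
        for k in range(self.n_clusters)
    ], axis=1)                                     # shape (n_samples, n_clusters)
    return np.exp(log_probs - logsumexp(log_probs, axis=1, keepdims=True))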
def test_kmeans_on_generated():
    n_samples = [1000, 10000]
    n_centers = [2]
    stds = [.1]
    n_features = [1, 2, 4]

    for n in n_samples:
        for f in n_features:
            for c in n_centers:
                for s in stds:
                    features, targets = generate_cluster_data(n_samples=n,
                                                              n_features=f,
                                                              n_centers=c,
                                                              cluster_stds=s)
                    # make model and fit
                    model = KMeans(c)
                    model.fit(features)

                    means = model.means
                    orderings = permutations(means)
                    distance_to_true_means = []
                    actual_means = np.array([
                        features[targets == i, :].mean(axis=0)
                        for i in range(targets.max() + 1)
                    ])
                    for ordering in orderings:
                        _means = np.array(list(ordering))
                        distance_to_true_means.append(
                            np.abs(_means - actual_means).sum())
                    assert min(distance_to_true_means) < 1e-1

                    # predict and calculate adjusted mutual info
                    labels = model.predict(features)
                    acc = adjusted_mutual_info(targets, labels)
                    assert acc >= .9
def fit(self, features):
    # Initialize the means with KMeans.
    kmeans = KMeans(self.n_clusters)
    kmeans.fit(features)
    self.means = kmeans.means

    # Initialize the covariances and the mixing weights.
    self.covariances = self._init_covariance(features.shape[-1])
    self.mixing_weights = np.random.rand(self.n_clusters)
    self.mixing_weights /= np.sum(self.mixing_weights)

    # Run EM until the log likelihood stops increasing significantly or
    # max_iterations is reached.
    prev_log_likelihood = -float('inf')
    log_likelihood = self._overall_log_likelihood(features)
    n_iter = 0
    while (log_likelihood - prev_log_likelihood > 1e-4
           and n_iter < self.max_iterations):
        prev_log_likelihood = log_likelihood
        assignments = self._e_step(features)
        self.means, self.covariances, self.mixing_weights = self._m_step(
            features, assignments)
        log_likelihood = self._overall_log_likelihood(features)
        n_iter += 1
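# The convergence check in fit() relies on `_overall_log_likelihood`. A minimal
# sketch under the same full-covariance assumption (the helper name comes from the
# calls above; the body is an illustrative assumption):
import numpy as np
from scipy.stats import multivariate_normal
from scipy.special import logsumexp

def _overall_log_likelihood(self, features):
    # log p(X) = sum_n log sum_k pi_k * N(x_n | mu_k, Sigma_k)
    log_probs = np.stack([
        np.log(self.mixing_weights[k])
        + multivariate_normal.logpdf(features, self.means[k], self.covariances[k])
        for k in range(self.n_clusters)
    ], axis=1)
    return logsumexp(log_probs, axis=1).sum()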
        # First batch of rows starts the array; later batches are stacked below it
        # (the opening `if` of this block is not shown in this excerpt).
        county_data2 = np.array(ip_list)
    else:
        county_data2 = np.vstack((county_data2, np.array(ip_list)))

print(max)

# Impute missing (zero) entries in each column with the column mean over the
# nonzero entries.
for col in range(county_data2.shape[1]):
    nonzero_count = np.nonzero(county_data2[:, col])[0].shape[0]
    avg = np.sum(county_data2[:, col]) / nonzero_count
    for county in range(county_data2.shape[0]):
        if county_data2[county, col] == 0:
            county_data2[county, col] = avg

county_data = np.concatenate(
    (county_data1, county_data2[0:county_data1.shape[0], :]), axis=1)
print(county_data1.shape)
print(county_data2.shape)
print(county_data.shape)

kmeans_learner = KMeans(2)
kmeans_learner.fit(county_data)
labels = kmeans_learner.predict(data)

cluster4 = features[labels == 4, :]
cluster3 = features[labels == 3, :]
cluster2 = features[labels == 2, :]
cluster1 = features[labels == 1, :]
cluster0 = features[labels == 0, :]
cluster4_targets = targets[labels == 4]
cluster3_targets = targets[labels == 3]
cluster2_targets = targets[labels == 2]
cluster1_targets = targets[labels == 1]
cluster0_targets = targets[labels == 0]

cm = plt.get_cmap('jet')
###
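# The nested-loop imputation above fills zero entries with the column mean of the
# nonzero entries. The same idea with numpy masking, as a standalone sketch (the
# function name and the zero-means-missing assumption are illustrative, not part of
# the original code):
import numpy as np

def impute_zeros_with_column_means(data):
    # Replace zero entries with the mean of the nonzero entries in the same column.
    data = data.astype(float).copy()
    for col in range(data.shape[1]):
        nonzero = data[:, col] != 0
        if nonzero.any():
            data[~nonzero, col] = data[nonzero, col].mean()
    return data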