Example #1
def test_kmeans_spec():
    features, targets = generate_cluster_data(n_samples=100,
                                              n_features=2,
                                              n_centers=2,
                                              cluster_stds=.1)
    model = KMeans(2)
    model.fit(features)
    assert hasattr(model, 'means')
Example #2
    def fit(self, features):
        """
        Fit a GMM to the given data using `self.n_clusters` Gaussians.
        Features can have more than two dimensions.

        Args:
            features (np.ndarray): array containing inputs of size
                (n_samples, n_features).
        Returns:
            None (saves the model's means, covariances, and mixing weights internally)
        """
        # 1. Use your KMeans implementation to initialize the means of the GMM.
        kmeans = KMeans(self.n_clusters)
        kmeans.fit(features)
        self.means = kmeans.means

        # 2. Initialize the covariance matrix and the mixing weights
        self.covariances = self._init_covariance(features.shape[-1])

        # 3. Initialize the mixing weights
        self.mixing_weights = np.random.rand(self.n_clusters)
        self.mixing_weights /= np.sum(self.mixing_weights)

        # 4. Compute log_likelihood under initial random covariance and KMeans means.
        prev_log_likelihood = -float('inf')
        log_likelihood = self._overall_log_likelihood(features)

        # 5. Run EM while the log_likelihood is still increasing significantly and
        # max_iterations has not been reached.
        n_iter = 0
        while log_likelihood - prev_log_likelihood > 1e-4 and n_iter < self.max_iterations:
            prev_log_likelihood = log_likelihood

            assignments = self._e_step(features)
            self.means, self.covariances, self.mixing_weights = (self._m_step(
                features, assignments))

            log_likelihood = self._overall_log_likelihood(features)
            n_iter += 1

        # During EM, self.covariances holds full matrices; reshape it to the
        # representation implied by covariance_type once EM has converged.
        if self.covariance_type == 'spherical':
            # self.covariances is assumed to be a 1-D array of variances
            final_covariances = np.zeros(self.n_clusters)

            for index, covariance in enumerate(self.covariances):
                final_covariances[index] = np.mean(np.diagonal(covariance))

            self.covariances = final_covariances

        elif self.covariance_type == 'diagonal':
            # self.covariances is assumed to be a 2-D array
            final_covariances = np.zeros((self.n_clusters, features.shape[-1]))

            for index, covariance in enumerate(self.covariances):
                final_covariances[index] = np.diagonal(covariance)

            self.covariances = final_covariances
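The fit above calls a `_init_covariance` helper that is not shown. A minimal sketch consistent with the reshaping logic at the end of the method, which reads full matrices and collapses them to spherical variances or diagonal vectors, would give each cluster a full identity covariance to start; this is an assumption for illustration, not the assignment's reference code.

import numpy as np

def _init_covariance(self, n_dim):
    # Assumed sketch: one full (n_dim, n_dim) identity covariance per cluster,
    # matching the (n_clusters, n_dim, n_dim) layout the EM loop above expects.
    return np.array([np.eye(n_dim) for _ in range(self.n_clusters)])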
Example #3
def test_kmeans_spec():
    features, targets = generate_cluster_data(n_samples=100,
                                              n_features=2,
                                              n_centers=2,
                                              cluster_stds=.1)
    model = KMeans(2)
    rng_state = np.random.get_state()
    model.fit(features)
    np.random.set_state(rng_state)
    assert hasattr(model, 'means')
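Both spec tests only require that `fit` exposes a `means` attribute. A hypothetical minimal `KMeans` that would satisfy them (and the generated-data tests below) might look as follows; the constructor signature matches the tests, but the initialization strategy, the `max_iterations` default, and the stopping rule are assumptions, not the assignment's reference implementation.

import numpy as np

class KMeans:
    def __init__(self, n_clusters, max_iterations=100):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations  # assumed default, not from the tests

    def fit(self, features):
        # Initialize means by sampling distinct data points, then run Lloyd's
        # algorithm: assign points to the nearest mean, recompute means, repeat.
        idx = np.random.choice(len(features), self.n_clusters, replace=False)
        self.means = features[idx].astype(float)
        for _ in range(self.max_iterations):
            labels = self.predict(features)
            new_means = np.array([
                features[labels == k].mean(axis=0)
                if np.any(labels == k) else self.means[k]  # keep empty clusters put
                for k in range(self.n_clusters)
            ])
            if np.allclose(new_means, self.means):
                break
            self.means = new_means

    def predict(self, features):
        # Label each sample with the index of its nearest mean.
        distances = np.linalg.norm(
            features[:, None, :] - self.means[None, :, :], axis=-1)
        return distances.argmin(axis=1)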
Example #4
def test_kmeans_on_generated():
    n_samples = [1000, 10000]
    n_centers = [2]
    stds = [.1]
    n_features = [1, 2, 4]

    for n in n_samples:
        for f in n_features:
            for c in n_centers:
                for s in stds:
                    features, targets = generate_cluster_data(n_samples=n,
                                                              n_features=f,
                                                              n_centers=c,
                                                              cluster_stds=s)
                    # make model and fit
                    model = KMeans(c)

                    # Depending on how many random() calls the student code
                    # makes, it can mess with the random state used to generate
                    # data for subsequent tests and lead to an "impossible"
                    # input distribution that can't achieve the desired
                    # performance.  To avoid this, we save and restore the
                    # random state so the student code can't interfere with it.
                    rng_state = np.random.get_state()
                    model.fit(features)
                    np.random.set_state(rng_state)

                    means = model.means
                    orderings = permutations(means)
                    distance_to_true_means = []

                    actual_means = np.array([
                        features[targets == i, :].mean(axis=0)
                        for i in range(targets.max() + 1)
                    ])

                    for ordering in orderings:
                        _means = np.array(list(ordering))

                        distance_to_true_means.append(
                            np.abs(_means - actual_means).sum())

                    assert min(distance_to_true_means) < 1e-1

                    # predict and calculate adjusted mutual info
                    labels = model.predict(features)
                    acc = adjusted_mutual_info(targets, labels)
                    assert acc >= 0.9
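The brute-force search over `permutations(means)` is O(k!) in the number of clusters, which is fine for the k = 2 used here. For larger k, the same minimum-cost matching between learned and true means can be computed in polynomial time with the Hungarian algorithm; a sketch, assuming SciPy is available (`min_matching_distance` is a name introduced here):

import numpy as np
from scipy.optimize import linear_sum_assignment

def min_matching_distance(means, actual_means):
    # cost[i, j] = L1 distance between learned mean i and true mean j.
    cost = np.abs(means[:, None, :] - actual_means[None, :, :]).sum(axis=-1)
    rows, cols = linear_sum_assignment(cost)  # optimal one-to-one matching
    return cost[rows, cols].sum()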
Example #5
    def fit(self, features):
        """
        Fit a GMM to the given data using `self.n_clusters` Gaussians.
        Features can have more than two dimensions.

        Args:
            features (np.ndarray): array containing inputs of size
                (n_samples, n_features).
        Returns:
            None (saves the model's means, covariances, and mixing weights internally)
        """
        # 1. Use your KMeans implementation to initialize the means of the GMM.
        kmeans = KMeans(self.n_clusters)
        kmeans.fit(features)
        self.means = kmeans.means

        # 2. Initialize the covariance matrix and the mixing weights
        self.covariances = self._init_covariance(features.shape[-1])

        # 3. Initialize the mixing weights
        self.mixing_weights = np.random.rand(self.n_clusters)
        self.mixing_weights /= np.sum(self.mixing_weights)

        # 4. Compute log_likelihood under initial random covariance and KMeans means.
        prev_log_likelihood = -float('inf')
        log_likelihood = self._overall_log_likelihood(features)

        # 5. Run EM while the log_likelihood is still changing significantly and
        # max_iterations has not been reached.
        n_iter = 0
        while abs(log_likelihood -
                  prev_log_likelihood) > 1e-4 and n_iter < self.max_iterations:
            prev_log_likelihood = log_likelihood

            assignments = self._e_step(features)
            self.means, self.covariances, self.mixing_weights = (self._m_step(
                features, assignments))

            log_likelihood = self._overall_log_likelihood(features)
            n_iter += 1
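The `_e_step` used above is not shown in this example. A plausible sketch, assuming the full-covariance representation and SciPy's `multivariate_normal` (the assignment's actual helper may be implemented differently): it computes each sample's posterior responsibility for each Gaussian, working in log space for numerical stability.

import numpy as np
from scipy.stats import multivariate_normal

def _e_step(self, features):
    # Joint log-probability of each sample under each weighted component.
    log_probs = np.stack([
        np.log(self.mixing_weights[k])
        + multivariate_normal.logpdf(features, self.means[k], self.covariances[k])
        for k in range(self.n_clusters)
    ], axis=1)                                    # (n_samples, n_clusters)
    # Normalize in log space so each row of responsibilities sums to 1.
    log_norm = np.logaddexp.reduce(log_probs, axis=1, keepdims=True)
    return np.exp(log_probs - log_norm)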
Example #6
def test_kmeans_on_generated():
    n_samples = [1000, 10000]
    n_centers = [2]
    stds = [.1]
    n_features = [1, 2, 4]

    for n in n_samples:
        for f in n_features:
            for c in n_centers:
                for s in stds:
                    features, targets = generate_cluster_data(n_samples=n,
                                                              n_features=f,
                                                              n_centers=c,
                                                              cluster_stds=s)
                    # make model and fit
                    model = KMeans(c)
                    model.fit(features)

                    means = model.means
                    orderings = permutations(means)
                    distance_to_true_means = []

                    actual_means = np.array([
                        features[targets == i, :].mean(axis=0)
                        for i in range(targets.max() + 1)
                    ])

                    for ordering in orderings:
                        _means = np.array(list(ordering))

                        distance_to_true_means.append(
                            np.abs(_means - actual_means).sum())

                    assert min(distance_to_true_means) < 1e-1

                    # predict and calculate adjusted mutual info
                    labels = model.predict(features)
                    acc = adjusted_mutual_info(targets, labels)
                    assert acc >= 0.9
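Unlike Example #4, this variant calls `model.fit` without saving the global NumPy random state, so a student implementation that consumes random numbers can perturb the data generated for later tests. The save/restore pattern inlined in Examples #3 and #4 could be factored into a small context manager; a sketch (`preserved_random_state` is a name introduced here):

from contextlib import contextmanager
import numpy as np

@contextmanager
def preserved_random_state():
    # Snapshot the global NumPy RNG state and restore it afterwards, so code
    # run inside the block cannot affect random numbers drawn later.
    state = np.random.get_state()
    try:
        yield
    finally:
        np.random.set_state(state)

# Usage: with preserved_random_state(): model.fit(features)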
Example #7
    def fit(self, features):
        # Initialize the GMM means with KMeans cluster centers, then refine
        # all parameters with EM.
        kmeans = KMeans(self.n_clusters)
        kmeans.fit(features)
        self.means = kmeans.means

        self.covariances = self._init_covariance(features.shape[-1])

        # Random mixing weights, normalized to sum to 1.
        self.mixing_weights = np.random.rand(self.n_clusters)
        self.mixing_weights /= np.sum(self.mixing_weights)

        prev_log_likelihood = -float('inf')
        log_likelihood = self._overall_log_likelihood(features)

        # Iterate EM until the log-likelihood gain drops below 1e-4 or the
        # iteration cap is reached.
        n_iter = 0
        while log_likelihood - prev_log_likelihood > 1e-4 and n_iter < self.max_iterations:
            prev_log_likelihood = log_likelihood

            assignments = self._e_step(features)
            self.means, self.covariances, self.mixing_weights = (
                self._m_step(features, assignments)
            )

            log_likelihood = self._overall_log_likelihood(features)
            n_iter += 1
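As with the E-step, the `_m_step` called in the loop is not shown. A sketch of the standard closed-form GMM updates, assuming `assignments` holds responsibilities of shape (n_samples, n_clusters) and covariances are kept as full matrices (again an assumption, not the assignment's reference code):

import numpy as np

def _m_step(self, features, assignments):
    n_k = assignments.sum(axis=0)                      # effective cluster sizes
    means = (assignments.T @ features) / n_k[:, None]
    n_dim = features.shape[-1]
    covariances = np.empty((self.n_clusters, n_dim, n_dim))
    for k in range(self.n_clusters):
        centered = features - means[k]
        # Responsibility-weighted outer products, averaged per cluster.
        covariances[k] = (assignments[:, k, None] * centered).T @ centered / n_k[k]
    mixing_weights = n_k / len(features)
    return means, covariances, mixing_weights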
Example #8
            county_data2 = np.array(ip_list)
        else:
            county_data2 = np.vstack((county_data2, np.array(ip_list)))
print(max)
for col in range(county_data2.shape[1]):
    # Mean of the nonzero entries; zeros are treated as missing values
    # and imputed with this column average below.
    avg = np.sum(county_data2[:, col]) / np.count_nonzero(county_data2[:, col])
    for county in range(county_data2.shape[0]):
        if county_data2[county, col] == 0:
            county_data2[county, col] = avg
county_data = np.concatenate((county_data1, county_data2[0:county_data1.shape[0],:]), axis=1)
print(county_data1.shape)
print(county_data2.shape)
print(county_data.shape)


kmeans_learner = KMeans(5)  # five clusters (labels 0-4) are indexed below
kmeans_learner.fit(county_data)
labels = kmeans_learner.predict(county_data)
cluster4 = features[labels == 4,:]
cluster3 = features[labels == 3,:]
cluster2 = features[labels == 2,:]
cluster1 = features[labels == 1,:]
cluster0 = features[labels == 0,:]
cluster4_targets = targets[labels == 4]
cluster3_targets = targets[labels == 3]
cluster2_targets = targets[labels == 2]
cluster1_targets = targets[labels == 1]
cluster0_targets = targets[labels == 0]

cm = plt.get_cmap('jet')
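The zero-imputation loop at the start of this example iterates over every cell in pure Python. An equivalent vectorized version, sketched here with a function name introduced for illustration, replaces each zero with the mean of its column's nonzero entries in one pass (assuming every column has at least one nonzero entry):

import numpy as np

def impute_zeros_with_column_means(data):
    column_means = data.sum(axis=0) / np.count_nonzero(data, axis=0)
    # Broadcast: wherever an entry is zero, substitute its column's mean.
    return np.where(data == 0, column_means, data)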