Example #1
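All snippets below assume NumPy and itertools; generate_cluster_data, KMeans, GMM, and adjusted_mutual_info are project-local helpers whose import paths are not shown in the original source (adjusted_mutual_info presumably wraps sklearn.metrics.adjusted_mutual_info_score):

import numpy as np
from itertools import permutations

# Project-local helpers; the exact module path is not given in these
# snippets, so the import below is a placeholder.
# from code import generate_cluster_data, KMeans, GMM, adjusted_mutual_info
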
def _test_gmm_parameters(covariance_type):
    n_samples = [1000]
    n_centers = [2]
    stds = [.1, .5]
    n_features = [2, 4]

    for n in n_samples:
        for f in n_features:
            for c in n_centers:
                for s in stds:
                    features, targets = generate_cluster_data(n_samples=n,
                                                              n_features=f,
                                                              n_centers=c,
                                                              cluster_stds=s)
                    # make model and fit
                    model = GMM(c, covariance_type=covariance_type)
                    model.fit(features)
                    covariances = model.covariances
                    for cov in covariances:
                        assert (np.abs(np.sqrt(cov) - s).mean() < 1e-1)

                    means = model.means
                    orderings = permutations(means)
                    distance_to_true_means = []

                    actual_means = np.array([
                        features[targets == i, :].mean(axis=0)
                        for i in range(targets.max() + 1)
                    ])

                    for ordering in orderings:
                        _means = np.array(list(ordering))

                        distance_to_true_means.append(
                            np.abs(_means - actual_means).sum())
                    assert (min(distance_to_true_means) < 1e-1)

                    mixing_weights = model.mixing_weights
                    orderings = permutations(mixing_weights)
                    distance_to_true_mixing_weights = []

                    actual_mixing_weights = np.array([
                        features[targets == i, :].shape[0]
                        for i in range(targets.max() + 1)
                    ])
                    actual_mixing_weights = (actual_mixing_weights /
                                             actual_mixing_weights.sum())

                    for ordering in orderings:
                        _mixing_weights = np.array(list(ordering))

                        distance_to_true_mixing_weights.append(
                            np.abs(_mixing_weights -
                                   actual_mixing_weights).sum())
                    assert (min(distance_to_true_mixing_weights) < 1e-1)

                    # predict and calculate adjusted mutual info
                    labels = model.predict(features)
                    acc = adjusted_mutual_info(targets, labels)
                    assert (acc >= .9)
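The permutation loops above guard against label switching: a clustering model can recover the correct parameters in any order, so the test compares against every reordering and keeps the closest match. Factored out, the idea looks like this (a sketch; min_permutation_distance is not a helper from the original code):

import numpy as np
from itertools import permutations

def min_permutation_distance(estimated, actual):
    # Cluster labels are arbitrary, so try every assignment of estimated
    # parameters to true parameters and keep the smallest total L1 error.
    return min(
        np.abs(np.array(ordering) - np.asarray(actual)).sum()
        for ordering in permutations(estimated)
    )

With this helper, each check reduces to assert min_permutation_distance(model.means, actual_means) < 1e-1. For more than a handful of clusters, scipy.optimize.linear_sum_assignment would avoid the factorial blow-up.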
Example #2
def test_kmeans_spec():
    features, targets = generate_cluster_data(n_samples=100,
                                              n_features=2,
                                              n_centers=2,
                                              cluster_stds=.1)
    model = KMeans(2)
    model.fit(features)
    assert (hasattr(model, 'means'))
Example #3
def test_kmeans_spec():
    features, targets = generate_cluster_data(n_samples=100,
                                              n_features=2,
                                              n_centers=2,
                                              cluster_stds=.1)
    model = KMeans(2)
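    # Save and restore the global RNG state so the model's random() calls
    # cannot perturb data generation in later tests (see Example #5).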
    rng_state = np.random.get_state()
    model.fit(features)
    np.random.set_state(rng_state)
    assert (hasattr(model, 'means'))
Example #4
def test_gmm_spec():
    features, targets = generate_cluster_data(n_samples=100,
                                              n_features=2,
                                              n_centers=2,
                                              cluster_stds=.1)
    gmm = GMM(2, 'spherical')
    gmm.fit(features)

    assert (hasattr(gmm, 'means'))
    assert (hasattr(gmm, 'covariances'))
    assert (hasattr(gmm, 'mixing_weights'))
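test_gmm_spec only checks that a fitted model exposes the three parameter attributes. A minimal skeleton that would satisfy it might look like this (hypothetical; the real assignment expects these attributes to be estimated with EM):

import numpy as np

class GMM:
    # Hypothetical skeleton: fit() must leave means, covariances, and
    # mixing_weights on the model for the spec test above to pass.
    def __init__(self, n_clusters, covariance_type='spherical'):
        self.n_clusters = n_clusters
        self.covariance_type = covariance_type

    def fit(self, features):
        n_samples, n_features = features.shape
        idx = np.random.choice(n_samples, self.n_clusters, replace=False)
        self.means = features[idx]
        self.covariances = np.ones((self.n_clusters, n_features))
        self.mixing_weights = np.full(self.n_clusters, 1.0 / self.n_clusters)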
Example #5
def test_kmeans_on_generated():
    n_samples = [1000, 10000]
    n_centers = [2]
    stds = [.1]
    n_features = [1, 2, 4]

    for n in n_samples:
        for f in n_features:
            for c in n_centers:
                for s in stds:
                    features, targets = generate_cluster_data(n_samples=n,
                                                              n_features=f,
                                                              n_centers=c,
                                                              cluster_stds=s)
                    # make model and fit
                    model = KMeans(c)

                    # Depending on how many random() calls the student code
                    # makes, it can mess with the random state used to generate
                    # data for subsequent tests and lead to an "impossible"
                    # input distribution that can't achieve the desired
                    # performance.  To avoid this, we save and restore the
                    # random state so the student code can't interfere with it.
                    rng_state = np.random.get_state()
                    model.fit(features)
                    np.random.set_state(rng_state)

                    means = model.means
                    orderings = permutations(means)
                    distance_to_true_means = []

                    actual_means = np.array([
                        features[targets == i, :].mean(axis=0)
                        for i in range(targets.max() + 1)
                    ])

                    for ordering in orderings:
                        _means = np.array(list(ordering))

                        distance_to_true_means.append(
                            np.abs(_means - actual_means).sum())

                    assert (min(distance_to_true_means) < 1e-1)

                    # predict and calculate adjusted mutual info
                    labels = model.predict(features)
                    acc = adjusted_mutual_info(targets, labels)
                    assert (acc >= .9)
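The save/restore trick described in the comment generalizes to a small wrapper; a minimal sketch (run_with_isolated_rng is not part of the original tests):

import numpy as np

def run_with_isolated_rng(fn, *args, **kwargs):
    # Snapshot the global NumPy RNG state, run the (possibly RNG-hungry)
    # code under test, then restore the state so later draws are unaffected.
    state = np.random.get_state()
    try:
        return fn(*args, **kwargs)
    finally:
        np.random.set_state(state)

Used here, the fit call would read run_with_isolated_rng(model.fit, features). Newer NumPy code sidesteps the problem entirely by passing an explicit np.random.default_rng() Generator instead of relying on global state.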
Example #6
def test_generate_cluster_data():
    n_samples = [2000, 20000]
    n_centers = [1, 2]
    stds = [.1, .5, 1.0, 2.0]
    n_features = [1, 2, 4]

    for n in n_samples:
        for f in n_features:
            for c in n_centers:
                for s in stds:
                    X, y = generate_cluster_data(n_samples=n,
                                                 n_features=f,
                                                 n_centers=c,
                                                 cluster_stds=s)

                    assert (X.shape == (n, f))
                    assert (y.max() == c - 1)

                    # check the spread of every cluster (labels run 0..c-1)
                    for i in range(y.max() + 1):
                        subset = X[y == i]
                        assert (np.abs(np.std(subset, axis=0) - s).mean() < s)
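The assertions above pin down the contract of generate_cluster_data: X has shape (n_samples, n_features), labels run from 0 to n_centers - 1, and each cluster's per-feature standard deviation lands near cluster_stds. A sketch consistent with that contract (not the original implementation):

import numpy as np

def generate_cluster_data(n_samples, n_features, n_centers, cluster_stds):
    # Sketch only: isotropic Gaussian blobs with a shared std around
    # randomly placed centers. Round-robin labels guarantee every cluster
    # id in [0, n_centers) appears, so y.max() == n_centers - 1 holds.
    centers = np.random.uniform(-10.0, 10.0, size=(n_centers, n_features))
    y = np.arange(n_samples) % n_centers
    X = centers[y] + np.random.normal(0.0, cluster_stds,
                                      size=(n_samples, n_features))
    return X, y

sklearn.datasets.make_blobs offers the same behavior off the shelf.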
Example #7
def test_kmeans_on_generated():
    n_samples = [1000, 10000]
    n_centers = [2]
    stds = [.1]
    n_features = [1, 2, 4]

    for n in n_samples:
        for f in n_features:
            for c in n_centers:
                for s in stds:
                    features, targets = generate_cluster_data(n_samples=n,
                                                              n_features=f,
                                                              n_centers=c,
                                                              cluster_stds=s)
                    # make model and fit
                    model = KMeans(c)
                    model.fit(features)

                    means = model.means
                    orderings = permutations(means)
                    distance_to_true_means = []

                    actual_means = np.array([
                        features[targets == i, :].mean(axis=0)
                        for i in range(targets.max() + 1)
                    ])

                    for ordering in orderings:
                        _means = np.array(list(ordering))

                        distance_to_true_means.append(
                            np.abs(_means - actual_means).sum())

                    assert (min(distance_to_true_means) < 1e-1)

                    # predict and calculate adjusted mutual info
                    labels = model.predict(features)
                    acc = adjusted_mutual_info(targets, labels)
                    assert (acc >= .9)
Example #8
def _test_gmm_parameters(covariance_type):
    n_samples = [1000]
    n_centers = [2]
    stds = [.1, .5]
    n_features = [2, 4]

    for n in n_samples:
        for f in n_features:
            for c in n_centers:
                for s in stds:
                    features, targets = generate_cluster_data(n_samples=n,
                                                              n_features=f,
                                                              n_centers=c,
                                                              cluster_stds=s)
                    # make model and fit
                    model = GMM(c, covariance_type=covariance_type)
                    model.fit(features)
                    covariances = model.covariances
                    for cov in covariances:
                        print("mean cov: ", np.abs(np.sqrt(cov) - s).mean())
                        if (np.abs(np.sqrt(cov) - s).mean() < 1e-1):
                            return