def test_kmeans_breastcancer(self):
        # Run with:
        # python -m unittest tests_clustering.Tests_Clustering.test_kmeans_breastcancer
        #
        # Fits k-means with scikit-learn and with discomll on the breast
        # cancer data and checks that both produce the same centroids.
        from discomll.clustering import kmeans
        from sklearn.cluster import KMeans

        # Settings shared by both implementations.
        n_clusters, iterations, seed = 2, 10, 2

        sk_train, _, sk_test, _ = datasets.breastcancer_disc()
        disco_train, disco_test = datasets.breastcancer_disc_discomll()

        # scikit-learn reference run.
        estimator = KMeans(n_clusters=n_clusters,
                           max_iter=iterations,
                           n_init=1,
                           random_state=seed)
        sk_model = estimator.fit(sk_train)
        expected_centroids = sk_model.cluster_centers_
        # The predicted labels are not compared; only the call is exercised.
        sk_model.predict(sk_test)

        # discomll run with the same settings.
        fit_result = kmeans.fit(disco_train,
                                n_clusters=n_clusters,
                                max_iterations=iterations,
                                random_state=seed)
        predict_result = kmeans.predict(disco_test, fit_result)

        # Read every prediction record; the labels themselves go unchecked.
        disco_labels = [record[0] for _, record in result_iterator(predict_result)]
        actual_centroids = [
            record["x"]
            for _, record in result_iterator(fit_result["kmeans_fitmodel"])
        ]

        # Exchange the two discomll centroids before comparing
        # (presumably to line up with scikit-learn's cluster order -- confirm).
        actual_centroids[0], actual_centroids[1] = actual_centroids[1], actual_centroids[0]

        centroids_match = np.allclose(expected_centroids, actual_centroids)
        self.assertTrue(centroids_match)
    def test_kmeans_iris(self):
        # Run with:
        # python -m unittest tests_clustering.Tests_Clustering.test_kmeans_iris
        #
        # Fits k-means with scikit-learn and with discomll on the iris data
        # and checks that both produce the same centroids.
        from discomll.clustering import kmeans
        from sklearn.cluster import KMeans

        # Settings shared by both implementations.
        n_clusters, iterations, seed = 3, 10, 0

        sk_train, _, _, _ = datasets.iris()
        disco_train, disco_test = datasets.iris_discomll()

        # scikit-learn reference run.
        estimator = KMeans(n_clusters=n_clusters,
                           max_iter=iterations,
                           n_init=1,
                           random_state=seed)
        expected_centroids = estimator.fit(sk_train).cluster_centers_

        # discomll run with the same settings.
        fit_result = kmeans.fit(disco_train,
                                n_clusters=n_clusters,
                                max_iterations=iterations,
                                random_state=seed)

        # Prediction is only exercised here; its output is never inspected.
        kmeans.predict(disco_test, fit_result)

        actual_centroids = [
            record["x"]
            for _, record in result_iterator(fit_result["kmeans_fitmodel"])
        ]

        # Exchange the first and last discomll centroids before comparing
        # (presumably to line up with scikit-learn's cluster order -- confirm).
        actual_centroids[0], actual_centroids[2] = actual_centroids[2], actual_centroids[0]

        centroids_match = np.allclose(expected_centroids, actual_centroids)
        self.assertTrue(centroids_match)
    def test_kmeans_breastcancer(self):
        # Run with:
        # python -m unittest tests_clustering.Tests_Clustering.test_kmeans_breastcancer
        #
        # Cross-checks discomll k-means against scikit-learn KMeans on the
        # breast cancer data: both must arrive at the same centroids.
        from discomll.clustering import kmeans
        from sklearn.cluster import KMeans

        seed = 2
        k = 2
        iteration_cap = 10

        reference_train, _, reference_test, _ = datasets.breastcancer_disc()
        train_data, test_data = datasets.breastcancer_disc_discomll()

        # Reference model from scikit-learn.
        reference = KMeans(n_clusters=k, max_iter=iteration_cap, n_init=1,
                           random_state=seed).fit(reference_train)
        theirs = reference.cluster_centers_
        # Labels from the reference model are computed but not asserted on.
        reference_labels = reference.predict(reference_test)

        # Model under test from discomll.
        model_urls = kmeans.fit(train_data, n_clusters=k,
                                max_iterations=iteration_cap,
                                random_state=seed)
        label_urls = kmeans.predict(test_data, model_urls)

        # Labels from discomll are read in full but not asserted on.
        our_labels = [value[0] for _, value in result_iterator(label_urls)]
        ours = [value["x"]
                for _, value in result_iterator(model_urls["kmeans_fitmodel"])]

        # Swap the two discomll centroids ahead of the comparison
        # (presumably the cluster order differs from scikit-learn -- confirm).
        first, second = ours[0], ours[1]
        ours[0], ours[1] = second, first

        self.assertTrue(np.allclose(theirs, ours))
    def test_kmeans_iris(self):
        # Run with:
        # python -m unittest tests_clustering.Tests_Clustering.test_kmeans_iris
        #
        # Cross-checks discomll k-means against scikit-learn KMeans on the
        # iris data: both must arrive at the same centroids.
        from discomll.clustering import kmeans
        from sklearn.cluster import KMeans

        seed = 0
        k = 3
        iteration_cap = 10

        reference_train, _, _, _ = datasets.iris()
        train_data, test_data = datasets.iris_discomll()

        # Reference centroids from scikit-learn.
        reference = KMeans(n_clusters=k, max_iter=iteration_cap, n_init=1,
                           random_state=seed).fit(reference_train)
        theirs = reference.cluster_centers_

        # Centroids under test from discomll.
        model_urls = kmeans.fit(train_data, n_clusters=k,
                                max_iterations=iteration_cap,
                                random_state=seed)

        # Prediction is run but its output is not read or asserted on.
        label_urls = kmeans.predict(test_data, model_urls)

        ours = [value["x"]
                for _, value in result_iterator(model_urls["kmeans_fitmodel"])]

        # Swap the first and third discomll centroids ahead of the comparison
        # (presumably the cluster order differs from scikit-learn -- confirm).
        ours[0], ours[2] = ours[2], ours[0]

        self.assertTrue(np.allclose(theirs, ours))
Exemple #5
0
def kmeans_fit(input_dict):
    """Fit a discomll k-means model from the settings in ``input_dict``.

    Reads ``input_dict["dataset"]``, ``input_dict["clusters"]`` and
    ``input_dict["itr"]`` and hands them to ``kmeans.fit`` as the data,
    ``n_clusters`` and ``max_iterations``. ``save_results`` is always True.

    Returns a dict whose single key ``"fitmodel_url"`` holds whatever
    ``kmeans.fit`` returned.
    """
    from discomll.clustering import kmeans

    data = input_dict["dataset"]
    fit_options = {
        "n_clusters": input_dict["clusters"],
        "max_iterations": input_dict["itr"],
        "save_results": True,
    }
    model_url = kmeans.fit(data, **fit_options)
    return {"fitmodel_url": model_url}
Exemple #6
0
def kmeans_fit(input_dict):
    """Run discomll ``kmeans.fit`` with parameters taken from ``input_dict``.

    Keys read: ``"dataset"`` (passed as the data argument), ``"clusters"``
    (passed as ``n_clusters``) and ``"itr"`` (passed as ``max_iterations``).
    The fit is always invoked with ``save_results=True``.

    Returns ``{"fitmodel_url": <result of kmeans.fit>}``.
    """
    from discomll.clustering import kmeans

    data = input_dict["dataset"]
    cluster_count = input_dict["clusters"]
    iteration_cap = input_dict["itr"]

    return {
        "fitmodel_url": kmeans.fit(data,
                                   n_clusters=cluster_count,
                                   max_iterations=iteration_cap,
                                   save_results=True),
    }
Exemple #7
0
def kmeans_fit(input_dict):
    """Fit a discomll k-means model, with an optional random seed.

    Reads ``input_dict["seed"]``: the string ``"None"`` means no seed
    (``random_state=None``); any other value is converted with ``int``.
    ``input_dict["dataset"]``, ``input_dict["clusters"]`` and
    ``input_dict["itr"]`` are passed to ``kmeans.fit`` as the data,
    ``n_clusters`` and ``max_iterations``. ``save_results`` is always True.

    Returns a dict whose single key ``"fitmodel_url"`` holds whatever
    ``kmeans.fit`` returned.
    """
    from discomll.clustering import kmeans

    # The seed is compared against the literal string "None".
    seed = input_dict["seed"]
    if seed == "None":
        random_state = None
    else:
        random_state = int(seed)

    data = input_dict["dataset"]
    fit_options = {
        "n_clusters": input_dict["clusters"],
        "max_iterations": input_dict["itr"],
        "random_state": random_state,
        "save_results": True,
    }
    model_url = kmeans.fit(data, **fit_options)
    return {"fitmodel_url": model_url}
Exemple #8
0
def kmeans_fit(input_dict):
    """Run discomll ``kmeans.fit`` with parameters taken from ``input_dict``.

    Keys read: ``"seed"`` (the string ``"None"`` yields ``random_state=None``,
    anything else goes through ``int``), ``"dataset"`` (the data argument),
    ``"clusters"`` (``n_clusters``) and ``"itr"`` (``max_iterations``).
    The fit is always invoked with ``save_results=True``.

    Returns ``{"fitmodel_url": <result of kmeans.fit>}``.
    """
    from discomll.clustering import kmeans

    raw_seed = input_dict["seed"]
    # "None" is the textual marker for "no fixed seed".
    random_state = int(raw_seed) if raw_seed != "None" else None

    data = input_dict["dataset"]
    cluster_count = input_dict["clusters"]
    iteration_cap = input_dict["itr"]

    return {
        "fitmodel_url": kmeans.fit(data,
                                   n_clusters=cluster_count,
                                   max_iterations=iteration_cap,
                                   random_state=random_state,
                                   save_results=True),
    }
Exemple #9
0
from discomll.utils import model_view


# define training dataset
train = dataset.Data(data_tag=["test:breast_cancer_cont"],
                     data_type="chunk",  # define data source - chunk data on ddfs
                     X_indices=xrange(0, 9),  # define attribute indices
                     y_index=9,  # define class index
                     delimiter=",")

# define test dataset
test = dataset.Data(data_tag=["test:breast_cancer_cont_test"],
                    data_type="chunk",  # define data source - chunk data on ddfs
                    X_indices=xrange(0, 9),  # define attribute indices
                    y_index=9,  # define class index
                    delimiter=",")

# fit model on training dataset
fit_model = kmeans.fit(train, n_clusters=2, max_iterations=5, random_state=0)

# output model
model = model_view.output_model(fit_model)
print model

# predict test dataset
predictions = kmeans.predict(test, fit_model)

# output results
for k, v in result_iterator(predictions):
    print k, v
from discomll import dataset
from discomll.clustering import kmeans

train = dataset.Data(data_tag=[
    "http://ropot.ijs.si/data/linear/train/xaaaaa.gz",
    "http://ropot.ijs.si/data/linear/train/xaaabj.gz"
],
                     data_type="gzip",
                     generate_urls=True,
                     X_indices=range(1, 22),
                     id_index=0,
                     delimiter=",")

test = dataset.Data(data_tag=[
    "http://ropot.ijs.si/data/linear/test/xaaaaa.gz",
    "http://ropot.ijs.si/data/linear/test/xaaabj.gz"
],
                    data_type="gzip",
                    generate_urls=True,
                    X_indices=range(1, 22),
                    id_index=0,
                    delimiter=",")

fit_model = kmeans.fit(train, n_clusters=5, max_iterations=10, random_state=0)
predictions = kmeans.predict(test, fit_model)
print predictions