Beispiel #1
0
def main(method='plusPlusDense'):
    """Run distributed (SPMD) k-means on this process's slice of the sample data.

    The CSV is read in full by every process and then cut into equally sized
    row slices, one per process. Rows beyond ``rows_per_proc * num_procs``
    are not assigned to any process.

    Parameters
    ----------
    method : str
        Initialization method forwarded to ``d4p.kmeans_init``.

    Returns
    -------
    tuple
        ``(assignments, result)``: the cluster assignments of the local rows
        and the distributed k-means result object.
    """
    csv_path = "./data/distributed/kmeans_dense.csv"
    n_clusters = 10
    max_iterations = 25

    # Set up the distributed centroid initializer before touching the data.
    seeder = d4p.kmeans_init(n_clusters, method=method, distributed=True)

    # Every process loads the whole table and keeps only its own row range.
    full_table = loadtxt(csv_path, delimiter=',')
    rows_per_proc = int(full_table.shape[0] / d4p.num_procs())
    first_row = rows_per_proc * d4p.my_procid()
    data = full_table[first_row:first_row + rows_per_proc, :]

    # Initial centroids: one row per requested cluster.
    seeds = seeder.compute(data)
    assert seeds.centroids.shape[0] == n_clusters

    # Distributed clustering pass.
    result = d4p.kmeans(n_clusters, max_iterations,
                        distributed=True).compute(data, seeds.centroids)
    assert result.centroids.shape[0] == n_clusters
    assert result.nIterations <= max_iterations

    # Assignments are obtained with a second, non-distributed run of zero
    # iterations over the local rows only (assignFlag is not directly
    # supported in SPMD mode yet).
    labeller = d4p.kmeans(n_clusters, 0, assignFlag=True)
    assignments = labeller.compute(data, result.centroids).assignments

    return (assignments, result)
Beispiel #2
0
        def test_kmeans_spmd(self):
            """Check that SPMD k-means agrees with batch k-means.

            For each initialization method the initial and the final centroids
            of the distributed run are compared with the batch run. For
            'parallelPlusDense' only a warning is printed instead of asserting.
            """
            n_clusters = 10
            max_iterations = 25
            mismatch_warning = ("Warning: It is well known that results of "
                                "parallelPlusDense init does not match with "
                                "batch algorithm")

            full_data = np.loadtxt("./data/distributed/kmeans_dense.csv",
                                   delimiter=',')

            # Equal-sized row slice owned by this process.
            rows_per_proc = int(full_data.shape[0] / d4p.num_procs())
            offset = rows_per_proc * d4p.my_procid()
            local_data = full_data[offset:offset + rows_per_proc, :]

            def compare_centroids(stage, init_method, batch_result,
                                  spmd_result):
                # Either warn (known mismatch) or assert closeness.
                if init_method in ['parallelPlusDense']:
                    print(mismatch_warning)
                    return
                reason = stage + " centroids with " + init_method
                reason += " does not match with batch algorithm"
                self.assertTrue(
                    np.allclose(batch_result.centroids,
                                spmd_result.centroids), reason)

            for init_method in ('plusPlusDense', 'parallelPlusDense',
                                'deterministicDense'):
                batch_init = d4p.kmeans_init(
                    nClusters=n_clusters, method=init_method).compute(
                        full_data)
                spmd_init = d4p.kmeans_init(
                    nClusters=n_clusters, method=init_method,
                    distributed=True).compute(local_data)
                compare_centroids("Initial", init_method, batch_init,
                                  spmd_init)

                batch_final = d4p.kmeans(
                    nClusters=n_clusters,
                    maxIterations=max_iterations).compute(
                        full_data, batch_init.centroids)
                spmd_final = d4p.kmeans(
                    nClusters=n_clusters,
                    maxIterations=max_iterations,
                    distributed=True).compute(local_data,
                                              spmd_init.centroids)
                compare_centroids("Final", init_method, batch_final,
                                  spmd_final)
Beispiel #3
0
def run_inference(num_observations:int = 1000):
    """Time daal4py k-means (initialization + clustering) on a test frame.

    The test frame is obtained from ``common.get_test_data_df`` with
    ``size=num_observations``. The init-plus-clustering sequence is timed
    ``NUM_LOOPS`` times, each timing is scaled per row, and the scaled
    timings are summarized with ``common.calculate_stats``.

    Parameters
    ----------
    num_observations : int
        Requested number of rows of test data.

    Returns
    -------
    The value returned by ``common.calculate_stats`` for the per-row timings.
    """
    # Load data
    test_df = common.get_test_data_df(X=common.X_dfc, size=num_observations)
    num_rows = len(test_df)

    print("_______________________________________")
    print("Total Number of Rows", num_rows)

    inference_times = []
    for _ in range(NUM_LOOPS):
        start_time = timer()
        init_alg = d4p.kmeans_init(nClusters=5, fptype="float",
                                   method="randomDense")
        centroids = init_alg.compute(test_df).centroids
        alg = d4p.kmeans(nClusters=5, maxIterations=100,
                         fptype="float", accuracyThreshold=0,
                         assignFlag=False)
        # Only the elapsed time matters here; the clustering result is unused.
        alg.compute(test_df, centroids)
        total_time = timer() - start_time

        # NOTE(review): 10e6 equals 1e7, not 1e6. If the intent is
        # microseconds per row (assuming timer() returns seconds), this
        # factor is 10x too large. Left unchanged so reported numbers stay
        # comparable - confirm the intended unit.
        inference_times.append(total_time*(10e6)/num_rows)

    return_elem = common.calculate_stats(inference_times)
    print(num_observations, ", ", return_elem)
    return return_elem
Beispiel #4
0
def kmeans(N, D, nClusters, maxit):
    """Cluster an ``N`` x ``D`` table of random samples with daal4py k-means.

    Initial centroids come from the 'plusPlusDense' initializer. Returns the
    tuple ``(centroids, assignments, objectiveFunction, goalFunction,
    nIterations)`` taken from the k-means result.
    """
    # Random input: not meaningful for clustering, but sufficient here.
    samples = np.random.ranf((N, D))

    seeder = daal4py.kmeans_init(nClusters, method='plusPlusDense')
    clusterer = daal4py.kmeans(nClusters, maxit)

    start_centroids = seeder.compute(samples).centroids
    outcome = clusterer.compute(samples, start_centroids)

    return (
        outcome.centroids,
        outcome.assignments,
        outcome.objectiveFunction,
        outcome.goalFunction,
        outcome.nIterations,
    )
def test_predict(X, X_init):
    """Run a zero-iteration k-means over ``X`` with centroids ``X_init``.

    Assignments are requested (``assignFlag=True``); the number of clusters
    comes from ``params.n_clusters`` and the floating-point type from
    ``getFPType(X)``. Returns the result of ``compute``.
    """
    settings = {
        'fptype': getFPType(X),
        'nClusters': params.n_clusters,
        'maxIterations': 0,
        'assignFlag': True,
        'accuracyThreshold': 0.0,
    }
    return kmeans(**settings).compute(X, X_init)
Beispiel #6
0
def main(readcsv=read_csv, method='defaultDense'):
    """Cluster the batch k-means sample data and return the k-means result.

    ``readcsv`` is called as ``readcsv(path, range(20))`` to load the table.
    ``method`` is accepted but not referenced in the body: the initial
    centroids are always produced with the "randomDense" initializer.
    """
    csv_path = "./data/batch/kmeans_dense.csv"
    n_clusters = 20
    max_iterations = 5

    seeder = d4p.kmeans_init(n_clusters, method="randomDense")

    # Load the table, then derive one starting centroid per cluster.
    table = readcsv(csv_path, range(20))
    seeds = seeder.compute(table)
    assert seeds.centroids.shape[0] == n_clusters

    # Main k-means run; assignments are requested as well.
    clusterer = d4p.kmeans(n_clusters, max_iterations, assignFlag=True)
    outcome = clusterer.compute(table, seeds.centroids)

    # The result exposes centroids, assignments (requested above) and the
    # iteration count; sanity-check their shapes and bounds.
    assert outcome.centroids.shape[0] == n_clusters
    assert outcome.assignments.shape == (table.shape[0], 1)
    assert outcome.nIterations <= max_iterations

    return outcome
Beispiel #7
0
def _daal4py_kmeans_compatibility(nClusters, maxIterations, fptype = "double",
    method = "lloydDense", accuracyThreshold = 0.0, resultsToEvaluate = "computeCentroids"):
    """Build a ``daal4py.kmeans`` object across DAAL versions.

    When ``daal_check_version((2020, 2), (2021, 107))`` is true,
    ``resultsToEvaluate`` is forwarded as-is. Otherwise it is translated into
    the ``assignFlag`` boolean, which is true when the string contains
    'computeAssignments'.

    ``accuracyThreshold`` is forwarded in both branches. It used to be
    accepted and silently dropped, so callers asking for a tolerance got the
    library default instead.

    Returns the configured ``daal4py.kmeans`` algorithm object.
    """
    # Arguments shared by both API generations.
    shared_kwargs = dict(
        nClusters = nClusters,
        maxIterations = maxIterations,
        fptype = fptype,
        accuracyThreshold = accuracyThreshold,
        method = method)

    if daal_check_version((2020, 2), (2021, 107)):
        return daal4py.kmeans(resultsToEvaluate = resultsToEvaluate,
                              **shared_kwargs)

    # Fallback API: only a boolean flag for assignments is passed.
    assigFlag = 'computeAssignments' in resultsToEvaluate
    return daal4py.kmeans(assignFlag = assigFlag, **shared_kwargs)
def _daal4py_k_means_dense(X, nClusters, numIterations, tol, cluster_centers_0, n_init, random_state):
    """Run daal4py k-means up to ``n_init`` times and keep the best run.

    Parameters
    ----------
    X : object exposing ``__array__``
        Input data; its floating-point type is taken from ``getFPType(X)``.
    nClusters : int
        Number of clusters.
    numIterations : int
        Maximum number of iterations; must be >= 0. With 0, a single run is
        done and only labels and the iteration count are filled in.
    tol : relative tolerance, converted with ``_tolerance(X, tol)``.
    cluster_centers_0, random_state :
        Forwarded to ``_daal4py_compute_starting_centroids``.
    n_init : int
        Number of initializations to try.

    Returns
    -------
    tuple
        ``(best_cluster_centers, best_labels, best_inertia, best_n_iter)``.
        ``best_cluster_centers`` and ``best_inertia`` remain ``None`` when
        ``numIterations == 0``.

    Raises
    ------
    ValueError
        If ``numIterations`` is negative.
    NotImplementedError
        If ``X`` has no ``__array__`` attribute.
    """

    if numIterations < 0:
        raise ValueError("Wrong iterations number")

    if hasattr(X, '__array__'):
        X_fptype = getFPType(X)
    else:
        raise NotImplementedError("""Unsupported input type {} encountered in DAAL-based optimization of KMeans.
        You can disable DAAL-based optimizations of scikit-learn with sklearn.daal4sklearn.dispatcher.disable()""".format(type(X)))

    abs_tol = _tolerance(X, tol) # tol is relative tolerance

    # Best-run bookkeeping; the values below mean "nothing recorded yet".
    best_labels, best_inertia, best_cluster_centers = None, None, None
    best_n_iter = -1

    # A zero-iteration request only assigns labels, so one pass is enough.
    if numIterations == 0:
        n_init = 1

    # The algorithm object is configured once and reused for every init.
    kmeans_algo = daal4py.kmeans(
        nClusters = nClusters,
        maxIterations = numIterations,
        assignFlag = True,
        accuracyThreshold = abs_tol,
        fptype = X_fptype,
        # gamma = 1.0, # only relevant for categorical features of which we should have none
        method = 'defaultDense') #,
        # distanceType = 'euclidean')

    for k in range(n_init):
        # `deterministic` is reported by the helper; it is used below to
        # stop after the first run.
        deterministic, starting_centroids_ = _daal4py_compute_starting_centroids(
            X, X_fptype, nClusters, cluster_centers_0, random_state)

        res = kmeans_algo.compute(X, starting_centroids_)

        # Per documentation, with numIterations == 0, centroids and goalFunction are not updated
        if numIterations == 0:
            best_labels = res.assignments[:,0]
            best_n_iter = int(res.nIterations[0,0])
            break
        else:
            inertia = res.goalFunction[0,0]
            # Keep this run if it is the first one or has a lower goal value.
            if best_inertia is None or inertia < best_inertia:
                best_labels = res.assignments.ravel()
                best_cluster_centers = res.centroids
                if n_init > 1:
                    # Copy so the kept arrays are independent of `res`.
                    best_labels = best_labels.copy()
                    best_cluster_centers = best_cluster_centers.copy()
                best_inertia = inertia
                best_n_iter = int(res.nIterations[0,0])
        # Deterministic starting centroids: warn and stop after one run.
        if deterministic and n_init != 1:
            warnings.warn(
                'Explicit initial center position passed: '
                'performing only one init in k-means instead of n_init=%d'
                % n_init, RuntimeWarning, stacklevel=2)
            break

    return best_cluster_centers, best_labels, best_inertia, best_n_iter
Beispiel #9
0
def compute(data, nClusters, maxIter, method):
    """Initialize centroids with ``method`` and run single-precision k-means.

    Both the initializer and the main algorithm use ``fptype='float'``;
    cluster assignments are requested. Returns the k-means result object.
    """
    # Starting centroids from the chosen initialization method.
    seeds = d4p.kmeans_init(nClusters, method=method,
                            fptype='float').compute(data)

    # Main clustering pass, assignments included.
    clusterer = d4p.kmeans(nClusters, maxIter, assignFlag=True,
                           fptype='float')
    return clusterer.compute(data, seeds.centroids)
def _daal4py_kmeans_compatibility(nClusters,
                                  maxIterations,
                                  fptype="double",
                                  method="lloydDense",
                                  accuracyThreshold=0.0,
                                  resultsToEvaluate="computeCentroids"):
    """Return a ``daal4py.kmeans`` object configured from the arguments.

    Every parameter is forwarded unchanged as the keyword argument of the
    same name.
    """
    options = {
        "nClusters": nClusters,
        "maxIterations": maxIterations,
        "fptype": fptype,
        "resultsToEvaluate": resultsToEvaluate,
        "accuracyThreshold": accuracyThreshold,
        "method": method,
    }
    return daal4py.kmeans(**options)
Beispiel #11
0
    def kMeans(self, Data_Path, n):
        '''
        daal4py KMeans Clustering SPMD Mode

        Each process reads its own CSV file, named
        ``Data_Path + str(process_id + 1) + ".csv"``, runs distributed
        k-means (4 clusters, at most 25 iterations) and stores the elapsed
        clustering time in ``self.latency["Parallel_KMeans_SPMD_Time"]``.

        Parameters
        ----------
        Data_Path: str,
            Path prefix of the per-process CSV files.

        n: number of threads passed to ``d4p.daalinit``.

        The method returns None.
        '''

        nClusters = 4

        maxIter = 25  # fixed maximum number of iterations

        # Initialize SPMD mode
        d4p.daalinit(nthreads=n)

        # training setup
        file_path = Data_Path + str(d4p.my_procid() + 1) + ".csv"
        data = pd.read_csv(file_path)
        init_algo = d4p.kmeans_init(nClusters=nClusters,
                                    distributed=True,
                                    method="plusPlusDense")

        self.logger.info('Training the KMeans in pydaal SPMD Mode')

        # Compute the initial centroids exactly once. The initialization
        # used to be executed twice, with the first result never used.
        init_result = init_algo.compute(data)

        # configure kmeans main object
        algo = d4p.kmeans(nClusters, maxIter, distributed=True)
        kmeans_start_time = time.time()
        # compute the clusters/centroids
        result = algo.compute(data, init_result.centroids)
        self.latency["Parallel_KMeans_SPMD_Time"] = time.time() - \
            kmeans_start_time

        # result is available on all processes - but we print only on root
        if d4p.my_procid() == 0:
            print("KMeans completed", result)

        self.logger.info('Completed KMeans in pydaal SPMD Mode')

        d4p.daalfini()
Beispiel #12
0
    def kMeans(self, data, target):
        '''
        Run daal4py k-means in batch (serial) mode and record its run time.

        The ``target`` column is dropped from ``data`` before clustering with
        4 clusters and at most 25 iterations. The elapsed time of the main
        compute call is stored in
        ``self.latency["Serial_KMeans_Batch_Time"]``. Returns None.
        '''
        n_clusters = 4
        max_iterations = 25  # fixed upper bound on iterations
        features = data.drop(target, axis=1)

        seeder = d4p.kmeans_init(nClusters=n_clusters,
                                 method="plusPlusDense")
        self.logger.info('Training the KMeans in pydaal Batch/Serial Mode')

        # Initial centroids: one row per cluster.
        seeds = seeder.compute(features)
        assert seeds.centroids.shape[0] == n_clusters

        clusterer = d4p.kmeans(n_clusters, max_iterations)

        # Only the main clustering call is timed.
        started_at = time.time()
        result = clusterer.compute(features, seeds.centroids)
        self.latency["Serial_KMeans_Batch_Time"] = time.time() - started_at

        # Sanity checks on the shape of the result.
        assert result.centroids.shape[0] == n_clusters
        assert result.assignments.shape == (features.shape[0], 1)
        assert result.nIterations <= max_iterations

        self.logger.info('Completed KMeans in pydaal Batch/Serial Mode')
    def kmeans_lightcones(self, past_params, future_params,
                            past_decay=0, future_decay=0,
                            past_init_params=None, future_init_params=None):
        '''
        Performs clustering on the global arrays of both past and future lightcones.

        The lightcone arrays are scaled in place by the square root of the
        decay weights, clustered with daal4py k-means, and then deleted.
        Cluster labels are stored in ``self.pasts`` and ``self.futures``;
        the future centroids are kept in ``self._future_centroids``.

        Parameters
        ----------
        past_params: dict,
            Dictionary of keyword arguments for past lightcone clustering algorithm.

            If past_cluster == 'kmeans':
                past_params must include values for 'nClusters' and 'maxIterations'

        future_params: dict,
            Dictionary of keyword arguments for future lightcone clustering algorithm.

            If future_cluster == 'kmeans':
                future_params must include values for 'nClusters' and 'maxIterations'

        past_decay: int, optional (default=0)
            Exponential decay rate for lightcone distance used for past lightcone clustering.

        future_decay: int, optional (default=0)
            Exponential decay rate for lightcone distance used for future lightcone clustering.

        past_init_params: dict, optional (default=None)
            Keyword arguments for ``d4p.kmeans_init`` on the past lightcones.
            When None, 'plusPlusDense' is used with 'nClusters' taken from
            past_params and 'distributed' taken from ``self._distributed``.

        future_init_params: dict, optional (default=None)
            Same as past_init_params, for the future lightcones.

        Raises
        ------
        RuntimeError
            If ``self.plcs`` is None (no lightcones extracted yet).
        ValueError
            If ``self._adjusted_shape`` does not have length 2 or 3.
        '''
        if self.plcs is None:
            raise RuntimeError("Must call .extract() on a training field(s) before calling .cluster_lightcones().")

        n_dims = len(self._adjusted_shape)
        if n_dims == 2:
            past_decays = lightcone_decay(self.past_depth, self.c, past_decay, False)
            future_decays = lightcone_decay(self.future_depth, self.c, future_decay, True)
        elif n_dims == 3:
            past_decays = lightcone_decay_2D(self.past_depth, self.c, past_decay, False)
            future_decays = lightcone_decay_2D(self.future_depth, self.c, future_decay, True)
        else:
            # Without this branch the decay arrays would be unbound and the
            # scaling below would fail with an opaque UnboundLocalError.
            raise ValueError(
                "Unsupported field shape of length {}; expected 2 or 3.".format(n_dims))

        self.plcs *= np.sqrt(past_decays)
        self.flcs *= np.sqrt(future_decays)

        # Primarily used for global joint dist in distributed mode
        self._N_pasts = past_params['nClusters']
        self._N_futures = future_params['nClusters']

        if past_init_params is None:
            # Default initializer; other method names previously tried here:
            # 'randomDense', 'parallelPlusDense', 'defaultDense'.
            method = 'plusPlusDense'
            past_init_params = {'nClusters':self._N_pasts,
                                   'method': method,
                                   'distributed': self._distributed}
        initial = d4p.kmeans_init(**past_init_params)
        centroids = initial.compute(self.plcs).centroids
        past_cluster = d4p.kmeans(distributed=self._distributed, **past_params).compute(self.plcs, centroids)
        # Second pass with zero iterations: only assigns labels using the
        # centroids found above.
        past_local = d4p.kmeans(nClusters=self._N_pasts, distributed=self._distributed, assignFlag=True, maxIterations=0).compute(self.plcs, past_cluster.centroids)
        self.pasts = past_local.assignments.flatten()

        # Drop the past data before the future lightcones are clustered.
        del past_cluster
        del self.plcs

        if future_init_params is None:
            method = 'plusPlusDense'
            future_init_params = {'nClusters':self._N_futures,
                                   'method': method,
                                   'distributed': self._distributed}
        initial = d4p.kmeans_init(**future_init_params)
        centroids = initial.compute(self.flcs).centroids
        future_cluster = d4p.kmeans(distributed=self._distributed, **future_params).compute(self.flcs, centroids)
        self._future_centroids = future_cluster.centroids # save for field reconstruction
        future_local = d4p.kmeans(nClusters=self._N_futures, distributed=self._distributed, assignFlag=True, maxIterations=0).compute(self.flcs, self._future_centroids)
        self.futures = future_local.assignments.flatten()

        del future_cluster
        del self.flcs
Beispiel #14
0
    init_algo = d4p.kmeans_init(nClusters,
                                method="plusPlusDense",
                                distributed=True)
    # Load the data
    data = loadtxt(infile, delimiter=',')
    # now slice the data, it would have been better to read only what we need, of course...
    rpp = int(data.shape[0] / d4p.num_procs())
    data = data[rpp * d4p.my_procid():rpp * d4p.my_procid() + rpp, :]

    # compute initial centroids
    init_result = init_algo.compute(data)
    # The results provides the initial centroids
    assert init_result.centroids.shape[0] == nClusters

    # configure kmeans main object
    algo = d4p.kmeans(nClusters, maxIter, distributed=True)
    # compute the clusters/centroids
    result = algo.compute(data, init_result.centroids)

    # Note: we could have done this in just one line:
    # d4p.kmeans(nClusters, maxIter, assignFlag=True, distributed=True).compute(data, d4p.kmeans_init(nClusters, method="plusPlusDense", distributed=True).compute(data).centroids)

    # Kmeans result objects provide centroids, goalFunction, nIterations and objectiveFunction
    assert result.centroids.shape[0] == nClusters
    assert result.nIterations <= maxIter
    # we need an extra call to kmeans to get the assignments (not directly supported through parameter assignFlag yet in SPMD mode)
    algo = d4p.kmeans(
        nClusters, 0, assignFlag=True
    )  # maxIt=0; not distributed, we compute on local data only!
    assignments = algo.compute(data, result.centroids).assignments
Beispiel #15
0
def test_fit(X, X_init):
    """Fit k-means on ``X`` starting from the centroids ``X_init``.

    The cluster count and iteration limit come from ``params``; no tolerance
    is passed (FIXME in the original: tolerance?). Returns the result of
    ``compute``.
    """
    settings = {
        'nClusters': params.n_clusters,
        'maxIterations': params.maxiter,
    }
    return kmeans(**settings).compute(X, X_init)
Beispiel #16
0
    def kmeans_lightcones(self,
                          past_params,
                          future_params,
                          past_decay=0,
                          future_decay=0,
                          past_init_params=None,
                          future_init_params=None):
        '''
        Cluster the past and the future lightcone arrays with daal4py k-means.

        Both arrays are first scaled in place by the square root of their
        decay weights. Each array is then clustered in distributed mode,
        labelled with a local zero-iteration pass, and deleted. The labels
        end up in ``self.pasts`` and ``self.futures``.

        Parameters
        ----------
        past_params: dict,
            Keyword arguments for the past lightcone k-means; must include
            'nClusters' and 'maxIterations'.

        future_params: dict,
            Keyword arguments for the future lightcone k-means; must include
            'nClusters' and 'maxIterations'.

        past_decay: int, optional (default=0)
            Exponential decay rate for lightcone distance used for past lightcone clustering.

        future_decay: int, optional (default=0)
            Exponential decay rate for lightcone distance used for future lightcone clustering.

        past_init_params, future_init_params: dict, optional (default=None)
            Keyword arguments for ``d4p.kmeans_init``. When None, the
            'defaultDense' method is used in distributed mode with the
            matching cluster count.
        '''
        # OPT: comment out for performance runs
        if self.plcs is None:
            raise RuntimeError(
                "Must call .extract() on a training field(s) before calling .cluster_lightcones()."
            )

        def default_init_params(n_clusters):
            # Initializer settings used when the caller supplies none.
            return {
                'nClusters': n_clusters,
                'method': 'defaultDense',
                'distributed': True
            }

        def cluster_labels(lightcones, n_clusters, cluster_params,
                           init_params):
            # Distributed fit, then a local zero-iteration pass for labels.
            seeds = d4p.kmeans_init(**init_params).compute(
                lightcones).centroids
            fitted = d4p.kmeans(distributed=True,
                                **cluster_params).compute(lightcones, seeds)
            labelled = d4p.kmeans(nClusters=n_clusters,
                                  distributed=False,
                                  assignFlag=True,
                                  maxIterations=0).compute(
                                      lightcones, fitted.centroids)
            return labelled.assignments.flatten()

        past_weights = lightcone_decay_2D(self.past_depth, self.c,
                                          past_decay, False)
        self.plcs *= np.sqrt(past_weights)

        future_weights = lightcone_decay_2D(self.future_depth, self.c,
                                            future_decay, True)
        self.flcs *= np.sqrt(future_weights)

        # Need these for dbscan version (after clustering)
        self._N_pasts = past_params['nClusters']
        self._N_futures = future_params['nClusters']

        # Past lightcones: cluster, keep the labels, drop the data.
        if past_init_params is None:
            past_init_params = default_init_params(self._N_pasts)
        self.pasts = cluster_labels(self.plcs, self._N_pasts, past_params,
                                    past_init_params)
        del self.plcs

        # Future lightcones: same procedure.
        if future_init_params is None:
            future_init_params = default_init_params(self._N_futures)
        self.futures = cluster_labels(self.flcs, self._N_futures,
                                      future_params, future_init_params)
        del self.flcs
Beispiel #17
0
def test_predict(X, X_init):
    """Run a zero-iteration k-means over ``X`` with centroids ``X_init``.

    The cluster count comes from ``params.n_clusters``; no tolerance is
    passed (FIXME in the original). Returns the result of ``compute``.
    """
    settings = {
        'nClusters': params.n_clusters,
        'maxIterations': 0,
    }
    return kmeans(**settings).compute(X, X_init)
Beispiel #18
0
    init_algo = d4p.kmeans_init(nClusters,
                                method="plusPlusDense",
                                distributed=True)
    # Load the data
    data = loadtxt(infile, delimiter=',')
    # now slice the data, it would have been better to read only what we need, of course...
    rpp = int(data.shape[0] / d4p.num_procs())
    data = data[rpp * d4p.my_procid():rpp * d4p.my_procid() + rpp, :]

    # compute initial centroids
    init_result = init_algo.compute(data)
    # The results provides the initial centroids
    assert init_result.centroids.shape[0] == nClusters

    # configure kmeans main object
    algo = d4p.kmeans(nClusters, maxIter, distributed=True)
    # compute the clusters/centroids
    result = algo.compute(data, init_result.centroids)

    # Note: we could have done this in just one line:
    # d4p.kmeans(nClusters, maxIter, assignFlag=True, distributed=True).compute(data, d4p.kmeans_init(nClusters, method="plusPlusDense", distributed=True).compute(data).centroids)

    # Kmeans result objects provide assignments (if requested), centroids, goalFunction, nIterations and objectiveFunction
    assert result.centroids.shape[0] == nClusters
    print(result.nIterations, result.centroids[0], maxIter)
    # we'd need an extra call to kmeans.compute(10, 0) to get the assignments; getting assignments is not yet supported in dist mode
    assert result.assignments == None
    assert result.nIterations <= maxIter

    print('All looks good!')
    d4p.daalfini()
Beispiel #19
0
# Script fragment: distributed k-means over an image array.
# `image_array`, `init_algo`, `n_colors` and `max_iter` are used here but
# defined outside this excerpt.

# Number of rows handled by each process. Rows beyond
# n_slices * num_procs are not covered by any process's slice.
n_slices = int(image_array.shape[0] / d4p.num_procs())

print("Number of MPI tasks: ", d4p.num_procs())

# From here on `image_array` is only this process's row slice.
image_array = image_array[n_slices *
                          d4p.my_procid():n_slices * d4p.my_procid() +
                          n_slices, :]

print("Fitting model on the data")
t0 = time()

# compute initial centroids
init_result = init_algo.compute(image_array)
assert init_result.centroids.shape[0] == n_colors
# configure kmeans main object
algo = d4p.kmeans(n_colors, max_iter, distributed=True)
# compute the clusters/centroids
result = algo.compute(image_array, init_result.centroids)
# Kmeans result objects provide centroids, goalFunction, nIterations and objectiveFunction
assert result.centroids.shape[0] == n_colors
assert result.nIterations <= max_iter

print("Computation finished in in %0.3fs." % (time() - t0))
# Get labels for the rows held by this process: a non-distributed k-means
# with zero iterations only assigns points to the centroids found above.
print("Predicting color indices on the full image (k-means)")

t0 = time()
algo = d4p.kmeans(n_colors, 0, assignFlag=True)
prediction = algo.compute(image_array, result.centroids)
labels = prediction.assignments
Beispiel #20
0
def predict(X):
    """Run a zero-iteration k-means over ``X`` with the centroids ``X_init``.

    The cluster count is the number of rows of ``X_init``. The result is
    discarded and the function returns None.
    """
    n_clusters = X_init.shape[0]
    kmeans(n_clusters, 0).compute(X, X_init)  # FIXME tolerance
Beispiel #21
0
# Now let's **load up the centroids** and look at them.

# In[6]:

# loading the initial centroids from a file
# The file is opened in a `with` block so the handle is closed right after
# loading; it used to be opened inline and never closed explicitly.
# NOTE: pickle can execute arbitrary code while loading, so only read
# centroid files that come from a trusted source.
with open(centroids_filename, "rb") as centroids_file:
    loaded_centroids = pickle.load(centroids_file)
print("Here is our centroids loaded from file:\n\n", loaded_centroids)

# # Assign The Data to Clusters and Save The Results

# Let's **assign the data** to clusters.

# In[7]:

# compute the clusters/centroids
# NOTE(review): the clustering starts from `init_result.centroids`, not from
# the `loaded_centroids` read above - confirm which one is intended.
kmeans_result = d4p.kmeans(nClusters=3, maxIterations=5,
                           assignFlag=True).compute(X, init_result.centroids)

# To **get Kmeans result objects** (assignments, centroids, goalFunction [deprecated], nIterations, and objectiveFunction):

# In[8]:

# retrieving and printing cluster assignments
assignments = kmeans_result.assignments
print("Here is our cluster assignments for first 5 datapoints: \n\n",
      assignments[:5])

# Now let's **export the cluster assignments** to a **CSV file**. We will also **stop the distribution engine.**

# In[9]:

# now export the results to a CSV file
Beispiel #22
0
def train(X):
    """Fit k-means (10 clusters, at most 100 iterations) on ``X``.

    The starting centroids are ``X_init``. The result is discarded and the
    function returns None.
    """
    n_clusters, max_iterations = 10, 100
    kmeans(n_clusters, max_iterations).compute(X, X_init)  # FIXME tolerance?