Code Example #1
File: kmeans_batch.py  Project: FelipeNSantos/daal4py
def main(readcsv=read_csv, method='defaultDense'):
    infile = "./data/batch/kmeans_dense.csv"
    nClusters = 20
    maxIter = 5

    initrain_algo = d4p.kmeans_init(nClusters, method="randomDense")
    # Load the data
    data = readcsv(infile, range(20))
    # compute initial centroids
    initrain_result = initrain_algo.compute(data)
    # The result provides the initial centroids
    assert initrain_result.centroids.shape[0] == nClusters

    # configure kmeans main object: we also request the cluster assignments
    algo = d4p.kmeans(nClusters, maxIter, assignFlag=True)
    # compute the clusters/centroids
    result = algo.compute(data, initrain_result.centroids)

    # Note: we could have done this in just one line:
    # d4p.kmeans(nClusters, maxIter, assignFlag=True).compute(data, d4p.kmeans_init(nClusters, method="randomDense").compute(data).centroids)

    # Kmeans result objects provide assignments (if requested), centroids, goalFunction, nIterations and objectiveFunction
    assert result.centroids.shape[0] == nClusters
    assert result.assignments.shape == (data.shape[0], 1)
    assert result.nIterations <= maxIter

    return result
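
The readcsv helper used as the default argument is not part of this excerpt. A minimal sketch of a compatible reader and driver, assuming the usual layout of the daal4py examples (import daal4py as d4p is already in scope) and a pandas-based reader that takes a file path and a column range:

import numpy as np
import pandas as pd

def read_csv(f, c, t=np.float64):
    # read only the requested columns, comma-separated, no header row
    return pd.read_csv(f, usecols=c, delimiter=',', header=None, dtype=t)

if __name__ == "__main__":
    result = main()
    print("First three centroids:\n", result.centroids[0:3])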
Code Example #2
File: d4p_kmeans.py  Project: rowhit/sdc
def kmeans(N, D, nClusters, maxit):
    a = np.random.ranf((N, D))  # doesn't make much sense, but ok for now
    kmi = daal4py.kmeans_init(nClusters, method='plusPlusDense')
    km = daal4py.kmeans(nClusters, maxit)
    kmr = km.compute(a, kmi.compute(a).centroids)
    return (kmr.centroids, kmr.assignments, kmr.objectiveFunction,
            kmr.goalFunction, kmr.nIterations)
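
A hedged usage sketch for the function above (the input is random, so the clusters carry no meaning). Note that in most daal4py versions kmr.assignments stays None unless the kmeans object is constructed with assignFlag=True:

import numpy as np   # required by kmeans() above
import daal4py       # required by kmeans() above

centroids, assignments, objective, goal, n_iter = kmeans(
    N=10000, D=20, nClusters=5, maxit=50)
print("centroids shape:", centroids.shape, "| iterations used:", n_iter)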
Code Example #3
def run_inference(num_observations: int = 1000):
    """Run daal4py K-Means inference for the specified number of observations"""
    # Load data
    test_df = common.get_test_data_df(X=common.X_dfc, size=num_observations)
    num_rows = len(test_df)
    ######################
    print("_______________________________________")
    print("Total Number of Rows", num_rows)
    run_times = []
    inference_times = []
    for _ in range(NUM_LOOPS):
        
        start_time = timer()
        init_alg = d4p.kmeans_init(nClusters=5, fptype="float",
                                   method="randomDense")
        centroids = init_alg.compute(test_df).centroids
        alg = d4p.kmeans(nClusters=5, maxIterations=100,
                         fptype="float", accuracyThreshold=0,
                         assignFlag=False)
        result = alg.compute(test_df, centroids)
        end_time = timer()

        total_time = end_time - start_time
        run_times.append(total_time * 1e3)  # seconds -> milliseconds

        inference_time = total_time * 1e6 / num_rows  # microseconds per row
        inference_times.append(inference_time)

    return_elem = common.calculate_stats(inference_times)
    print(num_observations, ", ", return_elem)
    return return_elem
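
This benchmark leans on module-level names that are not shown: NUM_LOOPS, timer, and the common helper module. A hedged sketch of plausible stand-ins (the names and exact behavior are assumptions, not the original project code):

from timeit import default_timer as timer

import numpy as np

NUM_LOOPS = 100  # number of timed repetitions (assumed)

def calculate_stats(times):
    # summary statistics over the per-row inference times (assumed helper shape)
    return {
        "mean": float(np.mean(times)),
        "median": float(np.median(times)),
        "min": float(np.min(times)),
        "max": float(np.max(times)),
    }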
Code Example #4
File: _k_means_0_23.py  Project: asrednit/daal4py
def _daal4py_compute_starting_centroids(X, X_fptype, nClusters, cluster_centers_0, verbose, random_state):

    def is_string(s, target_str):
        return isinstance(s, str) and s == target_str
    is_sparse = sp.isspmatrix(X)
    
    deterministic = False
    if is_string(cluster_centers_0, 'k-means++'):
        _seed = random_state.randint(np.iinfo('i').max)
        plus_plus_method = "plusPlusCSR" if is_sparse else "plusPlusDense"
        daal_engine = daal4py.engines_mt19937(fptype=X_fptype, method="defaultDense", seed=_seed)
        _n_local_trials = 2 + int(np.log(nClusters))
        kmeans_init = daal4py.kmeans_init(nClusters, fptype=X_fptype,
                                          nTrials=_n_local_trials, method=plus_plus_method, engine=daal_engine)
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    elif is_string(cluster_centers_0, 'random'):
        _seed = random_state.randint(np.iinfo('i').max)
        random_method = "randomCSR" if is_sparse else "randomDense"
        daal_engine = daal4py.engines_mt19937(seed=_seed, fptype=X_fptype, method="defaultDense")
        kmeans_init = daal4py.kmeans_init(nClusters, fptype=X_fptype, method=random_method, engine=daal_engine)
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    elif hasattr(cluster_centers_0, '__array__'):
        deterministic = True
        cc_arr = np.ascontiguousarray(cluster_centers_0, dtype=X.dtype)
        _validate_center_shape(X, nClusters, cc_arr)
        centroids_ = cc_arr
    elif callable(cluster_centers_0):
        cc_arr = cluster_centers_0(X, nClusters, random_state)
        cc_arr = np.ascontiguousarray(cc_arr, dtype=X.dtype)
        _validate_center_shape(X, nClusters, cc_arr)
        centroids_ = cc_arr
    elif is_string(cluster_centers_0, 'deterministic'):
        deterministic = True
        default_method = "lloydCSR" if is_sparse else "defaultDense"
        kmeans_init = daal4py.kmeans_init(nClusters, fptype=X_fptype, method=default_method)
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    else:
        raise ValueError(
                f"init should be either 'k-means++', 'random', 'deterministic', "
                f"an ndarray or a callable, got '{cluster_centers_0}' instead.")
    if verbose:
        print("Initialization complete")
    return deterministic, centroids_
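
A hedged sketch of how this helper might be called, assuming the surrounding module's imports (daal4py, numpy as np, scipy.sparse as sp) are in scope; the 'random' branch is chosen here so the _validate_center_shape helper is not exercised:

import numpy as np

X = np.random.rand(1000, 8)
rs = np.random.RandomState(42)
deterministic, centroids = _daal4py_compute_starting_centroids(
    X, "double", nClusters=3, cluster_centers_0="random",
    verbose=True, random_state=rs)
print(deterministic, centroids.shape)  # -> False (3, 8)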
Code Example #5
File: kmeans_batch.py  Project: vlad-nazarov/daal4py
def compute(data, nClusters, maxIter, method):
    # configure kmeans init object
    initrain_algo = d4p.kmeans_init(nClusters, method=method, fptype='float')
    # compute initial centroids
    initrain_result = initrain_algo.compute(data)

    # configure kmeans main object: we also request the cluster assignments
    algo = d4p.kmeans(nClusters, maxIter, assignFlag=True, fptype='float')
    # compute the clusters/centroids
    return algo.compute(data, initrain_result.centroids)
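
A hedged usage sketch for this helper, reusing the reader pattern from Code Example #1 (float32 data to match the fptype='float' setting):

import numpy as np
import pandas as pd
import daal4py as d4p

data = pd.read_csv("./data/batch/kmeans_dense.csv", header=None,
                   dtype=np.float32)
result = compute(data, nClusters=20, maxIter=5, method="randomDense")
print(result.centroids.shape)  # (20, n_features)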
Code Example #6
File: spmd_test_examples.py  Project: rlnx/daal4py
        def test_kmeans_spmd(self):
            nClusters = 10
            maxIter = 25

            data = np.loadtxt("./data/distributed/kmeans_dense.csv",
                              delimiter=',')

            rpp = int(data.shape[0] / d4p.num_procs())
            spmd_data = data[rpp * d4p.my_procid():rpp * d4p.my_procid() +
                             rpp, :]

            for init_method in [
                    'plusPlusDense', 'parallelPlusDense', 'deterministicDense'
            ]:
                batch_init_res = d4p.kmeans_init(
                    nClusters=nClusters, method=init_method).compute(data)
                spmd_init_res = d4p.kmeans_init(
                    nClusters=nClusters, method=init_method,
                    distributed=True).compute(spmd_data)

                if init_method in ['parallelPlusDense']:
                    print(
                        "Warning: results of parallelPlusDense init are known not to match the batch algorithm"
                    )
                else:
                    self.assertTrue(
                        np.allclose(batch_init_res.centroids,
                                    spmd_init_res.centroids),
                        "Initial centroids with " + init_method +
                        " do not match the batch algorithm")

                batch_res = d4p.kmeans(nClusters=nClusters,
                                       maxIterations=maxIter).compute(
                                           data, batch_init_res.centroids)
                spmd_res = d4p.kmeans(nClusters=nClusters,
                                      maxIterations=maxIter,
                                      distributed=True).compute(
                                          spmd_data, spmd_init_res.centroids)

                self.assertTrue(
                    np.allclose(batch_res.centroids,
                                spmd_res.centroids),
                    "Final centroids with " + init_method +
                    " do not match the batch algorithm")
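
Tests like this only make sense under MPI: daal4py's distributed mode expects one process per data chunk, launched with mpirun and bracketed by daalinit/daalfini. A hedged sketch of a launcher for a module containing this test case:

# run with, e.g.:  mpirun -n 4 python spmd_test_examples.py
import unittest

import daal4py as d4p

if __name__ == "__main__":
    d4p.daalinit()             # initialize the SPMD/MPI backend
    unittest.main(exit=False)  # run the TestCase methods defined in this module
    d4p.daalfini()             # finalize before the process exits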
Code Example #7
def _daal4py_compute_starting_centroids(X, X_fptype, nClusters, cluster_centers_0, random_state):

    def is_string(s, target_str):
        # string_types comes from six (Python 2/3 string compatibility)
        return isinstance(s, string_types) and s == target_str

    deterministic = False
    if is_string(cluster_centers_0, 'k-means++'):
        _seed = random_state.randint(np.iinfo('i').max)
        daal_engine = daal4py.engines_mt19937(fptype=X_fptype, method='defaultDense', seed=_seed)
        _n_local_trials = 2 + int(np.log(nClusters))
        kmeans_init = daal4py.kmeans_init(nClusters, fptype=X_fptype,
                                          nTrials=_n_local_trials, method='plusPlusDense', engine=daal_engine)
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    elif is_string(cluster_centers_0, 'random'):
        _seed = random_state.randint(np.iinfo('i').max)
        daal_engine = daal4py.engines_mt19937(seed=_seed, fptype=X_fptype, method='defaultDense')
        kmeans_init = daal4py.kmeans_init(nClusters, fptype=X_fptype, method='randomDense', engine=daal_engine)
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    elif hasattr(cluster_centers_0, '__array__'):
        deterministic = True
        cc_arr = np.ascontiguousarray(cluster_centers_0, dtype=X.dtype)
        _validate_center_shape(X, nClusters, cc_arr)
        centroids_ = cc_arr
    elif callable(cluster_centers_0):
        cc_arr = cluster_centers_0(X, nClusters, random_state)
        cc_arr = np.ascontiguousarray(cc_arr, dtype=X.dtype)
        _validate_center_shape(X, nClusters, cc_arr)
        centroids_ = cc_arr
    elif is_string(cluster_centers_0, 'deterministic'):
        deterministic = True
        kmeans_init = daal4py.kmeans_init(nClusters, fptype=X_fptype, method='defaultDense')
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    else:
        raise ValueError("Cluster centers should either be 'k-means++', 'random', 'deterministic' or an array")
    return deterministic, centroids_
Code Example #8
def main(method='plusPlusDense'):
    infile = "./data/distributed/kmeans_dense.csv"
    nClusters = 10
    maxIter = 25

    # configure a kmeans-init
    init_algo = d4p.kmeans_init(nClusters, method=method, distributed=True)
    # Load the data
    data = loadtxt(infile, delimiter=',')
    # now slice the data,
    # it would have been better to read only what we need, of course...
    rpp = int(data.shape[0] / d4p.num_procs())
    data = data[rpp * d4p.my_procid():rpp * d4p.my_procid() + rpp, :]

    # compute initial centroids
    init_result = init_algo.compute(data)
    # The result provides the initial centroids
    assert init_result.centroids.shape[0] == nClusters

    # configure kmeans main object
    algo = d4p.kmeans(nClusters, maxIter, distributed=True)
    # compute the clusters/centroids
    result = algo.compute(data, init_result.centroids)

    # Note: we could have done this in just one line:
    # d4p.kmeans(nClusters, maxIter, assignFlag=True, distributed=True).compute(
    #     data,
    #     d4p.kmeans_init(
    #         nClusters,
    #         method="plusPlusDense",
    #         distributed=True
    #     ).compute(data).centroids
    # )

    # Kmeans result objects provide centroids, goalFunction,
    # nIterations and objectiveFunction
    assert result.centroids.shape[0] == nClusters
    assert result.nIterations <= maxIter
    # we need an extra call to kmeans to get the assignments
    # (not directly supported through parameter assignFlag yet in SPMD mode)
    algo = d4p.kmeans(nClusters, 0, assignFlag=True)
    # maxIt=0; not distributed, we compute on local data only!
    assignments = algo.compute(data, result.centroids).assignments

    return (assignments, result)
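
Since this main() uses distributed algorithms and d4p.my_procid(), it must run inside an initialized SPMD session. A minimal driver in the style of Code Example #14:

import daal4py as d4p

if __name__ == "__main__":
    d4p.daalinit()
    (assignments, result) = main()
    # the result is available on all processes; print only on the root rank
    if d4p.my_procid() == 0:
        print("First three centroids:\n", result.centroids[0:3])
    d4p.daalfini()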
Code Example #9
    def kMeans(self, Data_Path, n):
        '''
        daal4py KMeans Clustering SPMD Mode
        '''

        nClusters = 4

        maxIter = 25  # fixed maximum number of iterations

        # Initialize SPMD mode
        d4p.daalinit(nthreads=n)

        # training setup
        file_path = Data_Path + str(d4p.my_procid() + 1) + ".csv"
        data = pd.read_csv(file_path)
        init_algo = d4p.kmeans_init(nClusters=nClusters,
                                    distributed=True,
                                    method="plusPlusDense")

        self.logger.info('Training the KMeans in pydaal SPMD Mode')

        # compute initial centroids
        init_result = init_algo.compute(data)

        # configure kmeans main object
        algo = d4p.kmeans(nClusters, maxIter, distributed=True)
        kmeans_start_time = time.time()
        # compute the clusters/centroids
        result = algo.compute(data, init_result.centroids)
        self.latency["Parallel_KMeans_SPMD_Time"] = time.time() - \
            kmeans_start_time

        # result is available on all processes - but we print only on root
        if d4p.my_procid() == 0:
            print("KMeans completed", result)

        self.logger.info('Completed KMeans in pydaal SPMD Mode')

        d4p.daalfini()

        return
Code Example #10
    def kMeans(self, data, target):

        '''
        Method for serial running of Kmeans
        '''
        
        nClusters = 4
        maxIter = 25  # fixed maximum number of iterations
        data = data.drop(target, axis=1)


        init_algo = d4p.kmeans_init(nClusters=nClusters, method="plusPlusDense")
        self.logger.info('Training the KMeans in pydaal Batch/Serial Mode')

        train_result = init_algo.compute(data)

        # The result provides the initial centroids
        assert train_result.centroids.shape[0] == nClusters

        # configure kmeans main object: we also request the cluster assignments
        algo = d4p.kmeans(nClusters, maxIter, assignFlag=True)
        # compute the clusters/centroids

        kmeans_start_time = time.time()

        result = algo.compute(data, train_result.centroids)

        self.latency["Serial_KMeans_Batch_Time"] = time.time() - kmeans_start_time


        # Kmeans result objects provide assignments (if requested), centroids, goalFunction, nIterations and objectiveFunction
        assert result.centroids.shape[0] == nClusters
        assert result.assignments.shape == (data.shape[0], 1)
        assert result.nIterations <= maxIter

        self.logger.info('Completed KMeans in pydaal Batch/Serial Mode')

        return
Code Example #11
# imports assumed by this snippet (earlier notebook cells are not shown)
import pickle

import daal4py as d4p
import pandas as pd

# organizing variables used in the model for prediction
# each process gets its own data
infile = "./data/distributed_data/daal4py_Distributed_Kmeans_" + str(
    d4p.my_procid() + 1) + ".csv"

# read data
X = pd.read_csv(infile)

# ## Computing and Saving Initial Centroids

# Time to **initialize our centroids!**

# In[4]:

# computing initial centroids
init_result = d4p.kmeans_init(nClusters=3, method="plusPlusDense").compute(X)

# To **get initial centroid information and save it** to a file:

# In[5]:

# retrieving and printing initial centroids
centroids = init_result.centroids
print("Here are our centroids:\n\n\n", centroids, "\n")

centroids_filename = './models/kmeans_clustering_initcentroids_' + str(
    d4p.my_procid() + 1) + '.csv'

# saving centroids to a file (pickled binary, despite the .csv extension)
with open(centroids_filename, "wb") as f:
    pickle.dump(centroids, f)
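
To reuse the saved centroids later, they can be unpickled and passed as the starting centroids of the main K-Means step. A hedged sketch (maxIterations=25 is an assumption; X and centroids_filename come from the snippet above):

with open(centroids_filename, "rb") as f:
    loaded_centroids = pickle.load(f)

# seed a local kmeans run with the previously saved initial centroids
result = d4p.kmeans(nClusters=3, maxIterations=25, assignFlag=True).compute(
    X, loaded_centroids)
print(result.centroids)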
Code Example #12
File: pdisco.py  Project: adamrupe/DisCo
    def kmeans_lightcones(self,
                          past_params,
                          future_params,
                          past_decay=0,
                          future_decay=0,
                          past_init_params=None,
                          future_init_params=None):
        '''
        Performs clustering on the master arrays of both past and future lightcones.

        Expects the clustering algorithm to give integer cluster labels starting at 0,
        with the "noise cluster" having label -1.

        Diagnostics of this clustering (the unique clusters and how many
        lightcones were assigned to each cluster) are accessed through the namedtuple
        Reconstructor.lc_cluster_diagnostic.

        *** Actually make revert back to original Reconstructor format; don't require
        sklearn objects for clustering -- but do save centroids***

        *** How is the call to distributed DAAL4PY clustering objects going to work with this? ***

        Parameters
        ----------
        past_params: dict,
            Dictionary of keyword arguments for the past lightcone clustering algorithm.

            If past_cluster == 'kmeans':
                past_params must include values for 'nClusters' and 'maxIterations'

        future_params: dict,
            Dictionary of keyword arguments for the future lightcone clustering algorithm.

            If future_cluster == 'kmeans':
                future_params must include values for 'nClusters' and 'maxIterations'

        past_decay: int, optional (default=0)
            Exponential decay rate for lightcone distance used for past lightcone clustering.

        future_decay: int, optional (default=0)
            Exponential decay rate for lightcone distance used for future lightcone clustering.
        '''
        # OPT: comment out for performance runs
        if self.plcs is None:
            raise RuntimeError(
                "Must call .extract() on training field(s) before calling .kmeans_lightcones()."
            )

        past_decays = lightcone_decay_2D(self.past_depth, self.c, past_decay,
                                         False)
        self.plcs *= np.sqrt(past_decays)

        future_decays = lightcone_decay_2D(self.future_depth, self.c,
                                           future_decay, True)
        self.flcs *= np.sqrt(future_decays)

        # Need these for dbscan version (after clustering)
        self._N_pasts = past_params['nClusters']
        self._N_futures = future_params['nClusters']

        if past_init_params is None:
            #method = 'randomDense'
            #method = 'parallelPlusDense'
            #method = 'plusPlusDense'
            method = 'defaultDense'
            past_init_params = {
                'nClusters': self._N_pasts,
                #'method':'plusPlusDense',
                'method': method,
                'distributed': True
            }
        initial = d4p.kmeans_init(**past_init_params)
        #         print('past initialization method: ', method, flush=True)
        centroids = initial.compute(self.plcs).centroids
        #         print('done: past centroid calc', flush=True)
        past_cluster = d4p.kmeans(distributed=True,
                                  **past_params).compute(self.plcs, centroids)
        #         print('done: first pass past kmeans', flush=True)
        past_local = d4p.kmeans(nClusters=self._N_pasts,
                                distributed=False,
                                assignFlag=True,
                                maxIterations=0).compute(
                                    self.plcs, past_cluster.centroids)
        #         print('done: past cluster assignments', flush=True)
        self.pasts = past_local.assignments.flatten()
        #         print('done: flatten the past assignments', flush=True)

        del past_cluster
        del self.plcs

        if future_init_params is None:
            #method = 'randomDense'
            #method = 'parallelPlusDense'
            #method = 'plusPlusDense'
            method = 'defaultDense'
            future_init_params = {
                'nClusters': self._N_futures,
                #'method':'plusPlusDense',
                'method': method,
                'distributed': True
            }
        initial = d4p.kmeans_init(**future_init_params)
        #         print('future initialization method: ', method, flush=True)
        centroids = initial.compute(self.flcs).centroids
        #         print('done: future centroid calc', flush=True)
        future_cluster = d4p.kmeans(distributed=True, **future_params).compute(
            self.flcs, centroids)
        #         print('done: first pass future kmeans', flush=True)
        future_local = d4p.kmeans(nClusters=self._N_futures,
                                  distributed=False,
                                  assignFlag=True,
                                  maxIterations=0).compute(
                                      self.flcs, future_cluster.centroids)
        #         print('done: future cluster assignments', flush=True)
        self.futures = future_local.assignments.flatten()
        #         print('done: flatten the future assignments', flush=True)

        del future_cluster
        del self.flcs
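
Per the docstring, both parameter dictionaries must carry 'nClusters' and 'maxIterations'. A hedged sketch of a call site (the reconstructor instance and its .extract() setup are assumed, not shown in this excerpt):

past_params = {'nClusters': 8, 'maxIterations': 100}
future_params = {'nClusters': 8, 'maxIterations': 100}

# optionally override the default initialization, e.g. with k-means++ seeding
past_init_params = {'nClusters': 8, 'method': 'plusPlusDense', 'distributed': True}

reconstructor.kmeans_lightcones(past_params, future_params,
                                past_decay=0, future_decay=0,
                                past_init_params=past_init_params)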
Code Example #13
# imports assumed by this snippet (earlier notebook cells are not shown)
import pickle

import daal4py as d4p
import pandas as pd

# each process gets its own data
infile = "./data/distributed_data/daal4py_Distributed_Kmeans_" + str(
    d4p.my_procid() + 1) + ".csv"

# read data
X = pd.read_csv(infile)

# ## Computing and Saving Initial Centroids

# Time to **initialize our centroids!**

# In[4]:

# computing initial centroids
init_result = d4p.kmeans_init(nClusters=3,
                              method="plusPlusDense",
                              distributed=True).compute(X)

# To **get initial centroid information and save it** to a file:

# In[5]:

# retrieving and printing initial centroids
centroids = init_result.centroids
print("Here are our centroids:\n\n\n", centroids, "\n")

centroids_filename = './models/kmeans_clustering_initcentroids_' + str(
    d4p.my_procid() + 1) + '.csv'

# saving centroids to a file (pickled binary, despite the .csv extension)
with open(centroids_filename, "wb") as f:
    pickle.dump(centroids, f)
Code Example #14
import daal4py as d4p
from numpy import loadtxt, allclose

if __name__ == "__main__":

    # Initialize SPMD mode
    d4p.daalinit()

    infile = "./data/distributed/kmeans_dense.csv"
    nClusters = 10
    maxIter = 25

    # configure a kmeans-init
    init_algo = d4p.kmeans_init(nClusters,
                                method="plusPlusDense",
                                distributed=True)
    # Load the data
    data = loadtxt(infile, delimiter=',')
    # now slice the data, it would have been better to read only what we need, of course...
    rpp = int(data.shape[0] / d4p.num_procs())
    data = data[rpp * d4p.my_procid():rpp * d4p.my_procid() + rpp, :]

    # compute initial centroids
    init_result = init_algo.compute(data)
    # The result provides the initial centroids
    assert init_result.centroids.shape[0] == nClusters

    # configure kmeans main object
    algo = d4p.kmeans(nClusters, maxIter, distributed=True)
    # compute the clusters/centroids
    result = algo.compute(data, init_result.centroids)
    assert result.centroids.shape[0] == nClusters

    d4p.daalfini()
Code Example #15
    def kmeans_lightcones(self, past_params, future_params,
                            past_decay=0, future_decay=0,
                            past_init_params=None, future_init_params=None):
        '''
        Performs clustering on the global arrays of both past and future lightcones.


        Parameters
        ----------
        past_params: dict,
            Dictionary of keyword arguments for the past lightcone clustering algorithm.

            If past_cluster == 'kmeans':
                past_params must include values for 'nClusters' and 'maxIterations'

        future_params: dict,
            Dictionary of keyword arguments for the future lightcone clustering algorithm.

            If future_cluster == 'kmeans':
                future_params must include values for 'nClusters' and 'maxIterations'

        past_decay: int, optional (default=0)
            Exponential decay rate for lightcone distance used for past lightcone clustering.

        future_decay: int, optional (default=0)
            Exponential decay rate for lightcone distance used for future lightcone clustering.
        '''
        if self.plcs is None:
            raise RuntimeError("Must call .extract() on training field(s) before calling .kmeans_lightcones().")
            
        if len(self._adjusted_shape) == 2:
            past_decays = lightcone_decay(self.past_depth, self.c, past_decay, False)
            future_decays = lightcone_decay(self.future_depth, self.c, future_decay, True)
        elif len(self._adjusted_shape) == 3:
            past_decays = lightcone_decay_2D(self.past_depth, self.c, past_decay, False)
            future_decays = lightcone_decay_2D(self.future_depth, self.c, future_decay, True)
        else:
            raise ValueError("Expected a 2D or 3D adjusted field shape, got {}".format(self._adjusted_shape))
        
        self.plcs *= np.sqrt(past_decays)
        self.flcs *= np.sqrt(future_decays)

        # Primarily used for global joint dist in distributed mode
        self._N_pasts = past_params['nClusters']
        self._N_futures = future_params['nClusters']

        if past_init_params is None:
            #method = 'randomDense'
            #method = 'parallelPlusDense'
            method = 'plusPlusDense'
            #method = 'defaultDense'
            past_init_params = {'nClusters': self._N_pasts,
                                'method': method,
                                'distributed': self._distributed}
        initial = d4p.kmeans_init(**past_init_params)
        centroids = initial.compute(self.plcs).centroids
        past_cluster = d4p.kmeans(distributed=self._distributed, **past_params).compute(self.plcs, centroids)
        past_local = d4p.kmeans(nClusters=self._N_pasts, distributed=self._distributed, assignFlag=True, maxIterations=0).compute(self.plcs, past_cluster.centroids)
        self.pasts = past_local.assignments.flatten()

        del past_cluster
        del self.plcs

        if future_init_params is None:
            #method = 'randomDense'
            #method = 'parallelPlusDense'
            method = 'plusPlusDense'
            #method = 'defaultDense'
            future_init_params = {'nClusters': self._N_futures,
                                  'method': method,
                                  'distributed': self._distributed}
        initial = d4p.kmeans_init(**future_init_params)
        centroids = initial.compute(self.flcs).centroids
        future_cluster = d4p.kmeans(distributed=self._distributed, **future_params).compute(self.flcs, centroids)
        self._future_centroids = future_cluster.centroids # save for field reconstruction
        future_local = d4p.kmeans(nClusters=self._N_futures, distributed=self._distributed, assignFlag=True, maxIterations=0).compute(self.flcs, self._future_centroids)
        self.futures = future_local.assignments.flatten()

        del future_cluster
        del self.flcs