def main(method='plusPlusDense'):
    """Cluster this process's share of the sample data with distributed k-means.

    Every process reads the whole CSV and keeps one contiguous block of
    ``int(rows / num_procs)`` rows chosen by its process id (any remainder
    rows are not used). Centroids are initialised and refined in distributed
    mode; the local rows are then labelled by a separate, non-distributed
    zero-iteration k-means run against the final centroids.

    Args:
        method: Initialisation method handed to ``d4p.kmeans_init``.

    Returns:
        Tuple ``(assignments, result)``: the labels for the local rows and
        the result object of the distributed k-means run.
    """
    infile = "./data/distributed/kmeans_dense.csv"
    nClusters = 10
    maxIter = 25

    # Configure the distributed centroid initialisation
    initializer = d4p.kmeans_init(nClusters, method=method, distributed=True)

    # Load everything, then keep only this process's rows
    # (reading just the needed rows would be cheaper, of course)
    everything = loadtxt(infile, delimiter=',')
    rows_per_proc = int(everything.shape[0] / d4p.num_procs())
    first_row = rows_per_proc * d4p.my_procid()
    local_rows = everything[first_row:first_row + rows_per_proc, :]

    # Initial centroids: one row per requested cluster
    seeds = initializer.compute(local_rows)
    assert seeds.centroids.shape[0] == nClusters

    # Distributed k-means proper.
    # Note: initialisation and clustering could also be chained into a single expression.
    clusterer = d4p.kmeans(nClusters, maxIter, distributed=True)
    result = clusterer.compute(local_rows, seeds.centroids)

    # The result exposes centroids, goalFunction, nIterations and objectiveFunction
    assert result.centroids.shape[0] == nClusters
    assert result.nIterations <= maxIter

    # Assignments need an extra call (assignFlag is not directly supported in
    # SPMD mode yet): maxIterations=0, not distributed, local data only
    labeller = d4p.kmeans(nClusters, 0, assignFlag=True)
    assignments = labeller.compute(local_rows, result.centroids).assignments

    return (assignments, result)
def test_kmeans_spmd(self):
    """Compare SPMD k-means against batch k-means on the same CSV.

    For each initialisation method the initial and the final centroids of the
    distributed run (fed this process's slice of the rows) must be close to
    those of the batch run (fed all rows). ``parallelPlusDense`` is exempt
    from both comparisons and only triggers a printed warning.
    """
    nClusters = 10
    maxIter = 25
    known_mismatch = ("Warning: It is well known "
                      "that results of parallelPlusDense init "
                      "does not match with batch algorithm")

    data = np.loadtxt("./data/distributed/kmeans_dense.csv", delimiter=',')
    rows_per_proc = int(data.shape[0] / d4p.num_procs())
    offset = rows_per_proc * d4p.my_procid()
    spmd_data = data[offset:offset + rows_per_proc, :]

    for init_method in ('plusPlusDense', 'parallelPlusDense', 'deterministicDense'):
        comparable = init_method != 'parallelPlusDense'

        # Stage 1: centroid initialisation, batch vs. distributed
        batch_init = d4p.kmeans_init(nClusters=nClusters,
                                     method=init_method).compute(data)
        spmd_init = d4p.kmeans_init(nClusters=nClusters,
                                    method=init_method,
                                    distributed=True).compute(spmd_data)
        if comparable:
            self.assertTrue(
                np.allclose(batch_init.centroids, spmd_init.centroids),
                "Initial centroids with " + init_method
                + " does not match with batch algorithm")
        else:
            print(known_mismatch)

        # Stage 2: the clustering itself, each seeded with its own initial centroids
        batch_fit = d4p.kmeans(nClusters=nClusters,
                               maxIterations=maxIter).compute(data, batch_init.centroids)
        spmd_fit = d4p.kmeans(nClusters=nClusters,
                              maxIterations=maxIter,
                              distributed=True).compute(spmd_data, spmd_init.centroids)
        if comparable:
            self.assertTrue(
                np.allclose(batch_fit.centroids, spmd_fit.centroids),
                "Final centroids with " + init_method
                + " does not match with batch algorithm")
        else:
            print(known_mismatch)
def run_inference(num_observations: int = 1000):
    """Benchmark daal4py k-means on a sample of the test data.

    Each of the ``NUM_LOOPS`` repetitions times one complete run:
    ``randomDense`` centroid initialisation followed by k-means with
    5 clusters, at most 100 iterations and single precision.

    Args:
        num_observations: Passed as ``size`` to ``common.get_test_data_df``.

    Returns:
        The value of ``common.calculate_stats`` applied to the per-row
        timings collected over all repetitions.
    """
    # Load data
    test_df = common.get_test_data_df(X=common.X_dfc, size=num_observations)
    num_rows = len(test_df)

    print("_______________________________________")
    print("Total Number of Rows", num_rows)

    inference_times = []
    for _ in range(NUM_LOOPS):
        start_time = timer()
        init_alg = d4p.kmeans_init(nClusters=5, fptype="float", method="randomDense")
        centroids = init_alg.compute(test_df).centroids
        alg = d4p.kmeans(nClusters=5, maxIterations=100, fptype="float",
                         accuracyThreshold=0, assignFlag=False)
        # Only the elapsed time matters here; the clustering result is discarded
        alg.compute(test_df, centroids)
        end_time = timer()

        total_time = end_time - start_time
        # NOTE(review): 10e6 equals 1e7, not 1e6 - confirm the intended time
        # unit before comparing these numbers with other benchmarks. Left
        # unchanged so that reported statistics stay comparable.
        inference_times.append(total_time * (10e6) / num_rows)

    return_elem = common.calculate_stats(inference_times)
    print(num_observations, ", ", return_elem)
    return return_elem
def kmeans(N, D, nClusters, maxit):
    """Run daal4py k-means on an ``N`` x ``D`` array of uniform random numbers.

    Centroids are seeded with the ``plusPlusDense`` method.

    Returns:
        Tuple ``(centroids, assignments, objectiveFunction, goalFunction,
        nIterations)`` read from the k-means result object.
    """
    # Random input: not a meaningful clustering problem, but fine for now
    samples = np.random.ranf((N, D))

    seeder = daal4py.kmeans_init(nClusters, method='plusPlusDense')
    solver = daal4py.kmeans(nClusters, maxit)

    start_centroids = seeder.compute(samples).centroids
    outcome = solver.compute(samples, start_centroids)

    return (
        outcome.centroids,
        outcome.assignments,
        outcome.objectiveFunction,
        outcome.goalFunction,
        outcome.nIterations,
    )
def test_predict(X, X_init):
    """Run a zero-iteration k-means on ``X`` starting from ``X_init``.

    The algorithm is configured with the floating-point type reported by
    ``getFPType(X)``, ``params.n_clusters`` clusters, assignments enabled and
    an accuracy threshold of 0.0.

    Returns:
        The result object of ``compute(X, X_init)``.
    """
    return kmeans(
        fptype=getFPType(X),
        nClusters=params.n_clusters,
        maxIterations=0,
        assignFlag=True,
        accuracyThreshold=0.0,
    ).compute(X, X_init)
def main(readcsv=read_csv, method='defaultDense'):
    """Batch k-means example: 20 clusters, at most 5 iterations.

    Args:
        readcsv: Callable invoked as ``readcsv(infile, range(20))`` to load
            the data.
        method: Accepted but not used by this function; the initialisation
            always uses ``"randomDense"``.

    Returns:
        The k-means result object (assignments were requested).
    """
    infile = "./data/batch/kmeans_dense.csv"
    nClusters = 20
    maxIter = 5

    initrain_algo = d4p.kmeans_init(nClusters, method="randomDense")

    # Load the data
    data = readcsv(infile, range(20))

    # Initial centroids: one row per requested cluster
    seeds = initrain_algo.compute(data).centroids
    assert seeds.shape[0] == nClusters

    # Main k-means object; cluster assignments are requested as well.
    # Note: initialisation and clustering could be chained into one expression.
    clusterer = d4p.kmeans(nClusters, maxIter, assignFlag=True)
    result = clusterer.compute(data, seeds)

    # The result provides assignments (if requested), centroids,
    # goalFunction, nIterations and objectiveFunction
    assert result.centroids.shape[0] == nClusters
    assert result.assignments.shape == (data.shape[0], 1)
    assert result.nIterations <= maxIter

    return result
def _daal4py_kmeans_compatibility(nClusters, maxIterations, fptype="double",
                                  method="lloydDense", accuracyThreshold=0.0,
                                  resultsToEvaluate="computeCentroids"):
    """Build a ``daal4py.kmeans`` object using whichever keywords this DAAL accepts.

    When ``daal_check_version((2020, 2), (2021, 107))`` is truthy the
    ``resultsToEvaluate`` keyword is passed through; otherwise it is
    translated into the legacy boolean ``assignFlag`` (true when
    ``'computeAssignments'`` occurs in ``resultsToEvaluate``).

    ``accuracyThreshold`` is forwarded in both cases. It used to be accepted
    and then silently dropped, so a caller's tolerance never reached the
    algorithm.

    Returns:
        The configured ``daal4py.kmeans`` algorithm object.
    """
    shared_kwargs = dict(
        nClusters=nClusters,
        maxIterations=maxIterations,
        fptype=fptype,
        accuracyThreshold=accuracyThreshold,
        method=method,
    )

    if daal_check_version((2020, 2), (2021, 107)):
        return daal4py.kmeans(resultsToEvaluate=resultsToEvaluate, **shared_kwargs)

    # Legacy interface: only a yes/no switch for computing assignments
    assign_flag = 'computeAssignments' in resultsToEvaluate
    return daal4py.kmeans(assignFlag=assign_flag, **shared_kwargs)
def _daal4py_k_means_dense(X, nClusters, numIterations, tol, cluster_centers_0, n_init, random_state): if numIterations < 0: raise ValueError("Wrong iterations number") if hasattr(X, '__array__'): X_fptype = getFPType(X) else: raise NotImplementedError("""Unsupported input type {} encountered in DAAL-based optimization of KMeans. You can disable DAAL-based optimizations of scikit-learn with sklearn.daal4sklearn.dispatcher.disable()""".format(type(X))) abs_tol = _tolerance(X, tol) # tol is relative tolerance best_labels, best_inertia, best_cluster_centers = None, None, None best_n_iter = -1 if numIterations == 0: n_init = 1 kmeans_algo = daal4py.kmeans( nClusters = nClusters, maxIterations = numIterations, assignFlag = True, accuracyThreshold = abs_tol, fptype = X_fptype, # gamma = 1.0, # only relevant for categorical features of which we should have none method = 'defaultDense') #, # distanceType = 'euclidean') for k in range(n_init): deterministic, starting_centroids_ = _daal4py_compute_starting_centroids( X, X_fptype, nClusters, cluster_centers_0, random_state) res = kmeans_algo.compute(X, starting_centroids_) # Per documentation, with numIterations == 0, centroids and goalFunction are not updated if numIterations == 0: best_labels = res.assignments[:,0] best_n_iter = int(res.nIterations[0,0]) break else: inertia = res.goalFunction[0,0] if best_inertia is None or inertia < best_inertia: best_labels = res.assignments.ravel() best_cluster_centers = res.centroids if n_init > 1: best_labels = best_labels.copy() best_cluster_centers = best_cluster_centers.copy() best_inertia = inertia best_n_iter = int(res.nIterations[0,0]) if deterministic and n_init != 1: warnings.warn( 'Explicit initial center position passed: ' 'performing only one init in k-means instead of n_init=%d' % n_init, RuntimeWarning, stacklevel=2) break return best_cluster_centers, best_labels, best_inertia, best_n_iter
def compute(data, nClusters, maxIter, method):
    """Initialise centroids with ``method`` and run single-precision k-means.

    Both the initialisation and the clustering use ``fptype='float'``; the
    clustering also requests cluster assignments.

    Returns:
        The k-means result object.
    """
    # Initial centroids
    seeds = d4p.kmeans_init(nClusters, method=method, fptype='float').compute(data)

    # Clusters/centroids, with assignments requested
    clusterer = d4p.kmeans(nClusters, maxIter, assignFlag=True, fptype='float')
    return clusterer.compute(data, seeds.centroids)
def _daal4py_kmeans_compatibility(nClusters, maxIterations, fptype="double",
                                  method="lloydDense", accuracyThreshold=0.0,
                                  resultsToEvaluate="computeCentroids"):
    """Create a ``daal4py.kmeans`` object from the given settings.

    Every argument is forwarded unchanged under the same keyword name.

    Returns:
        The configured ``daal4py.kmeans`` algorithm object.
    """
    settings = {
        'nClusters': nClusters,
        'maxIterations': maxIterations,
        'fptype': fptype,
        'resultsToEvaluate': resultsToEvaluate,
        'accuracyThreshold': accuracyThreshold,
        'method': method,
    }
    return daal4py.kmeans(**settings)
def kMeans(self, Data_Path, n):
    '''
    daal4py KMeans Clustering SPMD Mode

    Each process reads ``Data_Path + str(process id + 1) + ".csv"``, takes
    part in the distributed centroid initialisation and k-means run
    (4 clusters, at most 25 iterations), and stores the wall-clock time of
    the k-means compute call in
    ``self.latency["Parallel_KMeans_SPMD_Time"]``.

    Data_Path: path prefix of the per-process CSV files
    n: thread count handed to ``d4p.daalinit``
    '''
    nClusters = 4
    maxIter = 25  # fixed maximum number of iterations

    # Initialize SPMD mode
    d4p.daalinit(nthreads=n)

    # training setup
    file_path = Data_Path + str(d4p.my_procid() + 1) + ".csv"
    data = pd.read_csv(file_path)
    init_algo = d4p.kmeans_init(nClusters=nClusters,
                                distributed=True, method="plusPlusDense")

    self.logger.info('Training the KMeans in pydaal SPMD Mode')

    # compute initial centroids (once - the earlier duplicate call whose
    # result was never used has been dropped)
    init_result = init_algo.compute(data)

    # configure kmeans main object
    algo = d4p.kmeans(nClusters, maxIter, distributed=True)

    # compute the clusters/centroids; only this call is timed
    kmeans_start_time = time.time()
    result = algo.compute(data, init_result.centroids)
    self.latency["Parallel_KMeans_SPMD_Time"] = time.time() - \
        kmeans_start_time

    # result is available on all processes - but we print only on root
    if d4p.my_procid() == 0:
        print("KMeans completed", result)

    self.logger.info('Completed KMeans in pydaal SPMD Mode')
    d4p.daalfini()

    return
def kMeans(self, data, target):
    '''
    Method for serial running of Kmeans

    Drops the ``target`` column from ``data``, seeds 4 centroids with
    ``plusPlusDense`` and runs batch k-means for at most 25 iterations.
    The wall-clock time of the k-means compute call is stored in
    ``self.latency["Serial_KMeans_Batch_Time"]``.

    data: frame-like object supporting ``drop(target, axis=1)``
    target: label of the column to exclude from clustering
    '''
    nClusters = 4
    maxIter = 25  # fixed maximum number of iterations

    data = data.drop(target, axis=1)

    init_algo = d4p.kmeans_init(nClusters=nClusters, method="plusPlusDense")
    self.logger.info('Training the KMeans in pydaal Batch/Serial Mode')
    train_result = init_algo.compute(data)

    # The result provides the initial centroids
    assert train_result.centroids.shape[0] == nClusters

    # configure kmeans main object: assignments are requested explicitly
    # because their shape is checked below
    algo = d4p.kmeans(nClusters, maxIter, assignFlag=True)

    # compute the clusters/centroids; only this call is timed
    kmeans_start_time = time.time()
    result = algo.compute(data, train_result.centroids)
    self.latency["Serial_KMeans_Batch_Time"] = time.time() - kmeans_start_time

    # Kmeans result objects provide assignments (if requested), centroids,
    # goalFunction, nIterations and objectiveFunction
    assert result.centroids.shape[0] == nClusters
    assert result.assignments.shape == (data.shape[0], 1)
    assert result.nIterations <= maxIter

    self.logger.info('Completed KMeans in pydaal Batch/Serial Mode')

    return
def kmeans_lightcones(self, past_params, future_params, past_decay=0, future_decay=0,
                      past_init_params=None, future_init_params=None):
    '''
    Performs k-means clustering on the arrays of both past and future lightcones.

    Both lightcone arrays are first scaled in place by the square root of
    their decay weights. Past lightcones are then clustered, labelled and
    released before future lightcones are processed the same way.

    Sets self._N_pasts, self._N_futures, self.pasts, self.futures and
    self._future_centroids; deletes self.plcs and self.flcs.

    Parameters
    ----------
    past_params: dict,
        Keyword arguments passed to d4p.kmeans for past lightcone clustering.
        Must include values for 'nClusters' and 'maxIterations', and must not
        contain 'distributed' (it is supplied from self._distributed).

    future_params: dict,
        Keyword arguments passed to d4p.kmeans for future lightcone clustering.
        Same requirements as past_params.

    past_decay: int, optional (default=0)
        Exponential decay rate for lightcone distance used for past lightcone clustering.

    future_decay: int, optional (default=0)
        Exponential decay rate for lightcone distance used for future lightcone clustering.

    past_init_params: dict, optional (default=None)
        Keyword arguments passed to d4p.kmeans_init for the past centroids.
        If None: 'plusPlusDense' with nClusters taken from past_params and
        distributed taken from self._distributed.

    future_init_params: dict, optional (default=None)
        Same as past_init_params, for the future centroids.

    Raises
    ------
    RuntimeError
        If self.plcs is None.
    ValueError
        If self._adjusted_shape has neither 2 nor 3 entries.
    '''
    if self.plcs is None:
        raise RuntimeError("Must call .extract() on a training field(s) before calling .cluster_lightcones().")

    n_dims = len(self._adjusted_shape)
    if n_dims == 2:
        past_decays = lightcone_decay(self.past_depth, self.c, past_decay, False)
        future_decays = lightcone_decay(self.future_depth, self.c, future_decay, True)
    elif n_dims == 3:
        past_decays = lightcone_decay_2D(self.past_depth, self.c, past_decay, False)
        future_decays = lightcone_decay_2D(self.future_depth, self.c, future_decay, True)
    else:
        # Previously fell through and failed below on an unbound local name
        raise ValueError(
            "Unsupported field shape of length {}; expected 2 or 3.".format(n_dims))

    self.plcs *= np.sqrt(past_decays)
    self.flcs *= np.sqrt(future_decays)

    # Primarily used for global joint dist in distributed mode
    self._N_pasts = past_params['nClusters']
    self._N_futures = future_params['nClusters']

    # ---- past lightcones ----
    if past_init_params is None:
        # alternatives tried: 'randomDense', 'parallelPlusDense', 'defaultDense'
        method = 'plusPlusDense'
        past_init_params = {'nClusters': self._N_pasts,
                            'method': method,
                            'distributed': self._distributed}
    initial = d4p.kmeans_init(**past_init_params)
    centroids = initial.compute(self.plcs).centroids
    past_cluster = d4p.kmeans(distributed=self._distributed,
                              **past_params).compute(self.plcs, centroids)
    # Second, zero-iteration pass: only used to obtain the assignments
    past_local = d4p.kmeans(nClusters=self._N_pasts,
                            distributed=self._distributed,
                            assignFlag=True,
                            maxIterations=0).compute(self.plcs, past_cluster.centroids)
    self.pasts = past_local.assignments.flatten()

    # Release the past data before the futures are clustered
    del past_cluster
    del self.plcs

    # ---- future lightcones ----
    if future_init_params is None:
        # alternatives tried: 'randomDense', 'parallelPlusDense', 'defaultDense'
        method = 'plusPlusDense'
        future_init_params = {'nClusters': self._N_futures,
                              'method': method,
                              'distributed': self._distributed}
    initial = d4p.kmeans_init(**future_init_params)
    centroids = initial.compute(self.flcs).centroids
    future_cluster = d4p.kmeans(distributed=self._distributed,
                                **future_params).compute(self.flcs, centroids)
    self._future_centroids = future_cluster.centroids  # save for field reconstruction
    future_local = d4p.kmeans(nClusters=self._N_futures,
                              distributed=self._distributed,
                              assignFlag=True,
                              maxIterations=0).compute(self.flcs, self._future_centroids)
    self.futures = future_local.assignments.flatten()

    del future_cluster
    del self.flcs
init_algo = d4p.kmeans_init(nClusters, method="plusPlusDense", distributed=True) # Load the data data = loadtxt(infile, delimiter=',') # now slice the data, it would have been better to read only what we need, of course... rpp = int(data.shape[0] / d4p.num_procs()) data = data[rpp * d4p.my_procid():rpp * d4p.my_procid() + rpp, :] # compute initial centroids init_result = init_algo.compute(data) # The results provides the initial centroids assert init_result.centroids.shape[0] == nClusters # configure kmeans main object algo = d4p.kmeans(nClusters, maxIter, distributed=True) # compute the clusters/centroids result = algo.compute(data, init_result.centroids) # Note: we could have done this in just one line: # d4p.kmeans(nClusters, maxIter, assignFlag=True, distributed=True).compute(data, d4p.kmeans_init(nClusters, method="plusPlusDense", distributed=True).compute(data).centroids) # Kmeans result objects provide centroids, goalFunction, nIterations and objectiveFunction assert result.centroids.shape[0] == nClusters assert result.nIterations <= maxIter # we need an extra call to kmeans to get the assignments (not directly supported through parameter assignFlag yet in SPMD mode) algo = d4p.kmeans( nClusters, 0, assignFlag=True ) # maxIt=0; not distributed, we compute on local data only! assignments = algo.compute(data, result.centroids).assignments
def test_fit(X, X_init):
    """Fit k-means on ``X`` starting from the centroids in ``X_init``.

    Cluster count and iteration limit come from ``params.n_clusters`` and
    ``params.maxiter``.

    Returns:
        The result object of ``compute(X, X_init)``.
    """
    # FIXME tolerance?
    settings = {
        'nClusters': params.n_clusters,
        'maxIterations': params.maxiter,
    }
    return kmeans(**settings).compute(X, X_init)
def kmeans_lightcones(self, past_params, future_params, past_decay=0, future_decay=0,
                      past_init_params=None, future_init_params=None):
    '''
    Performs distributed k-means clustering on the arrays of both past and
    future lightcones.

    Both arrays are scaled in place by the square root of the 2D decay
    weights. Centroids are initialised and refined in distributed mode; a
    second, non-distributed zero-iteration k-means pass then yields the
    cluster assignments for the local lightcones. Past data is released
    before the futures are clustered.

    Sets self._N_pasts, self._N_futures, self.pasts and self.futures;
    deletes self.plcs and self.flcs.

    Parameters
    ----------
    past_params: dict,
        Keyword arguments passed to d4p.kmeans for past lightcone clustering.
        Must include values for 'nClusters' and 'maxIterations', and must not
        contain 'distributed' (always passed as True here).

    future_params: dict,
        Keyword arguments passed to d4p.kmeans for future lightcone clustering.
        Same requirements as past_params.

    past_decay: int, optional (default=0)
        Exponential decay rate for lightcone distance used for past lightcone clustering.

    future_decay: int, optional (default=0)
        Exponential decay rate for lightcone distance used for future lightcone clustering.

    past_init_params: dict, optional (default=None)
        Keyword arguments passed to d4p.kmeans_init for the past centroids.
        If None: 'defaultDense', distributed, nClusters from past_params.

    future_init_params: dict, optional (default=None)
        Same as past_init_params, for the future centroids.
    '''
    # OPT: this guard can be commented out for performance runs
    if self.plcs is None:
        raise RuntimeError(
            "Must call .extract() on a training field(s) before calling .cluster_lightcones()."
        )

    past_weights = lightcone_decay_2D(self.past_depth, self.c, past_decay, False)
    self.plcs *= np.sqrt(past_weights)
    future_weights = lightcone_decay_2D(self.future_depth, self.c, future_decay, True)
    self.flcs *= np.sqrt(future_weights)

    # Cluster counts are kept on the instance for use after clustering
    self._N_pasts = past_params['nClusters']
    self._N_futures = future_params['nClusters']

    # ---- past lightcones ----
    if past_init_params is None:
        # alternatives tried: 'randomDense', 'parallelPlusDense', 'plusPlusDense'
        past_init_params = dict(nClusters=self._N_pasts,
                                method='defaultDense',
                                distributed=True)
    past_seeds = d4p.kmeans_init(**past_init_params).compute(self.plcs).centroids
    past_fit = d4p.kmeans(distributed=True, **past_params).compute(self.plcs, past_seeds)
    past_labeller = d4p.kmeans(nClusters=self._N_pasts,
                               distributed=False,
                               assignFlag=True,
                               maxIterations=0)
    past_labels = past_labeller.compute(self.plcs, past_fit.centroids)
    self.pasts = past_labels.assignments.flatten()

    del past_fit
    del self.plcs

    # ---- future lightcones ----
    if future_init_params is None:
        # alternatives tried: 'randomDense', 'parallelPlusDense', 'plusPlusDense'
        future_init_params = dict(nClusters=self._N_futures,
                                  method='defaultDense',
                                  distributed=True)
    future_seeds = d4p.kmeans_init(**future_init_params).compute(self.flcs).centroids
    future_fit = d4p.kmeans(distributed=True, **future_params).compute(self.flcs, future_seeds)
    future_labeller = d4p.kmeans(nClusters=self._N_futures,
                                 distributed=False,
                                 assignFlag=True,
                                 maxIterations=0)
    future_labels = future_labeller.compute(self.flcs, future_fit.centroids)
    self.futures = future_labels.assignments.flatten()

    del future_fit
    del self.flcs
def test_predict(X, X_init):
    """Run a zero-iteration k-means on ``X`` starting from ``X_init``.

    The cluster count comes from ``params.n_clusters``.

    Returns:
        The result object of ``compute(X, X_init)``.
    """
    # FIXME tolerance
    settings = {
        'nClusters': params.n_clusters,
        'maxIterations': 0,
    }
    return kmeans(**settings).compute(X, X_init)
# Distributed (SPMD) k-means on this process's slice of the input file.
# Configure the distributed centroid initialisation.
init_algo = d4p.kmeans_init(nClusters, method="plusPlusDense", distributed=True)

# Load the data
data = loadtxt(infile, delimiter=',')

# now slice the data, it would have been better to read only what we need, of course...
# Each process keeps int(rows / num_procs) consecutive rows; remainder rows are not used.
rpp = int(data.shape[0] / d4p.num_procs())
data = data[rpp * d4p.my_procid():rpp * d4p.my_procid() + rpp, :]

# compute initial centroids
init_result = init_algo.compute(data)

# The result provides the initial centroids
assert init_result.centroids.shape[0] == nClusters

# configure kmeans main object
algo = d4p.kmeans(nClusters, maxIter, distributed=True)

# compute the clusters/centroids
result = algo.compute(data, init_result.centroids)

# Note: we could have done this in just one line:
# d4p.kmeans(nClusters, maxIter, assignFlag=True, distributed=True).compute(data, d4p.kmeans_init(nClusters, method="plusPlusDense", distributed=True).compute(data).centroids)

# Kmeans result objects provide assignments (if requested), centroids, goalFunction, nIterations and objectiveFunction
assert result.centroids.shape[0] == nClusters
print(result.nIterations, result.centroids[0], maxIter)

# we'd need an extra call to kmeans.compute(10, 0) to get the assignments; getting assignments is not yet supported in dist mode
# Identity check: "== None" would compare elementwise if an array were ever returned here
assert result.assignments is None
assert result.nIterations <= maxIter

print('All looks good!')
d4p.daalfini()
# Split the rows of image_array evenly across the processes; each process
# keeps int(rows / num_procs) consecutive rows (remainder rows are not used).
n_slices = int(image_array.shape[0] / d4p.num_procs())
print("Number of MPI tasks: ", d4p.num_procs())
image_array = image_array[n_slices * d4p.my_procid():n_slices * d4p.my_procid() + n_slices, :]

print("Fitting model on the data")
t0 = time()

# compute initial centroids
init_result = init_algo.compute(image_array)
assert init_result.centroids.shape[0] == n_colors

# configure kmeans main object
algo = d4p.kmeans(n_colors, max_iter, distributed=True)

# compute the clusters/centroids
result = algo.compute(image_array, init_result.centroids)

# Kmeans result objects provide centroids, goalFunction, nIterations and objectiveFunction
assert result.centroids.shape[0] == n_colors
assert result.nIterations <= max_iter
print("Computation finished in in %0.3fs." % (time() - t0))

# Get labels for all points
# (image_array was sliced above, so this labels the rows held by this process)
print("Predicting color indices on the full image (k-means)")
t0 = time()
# Zero-iteration, non-distributed k-means: used only to obtain the assignments
algo = d4p.kmeans(n_colors, 0, assignFlag=True)
prediction = algo.compute(image_array, result.centroids)
labels = prediction.assignments
def predict(X):
    """Run a zero-iteration k-means on ``X`` from the module-level ``X_init``.

    The number of clusters equals the number of rows in ``X_init``.

    Returns:
        The result object of ``compute(X, X_init)``. It used to be discarded;
        callers that ignore the return value are unaffected.
    """
    algorithm = kmeans(X_init.shape[0], 0)  # FIXME tolerance
    return algorithm.compute(X, X_init)
# Now let's **load up the centroids** and look at them.

# In[6]:

# loading the initial centroids from a file
# NOTE: unpickling can execute arbitrary code - only load files written by this workflow.
# The context manager closes the file handle, which was previously left open.
with open(centroids_filename, "rb") as centroids_file:
    loaded_centroids = pickle.load(centroids_file)
print("Here is our centroids loaded from file:\n\n", loaded_centroids)

# # Assign The Data to Clusters and Save The Results

# Let's **assign the data** to clusters.

# In[7]:

# compute the clusters/centroids
# NOTE(review): this starts from init_result.centroids, not from the
# loaded_centroids read above - confirm which one is intended.
kmeans_result = d4p.kmeans(nClusters=3, maxIterations=5, assignFlag=True).compute(X, init_result.centroids)

# To **get Kmeans result objects** (assignments, centroids, goalFunction [deprecated], nIterations, and objectiveFunction):

# In[8]:

# retrieving and printing cluster assignments
assignments = kmeans_result.assignments
print("Here is our cluster assignments for first 5 datapoints: \n\n", assignments[:5])

# Now let's **export the cluster assignments** to a **CSV file**. We will also **stop the distribution engine.**

# In[9]:

# now export the results to a CSV file
def train(X):
    """Run k-means on ``X`` from the module-level ``X_init``.

    Uses 10 clusters and at most 100 iterations.

    Returns:
        The result object of ``compute(X, X_init)``. It used to be discarded;
        callers that ignore the return value are unaffected.
    """
    algorithm = kmeans(10, 100)  # FIXME tolerance?
    return algorithm.compute(X, X_init)