def _check_array(self, X):
    t0 = tic()
    if isinstance(X, pd.DataFrame):
        X = X.values
    elif isinstance(X, dd.DataFrame):
        raise TypeError("Cannot fit on dask.dataframe due to unknown "
                        "partition lengths.")

    if X.dtype == 'int32':
        X = X.astype('float32')
    elif X.dtype == 'int64':
        X = X.astype('float64')

    X = check_array(X, accept_dask_dataframe=False,
                    accept_unknown_chunks=False, accept_sparse=False)

    if isinstance(X, np.ndarray):
        X = da.from_array(X, chunks=(max(1, len(X) // cpu_count()),
                                     X.shape[-1]))

    bad = (da.isnull(X).any(), da.isinf(X).any())
    if any(*compute(bad)):
        msg = ("Input contains NaN, infinity or a value too large for "
               "dtype('float64').")
        raise ValueError(msg)
    t1 = tic()
    logger.info("Finished check_array in %0.2f s", t1 - t0)
    return X
def main(args=None):
    args = parse_args(args)
    steps = range(args.start, args.stop, args.step)

    if args.scheduler_address:
        client = Client(args.scheduler_address)
        info = client.scheduler_info()
        logger.info("Distributed mode: %s", client.scheduler)
        logger.info("Dashboard: %s:%s", info["address"],
                    info["services"]["bokeh"])
    else:
        logger.warning("Local mode")

    logger.info("Fitting for %s", list(steps))

    logger.info("Reading data")
    X = read().pipe(transform).pipe(as_array)
    X, = persist(X)

    timings = []
    for n_clusters in steps:
        logger.info("Starting %02d", n_clusters)
        t0 = tic()
        with _timer(n_clusters, _logger=logger):
            km = do(X, n_clusters, factor=args.factor)
        t1 = tic()
        logger.info("Cluster Centers [%s]:\n%s", n_clusters,
                    km.cluster_centers_)
        inertia = km.inertia_.compute()
        # Log the cluster count, not the centers array, as the key.
        logger.info("Inertia [%s]: %s", n_clusters, inertia)
        timings.append((n_clusters, args.factor, t1 - t0, inertia))

    pd.DataFrame(timings,
                 columns=["n_clusters", "factor", "time", "inertia"]
                 ).to_csv("timings.csv")
def main(args=None):
    # Pass the incoming args through; parse_args() would ignore them.
    args = parse_args(args)
    ctx = directory = tempfile.TemporaryDirectory()

    with ctx:
        # Use .name: str(TemporaryDirectory) is its repr, not the path.
        base = directory.name
        original = os.path.join(base, args.original)
        split = os.path.join(base, args.split)
        final = os.path.join(base, args.final)

        shape = (args.n_slices,) + args.shape
        chunks = (1,) + args.shape
        a = da.random.random(shape, chunks=chunks)
        a.to_zarr(original, overwrite=True)

        with Client():
            print("rechunking")
            t0 = tic()
            with performance_report():
                rechunk.rechunk(original, split, final, args.split_chunks)
            t1 = tic()

        took = t1 - t0
        gbs = a.nbytes / 1e9 / took
        print(
            f"Rechunked {dask.utils.format_bytes(a.nbytes)} "
            f"in {took:.2f}s ({gbs:0.2f} GB/s)"
        )
def main(args=None):
    args = parse_args(args)
    steps = range(args.start, args.stop, args.step)

    if args.scheduler_address:
        client = Client(args.scheduler_address)
        info = client.scheduler_info()
        logger.info("Distributed mode: %s", client.scheduler)
        logger.info("Dashboard: %s:%s", info['address'],
                    info['services']['bokeh'])
    else:
        logger.warning("Local mode")

    logger.info("Fitting for %s", list(steps))

    logger.info("Reading data")
    X = read().pipe(transform).pipe(as_array)
    X, = persist(X)

    timings = []
    for n_clusters in steps:
        logger.info("Starting %02d", n_clusters)
        t0 = tic()
        km = do(X, n_clusters, factor=args.factor)
        t1 = tic()
        logger.info("Finished %02d, [%.2f]", n_clusters, t1 - t0)
        logger.info("Cluster Centers [%s]:\n%s", n_clusters,
                    km.cluster_centers_)
        inertia = km.inertia_.compute()
        # Log the cluster count, not the centers array, as the key.
        logger.info("Inertia [%s]: %s", n_clusters, inertia)
        timings.append((n_clusters, args.factor, t1 - t0, inertia))

    pd.DataFrame(timings,
                 columns=['n_clusters', 'factor', 'time', 'inertia']
                 ).to_csv('timings.csv')
def wrapper(*args, **kwargs):
    # TODO: grab config.
    # TODO: structlog or something similar
    t0 = tic()
    result = func(*args, **kwargs)
    t1 = tic()
    timings[func.__name__].append(t1 - t0)
    return result
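# The function above is only the inner closure. A minimal sketch of the
# enclosing decorator it implies, assuming a module-level ``timings``
# mapping from function name to a list of durations; the decorator name
# ``timed`` is illustrative, not from the source.
import functools
from collections import defaultdict
from timeit import default_timer as tic

timings = defaultdict(list)


def timed(func):
    """Record the wall-clock duration of every call to ``func``."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        t0 = tic()
        result = func(*args, **kwargs)
        t1 = tic()
        timings[func.__name__].append(t1 - t0)
        return result
    return wrapper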
def _partial_fit(model, x, y, kwargs=None):
    kwargs = kwargs or dict()
    start = tic()
    logger.info("Starting partial-fit %s", dask.base.tokenize(model, x, y))
    model.partial_fit(x, y, **kwargs)
    stop = tic()
    logger.info("Finished partial-fit %s [%0.2f]",
                dask.base.tokenize(model, x, y), stop - start)
    return model
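# A hedged sketch of how ``_partial_fit`` might be driven over a dask
# array's blocks: thread the model through successive delayed calls so
# each block updates it in order. ``fit_blockwise`` and the iteration
# scheme are assumptions, not the source's API.
import dask


def fit_blockwise(model, X, y):
    delayed_fit = dask.delayed(_partial_fit)
    for xb, yb in zip(X.to_delayed().flatten(), y.to_delayed().flatten()):
        # Each call depends on the model returned by the previous one.
        model = delayed_fit(model, xb, yb)
    return model.compute()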
def init_scalable(X, n_clusters, random_state=None, max_iter=None,
                  oversampling_factor=2):
    """K-Means initialization using k-means||

    This is algorithm 2 in Scalable K-Means++ (2012).
    """
    if isinstance(random_state, Integral) or random_state is None:
        random_state = da.random.RandomState(random_state)

    logger.info("Initializing with k-means||")
    init_start = tic()

    # Step 1: Initialize Centers
    idx = 0
    centers = da.compute(X[idx, np.newaxis])[0]
    c_idx = {idx}

    # Step 2: Initialize cost
    cost = evaluate_cost(X, centers)
    # TODO: natural log10? log2?
    n_iter = int(np.round(np.log(cost)))
    if max_iter is not None:
        n_iter = min(max_iter, n_iter)

    # Steps 3 - 6: update candidate Centers
    for i in range(n_iter):
        t0 = tic()
        new_idxs = _sample_points(X, centers, oversampling_factor,
                                  random_state)
        new_idxs = set(*compute(new_idxs))
        c_idx |= new_idxs
        t1 = tic()
        logger.info("init iteration %2d/%2d %.2f s, %2d centers",
                    i + 1, n_iter, t1 - t0, len(c_idx))

    # Sort before slicing, for better performance / memory
    # usage with the scheduler.
    # See https://github.com/dask/dask-ml/issues/39
    centers = X[sorted(c_idx)].compute()

    # XXX: scikit-learn doesn't have weighted k-means.
    # The paper weights each center by the number of points closest to it.
    # https://stackoverflow.com/a/37198799/1889400 claims you can scale the
    # features before clustering, but that doesn't seem right.
    # I think that replicating the *points*, proportional to the number of
    # original points closest to the candidate centers, would be a better way
    # to do that.

    # Step 7, 8 without weights
    km = sk_k_means.KMeans(n_clusters)
    km.fit(centers)
    logger.info("Finished initialization. %.2f s, %2d centers",
                tic() - init_start, n_clusters)
    return km.cluster_centers_
def init_random(X, n_clusters, random_state):
    """K-means initialization using randomly chosen points"""
    logger.info("Initializing randomly")
    t0 = tic()
    idx = sorted(random_state.randint(0, len(X), size=n_clusters))
    centers = X[idx].compute()
    logger.info("Finished initialization. %.2f s, %2d centers",
                tic() - t0, n_clusters)
    return centers
def _kmeans_single_lloyd(X, n_clusters, max_iter=300, init='k-means||',
                         verbose=False, x_squared_norms=None,
                         random_state=None, tol=1e-4,
                         precompute_distances=True,
                         oversampling_factor=2, init_max_iter=None):
    centers = k_init(X, n_clusters, init=init,
                     oversampling_factor=oversampling_factor,
                     random_state=random_state, max_iter=init_max_iter)
    dt = X.dtype
    X = X.astype(np.float32)
    P = X.shape[1]
    for i in range(max_iter):
        t0 = tic()
        centers = centers.astype('f4')
        labels, distances = pairwise_distances_argmin_min(
            X, centers, metric='euclidean', metric_kwargs={"squared": True}
        )
        labels = labels.astype(np.int32)
        distances = distances.astype(np.float32)

        r = da.atop(_centers_dense, 'ij',
                    X, 'ij',
                    labels, 'i',
                    n_clusters, None,
                    distances, 'i',
                    adjust_chunks={"i": n_clusters, "j": P},
                    dtype='f8')
        new_centers = da.from_delayed(
            sum(r.to_delayed().flatten()),
            (n_clusters, P),
            X.dtype
        )
        counts = da.bincount(labels, minlength=n_clusters)
        new_centers = new_centers / counts[:, None]
        new_centers, = compute(new_centers)

        # Convergence check
        shift = squared_norm(centers - new_centers)
        t1 = tic()
        logger.info("Lloyd loop %2d. Shift: %0.4f [%.2f s]",
                    i, shift, t1 - t0)
        if shift < tol:
            break
        centers = new_centers

    if shift > 1e-7:
        labels, distances = pairwise_distances_argmin_min(X, centers)

    inertia = distances.astype(dt).sum()
    centers = centers.astype(dt)
    labels = labels.astype(np.int64)

    return labels, inertia, centers, i + 1
def init_pp(X, n_clusters, random_state):
    """K-means initialization using k-means++

    This uses scikit-learn's implementation.
    """
    x_squared_norms = row_norms(X, squared=True).compute()
    logger.info("Initializing with k-means++")
    t0 = tic()
    centers = sk_k_means._k_init(X, n_clusters, random_state=random_state,
                                 x_squared_norms=x_squared_norms)
    logger.info("Finished initialization. %.2f s, %2d centers",
                tic() - t0, n_clusters)
    return centers
def fit(data, use_scikit_learn=False):
    logger.info("Starting to cluster")
    # Cluster
    n_clusters = 8
    oversampling_factor = 2
    if use_scikit_learn:
        km = sk.KMeans(n_clusters=n_clusters, random_state=0)
    else:
        km = KMeans(n_clusters=n_clusters,
                    oversampling_factor=oversampling_factor,
                    random_state=0)
    t0 = tic()
    logger.info("Starting n_clusters=%2d, oversampling_factor=%2d",
                n_clusters, oversampling_factor)
    km.fit(data)
    t1 = tic()
    logger.info("Finished in %.2f", t1 - t0)
@contextlib.contextmanager
def _timer(name, _logger=None, level="info"):
    """
    Output the execution time of the enclosed block to the given logger level.

    Parameters
    ----------
    name : str
        How to name the timer (will be in the logs)
    _logger : logging.Logger
        The optional logger where to write
    level : str
        On which level to log the performance measurement
    """
    start = tic()
    _logger = _logger or logger
    _logger.info("Starting %s", name)
    yield
    stop = tic()
    delta = datetime.timedelta(seconds=stop - start)  # nicer formatting for time
    _logger_level = getattr(_logger, level)
    _logger_level("Finished %s in %s", name, delta)
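# Hypothetical usage of ``_timer``: logs a "Starting ..." line, runs the
# block, then logs "Finished ... in <timedelta>" at the requested level.
import time

with _timer("demo", _logger=logger):
    time.sleep(1)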
def fit(self, X, y=None, **kwargs):
    """Fit the underlying estimator.

    Parameters
    ----------
    X, y : array-like
    **kwargs
        Additional fit-kwargs for the underlying estimator.

    Returns
    -------
    self : object
    """
    start = tic()
    logger.info("Starting fit")
    result = self.estimator.fit(X, y, **kwargs)
    stop = tic()
    logger.info("Finished fit, %0.2f", stop - start)

    # Copy over learned attributes
    copy_learned_attributes(result, self)
    copy_learned_attributes(result, self.estimator)
    return self
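# A plausible sketch of ``copy_learned_attributes``: scikit-learn stores
# fitted state in attributes ending with an underscore, so copying those
# over is sufficient. This is an assumption about the helper, not its
# actual source.
def copy_learned_attributes(from_estimator, to_estimator):
    attrs = {k: v for k, v in vars(from_estimator).items()
             if k.endswith('_')}
    for k, v in attrs.items():
        setattr(to_estimator, k, v)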
"""
This example shows how dask-ml's ``SpectralClustering`` scales with the
number of samples, compared to scikit-learn's implementation. The dask
version uses an approximation to the affinity matrix, which avoids an
expensive computation at the cost of some approximation error.
"""
from sklearn.datasets import make_circles
from sklearn.utils import shuffle
import pandas as pd

from timeit import default_timer as tic

import sklearn.cluster as scluster
import dask_ml.cluster as dcluster
import seaborn as sns

Ns = [2500, 5000, 7500]
X, y = make_circles(n_samples=10_000, noise=0.05, random_state=0, factor=0.5)
X, y = shuffle(X, y)

timings = []
for n in Ns:
    X, y = make_circles(n_samples=n, random_state=n, noise=0.5, factor=0.5)
    t1 = tic()
    dcluster.SpectralClustering(n_clusters=2, n_components=100).fit(X)
    timings.append(('nystrom', n, tic() - t1))
    t1 = tic()
    scluster.SpectralClustering(n_clusters=2).fit(X)
    timings.append(('exact', n, tic() - t1))

df = pd.DataFrame(timings, columns=['method', 'n_samples', 'time'])
sns.factorplot(x='n_samples', y='time', hue='method', data=df, aspect=1.5)
def evolve(self, generations=1, model_epochs=10, elites=1, verbose=2):
    log = "Generation,Fitness,TrainingTime\n"
    for generation in range(generations):
        epoch_start = tic()
        for i in range(self.pop_size):
            if self.population[i].fitness == -1:
                if verbose >= 1:
                    print("Training model {}...".format(i + 1))
                    print(self.population[i])
                training_start = tic()
                self._get_fitness(i, epochs=model_epochs, verbose=verbose)
                log += '{},{:.4f},{:.4f}\n'.format(
                    generation, self.population[i].fitness,
                    tic() - training_start)
            else:
                if verbose >= 1:
                    print("Model {} already trained".format(i + 1))
                # No training happened, so log a zero training time
                # (the original reused a stale ``training_start`` here).
                log += '{},{:.4f},{:.4f}\n'.format(
                    generation, self.population[i].fitness, 0.0)

        self.population = sorted(self.population,
                                 key=lambda x: x.fitness, reverse=True)
        if verbose >= 1:
            print("Best fitness for Generation {}: {:.4f}".format(
                generation + 1, self.population[0].fitness))

        # Fitness-proportional (roulette-wheel) selection probabilities.
        probs = np.array([gene.fitness for gene in self.population])
        total = probs.sum()
        probs = probs / total

        new_pop = self.population[:elites]  # Keep the most fit individuals
        for i in range(elites, self.pop_size):
            a, b = np.random.choice(self.pop_size, size=2, replace=True,
                                    p=probs)
            child = self.population[a].cross(self.population[b])
            child.mutate()
            new_pop.append(child)
        self.population = new_pop
        if verbose >= 1:
            # Report the generation number, not the inner loop index.
            print("Generation {} duration: {:.4f}".format(
                generation + 1, tic() - epoch_start))

    if verbose >= 1:
        print("Training final generation...")
    for i in range(self.pop_size):
        if self.population[i].fitness == -1:
            if verbose >= 1:
                print("Training model {}...".format(i + 1))
                print(self.population[i])
            training_start = tic()
            self._get_fitness(i, epochs=model_epochs, verbose=verbose)
            log += '{},{:.4f},{:.4f}\n'.format(
                generations, self.population[i].fitness,
                tic() - training_start)
        else:
            if verbose >= 1:
                print("Model {} already trained".format(i + 1))
            log += '{},{:.4f},{:.4f}\n'.format(
                generations, self.population[i].fitness, 0.0)

    self.population = sorted(self.population,
                             key=lambda x: x.fitness, reverse=True)

    with open("output_log.csv", 'w') as output_log:
        output_log.write(log)

    if verbose >= 1:
        print("Final best fitness: {:.4f}".format(
            self.population[0].fitness))
def _kmeans_single_lloyd(
    X,
    n_clusters,
    max_iter=300,
    init="k-means||",
    verbose=False,
    x_squared_norms=None,
    random_state=None,
    tol=1e-4,
    precompute_distances=True,
    oversampling_factor=2,
    init_max_iter=None,
):
    centers = k_init(
        X,
        n_clusters,
        init=init,
        oversampling_factor=oversampling_factor,
        random_state=random_state,
        max_iter=init_max_iter,
    )
    dt = X.dtype
    P = X.shape[1]
    for i in range(max_iter):
        t0 = tic()
        labels, distances = pairwise_distances_argmin_min(
            X, centers, metric="euclidean", metric_kwargs={"squared": True}
        )
        labels = labels.astype(np.int32)
        # distances is always float64, but we need it to match X.dtype
        # for centers_dense, but remain float64 for inertia
        r = da.atop(
            _centers_dense,
            "ij",
            X,
            "ij",
            labels,
            "i",
            n_clusters,
            None,
            distances.astype(X.dtype),
            "i",
            adjust_chunks={"i": n_clusters, "j": P},
            dtype=X.dtype,
        )
        new_centers = da.from_delayed(
            sum(r.to_delayed().flatten()), (n_clusters, P), X.dtype
        )
        counts = da.bincount(labels, minlength=n_clusters)
        # Require at least one per bucket, to avoid division by 0.
        counts = da.maximum(counts, 1)
        new_centers = new_centers / counts[:, None]
        new_centers, = compute(new_centers)

        # Convergence check
        shift = squared_norm(centers - new_centers)
        t1 = tic()
        logger.info("Lloyd loop %2d. Shift: %0.4f [%.2f s]", i, shift, t1 - t0)
        if shift < tol:
            break
        centers = new_centers

    if shift > 1e-7:
        labels, distances = pairwise_distances_argmin_min(X, centers)
        labels = labels.astype(np.int32)

    inertia = distances.sum()
    centers = centers.astype(dt)

    return labels, inertia, centers, i + 1
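# For intuition, a rough sketch of the per-block reduction ``da.atop``
# maps over ``_centers_dense``: each block contributes its per-cluster
# point sums, the delayed partials are added with ``sum(...)``, and the
# later division by ``counts`` turns sums into means. The shape of
# ``_centers_dense`` is assumed here, not the library's implementation;
# the ``distances`` argument (used by scikit-learn's version to reseed
# empty clusters) is ignored in this sketch.
import numpy as np


def _centers_dense_sketch(X_block, labels_block, n_clusters, distances_block):
    sums = np.zeros((n_clusters, X_block.shape[1]), dtype=X_block.dtype)
    for j in range(n_clusters):
        sums[j] = X_block[labels_block == j].sum(axis=0)
    return sums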
b_ix = pd.Index([1, 2, 3, 4, 5], name='b')
concat = xr.concat([uav.Band1, uav.Band2, uav.Band3, uav.Band4, uav.Band5],
                   b_ix)
# Mask nodata areas
# concat = concat.where(concat.sum(dim='b') > 0)

predicted = xarray_classify.classify_dataset(concat, clf_RF)
# Just look at a subset area in this case (slice)
# predicted = xarray_classify.classify_dataset(
#     concat.isel(x=slice(3000, 3500), y=slice(3000, 3500)), clf_RF)

# Calculate albedo
# uav = uav.isel(x=slice(3000, 3500), y=slice(3000, 3500))
# albedo = (0.726 * (uav['Band2'] - 0.18) - 0.322 * (uav['Band2'] - 0.18)**2
#           - 0.015 * (uav['Band4'] - 0.2) + 0.581 * (uav['Band4'] - 0.2))
t1 = tic()
albedo = (0.726 * uav['Band2'] - 0.322 * uav['Band2']**2
          - 0.015 * uav['Band4'] + 0.581 * uav['Band4'])
print('xarray albedo (seconds): ', tic() - t1)

# Save outputs
if not setup:
    # Define projection
    srs = osr.SpatialReference()
    srs.ImportFromProj4('+init=epsg:32622')
    crs = xr.DataArray(0, encoding={'dtype': np.dtype('int8')})
    crs.attrs['projected_crs_name'] = srs.GetAttrValue('projcs')
    crs.attrs['grid_mapping_name'] = 'universal_transverse_mercator'
    crs.attrs['scale_factor_at_central_origin'] = srs.GetProjParm(
        'scale_factor')
"""
This example shows how dask-ml's ``SpectralClustering`` scales with the
number of samples, compared to scikit-learn's implementation. The dask
version uses an approximation to the affinity matrix, which avoids an
expensive computation at the cost of some approximation error.
"""
from sklearn.datasets import make_circles
from sklearn.utils import shuffle
import pandas as pd

from timeit import default_timer as tic

import sklearn.cluster
import dask_ml.cluster
import seaborn as sns

Ns = [2500, 5000, 7500, 10000]
X, y = make_circles(n_samples=10_000, noise=0.05, random_state=0, factor=0.5)
X, y = shuffle(X, y)

timings = []
for n in Ns:
    X, y = make_circles(n_samples=n, random_state=n, noise=0.5, factor=0.5)
    t1 = tic()
    sklearn.cluster.SpectralClustering(n_clusters=2).fit(X)
    timings.append(('Scikit-Learn (exact)', n, tic() - t1))
    t1 = tic()
    dask_ml.cluster.SpectralClustering(n_clusters=2, n_components=100).fit(X)
    timings.append(('dask-ml (approximate)', n, tic() - t1))

df = pd.DataFrame(timings,
                  columns=['method', 'Number of Samples', 'Fit Time'])
sns.factorplot(x='Number of Samples', y='Fit Time', hue='method', data=df,
               aspect=1.5)
"""
Comparison of scaling.
"""
from dask_ml.datasets import make_classification
import pandas as pd

from timeit import default_timer as tic

import sklearn.linear_model
import dask_ml.linear_model
import seaborn as sns

Ns = [2500, 5000, 7500, 10000]

timings = []
for n in Ns:
    X, y = make_classification(n_samples=n, random_state=n, chunks=n // 20)
    t1 = tic()
    sklearn.linear_model.LogisticRegression().fit(X, y)
    timings.append(('Scikit-Learn', n, tic() - t1))
    t1 = tic()
    dask_ml.linear_model.LogisticRegression().fit(X, y)
    timings.append(('dask-ml', n, tic() - t1))

df = pd.DataFrame(timings,
                  columns=['method', 'Number of Samples', 'Fit Time'])
sns.factorplot(x='Number of Samples', y='Fit Time', hue='method', data=df,
               aspect=1.5)
def init_scalable(X, n_clusters, random_state=None, max_iter=None,
                  oversampling_factor=2):
    """K-Means initialization using k-means||

    This is algorithm 2 in Scalable K-Means++ (2012).
    """
    logger.info("Initializing with k-means||")
    init_start = tic()

    # Step 1: Initialize Centers
    idx = 0
    centers = da.compute(X[idx, np.newaxis])[0]
    c_idx = {idx}

    # Step 2: Initialize cost
    cost, = compute(evaluate_cost(X, centers))

    if cost == 0:
        n_iter = 0
    else:
        n_iter = int(np.round(np.log(cost)))

    if max_iter is not None:
        n_iter = min(max_iter, n_iter)

    # Steps 3 - 6: update candidate Centers
    for i in range(n_iter):
        t0 = tic()
        new_idxs = _sample_points(X, centers, oversampling_factor,
                                  random_state)
        new_idxs = set(*compute(new_idxs))
        c_idx |= new_idxs
        t1 = tic()
        logger.info("init iteration %2d/%2d %.2f s, %2d centers",
                    i + 1, n_iter, t1 - t0, len(c_idx))

    # Sort before slicing, for better performance / memory
    # usage with the scheduler.
    # See https://github.com/dask/dask-ml/issues/39
    centers = X[sorted(c_idx)].compute()

    # XXX: scikit-learn doesn't have weighted k-means.
    # The paper weights each center by the number of points closest to it.
    # https://stackoverflow.com/a/37198799/1889400 claims you can scale the
    # features before clustering, but that doesn't seem right.
    # I think that replicating the *points*, proportional to the number of
    # original points closest to the candidate centers, would be a better way
    # to do that.
    if len(centers) < n_clusters:
        logger.warning("Found fewer than %d clusters in init.", n_clusters)
        # supplement with random points
        need = n_clusters - len(centers)
        locs = sorted(random_state.choice(np.arange(0, len(X)),
                                          size=need, replace=False,
                                          chunks=len(X)))
        extra = X[locs].compute()
        return np.vstack([centers, extra])
    else:
        # Step 7, 8 without weights
        # dask RandomState objects aren't valid for scikit-learn
        rng2 = random_state.randint(0, 2**32 - 1, chunks=()).compute().item()
        km = sk_k_means.KMeans(n_clusters, random_state=rng2)
        km.fit(centers)
        logger.info("Finished initialization. %.2f s, %2d centers",
                    tic() - init_start, n_clusters)
        return km.cluster_centers_
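# A sketch of the point-replication idea from the XXX comment above:
# weight each candidate center by the number of original points closest
# to it by replicating it that many times, then cluster the replicated
# set down to ``n_clusters``. Illustrative only; the code above
# deliberately runs the final k-means unweighted.
import numpy as np
from sklearn.cluster import KMeans


def reduce_with_weights(candidates, counts, n_clusters, random_state=None):
    weights = np.maximum(np.asarray(counts, dtype=int), 1)
    replicated = np.repeat(candidates, weights, axis=0)
    km = KMeans(n_clusters=n_clusters, random_state=random_state)
    km.fit(replicated)
    return km.cluster_centers_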