import xgboost as xgb
from xgboost.dask import DaskDMatrix
from dask_ml.datasets import make_regression
from dask_ml.model_selection import train_test_split


def main(client):
    # Generate a synthetic regression problem as chunked dask arrays.
    m = 100000
    n = 100
    X, y = make_regression(n_samples=m, n_features=n, chunks=200, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Wrap the splits in distributed DMatrix objects for xgboost.
    dtrain = DaskDMatrix(client, X_train, y_train)
    dtest = DaskDMatrix(client, X_test, y_test)

    output = xgb.dask.train(
        client,
        {
            "verbosity": 1,
            "tree_method": "hist",
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
            "max_depth": 6,
            "learning_rate": 1.0,
        },
        dtrain,
        num_boost_round=1000,
        evals=[(dtrain, "train"), (dtest, "test")],
        # CustomEarlyStopping is defined elsewhere in this module.
        callbacks=[
            CustomEarlyStopping(
                validation_set="test", target_metric="rmse", maximize=False, seed=0
            )
        ],
    )
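# A minimal sketch (assumed, not from the original module) of driving `main`
# on a local cluster:
if __name__ == "__main__":
    from dask.distributed import Client, LocalCluster

    with LocalCluster(n_workers=4, threads_per_worker=1) as cluster:
        with Client(cluster) as client:
            main(client)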
from dask_ml.datasets import make_regression
from dask_ml.linear_model import LinearRegression


def test_lm(fit_intercept):
    X, y = make_regression(n_samples=100, n_features=5, chunks=50)
    lr = LinearRegression(fit_intercept=fit_intercept)
    lr.fit(X, y)
    lr.predict(X)
    if fit_intercept:
        assert lr.intercept_ is not None
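# A minimal sketch (assumed, not from the original suite) of supplying
# `fit_intercept` via pytest parametrization:
import pytest


@pytest.mark.parametrize("fit_intercept", [True, False])
def test_lm_both_intercepts(fit_intercept):
    test_lm(fit_intercept)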
from dask_ml.datasets import make_regression


def single_chunk_regression():
    """X, y pair for regression.

    `X` and `y` each have a single block, since the chunk size (100)
    matches the default number of samples. Useful for testing
    `partial_fit` methods.
    """
    X, y = make_regression(chunks=100, random_state=0)
    return X, y
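# A sketch (assumed, not from the original suite) of the kind of `partial_fit`
# test this fixture supports, using dask_ml's Incremental wrapper around an
# SGD regressor:
from dask_ml.wrappers import Incremental
from sklearn.linear_model import SGDRegressor


def check_partial_fit():
    X, y = single_chunk_regression()
    est = Incremental(SGDRegressor(max_iter=5, tol=None))
    est.fit(X, y)             # calls partial_fit once per block (one block here)
    est.predict(X).compute()  # predictions come back as a dask array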
import dask_ml.datasets as dask_datasets


def make_fake_regression(ncols=10, nrows=100):
    # make_regression returns a pair of dask arrays (X, y), not a dataframe.
    X, y = dask_datasets.make_regression(
        n_samples=nrows,
        n_features=ncols,
        n_informative=10,
        n_targets=1,
        bias=0.0,
        effective_rank=None,
        tail_strength=0.5,
        noise=0.0,
        shuffle=True,
        coef=False,
        random_state=None,
        chunks=None,
    )
    return X, y
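# If a dataframe is actually wanted, here is a sketch (assumed; the function
# name `make_fake_regression_frame` is hypothetical) of wrapping the arrays
# in a dask DataFrame:
import dask.array as da
import dask.dataframe as dd


def make_fake_regression_frame(ncols=10, nrows=100):
    X, y = make_fake_regression(ncols=ncols, nrows=nrows)
    data = da.concatenate([X, y[:, None]], axis=1)
    data = data.rechunk({1: -1})  # single chunk along columns
    columns = [f"x{i}" for i in range(ncols)] + ["target"]
    return dd.from_dask_array(data, columns=columns)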
import numpy as np
from dask_ml.datasets import make_blobs, make_regression


def _prep_data(self, reg=False):
    # Attach a 1e5-sample dataset, split into ten 1e4-row chunks, to `self`.
    self.n_samples = int(1e5)
    self.chunk_size = int(1e4)
    self.n_chunks = np.ceil(self.n_samples / self.chunk_size).astype(int)
    if reg:
        self.x, self.y = make_regression(
            n_samples=self.n_samples,
            chunks=self.chunk_size,
            random_state=0,
            n_features=40,
        )
    else:
        self.x, self.y = make_blobs(
            n_samples=self.n_samples,
            chunks=self.chunk_size,
            random_state=0,
            n_features=40,
            centers=2,
            cluster_std=100,
        )
    return self
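# A sketch (assumed; the class and timing method are hypothetical) of an
# asv-style benchmark suite built on the helper above:
class MeanSuite:
    def setup(self):
        _prep_data(self, reg=True)  # attaches self.x / self.y

    def time_mean(self):
        self.x.mean().compute()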
#!/usr/bin/env python
# coding: utf-8

# In[ ]:

# https://www.kaggle.com/puneetgrover/speed-up-your-algorithms-dask
# dask_kaggle_Regression

# In[1]:

from dask_ml.datasets import make_regression
import dask.dataframe as dd

# n_samples must be an integer.
X, y = make_regression(n_samples=1_000_000, chunks=50000)

# In[2]:

df = dd.from_dask_array(X)
df.head()

# In[3]:

from dask_ml.model_selection import train_test_split, GridSearchCV

# train_test_split returns the splits as (X_train, X_test, y_train, y_test).
xtr, xval, ytr, yval = train_test_split(X, y)

# In[ ]:
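# GridSearchCV is imported above but unused in this excerpt; a minimal
# sketch (assumed, not from the original notebook) of applying it to the
# training split:
from sklearn.linear_model import SGDRegressor

search = GridSearchCV(
    SGDRegressor(max_iter=5, tol=None),
    param_grid={"alpha": [1e-4, 1e-3, 1e-2]},
)
# search.fit(xtr, ytr)  # uncomment to run the grid search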
import os

from dask.distributed import Client
from dask_ml.model_selection import train_test_split

# Connect to the scheduler started by this SLURM job.
jobid = os.getenv('SLURM_JOBID')
client = Client(scheduler_file='scheduler_%s.json' % jobid)
print(f'Job_id:{jobid}')

# In[3]:

client

# In[4]:

from dask_ml.datasets import make_regression

X, y = make_regression(n_samples=4000000, n_features=32, chunks=1000,
                       n_informative=10, random_state=101)

# In[5]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# In[6]:

params = {
    'objective': 'reg:squarederror',
    'n_estimators': 100000,
    'max_depth': 4,
    'eta': 0.01,
    'subsample': 0.5,
    # (the remainder of this cell is truncated in the excerpt)
}
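# In[ ]:

# A sketch (assumed, not from the original notebook) of feeding these
# parameters to xgboost's dask interface:
import xgboost as xgb

dtrain = xgb.dask.DaskDMatrix(client, X_train, y_train)
output = xgb.dask.train(client, params, dtrain, num_boost_round=100)
booster = output["booster"]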
from dask_ml.datasets import make_regression


def xy_regression():
    """X, y pair for regression."""
    X, y = make_regression(chunks=10, random_state=0)
    return X, y