def test_lm(fit_intercept):
    """Smoke-test fit and predict on generated regression data.

    When ``fit_intercept`` is truthy, the fitted model must expose an
    ``intercept_`` that is not ``None``.
    """
    features, target = make_regression(n_samples=100, n_features=5, chunks=50)
    model = LinearRegression(fit_intercept=fit_intercept)
    model.fit(features, target)
    # Only checks that prediction runs; the result is not inspected.
    model.predict(features)
    if not fit_intercept:
        return
    assert model.intercept_ is not None
def test_unknown_chunks_ok(fit_intercept):
    """Fit on arrays obtained via ``.values`` from two-partition dask collections.

    Regression test for https://github.com/dask/dask-ml/issues/145
    """
    n_rows, n_cols, n_partitions = 10, 5, 2
    # Draw the features before the target so the random stream is consumed
    # in the same order on every run.
    feature_frame = pd.DataFrame(np.random.uniform(size=(n_rows, n_cols)))
    target_series = pd.Series(np.random.uniform(size=(n_rows,)))
    features = dd.from_pandas(feature_frame, n_partitions).values
    target = dd.from_pandas(target_series, n_partitions).values
    model = LinearRegression(fit_intercept=fit_intercept)
    model.fit(features, target)
hour_no_train_label = hour_no_train.loc[:, "cnt"]
hour_no_train_label.head()


# Now we build the most basic model with Linear Regression

# In[203]:


LR = LinearRegression(fit_intercept=True)


# In[204]:


# Bind the fitted model to a plain name so the prediction cell below can use it.
# The attribute on ``da`` is retained for any code that reads it from there.
LR_model_baseline = LR.fit(hour_no_train_X.values, hour_no_train_label.values)
da.LR_model_baseline = LR_model_baseline


# In[205]:


# Test predictors: the label-based column slice from "season" to "windspeed".
hour_no_test_X = hour_no_test.loc[:, "season":"windspeed"]
da.hour_no_test_X = hour_no_test_X


# In[206]:


LR_baseline_predicted = LR_model_baseline.predict(hour_no_test_X.values)


# In[207]:
# Script fragment: dummy-encode the train and test frames over the same six
# columns, fit LinearRegression on the training values, predict for both frames,
# then construct a RandomForestRegressor.
# NOTE(review): the leading list literal and the unmatched ")" after the second
# fit_transform look like remnants of a longer statement that is not visible
# here - confirm against the full script before editing.
# NOTE(review): the test frame is encoded by a second DummyEncoder via
# fit_transform rather than by the encoder fitted on the training frame, and
# "season" is dropped from the test frame only - verify the train/test columns
# line up.
# NOTE(review): the result of y_predtest.compute() is not assigned.
["year", "hour", "is_holiday", "weekday", "is_workingday", "weathersit"] de = DummyEncoder( ["year", "hour", "is_holiday", "weekday", "is_workingday", "weathersit"] ) X_traintrial = de.fit_transform(X_traintrial) de = DummyEncoder( ["year", "hour", "is_holiday", "weekday", "is_workingday", "weathersit"] ) X_testtrial = de.fit_transform(X_testtrial) ) lr = LinearRegression() lr.fit(X_traintrial.values, y_traintrial.values) X_testtrial = X_testtrial.drop("season", axis=1) y_predtrial = lr.predict(X_traintrial.values) y_predtest = lr.predict(X_testtrial.values) import dask.array as da y_predtest.compute() # random forest rf = RandomForestRegressor(n_estimators=1000, max_depth=10)
def test_lr_score():
    """A model fitted to reproduce its own single-column input scores about 1."""
    column = da.from_array(np.arange(1000).reshape(1000, 1))
    model = LinearRegression()
    model.fit(column, column)
    score = model.score(column, column)
    assert score == pytest.approx(1, 0.001)
# Categorical predictor columns, selected identically from the train and test frames.
_categorical_columns = [
    "Gender",
    "Age",
    "Occupation",
    "City_Category",
    "Stay_In_Current_City_Years",
    "Marital_Status",
]

categorical_variables = df[_categorical_columns]
target = df["Purchase"]

# Dummy-encode the categorized predictors and materialise the result.
data = dd.get_dummies(categorical_variables.categorize()).compute()

# Array form of the encoded frame.
datanew = data.values

# Fit the model.
from dask_ml.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(datanew, target)

# Prepare the test data the same way.
test_categorical = test[_categorical_columns]
test_dummy = dd.get_dummies(test_categorical.categorize()).compute()
testnew = test_dummy.values

# Predict on the encoded test rows.
pred = lr.predict(testnew)

# Clustering / K-Means
from dask_ml.cluster import KMeans

model = KMeans()
def image_tikhonov(self, vis_arr, sphere, alpha, scale=True, usedask=False):
    """Estimate the sky by L2-regularised least squares and store it on ``sphere``.

    With ``usedask=False`` the operator comes from ``self.make_gamma(sphere)``
    and the system is solved with ``linear_model.Ridge`` (``solver="lsqr"``).
    With ``usedask=True`` the operator is built row by row as dask arrays,
    split into stacked real and imaginary parts, and fitted with dask-ml's
    ``LinearRegression`` (L2 penalty, ``lbfgs``) on a local in-process cluster
    that is shut down before returning.

    Args:
        vis_arr: Measured values to fit. The dask path stacks ``np.real`` and
            ``np.imag`` of it; the direct path passes it to ``vis_to_real``.
        sphere: Object supplying ``pixels``, ``l``, ``m``, ``n``, ``npix`` and
            ``set_visible_pixels``.
        alpha: Regularisation strength. Used directly by the ridge solver, and
            as ``alpha / sqrt(n_pixels)`` (inverted into ``C``) on the dask path.
        scale: Not read by this method; ``set_visible_pixels`` is always called
            with ``scale=False``. NOTE(review): confirm that ignoring the
            argument is intended.
        usedask: Use the dask-based solver instead of the in-memory one.

    Returns:
        The solved coefficients reshaped to a single column, ``(-1, 1)``.
    """
    n_s = sphere.pixels.shape[0]
    lambduh = alpha / np.sqrt(n_s)

    if not usedask:
        gamma = self.make_gamma(sphere)
        logger.info("augmented: {}".format(gamma.shape))

        vis_aux = vis_to_real(vis_arr)
        logger.info("vis mean: {} shape: {}".format(
            np.mean(vis_aux), vis_aux.shape))

        # The solver tolerance follows alpha but is capped at 1e-10.
        tol = min(alpha / 1e4, 1e-10)
        logger.info("Solving tol={} ...".format(tol))

        reg = linear_model.Ridge(alpha=alpha,
                                 tol=tol,
                                 solver="lsqr",
                                 max_iter=100000)
        reg.fit(gamma, vis_aux)
        logger.info(" Solve Complete, iter={}".format(reg.n_iter_))

        # Wrap the coefficients so the residual and both norms are evaluated
        # together by a single ``da.compute`` call.
        sky = da.from_array(reg.coef_)
        residual = vis_aux - gamma @ sky
        sky, residual_norm, solution_norm = da.compute(
            sky,
            np.linalg.norm(residual)**2,
            np.linalg.norm(sky)**2)

        score = reg.score(gamma, vis_aux)
        logger.info("Alpha: {}: Loss: {}: rnorm: {}: snorm: {}".format(
            alpha, score, residual_norm, solution_norm))
    else:
        import dask
        import dask_glm
        from dask.distributed import Client, LocalCluster
        from dask_ml.linear_model import LinearRegression

        logger.info("Starting Dask Client")
        # The context managers close the client and the in-process cluster
        # (and free the dashboard port) whether the solve finishes or raises.
        with LocalCluster(dashboard_address=":8231",
                          processes=False) as cluster:
            with Client(cluster) as client:
                logger.info("Client = {}".format(client))

                p2j = 2 * np.pi * 1.0j
                dl = sphere.l
                dm = sphere.m
                n_arr_minus_1 = sphere.n - 1
                # Loop-invariant normalisation, hoisted out of the row loop.
                norm = np.sqrt(sphere.npix)

                harmonic_list = []
                for u, v, w in zip(self.u_arr, self.v_arr, self.w_arr):
                    harmonic = da.from_array(
                        np.exp(p2j * (u * dl + v * dm + w * n_arr_minus_1))
                        / norm,
                        chunks=(n_s, ),
                    )
                    # Append the persisted handle, not the lazy array.
                    harmonic_list.append(client.persist(harmonic))

                gamma = da.stack(harmonic_list)
                logger.info("Gamma Shape: {}".format(gamma.shape))
                gamma = client.persist(gamma.conj())
                logger.info("Gamma Shape: {}".format(gamma.shape))

                logger.info("Building Augmented Operator...")
                # Stack real over imaginary parts to get a real-valued system.
                proj_operator = da.block([[da.real(gamma)],
                                          [da.imag(gamma)]])
                proj_operator = client.persist(proj_operator)
                logger.info("Proj Operator shape {}".format(
                    proj_operator.shape))

                vis_aux = da.from_array(
                    np.array(
                        np.concatenate((np.real(vis_arr), np.imag(vis_arr))),
                        dtype=np.float32,
                    ))

                en = dask_glm.regularizers.L2()

                # NOTE: this changes the process-wide dask configuration.
                dask.config.set({"array.chunk-size": "1024MiB"})
                A = client.persist(
                    da.rechunk(proj_operator, chunks=("auto", n_s)))
                y = client.persist(vis_aux)
                logger.info("Rechunking completed.. A= {}.".format(A.shape))

                reg = LinearRegression(
                    penalty=en,
                    C=1.0 / lambduh,
                    fit_intercept=False,
                    solver="lbfgs",
                    max_iter=1000,
                    tol=1e-8,
                )
                reg.fit(A, y)
                sky = reg.coef_
                score = reg.score(proj_operator, vis_aux)
                logger.info("Loss function: {}".format(score.compute()))

    logger.info("Solving Complete: sky = {}".format(sky.shape))

    sphere.set_visible_pixels(sky, scale=False)
    return sky.reshape(-1, 1)
import sys
import time

import dask.dataframe as dd
from dask.distributed import Client
from dask_ml.linear_model import LinearRegression

# Predictor columns of the header-less CSV: 0-21 without 3 (the target) and 19.
FEATURE_COLUMNS = [
    0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21
]
TARGET_COLUMN = 3


def main():
    """Fit an intercept-free linear regression on the CSV named by ``sys.argv[1]``.

    Prints the fitted coefficients and the elapsed wall-clock time, measured
    from just after the client has started.
    """
    # The context manager closes the client even when reading or fitting fails.
    with Client(n_workers=4):
        t0 = time.time()
        data = dd.read_csv(sys.argv[1], header=None)
        model = LinearRegression(fit_intercept=False)
        reg = model.fit(data[FEATURE_COLUMNS].values,
                        data[TARGET_COLUMN].values)
        print(reg.coef_)
        print('Tiempo transcurrido:', time.time() - t0)


# Worker processes may re-import this module on start-up, so the cluster must
# only be created when the file is executed as a script.
if __name__ == "__main__":
    main()
def image_tikhonov(self, vis_arr, sphere, alpha, scale=True, usedask=False):
    """Estimate the sky by regularised least squares and store it on ``sphere``.

    Both paths solve a real-valued system made by stacking the real part of
    the operator over its imaginary part, and likewise for ``vis_arr``.
    With ``usedask=False`` the operator comes from ``self.make_gamma(sphere)``
    and is fitted with ``linear_model.ElasticNet`` (``l1_ratio=0.05``,
    ``positive=True``). With ``usedask=True`` the operator is built row by row
    as dask arrays and fitted with dask-ml's ``LinearRegression`` (L2 penalty,
    ``lbfgs``) on a local in-process cluster that is shut down before
    returning.

    Args:
        vis_arr: Measured values to fit; ``np.real`` and ``np.imag`` of it are
            concatenated into a float32 vector.
        sphere: Object supplying ``pixels``, ``l``, ``m``, ``n``, ``npix`` and
            ``set_visible_pixels``.
        alpha: Regularisation strength; the solvers receive
            ``alpha / sqrt(n_pixels)`` (inverted into ``C`` on the dask path).
        scale: Not read by this method; ``set_visible_pixels`` is always called
            with ``scale=True``. NOTE(review): confirm that ignoring the
            argument is intended.
        usedask: Use the dask-based solver instead of the in-memory one.

    Returns:
        The solved coefficients reshaped to a single column, ``(-1, 1)``.
    """
    n_s = sphere.pixels.shape[0]
    lambduh = alpha / np.sqrt(n_s)

    if not usedask:
        gamma = self.make_gamma(sphere)
        logger.info("Building Augmented Operator...")
        proj_operator_real = np.real(gamma).astype(np.float32)
        proj_operator_imag = np.imag(gamma).astype(np.float32)
        # Drop each intermediate as soon as it has been consumed so only the
        # stacked operator stays referenced.
        gamma = None
        proj_operator = np.block([[proj_operator_real],
                                  [proj_operator_imag]])
        proj_operator_real = None
        proj_operator_imag = None
        logger.info("augmented: {}".format(proj_operator.shape))

        vis_aux = np.array(
            np.concatenate((np.real(vis_arr), np.imag(vis_arr))),
            dtype=np.float32)
        logger.info("vis mean: {} shape: {}".format(
            np.mean(vis_aux), vis_aux.shape))

        logger.info("Solving...")
        reg = linear_model.ElasticNet(alpha=lambduh,
                                      l1_ratio=0.05,
                                      max_iter=10000,
                                      positive=True)
        reg.fit(proj_operator, vis_aux)
        sky = reg.coef_

        score = reg.score(proj_operator, vis_aux)
        logger.info("Loss function: {}".format(score))
    else:
        import dask
        import dask.array as da
        import dask_glm
        from dask.distributed import Client, LocalCluster
        from dask_ml.linear_model import LinearRegression

        logger.info("Starting Dask Client")
        # The context managers close the client and the in-process cluster
        # (and free the dashboard port) whether the solve finishes or raises.
        with LocalCluster(dashboard_address=":8231",
                          processes=False) as cluster:
            with Client(cluster) as client:
                logger.info("Client = {}".format(client))

                p2j = 2 * np.pi * 1.0j
                dl = sphere.l
                dm = sphere.m
                n_arr_minus_1 = sphere.n - 1
                # Loop-invariant normalisation, hoisted out of the row loop.
                norm = np.sqrt(sphere.npix)

                harmonic_list = []
                for u, v, w in zip(self.u_arr, self.v_arr, self.w_arr):
                    harmonic = da.from_array(
                        np.exp(p2j * (u * dl + v * dm + w * n_arr_minus_1))
                        / norm,
                        chunks=(n_s, ),
                    )
                    # Append the persisted handle, not the lazy array.
                    harmonic_list.append(client.persist(harmonic))

                gamma = da.stack(harmonic_list)
                logger.info("Gamma Shape: {}".format(gamma.shape))
                gamma = client.persist(gamma.conj())
                logger.info("Gamma Shape: {}".format(gamma.shape))

                logger.info("Building Augmented Operator...")
                proj_operator = da.block([[da.real(gamma)],
                                          [da.imag(gamma)]])
                proj_operator = client.persist(proj_operator)
                logger.info("Proj Operator shape {}".format(
                    proj_operator.shape))

                vis_aux = da.from_array(
                    np.array(
                        np.concatenate((np.real(vis_arr), np.imag(vis_arr))),
                        dtype=np.float32))

                en = dask_glm.regularizers.L2()

                # NOTE: this changes the process-wide dask configuration.
                dask.config.set({"array.chunk-size": "1024MiB"})
                A = client.persist(
                    da.rechunk(proj_operator, chunks=("auto", n_s)))
                y = client.persist(vis_aux)
                logger.info("Rechunking completed.. A= {}.".format(A.shape))

                reg = LinearRegression(penalty=en,
                                       C=1.0 / lambduh,
                                       fit_intercept=False,
                                       solver="lbfgs",
                                       max_iter=1000,
                                       tol=1e-8)
                reg.fit(A, y)
                sky = reg.coef_
                score = reg.score(proj_operator, vis_aux)
                logger.info("Loss function: {}".format(score.compute()))

    logger.info("Solving Complete: sky = {}".format(sky.shape))

    sphere.set_visible_pixels(sky, scale=True)
    return sky.reshape(-1, 1)
# Notebook-export cells: start a dask Client, fit LinearRegression on the
# training values under joblib's 'dask' backend, predict on the test values,
# then evaluate mse and r2_score.
# NOTE(review): whether lr.predict sits inside the joblib.parallel_backend
# block cannot be told from this collapsed line - confirm against the notebook.
# NOTE(review): r2_score receives .compute()-ed inputs while mse is passed
# y_test.values and y_pred_lr without .compute() - confirm that is intended.
# NOTE(review): the three scikitplot imports are not used in these cells.
client = Client() client # #### Linear Regression # In[44]: from scikitplot.metrics import plot_calibration_curve from scikitplot.plotters import plot_learning_curve from scikitplot.estimators import plot_feature_importances # In[45]: lr = LinearRegression() with joblib.parallel_backend('dask'): lr_model = lr.fit(X_train.values, y_train.values) y_pred_lr = lr.predict(X_test.values) # In[46]: mse(y_test.values, y_pred_lr) # In[47]: r2_score(y_test.values.compute(), y_pred_lr.compute()) # ### Non Linear Models # #### Random Forest Regressor # In[48]: