Esempio n. 1
0
def test_unknown_chunks_ok(fit_intercept):
    # https://github.com/dask/dask-ml/issues/145
    X = dd.from_pandas(pd.DataFrame(np.random.uniform(size=(10, 5))), 2).values
    y = dd.from_pandas(pd.Series(np.random.uniform(size=(10, ))), 2).values

    reg = LinearRegression(fit_intercept=fit_intercept)
    reg.fit(X, y)
Esempio n. 2
0
def test_lm(fit_intercept):
    X, y = make_regression(n_samples=100, n_features=5, chunks=50)
    lr = LinearRegression(fit_intercept=fit_intercept)
    lr.fit(X, y)
    lr.predict(X)
    if fit_intercept:
        assert lr.intercept_ is not None
hour_no_train_X.head(1)


# In[202]:


hour_no_train_label = hour_no_train.loc[:,"cnt"]
hour_no_train_label.head()


# Now we build the most basic model with Linear Regression

# In[203]:


LR = LinearRegression(fit_intercept=True)


# In[204]:


da.LR_model_baseline = LR.fit(hour_no_train_X.values, hour_no_train_label.values)


# In[205]:


da.hour_no_test_X = hour_no_test.loc[:, "season":"windspeed"]


# In[206]:
Esempio n. 4
0
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
import datetime as dt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from dask_ml.linear_model import LinearRegression
from dask_ml.metrics import r2_score

# linear regression

lr_model = LinearRegression()

traintrial = hour_df[hour_df["datetime"].compute() < "2012-10-01"]
testtrial = hour_df[hour_df["datetime"].compute() >= "2012-10-01"]

# drop the target variable and "registered" variable since we are using registered along with the rest of features

X_traintrial = traintrial.drop(
    ["cnt", "registered", "month", "casual", "season"], axis=1
)
y_traintrial = traintrial["cnt"]
X_testtrial = testtrial.drop(["cnt", "registered", "month", "casual"], axis=1)
y_testtrial = testtrial["cnt"]
X_traintrial = X_traintrial.drop(["datetime"], axis=1)
X_testtrial = X_testtrial.drop(["datetime"], axis=1)
Esempio n. 5
0
def test_lr_score():
    X = da.from_array(np.arange(1000).reshape(1000, 1))
    lr = LinearRegression()
    lr.fit(X, X)
    assert lr.score(X, X) == pytest.approx(1, 0.001)
#defining the data and target
categorical_variables = df[[
    'Gender', 'Age', 'Occupation', 'City_Category',
    'Stay_In_Current_City_Years', 'Marital_Status'
]]
target = df['Purchase']

#creating dummies for the categorical variables
data = dd.get_dummies(categorical_variables.categorize()).compute()

#converting dataframe to array
datanew = data.values

#fit the model
from dask_ml.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(datanew, target)

#preparing the test data
test_categorical = test[[
    'Gender', 'Age', 'Occupation', 'City_Category',
    'Stay_In_Current_City_Years', 'Marital_Status'
]]
test_dummy = dd.get_dummies(test_categorical.categorize()).compute()
testnew = test_dummy.values

#predict on test and upload
pred = lr.predict(testnew)

#Clustering/K-Means
from dask_ml.cluster import KMeans
Esempio n. 7
0
    def image_tikhonov(self,
                       vis_arr,
                       sphere,
                       alpha,
                       scale=True,
                       usedask=False):
        n_s = sphere.pixels.shape[0]
        n_v = self.u_arr.shape[0]

        lambduh = alpha / np.sqrt(n_s)
        if not usedask:
            gamma = self.make_gamma(sphere)
            logger.info("augmented: {}".format(gamma.shape))

            vis_aux = vis_to_real(vis_arr)
            logger.info("vis mean: {} shape: {}".format(
                np.mean(vis_aux), vis_aux.shape))

            tol = min(alpha / 1e4, 1e-10)
            logger.info("Solving tol={} ...".format(tol))

            # reg = linear_model.ElasticNet(alpha=alpha/np.sqrt(n_s),
            # tol=1e-6,
            # l1_ratio = 0.01,
            # max_iter=100000,
            # positive=True)
            if False:
                (
                    sky,
                    lstop,
                    itn,
                    r1norm,
                    r2norm,
                    anorm,
                    acond,
                    arnorm,
                    xnorm,
                    var,
                ) = scipy.sparse.linalg.lsqr(gamma,
                                             vis_aux,
                                             damp=alpha,
                                             show=True)
                logger.info(
                    "Alpha: {}: Iterations: {}: rnorm: {}: xnorm: {}".format(
                        alpha, itn, r2norm, xnorm))
            else:
                reg = linear_model.Ridge(alpha=alpha,
                                         tol=tol,
                                         solver="lsqr",
                                         max_iter=100000)

                reg.fit(gamma, vis_aux)
                logger.info("    Solve Complete, iter={}".format(reg.n_iter_))

                sky = da.from_array(reg.coef_)

                residual = vis_aux - gamma @ sky

                sky, residual_norm, solution_norm = da.compute(
                    sky,
                    np.linalg.norm(residual)**2,
                    np.linalg.norm(sky)**2)

                score = reg.score(gamma, vis_aux)
                logger.info("Alpha: {}: Loss: {}: rnorm: {}: snorm: {}".format(
                    alpha, score, residual_norm, solution_norm))

        else:
            from dask_ml.linear_model import LinearRegression
            import dask_glm
            from dask.distributed import Client, LocalCluster
            from dask.diagnostics import ProgressBar
            import dask

            logger.info("Starting Dask Client")

            if True:
                cluster = LocalCluster(dashboard_address=":8231",
                                       processes=False)
                client = Client(cluster)
            else:
                client = Client("tcp://localhost:8786")

            logger.info("Client = {}".format(client))

            harmonic_list = []
            p2j = 2 * np.pi * 1.0j

            dl = sphere.l
            dm = sphere.m
            dn = sphere.n

            n_arr_minus_1 = dn - 1

            du = self.u_arr
            dv = self.v_arr
            dw = self.w_arr

            for u, v, w in zip(du, dv, dw):
                harmonic = da.from_array(
                    np.exp(p2j * (u * dl + v * dm + w * n_arr_minus_1)) /
                    np.sqrt(sphere.npix),
                    chunks=(n_s, ),
                )
                harminc = client.persist(harmonic)
                harmonic_list.append(harmonic)

            gamma = da.stack(harmonic_list)
            logger.info("Gamma Shape: {}".format(gamma.shape))
            # gamma = gamma.reshape((n_v, n_s))
            gamma = gamma.conj()
            gamma = client.persist(gamma)

            logger.info("Gamma Shape: {}".format(gamma.shape))

            logger.info("Building Augmented Operator...")
            proj_operator_real = da.real(gamma)
            proj_operator_imag = da.imag(gamma)
            proj_operator = da.block([[proj_operator_real],
                                      [proj_operator_imag]])

            proj_operator = client.persist(proj_operator)

            logger.info("Proj Operator shape {}".format(proj_operator.shape))
            vis_aux = da.from_array(
                np.array(
                    np.concatenate((np.real(vis_arr), np.imag(vis_arr))),
                    dtype=np.float32,
                ))

            # logger.info("Solving...")

            en = dask_glm.regularizers.ElasticNet(weight=0.01)
            en = dask_glm.regularizers.L2()
            # dT = da.from_array(proj_operator, chunks=(-1, 'auto'))
            ##dT = da.from_array(proj_operator, chunks=(-1, 'auto'))
            # dv = da.from_array(vis_aux)

            dask.config.set({"array.chunk-size": "1024MiB"})
            A = da.rechunk(proj_operator, chunks=("auto", n_s))
            A = client.persist(A)
            y = vis_aux  # da.rechunk(vis_aux, chunks=('auto', n_s))
            y = client.persist(y)
            # sky = dask_glm.algorithms.proximal_grad(A, y, regularizer=en, lambduh=alpha, max_iter=10000)

            logger.info("Rechunking completed.. A= {}.".format(A.shape))
            reg = LinearRegression(
                penalty=en,
                C=1.0 / lambduh,
                fit_intercept=False,
                solver="lbfgs",
                max_iter=1000,
                tol=1e-8,
            )
            sky = reg.fit(A, y)
            sky = reg.coef_
            score = reg.score(proj_operator, vis_aux)
            logger.info("Loss function: {}".format(score.compute()))

        logger.info("Solving Complete: sky = {}".format(sky.shape))

        sphere.set_visible_pixels(sky, scale=False)
        return sky.reshape(-1, 1)
Esempio n. 8
0
if __name__ == '__main__':

    parser = argparse.ArgumentParser(
        description='Example of predicting departure delay from NYC flights')
    parser.add_argument('--data_dir',
                        default='data',
                        dest='data_dir',
                        type=str)
    parser.add_argument(
        '--nyc_url',
        default=
        'https://storage.googleapis.com/dask-tutorial-data/nycflights.tar.gz',
        dest='nyc_url',
        type=str)

    args = parser.parse_args()

    client = Client(n_workers=2)

    load_flights(args.data_dir, args.nyc_url)

    X, y = get_full_data(data_dir)

    X_train, X_test, X_valid, y_train, y_test, y_valid = prepare_dataset(X, y)

    lr = LinearRegression()
    test_score, valid_score = train(lr, X_train, X_test, X_valid, y_train,
                                    y_test, y_valid)

    print(test_score, valid_score)
Esempio n. 9
0
from dask.distributed import Client
import time
import sys
from dask_ml.linear_model import LinearRegression
import dask.dataframe as dd

client = Client(n_workers=4)
t0 = time.time()
data = dd.read_csv(sys.argv[1], header=None)
model = LinearRegression(fit_intercept=False)
reg = model.fit(
    data[[
        0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21
    ]].values, data[3].values)
print(reg.coef_)
print('Tiempo transcurrido:', time.time() - t0)
client.close()
Esempio n. 10
0
    def image_tikhonov(self, vis_arr, sphere, alpha, scale=True, usedask=False):
        n_s = sphere.pixels.shape[0]
        n_v = self.u_arr.shape[0]
        
        lambduh = alpha/np.sqrt(n_s)
        if not usedask:
            gamma = self.make_gamma(sphere)
            logger.info("Building Augmented Operator...")
            proj_operator_real = np.real(gamma).astype(np.float32)
            proj_operator_imag = np.imag(gamma).astype(np.float32)
            gamma = None
            proj_operator = np.block([[proj_operator_real], [proj_operator_imag]])
            proj_operator_real = None
            proj_operator_imag = None 
            logger.info('augmented: {}'.format(proj_operator.shape))
            
            vis_aux = np.array(np.concatenate((np.real(vis_arr), np.imag(vis_arr))), dtype=np.float32)
            logger.info('vis mean: {} shape: {}'.format(np.mean(vis_aux), vis_aux.shape))

            logger.info("Solving...")
            reg = linear_model.ElasticNet(alpha=lambduh, l1_ratio=0.05, max_iter=10000, positive=True)
            reg.fit(proj_operator, vis_aux)
            sky = reg.coef_
            
            score = reg.score(proj_operator, vis_aux)
            logger.info('Loss function: {}'.format(score))
            
        else:
            from dask_ml.linear_model import LinearRegression
            import dask_glm
            import dask.array as da
            from dask.distributed import Client, LocalCluster
            from dask.diagnostics import ProgressBar
            import dask
            
            logger.info('Starting Dask Client')
            
            if True:
                cluster = LocalCluster(dashboard_address=':8231', processes=False)
                client = Client(cluster)
            else:
                client = Client('tcp://localhost:8786')
                
            logger.info("Client = {}".format(client))
            
            harmonic_list = []
            p2j = 2*np.pi*1.0j
            
            dl = sphere.l
            dm = sphere.m
            dn = sphere.n
        
            n_arr_minus_1 = dn - 1

            du = self.u_arr
            dv = self.v_arr
            dw = self.w_arr
        
            for u, v, w in zip(du, dv, dw):
                harmonic = da.from_array(np.exp(p2j*(u*dl + v*dm + w*n_arr_minus_1)) / np.sqrt(sphere.npix), chunks=(n_s,))
                harminc = client.persist(harmonic)
                harmonic_list.append(harmonic)

            gamma = da.stack(harmonic_list)
            logger.info('Gamma Shape: {}'.format(gamma.shape))
            #gamma = gamma.reshape((n_v, n_s))
            gamma = gamma.conj()
            gamma = client.persist(gamma)
            
            logger.info('Gamma Shape: {}'.format(gamma.shape))
            
            logger.info("Building Augmented Operator...")
            proj_operator_real = da.real(gamma)
            proj_operator_imag = da.imag(gamma)
            proj_operator = da.block([[proj_operator_real], [proj_operator_imag]])
            
            proj_operator = client.persist(proj_operator)
            
            logger.info("Proj Operator shape {}".format(proj_operator.shape))
            vis_aux = da.from_array(np.array(np.concatenate((np.real(vis_arr), np.imag(vis_arr))), dtype=np.float32))
            
            #logger.info("Solving...")

            
            en = dask_glm.regularizers.ElasticNet(weight=0.01)
            en =  dask_glm.regularizers.L2()
            #dT = da.from_array(proj_operator, chunks=(-1, 'auto'))
            ##dT = da.from_array(proj_operator, chunks=(-1, 'auto'))
            #dv = da.from_array(vis_aux)
            

            dask.config.set({'array.chunk-size': '1024MiB'})
            A = da.rechunk(proj_operator, chunks=('auto', n_s))
            A = client.persist(A)
            y = vis_aux # da.rechunk(vis_aux, chunks=('auto', n_s))
            y = client.persist(y)
            #sky = dask_glm.algorithms.proximal_grad(A, y, regularizer=en, lambduh=alpha, max_iter=10000)

            logger.info("Rechunking completed.. A= {}.".format(A.shape))
            reg =  LinearRegression(penalty=en, C=1.0/lambduh,  
                                    fit_intercept=False, 
                                    solver='lbfgs', 
                                    max_iter=1000, tol=1e-8 )
            sky = reg.fit(A, y)
            sky = reg.coef_
            score = reg.score(proj_operator, vis_aux)
            logger.info('Loss function: {}'.format(score.compute()))

        logger.info("Solving Complete: sky = {}".format(sky.shape))

        sphere.set_visible_pixels(sky, scale=True)
        return sky.reshape(-1,1)
# In[43]:

client = Client()
client

# #### Linear Regression

# In[44]:

from scikitplot.metrics import plot_calibration_curve
from scikitplot.plotters import plot_learning_curve
from scikitplot.estimators import plot_feature_importances

# In[45]:

lr = LinearRegression()
with joblib.parallel_backend('dask'):
    lr_model = lr.fit(X_train.values, y_train.values)
    y_pred_lr = lr.predict(X_test.values)

# In[46]:

mse(y_test.values, y_pred_lr)

# In[47]:

r2_score(y_test.values.compute(), y_pred_lr.compute())

# ### Non Linear Models

# #### Random Forest Regressor