Example #1
def init_values(X, y, number=5, intercept=True):
    """ Return an initial parameter guess for a LASSO model

    Inputs
    y: n by 1 NumPy array, outcome variable
    X: n by k NumPy array, RHS variables
    number: int, number of most correlated columns of X to use
    intercept: bool, whether to include an intercept in the initial fit

    Outputs
    residuals: n by 1 NumPy array, residuals for initial parameter guess
    coefficients: k by 1 NumPy array, initial coefficient values
    """
    # Make sure y is a proper column vector
    y = cvec(y)

    # Get the absolute value of correlations between y and X
    corr = np.abs(cor(y, X))

    # Get the number of columns of X
    kx = X.shape[1]

    # Make an index selecting the `number` columns of X most correlated with y
    # (since .argsort() always sorts in increasing order, selecting from the
    # back gets the most highly correlated columns)
    index = corr.argsort()[-np.amin([number, kx]):]

    # Set up an array of coefficient guesses
    coefficients = np.zeros(shape=(kx, 1))

    # Regress y on the most correlated columns of X, including an intercept
    # if desired
    reg = lm(fit_intercept=intercept).fit(X[:, index], y)

    # Replace the guesses for the estimated coefficients (note that .coef_ does
    # not return the estimated intercept, if one was included in the model)
    coefficients[index, :] = reg.coef_.T

    # Replace any NaNs with zeros
    coefficients[np.isnan(coefficients)] = 0

    # Get the regression residuals
    residuals = y - reg.predict(X[:, index])

    # Return the residuals and coefficients
    return {'residuals': residuals, 'coefficients': coefficients}
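
# A usage sketch (an assumption: cvec and cor are the module's own helpers for
# column-vector coercion and correlation, with numpy as np and sklearn's
# LinearRegression as lm in scope):
#
#   X = np.random.randn(100, 20)
#   y = X[:, :3] @ np.ones(shape=(3, 1)) + np.random.randn(100, 1)
#   init = init_values(X, y, number=5)
#   init['coefficients']  # (20, 1) array with at most 5 nonzero entries
#   init['residuals']     # (100, 1) array of initial-fit residuals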
Example #2
design = DataFrame([[1, 1, 1, 1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 1, 1, 1]], index=['KO', 'WT']).T

# lm.fit
df = data_set.copy()
df_mean = df.mean(axis=1)  # row means, analogous to limma's Amean
n_rows, n_cols = df.shape

# lm.series
for i in df.index:
    y, y_mask = df.loc[i], df.loc[i].notnull().values

    if y[y_mask].count() != 0:
        x = design[y_mask]
        y = y[y_mask]

        lm_res = lm().fit(x, y)

        # stdev.unscaled as in limma's lm.series: sqrt of the diagonal of
        # (X'X)^{-1}, computed from the QR decomposition of the design
        _, r = np.linalg.qr(np.asarray(x, dtype=float))
        std_unscaled = np.sqrt(np.diag(np.linalg.inv(r.T @ r)))
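
        # Residual standard deviation per probe (a sketch, following limma's
        # lm.series; sigma and the residual df feed the later eBayes step):
        df_residual = len(y) - np.linalg.matrix_rank(np.asarray(x, dtype=float))
        sigma = np.sqrt(((y - lm_res.predict(x)) ** 2).sum() / df_residual)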


# fit$genes <- y$probes
# fit$Amean <- y$Amean
# fit$method <- method
# fit$design <- design
# new("MArrayLM", fit)

# make.contrasts

# contrasts.fit

# ebayes
Example #3
import pandas as pd
from sklearn.linear_model import LinearRegression as lm
import statsmodels.formula.api as smf
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
df = pd.read_csv("bike.csv")
df.head()

features = "+".join(df.columns[1:-3])
y, X = dmatrices("casual ~ " + features, df, return_type = "dataframe")

df_vif = pd.DataFrame()
df_vif["VIF"] = [vif(X.values, i) for i in range(X.shape[1])]
df_vif["features"] = X.columns
df_vif
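
# A common follow-up (a sketch, using the conventional VIF > 10 rule of
# thumb, and skipping the Intercept column that dmatrices adds):
high_vif = df_vif.loc[(df_vif["VIF"] > 10) &
                      (df_vif["features"] != "Intercept"), "features"]
X_reduced = X.drop(columns=list(high_vif))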

model1 = smf.ols("casual ~ " + features, data = df)
print(model1.fit().summary())

X_df = df.iloc[:, 1:-3]
model2 = lm().fit(X_df, y)
model2.predict(X_df.iloc[:3, :])
Example #4
def rlassoEffect(x,
                 y,
                 d,
                 method='double selection',
                 I3=None,
                 post=True,
                 colnames_d=None,
                 colnames_x=None,
                 intercept=True,
                 model=True,
                 homoskedastic=False,
                 X_dependent_lambda=False,
                 lambda_start=None,
                 c=1.1,
                 gamma=None,
                 numSim=5000,
                 numIter=15,
                 tol=10**(-5),
                 threshold=-np.inf,
                 par=True,
                 corecap=np.inf,
                 fix_seed=True):
    d = cvec(d)

    y = cvec(y)

    n, kx = x.shape

    if colnames_d is None:
        colnames_d = ['d1']

    if (colnames_x is None) and (x is not None):
        colnames_x = ['x' + str(i) for i in np.arange(kx)]

    if method == 'double selection':
        I1 = rlasso(x,
                    d,
                    post=post,
                    colnames=colnames_x,
                    intercept=intercept,
                    model=model,
                    homoskedastic=homoskedastic,
                    X_dependent_lambda=X_dependent_lambda,
                    lambda_start=lambda_start,
                    c=c,
                    gamma=gamma,
                    numSim=numSim,
                    numIter=numIter,
                    tol=tol,
                    threshold=threshold,
                    par=par,
                    corecap=corecap,
                    fix_seed=fix_seed).est['index']
        I2 = rlasso(x,
                    y,
                    post=post,
                    colnames=colnames_x,
                    intercept=intercept,
                    model=model,
                    homoskedastic=homoskedastic,
                    X_dependent_lambda=X_dependent_lambda,
                    lambda_start=lambda_start,
                    c=c,
                    gamma=gamma,
                    numSim=numSim,
                    numIter=numIter,
                    tol=tol,
                    threshold=threshold,
                    par=par,
                    corecap=corecap,
                    fix_seed=fix_seed).est['index']

        # Original code checks if type(I3) is bool, but I believe they only do
        # that to see whether it has been defined by the user
        if I3 is not None:
            I3 = cvec(I3)

            I = cvec(I1.astype(bool) | I2.astype(bool) | I3.astype(bool))
        else:
            I = cvec(I1.astype(bool) | I2.astype(bool))
            # missing here: names(I) <- union(names(I1),names(I2))

        if I.sum() == 0:
            I = None

        if I is None:
            x = d
        else:
            x = np.concatenate([d, x[:, I[:, 0]]], axis=1)

        reg1 = lm(fit_intercept=True).fit(x, y)

        alpha = reg1.coef_[0, 0]

        names_alpha = colnames_d

        resid = y - cvec(reg1.predict(x))

        if I is None:
            xi = (resid) * np.sqrt(n / (n - 1))
        else:
            xi = (resid) * np.sqrt(n / (n - I.sum() - 1))

        if I is None:
            # Fit an intercept-only model
            reg2 = lm(fit_intercept=False).fit(np.ones_like(d), d)

            v = d - cvec(reg2.predict(np.ones_like(d)))
        else:
            reg2 = lm(fit_intercept=True).fit(x[:, 1:], d)

            v = d - cvec(reg2.predict(x[:, 1:]))

        var = ((1 / n) * (1 / np.mean(v**2, axis=0)) * np.mean(
            (v**2) * (xi**2), axis=0) * (1 / np.mean(v**2, axis=0)))

        se = np.sqrt(var)

        tval = alpha / np.sqrt(var)

        pval = 2 * norm.cdf(-np.abs(tval))

        if I is None:
            no_selected = 1
        else:
            no_selected = 0

        res = {'epsilon': xi, 'v': v}

        if np.issubdtype(type(colnames_d), np.str_):
            colnames_d = [colnames_d]

        results = {
            'alpha': alpha,
            #'se': pd.DataFrame(se, index=colnames_d),
            'se': se,
            't': tval,
            'pval': pval,
            'no_selected': no_selected,
            'coefficients': alpha,
            'coefficient': alpha,
            'coefficients_reg': reg1.coef_,
            'selection_index': I,
            'residuals': res,
            #call = match.call(),
            'samplesize': n
        }
    elif method == 'partialling out':
        reg1 = rlasso(x,
                      y,
                      post=post,
                      colnames=colnames_x,
                      intercept=intercept,
                      model=model,
                      homoskedastic=homoskedastic,
                      X_dependent_lambda=X_dependent_lambda,
                      lambda_start=lambda_start,
                      c=c,
                      gamma=gamma,
                      numSim=numSim,
                      numIter=numIter,
                      tol=tol,
                      threshold=threshold,
                      par=par,
                      corecap=corecap,
                      fix_seed=fix_seed)

        yr = reg1.est['residuals']

        reg2 = rlasso(x,
                      d,
                      post=post,
                      colnames=colnames_x,
                      intercept=intercept,
                      model=model,
                      homoskedastic=homoskedastic,
                      X_dependent_lambda=X_dependent_lambda,
                      lambda_start=lambda_start,
                      c=c,
                      gamma=gamma,
                      numSim=numSim,
                      numIter=numIter,
                      tol=tol,
                      threshold=threshold,
                      par=par,
                      corecap=corecap,
                      fix_seed=fix_seed)

        dr = reg2.est['residuals']

        reg3 = lm(fit_intercept=True).fit(dr, yr)

        alpha = reg3.coef_[0, 0]

        resid = yr - cvec(reg3.predict(dr))

        # This is a difference to the original code. The original code uses
        # var <- vcov(reg3)[2, 2], which is the homoskedastic covariance
        # estimator for OLS. I wrote get_cov() to calculate that, because the
        # linear regression implementation in sklearn does not include standard
        # error calculations. (I could have switched to statsmodels instead, but
        # sklearn seems more likely to be maintained in the future.) I then
        # added the option to get_cov() to calculate heteroskedastic standard
        # errors. I believe that if the penalty term is adjusted for
        # heteroskedasticity, heteroskedastic standard errors should also be
        # used here, to be internally consistent.
        var = np.array([get_cov(dr, resid, homoskedastic=homoskedastic)[1, 1]])

        se = np.sqrt(var)

        tval = alpha / np.sqrt(var)

        pval = 2 * norm.cdf(-np.abs(tval))

        res = {'epsilon': resid, 'v': dr}

        I1 = reg1.est['index']

        I2 = reg2.est['index']

        I = cvec(I1.astype(bool) | I2.astype(bool))

        #names(I) <- union(names(I1),names(I2))

        results = {
            'alpha': alpha,
            'se': se,
            't': tval,
            'pval': pval,
            'coefficients': alpha,
            'coefficient': alpha,
            'coefficients_reg': reg1.est['coefficients'],
            'selection_index': I,
            'residuals': res,
            #call = match.call(),
            'samplesize': n
        }

    return results
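

# A sketch of the get_cov() helper referenced in the comments above. Its
# interface is an assumption, inferred from the call
# get_cov(dr, resid, homoskedastic=homoskedastic)[1, 1]:
def get_cov(x, e, homoskedastic=False):
    """ Covariance matrix for an OLS regression of y on x with an intercept

    x: n by 1 regressor, e: n by 1 residuals. Element [1, 1] of the result
    is the variance of the slope coefficient.
    """
    n = x.shape[0]
    X = np.concatenate([np.ones(shape=(n, 1)), x], axis=1)
    XXinv = np.linalg.inv(X.T @ X)
    if homoskedastic:
        # Classical estimator: sigma^2 * (X'X)^{-1}
        s2 = (e**2).sum() / (n - X.shape[1])
        return s2 * XXinv
    # Heteroskedasticity-robust (HC0) sandwich estimator
    meat = X.T @ (X * e**2)
    return XXinv @ meat @ XXinv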
Example #5
    regionnorthwest = np.array(regionnorthwest).reshape(
        len(regionnorthwest), 1)
    regionsoutheast = np.array(regionsoutheast).reshape(
        len(regionnorthwest), 1)
    regionsouthwest = np.array(regionsouthwest).reshape(
        len(regionnorthwest), 1)
    return np.concatenate(
        (datanum[:, :-1], regionnorthwest, regionsoutheast, regionsouthwest),
        1)


X = AgregarCampo(X)
print(X)
print(X.shape)
print(Y.shape)
reg_mod = lm()

reg_mod.fit(X, Y)
y_predict = reg_mod.predict(X)

reg_mod.coef_

rmse = np.sqrt(mean_squared_error(Y, y_predict))
r2 = r2_score(Y, y_predict)

print('Slope:', reg_mod.coef_)
print('Intercept:', reg_mod.intercept_)
print('Root mean squared error: ', rmse)
print('R2 score: ', r2)
Example #6
X_train, X_test, y_train, y_test = train_test_split(
    bottle_df[["Salnty", "STheta"]],
    bottle_df["T_degC"],
    test_size=.2,
    random_state=0)

X_train = X_train.assign(intercept=1)
X_test = X_test.assign(intercept=1)
""" Manual calculation"""
theta_best = np.linalg.inv(X_train.T.dot(X_train)).dot(X_train.T).dot(y_train)
print("Coefficients: ", theta_best)
y_predict = X_test.dot(theta_best)
y_predict.head()
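
# Note: the Moore-Penrose pseudoinverse is a more numerically stable way to
# get the same least-squares solution:
#   theta_best = np.linalg.pinv(X_train).dot(y_train)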
""" sklearn method """
lm_mod = lm(fit_intercept=False).fit(X_train, y_train)  # intercept column is explicit
print('Coefficients: \n', lm_mod.coef_)
y_predict_train_sk = pd.DataFrame(lm_mod.predict(X_train),
                                  columns=["y_predict"])
y_predict_test_sk = pd.DataFrame(lm_mod.predict(X_test), columns=["y_predict"])
"""" Evaluate """
print("Model mean squared error: %.2f" %
      metrics.mean_squared_error(y_train, y_predict_train_sk.y_predict))
print("Model explained variance: %.2f" %
      metrics.explained_variance_score(y_train, y_predict_train_sk.y_predict))
print("Model r-squared: %.2f" %
      metrics.r2_score(y_train, y_predict_train_sk.y_predict))

print("Holdout mean squared error: %.2f" %
      metrics.mean_squared_error(y_test, y_predict_test_sk.y_predict))
print("Holdout explained variance: %.2f" %
def forecast(city_name):
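    # Assumed context: os, requests, pandas as pd, train_test_split, sklearn's
    # LinearRegression as lm, and retrieve_hist_data (presumably from the
    # wwo-hist package) are imported at the top of the original script.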
    os.chdir(r"C:\Users\Administrator\Desktop")
    frequency = 3
    start_date = '1-JAN-2019'
    end_date = '1-JAN-2020'
    api_key = 'e60a5f5f96574a33947210842201502'
    #city_name = input('Enter city name: ')
    location_list = [city_name]
    hist_weather_data = retrieve_hist_data(api_key,
                                           location_list,
                                           start_date,
                                           end_date,
                                           frequency,
                                           location_label=False,
                                           export_csv=True,
                                           store_df=True)
    path = "C:\\Users\\Administrator\\Desktop\\"

    data = pd.read_csv(path + city_name + ".csv")

    # drop or delete the unnecessary columns in the data.
    data = data.drop([
        "date_time", 'maxtempC', 'DewPointC', 'mintempC', 'sunHour',
        'moon_illumination', 'moonrise', 'moonset', 'sunrise', 'sunset',
        'HeatIndexC', 'WindChillC', 'WindGustKmph', 'totalSnow_cm'
    ],
                     axis=1,
                     inplace=False)

    data.to_csv(city_name + '.csv')

    params = {
        'access_key': '7f31a3c1baed8dddc5b06a0448f4b534',
        'query': city_name
    }

    api_result = requests.get('http://api.weatherstack.com/current', params)
    arr = []
    api_response = api_result.json()
    print('\n')
    print(u'Given City Name: %s' % (api_response['location']['name']))
    #a=api_response['location']['name']
    #these variables a to k can be returned to get the current details
    print(u'Current temperature is %d℃' %
          (api_response['current']['temperature']))
    a = api_response['current']['temperature']
    print(u'Current Humidity is %d' % (api_response['current']['humidity']))
    b = api_response['current']['humidity']
    print(u'Current Pressure is %d Pascal' %
          (api_response['current']['pressure']))
    c = api_response['current']['pressure']
    print(u'Current Cloud Cover is %d' %
          (api_response['current']['cloudcover']))
    d = api_response['current']['cloudcover']
    print(u'Current Precipitation is %d' % (api_response['current']['precip']))
    e = api_response['current']['precip']
    print(u'Current Visibility is %d' %
          (api_response['current']['visibility']))
    f = api_response['current']['visibility']
    print(u'Current Wind Speed is %d' %
          (api_response['current']['wind_speed']))
    g = api_response['current']['wind_speed']
    print(u'Current Feels Like is %d' % (api_response['current']['feelslike']))
    h = api_response['current']['feelslike']
    print(u'Current Wind Direction is %s' %
          (api_response['current']['wind_dir']))
    i = api_response['current']['wind_dir']
    print(u'Current UV Index is %d' % (api_response['current']['uv_index']))
    j = api_response['current']['uv_index']
    print(u'Current Wind Degree is %d' %
          (api_response['current']['wind_degree']))
    k = api_response['current']['wind_degree']

    # save the data in a csv file
    path = "C:\\Users\\Administrator\\Desktop\\"
    data = pd.read_csv(path + city_name + ".csv")
    #for pressure
    X = data.drop(['pressure'], axis=1)
    Y = data['pressure']
    Y = Y.values.reshape(-1, 1)

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.01)
    model = lm().fit(x_train, y_train)
    pressure = model.predict(x_test)
    print(pressure, 'This is the pressure in pascal for the input')

    #for temperature
    X = data.drop(['tempC'], axis=1)

    Y = data['tempC']
    Y = Y.values.reshape(-1, 1)

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.01)
    model = lm().fit(x_train, y_train)
    temp = model.predict(x_test)
    print(temp, 'This is the temperature in degrees C for the input')

    #for humidity
    X = data.drop(['humidity'], axis=1)

    Y = data['humidity']
    Y = Y.values.reshape(-1, 1)

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.01)
    model = lm().fit(x_train, y_train)
    hum = model.predict(x_test)
    print(hum, 'This is the humidity for the input')
    pressure = str(pressure)
    temp = str(temp)
    hum = str(hum)
    return temp, pressure, hum
Example #8
iris_df['target'] = iris.target
iris_df['target_names'] = iris.target_names[iris.target]
print(iris_df.head(3), '\n')

# Split the data into a train set and a test set (7:3)
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(iris_df, test_size=0.3)

print('train:', train_set.shape)  # 105
print('test:', test_set.shape)  # 45
print()

#----------------------------------------
# Linear regression method 1 - using the OLS (least squares) algorithm

model_ols = lm().fit(X=train_set.iloc[:, [2]],
                     y=train_set.iloc[:, [3]])  # note the uppercase X argument
#print(model_ols.coef_)
#print(model_ols.intercept_)
#print('ols predict : \n',model_ols.predict(test_set.iloc[:,[2]])) # validate the fitted model
#print('actual values:\n', train_set.iloc[:,[3]])

# Compare prediction scores on the train and test sets
print('Method 1 (OLS) - train score : ',
      model_ols.score(X=train_set.iloc[:, [2]], y=train_set.iloc[:, [3]]))
print('Method 1 (OLS) - test score : ',
      model_ols.score(X=test_set.iloc[:, [2]], y=test_set.iloc[:, [3]]))

plt.scatter(train_set.iloc[:, [2]], train_set.iloc[:, [3]], color='green')
plt.plot(test_set.iloc[:, [2]], model_ols.predict(test_set.iloc[:, [2]]))
plt.show()
Example #9
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression as lm

netflix_data=pd.read_csv("netflix_titles.csv")
netflix_data.director.fillna("No Director", inplace=True)
netflix_data.cast.fillna("No Cast", inplace=True)
netflix_data.country.fillna("Country Unavailable", inplace=True)
netflix_data.dropna(subset=["date_added", "rating"], inplace=True)

smaller_data = netflix_data.head(1000).copy()
y = smaller_data.listed_in
X = smaller_data.cast

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1000) # random_state=1000 ???
matrix = CountVectorizer(tokenizer=lambda x: x.split(','))
x_train_fit = matrix.fit_transform(X_train)
x_test_fit = matrix.transform(X_test)

# Use a separate vectorizer for the labels, so the one fitted on the cast
# column keeps its vocabulary
label_matrix = CountVectorizer(tokenizer=lambda x: x.split(','))
y_train_fit = label_matrix.fit_transform(y_train)
y_test_fit = label_matrix.transform(y_test)

print(x_train_fit.shape)
print(x_test_fit.shape)
print(y_train_fit.shape)
print(y_test_fit.shape)
model=lm().fit(x_train_fit,y_train_fit)
#print(model.score(x_test_fit,y_test_fit))
#predictions=model.predict(x_test_fit)

#plt.scatter(y_test,predictions)
Example #10
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as lm
from sklearn import svm, metrics
import joblib  # sklearn.externals.joblib was removed from recent scikit-learn

dataset = pd.read_csv('kddcup99.csv', low_memory=False)
##print ("Whole dataset count : \n",dataset.shape)
##print ("\n\nColumns in whole dataset : \n",dataset.columns)

##print (dataset.count)
##dataset.plot
##plt.show()

y = dataset.label
# Drop the target from the feature matrix (alongside the unused 'flag' column)
x = np.array(dataset.drop(['flag', 'label'], axis=1))

##x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
##print( "\n\nTraining dataset x : \n",x_train.shape)
##print( "\n\nTraining dataset y : \n",y_train.shape)
##print( "\n\nTesting dataset x : \n",x_test.shape)
##print( "\n\nTesting dataset y : \n",y_test.shape)

gb = dataset.groupby(['protocol_type', 'service', 'flag', 'label'])
##print("\n\nDisplaying types of Protocols , Services , Flags and Labels Which is used : \n",gb.first())

##Training Linear Regression Model
model = lm().fit(x, y)

#Creating Model which can be imported
joblib.dump(model, 'model.pkl')
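
# The persisted model can later be restored with:
#   model = joblib.load('model.pkl')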
Example #11
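# Assumed context from earlier cells: m (number of training rows), eta
# (learning rate), n_iterations, minibatch_size (a list of batch sizes),
# theta_path_mgd and best_thetas (accumulator lists), and an X_train that
# already includes a bias column.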
theta = np.random.randn(3, 1)  # random initialization

for size in minibatch_size:
    for epoch in range(n_iterations):
        shuffled_indices = np.random.permutation(m)
        X_b_shuffled = X_train[shuffled_indices]
        y_shuffled = y_train[shuffled_indices]
        for i in range(0, m, size):
            xi = X_b_shuffled[i:i + size]
            yi = y_shuffled[i:i + size]
            gradients = 2 / size * np.asarray(xi).T.dot(xi.dot(theta) - yi)
            theta = theta - eta * gradients
            theta_path_mgd.append(theta)
    best_thetas.append(theta)

(lm().fit(X_train, y_train)).coef_

y_predict_50 = X_test.dot(best_thetas[0])
y_predict_2000 = X_test.dot(best_thetas[1])
y_predict_10000 = X_test.dot(best_thetas[2])

for i in range(len(best_thetas)):
    print(f'minibatch size: {minibatch_size[i]}')
    print(f'Coefficients: {best_thetas[i]}')
    print("\n")
    print("Holdout mean squared error: %.2f" %
          metrics.mean_squared_error(y_test, X_test.dot(best_thetas[i])))
    print("Holdout explained variance: %.2f" %
          metrics.explained_variance_score(y_test, X_test.dot(best_thetas[i])))
    print("Holdout r-squared: %.2f" %
          metrics.r2_score(y_test, X_test.dot(best_thetas[i])))
Example #12

from sklearn.model_selection import train_test_split
y = Airbnb_data['price']
x = Airbnb_data.drop('price', axis=1)
X = x.apply(pd.to_numeric, errors='coerce')
Y = y.apply(pd.to_numeric, errors='coerce')
xTrain, xTest, yTrain, yTest = train_test_split(X,
                                                Y,
                                                test_size=0.3,
                                                random_state=42)

from sklearn.linear_model import LinearRegression as lm
from math import sqrt
regressor = lm().fit(xTrain, yTrain)
predictions = regressor.predict(xTest)

from sklearn.metrics import mean_squared_error, r2_score
print("Mean squared error: %.2f" % mean_squared_error(yTest, predictions))
print("R-square: %.2f" % r2_score(yTest, predictions))

from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

ridge = Ridge()
parameters = {
    'alpha': [
        1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20, 30, 35, 40, 45, 50, 55,
        100
    ]
}
Example #13
import pandas as pd
import numpy as np

#Import the train dataset (here, we have no train/test split, it is all train)
DiabetesTakingMed = pd.read_csv('DiabetesTakingMedF.csv', index_col=0)

DiabetesTakingMed = DiabetesTakingMed.drop('IsTrain', axis=1)
DiabetesNoMiddle = DiabetesTakingMed[DiabetesTakingMed['readmitted']!=1]
trainX01 = DiabetesNoMiddle.drop('readmitted', axis=1)
trainY01 = DiabetesNoMiddle['readmitted'].replace([2], [1])


#Remove train data where patients came back after 30 days, who look very similar to those returning <30 days:
from sklearn.linear_model import LinearRegression as lm
linreg = lm()  # avoid shadowing the imported class
linreg.fit(trainX01, trainY01)

DiabetesMiddle = DiabetesTakingMed[DiabetesTakingMed['readmitted']==1]
MiddleX = DiabetesMiddle.drop('readmitted', axis=1)
MiddleY = DiabetesMiddle['readmitted']

predictarray = linreg.predict(MiddleX)

MiddleDF75 = DiabetesMiddle.loc[predictarray<0.75]

FinalTrain = pd.concat([DiabetesNoMiddle, MiddleDF75], axis=0)

#Get the logistic regression fit object, after removing specific columns:

TrainLR = FinalTrain.drop(['diabfeat_neurologic', 'race_AfricanAmerican', 'A1Cresult_>7', 'primarydiag_injury', 'number_diagnoses', 
    'med_glimepiride', 'med_insulin', 'diag_infection', 'medical_specialty_Orthopedics', 'med_nateglinide', 'discharge_disposition_leftAMA', 
Example #14
    def __init__(self,
                 x,
                 y,
                 colnames=None,
                 post=True,
                 intercept=True,
                 model=True,
                 homoskedastic=False,
                 X_dependent_lambda=False,
                 lambda_start=None,
                 c=1.1,
                 gamma=None,
                 numSim=5000,
                 numIter=15,
                 tol=10**(-5),
                 threshold=-np.inf,
                 par=True,
                 corecap=np.inf,
                 fix_seed=True):
        # Initialize internal variables
        if isinstance(x, pd.DataFrame) and colnames is None:
            colnames = x.columns

        self.x = np.array(x).astype(np.float32)
        self.y = cvec(y).astype(np.float32)

        self.n, self.p = self.x.shape

        if colnames is None:
            self.colnames = ['V' + str(i + 1) for i in np.arange(self.p)]
        else:
            self.colnames = colnames

        # Unused line in the original code
        # ind_names = np.arange(self.p) + 1

        self.post = post
        self.intercept = intercept
        self.model = model
        self.homoskedastic = homoskedastic
        self.X_dependent_lambda = X_dependent_lambda
        self.lambda_start = lambda_start
        self.c = c

        if gamma is None:
            self.gamma = .1 / np.log(self.n)
        else:
            self.gamma = gamma

        self.numSim = numSim
        self.numIter = numIter
        self.tol = tol
        self.threshold = threshold

        self.par = par
        self.corecap = corecap
        self.fix_seed = fix_seed

        if (self.post == False) and (self.c is None):
            self.c = .5

        if ((self.post == False) and (self.homoskedastic == False)
                and (self.X_dependent_lambda == False)
                and (self.lambda_start == None) and (self.c == 1.1)
                and (self.gamma == .1 / np.log(self.n))):
            self.c = .5

        # For now, instantiate estimate as None
        self.est = None

        # Calculate robust LASSO coefficients
        if self.intercept == True:
            meanx = cvec(self.x.mean(axis=0))

            self.x = self.x - np.ones(shape=(self.n, 1)) @ meanx.T

            mu = self.y.mean()

            self.y = self.y - mu
        else:
            meanx = np.zeros(shape=(self.p, 1))

            mu = 0

        normx = np.sqrt(np.var(self.x, axis=0, ddof=1))  # column-wise scale

        Psi = cvec(np.mean(self.x**2, axis=0))

        ind = np.zeros(shape=(self.p, 1)).astype(bool)

        XX = self.x.T @ self.x

        Xy = self.x.T @ self.y

        startingval = init_values(self.x, self.y)['residuals']

        pen = lambdaCalculation(homoskedastic=self.homoskedastic,
                                X_dependent_lambda=self.X_dependent_lambda,
                                lambda_start=self.lambda_start,
                                c=self.c,
                                gamma=self.gamma,
                                numSim=self.numSim,
                                y=startingval,
                                x=self.x,
                                par=self.par,
                                corecap=self.corecap,
                                fix_seed=self.fix_seed)

        lmbda = pen['lambda']
        Ups0 = Ups1 = pen['Ups0']
        lmbda0 = pen['lambda0']

        mm = 1
        s0 = np.sqrt(np.var(self.y, axis=0, ddof=1))

        while mm <= self.numIter:
            if (mm == 1) and self.post:
                coefTemp = (LassoShooting_fit(self.x,
                                              self.y,
                                              lmbda / 2,
                                              XX=XX,
                                              Xy=Xy)['coefficients'])
            else:
                coefTemp = (LassoShooting_fit(self.x,
                                              self.y,
                                              lmbda,
                                              XX=XX,
                                              Xy=Xy)['coefficients'])

            coefTemp[np.isnan(coefTemp)] = 0

            ind1 = (np.abs(coefTemp) > 0)

            x1 = self.x[:, ind1[:, 0]]

            if x1.shape[1] == 0:
                if self.intercept:
                    intercept_value = np.mean(self.y + mu)

                    coef = np.zeros(shape=(self.p + 1, 1))

                    coef = (pd.DataFrame(coef,
                                         index=['(Intercept)'] +
                                         list(self.colnames)))
                else:
                    intercept_value = np.mean(self.y)

                    coef = np.zeros(shape=(self.p, 1))

                    coef = pd.DataFrame(coef, index=self.colnames)

                self.est = {
                    'coefficients':
                    coef,
                    'beta':
                    np.zeros(shape=(self.p, 1)),
                    'intercept':
                    intercept_value,
                    'index':
                    pd.DataFrame(np.zeros(shape=(self.p, 1)).astype(bool),
                                 index=self.colnames),
                    'lambda':
                    lmbda,
                    'lambda0':
                    lmbda0,
                    'loadings':
                    Ups0,
                    'residuals':
                    self.y - np.mean(self.y),
                    'sigma':
                    np.var(self.y, axis=0, ddof=1),
                    'iter':
                    mm,
                    #'call': Not a Python option
                    'options': {
                        'post': self.post,
                        'intercept': self.intercept,
                        'ind.scale': ind,
                        'mu': mu,
                        'meanx': meanx
                    }
                }

                if self.model:
                    self.est['model'] = self.x
                else:
                    self.est['model'] = None

                self.est['tss'] = self.est['rss'] = (((
                    self.y - np.mean(self.y))**2).sum())

                self.est['dev'] = self.y - np.mean(self.y)
                # In R, return() breaks while loops
                return

            # Refinement variance estimation
            if self.post:
                reg = lm(fit_intercept=False).fit(x1, self.y)

                coefT = reg.coef_.T

                coefT[np.isnan(coefT)] = 0

                e1 = self.y - x1 @ coefT

                coefTemp[ind1[:, 0]] = coefT
            else:
                e1 = self.y - x1 @ coefTemp[ind1[:, 0]]

            s1 = np.sqrt(np.var(e1, ddof=1))

            # Homoskedastic and X-independent
            if ((self.homoskedastic == True)
                    and (self.X_dependent_lambda == False)):
                Ups1 = s1 * Psi

                lmbda = pen['lambda0'] * Ups1

            # Homoskedastic and X-dependent
            elif ((self.homoskedastic == True)
                  and (self.X_dependent_lambda == True)):
                Ups1 = s1 * Psi

                lmbda = pen['lambda0'] * Ups1

            # Heteroskedastic and X-independent
            elif ((self.homoskedastic == False)
                  and (self.X_dependent_lambda == False)):
                Ups1 = ((1 / np.sqrt(self.n)) * np.sqrt(
                    (e1**2).T @ self.x**2).T)

                lmbda = pen['lambda0'] * Ups1

            # Heteroskedastic and X-dependent
            elif ((self.homoskedastic == False)
                  and (self.X_dependent_lambda == True)):
                lc = lambdaCalculation(
                    homoskedastic=self.homoskedastic,
                    X_dependent_lambda=self.X_dependent_lambda,
                    lambda_start=self.lambda_start,
                    c=self.c,
                    gamma=self.gamma,
                    numSim=self.numSim,
                    y=e1,
                    x=self.x,
                    par=self.par,
                    corecap=self.corecap,
                    fix_seed=self.fix_seed)

                Ups1 = lc['Ups0']

                lmbda = lc['lambda']

            # If homoskedastic is set to None
            elif self.homoskedastic is None:
                Ups1 = ((1 / np.sqrt(self.n)) * np.sqrt(
                    (e1**2).T @ self.x**2).T)

                lmbda = pen['lambda0'] * Ups1

            mm = mm + 1

            if np.abs(s0 - s1) < self.tol:
                break

            s0 = s1

        if x1.shape[1] == 0:
            #coefTemp = None
            ind1 = np.zeros(shape=(self.p, 1)).astype(bool)

        coefTemp = cvec(coefTemp)

        coefTemp[np.abs(coefTemp) < self.threshold] = 0

        coefTemp = pd.DataFrame(coefTemp, index=self.colnames)

        ind1 = cvec(ind1)

        ind1 = pd.DataFrame(ind1, index=self.colnames)

        if self.intercept:
            if mu is None:
                mu = 0
            if meanx is None:
                meanx = np.zeros(shape=(coefTemp.shape[0], 1))
            intercept_value = mu - (meanx * coefTemp).sum()
        else:
            intercept_value = np.nan

        if self.intercept:
            beta = (np.concatenate([cvec(intercept_value), coefTemp.values],
                                   axis=0))

            beta = pd.DataFrame(beta,
                                index=['(Intercept)'] + list(self.colnames))
        else:
            beta = coefTemp

        s1 = np.sqrt(np.var(e1, ddof=1))

        self.est = {
            'coefficients': beta,
            'beta': pd.DataFrame(coefTemp, index=self.colnames),
            'intercept': intercept_value,
            'index': ind1,
            'lambda': pd.DataFrame(lmbda, index=self.colnames),
            'lambda0': lmbda0,
            'loadings': Ups1,
            'residuals': cvec(e1),
            'sigma': s1,
            'iter': mm,
            #'call': Not a Python option
            'options': {
                'post': self.post,
                'intercept': self.intercept,
                'ind.scale': ind,
                'mu': mu,
                'meanx': meanx
            },
        }

        if model:
            self.x = self.x + np.ones(shape=(self.n, 1)) @ meanx.T

            self.est['model'] = self.x
        else:
            self.est['model'] = None

        self.est['tss'] = ((self.y - np.mean(self.y))**2).sum()
        self.est['rss'] = (self.est['residuals']**2).sum()
        self.est['dev'] = self.y - np.mean(self.y)
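
# A usage sketch (an assumption: the class above is the rlasso estimator
# invoked in Example #4, in an hdm-style module):
#
#   est = rlasso(X, y, post=True).est
#   est['coefficients']  # penalized coefficients, incl. '(Intercept)'
#   est['index'].sum()   # number of selected regressors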
Example #15
suburb_dummies = pd.get_dummies(dataset_dr[["Type", "Method"]])

full_Data = dataset_dr.drop([
    "Address", "Price", "Date", "SellerG", "Suburb", "Type", "Method",
    "CouncilArea", "Regionname"
],
                            axis=1).join(suburb_dummies)

X = full_Data
y = dataset_dr["Price"]

# Split into test data and training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train the algorithm
regressor = lm()
regressor.fit(X_train, y_train)
print("Intercept: {}".format(regressor.intercept_))
coeff_df = pd.DataFrame(regressor.coef_, X.columns, columns=['Coefficient'])
ranked_suburbs = coeff_df.sort_values("Coefficient", ascending=False)
print(ranked_suburbs)

# Calculate linear predictions
y_pred = regressor.predict(X_test)

# Metrics
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print("MAE:", metrics.mean_absolute_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

# Plot
Example #16
print(iris_df[:5])

# Split into train and test sets (to guard against overfitting)
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(
    iris_df, test_size=0.3)  # shuffle, then split train:test 7:3
print(train_set.shape)  #(105, 6)
print(test_set.shape)  #(45, 6)

print('\nLinearRegression')
# Regression method 1 - linear regression (least squares)
from sklearn.linear_model import LinearRegression as lm
import matplotlib.pyplot as plt

model = lm().fit(X=train_set.iloc[:, [2]],
                 y=train_set.iloc[:, [3]])  # fit the model on the train data
print(model.score(X=train_set.iloc[:, [2]], y=train_set.iloc[:, [3]]))
print(model.score(X=test_set.iloc[:, [2]], y=test_set.iloc[:, [3]]))
print(model.coef_)  #[[ 0.40847816]]
print(model.intercept_)  #[-0.33677518]
print('predict : ', model.predict(test_set.iloc[:, [2]]))  # evaluate on the test data

#plot
plt.scatter(train_set.iloc[:, [2]], train_set.iloc[:, [3]], color='black')
plt.plot(test_set.iloc[:, [2]], model.predict(test_set.iloc[:, [2]]))
plt.show()

print('\nRidge')
# Regression method 2 - Ridge: tune alpha to avoid over/underfitting
from sklearn.linear_model import Ridge
Example #17
train_set, test_set = train_test_split(iris_df, test_size=0.3)

print(train_set.shape)

print(test_set.shape)

print('\nLinearRegression')

# Regression method 1 - linear regression (least squares)

from sklearn.linear_model import LinearRegression as lm

import matplotlib.pyplot as plt

model = lm().fit(X=train_set.iloc[:, [2]], y=train_set.iloc[:, [3]])

print(model.score(X=train_set.iloc[:, [2]], y=train_set.iloc[:, [3]]))

print(model.score(X=test_set.iloc[:, [2]], y=test_set.iloc[:, [3]]))

print(model.coef_)  #[[ 0.40847816]]

print(model.intercept_)  #[-0.33677518]

print('predict : ', model.predict(test_set.iloc[:, [2]]))

#plot

plt.scatter(train_set.iloc[:, [2]], train_set.iloc[:, [3]], color='black')
Example #18
print()
# end of 1st commit

# 2nd commit start (missing values replaced with the respective column mean)
data = data.fillna(data.mean())
# end of 2nd commit 


y=data.bphi
x=data.drop('bphi',axis=1)

m=x.shape[0]
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

from sklearn.linear_model import LinearRegression as lm
model=lm().fit(x_train,y_train)

test = x_test.head(1)
predictions = model.predict(test)
import matplotlib.pyplot as plt
plt.scatter(y_test.head(1),predictions)
plt.xlabel("True Values")
plt.ylabel("Predictions")


predictions


predictions[0:1000]

Example #19
#Cleaning data
df = pd.read_csv('housePractice.csv')
df['date'] = pd.to_datetime(df.date)
df.head()

#Splitting Data into training and test
y = df['price']
x = df[[
    'bedrooms', 'bathrooms', 'floors', 'sqft_living', 'sqft_lot', 'waterfront',
    'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built',
    'sqft_living15', 'sqft_lot15'
]]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

#Fitting into a linear regression model
model = lm()
model.fit(x_train, y_train)
predictions = model.predict(x_test)
df1 = pd.DataFrame({
    'Actual': y_test,
    'predicted': predictions,
})

#Calculating test error (R^2 on the held-out split)
r_sq = model.score(x_test, y_test)
print('coefficient of determination:', r_sq)

#Building a model for a different split
x1_train, x1_test, y1_train, y1_test = train_test_split(x, y, test_size=0.3)
model = lm()
model.fit(x1_train, y1_train)