def init_values(X, y, number=5, intercept=True):
    """ Return an initial parameter guess for a LASSO model

    Inputs
    y: n by 1 NumPy array, outcome variable
    X: n by k NumPy array, RHS variables

    Outputs
    residuals: n by 1 NumPy array, residuals for initial parameter guess
    coefficients: k by 1 NumPy array, initial coefficient values
    """
    # Make sure y is a proper column vector
    y = cvec(y)

    # Get the absolute value of correlations between y and X
    corr = np.abs(cor(y, X))

    # Get the number of columns of X
    kx = X.shape[1]

    # Make an index selecting the `number` columns of X which are most
    # correlated with y (since .argsort() always sorts in increasing order,
    # selecting from the back gets the most highly correlated columns)
    index = corr.argsort()[-np.amin([number, kx]):]

    # Set up an array of coefficient guesses
    coefficients = np.zeros(shape=(kx, 1))

    # Regress y on the most correlated columns of X, including an intercept
    # if desired
    reg = lm(fit_intercept=intercept).fit(X[:, index], y)

    # Replace the guesses for the estimated coefficients (note that .coef_
    # does not return the estimated intercept, if one was included in the
    # model)
    coefficients[index, :] = reg.coef_.T

    # Replace any NaNs with zeros
    coefficients[np.isnan(coefficients)] = 0

    # Get the regression residuals
    residuals = y - reg.predict(X[:, index])

    # Return the residuals and coefficients
    return {'residuals': residuals, 'coefficients': coefficients}
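# A minimal usage sketch of init_values(), assuming the cvec() and cor()
# helpers it calls, plus the lm alias for sklearn's LinearRegression, are in
# scope; the simulated data below are purely illustrative.
import numpy as np

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(100, 20))
y_demo = X_demo[:, :3] @ np.array([[1.0], [0.5], [-0.5]]) \
    + rng.normal(size=(100, 1))

init = init_values(X_demo, y_demo, number=5, intercept=True)
print(init['coefficients'].shape)  # (20, 1); at most 5 entries are nonzero
print(init['residuals'].shape)  # (100, 1)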
# Two-group design matrix (KO vs. WT), mirroring limma's model.matrix
design = DataFrame([[1, 1, 1, 1, 1, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 1, 1, 1, 1]],
                   index=['KO', 'WT']).T

# lm.fit
df = data_set.copy()
df_mean = df.mean(1)
n_rows, n_cols = df.shape

# lm.series: fit a linear model row by row, skipping missing values
for i in df.index:
    y, y_mask = df.loc[i], df.loc[i].notnull().values
    if y[y_mask].count() != 0:
        x = design[y_mask]
        y = y[y_mask]
        lm_res = lm().fit(x, y)
        # Unscaled coefficient standard deviations, i.e. the square roots of
        # the diagonal of (X'X)^{-1}; the original call to np.linalg.qr() was
        # missing its argument, so this is the assumed intent (it mirrors
        # limma's stdev.unscaled)
        std_unscaled = np.sqrt(np.diag(np.linalg.inv(x.T @ x)))

# Remaining steps from the R limma pipeline, not yet ported:
# fit$genes <- y$probes
# fit$Amean <- y$Amean
# fit$method <- method
# fit$design <- design
# new("MArrayLM", fit)
# make.contrasts
# contrasts.fit
# ebayes
import pandas as pd
from sklearn.linear_model import LinearRegression as lm
import statsmodels.formula.api as smf
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

df = pd.read_csv("bike.csv")
df.head()

# Build a formula from all predictors and compute a VIF for each column of
# the resulting design matrix
features = "+".join(df.columns[1:-3])
y, X = dmatrices("casual ~ " + features, df, return_type="dataframe")
df_vif = pd.DataFrame()
df_vif["VIF"] = [vif(X.values, i) for i in range(X.shape[1])]
df_vif["features"] = X.columns
df_vif

# Fit the same model with statsmodels for a full summary
model1 = smf.ols("casual ~ " + features, data=df)
print(model1.fit().summary())

# Fit with sklearn and predict on the first three rows
X_df = df.iloc[:, 1:-3]
model2 = lm().fit(X_df, y)
model2.predict(X_df.iloc[:3, :])
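# Optional follow-up (a sketch): flag design-matrix columns whose VIF exceeds
# the common rule-of-thumb threshold of 10, excluding patsy's "Intercept"
# column, which is usually left out of VIF diagnostics.
high_vif = df_vif[(df_vif["VIF"] > 10) & (df_vif["features"] != "Intercept")]
print(high_vif)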
def rlassoEffect(x, y, d, method='double selection', I3=None, post=True,
                 colnames_d=None, colnames_x=None, intercept=True,
                 model=True, homoskedastic=False, X_dependent_lambda=False,
                 lambda_start=None, c=1.1, gamma=None, numSim=5000,
                 numIter=15, tol=10**(-5), threshold=-np.inf, par=True,
                 corecap=np.inf, fix_seed=True):
    d = cvec(d)
    y = cvec(y)

    n, kx = x.shape

    if colnames_d is None:
        colnames_d = ['d1']

    if (colnames_x is None) and (x is not None):
        colnames_x = ['x' + str(i) for i in np.arange(kx)]

    if method == 'double selection':
        I1 = rlasso(x, d, post=post, colnames=colnames_x,
                    intercept=intercept, model=model,
                    homoskedastic=homoskedastic,
                    X_dependent_lambda=X_dependent_lambda,
                    lambda_start=lambda_start, c=c, gamma=gamma,
                    numSim=numSim, numIter=numIter, tol=tol,
                    threshold=threshold, par=par, corecap=corecap,
                    fix_seed=fix_seed).est['index']
        I2 = rlasso(x, y, post=post, colnames=colnames_x,
                    intercept=intercept, model=model,
                    homoskedastic=homoskedastic,
                    X_dependent_lambda=X_dependent_lambda,
                    lambda_start=lambda_start, c=c, gamma=gamma,
                    numSim=numSim, numIter=numIter, tol=tol,
                    threshold=threshold, par=par, corecap=corecap,
                    fix_seed=fix_seed).est['index']

        # The original code checks if type(I3) is bool, but I believe they
        # only do that to see whether it has been defined by the user
        if I3 is not None:
            I3 = cvec(I3)
            I = cvec(I1.astype(bool) | I2.astype(bool) | I3.astype(bool))
        else:
            I = cvec(I1.astype(bool) | I2.astype(bool))
        # missing here: names(I) <- union(names(I1), names(I2))

        if I.sum() == 0:
            I = None

        if I is None:
            # Guard: if nothing was selected, regress y on d alone
            x = d
        else:
            x = np.concatenate([d, x[:, I[:, 0]]], axis=1)

        reg1 = lm(fit_intercept=True).fit(x, y)
        alpha = reg1.coef_[0, 0]
        names_alpha = colnames_d

        resid = y - cvec(reg1.predict(x))

        if I is None:
            xi = resid * np.sqrt(n / (n - 1))
        else:
            xi = resid * np.sqrt(n / (n - I.sum() - 1))

        if I is None:
            # Fit an intercept-only model
            reg2 = lm(fit_intercept=False).fit(np.ones_like(d), d)
            v = d - cvec(reg2.predict(np.ones_like(d)))
        else:
            reg2 = lm(fit_intercept=True).fit(x[:, 1:], d)
            v = d - cvec(reg2.predict(x[:, 1:]))

        # Heteroskedasticity-robust variance of the treatment coefficient
        var = ((1 / n) * (1 / np.mean(v**2, axis=0))
               * np.mean((v**2) * (xi**2), axis=0)
               * (1 / np.mean(v**2, axis=0)))

        se = np.sqrt(var)
        tval = alpha / np.sqrt(var)
        pval = 2 * norm.cdf(-np.abs(tval))

        if I is None:
            no_selected = 1
        else:
            no_selected = 0

        res = {'epsilon': xi, 'v': v}

        if np.issubdtype(type(colnames_d), np.str_):
            colnames_d = [colnames_d]

        results = {
            'alpha': alpha,
            #'se': pd.DataFrame(se, index=colnames_d),
            'se': se,
            't': tval,
            'pval': pval,
            'no_selected': no_selected,
            'coefficients': alpha,
            'coefficient': alpha,
            'coefficients_reg': reg1.coef_,
            'selection_index': I,
            'residuals': res,
            #call = match.call(),
            'samplesize': n
        }
    elif method == 'partialling out':
        reg1 = rlasso(x, y, post=post, colnames=colnames_x,
                      intercept=intercept, model=model,
                      homoskedastic=homoskedastic,
                      X_dependent_lambda=X_dependent_lambda,
                      lambda_start=lambda_start, c=c, gamma=gamma,
                      numSim=numSim, numIter=numIter, tol=tol,
                      threshold=threshold, par=par, corecap=corecap,
                      fix_seed=fix_seed)
        yr = reg1.est['residuals']

        reg2 = rlasso(x, d, post=post, colnames=colnames_x,
                      intercept=intercept, model=model,
                      homoskedastic=homoskedastic,
                      X_dependent_lambda=X_dependent_lambda,
                      lambda_start=lambda_start, c=c, gamma=gamma,
                      numSim=numSim, numIter=numIter, tol=tol,
                      threshold=threshold, par=par, corecap=corecap,
                      fix_seed=fix_seed)
        dr = reg2.est['residuals']

        reg3 = lm(fit_intercept=True).fit(dr, yr)
        alpha = reg3.coef_[0, 0]

        resid = yr - cvec(reg3.predict(dr))

        # This is a difference to the original code. The original code uses
        # var <- vcov(reg3)[2, 2], which is the homoskedastic covariance
        # estimator for OLS. I wrote get_cov() to calculate that, because
        # the linear regression implementation in sklearn does not include
        # standard error calculations. (I could have switched to statsmodels
        # instead, but sklearn seems more likely to be maintained in the
        # future.) I then added the option to get_cov() to calculate
        # heteroskedastic standard errors. I believe that if the penalty
        # term is adjusted for heteroskedasticity, heteroskedastic standard
        # errors should also be used here, to be internally consistent.
        var = np.array(
            [get_cov(dr, resid, homoskedastic=homoskedastic)[1, 1]])

        se = np.sqrt(var)
        tval = alpha / np.sqrt(var)
        pval = 2 * norm.cdf(-np.abs(tval))

        res = {'epsilon': resid, 'v': dr}

        I1 = reg1.est['index']
        I2 = reg2.est['index']

        I = cvec(I1.astype(bool) | I2.astype(bool))
        # names(I) <- union(names(I1), names(I2))

        results = {
            'alpha': alpha,
            'se': se,
            't': tval,
            'pval': pval,
            'coefficients': alpha,
            'coefficient': alpha,
            'coefficients_reg': reg1.est['coefficients'],
            'selection_index': I,
            'residuals': res,
            #call = match.call(),
            'samplesize': n
        }

    return results
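# get_cov() is used above but not defined in this excerpt. Below is a
# minimal sketch of what such a helper could compute, under the assumption
# that it prepends an intercept column (so that index [1, 1] is the slope
# variance) and supports both the classical and the White (HC0) estimators;
# the name get_cov_sketch marks it as hypothetical.
import numpy as np

def get_cov_sketch(X, e, homoskedastic=False):
    # OLS coefficient covariance from regressors X (n by k) and residuals
    # e (n by 1), with an intercept column prepended
    X = np.concatenate([np.ones(shape=(X.shape[0], 1)), X], axis=1)
    n, k = X.shape
    XXinv = np.linalg.inv(X.T @ X)
    if homoskedastic:
        # Classical estimator: sigma^2 * (X'X)^{-1}
        sigma2 = (e.T @ e) / (n - k)
        return sigma2 * XXinv
    else:
        # White (HC0) sandwich: (X'X)^{-1} X' diag(e^2) X (X'X)^{-1}
        meat = X.T @ (X * e**2)
        return XXinv @ meat @ XXinv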
    # (Tail of AgregarCampo, whose definition starts above this excerpt;
    # each region dummy is reshaped using its own length)
    regionnorthwest = np.array(regionnorthwest).reshape(
        len(regionnorthwest), 1)
    regionsoutheast = np.array(regionsoutheast).reshape(
        len(regionsoutheast), 1)
    regionsouthwest = np.array(regionsouthwest).reshape(
        len(regionsouthwest), 1)
    return np.concatenate(
        (datanum[:, :-1], regionnorthwest, regionsoutheast, regionsouthwest),
        1)

X = AgregarCampo(X)
print(X)
print(X.shape)
print(Y.shape)

# Fit the regression and evaluate in-sample
reg_mod = lm()
reg_mod.fit(X, Y)
y_predict = reg_mod.predict(X)
reg_mod.coef_
mse = mean_squared_error(Y, y_predict)
r2 = r2_score(Y, y_predict)
print('Slope:', reg_mod.coef_)
print('Intercept:', reg_mod.intercept_)
print('Mean squared error: ', mse)
print('R2 score: ', r2)
X_train, X_test, y_train, y_test = train_test_split(
    bottle_df[["Salnty", "STheta"]], bottle_df["T_degC"],
    test_size=.2, random_state=0)
X_train = X_train.assign(intercept=1)
X_test = X_test.assign(intercept=1)

""" Manual calculation """
# Closed-form OLS via the normal equation: theta = (X'X)^{-1} X'y
theta_best = np.linalg.inv(X_train.T.dot(X_train)).dot(X_train.T).dot(y_train)
print("Coefficients: ", theta_best)
y_predict = X_test.dot(theta_best)
y_predict.head()

""" sklearn method """
lm_mod = lm().fit(X_train, y_train)
print('Coefficients: \n', lm_mod.coef_)
y_predict_train_sk = pd.DataFrame(lm_mod.predict(X_train),
                                  columns=["y_predict"])
y_predict_test_sk = pd.DataFrame(lm_mod.predict(X_test),
                                 columns=["y_predict"])

""" Evaluate """
print("Model mean squared error: %.2f"
      % metrics.mean_squared_error(y_train, y_predict_train_sk.y_predict))
print("Model explained variance: %.2f"
      % metrics.explained_variance_score(y_train,
                                         y_predict_train_sk.y_predict))
print("Model r-squared: %.2f"
      % metrics.r2_score(y_train, y_predict_train_sk.y_predict))
print("Holdout mean squared error: %.2f"
      % metrics.mean_squared_error(y_test, y_predict_test_sk.y_predict))
print("Holdout explained variance: %.2f"
      % metrics.explained_variance_score(y_test,
                                         y_predict_test_sk.y_predict))
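""" Optional cross-check (a sketch): both approaches solve the same least
squares problem, so their test-set predictions should agree up to
floating-point conditioning of the normal equations """
print("Max |manual - sklearn| prediction gap: %.2e"
      % np.abs(X_test.dot(theta_best)
               - y_predict_test_sk.y_predict.values).max())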
def forecast(city_name):
    os.chdir(r"C:\Users\Administrator\Desktop")
    frequency = 3
    start_date = '1-JAN-2019'
    end_date = '1-JAN-2020'
    api_key = 'e60a5f5f96574a33947210842201502'
    #city_name = input('Enter city name: ')
    location_list = [city_name]
    hist_weather_data = retrieve_hist_data(api_key,
                                           location_list,
                                           start_date,
                                           end_date,
                                           frequency,
                                           location_label=False,
                                           export_csv=True,
                                           store_df=True)
    path = "C:\\Users\\Administrator\\Desktop\\"
    data = pd.read_csv(path + city_name + ".csv")

    # Drop the unnecessary columns in the data
    data = data.drop([
        "date_time", 'maxtempC', 'DewPointC', 'mintempC', 'sunHour',
        'moon_illumination', 'moonrise', 'moonset', 'sunrise', 'sunset',
        'HeatIndexC', 'WindChillC', 'WindGustKmph', 'totalSnow_cm'
    ], axis=1, inplace=False)
    data.to_csv(city_name + '.csv')

    params = {
        'access_key': '7f31a3c1baed8dddc5b06a0448f4b534',
        'query': city_name
    }
    api_result = requests.get('http://api.weatherstack.com/current', params)
    arr = []
    api_response = api_result.json()
    print('\n')
    print(u'Given City Name: %s' % (api_response['location']['name']))
    # These variables a to k can be returned to get the current details
    print(u'Current temperature is %d℃'
          % (api_response['current']['temperature']))
    a = api_response['current']['temperature']
    print(u'Current Humidity is %d' % (api_response['current']['humidity']))
    b = api_response['current']['humidity']
    print(u'Current Pressure is %d Pascal'
          % (api_response['current']['pressure']))
    c = api_response['current']['pressure']
    print(u'Current Cloud Cover is %d'
          % (api_response['current']['cloudcover']))
    d = api_response['current']['cloudcover']
    print(u'Current Precipitation is %d'
          % (api_response['current']['precip']))
    e = api_response['current']['precip']
    print(u'Current Visibility is %d'
          % (api_response['current']['visibility']))
    f = api_response['current']['visibility']
    print(u'Current Wind Speed is %d'
          % (api_response['current']['wind_speed']))
    g = api_response['current']['wind_speed']
    print(u'Current Feels Like is %d'
          % (api_response['current']['feelslike']))
    h = api_response['current']['feelslike']
    print(u'Current Wind Direction is %s'
          % (api_response['current']['wind_dir']))
    i = api_response['current']['wind_dir']
    print(u'Current UV Index is %d' % (api_response['current']['uv_index']))
    j = api_response['current']['uv_index']
    print(u'Current Wind Degree is %d'
          % (api_response['current']['wind_degree']))
    k = api_response['current']['wind_degree']

    # Re-read the saved data from the csv file
    path = "C:\\Users\\Administrator\\Desktop\\"
    data = pd.read_csv(path + city_name + ".csv")

    # For pressure
    X = data.drop(['pressure'], axis=1)
    Y = data['pressure']
    Y = Y.values.reshape(-1, 1)
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.01)
    model = lm().fit(x_train, y_train)
    pressure = model.predict(x_test)
    print(pressure, 'This is the pressure in pascal for the input')

    # For temperature
    X = data.drop(['tempC'], axis=1)
    Y = data['tempC']
    Y = Y.values.reshape(-1, 1)
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.01)
    model = lm().fit(x_train, y_train)
    temp = model.predict(x_test)
    print(temp, 'This is the temperature in degrees C for the input')

    # For humidity
    X = data.drop(['humidity'], axis=1)
    Y = data['humidity']
    Y = Y.values.reshape(-1, 1)
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.01)
    model = lm().fit(x_train, y_train)
    hum = model.predict(x_test)
    print(hum, 'This is the humidity for the input')

    pressure = str(pressure)
    temp = str(temp)
    hum = str(hum)
    return temp, pressure, hum
iris_df['target'] = iris.target
iris_df['target_names'] = iris.target_names[iris.target]
print(iris_df.head(3), '\n')

# Split the data into a train dataset and a test dataset (7:3)
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(iris_df, test_size=0.3)
print('train:', train_set.shape)  # 105
print('test:', test_set.shape)  # 45
print()

#----------------------------------------
# Regression method 1 - linear regression using the OLS (least squares)
# algorithm
model_ols = lm().fit(X=train_set.iloc[:, [2]],
                     y=train_set.iloc[:, [3]])  # note the capital X
#print(model_ols.coef_)
#print(model_ols.intercept_)
#print('ols predict : \n', model_ols.predict(test_set.iloc[:, [2]]))  # validate the fitted model
#print('actual values:\n', train_set.iloc[:, [3]])

# Compare predictive fit on the train and test sets
print('Method 1 (OLS) - train score : ',
      model_ols.score(X=train_set.iloc[:, [2]], y=train_set.iloc[:, [3]]))
print('Method 1 (OLS) - test score : ',
      model_ols.score(X=test_set.iloc[:, [2]], y=test_set.iloc[:, [3]]))

plt.scatter(train_set.iloc[:, [2]], train_set.iloc[:, [3]], color='green')
plt.plot(test_set.iloc[:, [2]], model_ols.predict(test_set.iloc[:, [2]]))
plt.show()
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression as lm

netflix_data = pd.read_csv("netflix_titles.csv")
netflix_data.director.fillna("No Director", inplace=True)
netflix_data.cast.fillna("No Cast", inplace=True)
netflix_data.country.fillna("Country Unavailable", inplace=True)
netflix_data.dropna(subset=["date_added", "rating"], inplace=True)

smaller_data = netflix_data.head(1000).copy()
y = smaller_data.listed_in
X = smaller_data.cast
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1000)  # random_state=1000 ???

# Vectorize the comma-separated cast and genre lists with separate
# vectorizers, so the X vocabulary is not overwritten when fitting on y
matrix_x = CountVectorizer(tokenizer=lambda x: x.split(','))
matrix_y = CountVectorizer(tokenizer=lambda x: x.split(','))
x_train_fit = matrix_x.fit_transform(X_train)
x_test_fit = matrix_x.transform(X_test)
y_train_fit = matrix_y.fit_transform(y_train)
y_test_fit = matrix_y.transform(y_test)
print(x_train_fit.shape)
print(x_test_fit.shape)
print(y_train_fit.shape)
print(y_test_fit.shape)

# LinearRegression needs a dense target, so convert the sparse y matrix
model = lm().fit(x_train_fit, y_train_fit.toarray())
#print(model.score(x_test_fit, y_test_fit.toarray()))
#predictions = model.predict(x_test_fit)
#plt.scatter(y_test, predictions)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as lm
from sklearn import svm, metrics
import joblib  # sklearn.externals.joblib has been removed from sklearn

dataset = pd.read_csv('kddcup99.csv', low_memory=False)
##print("Whole dataset count : \n", dataset.shape)
##print("\n\nColumns in whole dataset : \n", dataset.columns)
##print(dataset.count)
##dataset.plot
##plt.show()

y = dataset.label
# Drop the target from the features (the original dropped only 'flag',
# which left the label inside x); note that the string-valued columns
# (protocol_type, service, flag) and the string label would still need
# encoding before lm().fit can run
x = np.array(dataset.drop(['label'], axis=1))
##x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
##print("\n\nTraining dataset x : \n", x_train.shape)
##print("\n\nTraining dataset y : \n", y_train.shape)
##print("\n\nTesting dataset x : \n", x_test.shape)
##print("\n\nTesting dataset y : \n", y_test.shape)

gb = dataset.groupby(['protocol_type', 'service', 'flag', 'label'])
##print("\n\nDisplaying types of Protocols, Services, Flags and Labels which are used : \n", gb.first())

## Training Linear Regression Model
model = lm().fit(x, y)

# Creating model which can be imported
joblib.dump(model, 'model.pkl')
# Mini-batch gradient descent over several batch sizes; theta is
# re-initialized for each size so the runs start from comparable points
for size in minibatch_size:
    theta = np.random.randn(3, 1)  # random initialization
    for epoch in range(n_iterations):
        shuffled_indices = np.random.permutation(m)
        X_b_shuffled = X_train[shuffled_indices]
        y_shuffled = y_train[shuffled_indices]
        for i in range(0, m, size):
            xi = X_b_shuffled[i:i + size]
            yi = y_shuffled[i:i + size]
            gradients = 2 / size * np.asarray(xi).T.dot(xi.dot(theta) - yi)
            theta = theta - eta * gradients
            theta_path_mgd.append(theta)
    best_thetas.append(theta)

# Closed-form sklearn fit for comparison
(lm().fit(X_train, y_train)).coef_

y_predict_50 = X_test.dot(best_thetas[0])
y_predict_2000 = X_test.dot(best_thetas[1])
y_predict_10000 = X_test.dot(best_thetas[2])

for i in range(len(best_thetas)):
    print(f'minibatch size: {minibatch_size[i]}')
    print(f'Coefficients: {best_thetas[i]}')
    print("\n")
    print("Holdout mean squared error: %.2f"
          % metrics.mean_squared_error(y_test, X_test.dot(best_thetas[i])))
    print("Holdout explained variance: %.2f"
          % metrics.explained_variance_score(y_test,
                                             X_test.dot(best_thetas[i])))
    print("Holdout r-squared: %.2f"
          % metrics.r2_score(y_test, X_test.dot(best_thetas[i])))
from sklearn.model_selection import train_test_split

y = Airbnb_data['price']
x = Airbnb_data.drop('price', axis=1)
X = x.apply(pd.to_numeric, errors='coerce')
Y = y.apply(pd.to_numeric, errors='coerce')
xTrain, xTest, yTrain, yTest = train_test_split(X, Y, test_size=0.3,
                                                random_state=42)

from sklearn.linear_model import LinearRegression as lm
from math import sqrt

regressor = lm().fit(xTrain, yTrain)
predictions = regressor.predict(xTest)

from sklearn.metrics import mean_squared_error, r2_score
print("Mean squared error: %.2f" % mean_squared_error(yTest, predictions))
print("R-square: %.2f" % r2_score(yTest, predictions))

# Ridge regression with a grid search over the penalty strength
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

ridge = Ridge()
parameters = {
    'alpha': [
        1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, 10, 20, 30, 35, 40, 45, 50,
        55, 100
    ]
}
import pandas as pd
import numpy as np

# Import the train dataset (here, we have no train/test split, it is all
# train)
DiabetesTakingMed = pd.read_csv('DiabetesTakingMedF.csv', index_col=0)
DiabetesTakingMed = DiabetesTakingMed.drop('IsTrain', axis=1)
DiabetesNoMiddle = DiabetesTakingMed[DiabetesTakingMed['readmitted'] != 1]
trainX01 = DiabetesNoMiddle.drop('readmitted', axis=1)
trainY01 = DiabetesNoMiddle['readmitted'].replace([2], [1])

# Remove train data where patients came back after 30 days, who look very
# similar to those returning <30 days:
from sklearn.linear_model import LinearRegression as lm
lm = lm()  # note: this rebinds the name lm from the class to an instance
lm.fit(trainX01, trainY01)
DiabetesMiddle = DiabetesTakingMed[DiabetesTakingMed['readmitted'] == 1]
MiddleX = DiabetesMiddle.drop('readmitted', axis=1)
MiddleY = DiabetesMiddle['readmitted']
predictarray = lm.predict(MiddleX)
MiddleDF75 = DiabetesMiddle.loc[predictarray < 0.75]
FinalTrain = pd.concat([DiabetesNoMiddle, MiddleDF75], axis=0)

# Get the logistic regression fit object, after removing specific columns:
TrainLR = FinalTrain.drop(['diabfeat_neurologic', 'race_AfricanAmerican',
                           'A1Cresult_>7', 'primarydiag_injury',
                           'number_diagnoses', 'med_glimepiride',
                           'med_insulin', 'diag_infection',
                           'medical_specialty_Orthopedics',
                           'med_nateglinide',
                           'discharge_disposition_leftAMA',
def __init__(self, x, y, colnames=None, post=True, intercept=True,
             model=True, homoskedastic=False, X_dependent_lambda=False,
             lambda_start=None, c=1.1, gamma=None, numSim=5000, numIter=15,
             tol=10**(-5), threshold=-np.inf, par=True, corecap=np.inf,
             fix_seed=True):
    # Initialize internal variables
    if isinstance(x, pd.DataFrame) and colnames is None:
        colnames = x.columns

    self.x = np.array(x).astype(np.float32)
    self.y = cvec(y).astype(np.float32)

    self.n, self.p = self.x.shape

    if colnames is None:
        self.colnames = ['V' + str(i + 1) for i in np.arange(self.p)]
    else:
        self.colnames = colnames

    # Unused line in the original code
    # ind_names = np.arange(self.p) + 1

    self.post = post
    self.intercept = intercept
    self.model = model
    self.homoskedastic = homoskedastic
    self.X_dependent_lambda = X_dependent_lambda
    self.lambda_start = lambda_start
    self.c = c

    if gamma is None:
        self.gamma = .1 / np.log(self.n)
    else:
        self.gamma = gamma

    self.numSim = numSim
    self.numIter = numIter
    self.tol = tol
    self.threshold = threshold
    self.par = par
    self.corecap = corecap
    self.fix_seed = fix_seed

    if (self.post == False) and (self.c is None):
        self.c = .5

    if ((self.post == False) and (self.homoskedastic == False)
            and (self.X_dependent_lambda == False)
            and (self.lambda_start == None) and (self.c == 1.1)
            and (self.gamma == .1 / np.log(self.n))):
        self.c = .5

    # For now, instantiate estimate as None
    self.est = None

    # Calculate robust LASSO coefficients
    if self.intercept == True:
        meanx = cvec(self.x.mean(axis=0))
        self.x = self.x - np.ones(shape=(self.n, 1)) @ meanx.T
        mu = self.y.mean()
        self.y = self.y - mu
    else:
        meanx = np.zeros(shape=(self.p, 1))
        mu = 0

    # Per-column standard deviations of the regressors
    normx = np.sqrt(np.var(self.x, axis=0, ddof=1))
    Psi = cvec(np.mean(self.x**2, axis=0))
    ind = np.zeros(shape=(self.p, 1)).astype(bool)

    XX = self.x.T @ self.x
    Xy = self.x.T @ self.y

    startingval = init_values(self.x, self.y)['residuals']
    pen = lambdaCalculation(homoskedastic=self.homoskedastic,
                            X_dependent_lambda=self.X_dependent_lambda,
                            lambda_start=self.lambda_start, c=self.c,
                            gamma=self.gamma, numSim=self.numSim,
                            y=startingval, x=self.x, par=self.par,
                            corecap=self.corecap, fix_seed=self.fix_seed)
    lmbda = pen['lambda']
    Ups0 = Ups1 = pen['Ups0']
    lmbda0 = pen['lambda0']

    mm = 1
    s0 = np.sqrt(np.var(y, axis=0, ddof=1))

    while mm <= self.numIter:
        # The penalty is halved in the first iteration of the post-LASSO
        if (mm == 1) and self.post:
            coefTemp = (LassoShooting_fit(self.x, self.y, lmbda / 2,
                                          XX=XX, Xy=Xy)['coefficients'])
        else:
            coefTemp = (LassoShooting_fit(self.x, self.y, lmbda,
                                          XX=XX, Xy=Xy)['coefficients'])

        coefTemp[np.isnan(coefTemp)] = 0

        ind1 = (np.abs(coefTemp) > 0)
        x1 = self.x[:, ind1[:, 0]]

        if x1.shape[1] == 0:
            if self.intercept:
                intercept_value = np.mean(self.y + mu)
                coef = np.zeros(shape=(self.p + 1, 1))
                coef = (pd.DataFrame(coef, index=['(Intercept)']
                                     + list(self.colnames)))
            else:
                intercept_value = np.mean(self.y)
                coef = np.zeros(shape=(self.p, 1))
                coef = pd.DataFrame(coef, index=self.colnames)

            self.est = {
                'coefficients': coef,
                'beta': np.zeros(shape=(self.p, 1)),
                'intercept': intercept_value,
                'index': pd.DataFrame(
                    np.zeros(shape=(self.p, 1)).astype(bool),
                    index=self.colnames),
                'lambda': lmbda,
                'lambda0': lmbda0,
                'loadings': Ups0,
                'residuals': self.y - np.mean(self.y),
                'sigma': np.var(self.y, axis=0, ddof=1),
                'iter': mm,
                #'call': Not a Python option
                'options': {'post': self.post, 'intercept': self.intercept,
                            'ind.scale': ind, 'mu': mu, 'meanx': meanx}
            }

            if self.model:
                self.est['model'] = self.x
            else:
                self.est['model'] = None

            self.est['tss'] = self.est['rss'] = (
                ((self.y - np.mean(self.y))**2).sum())
            self.est['dev'] = self.y - np.mean(self.y)

            # In R, return() breaks while loops
            return

        # Refinement variance estimation
        if self.post:
            reg = lm(fit_intercept=False).fit(x1, self.y)
            coefT = reg.coef_.T
            coefT[np.isnan(coefT)] = 0
            e1 = self.y - x1 @ coefT
            coefTemp[ind1[:, 0]] = coefT
        else:
            e1 = self.y - x1 @ coefTemp[ind1[:, 0]]

        s1 = np.sqrt(np.var(e1, ddof=1))

        # Homoskedastic and X-independent
        if ((self.homoskedastic == True)
                and (self.X_dependent_lambda == False)):
            Ups1 = s1 * Psi
            lmbda = pen['lambda0'] * Ups1
        # Homoskedastic and X-dependent
        elif ((self.homoskedastic == True)
                and (self.X_dependent_lambda == True)):
            Ups1 = s1 * Psi
            lmbda = pen['lambda0'] * Ups1
        # Heteroskedastic and X-independent
        elif ((self.homoskedastic == False)
                and (self.X_dependent_lambda == False)):
            Ups1 = ((1 / np.sqrt(self.n))
                    * np.sqrt((e1**2).T @ self.x**2).T)
            lmbda = pen['lambda0'] * Ups1
        # Heteroskedastic and X-dependent
        elif ((self.homoskedastic == False)
                and (self.X_dependent_lambda == True)):
            lc = lambdaCalculation(
                homoskedastic=self.homoskedastic,
                X_dependent_lambda=self.X_dependent_lambda,
                lambda_start=self.lambda_start, c=self.c,
                gamma=self.gamma, numSim=self.numSim, y=e1, x=self.x,
                par=self.par, corecap=self.corecap,
                fix_seed=self.fix_seed)
            Ups1 = lc['Ups0']
            lmbda = lc['lambda']
        # If homoskedastic is set to None
        elif self.homoskedastic is None:
            Ups1 = ((1 / np.sqrt(self.n))
                    * np.sqrt((e1**2).T @ self.x**2).T)
            lmbda = pen['lambda0'] * Ups1

        mm = mm + 1

        if np.abs(s0 - s1) < self.tol:
            break

        s0 = s1

    if x1.shape[1] == 0:
        #coefTemp = None
        ind1 = np.zeros(shape=(self.p, 1))

    coefTemp = cvec(coefTemp)
    coefTemp[np.abs(coefTemp) < self.threshold] = 0
    coefTemp = pd.DataFrame(coefTemp, index=self.colnames)

    ind1 = cvec(ind1)
    ind1 = pd.DataFrame(ind1, index=self.colnames)

    if self.intercept:
        if mu is None:
            mu = 0
        if meanx is None:
            meanx = np.zeros(shape=(coefTemp.shape[0], 1))
        # Both branches are identical here (kept as in the original)
        if ind.sum() == 0:
            intercept_value = mu - (meanx * coefTemp).sum()
        else:
            intercept_value = mu - (meanx * coefTemp).sum()
    else:
        intercept_value = np.nan

    if self.intercept:
        beta = (np.concatenate([cvec(intercept_value), coefTemp.values],
                               axis=0))
        beta = pd.DataFrame(beta, index=['(Intercept)']
                            + list(self.colnames))
    else:
        beta = coefTemp

    s1 = np.sqrt(np.var(e1, ddof=1))

    self.est = {
        'coefficients': beta,
        'beta': pd.DataFrame(coefTemp, index=self.colnames),
        'intercept': intercept_value,
        'index': ind1,
        'lambda': pd.DataFrame(lmbda, index=self.colnames),
        'lambda0': lmbda0,
        'loadings': Ups1,
        'residuals': cvec(e1),
        'sigma': s1,
        'iter': mm,
        #'call': Not a Python option
        'options': {'post': self.post, 'intercept': self.intercept,
                    'ind.scale': ind, 'mu': mu, 'meanx': meanx},
        'model': model
    }

    if model:
        self.x = self.x + np.ones(shape=(self.n, 1)) @ meanx.T
        self.est['model'] = self.x
    else:
        self.est['model'] = None

    self.est['tss'] = ((self.y - np.mean(self.y))**2).sum()
    self.est['rss'] = (self.est['residuals']**2).sum()
    self.est['dev'] = self.y - np.mean(self.y)
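# A minimal usage sketch, assuming this __init__ belongs to the rlasso class
# used elsewhere in this module and that its helpers (cvec, init_values,
# lambdaCalculation, LassoShooting_fit) are defined alongside it; the
# simulated sparse design below is purely illustrative.
import numpy as np

rng = np.random.default_rng(42)
x_demo = rng.normal(size=(200, 50))
beta_demo = np.zeros(shape=(50, 1))
beta_demo[:5] = 1.0
y_demo = x_demo @ beta_demo + rng.normal(size=(200, 1))

fit = rlasso(x_demo, y_demo, post=True)
print(fit.est['coefficients'].head())  # intercept plus coefficient estimates
print(int(fit.est['index'].values.sum()))  # number of selected regressors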
suburb_dummies = pd.get_dummies(dataset_dr[["Type", "Method"]])
full_Data = dataset_dr.drop([
    "Address", "Price", "Date", "SellerG", "Suburb", "Type", "Method",
    "CouncilArea", "Regionname"
], axis=1).join(suburb_dummies)

X = full_Data
y = dataset_dr["Price"]

# Split into test data and training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train the algorithm
regressor = lm()
regressor.fit(X_train, y_train)
print("Intercept: {}".format(regressor.intercept_))
coeff_df = pd.DataFrame(regressor.coef_, X.columns, columns=['Coefficient'])
ranked_suburbs = coeff_df.sort_values("Coefficient", ascending=False)
print(ranked_suburbs)

# Calculate linear predictions
y_pred = regressor.predict(X_test)

# Metrics
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print("MAE:", metrics.mean_absolute_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

# Plot
print(iris_df[:5])

# Split into train and test sets (to help avoid overfitting)
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(
    iris_df, test_size=0.3)  # shuffle the data, then split train:test 7:3
print(train_set.shape)  # (105, 6)
print(test_set.shape)  # (45, 6)

print('\nLinearRegression')
# Regression method 1 - linear regression (least squares)
from sklearn.linear_model import LinearRegression as lm
import matplotlib.pyplot as plt

model = lm().fit(X=train_set.iloc[:, [2]],
                 y=train_set.iloc[:, [3]])  # train the model on the train data
print(model.score(X=train_set.iloc[:, [2]], y=train_set.iloc[:, [3]]))
print(model.score(X=test_set.iloc[:, [2]], y=test_set.iloc[:, [3]]))
print(model.coef_)  # [[ 0.40847816]]
print(model.intercept_)  # [-0.33677518]
print('predict : ',
      model.predict(test_set.iloc[:, [2]]))  # evaluate on the test data

# Plot
plt.scatter(train_set.iloc[:, [2]], train_set.iloc[:, [3]], color='black')
plt.plot(test_set.iloc[:, [2]], model.predict(test_set.iloc[:, [2]]))
plt.show()

print('\nRidge')
# Regression method 2 - Ridge: adjust alpha to avoid over/underfitting
from sklearn.linear_model import Ridge
train_set, test_set = train_test_split(iris_df, test_size=0.3)
print(train_set.shape)
print(test_set.shape)

print('\nLinearRegression')
# Regression method 1 - linear regression (least squares)
from sklearn.linear_model import LinearRegression as lm
import matplotlib.pyplot as plt

model = lm().fit(X=train_set.iloc[:, [2]], y=train_set.iloc[:, [3]])
print(model.score(X=train_set.iloc[:, [2]], y=train_set.iloc[:, [3]]))
print(model.score(X=test_set.iloc[:, [2]], y=test_set.iloc[:, [3]]))
print(model.coef_)  # [[ 0.40847816]]
print(model.intercept_)  # [-0.33677518]
print('predict : ',
      model.predict(test_set.iloc[:, [2]]))  # .ix is deprecated; use .iloc

# Plot
plt.scatter(train_set.iloc[:, [2]], train_set.iloc[:, [3]], color='black')
print()
# end of 1st commit

# 2nd commit start (missing values replaced with the respective column means)
data = data.fillna(data.mean())
# end of 2nd commit

y = data.bphi
x = data.drop('bphi', axis=1)
m = x.shape[0]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

from sklearn.linear_model import LinearRegression as lm
model = lm().fit(x_train, y_train)
test = x_test.head(1)
predictions = model.predict(x_test.head(1))

import matplotlib.pyplot as plt
plt.scatter(y_test.head(1), predictions)
plt.xlabel("True Values")
plt.ylabel("Predictions")
predictions
predictions[0:1000]
# Cleaning data
df = pd.read_csv('housePractice.csv')
df['date'] = pd.to_datetime(df.date)
df.head()

# Splitting data into training and test sets
y = df['price']
x = df[[
    'bedrooms', 'bathrooms', 'floors', 'sqft_living', 'sqft_lot',
    'waterfront', 'view', 'condition', 'grade', 'sqft_above',
    'sqft_basement', 'yr_built', 'sqft_living15', 'sqft_lot15'
]]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Fitting a linear regression model
model = lm()
model.fit(x_train, y_train)
predictions = model.predict(x_test)
df1 = pd.DataFrame({
    'Actual': y_test,
    'predicted': predictions,
})

# R-squared over the full dataset (note: not a held-out test error)
r_sq = model.score(x, y)
print('coefficient of determination:', r_sq)

# Building a model for a different split
x1_train, x1_test, y1_train, y1_test = train_test_split(x, y, test_size=0.3)
model = lm()
model.fit(x1_train, y1_train)