def linregress(X_train, X_test, y_train, y_test):
    """Plot the standardized single-feature regression slope of every column.

    Each column of ``X_train`` is standardized and regressed on ``y_train``
    on its own.  The slopes are sorted in descending order, drawn as a line
    plot, saved to ``images/coefficients.png`` and shown; the tick labels are
    printed at the end.

    Args:
        X_train: DataFrame of training features (one regression per column).
        X_test: Unused; kept for signature compatibility.
        y_train: Training targets.
        y_test: Unused; kept for signature compatibility.
    """
    coef = []
    for col in X_train.columns.tolist():
        # Select with a list so the scaler receives the 2-D input it requires.
        X = StandardScaler().fit_transform(X_train[[col]])
        lr = LinearRegression()
        lr.fit(X, y_train)
        # One feature -> one slope; keep it as a plain float for sorting/plotting.
        coef.append([col, float(lr.coef_.ravel()[0])])
    coef = sorted(coef, key=lambda x: x[1])[::-1]

    # Filter and rename in one pass instead of popping from the list while
    # iterating over it, which skipped the entry following 'Intercept'.
    display_names = {'doubles': '2B', 'triples': '3B'}
    kept = [(lab, val) for lab, val in coef if lab != 'Intercept']
    labs = [display_names.get(lab, lab).upper() for lab, _ in kept]
    nos = [val for _, val in kept]

    x = range(len(nos))
    plt.plot(x, nos, lw=2, c='b')
    plt.xticks(x, labs)
    plt.title('Linear Regression Coefficients (Win Percentage)')
    plt.savefig('images/coefficients.png')
    plt.show()
    print(labs)
def _reduce_X(self, X, i):
    """Remove the linear contribution of column ``i`` from ``X`` and drop that column.

    Every column is regressed on column ``i``; the fitted slope times column
    ``i`` is subtracted from it (the fitted intercept is not subtracted).
    Column ``i`` is then deleted from the result.

    Returns:
        Array with the same number of rows as ``X`` and one column fewer.
    """
    pivot = X[:, i]
    pivot_as_feature = pivot.reshape(-1, 1)
    residuals = np.zeros(X.shape)
    model = LinearRegression()
    for col in range(residuals.shape[1]):
        current = X[:, col]
        model.fit(X=pivot_as_feature, y=current.reshape(-1, 1))
        residuals[:, col] = current - model.coef_ * pivot
    return np.delete(residuals, i, axis=1)
def train_regressor(options, embed_map, wordvecs, worddict):
    """
    Return regressor to map word2vec to RNN word space

    Args:
        options: Mapping read for 'n_words' (vocabulary cut-off) and
            'dim_word' (width of the target vectors).
        embed_map: word2vec model; read through ``.vocab`` and ``embed_map[w]``.
        wordvecs: Mapping from word to its vector in the target space.
        worddict: Mapping whose keys are the candidate words, in order.

    Returns:
        A LinearRegression fitted from word2vec vectors to target vectors.
    """
    # Words known to word2vec; a set gives constant-time membership tests.
    known = set(embed_map.vocab.keys())

    # Assign a row index to every candidate word that word2vec also knows.
    shared = OrderedDict()
    count = 0
    # list(...) so the slice also works where keys() returns a view object.
    for w in list(worddict.keys())[:options['n_words'] - 2]:
        if w in known:
            shared[w] = count
            count += 1

    # Get the vectors for all words in 'shared'
    # NOTE(review): 300 is the hard-coded word2vec width; confirm it matches embed_map.
    w2v = numpy.zeros((len(shared), 300), dtype='float32')
    sg = numpy.zeros((len(shared), options['dim_word']), dtype='float32')
    for w, row in shared.items():
        w2v[row] = embed_map[w]
        sg[row] = wordvecs[w]

    clf = LinearRegression()
    clf.fit(w2v, sg)
    return clf
def linearRegressionExample(X, Y):
    """Fit a least-squares model without an intercept and return its coefficients.

    Args:
        X: Feature matrix passed to ``LinearRegression.fit``.
        Y: Targets passed to ``LinearRegression.fit``.

    Returns:
        The fitted ``coef_`` array (previously computed and discarded).
    """
    # fit_intercept defines whether an intercept term is fitted or not
    est = LinearRegression(fit_intercept=False)
    est.fit(X, Y)
    return est.coef_
def normalize_money_with_date():
    """Fit a linear trend of normalized yearly revenue and store it in coef.pickle.

    Reads ``train_test.pickle``, scales each movie's ``total_money`` by the
    training-set maximum, averages it per year for 2011-2015, fits a line
    through the five yearly means and stores slope/intercept under the
    ``'normalize_year'`` key of ``coef.pickle``.  Also writes a scatter/line
    plot to ``year_money.png`` and prints slope, intercept and R^2.
    """
    # Pickle streams are binary; text mode breaks on Python 3 and on Windows.
    with open('train_test.pickle', 'rb') as f:
        train_set, test_set = pickle.load(f)

    max_money = float(np.max([movie['total_money'] for movie in train_set]))
    year_money = np.array(
        [[movie['date'].year, float(movie['total_money']) / max_money]
         for movie in train_set],
        float)

    year_mean = np.zeros([5, 2])
    for y in range(5):
        yearly = year_money[year_money[:, 0] == 2011 + y, 1]
        plt.scatter(y * np.ones(np.shape(yearly)), yearly)
        # x is the year offset from the base year 2010 (1..5).
        year_mean[y, :] = np.array([1 + y, np.mean(yearly)], float)

    regressor = LinearRegression()
    regressor.fit(year_mean[:, 0:1], year_mean[:, 1])
    a, b = regressor.coef_, regressor.intercept_

    with open('coef.pickle', 'rb') as f:
        coef = pickle.load(f)
    coef['normalize_year'] = {'a': a, 'b': b, 'base': 2010}
    with open('coef.pickle', 'wb') as f:
        pickle.dump(coef, f)

    # Formatted so the output is identical under Python 2 and Python 3.
    print('%s %s %s' % (a, b, regressor.score(year_mean[:, 0:1], year_mean[:, 1])))
    plt.plot(year_mean[:, 1])
    plt.savefig('year_money.png')
def calc_task_two_one():
    """Fit a linear model of 'Price' on the ``x_list`` columns of the module-level ``df``.

    Emits a DeprecationWarning on every call.

    Returns:
        Tuple ``(model, X, y)``: the fitted LinearRegression, the feature
        array and the target array.
    """
    warnings.warn("deprecated", DeprecationWarning)
    fitted = LinearRegression()
    features = np.array(df[x_list].values)
    prices = df['Price'].values
    fitted.fit(features, prices)
    return fitted, features, prices
def RunLinearRegressionScikit(q):
    """Time a scikit-learn linear regression fit and report it through ``q``.

    ``self`` is not a parameter; it is read from the enclosing scope
    (``self.dataset``, ``self.verbose``).

    Args:
        q: Queue-like object; receives the elapsed time, or -1 on failure.

    Returns:
        The elapsed time reported by the timer, or -1 if fitting raised.
    """
    totalTimer = Timer()

    # Load input dataset.
    # If the dataset contains two files then the second file is the responses
    # file.
    Log.Info("Loading dataset", self.verbose)
    if len(self.dataset) == 2:
        X = np.genfromtxt(self.dataset[0], delimiter=',')
        y = np.genfromtxt(self.dataset[1], delimiter=',')
    else:
        # Single file: the last column holds the responses.
        X = np.genfromtxt(self.dataset, delimiter=',')
        y = X[:, (X.shape[1] - 1)]
        X = X[:, :-1]

    try:
        with totalTimer:
            # Perform linear regression.
            # n_jobs is a constructor argument; passing it to fit() raised a
            # TypeError that the handler below turned into a permanent -1.
            model = SLinearRegression(n_jobs=-1)
            model.fit(X, y)
    except Exception:
        # Deliberate best-effort: any failure is reported as -1.
        q.put(-1)
        return -1

    elapsed = totalTimer.ElapsedTime()
    q.put(elapsed)
    return elapsed
# Hold out the last two rows for testing; everything before them is training data.
# train_x is prepared earlier in the script.
train_y = data["Expected_output"][:-2].values.reshape(-1, 1)
test_x = data[["X", "Y"]][-2:].values.reshape(-1, 2)
test_y = data["Expected_output"][-2:].values.reshape(-1, 1)

model = LinearRegression()
model.fit(train_x, train_y)
coeff = model.coef_
intercept = model.intercept_

# NOTE(review): coeff[0] is the whole coefficient row, so each entry of
# `points` is a vector rather than one fitted value - confirm this is intended.
points = [intercept + (coeff[0] * i[0]) for i in train_x]
plt.plot(points, "ro")

predict_y = model.predict(test_x)
# Compare predictions with the matching held-out targets; the training targets
# have a different length and cannot be paired with test predictions.
plt.plot(test_y, predict_y, "b*")
print(predict_y)
plt.show()
# Clean the listings frame, fit a price model and persist it.
df.drop('type', axis=1, inplace=True)
# Drop rows whose bathroom count exceeds the bedroom count by more than one.
df.drop(df[df['bedroom_num'] < df['bathroom_num'] - 1].index, inplace=True)
# Keep only prices within +/- 75000 of the mean (mean computed once, before filtering).
mean_price = df['price'].mean()
df.drop(df[df['price'] < mean_price - 75000].index, inplace=True)
df.drop(df[df['price'] > mean_price + 75000].index, inplace=True)
df['price'].describe()
X = df.drop('price', axis=1)
y = df['price']
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# No random_state is given, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
from sklearn.metrics import r2_score, mean_squared_error
#print(model.score(X_test,y_test))
# Persist the fitted model.
with open('mumbai.pickle', 'wb') as f:
    pickle.dump(model, f)
# From here on X is reused for the '@'-joined feature column names.
X = list(X.columns)
X = '@'.join(X)
with open('locations2.txt', 'w') as file:
    file.write(X)
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Each value in previous_days is paired with the value that followed it.
# The feature is built directly as a column (2-D) because fit() expects
# samples in rows.
previous_days = np.array([[1], [7], [11], [21], [25], [28]])
next_days = np.array([7, 11, 21, 25, 28, 31])

# Supervised model: next_day = intercept_ + coef_ * previous_day
ml_model = LinearRegression().fit(previous_days, next_days)
ml_model.intercept_  # b in the formula
ml_model.coef_  # a in the formula (slope)

y_pred = ml_model.predict([[1]])
print(y_pred)
def main():
    """Compare ordinary least squares and ridge regression on the data from get_data().

    Fits both models without an intercept, prints their coefficients next to
    the module-level ``_beta``, prints training/validation losses, and prints
    an SVD-based analysis of the design matrix.  Returns nothing.
    """
    train_X, train_y, valid_X, valid_y, train_w, val_w = get_data()
    print("********* Linear Regression *********")
    linear_regression = LinearRegression(fit_intercept=False)
    linear_regression.fit(train_X, train_y)
    train_y_predictions = linear_regression.predict(train_X)
    valid_y_predictions = linear_regression.predict(valid_X)
    # The betas
    linear_reg_beta = linear_regression.coef_
    print("True betas:{}\nEstimated linear regression betas:{}\n".format(
        _beta, linear_reg_beta))
    print(
        "Training loss with true beta:{:.3f}\nValidation loss with true beta:{:.3f}"
        .format(compute_loss_square_with_true_betas(train_X, train_y, _beta),
                compute_loss_square_with_true_betas(valid_X, valid_y, _beta)))
    print("Training square loss:{:.3f}\nValidation square loss:{:.3f}".format(
        compute_square_loss(train_y, train_y_predictions),
        compute_square_loss(valid_y, valid_y_predictions)))
    # The mean squared error
    print("Linear regression mean squared error: {:.2f}, {:.2f}\n".format(
        mean_squared_error(train_y, train_y_predictions),
        mean_squared_error(valid_y, valid_y_predictions)))
    print("********* Linear Regression Error Analysis *********")
    # Stack training and validation rows into one design matrix.
    X = np.vstack([train_X, valid_X])
    print("Rank:{:d}".format(np.linalg.matrix_rank(X)))
    # Thin SVD; multiplying the factors back together should reproduce X.
    U, S, V = np.linalg.svd(X, full_matrices=False)
    X_SVD = U @ np.diag(S) @ V
    print("Is X close to X_SVD?", np.isclose(X, X_SVD).all())
    print("Singular values:{}".format(S))
    w = np.hstack([train_w, val_w])
    Inv_S = np.linalg.inv(np.diag(S))
    print("True betas:{}".format(_beta))
    print("Computed betas:{}".format(linear_reg_beta))
    # NOTE(review): presumably train_w/val_w are the noise terms, making this
    # the pseudo-inverse applied to the noise - confirm against get_data().
    beta_OLS_beta_true = (U @ Inv_S @ V).T @ w
    print("Difference:{}".format(beta_OLS_beta_true))
    # NOTE(review): 5 and 70 are hard-coded; presumably the number of features
    # and of training samples - TODO confirm.
    mean_training_error = 1 + 5 / 70
    variance_training_error = (70 - 5) / (70.**2)
    variance_training_error *= 2
    print("Mean of the average training error:{}\nVariance:{}".format(
        mean_training_error, variance_training_error))
    # np.cov treats each row of valid_X as a variable.
    valid_x_cov = np.cov(valid_X)
    # NOTE(review): the slice assumes valid_X has 20 rows so the product below
    # is conformable - TODO confirm.
    U = U[:20, :5]
    test_mean_square_error = U.T @ valid_x_cov @ U
    test_mean_square_error = test_mean_square_error @ Inv_S
    test_mean_square_error = np.sum(test_mean_square_error, axis=1)
    print("Test mean square error:{}\n".format(test_mean_square_error))
    print("********* Ridge Regression *********")
    ridge_regression = Ridge(alpha=0.5, fit_intercept=False)
    ridge_regression.fit(train_X, train_y)
    train_y_predictions = ridge_regression.predict(train_X)
    valid_y_predictions = ridge_regression.predict(valid_X)
    # The coefficients
    ridge_beta = ridge_regression.coef_
    print("True beta:{}\nRidge betas:{}\n".format(_beta, ridge_beta))
    print(
        "Training loss with true beta:{:.3f}\nValidation loss with true beta:{:.3f}"
        .format(compute_loss_square_with_true_betas(train_X, train_y, _beta),
                compute_loss_square_with_true_betas(valid_X, valid_y, _beta)))
    print("Training square loss:{:.3f}\nValidation square loss:{:.3f}".format(
        compute_square_loss(train_y, train_y_predictions),
        compute_square_loss(valid_y, valid_y_predictions)))
    # The mean squared error
    print('Mean squared error: {:.2f}, {:.2f}'.format(
        mean_squared_error(train_y, train_y_predictions),
        mean_squared_error(valid_y, valid_y_predictions)))
    # Same SVD check as above, this time on the training matrix only.
    U, S, V = np.linalg.svd(train_X, full_matrices=False)
    X_SVD = U @ np.diag(S) @ V
    print("Is X close to X_SVD?", np.isclose(train_X, X_SVD).all())
    print("Singular values:{}".format(S))
# Combine every predictor except humidity into one frame.
dataFrameExceptHumidity = pd.concat([outlook, temperature, windy, play], axis=1)

# DATA PREDICTION
# Split data into test and train sets.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    dataFrameExceptHumidity, humidity, test_size=0.33, random_state=0)

# Create the multiple regression model and predict.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)
y_prediction = regressor.predict(x_test)

# Backward elimination.
# OLS lives in statsmodels.api; statsmodels.formula.api only exposes the
# lowercase formula interface in current releases.
import statsmodels.api as sm
# Column of ones sized from the data instead of a hard-coded row count.
X = np.append(arr=np.ones((len(dataFrameExceptHumidity), 1)).astype(int),
              values=dataFrameExceptHumidity, axis=1)
X_l = dataFrameExceptHumidity.iloc[:, [0, 1, 2, 3, 4, 5]].values
r_ols = sm.OLS(endog=humidity, exog=X_l).fit()
print(r_ols.summary())

# Eliminate according to the p-values of r_ols: column 4 is dropped.
X_l = dataFrameExceptHumidity.iloc[:, [0, 1, 2, 3, 5]].values
r_ols = sm.OLS(endog=humidity, exog=X_l).fit()
import matplotlib.pyplot as plt
import pandas as pd

# Import dataset: first column is the feature, second column the target.
dataset = pd.read_csv('Salary_Data.csv')
X = dataset.iloc[:, :1].values
Y = dataset.iloc[:, 1].values

# Splitting into train and test data.
# sklearn.cross_validation was removed; train_test_split lives in model_selection.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1/3, random_state=0)

# Fit simple linear regression on the training set.
from sklearn.linear_model import LinearRegression
Regressor = LinearRegression()
Regressor.fit(X_train, Y_train)

# Predict the test set results for comparison.
y_pred = Regressor.predict(X_test)

# Training points with the fitted line.
plt.scatter(X_train, Y_train, color='red')
plt.plot(X_train, Regressor.predict(X_train), color='blue')
plt.title('Salary vs years of Experience')
plt.xlabel('Years of experience')
plt.ylabel('salary')
plt.show()

# Test points against the same fitted line (the line does not depend on
# which x values are used to draw it).
plt.scatter(X_test, Y_test, color='red')
plt.plot(X_train, Regressor.predict(X_train), color='blue')
else: if tid in transcript_counts: all_y.append(transcript_counts[tid]) all_x.append([float(x) for x in data[1:]]) trans_ids.append(tid) f.close() all_x = np.array(all_x) all_y = np.log2(all_y) print "normalizing data" #my_normalization = preprocessing.StandardScaler().fit(all_x) #all_x = my_normalization.transform(all_x) print "fitting linear regression" clf = LinearRegression(fit_intercept=True) clf = clf.fit(all_x, all_y) print "making predictions" predictions = clf.predict(all_x) count_mean = np.mean(all_y) residuals = all_y - predictions corrected_counts = residuals + count_mean out = open(args.output, 'w') for i in xrange(len(all_y)): out.write("%s\t%d\t%d\t%f\t%f\n" % (taxid_transcript[trans_ids[i]], trans_ids[i], transcript_counts[trans_ids[i]], rpkms[trans_ids[i]], 2** corrected_counts[i])) out.close()
grouped_test2 = df_gptest[['drive-wheels', 'price']].groupby(['drive-wheels']) # print(grouped_test2.head(2)) # print(df_gptest) grouped_test2.get_group('4wd')['price'] # ANOVA f_val, p_val = stats.f_oneway( grouped_test2.get_group('fwd')['price'], grouped_test2.get_group('rwd')['price'], grouped_test2.get_group('4wd')['price']) # print("ANOVA results: F=", f_val, ", P =", p_val) f_val, p_val = stats.f_oneway( grouped_test2.get_group('fwd')['price'], grouped_test2.get_group('rwd')['price']) # print("ANOVA results: F=", f_val, ", P =", p_val) f_val, p_val = stats.f_oneway( grouped_test2.get_group('4wd')['price'], grouped_test2.get_group('rwd')['price']) # print("ANOVA results: F=", f_val, ", P =", p_val) X = df[['highway-mpg']] Y = df['price'] lm.fit(X, Y) Yhat = lm.predict(X) print(Yhat[0:5])
##########################################################
# Linear Regression
##########################################################
def rmsle(y_pred, y_test):
    """Return the root mean squared logarithmic error.

    Args:
        y_pred: Predicted values; must be non-negative (asserted).
        y_test: Ground-truth values, same length as ``y_pred``.
    """
    assert len(y_test) == len(y_pred)
    assert (y_pred < 0).sum() == 0
    return np.sqrt(np.mean((np.log(1 + y_pred) - np.log(1 + y_test)) ** 2))


# try linear regression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
# Pass the predictions first, as the signature expects, so the non-negativity
# guard inspects the predictions rather than the targets.
baseline_error = rmsle(y_pred, y_test)
print(baseline_error)

##########################################################
# Ridge Regression
##########################################################
n_alphas = 100
alphas = np.logspace(-5, 5, n_alphas)
coefs = list()
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.linear_model import ElasticNet, LinearRegression

data, target = load_svmlight_file('E2006.train')

lr = LinearRegression(fit_intercept=True)

# KFold moved to model_selection: it takes n_splits and yields index pairs
# from split().
from sklearn.model_selection import KFold
kf = KFold(n_splits=10)
err = 0
for train, test in kf.split(data):
    lr.fit(data[train], target[train])
    # predict() handles the whole fold at once; no per-row map() is needed.
    p = np.asarray(lr.predict(data[test])).ravel()
    e = p - target[test]
    err += np.dot(e, e)

# Out-of-fold RMSE accumulated over all ten folds.
rmse_10cv = np.sqrt(err / len(target))

# In-sample RMSE after refitting on the full data set.
lr.fit(data, target)
p = np.asarray(lr.predict(data)).ravel()
e = p - target
total_error = np.dot(e, e)

rmse_train = np.sqrt(total_error / len(p))
print('RMSE on training: {}'.format(rmse_train))
'eight': 8, 'nine': 9, 'ten': 10, 'eleven': 11, 'twelve': 12, 'zero': 0, 0: 0 } return word_dict[word] # fit in experience column X['experience'] = X['experience'].apply(lambda x: convert_to_int(x)) y = dataset.iloc[:, -1] #spliting training set and test set #since we have too small data so we will train our model with all the availabel data. from sklearn.linear_model import LinearRegression regressor = LinearRegression() #fitting model with training data regressor.fit(X, y) # saviing model to disk pickle.dump(regressor, open('model.pkl', 'wb')) #loading model to prepare the result model = pickle.load(open('model.pkl', 'rb')) print(model.predict([[2, 9, 6]]))
# Calculate 251 day average closing price and standard deviation
# 251 because approx. 251 trading days per year according to
# https://tradingsim.com/blog/trading-days-in-a-year/
# shift(1) makes each row use only data from the preceding days.
data['close_year_mean'] = data.Close.rolling(251).mean().shift(1)
data['close_year_std'] = data.Close.rolling(251).std().shift(1)

# Calculate volume 251 day avg & std
data['vol_year_mean'] = data.Volume.rolling(251).mean().shift(1)
data['vol_year_std'] = data.Volume.rolling(251).std().shift(1)

# Drop null values
# First 251 rows where there wasn't enough data to calculate year_mean and year_std
data = data.dropna(axis=0)

# Use 2013-01-01 as date to begin testing
test_date = dt.datetime(year=2013, month=1, day=1)
train = data[data['Date'] < test_date]
test = data[data['Date'] >= test_date]

# Only use new columns as features
features = [i for i in data.columns if i.endswith('mean') or i.endswith('std')]
target = 'Close'

lr = LinearRegression()
lr.fit(train[features], train[target])
predictions = lr.predict(test[features])

# y_true goes first per the scikit-learn signature; ** 0.5 avoids the
# integer-division pitfall of (1 / 2) under Python 2.
rmse = mean_squared_error(test[target], predictions) ** 0.5
print('Root Mean Squared Error: ', rmse)
# Output: Root Mean Squared Error: 22.24912756194984
X = preprocessing.scale( X ) # scaled the Data this is a set of ADJ Close values for label these are the results or X #used to generate the model X_lately = X[-forcast_out:] X = X[:-forcast_out] #used for labels y = np.array(dfreg['Label']) y = y[:-forcast_out] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) len(X) #Linear LinearRegression clfreg = LinearRegression(n_jobs=-1) # -1 means uses all processors clfreg.fit(X_train, y_train) #Quaratic Regression clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge()) clfpoly2.fit(X_train, y_train) #Polynomial regresion of degree 3 (Cubic?) clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge()) clfpoly3.fit(X_train, y_train) #KNN Regression clfknn = KNeighborsRegressor(n_neighbors=2) clfknn.fit(X_train, y_train) #confidence scores
# In[10]: feature_cols = [ "Monthly Income", "Transaction Time", "Gender_Female", "Gender_Male", "City_Tier 1", "City_Tier 2", "City_Tier 3", "Record" ] # In[11]: X = df_new[feature_cols] Y = df_new["Total Spend"] # In[12]: lm = LinearRegression() lm.fit(X, Y) # In[13]: print(lm.intercept_) print(lm.coef_) # In[14]: list(zip(feature_cols, lm.coef_)) # In[15]: lm.score(X, Y) # El modelo puede ser escrito como:
# Attach the feature-selection results to the first model, then train it on
# the selected features.
gs_a.support = gsFeatureSupport
gs_a.selector = gsFeatureSelector
gs_a.fit(gsX,Y_train)  # Train the model
#%%
# Linear Regression model and feature selection
linearRegression = LinearRegression()
# Recursive feature elimination with 5-fold cross-validation.
linearFeatureSelector = RFECV(linearRegression, cv = 5).fit(X_train,Y_train)
LinearX = linearFeatureSelector.transform(X_train)
linearFeatureSupport = linearFeatureSelector.support_
# Store the selector on the model object so both travel together.
linearRegression.support = linearFeatureSupport
linearRegression.selector = linearFeatureSelector
# Train the linear regression model on the selected features only.
linearRegression.fit(LinearX,Y_train)
# %%
''' We can choose to open or not open a model '''
#with open('\Model_RandomForest.joblib', 'rb') as gs_a:
#gs_a = joblib.load('./Model_RandomForest.joblib')
""" Verification """
verifi_data = pd.read_csv("../Data/Verification_data.csv")
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# import data
data_set = pd.read_csv('Position_Salaries.csv')
x = data_set['Level']
y = data_set['Salary']

# split data to train & test sets
x_train, x_test, y_train, y_test = train_test_split(x, y)

# generate the polynomial features from x: x^0, x^1, x^2, ...
# pandas Series has no reshape(); go through the underlying NumPy array to get
# the single-column 2-D input that scikit-learn expects.
poly = PolynomialFeatures(degree=2)
x_train_poly = poly.fit_transform(x_train.values.reshape(-1, 1))

# train the model using the polynomial features
model = LinearRegression()
model.fit(x_train_poly, y_train)

# generate the polynomial features from the test set, then predict
x_test_poly = poly.transform(x_test.values.reshape(-1, 1))
y_pred = model.predict(x_test_poly)

# plot the results
plt.plot(x_train, y_train, 'ro', label='training_data')
plt.plot(x_test, y_test, 'bo', label='testing_data')
plt.plot(x_test, y_pred, 'go', label='predicted_data')
plt.legend()
plt.show()
#split our data set into the following parts np.random.seed(1) train, validate, test = np.split(df_clean.sample(frac=1), [int(.6*len(df_clean)), int(.8*len(df_clean))]) train_x= train.drop(['imdb_score'], axis=1) train_y=train['imdb_score'] test_x= test.drop(['imdb_score'], axis=1) test_y=test['imdb_score'] ################################################################################################################### #linear regression ################################################################################################################### # train our algorithm regressor = LinearRegression() results=regressor.fit(train_x, train_y) #training the algorithm X2 = sm.add_constant(train_x) est = sm.OLS(train_y, X2) est2 = est.fit() print(est2.summary()) #test our algorithm pred = results.predict(test_x) #compare actual vs predicted values df_output = pd.DataFrame({'Actual': test_y, 'Predicted': pred}) df_output # Calculate mean absolute percentage error (MAPE)
def main(regressor="random_forest"):
    """
    The main method

    Loads the data, prepares it with the preprocessing pipeline, trains the
    selected regressor, optionally fine-tunes it, and prints the RMSE on the
    held-out test set.

    Args:
        regressor: One of "linear", "random_forest", "decision_tree", "svr".

    Raises:
        ValueError: If ``regressor`` is not one of the recognised names.
    """
    # Fetch data from internet
    data = fetch_and_load_data()

    # Process median_income into categories
    data["income_cat"] = np.ceil(data["median_income"] / 1.5)
    # Cap the categories at 5.  Assign the result back instead of calling
    # where(..., inplace=True) on the selected column: the in-place form acts
    # on a temporary object and is not guaranteed to modify `data`.
    data["income_cat"] = data["income_cat"].where(data["income_cat"] < 5, 5.0)

    # Split data into training and testing sets
    train_data, test_data = split_train_test_stratified(data, "income_cat")

    # Extract labels and housing data
    housing_labels = train_data["median_house_value"].copy()
    housing = train_data.drop("median_house_value", axis=1)

    # Split housing into categorical and numerical data
    cat_attributes = ["ocean_proximity"]
    num_attributes = ['longitude', 'latitude', 'housing_median_age',
                      'total_rooms', 'total_bedrooms', 'population',
                      'households', 'median_income']

    # Set up pipeline to prepare data with.
    full_pipeline = setup_pipeline(num_attributes, cat_attributes)

    # Prepare the data
    housing_prepared = full_pipeline.fit_transform(housing)
    print()

    # Select the appropriate regressor
    if regressor == "linear":
        reg_model = LinearRegression()
        reg_name = "Linear Regressor"
    elif regressor == "random_forest":
        reg_model = RandomForestRegressor()
        reg_name = "Random Forest Regressor"
    elif regressor == "decision_tree":
        reg_model = DecisionTreeRegressor()
        reg_name = "Decision Tree Regressor"
    elif regressor == "svr":
        reg_model = SVR(kernel="linear", gamma='auto')
        reg_name = "Support Vector Machine"
    else:
        error_mes = "Regressor '{regressor}' not recognised."
        raise ValueError(error_mes.format(regressor=regressor))

    # Train regression model
    reg_model.fit(housing_prepared, housing_labels)
    display_model_performance(reg_model, housing_prepared, housing_labels,
                              reg_name)

    if regressor == "random_forest":
        # Fine tune Random Forest
        param_grid = [
            {'n_estimators': [50, 100, 1000], 'max_features': [2, 4, 6, 8]},
            {'bootstrap': [False], 'n_estimators': [50, 100, 1000],
             'max_features': [2, 4, 6]}]
        final_model = fine_tune_model(RandomForestRegressor(), param_grid,
                                      housing_prepared, housing_labels)
        # Get the best model weights
        print()
        print("Attribute weights:")
        feature_importances = final_model.feature_importances_
        print_attribute_importances(feature_importances, num_attributes,
                                    full_pipeline)
    elif regressor == "linear":
        final_model = reg_model
        print("Coefficients used by linear model:")
        coeffs = final_model.coef_
        print_attribute_importances(coeffs, num_attributes, full_pipeline)
    elif regressor == "decision_tree":
        # Fine tune Decision Tree
        # NOTE(review): "mse"/"mae" are older scikit-learn criterion names;
        # verify them against the installed version.
        param_grid = [{'criterion': ["mse", "friedman_mse", "mae"]}]
        final_model = fine_tune_model(DecisionTreeRegressor(), param_grid,
                                      housing_prepared, housing_labels)
    elif regressor == "svr":
        param_grid = [
            {'kernel': ["linear"], "C": [10000, 100000]},
            {'kernel': ["rbf"], "C": [10000, 100000],
             "gamma": [0.045, 0.05, 0.055]}]
        final_model = fine_tune_model(SVR(), param_grid, housing_prepared,
                                      housing_labels)
    else:
        # Defensive fallback; unknown names already raised above.
        final_model = reg_model
    print()

    # Evaluate on test set
    X_test = test_data.drop("median_house_value", axis=1)
    y_test = test_data["median_house_value"].copy()
    X_test_prepared = full_pipeline.transform(X_test)
    final_predictions = final_model.predict(X_test_prepared)
    final_mse = mean_squared_error(y_test, final_predictions)
    final_rmse = np.sqrt(final_mse)
    print("Final Standard Error:", final_rmse)
# Number of rows (trading days) to forecast ahead.
forecast_out = 30

# Label each row with the 'Adj Close' value forecast_out rows later.
df['Prediction'] = df[['Adj Close']].shift(-forecast_out)

# Features are every column except the label.  `axis` is passed by keyword:
# current pandas no longer accepts it positionally.
X = np.array(df.drop(['Prediction'], axis=1))
X = preprocessing.scale(X)

X_forecast = X[-forecast_out:]  # last rows: no label yet, used for forecasting
X = X[:-forecast_out]  # remaining rows are used for training and testing

y = np.array(df['Prediction'])
y = y[:-forecast_out]

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.2)

# Training
clf = LinearRegression()
clf.fit(X_train, y_train)

# Testing: score() returns R^2 on the held-out split.
confidence = clf.score(X_test, y_test)
print("confidence: ", confidence)

forecast_prediction = clf.predict(X_forecast)
print(forecast_prediction)
import pandas as pd
import numpy as np

## Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 4].values

## Encoding Categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelEncoder = LabelEncoder()
X[:, 3] = labelEncoder.fit_transform(X[:, 3])
# print(X[:, 3])
# One-hot encode ONLY column 3.  Fitting OneHotEncoder on the whole matrix
# would treat every numeric value as its own category.  The dummy columns
# come first in the output, followed by the untouched remaining columns.
onehotencoder = ColumnTransformer(
    [('categorical', OneHotEncoder(categories='auto'), [3])],
    remainder='passthrough',
    sparse_threshold=0,  # always return a dense array
)
X = np.asarray(onehotencoder.fit_transform(X), dtype=float)

## Avoiding Dummy Variable Trap
# Drop the first dummy column.
X = X[:, 1:]

## Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
# print(X_train, '\n', X_test, '\n', Y_train, '\n', Y_test)

# Step2: Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, Y_train)

# Step3: Predicting the Test set results
Y_pred = regressor.predict(X_test)
print('Y_test = ', Y_test, '\n', 'Y_pred = ', Y_pred)
# Importar dataset dataset = pd.read_csv('Position_Salaries.csv') X = dataset.iloc[:, 1:2].values #Variables independientes o predictoras y = dataset.iloc[:, 2].values # Tiene que ser una matriz, no un vector # No hace falta dividir en datatest y datatrain porque hay pocos datos """ from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) """ # Comprobar la regresion lineal y comprobar from sklearn.linear_model import LinearRegression lin_reg = LinearRegression() lin_reg.fit(X, y) # Ajustar la regresión polinómica con todo el dataset from sklearn.preprocessing import PolynomialFeatures poly_reg = PolynomialFeatures(degree = 4) X_poly = poly_reg.fit_transform(X) # Fit solo crea el modelo, y el transform aplica los cambios lin_reg_2 = LinearRegression() # Es la misma, pero es una regresion LINEAL polinomica lin_reg_2.fit(X_poly, y) """ Muchas librerias necesitan una columa de unos a la izquierda""" """Primero se modela las variables, y luego regresion lineal""" # Visualizacion de los resultados del modelo lineal plt.scatter(X, y, color = "red")
# Two identical 1-D ramps 0..99: a perfectly linear relationship.
x = np.arange(100)
y = np.arange(100)
print(x)
print(y)

lr = LinearRegression()
# Fitting at this point would fail: x is 1-D with shape (100,), while fit()
# expects a 2-D feature matrix.  Turn x into a single column of shape (100, 1).
x = x[:, np.newaxis]
lr.fit(x, y)  # now it works!

plt.scatter(x, y, color='red')
plt.plot(x, lr.predict(x), color='blue')
plt.title('Linear Regression Demo')
plt.xlabel('X')
plt.ylabel('Y')
plt.show()
print(seenMovie)
print(metadata)
print("Data loaded")
print(seenMovie.shape, '\t', metadata.shape)
seenMovie = seenMovie.astype('int')
# Split into train and test sets, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(metadata, seenMovie, test_size=0.3, random_state=1, shuffle=True, stratify=seenMovie)
# Model 2: non-negative least squares (all coefficients constrained to be >= 0).
reg_nnls = LinearRegression(positive=True)
y_pred_nnls = reg_nnls.fit(X_train, y_train).predict(X_test)
r2_score_nnls = r2_score(y_test, y_pred_nnls)
print("NNLS R2 score", r2_score_nnls)
# NOTE(review): the regression outputs are passed to log_loss as if they were
# probabilities, and the `eps` argument may not be accepted by newer
# scikit-learn releases - verify against the installed version.
logLossVal_nnls = log_loss(y_test, y_pred_nnls, eps=1e-15, normalize=True, sample_weight=None, labels=None)
# Rescale targets and predictions to [0, 1] before computing the squared error.
scaled_test = minmax_scale(y_test, feature_range=(0, 1))
scaled_pred = minmax_scale(y_pred_nnls, feature_range=(0, 1))
mse_2 = calculateMeanSquareError(scaled_test, scaled_pred)
# m2_recall = recall_score(y_test, y_pred_nnls, average='binary')
# m2_precision = precision_score(y_test, y_pred_nnls, average='binary')
print("LogLoss Model 2: ", logLossVal_nnls)
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("../../../data/Position_Salaries.csv")
# Column 1 is the feature, the remaining columns from 2 on are the target;
# both are kept 2-D.
X = df.iloc[:, 1:2].values
Y = df.iloc[:, 2:].values

# Expand the feature into degree-2 polynomial terms.
from sklearn.preprocessing import PolynomialFeatures
poly_feature = PolynomialFeatures(degree=2)
X_poly = poly_feature.fit_transform(X)

from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_poly, Y)

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.scatter(X, Y, color='r')
# Dense grid for a smooth curve.  X.min()/X.max() give true scalars; the
# builtin min()/max() on a 2-D array return 1-element arrays, which NumPy no
# longer converts to scalars silently.
X_grid = np.arange(X.min(), X.max(), 0.1)
X_grid = X_grid.reshape((len(X_grid), 1))
# The transformer is already fitted: only transform at prediction time.
ax.plot(X_grid, lin_reg.predict(poly_feature.transform(X_grid)))
ax.set_title('level-salary curve')
ax.set_xlabel('level')
ax.set_ylabel('salary')
plt.show()
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Feature scaling of train and test data.
# Not needed, as sklearn.linear_model will take care of it itself.
#from sklearn.preprocessing import StandardScaler
#sc_X = StandardScaler()
#X_train = sc_X.fit_transform(X_train) # fit and transform
#X_test = sc_X.transform(X_test) # the scaler is already fitted, so only transform here
# No need to do feature scaling for y as there are only two values (yes/no).
# Fitting simple linear regression to the training set.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
# To retrieve the intercept
r_intercept = regressor.intercept_
# For retrieving the slope (coefficient of x)
r_coef = regressor.coef_
# Predicting the test set result.
y_pred = regressor.predict(X_test)
# Visualization of results.
# First plot predictions for the training set and compare with ground truth.
plt.scatter(X_train, y_train, color = 'red')  # Ground truth values for train
plt.plot(X_train, regressor.predict(X_train), color = 'blue')  # Predicted values for train
plt.title("Best Prices vs List Prices - Training")
import pandas
from sklearn.linear_model import LinearRegression

# Fit price as a linear function of the version number.
data = pandas.read_csv('iphone_price.csv')
model = LinearRegression()
model.fit(data[['version']], data[['price']])

# Extrapolate to three later versions, printing one prediction per line.
for version in (20, 25, 30):
    print(model.predict([[version]]))
best_lasso_mse = None # Split data into Training and Test part (85 x 1 vector y_train; 85 x 7 matrix X_train; 18 x 1 vector y_test; and, 85 x 7 x_test) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=18, train_size=85) # X_train and y_train further split on each iteration for 5-fold validation for train_index, test_index in kf.split(X_train): X_train1, X_cv = X_train.iloc[train_index], X_train.iloc[test_index] y_train1, y_cv = y_train.iloc[train_index], y_train.iloc[test_index] # Train the linear model and save if it is best model based on score lr_model = LinearRegression() lr_model.fit(X_train1, y_train1) mse1 = mean_squared_error(y_cv, lr_model.predict(X_cv)) if linear_cv_mse == [] or mse1 < min(linear_cv_mse): best_lm_mse = mse1 lr_model_best = lr_model linear_cv_mse.append(mse1) # Train the ridge model and save if it is best model rg_model = Ridge(alpha=20) rg_model.fit(X_train1, y_train1) mse2 = mean_squared_error(y_cv, rg_model.predict(X_cv)) if ridge_cv_mse == [] or mse2 < min(ridge_cv_mse): best_rg_mse = mse2 rg_model_best = rg_model ridge_cv_mse.append(mse2)
random_state=42) #Feature Scaling the data # You might want to make this optional from sklearn.preprocessing import StandardScaler fs_X = StandardScaler() fs_y = StandardScaler() X_train = fs_X.fit_transform(X_train) X_test = fs_X.transform(X_test) y_train = fs_y.fit_transform(np.array(y_train).reshape(-1, 1)) y_test = fs_y.transform(np.array(y_test).reshape(-1, 1)) # Linear Regression from sklearn.linear_model import LinearRegression lm = LinearRegression() lm.fit(X_train, y_train) predictions_lin = lm.predict(X_test) # Polynomial Regression using degree 3 from sklearn.preprocessing import PolynomialFeatures poly = PolynomialFeatures( degree=2) # You might want to try with different values of degree X_poly = poly.fit_transform(X_train) poly.fit(X_poly, y_train) lm2 = LinearRegression() lm2.fit(X_poly, y_train) predictions_poly = lm2.predict(poly.fit_transform(X_test)) #Suppor Vector Regression
# Splitting the dataset into the Training set and Test set from sklearn.model_selection import train_test_split #We will split 10 to test, 20 to train X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 / 3, random_state=0) #No need for feature scaling #Fitting Simple Lin Regression model to Training Set from sklearn.linear_model import LinearRegression regressor = LinearRegression() #We are fine with default parameters regressor.fit( X_train, y_train) #machine is the regressor, made it learn on the training set #Machine can now based on its learning experience predict the new salary #Regressor learned the correlations between experience and salary #Predicting the test results - create a vector of predictions y_pred = regressor.predict( X_test) #vector of predictions of dependant variable #The predictions are pretty damn close #Visualizing the results with matplotlib plt.scatter(X_train, y_train, color='red') #plots the real values plt.plot(X_train, regressor.predict(X_train), color='blue') #shows the comparisn between X_train and predictions
pos = input('Enter your Position Level (1-10)') act_sal = input('Enter your Salary') error = 0.1 dataset = pd.read_csv('Position_Salaries.csv') idm = dataset.iloc[:,1:2].values dm = dataset.iloc[:,2].values from sklearn.preprocessing import PolynomialFeatures poly_reg = PolynomialFeatures(degree = 5) idm_poly = poly_reg.fit_transform(idm) poly_reg.fit(idm_poly,dm) from sklearn.linear_model import LinearRegression lin_reg = LinearRegression() lin_reg.fit(idm_poly,dm) idm_grid = np.arange(min(idm),max(idm),0.01) idm_grid = idm_grid.reshape(len(idm_grid),1) mpt.scatter(idm,dm,color='red') mpt.plot(idm_grid,lin_reg.predict(poly_reg.fit_transform(idm_grid)),color='blue') mpt.title('Truth or Bluff - Polynomial Regression') mpt.xlabel('Position Level') mpt.ylabel('Salary') mpt.show() prdt_sal = lin_reg.predict(poly_reg.fit_transform(pos)) err=(prdt_sal-float(act_sal))/prdt_sal if(err<=error): print('Truth') else: