def solve_linear_regression(self):
    """Fit, report and plot a linear model, then repeat the fit with sklearn.

    Trains ``MyLinearRegression`` on ``self.train_inputs`` /
    ``self.train_outputs``, prints the fitted formula and the value of
    ``self.mean_square_error`` for its predictions on ``self.test_inputs``,
    then does the same with ``LinearRegression`` for comparison.  When each
    training sample has one or two features, the data split, the fitted
    model and the test predictions are plotted as well.
    """
    feature_count = len(self.train_inputs[0])

    def axis_labels():
        # Built anew on every call so each plotting helper gets its own list.
        names = [self.input_features[k] for k in range(feature_count)]
        return names + ["Happiness"]

    def formula(intercept, coefficients):
        # Renders "f(x) = <intercept> + <b1>*x1 + <b2>*x2 ...".
        pieces = ["f(x) = " + str(intercept)]
        pieces.extend(
            str(weight) + "*x" + str(position)
            for position, weight in enumerate(coefficients, start=1)
        )
        return " + ".join(pieces)

    # Plot the train/test split (only possible for 1 or 2 features).
    if feature_count == 1:
        plot_data_split_simple(
            self.train_inputs, self.train_outputs,
            self.test_inputs, self.test_outputs,
            axis_labels())
    elif feature_count == 2:
        plot_data_split_multiple(
            self.train_inputs, self.train_outputs,
            self.test_inputs, self.test_outputs,
            axis_labels())

    # Own implementation: fit, describe, evaluate on the test set.
    own_model = MyLinearRegression()
    own_model.fit(self.train_inputs, self.train_outputs)
    weights = own_model.b
    print("model: " + formula(own_model.intercept, weights))

    own_predictions = own_model.predict(self.test_inputs)
    own_error = self.mean_square_error(own_predictions)
    print("prediction error: " + str(own_error))

    # Plot the fitted model and its test predictions (1 or 2 features only).
    if feature_count == 1:
        plot_model_simple(
            self.train_inputs, self.train_outputs,
            weights[0], own_model.intercept,
            axis_labels())
        plot_test_results_simple(
            self.test_inputs, self.test_outputs, own_predictions,
            axis_labels())
    elif feature_count == 2:
        # The coefficients are handed over as (second, first), as before.
        plot_model_multiple(
            self.train_inputs, self.train_outputs,
            weights[1], weights[0], own_model.intercept,
            axis_labels())
        plot_test_results_multiple(
            self.test_inputs, self.test_outputs, own_predictions,
            axis_labels())

    # Reference run with sklearn for comparison.
    reference_model = LinearRegression()
    reference_model.fit(self.train_inputs, self.train_outputs)
    print("model sk: "
          + formula(reference_model.intercept_, reference_model.coef_))

    reference_predictions = reference_model.predict(self.test_inputs)
    reference_error = self.mean_square_error(reference_predictions)
    print("prediction error sk: " + str(reference_error))
# Training set: the rows of `merged` whose rating_x is not -1 (the rated
# movies).  The target is rating_x; the features are every remaining column
# except user_id.
train = merged[merged["rating_x"] != -1]  # rated movies
y_train = train["rating_x"]
X_train = train.drop(["rating_x", "user_id"], axis=1)

# %%model evaluation
# 5-fold cross-validation of MyLinearRegression, scored by RMSE per fold.
# Alternatives the author left commented out here: sklearn's Lasso as a
# reference benchmark, and MyLinearRegression(poly_degree=2).
kf = KFold(5)
lr_rmse = []
for train_index, test_index in kf.split(X_train):
    lr_X_train, lr_X_test = X_train.iloc[train_index], X_train.iloc[test_index]
    lr_y_train, lr_y_test = y_train.iloc[train_index], y_train.iloc[test_index]

    lr = MyLinearRegression()  # fresh, unfitted model for every fold
    lr.fit(lr_X_train, lr_y_train)

    # Assuming this is sklearn.metrics.mean_squared_error, its documented
    # argument order is (y_true, y_pred).  Plain MSE gives the same value
    # either way; the call now simply matches the API.
    lr_rmse.append(
        np.sqrt(mean_squared_error(lr_y_test, lr.predict(lr_X_test))))

print("LR, 5fold RMSE ", np.mean(lr_rmse))

# %%mlp
# Fully connected network with a single output unit: two 400-unit sigmoid
# layers, batch normalization before each of them, and dropout (rate 0.4)
# between them.  The last Dense layer specifies no activation.
# NOTE(review): the input width 48 is hard-coded; presumably it equals the
# number of columns in X_train -- confirm before changing the feature set.
mlp = tf.keras.models.Sequential()
mlp.add(tf.keras.layers.Input([48, ]))
mlp.add(tf.keras.layers.BatchNormalization())
mlp.add(tf.keras.layers.Dense(400, activation="sigmoid"))
mlp.add(tf.keras.layers.BatchNormalization())
mlp.add(tf.keras.layers.Dropout(0.4))
mlp.add(tf.keras.layers.Dense(400, activation="sigmoid"))
mlp.add(tf.keras.layers.Dense(1))
# Target column: the last column of `df`, as a column vector of shape (n, 1).
Y = np.array(df.iloc[:, -1]).reshape(-1, 1)

pkl = DataHandler(ARGS)
if ARGS.load:
    # Resume from a previous run: reuse the stored preprocessing objects and
    # the stored theta instead of starting from scratch.
    PreP_x, PreP_y, theta = pkl.load()
    if PreP_x.scaler:
        X = PreP_x.re_apply_minmax(X)
    if PreP_y.scaler:
        Y = PreP_y.re_apply_minmax(Y)
    # Presumably re_apply_minmax returns None on failure -- confirm.
    # NOTE(review): sys.exit() with no argument exits with status 0, so a
    # failure here is indistinguishable from success for the calling shell.
    if X is None or Y is None:
        sys.exit()
else:
    # Fresh run: preprocess both arrays with the requested scaler and start
    # from an all-ones theta with one more entry than X has columns.
    PreP_x = Preprocessing(X, scaler=ARGS.scaler)
    PreP_y = Preprocessing(Y, scaler=ARGS.scaler)
    X = PreP_x.data
    Y = PreP_y.data
    theta = [1] * (X.shape[1] + 1)

lr = MyLinearRegression(
    theta, alpha=ARGS.alpha, n_cycle=ARGS.n_cycle, visual=ARGS.visual)
err = lr.fit(X, Y)
# A None result from fit() is treated as failure; nothing is saved then.
if err is None:
    sys.exit()

# Persist the preprocessing objects together with the trained theta.
pkl.save(PreP_x, PreP_y, lr.theta)
if ARGS.visual:
    lr.plot_results(X, Y)