def fitting():
    """Fit a two-feature (area, bedroom) -> price model and plot diagnostics."""
    data = pd.read_csv('house_data.txt', names=['area', 'bedroom', 'price'])
    features = data[['area', 'bedroom']]
    # Mean-normalize each feature, scaled by its range
    features = (features - features.mean()) / (features.max() - features.min())
    target = data['price']

    plt.figure(figsize=(10, 6))
    plt.subplot(2, 2, 1)
    plt.plot(features['area'], target, 'rx')
    plt.title('area-price')
    plt.subplot(2, 2, 2)
    plt.plot(features['bedroom'], target, 'bx')
    plt.title('bedroom-price')

    alpha, max_iter = 10, 50
    model = LinearRegression(alpha, max_iter)
    loss, _ = model.fit(features.values, target.values)

    # Loss curve across the bottom half of the figure
    plt.subplot(2, 1, 2)
    plt.plot(np.arange(1, max_iter + 1), loss)
    plt.subplots_adjust(hspace=0.4)
    plt.show()
def main():
    """Predict skin-cancer mortality from latitude, then cluster (lat, lon) points."""
    # NOTE(Jovan): Load data
    data = pd.read_csv("data/skincancer.csv", delimiter=',', index_col=0)
    mort = data.Mort.values
    lat = data.Lat.values
    lon = data.Long.values

    # NOTE(Jovan): Fit latitude -> mortality and predict for Hawaii's latitude
    lin_reg = LinearRegression(lat, mort)
    hawaii = lin_reg.predict(20)
    print("Prediction for hawaii[lat=20]:", hawaii)

    # NOTE(Jovan): Build KMeans over the (lat, lon) points and split in two
    k_means = KMeans()
    for latitude, longitude in zip(lat, lon):
        k_means.points.append(Point(latitude, longitude))
    k_means.split(2, 0.01)

    # NOTE(Jovan): Plot both clusters, then their centers, in matching colors
    fig = plt.figure()
    ax = fig.add_axes([0, 0, 1, 1])
    colors = ("#ff0000", "#00ff00")
    for cluster, color in zip(k_means._clusters[:2], colors):
        for p in cluster.points:
            ax.scatter(p.x, p.y, c=color)
    for cluster, color in zip(k_means._clusters[:2], colors):
        ax.scatter(cluster.center.x, cluster.center.y, marker="P", c=color)
    plt.show()
def test_l2_regularization_gradient():
    """The L2 gradient must equal the weights (or twice them, depending on convention)."""
    from linear_regression import LinearRegression
    model = LinearRegression(input_dimensions=2)
    w = np.float32([[1, 2, 4]]).T
    model.weights = w.copy()
    gradient = model._l2_regularization_gradient()
    matches_w = np.allclose(gradient, w, rtol=1e-3, atol=1e-3)
    matches_2w = np.allclose(gradient, 2 * w, rtol=1e-3, atol=1e-3)
    assert matches_w or matches_2w
def visual_3d():
    """Plot the cost surface J(theta0, theta1) and the gradient-descent path on it."""
    data = pd.read_csv('food_profit.txt', names=['population', 'profit'])
    x = data['population']
    y = data['profit']
    # NOTE(review): indexing a pandas Series with [:, np.newaxis] — confirm this
    # works under the pinned pandas version; .values may be intended.
    x = x[:, np.newaxis]
    x = np.hstack((np.ones_like(x), x))  # prepend a bias column

    # Size the grids from the parameter ranges. The original allocated
    # (40, 50) arrays while the inner range has only 10 steps, leaving a
    # large unfilled zero region in the plotted surface.
    t0_range = np.arange(-10, 10, .5)
    t1_range = np.arange(-1, 4, .5)
    theta0 = np.zeros((t0_range.size, t1_range.size))
    theta1 = np.zeros((t0_range.size, t1_range.size))
    jvals = np.zeros((t0_range.size, t1_range.size))
    for i, t0 in enumerate(t0_range):
        for j, t1 in enumerate(t1_range):
            theta0[i, j] = t0
            theta1[i, j] = t1
            # Half mean squared error for this (t0, t1) pair
            jvals[i, j] = 0.5 * np.mean((x.dot(np.array([t0, t1])) - y) ** 2)

    import mpl_toolkits.mplot3d  # noqa: F401 -- registers the '3d' projection
    # plt.gca(projection='3d') was removed in matplotlib 3.6; create the axes
    # explicitly instead.
    ax = plt.figure().add_subplot(projection='3d')
    ax.plot_surface(theta0, theta1, jvals, cmap=plt.get_cmap('BuPu_r'))

    alpha = 0.01
    max_iter = 1500
    model = LinearRegression(alpha, max_iter)
    loss, w_list = model.fit(x, y)
    w_list = np.array(w_list)
    # Start (red x), trajectory (dots), and end point (green x) of descent
    plt.plot([w_list[0, 0]], [w_list[0, 1]], [loss[0]], 'rx')
    plt.plot(w_list[:, 0], w_list[:, 1], loss, 'o')
    plt.plot([w_list[-1, 0]], [w_list[-1, 1]], [loss[-1]], 'gx')
    plt.show()
def test_fit_functional():
    """End-to-end: SGD on a noiseless plane recovers the true weights exactly."""
    import sklearn.model_selection
    import numpy as np
    from linear_regression import LinearRegression

    num_samples = 30
    axis = np.linspace(-5, 5, num_samples)
    XX, YY = np.meshgrid(axis, axis)
    X = np.zeros((900, 3), dtype=np.float32)
    X[:, 0] = XX.flatten()
    X[:, 1] = YY.flatten()
    X[:, -1] = 1  # bias trick: constant-1 column
    Z = 0.1 * XX + 0.2 * YY + 0.4
    y = Z.reshape(-1, 1)

    X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(
        X, y)
    model = LinearRegression(input_dimensions=2)
    train_mse, val_mse = model.fit(X_train, y_train, X_val, y_val,
                                   num_epochs=20, batch_size=4,
                                   alpha=0.1, _lambda=0.0)
    final_train_mse = train_mse[-1]

    desired_weights = np.float32([[0.1, 0.2, 0.4]]).T
    np.testing.assert_allclose(model.weights, desired_weights,
                               rtol=1e-3, atol=1e-3)
    # Progressively tighter bounds on the final training error
    assert final_train_mse < 0.001
    assert final_train_mse < 0.00001
    assert final_train_mse < 1e-10
def test_predict():
    """predict() computes X @ w for a bias-trick input matrix."""
    from linear_regression import LinearRegression
    model = LinearRegression(input_dimensions=2)
    model.weights = np.float32([[1, 2, 4]]).T
    inputs = np.float32([[1, 2, 1], [0, 0, 1]])
    expected = np.float32([[9, 4]]).T
    np.testing.assert_allclose(model.predict(inputs), expected,
                               rtol=1e-3, atol=1e-3)
def test_trains(self):
    """train() returns a new model whose thetas fit the training set."""
    untrained = LinearRegression('SquaredError')
    trained = untrained.train(self.training_set, range(-20, 20), range(-20, 20))
    self.assertEqual(trained.theta_0, 0)
    self.assertEqual(trained.theta_1, 1)
def test_mse_gradient():
    """_mse_gradient returns the known analytic gradient for a fixed batch."""
    from linear_regression import LinearRegression
    model = LinearRegression(input_dimensions=2)
    model.weights = np.float32([[1, 2, 4]]).T
    batch_X = np.float32([[1, 2, 1], [0, 0, 1]])
    batch_y = np.float32([[10, 2]]).T
    expected = np.float32([[-0.5, -1., 0.5]]).T
    np.testing.assert_allclose(model._mse_gradient(batch_X, batch_y), expected,
                               rtol=1e-3, atol=1e-3)
class TestLinearRegression(unittest.TestCase):
    """Unit tests for the LinearRegression model."""

    def setUp(self):
        self.model_simple = LinearRegression(1)
        self.model_multiple = LinearRegression(2)

    def test_mean_squared_error(self):
        pred = np.array([0, 0])
        label = np.array([1, 1])
        err = self.model_simple._mean_squared_error(pred, label)
        exp = 1
        # BUG FIX: assertTrue(err, exp) treated `exp` as the failure *message*
        # and passed for any truthy error value; compare the values instead.
        self.assertAlmostEqual(err, exp)
        label2 = np.array([2, 4])
        err2 = self.model_simple._mean_squared_error(pred, label2)
        exp2 = 10
        self.assertAlmostEqual(err2, exp2)

    def test_predict(self):
        # Both W and b are 0 after initialization, so predictions are 0
        x1 = np.array([1, 2])
        pred1 = self.model_simple.predict(x1)
        exp1 = np.array([0, 0])
        self.assertTrue(np.array_equal(pred1, exp1))
        x2 = np.array([[1, 1], [2, 2]])
        pred2 = self.model_multiple.predict(x2)
        exp2 = np.array([0, 0])
        self.assertTrue(np.array_equal(pred2, exp2))

    def test_train(self):
        # y = x + 0
        x1 = np.array([2, 4])
        y1 = np.array([2, 4])
        m1 = LinearRegression(1)
        m1.train(x1, y1, n_iter=1, lr=0.1)
        # expected W and b after 1 iteration with lr 0.1
        exp_W1 = np.array([1.0])
        exp_b1 = 0.3
        self.assertTrue(np.array_equal(m1.W, exp_W1))
        self.assertAlmostEqual(m1.b[0], exp_b1)
        # y = x1 + x2 + 0
        x2 = np.array([[2, 2], [4, 4]])
        y2 = np.array([4, 8])
        m2 = LinearRegression(2)
        m2.train(x2, y2, n_iter=1, lr=0.1)
        # expected W and b after 1 iteration with lr 0.1
        exp_W2 = np.array([2.0, 2.0])
        exp_b2 = 0.6
        self.assertTrue(np.array_equal(m2.W, exp_W2))
        self.assertAlmostEqual(m2.b[0], exp_b2)
def main():
    """Run value prediction with no data file, reporting known failures in red."""
    def report(label, err):
        # Bold red prefix followed by the exception text
        print(Style.BRIGHT + Fore.RED + label + Style.RESET_ALL + Fore.RESET + str(err))

    try:
        model = LinearRegression(filename=None, mode='No_file')
        model.predict_data_value()
    except IOError as e:
        report('I/O Error: ', e)
    except ParserException as e:
        report('ParserException: ', e)
    except LogisticRegressionException as e:
        report('Logistic Regression Exception: ', e)
def learning_curve(X, y, Xval, yval, ilambda):
    """Compute the data for a learning curve.

    For each training-set size i+1 (i = 0..m-1), fits a regularized linear
    regression on the first i+1 examples and records the (unregularized)
    training error and cross-validation error.

    Returns:
        (error_train, error_val): two (m, 1) arrays of costs.
    """
    m = X.shape[0]
    error_train = np.zeros((m, 1))
    error_val = np.zeros((m, 1))
    for i in range(m):
        # Fit on the first i+1 training examples
        my_lr = LR(X[0:i + 1, :], y[0:i + 1])
        my_lr.gradient_descent_reg(0.001, ilambda, 5000)
        theta = my_lr.theta
        # Errors are evaluated with lambda=0 so the regularization term
        # does not inflate the reported cost.
        error_train[i], _ = my_lr.compute_cost_reg(theta, 0, X=X[:i + 1, :], y=y[:i + 1])
        error_val[i], _ = my_lr.compute_cost_reg(theta, 0, X=Xval, y=yval)
    return error_train, error_val
def test_plot_data():
    """plot_data should not raise on a simple 4-point dataset."""
    xs = [1, 2, 3, 4]
    ys = [2, 3, 4, 5]
    model = LinearRegression(xs, ys, 50, 0.01)
    try:
        model.plot_data(xs, ys, 'X_VALS', 'Y_VALS', 'TEST_CHART')
    except Exception as e:
        print('Test failed it exception: {}'.format(e))
    else:
        print('pass')
def test_load_data():
    """load() should return a non-None feature table for the sample data file."""
    model = LinearRegression([], [], 50, 0.01)
    try:
        features = model.load('./data/test_data1.txt')
    except Exception as e:
        print('Test failed it exception: {}'.format(e))
        return
    print('pass' if features is not None else 'fail')
def test_train_on_batch():
    """A single batch update moves the weights by the expected delta."""
    from linear_regression import LinearRegression
    model = LinearRegression(input_dimensions=2)
    initial = np.float32([[1, 2, 4]]).T
    model.weights = initial.copy()
    batch_X = np.float32([[1, 2, 1], [0, 0, 1]])
    batch_y = np.float32([[10, 2]]).T
    model._train_on_batch(batch_X, batch_y, 0.3, _lambda=0.001)
    expected_delta = np.float32([[-0.14970, -0.29940, 0.15120]]).T
    np.testing.assert_allclose(initial - model.weights, expected_delta,
                               rtol=1e-3, atol=1e-3)
def main():
    """Fit a 1-D linear model, visualize it, and write test predictions to CSV."""
    # Training matrices for the model
    x, y = get_train_matrices()
    model = LinearRegression(x, y)
    # learning rate 0.01, 1000 iterations, no L2 regularization
    model.fit(0.01, 1000, 0)
    y_pred = model.predict(x)
    # Training scatter plus the fitted line
    plt.scatter(x, y)
    plt.plot(x, y_pred)
    plt.show()
    # Report parameters and performance metrics
    print("Weights: {}\nBiases: {}".format(model.w, model.c))
    model.validate()
    # Score the held-out inputs and persist next to the test data
    x_test = pd.read_csv('test/input.csv')['x'].values.reshape(-1, 1)
    y_test = model.predict(x_test)
    pd.DataFrame({'y': y_test.reshape(-1)}).to_csv('test/output.csv')
def main():
    """Train a model from a CSV file given on the command line."""
    def report(label, err):
        # Bold red prefix followed by the exception text
        print(Style.BRIGHT + Fore.RED + label + Style.RESET_ALL + Fore.RESET + str(err))

    if len(sys.argv) != 2:
        print('usage: ' + Fore.RED + 'python' + Fore.BLUE + ' train.py ' + Fore.RESET + 'data_file.csv')
        sys.exit(-1)
    data_file = sys.argv[1]
    try:
        model = LinearRegression(data_file)
        model.train_model()
    except IOError as e:
        report('I/O Error: ', e)
    except ParserException as e:
        report('ParserException: ', e)
    except LogisticRegressionException as e:
        report('Logistic Regression Exception: ', e)
def main():
    """Fit on ex1data1 and plot the raw points with two reference segments."""
    train_X, train_y = loadDataSet(r"data/ex1data1.txt")
    clf = LinearRegression()
    weigh = clf.fit(train_X, train_y, alpha=0.01, maxCycles=500)
    fig = plt.figure(figsize=(8, 4))       # figure instance
    axes = fig.add_subplot(111)            # axes instance in the figure
    axes.plot(train_X, train_y, 'o')       # raw training points
    # Two straight segments through the origin with slope `weigh`
    seg1_x, seg1_y = [0, 1], [0, 1 * weigh]
    seg2_x, seg2_y = [0, 25], [0, 25 * weigh]
    axes.plot(seg1_x, seg1_y, seg2_x, seg2_y)
    fig.savefig("test.pdf")
def test_fit():
    """fit() on a fixed 10-sample, 2-feature design reproduces known weights."""
    targets = np.array([
        0.09459717, 0.50650243, 1.03329565, 0.52587828, 0.49264871,
        -0.64896441, -0.86499999, -1.00885329, -0.80418399, 0.57436388,
    ]).reshape(-1, 1)
    design = np.array([
        0., 0.,
        0.6981317, 0.48738787,
        1.3962634, 1.94955149,
        2.0943951, 4.38649084,
        2.7925268, 7.79820595,
        3.4906585, 12.18469679,
        4.1887902, 17.54596338,
        4.88692191, 23.88200571,
        5.58505361, 31.19282379,
        6.28318531, 39.4784176,
    ]).reshape(-1, 2)
    model = LinearRegression()
    model.fit(design, targets)
    expected = np.array([[0.77483422], [-0.42288373], [0.03914334]])
    np.testing.assert_almost_equal(model.weights_, expected, decimal=8)
def test_simple_regression(self):
    """Feed 1000 autoregressive samples and print the fitted ranking."""
    reg = LinearRegression(order=1, stds=[100, 100, 100, 1000])
    coeffs = np.array([1, 10, 100, 1000])
    prev = np.array([0, 0, 0, 0])
    for _ in range(1000):
        # First component is a linear function of the previous sample
        sample = np.random.uniform(-1000, 1000, 4)
        sample[0] = np.sum(prev * coeffs)
        prev = sample
        reg.feed(sample)
    print("With penalty_range: {0}".format(reg.penalty_range))
    print()
    print(reg.ranking[0])
    print()
    print(reg.means)
    print(reg.stds)
    print()
def test_cross_val_regression(self):
    """5-fold cross-validation on the wine data yields a score in [0, 1]."""
    wine = "../src/datasets/wine.data"
    data = np.genfromtxt(wine, delimiter=",", dtype=float, usecols=np.arange(13))
    labels = np.genfromtxt(wine, delimiter=",", dtype=float, usecols=13)
    score = GeneralUtilities.cross_val(LinearRegression(), data, labels, 5)
    self.assertTrue(0 <= score <= 1)
def main():
    """Train by the selected method and write `id,prediction` rows to the output file."""
    args = parse_args()
    train_X, train_y, valid_X, valid_y, col_name = preprocess_training_set()
    linreg = LinearRegression()
    if args.method == 'pseudo_inverse':
        linreg.train_by_pseudo_inverse(train_X, train_y, alpha=0.5,
                                       validate_data=(valid_X, valid_y))
    elif args.method == 'gradient_descent':
        linreg.train_by_gradient_descent(train_X, train_y, epoch=1000,
                                         rate=0.000001, batch=100,
                                         alpha=0.00000001,
                                         validate_data=(valid_X, valid_y))
    else:
        raise Exception('wrong method')
    test_X, ids = preprocess_testing_set(col_name)
    pred_y = linreg.predict(test_X)
    with open(args.output, 'w') as fw:
        for id, pred in zip(ids, pred_y):
            fw.write('{id},{pred}\n'.format(id=id, pred=pred))
def setUp(self):
    """Train a fresh model via SGD and load the normalized test split."""
    self.model = LinearRegression.from_csv(TRAIN_CSV)
    self.weights, self.cost_history = self.model.fit_sgd(epochs=5)
    # Test data: first five columns are features, the sixth the target
    raw = np.loadtxt(TEST_CSV, delimiter=",")
    self.x_test = add_one_bias(normalize(raw[:, :5]))
    self.y_test = normalize(raw[:, 5])
def setUp(self):
    """Load the airfoil data, split train/test, and build a fresh model."""
    airfoil = "../src/airfoil_self_noise.txt"
    self.data = np.genfromtxt(airfoil, dtype=float, usecols=np.arange(5))
    self.labels = np.genfromtxt(airfoil, dtype=float, usecols=5)
    split = GeneralUtilities.dataset_split(self.data, self.labels)
    self.train_data, self.train_labels, self.test_data, self.test_labels = split
    self.lr = LinearRegression()
def validation_curve(X, y, Xval, yval):
    """Sweep candidate lambda values to pick a regularization strength.

    Returns (lambda_vec, error_train, error_val): the candidates as a column
    vector and, for each, the training and cross-validation costs.
    """
    lambda_vec = np.array([0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10])
    lambda_vec = lambda_vec.reshape(lambda_vec.size, 1)
    error_train = np.zeros((lambda_vec.shape[0], 1))
    error_val = np.zeros((lambda_vec.shape[0], 1))
    # For each candidate, fit with that lambda, then measure both errors
    # with the regularization term switched off (lambda=0).
    for i, ilambda in enumerate(lambda_vec):
        my_lr = LR(X, y)
        my_lr.gradient_descent_reg(0.001, ilambda, 5000)
        theta = my_lr.theta
        error_train[i:], _ = my_lr.compute_cost_reg(theta, 0, X=X, y=y)
        error_val[i:], _ = my_lr.compute_cost_reg(theta, 0, X=Xval, y=yval)
    return lambda_vec, error_train, error_val
def test2():
    """Train on MNIST train.csv and write rounded predictions as a submission."""
    import pandas as pd
    # from sklearn.linear_model import LinearRegression
    from linear_regression import LinearRegression
    data_path = '../mnist/data/'
    df_train = pd.read_csv(data_path + "train.csv")
    features = df_train.iloc[:, 1:].to_numpy()
    labels = df_train.iloc[:, 0].to_numpy()
    model = LinearRegression().fit(features, labels)
    test_images = pd.read_csv(data_path + "test.csv").to_numpy()
    predictions = model.predict(test_images)
    # Regression output rounded to the nearest integer digit label
    rounded = [int(round(value)) for value in predictions]
    submission = pd.DataFrame({
        "ImageId": range(1, 1 + len(predictions)),
        "Label": rounded,
    })
    submission.to_csv("mnist-submission9.csv", index=False)
def test_folds_parameter(self):
    """cross_val rejects negative and oversized fold counts."""
    path = "../src/datasets/wine.data"
    data = np.genfromtxt(path, delimiter=",", dtype=float, usecols=np.arange(13))
    labels = np.genfromtxt(path, delimiter=",", dtype=float, usecols=13)
    model = LinearRegression()
    for bad_folds in (-5, 1000):
        self.assertRaises(ValueError, GeneralUtilities.cross_val,
                          model, data, labels, bad_folds)
def test_train_and_predict(self):
    """Fitting y = x exactly yields slope 1, intercept 0, and correct predictions.

    Points fitted:
        (1, 1), (2, 2), (3, 3) -- a perfect line through the origin.
    """
    xs = np.array([1, 2, 3])
    ys = np.array([1, 2, 3])
    clf = LinearRegression()
    slope, intercept = clf.train(xs, ys)
    assert slope == 1.0
    assert intercept == 0.0
    # Use the trained model to predict a y value for a new x
    assert clf.predict(4) == 4.0
def run_linear_regression():
    """Load the food-truck data, plot it, and run gradient descent once.

    Prints the initial cost and the optimal theta found.
    """
    print('Plotting data\n')
    features = setup()
    features.columns = ['Profits', 'CityPopulation']
    # NOTE(review): X/y naming looks swapped relative to the column names
    # (profits are usually the target) -- confirm against setup().
    X = features.Profits
    y = features.CityPopulation
    m = len(y)
    iterations = 1500
    alpha = 0.01
    # NOTE(review): theta is sized by the number of samples (m), not the
    # number of parameters -- looks suspicious; confirm what cost_function
    # and gradient_descent expect.
    theta = np.zeros(m)  # Set the initial theta value
    lr = LinearRegression(X, y, iterations, alpha)
    lr.plot_data(X, y, 'Profits', 'City Population', 'Food Truck Profit v. City Pop')
    print('Testing gradient descent algorithm...\n')
    # Add a column of ones to X
    # X.bias = np.ones((m, 1))
    print('Initial cost: {}'.format(lr.cost_function(X, y, theta)))
    # Run the gradient descent
    theta, cost_history = lr.gradient_descent(X, y, theta, alpha, iterations)
    print('Optimum theta found by gradient descent: {}'.format(theta))
def test_train(self):
    """One gradient step with lr=0.1 produces the known W and b values."""
    # (input_dims, x, y, expected W, expected b) after a single iteration
    cases = [
        (1, np.array([2, 4]), np.array([2, 4]), np.array([1.0]), 0.3),        # y = x
        (2, np.array([[2, 2], [4, 4]]), np.array([4, 8]),
         np.array([2.0, 2.0]), 0.6),                                          # y = x1 + x2
    ]
    for dims, x, y, exp_W, exp_b in cases:
        model = LinearRegression(dims)
        model.train(x, y, n_iter=1, lr=0.1)
        self.assertTrue(np.array_equal(model.W, exp_W))
        self.assertAlmostEqual(model.b[0], exp_b)
def test_score(self):
    """After fit/predict, r_squared lies in [0, 1]."""
    split = GeneralUtilities.dataset_split(self.data, self.labels)
    train_data, train_labels, test_data, test_labels = split
    model = LinearRegression()
    model.fit(train_data, train_labels)
    model.predict(test_data, test_labels)
    self.assertTrue(0 <= model.r_squared <= 1)
def __init__(self, numpy_rng, theano_rng=None, n_ins=100, hidden_layers_size=None,
             n_outs=1, L1_reg=0.00, L2_reg=0.0001):
    """Build a stack of sigmoid/RBM layers topped by a linear-regression output.

    Args:
        numpy_rng: numpy RandomState used to initialize layer weights.
        theano_rng: optional Theano RNG; derived from numpy_rng when absent.
        n_ins: input dimensionality.
        hidden_layers_size: list of hidden-layer widths (default [100, 100]).
        n_outs: output dimensionality of the regression layer.
        L1_reg / L2_reg: penalty coefficients added to the finetuning cost.
    """
    # Avoid a mutable default argument; default to two 100-unit layers.
    if hidden_layers_size is None:
        hidden_layers_size = [100, 100]
    self.sigmoid_layers = []
    self.rbm_layers = []
    self.params = []
    self.n_layers = len(hidden_layers_size)
    assert self.n_layers > 0
    if not theano_rng:
        theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 30))
    # Symbolic inputs: x is a feature matrix, y a vector of targets.
    self.x = T.matrix('x')
    self.y = T.vector('y')
    for i in range(self.n_layers):
        # First layer reads the network input; later layers read the
        # previous sigmoid layer's output.
        if i == 0:
            input_sizes = n_ins
        else:
            input_sizes = hidden_layers_size[i - 1]
        if i == 0:
            layer_input = self.x
        else:
            layer_input = self.sigmoid_layers[-1].output
        sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                    n_in=input_sizes,
                                    n_out=hidden_layers_size[i],
                                    activation=T.nnet.sigmoid)
        self.sigmoid_layers.append(sigmoid_layer)
        self.params.extend(sigmoid_layer.params)
        # The RBM shares W/hbias with the sigmoid layer, so pretraining
        # updates the same parameters later used for finetuning.
        rbm_layer = RBM(numpy_rng=numpy_rng, theano_rng=theano_rng,
                        input=layer_input, n_visible=input_sizes,
                        n_hidden=hidden_layers_size[i],
                        W=sigmoid_layer.W, hbias=sigmoid_layer.b)
        self.rbm_layers.append(rbm_layer)
    self.linearRegressionLayer = LinearRegression(input=self.sigmoid_layers[-1].output,
                                                  n_in=hidden_layers_size[-1],
                                                  n_out=n_outs)
    # The L1/L2 penalties cover only the last hidden layer and the output layer.
    self.L1 = abs(self.sigmoid_layers[-1].W).sum() + abs(self.linearRegressionLayer.W).sum()
    self.L2_sqr = (self.sigmoid_layers[-1].W ** 2).sum() + (self.linearRegressionLayer.W ** 2).sum()
    self.squared_errors = self.linearRegressionLayer.squared_errors(self.y)
    self.finetune_cost = self.squared_errors + L1_reg * self.L1 + L2_reg * self.L2_sqr
    self.y_pred = self.linearRegressionLayer.p_y_given_x
    self.params = self.params + self.linearRegressionLayer.params
def main():
    """Compare the handwritten models against sklearn counterparts on test MSE."""
    X, y = datasets.make_regression(n_samples=500, n_features=1, noise=20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # Pair each sklearn implementation with the local equivalent
    models = {
        'linear1': linear_model.LinearRegression(),
        'linear': LinearRegression(),
        'lasso1': linear_model.Lasso(alpha=0.1),
        'lasso2': LinearRegression(l1=0.1),
        'ridge1': linear_model.Ridge(alpha=0.5),
        'ridge2': LinearRegression(l2=0.5),
        'elasticnet1': linear_model.ElasticNet(alpha=0.5, l1_ratio=0.5),
        'elasticnet2': LinearRegression(l1=0.25, l2=0.25*0.5),
    }
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = np.reshape(model.predict(X_test), y_test.shape)
        mse = mean_squared_error(y_test, y_pred)
        print('{}: {}'.format(model_name, mse))
def fitting():
    """Fit population -> profit, then plot the loss curve and the fitted line."""
    data = pd.read_csv('food_profit.txt', names=['population', 'profit'])
    x = data['population']
    y = data['profit']
    alpha = 0.01
    max_iter = 1500
    model = LinearRegression(alpha, max_iter)
    loss, _ = model.fit(x, y)
    p = model.predict(x)
    plt.figure(figsize=(10, 6))
    plt.subplot(2, 1, 1)
    # Derive the x-axis from max_iter instead of the hard-coded 1501 so the
    # plot stays correct if the iteration count changes.
    plt.plot(np.arange(1, max_iter + 1), loss)
    plt.title('Loss Curve')
    plt.subplot(2, 1, 2)
    # Fixed label typo: 'Traing Data' -> 'Training Data'
    plt.plot(x, y, 'rx', markersize=10, label='Training Data')
    plt.plot(x, p, 'b', label='Linear Regression')
    plt.xlabel('Population of City in 10,000s')
    plt.ylabel('Profit in $10,000s')
    plt.grid(True)
    plt.legend()
    plt.show()
def test_r_calcuation(self):
    """adjusted_r_squared is computed and differs from r_squared.

    Uses multiple attributes; the adjusted value can legitimately drop
    below 0, so only the upper bound and the inequality are checked.
    """
    model = LinearRegression()
    model.fit(self.train_data, self.train_labels)
    model.predict(self.test_data, self.test_labels)
    self.assertTrue(model.adj_r_squared <= 1)
    self.assertTrue(model.adj_r_squared != model.r_squared)
def setUp(self):
    """Create one single-feature and one two-feature model for the tests."""
    self.model_simple = LinearRegression(1)
    self.model_multiple = LinearRegression(2)
def test_feeding_data(self):
    """feed() accepts a handful of 4-component samples without error."""
    reg = LinearRegression(order=2, target_index=0)
    samples = ([0, 0, 0, 0], [1, 1, 0, 0], [2, 0, 1, 0], [4, 0, 0, 1])
    for sample in samples:
        reg.feed(sample)
if __name__ == '__main__':
    # The three CSV splits use ';' as the field separator.
    pd_train = pd.read_csv('./data/train.csv', sep=';')
    pd_test = pd.read_csv('./data/test.csv', sep=';')
    pd_validate = pd.read_csv('./data/validate.csv', sep=';')
    # 1. Min-max (uniform) normalization of the raw features.
    #    Validation/test splits reuse the training min/max to avoid leakage.
    trn_X, trn_X_max, trn_X_min = uniform_norm(pd_train.drop('quality', axis=1).values)
    trn_y = pd_train['quality'].values
    val_X = (pd_validate.drop('quality', axis=1).values - trn_X_min) / (trn_X_max - trn_X_min)
    val_y = pd_validate['quality'].values
    test_X = (pd_test.drop('quality', axis=1).values - trn_X_min) / (trn_X_max - trn_X_min)
    test_y = pd_test['quality'].values
    model_1 = LinearRegression()
    train_costs = model_1.fit(trn_X, trn_y, alpha=0.5, lmbda=0, algorithm="batch_gd", verbose=True)
    val_pred = model_1.predict(val_X)
    test_pred = model_1.predict(test_X)
    # Half mean squared error on each held-out split.
    print("Validate Error %f" % (sum((val_pred - val_y) ** 2) * 0.5 / val_X.shape[0]))
    print("Test Error %f" % (sum((test_pred - test_y) ** 2) * 0.5 / test_X.shape[0]))
    print("\n\n")
    # 2. Gaussian (z-score) normalization of the raw features.
    #    NOTE(review): this chunk appears truncated here -- the gaussian branch
    #    sets up the splits but no model fit is visible in this view.
    trn_X, trn_X_mean, trn_X_std = gaussian_norm(pd_train.drop('quality', axis=1).values)
    trn_y = pd_train['quality'].values
    val_X = (pd_validate.drop('quality', axis=1).values - trn_X_mean) / trn_X_std
    val_y = pd_validate['quality'].values
# Train and evaluate an unregularized closed-form linear regression on the
# California housing data.
housing = fetch_california_housing(data_home='/home/bdol/data')
train_data, test_data, train_target, test_target = split_train_test(
    housing.data, housing.target
)

# Normalize the data
train_data = preprocessing.scale(train_data)
test_data = preprocessing.scale(test_data)

# Append bias feature (column of ones)
train_data = np.hstack((train_data, np.ones((train_data.shape[0], 1), dtype=train_data.dtype)))
test_data = np.hstack((test_data, np.ones((test_data.shape[0], 1), dtype=test_data.dtype)))
train_target = train_target[:, None]
test_target = test_target[:, None]

lin_reg = LinearRegression()
# Converted Python 2 print statements to print() calls -- the originals are
# syntax errors under Python 3, which the rest of this file targets.
print("Training...")
lin_reg.train_closed_form_unregularized(train_data, train_target)
print()
print("Done!")

# Report train/test RMSE
train_preds = lin_reg.test(train_data)
test_preds = lin_reg.test(test_data)
print("Train error:", RMSE(train_preds, train_target))
print("Test error:", RMSE(test_preds, test_target))
# Generate 30 true weights: 20 small ones (in [0, 0.3)) and 10 larger ones
# (in [0.5, 1.0)).
true_w = 0.3*np.random.rand(20, 1)
true_w = np.append(true_w, 0.5*np.random.rand(10, 1) + 0.5)

# Now generate the dataset using the true weights
N = 50
train_data = np.random.rand(N, 30)
train_target = train_data.dot(true_w)[:, None]+np.random.randn(N, 1)
test_data = np.random.rand(N, 30)
test_target = test_data.dot(true_w)[:, None]+np.random.randn(N, 1)

lam_range = np.logspace(-1, 1, 100)
unreg_results = np.zeros((len(lam_range), 1))
reg_results = np.zeros((len(lam_range), 1))
lin_reg = LinearRegression()
# The unregularized closed-form fit does not depend on lambda, so fit and
# score it once instead of re-running it on every sweep iteration
# (loop-invariant hoisting; the recorded values are identical).
lin_reg.train_closed_form_unregularized(train_data, train_target)
unreg_rmse = RMSE(lin_reg.test(test_data), test_target)
for i, l in enumerate(lam_range):
    unreg_results[i] = unreg_rmse
    lin_reg.train_closed_form_ridge(train_data, train_target, l)
    yhat = lin_reg.test(test_data)
    reg_results[i] = RMSE(yhat, test_target)

fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_xscale("log")
class DBNR(object):
    """Deep Belief Network regressor: stacked sigmoid/RBM layers with a
    linear-regression output layer, built on Theano."""

    def __init__(self, numpy_rng, theano_rng=None, n_ins=100, hidden_layers_size=None,
                 n_outs=1, L1_reg=0.00, L2_reg=0.0001):
        """Wire up the layer stack, the RBMs for pretraining, and the costs."""
        # Avoid a mutable default argument; default to two 100-unit layers.
        if hidden_layers_size is None:
            hidden_layers_size = [100, 100]
        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_size)
        assert self.n_layers > 0
        if not theano_rng:
            theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 30))
        # Symbolic inputs: x is a feature matrix, y a vector of targets.
        self.x = T.matrix('x')
        self.y = T.vector('y')
        for i in range(self.n_layers):
            # First layer reads the network input; later layers read the
            # previous sigmoid layer's output.
            if i == 0:
                input_sizes = n_ins
            else:
                input_sizes = hidden_layers_size[i - 1]
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output
            sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input,
                                        n_in=input_sizes,
                                        n_out=hidden_layers_size[i],
                                        activation=T.nnet.sigmoid)
            self.sigmoid_layers.append(sigmoid_layer)
            self.params.extend(sigmoid_layer.params)
            # The RBM shares W/hbias with the sigmoid layer, so pretraining
            # updates the same parameters used at finetune time.
            rbm_layer = RBM(numpy_rng=numpy_rng, theano_rng=theano_rng,
                            input=layer_input, n_visible=input_sizes,
                            n_hidden=hidden_layers_size[i],
                            W=sigmoid_layer.W, hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)
        self.linearRegressionLayer = LinearRegression(input=self.sigmoid_layers[-1].output,
                                                      n_in=hidden_layers_size[-1],
                                                      n_out=n_outs)
        # L1/L2 penalties cover only the last hidden layer and the output layer.
        self.L1 = abs(self.sigmoid_layers[-1].W).sum() + abs(self.linearRegressionLayer.W).sum()
        self.L2_sqr = (self.sigmoid_layers[-1].W ** 2).sum() + (self.linearRegressionLayer.W ** 2).sum()
        self.squared_errors = self.linearRegressionLayer.squared_errors(self.y)
        self.finetune_cost = self.squared_errors + L1_reg * self.L1 + L2_reg * self.L2_sqr
        self.y_pred = self.linearRegressionLayer.p_y_given_x
        self.params = self.params + self.linearRegressionLayer.params

    def pretraining_function(self, train_set_x, batch_size, k):
        """Compile one CD-k pretraining function per RBM layer.

        Each returned function takes a minibatch index (and an optional
        learning rate, default 0.1) and returns that layer's cost.
        """
        index = T.lscalar('index')
        learning_rate = T.scalar('lr')
        # NOTE(review): '/' is float division on Python 3; n_batches is unused
        # here, but confirm whether callers expect an int.
        n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
        batch_begin = index * batch_size
        batch_end = batch_begin + batch_size
        pretrain_fns = []
        for rbm in self.rbm_layers:
            cost, updates = rbm.get_cost_updates(learning_rate, persistent=None, k=k)
            fn = theano.function(
                inputs=[index, theano.In(learning_rate, value=0.1)],
                outputs=cost,
                updates=updates,
                givens={
                    self.x: train_set_x[batch_begin:batch_end]
                }
            )
            pretrain_fns.append(fn)
        return pretrain_fns

    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        """Compile the SGD finetuning function plus valid/test scorers.

        Returns (train_fn, valid_score, test_score) where the scorers return
        per-minibatch squared errors over their respective sets.
        """
        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]
        # NOTE(review): '/=' yields floats under Python 3, which would break
        # the range() calls below -- this looks like Python 2 code; confirm.
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches /= batch_size
        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        n_test_batches /= batch_size
        index = T.lscalar('index')
        # Plain SGD on the finetuning cost over all stacked parameters.
        gparams = T.grad(self.finetune_cost, self.params)
        updates = []
        for param, gparam in zip(self.params, gparams):
            updates.append((param, param - gparam * learning_rate))
        train_fn = theano.function(
            inputs=[index],
            outputs=self.finetune_cost,
            updates=updates,
            givens={
                self.x: train_set_x[index * batch_size: (index + 1) * batch_size],
                self.y: train_set_y[index * batch_size: (index + 1) * batch_size]
            }
        )
        test_score_i = theano.function(
            inputs=[index],
            outputs=self.squared_errors,
            givens={
                self.x: test_set_x[index * batch_size: (index + 1) * batch_size],
                self.y: test_set_y[index * batch_size: (index + 1) * batch_size]
            }
        )
        valid_score_i = theano.function(
            inputs=[index],
            outputs=self.squared_errors,
            givens={
                self.x: valid_set_x[index * batch_size: (index + 1) * batch_size],
                self.y: valid_set_y[index * batch_size: (index + 1) * batch_size]
            }
        )

        def valid_score():
            # Per-batch squared errors across the validation set.
            return [valid_score_i(i) for i in range(n_valid_batches)]

        def test_score():
            # Per-batch squared errors across the test set.
            return [test_score_i(i) for i in range(n_test_batches)]

        return train_fn, valid_score, test_score
# NOTE(review): the statements before the __main__ guard appear to be the tail
# of a display_data(...) helper whose `def` line lies outside this chunk.
# Reference x values from 200 to 2000; model.W broadcasts over the list.
dummy_x = list(range(200, 2100, 100))
dummy_y = model.W * dummy_x + model.b
plt.scatter(x, y)
plt.plot(dummy_x, dummy_y)
# automatically close plot after 2 seconds
plt.show(block=False)
plt.pause(2)
plt.close()

if __name__ == "__main__":
    # NOTE(review): `data` and `testing` are defined elsewhere in the module.
    x = np.array([d[0] for d in data])
    y = np.array([d[1] for d in data])
    model = LinearRegression(1)
    # without this initial parameter, model couldn't be optimized...
    model.b = 17
    display_data(x, y, model)
    # learning rate higher than this would increase error after each iter...
    # perhaps, data is not suitable for linear slope and intercept
    # or, there are too few data points
    model.train(x, y, n_iter=10, lr=0.000001)
    display_data(x, y, model)
    test_x, test_y = testing
    pred = model.predict(test_x)
    print('Dist from Dublin to Gdansk: {}\n'
          'answer {}\n'
          'predict {}'.format(test_x, test_y, pred))
import numpy as np
from linear_regression import LinearRegression

file = '/Users/ruivaz/workspace/git/c_apps/data.csv'

if __name__ == "__main__":
    # Load the CSV (skipping the header row), fit by gradient descent, plot.
    data_table = np.loadtxt(open(file, "rb"), delimiter=",", skiprows=1)
    regressor = LinearRegression(data_table)
    regressor.run_gradient_descent()
    regressor.plot_solution()