def fit_and_display(X, Y, sample, deg):
    """Fit a degree-``deg`` polynomial on a random subsample and plot the result.

    Draws ``sample`` indices from ``range(len(X))`` with ``np.random.choice``
    (numpy's default settings), fits the weights on those rows with ``lin``,
    scores the weights on the full data with ``r2``, prints both scores and
    shows two figures: the training points alone, then the data, the fitted
    curve and the training points together.

    Parameters:
        X: input values; must support indexing by an integer index array.
        Y: target values, indexed the same way as ``X``.
        sample: number of indices to draw for the training subset.
        deg: polynomial degree passed to ``ax.polyfy``.
    """
    picked = np.random.choice(len(X), sample)
    X_sub, Y_sub = X[picked], Y[picked]

    # First figure: only the points the model will be trained on.
    plt.scatter(X_sub, Y_sub)
    plt.show()

    # Expand the whole input once; the training rows are selected from the
    # expanded matrix with the same indices used above.
    features = ax.polyfy(X, deg)
    w, score_train = lin(features[picked], Y_sub)
    score_full = r2(w, features, Y)
    print("degree %d .: in-sample %.4f out sample %.4f" % (deg, score_train, score_full))

    # Second figure: original curve, fitted curve and the training points.
    fitted = features.dot(w)
    plt.plot(X, Y)
    plt.plot(X, fitted)
    plt.scatter(X_sub, Y_sub)
    plt.title("deg= %d" % deg)
    plt.show()
def testLinearRegression1D(self):
    """Check ``lin`` on ``1d.csv`` against reference weights and r2.

    Column 0 of the CSV is used as the input and column 1 as the target.
    Each value must lie within an absolute tolerance of 0.0001 of its
    reference.
    """
    tolerance = 0.0001
    frame = pd.read_csv('../datasets/1d.csv', header=None)
    inputs = frame.loc[:, 0]
    targets = frame.loc[:, 1]

    W, rsq = lin(inputs, targets)

    self.assertAlmostEqual(
        W[0], 2.8644240756607786,
        msg="W[0]*X + W[1] .: W[0] should ~ 2.8644",
        delta=tolerance,
    )
    self.assertAlmostEqual(
        W[1], 1.972612167484588,
        msg="W[0]*X + W[1] .: W[1] should ~ 1.9726",
        delta=tolerance,
    )
    self.assertAlmostEqual(
        rsq, 0.99118382029778052,
        msg="r2 ~ 0.99118",
        delta=tolerance,
    )
def testLinearRegression2D(self):
    """Check ``lin`` (called with ``fill=True``) on ``2d.csv``.

    Columns 0 and 1 of the CSV are the inputs and column 2 is the target.
    The three returned weights and the r2 value must each lie within an
    absolute tolerance of 0.0001 of the reference values.
    """
    tolerance = 0.0001
    frame = pd.read_csv('../datasets/2d.csv', header=None)
    inputs = frame.loc[:, [0, 1]]
    targets = frame.loc[:, 2]

    W, rsq = lin(inputs, targets, fill=True)

    self.assertAlmostEqual(
        W[0], 2.01666793,
        msg="W[0] .: expected ~2.0166\ngot " + str(W[0]),
        delta=tolerance,
    )
    self.assertAlmostEqual(
        W[1], 2.96985048,
        msg="W[1] .: expected ~2.96985\ngot " + str(W[1]),
        delta=tolerance,
    )
    self.assertAlmostEqual(
        W[2], 1.46191241,
        msg="W[2] .: expected ~1.46191\ngot " + str(W[2]),
        delta=tolerance,
    )
    self.assertAlmostEqual(
        rsq, 0.99800406124757779,
        msg="r2 .: ~0.99800\ngot " + str(rsq),
        delta=tolerance,
    )
if __name__ == '__main__':
    # Demo: compare the plain fit (`lin`) with the L2-penalised fit (`linl2`)
    # on a noisy line whose last two points are pushed far off the trend.
    N = 50
    X = np.linspace(0, 10, N)
    Y = 0.5 * X + np.random.randn(N)

    # Turn the last two observations into outliers.
    Y[-1] += 30
    Y[-2] += 30

    plt.scatter(X, Y)
    plt.show()

    # Prepend a column of ones so that X becomes an (N, 2) matrix [1, x].
    # With that layout X.dot(w) evaluates w[0] * 1 + w[1] * x, i.e. the first
    # weight plays the role of the intercept.
    X = np.vstack([np.ones(N), X]).T

    # Unpenalised fit.
    wml, residml = lin(X, Y)
    Yhat_ml = X.dot(wml)
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(wml)
    print(residml)

    # Fit with an L2 penalty of strength `l2`.
    l2 = 1000
    wmap, residmap = linl2(X, Y, l2)
    Yhat_map = X.dot(wmap)

    # Column 1 of X holds the original x values (column 0 is the ones).
    plt.scatter(X[:, 1], Y)
    plt.plot(X[:, 1], Yhat_ml, label=("ml r2=%.4f" % (residml)))
    plt.plot(X[:, 1], Yhat_map, label=("map r2=%.4f" % (residmap)))
    plt.legend()
    plt.show()
'''
Created on May 10, 2017

@author: Varela
'''
import pandas as pd
import regress as rg
from aux import get_dataset
# import matplotlib.pyplot as plt

# Load the headerless 1-D dataset: column 0 is used as the input and
# column 1 as the target.
df = pd.read_csv(get_dataset('1d.csv'), header=None)
X = df.iloc[:, 0]
Y = df.iloc[:, 1]

# rg.lin returns a pair; it is unpacked here as (weights, r2).
W, r2 = rg.lin(X, Y)

# print(...) with a single argument behaves the same on Python 2 and 3.
print(W)
print(r2)

# Plot the data to see what it looks like
# plt.scatter(X, Y)
# plt.show()
# \[(.*)\] matches a bracketed span, including the brackets themselves
# [^\d]+   matches any run of characters that are not decimal digits
#
# regex=True is passed explicitly: from pandas 2.0 on, Series.str.replace
# treats the pattern as a literal string by default (the keyword itself
# requires pandas >= 0.23).
Y = df.loc[:, 1].str.replace(r'\[(.*)\]', '', regex=True)
Y = Y.str.replace(r'[^\d]+', '', regex=True).astype('int')

X = df.loc[:, 2].str.replace(r'\[(.*)\]', '', regex=True)
# Must go through the .str accessor, as for Y above: plain Series.replace
# only substitutes cells whose whole value equals the given string, so the
# non-digit characters would be left in place and astype('int') would fail.
X = X.str.replace(r'[^\d]+', '', regex=True).astype('int')

# plt.scatter(X, Y)
# plt.show()
#
# Y = np.log(Y)
# plt.scatter(X, Y)
# plt.show()

# Fit against log(Y) rather than Y itself.
Y = np.log(Y)
W, r2 = lin(X, Y)

# print(...) with a single argument behaves the same on Python 2 and 3.
print(W)
print(r2)

#LAZY PROGRAMMER SOLUTION
# import re
# import numpy as np
# import matplotlib.pyplot as plt
#
# X = []
# Y = []
#
# non_decimal = re.compile(r'[^\d]+')
#
# for line in open('../src/datasets/moore.csv'):
#     r = line.split('\t')