def setUp(self):
    """Create the Model and Regression fixtures used by the test methods.

    Four ``Model`` objects are built over different directory/extension
    pairs. The file list discovered by ``model1`` is passed to
    ``set_dataframes`` of both ``model1`` and ``model2``. ``regression1``
    wraps ``model1out``; ``regression2`` wraps the ``model2`` object itself.
    """
    # NOTE(review): model2 is loaded with model1's file list and regression2
    # receives a Model rather than model2out -- presumably deliberate
    # "bad input" fixtures; confirm against the test methods.
    simple = self.model1 = Model.Model("./data/simple", ".csv")
    discovered = self.files = simple.set_files_in_directory()

    student = self.model2 = Model.Model("./data/student", ".csv")
    self.model3 = Model.Model("./datfff", ".csv")
    self.model4 = Model.Model("./data/student", ".txt")

    simple_frames = self.model1out = simple.set_dataframes(discovered)
    self.model2out = student.set_dataframes(discovered)

    self.regression1 = Regression.Regression(simple_frames)
    self.regression2 = Regression.Regression(student)
def main():
    """Load ./data/simple, fit a linear regression and report the results.

    ``rg.Regression.run`` returns a learner object. Depending on that
    object's class name the univariate or the multivariate reporting path
    is taken; any other class name produces no output.
    """
    loader = Model.Model("./data/simple", ".csv")
    frames = loader.set_dataframes(loader.set_files_in_directory())

    regression = rg.Regression(frames)
    training_data = regression.split_data()[0]
    names = regression.get_columnNames(training_data)
    ind, dep = regression.get_data(columns_names=names,
                                   training_data=training_data)

    learner = regression.run(training_data)
    learner_kind = learner.__class__.__name__

    if learner_kind == "UnivariateLR":
        slope, intercept = learner.run()
        print(slope, intercept)
        predictions = learner.predict(slope, intercept)
        # NOTE(review): this path scores against ``ind`` while the
        # multivariate path scores against ``dep`` -- confirm intent.
        print(learner.evaluate_model(ind, predictions))
        slope_history, _intercept_history = learner.get_params_history()
        learner.plot_history_m(slope_history)
        return

    if learner_kind == "MultivariateLR":
        coefficients, cost_history = learner.run()
        predictions = learner.predict(coefficients)
        print(learner.evaluate_model(dep, predictions))
        learner.plot_cost(cost_history)
def main(Expected1, Expected2, Dispersion1, Dispersion2, Number1, Number2, accuracy):
    """Fit a logistic regression two ways and draw both weight vectors.

    Data is generated by ``support.gen_data1``. Weights are then found with
    the model's own ``find_weights`` and with SciPy's Nelder-Mead
    minimisation of ``l_regression.Q``. The SciPy weights are scaled to unit
    Euclidean length before drawing.

    Args:
        Expected1: array-like with a ``shape``; ``shape[0] + 1`` is the
            number of weights. Forwarded to ``support.gen_data1``.
        Expected2, Dispersion1, Dispersion2, Number1, Number2: forwarded
            unchanged to ``support.gen_data1``.
        accuracy: forwarded to ``find_weights``.

    Returns:
        The second value returned by ``find_weights``.
    """
    fig = pb.figure()
    data, axes = support.gen_data1(Expected1, Expected2, Dispersion1,
                                   Dispersion2, Number1, Number2, fig)

    l_regression = reg.LogisticRegression()
    l_regression.fit(data)

    n_weights = Expected1.shape[0] + 1
    weights_by_grad, N = l_regression.find_weights(np.zeros(n_weights), accuracy)

    weights_by_scipy = minimize(l_regression.Q, np.zeros(n_weights),
                                method='nelder-mead')
    # Euclidean norm; replaces a map/reduce construction that only works on
    # Python 2 (``reduce`` is not a builtin on Python 3).
    weights_by_scipy.x /= np.linalg.norm(weights_by_scipy.x)

    support.draw(weights_by_grad, 'black', axes)
    support.draw(weights_by_scipy.x, 'yellow', axes)
    pb.show()
    return N
def main():
    """Compare three Bayes classifiers and plot true vs. predicted labels.

    Fits GaussBayes, GaussNB and GenBayes (in "Bernoulli" mode) on the
    training split and scores each on the test split. Two scatter plots of
    the first two training features follow: one coloured by ``y_train``,
    one by the GaussBayes predictions on the training data.

    Returns:
        A one-row DataFrame holding the three test accuracies.
    """
    X_train, y_train, X_test, y_test = reg.trainAndTestData()

    # Gauss Bayes; its training-split predictions feed the second plot.
    gauss_bayes = GaussBayes()
    gauss_bayes.fit(X_train, y_train)
    test_pred_gb = gauss_bayes.predict(X_test)
    train_pred_gb = gauss_bayes.predict(X_train)
    acc_gb = accuracy(test_pred_gb, y_test)

    # Gauss Naive Bayes
    gauss_nb = GaussNB()
    gauss_nb.fit(X_train, y_train)
    acc_gnb = accuracy(gauss_nb.predict(X_test), y_test)

    # Generic Bayes in Bernoulli mode
    bernoulli = GenBayes()
    bernoulli.fit(X_train, y_train, "Bernoulli")
    acc_bern = accuracy(bernoulli.predict(X_test, "Bernoulli"), y_test)

    summary = pd.DataFrame(
        {
            "Gauss Bayes": acc_gb,
            "Gauss Naive Bayes": acc_gnb,
            "Bernoulli": acc_bern
        },
        columns=["Gauss Bayes", "Gauss Naive Bayes", "Bernoulli"],
        index=[0])

    # Same figure twice: first the true labels, then the predicted ones.
    for colours in (y_train, train_pred_gb):
        fig = plt.figure(figsize=(15, 10))
        ax = fig.add_subplot(111)
        cax = ax.scatter(X_train[:, 0], X_train[:, 1], c=colours,
                         cmap='tab20c')
        plt.xlabel("sqrt_ft")
        plt.ylabel("price_per_sqft")
        plt.title("HOA based on sqrt_ft and price_per_sqft")
        fig.colorbar(cax)
        plt.show()

    return summary
def __init__(self, master=None):
    """Set up the frame's state, build its widgets and load the sort tables.

    Args:
        master: parent widget; its ``ver`` attribute is read immediately.
    """
    # NOTE(review): ``master`` defaults to None but ``master.ver`` is read
    # unconditionally, so omitting it raises AttributeError.
    self.ver = master.ver
    self.myFont = settings.FONT
    self.picDir = settings.PICTURE_DIR
    # Accumulated ranking text and regression summary, filled in later.
    self.ansText = u"Ranking:\n"
    self.seihekiText = u""
    tkinter.Frame.__init__(self, master)
    # Tk variable holding the user's current answer; starts as True.
    self.ans = tkinter.BooleanVar()
    self.ans.set(True)
    self.ansDialog = False
    self.array = []
    self.ansArray = []
    self.nextText = []
    # Widgets are created and laid out before the data below is loaded.
    self.pack()
    self.makeWidget()
    self.alignWidget()
    self.idolsContainer = ac.IdolsContainer()
    # Table to be sorted (shuffled on load) and table used for suggestions.
    self.tmpCont = MergeSort.readTable(settings.SORT_FILE_NAME)
    self.tmpCont.shuffle()
    self.sugCont = MergeSort.readTable(settings.SUGGEST_FILE_NAME)
    self.nameArray = self.tmpCont.returnNameArray()
    self.setNameArray(self.nameArray)
    self.reg = Regression.RegressionClass()
    self.sugRet = False
def data_shape_test(self):
    """Check that linearRegression rejects X and Y of different shape.

    The call must raise, and the exception text must contain
    "X and Y must have equal shape!".
    """
    # NOTE(review): X and Y come from an enclosing scope not visible here.
    # NOTE(review): the name does not start with "test", so default
    # unittest discovery will not collect this method.
    with self.assertRaises(Exception) as context:
        Regression.linearRegression(X, Y)

    # An exception object is not a container, so the message must be
    # compared against str(exception).
    self.assertIn("X and Y must have equal shape!", str(context.exception))
    # The former assertEqual(beta/error/CL/CR, -1) checks were dropped:
    # those names are never bound once the call raises.
def __init__(self,
             dtype="float64",
             learning_rate=0.01,
             iters=300,
             normalize=False,
             copy_X=True,
             method='normal',
             alpha=0.1,
             batch_size=32,
             tolerance=1e-07,
             is_shuffle=True,
             random_state=42,
             metric='mse'):
    """Initialise the estimator by delegating to ``Regression.__init__``.

    Every argument is forwarded unchanged and in declaration order; this
    method adds no state of its own.
    """
    # Arguments are passed positionally, so their order here must match the
    # parameter order of Regression.__init__.
    Regression.__init__(self, dtype, learning_rate, iters, normalize, copy_X, method, alpha, batch_size, tolerance, is_shuffle, random_state, metric)
def empty_test(self):
    """Check that linearRegression rejects empty input.

    The call must raise, and the exception text must contain
    "X or Y should not be Empty!".
    """
    # NOTE(review): X and Y come from an enclosing scope not visible here.
    # NOTE(review): the name does not start with "test", so default
    # unittest discovery will not collect this method.
    with self.assertRaises(Exception) as context:
        Regression.linearRegression(X, Y)

    # An exception object is not a container, so the message must be
    # compared against str(exception).
    self.assertIn("X or Y should not be Empty!", str(context.exception))
    # The former assertEqual(beta/error/CL/CR, -1) checks were dropped:
    # those names are never bound once the call raises.
def parameter_type_test(self):
    """Check that linearRegression rejects unsupported argument types.

    The call must raise, and the exception text must contain
    "X and Y must be numpy arrays or python lists!".
    """
    # NOTE(review): X and Y come from an enclosing scope not visible here.
    # NOTE(review): the name does not start with "test", so default
    # unittest discovery will not collect this method.
    with self.assertRaises(Exception) as context:
        Regression.linearRegression(X, Y)

    # An exception object is not a container, so the message must be
    # compared against str(exception).
    self.assertIn("X and Y must be numpy arrays or python lists!",
                  str(context.exception))
    # The former assertEqual(beta/error/CL/CR, -1) checks were dropped:
    # those names are never bound once the call raises.
def get_model(model, type, params):
    """Build a decision-tree or random-forest estimator.

    Args:
        model: ``'class'`` selects the ``Classifier`` module; any other
            value selects the ``Regression`` module.
        type: ``'tree'`` selects a single decision tree; any other value
            selects a random forest.
        params: mapping of hyper-parameters, or ``None`` for the estimator's
            defaults. Trees read ``max_depth`` and ``min_samples_split``;
            forests additionally read ``max_features`` and ``n_trees``.

    Returns:
        The constructed estimator (also printed to stdout).

    Raises:
        KeyError: if ``params`` is given but lacks a required key.
    """
    is_classifier = model == 'class'
    # The regression branch used to read the global ``args.type`` instead of
    # the ``type`` parameter; both branches now honour the argument.
    is_tree = type == 'tree'

    if is_classifier:
        factory = (Classifier.DecisionTreeClassifier
                   if is_tree else Classifier.RandomForestClassifier)
    else:
        factory = (Regression.DecisionTreeRegressor
                   if is_tree else Regression.RandomForestRegressor)

    if is_tree:
        keys = ('max_depth', 'min_samples_split')
    else:
        keys = ('max_depth', 'max_features', 'n_trees', 'min_samples_split')

    if params is not None:
        estimator = factory(**{key: params[key] for key in keys})
    else:
        estimator = factory()

    print(estimator)
    return estimator
def nextCommand(self):
    """Advance the GUI after the user has answered the current comparison.

    Records the answer and asks MergeSort for the next step. If two names
    come back, their images are shown. Otherwise sorting is finished: the
    ranking is logged, the regression is run and the result windows open.
    """
    self.ansArray.append(self.ans.get())

    # Decide which two entries are compared next.
    self.nextText = MergeSort.mergeWithoutRecWithAns(self.array,
                                                     self.ansArray)

    # Two names back means another comparison: show their images.
    if len(self.nextText) == 2:
        self.imageConfig()
        return

    # Anything else means sorting is finished: build the ranking text.
    for rank, name in enumerate(self.nextText, start=1):
        self.ansText += u"\tNo. %d:\t%s\n" % (rank, name)
    # Drop the trailing newline and save the ranking to the log.
    logging.info(self.ansText.rstrip(u"\n"))

    # The result arrives merge-sorted; keep that order while copying the
    # entries into the new container.
    for name in self.nextText:
        self.idolsContainer.appendIdol(self.tmpCont.returnIdolByName(name))
    self.nextButton.configure(state=tkinter.DISABLED)

    # Register the ranked entries with the regression instance and run it.
    self.reg.register(self.idolsContainer.returnContainer())
    self.reg.normalizeCoef()
    regAns = self.reg.regression()
    self.seihekiText = Regression.seihekiChecker(regAns)
    # Write the regression summary text to the log.
    logging.info(self.seihekiText)
    self.sugText = self.reg.returnPredict(self.sugCont.returnContainer())

    # Show the message window, then the suggestion window if requested.
    self.messageWindow()
    if self.ansDialog == True:
        self.sugWindow = SugWindow(master=self,
                                   picDir=self.picDir,
                                   sugCont=self.sugCont,
                                   sugText=self.sugText,
                                   myFont=self.myFont,
                                   addText=self.addText,
                                   seihekiText=self.seihekiText)
        self.sugWindow.mainloop()
def cross_validation(X, y, n_folds, method='ols', lambda_=0.01):
    """Run shuffled K-fold cross-validation and average the fold metrics.

    For every fold a fresh ``Regression(method, lambda_)`` is fitted on the
    training indices. MSE, R2, bias and prediction variance are recorded for
    the training part and for the validation part.

    Args:
        X: design matrix, indexed by row with the fold index arrays.
        y: targets; flattened with ``np.ravel`` if it has more than one axis.
        n_folds: number of folds.
        method: forwarded to ``Regression``.
        lambda_: forwarded to ``Regression``.

    Returns:
        ``(mse_train, mse_test, r2_train, r2_test, b_train, b_test,
        var_train, var_test)``, each the mean over all folds.
    """
    if len(y.shape) > 1:
        y = np.ravel(y)

    splitter = KFold(n_splits=n_folds, random_state=0, shuffle=True)

    # Column 0 holds the training value, column 1 the validation value.
    mse = np.zeros((n_folds, 2))
    r2 = np.zeros((n_folds, 2))
    b = np.zeros((n_folds, 2))
    var = np.zeros((n_folds, 2))

    for fold, (train_index, val_index) in enumerate(splitter.split(X)):
        X_fit, y_fit = X[train_index], y[train_index]
        X_val, y_val = X[val_index], y[val_index]

        model = Regression(method, lambda_)
        model.fit(X_fit, y_fit)

        # predict() stores its result on the model; read it after each call.
        model.predict(X_fit)
        y_pred_train = model.y_pred
        model.predict(X_val)
        y_pred_test = model.y_pred

        mse[fold][0] = mean_squared_error(y_fit, y_pred_train)
        mse[fold][1] = mean_squared_error(y_val, y_pred_test)
        r2[fold][0] = r2_score(y_fit, y_pred_train)
        r2[fold][1] = r2_score(y_val, y_pred_test)
        b[fold][0] = bias(y_fit, y_pred_train)
        b[fold][1] = bias(y_val, y_pred_test)
        var[fold][0] = np.var(y_pred_train)
        var[fold][1] = np.var(y_pred_test)

    return (np.mean(mse[:, 0]), np.mean(mse[:, 1]),
            np.mean(r2[:, 0]), np.mean(r2[:, 1]),
            np.mean(b[:, 0]), np.mean(b[:, 1]),
            np.mean(var[:, 0]), np.mean(var[:, 1]))
def analyze_regression(x1, x2, y, method='ols', n_folds=5, data_name='data'):
    """Score polynomial fits of degree 1..20 and write the results to CSV.

    For each regularisation value and each degree a design matrix is built
    with ``create_design_matrix`` and scored. With ``n_folds > 1`` scoring
    is done by ``cross_validation``; otherwise a single fit on all data is
    used and the test columns are left as ``None``.

    The table is printed and written to
    ``error_scores_<data_name>_<method>[_cv].csv``.

    Args:
        x1, x2: inputs forwarded to ``create_design_matrix``.
        y: targets.
        method: regression method; ``'ols'`` uses the single lambda ``0``,
            any other value sweeps 9 values of ``np.logspace(-3, 3)``.
        n_folds: number of CV folds; values <= 1 disable cross-validation.
        data_name: label used in the output file name.
    """
    max_degree = 20
    n_lambdas = 9
    columns = ['degree', 'lambda', 'MSE_train', 'MSE_test', 'R2_train',
               'R2_test', 'bias_train', 'bias_test', 'var_train', 'var_test']

    lambdas = [0] if method == 'ols' else np.logspace(-3, 3, n_lambdas)

    filename = 'error_scores_' + data_name + '_' + method
    if n_folds > 1:
        filename += '_cv'

    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for lambda_ in lambdas:
        for deg in range(1, max_degree + 1):
            X = create_design_matrix(x1, x2, deg=deg)
            if n_folds > 1:
                (mse_train, mse_test, r2_train, r2_test, bias_train,
                 bias_test, var_train, var_test) = cross_validation(
                     X, y, n_folds, method, lambda_)
            else:
                model = Regression(method, lambda_=lambda_)
                model.fit(X, y)
                model.predict(X)
                mse_train = mean_squared_error(model.y, model.y_pred)
                r2_train = r2_score(model.y, model.y_pred)
                bias_train = bias(model.y, model.y_pred)
                var_train = np.var(model.y_pred)
                mse_test = None
                r2_test = None
                bias_test = None
                var_test = None
            rows.append({'degree': deg, 'lambda': lambda_,
                         'MSE_train': mse_train, 'MSE_test': mse_test,
                         'R2_train': r2_train, 'R2_test': r2_test,
                         'bias_train': bias_train, 'bias_test': bias_test,
                         'var_train': var_train, 'var_test': var_test})

    error_scores = pd.DataFrame(rows, columns=columns)
    print(error_scores)
    error_scores.to_csv(filename + '.csv')
def kFoldErrorChoose(x,y,maxOrder,k):
    """Pick the polynomial order with the lowest k-fold mean squared error.

    The data is split once with ``kSplit([x, y], k)``. For each order in
    1..maxOrder every partition serves once as test data, and the k errors
    are averaged.

    Returns:
        ``(lowest average error, order that achieved it)``.
    """
    partitions = kSplit([x, y], k)
    errors = []
    for order in range(1, maxOrder + 1):
        total = 0
        for held_out in range(k):
            # Shallow copy so popping the test fold leaves the split intact.
            remaining = copy.copy(partitions)
            dtest = remaining.pop(held_out)
            # NOTE(review): only the first remaining partition is used for
            # training, not all k-1 of them -- confirm this is intended.
            dtrain = remaining[0]
            model = Regression.polyTrain(dtrain[0], dtrain[1], order)
            total += meanSquaredError(dtest[0], dtest[1], model)
        errors.append(total / (k * 1.0))
    return min(errors), (argmin(errors) + 1)
def build(self):
    """Create the screen manager and register every screen of the app.

    Returns:
        The ScreenManager holding the main menu plus the five method
        screens, using a SwapTransition.
    """
    manager = ScreenManager()
    manager.transition = SwapTransition()
    manager.add_widget(MainMenu())

    # Each method screen receives the manager and its own screen name.
    method_screens = (
        (bm.BracketMethods, 'bracket_methods'),
        (om.OpenMethods, 'open_methods'),
        (soe.SystemOfEquations, 'system_equations'),
        (ip.Interpolation, 'interpolation'),
        (rg.Regression, 'regression'),
    )
    for screen_cls, screen_name in method_screens:
        manager.add_widget(
            screen_cls(screenManager=manager, name=screen_name))

    return manager
def on_click():
    """Validate the nine form fields, save them to CSV and show the result.

    If any field fails ``is_float`` an error box is shown and nothing is
    written. Otherwise the values are written (header plus one row) to
    ``collectedData.csv``, replacing any earlier file. The first element of
    ``Regression.predicted()`` is then shown in an info box.
    """
    # Order matches the CSV header written below.
    field_vars = (var1, var2, var3, var4, var5, var6, var7, var8, var9)
    raw_values = [field.get() for field in field_vars]

    if any(is_float(value) != True for value in raw_values):
        messagebox.showerror("Error", "Float or Integer only")
        return

    data = [float(value) for value in raw_values]
    with open('collectedData.csv', 'w', newline='') as f:
        thewriter = csv.writer(f)
        thewriter.writerow([
            'fixed acidity', 'volatile acidity', 'citric acid',
            'residual sugar', 'chlorides', 'free sulfur dioxide',
            'total sulfur dioxide', 'sulphates', 'alcohol'
        ])
        thewriter.writerow(data)

    predicted = Regression.predicted()
    messagebox.showinfo("Quality:", predicted[0])
# Regression.lwlr(dataMat[0], dataMat, labelMat, 1.0) # yHat = Regression.lwlrTest(dataMat, dataMat, labelMat, 0.01) # Regression.plotLwlr(dataMat, labelMat, yHat) ''' # Reduce coefficient ''' # dataMat, labelMat = Regression.loadDataSet("E:/TestDatas/MachineLearningInAction/Ch08/abalone.txt") # ridgeWeights = Regression.ridgeTest(dataMat, labelMat) # returnMat = Regression.stageWise(dataMat, labelMat, 0.005, 1000) # Regression.plotParamTrend(returnMat) ''' # LEGO # Regression.legoDataCollect("E:/TestDatas/MachineLearningInAction/Ch08/lego/") dataArr, labelArr = Regression.loadDataSet("E:/TestDatas/MachineLearningInAction/Ch08/lego/legoData.txt") # ws = Regression.legoStandRegres(dataArr, labelArr) Regression.crossValidation(dataArr, labelArr, 10) ridgeWeights = Regression.stageWise(dataArr, labelArr)
raise Exception("Invalid command line argument") #In the following, D is the data set which has all the x values as its first entry and the y values as its second. error,order = CV.kFoldErrorChoose(D[0],D[1],10,5) #Graph the points on the base polynomial Graph.lineColor(D[0],D[1],'red') #Add Gaussian noise to the data outputs D[1] = Data.addGaussianNoise(D[1],1.0/2000) #Graph them as points in blue Graph.pointsSimple(D[0],D[1]) #Estimate the coefficients of the polynomial with best order fit = Regression.polyTrain(D[0],D[1],order) #Get the function's estimates for the training x values z = [fit(i) for i in D[0]] #Graph the points Graph.lineColor(D[0],z,'g') #Show the plot Graph.show() if(len(sys.argv) == 1): print "True function was an order " + str(trueOrder) + " polynomial, fit with order " + str(order)
from sklearn import datasets
from sklearn.model_selection import train_test_split
import Regression as reg

# Load the Boston housing data and hold out 20% of it for testing.
# NOTE(review): load_boston was removed from recent scikit-learn releases --
# confirm the pinned version.
dataset = datasets.load_boston()
X_train, X_test, y_train, y_test = train_test_split(dataset['data'], dataset['target'], test_size=0.2, random_state=42)
alpha = 0.1
# initialize the linear regression model
LRModel = reg.LinearRegression()
# initialize the ridge regression model and set its regularisation strength
RRModel = reg.RidgeRegression()
RRModel.set_params(alpha=alpha)
# put both models into a list
models = [LRModel, RRModel]
# initialize empty list to store the scores of the models
score = []
# fit each model on the training split and record its test score
for model in models:
    model.fit(X_train, y_train)
    score.append(model.score(X_test, y_test))
    print(model.params)
# print the computed scores for the different models in nice format
# NOTE(review): no printing code follows this comment in the visible source.
# used to remove car names from data array cols_rmv = [8] # represents the data split (traingin, validation) size = [.75, .25] split_selection = list() runs = 15 for x in range(0, runs): split_selection.append(size) print('Number of Runs: ', runs) print("Data Split: ", split_selection[0]) # get the data using data cleaner # returns a 2D array where rows are observations and columns # are attributes of a specific observations data_array = DataCleaner.data_cleaner("CarData.txt") # used to do Linear Regression. # Arguments are: # data_array: The data array created with DataCleaner # imputation: The users choice of imputation # cont_dis: The array that represents which cols/attributes are continuous or discrete(0,1) # cols_rmv: The columns the user would like to be removed from the data set here it is the car_name # bad data signal: This will be used to determine if and what data points are missing # split_selection: Array controlling how many tests are run and the split between test and validation sets Regression.perform_regression(list(data_array), imputation, cont_dis, cols_rmv, '?', 0, split_selection)
# plt.axvline(x= (noms(w_0)/(1000*2*np.pi))+(noms(w_pl)/(1000*2*np.pi)) , color='r', linestyle='--', label=r'Omega 0') # plt.axvline(x= (noms(w_0)/(1000*2*np.pi))+(noms(w_mi)/(1000*2*np.pi)) , color='g', linestyle='--', label=r'Omega 0') plt.ylabel(r'$U_c/U_er $') plt.xlabel(r'$v \:/\: \si{\kilo\hertz}$') plt.legend(loc='best') plt.tight_layout(pad=0, h_pad=1.08, w_pad=1.08) plt.savefig('build/U_gegen_v.pdf') plt.clf() #Zoom Indem_max = np.argmax(U_c_ges/U_er_ges) # positive Flanke params = ucurve_fit(reg.reg_linear, noms(fre_ges[Indem_max-6:Indem_max-1]) , noms(U_c_ges[Indem_max-6:Indem_max-1]/U_er_ges[Indem_max-6:Indem_max-1])) t_plot = np.linspace(32.5, 33.95, 2) plt.plot(t_plot, reg.reg_linear(t_plot, *noms(params)), 'b-', label='$Fit_\t{1}$') X = np.array([((np.amax(U_c_ges/U_er_ges))/unp.sqrt(2) - params[1]) / params[0]]) print('X') print(X) # negative Flanke params = ucurve_fit(reg.reg_linear, noms(fre_ges[Indem_max+1:Indem_max+6]) , noms(U_c_ges[Indem_max+1:Indem_max+6]/U_er_ges[Indem_max+1:Indem_max+6])) t_plot = np.linspace(33.95, 35, 2) plt.plot(t_plot, reg.reg_linear(t_plot, *noms(params)), 'y-', label='$Fit_\t{2}$') Y = np.array([((np.amax(U_c_ges/U_er_ges)/unp.sqrt(2)) - params[1]) / params[0]]) print('Y') print(Y)
# # plt.plot(t_plot, reg.reg_linear(t_plot, *noms(params)), 'b-', label='Fit') # plt.xlim(t_plot[0], t_plot[-1]) # # plt.xlabel(r'$t \:/\: \si{\milli\second}$') # # plt.ylabel(r'$U \:/\: \si{\kilo\volt}$') # plt.legend(loc='best') # plt.tight_layout(pad=0, h_pad=1.08, w_pad=1.08) # plt.savefig('build/test-plot.pdf') # Ablesen der Grenzfrequenzen und Umrechnen f_gr = 9 f_gr1 = 5.60 f_gr2 = 14.73 f_gr3 = 18.15 f_gr = np.exp(reg.reg_linear(f_gr, noms(m1), noms(b1))) f_gr1 = np.exp(reg.reg_linear(f_gr1, noms(m2), noms(b2))) f_gr2 = np.exp(reg.reg_linear(f_gr2, noms(m2), noms(b2))) f_gr3 = np.exp(reg.reg_linear(f_gr3, noms(m2), noms(b2))) write('build/Z_w_gr.tex', make_SI(Wellenwiderstand(2*np.pi*f_gr), r'\ohm', figures=0)) write('build/f_mess.tex', make_SI(f_gr*1e-3, r'\kilo\hertz', 'e-3',figures=1)) write('build/f1_mess.tex', make_SI(f_gr1*1e-3, r'\kilo\hertz', 'e-3',figures=1)) write('build/f2_mess.tex', make_SI(f_gr2*1e-3, r'\kilo\hertz', 'e-3',figures=1)) write('build/f3_mess.tex', make_SI(f_gr3*1e-3, r'\kilo\hertz', 'e-3',figures=1)) # Theoriewerte der Grenzfrequenzen w_th = 2 / np.sqrt(L*C1) w1_th = np.sqrt(2/(L*C1)) w2_th = np.sqrt(2/(L*C2)) w3_th = np.sqrt( 2/L * (C1+C2)/(C1*C2) )
'build/Tabelle_Verdampfungskurve.tex', [], [r'$T \:/\: \si{\kelvin}$', r'$p \:/\: \si{\bar}$', r'$\frac{1}{T} \:/\: 10^{-3}\si{\per\kelvin}$', r'$\ln{(p/\si{\pascal})}$'])) # Fit Verdampfungskurve params = ucurve_fit(reg.reg_linear, 1/T1, np.log(p1), p0=[-1, 1]) m1, b1 = params write('build/m1.tex', make_SI(m1, r'\kelvin', '', 1)) # 1 signifikante Stelle write('build/b1.tex', make_SI(b1, r'', '', 1)) # 1 signifikante Stelle # Plot ln(p) vs 1/T -> Verdampfungskurve T_plot = np.linspace(np.amin(1/T1), np.amax(1/T1), 100) plt.plot(T_plot*1e3, reg.reg_linear(T_plot, *noms(params)), 'b-', label='Fit') plt.plot(1/T1*1e3, np.log(p1), '.r', label='Messdaten') plt.xlim(1e3*(T_plot[0]-1/np.size(T1)*(T_plot[-1]-T_plot[0])), 1e3*(T_plot[-1]+1/np.size(T1)*(T_plot[-1]-T_plot[0]))) plt.xlabel(r'$T^{-1} \:/\: 10^{-3}\si{\per\kelvin}$') plt.ylabel(r'$\ln(p / \si{\pascal})$') plt.legend(loc='best') plt.tight_layout(pad=0, h_pad=1.08, w_pad=1.08) plt.savefig('build/Verdampfungskurve.pdf') R = const.physical_constants["molar gas constant"] # value, unit, error R_unc = ufloat(R[0],R[2]) write('build/R.tex', make_SI(R_unc, r'\joule\per\mol\per\kelvin')) L1 = -R_unc * m1 write('build/L.tex', make_SI(L1*1e-3, r'\kilo\joule\per\mol', '', 1)) # eine signifikante Stelle #####################################################################################################################################################
########DEFINE COLLECTION FIELDS########## print('DATA_COLLECTION_BEGIN') inputPeriods = [5, 10, 20, 50, 100, 200] pastReturnPeriods = [1, 2, 5, 10, 20, 50, 100] retPeriods = [1, 2, 3, 4, 5, 10, 20, 30, 40, 50] adjClose = ifld.AdjClose() longVolume = ifld.SMA(100, ifld.AdjVolume()) collectionFields = [] #import random #randomSymbols = random.sample(list(stockData.index.get_level_values('Symbol').unique()),2) #stockData = stockData[stockData.index.get_level_values('Symbol').isin(randomSymbols)] linRegressions = [] for period in inputPeriods: linearReg = reg.Regression(period, adjClose) linRegressions.append(linearReg) collectionFields.extend(linearReg.getRegFieldsList()) sdPeriod = ifld.SD(period, ifld.PcntChange(1, False, adjClose), 'SD_PCNT_' + str(period)) rollingMin = ifld.RollingMin(period, adjClose) rollingMax = ifld.RollingMax(period, adjClose) minDuration = ifld.ExtremeDuration(period, adjClose, False, 'Min_Duration_' + str(period)) maxDuration = ifld.ExtremeDuration(period, adjClose, True, 'Max_Duration_' + str(period)) minDurationLag = ifld.Lag(minDuration, 1, 'Min_Duration_' + str(period) + '_Lag') maxDurationLag = ifld.Lag(maxDuration, 1, 'Max_Duration_' + str(period) + '_Lag') retracedFromHigh = ifld.Divide(ifld.RetracementPcnt(period, True),
from pylab import *
from numpy import *
from Regression import *

Reg = Regression()

# Load in data and calculate the split ratio
data = loadtxt('Q1.data')
p = 13

# Shuffle Data
data = data.reshape(-1,p+1)
# An index array rather than range(): a Python 3 range object cannot be
# shuffled in place.
order = arange(shape(data)[0])
random.shuffle(order)
data = data[order,:]
split = int(len(data)*.66)

# Scale every column (features and response) by its standard deviation.
covX = cov(transpose(data))
sdX = sqrt(diag(covX))
for i in range(p+1):
    data[:,i] = data[:,i]/sdX[i]

# First 66% of the shuffled rows for training, the rest for testing.
traindata = data[0:split,:]
testdata = data[split:len(data),:]

# Response splitting: column p holds the response.
ytrain = traindata[:,p]
ytrain = transpose(matrix(ytrain))
N = len(ytrain)
ytest = testdata[:,p]
from sklearn import datasets
from sklearn.model_selection import train_test_split
import Regression as reg

# Load the Boston housing data and hold out 20% of it for testing.
# NOTE(review): load_boston was removed from recent scikit-learn releases --
# confirm the pinned version.
dataset = datasets.load_boston()
X_train, X_test, y_train, y_test = train_test_split(dataset['data'], dataset['target'], test_size=0.2, random_state=42)
alpha = 0.1
rdg_regress = reg.RidgeRegression()
rdg_regress.set_params(alpha=alpha)
models = [reg.LinearRegression(), rdg_regress]

# Test score and parameters of every model, keyed by its class name.
model_scores = {}
model_params = {}
for model in models:
    model.fit(X_train, y_train)
    model_name = model.__class__.__name__
    # Score once and reuse it for both the table and the message.
    test_score = model.score(X_test, y_test)
    model_scores[model_name] = test_score
    model_params[model_name] = model.get_params()
    print("The model is : {}. The R-square value in the test dataset is : {}.".
          format(model_name, test_score))

# The best model is the one with the highest test score.
best_model = max(model_scores, key=model_scores.get)
print("The best model is : {} \nParameters are : \n{}".format(
    best_model, model_params[best_model]))
# write('build/D2.tex', make_SI(D2*1e5, r'\kilogram\square\meter\per\square\second', 'e-5', figures=1)) y = 4*np.pi**2*(Theta_Kugel+Theta_Aufhaengung)/T**2 params = ucurve_fit(reg.reg_linear, B, y) # linearer Fit m, D = params write('build/m.tex', make_SI(m*1e3, r'\ampere\square\meter', 'e-3', figures=1)) write('build/D.tex', make_SI(D*1e5, r'\kilogram\square\meter\per\square\second', 'e-5', figures=1)) # D = 4*(np.pi**2)*(Theta_Kugel+Theta_Aufhaengung)/(T**2) m_th = 1/B * (4*(np.pi**2) * (Theta_Kugel+Theta_Aufhaengung) / T**2 - D_ohneB) m_th_unc = ufloat(np.mean(noms(m_th)), MeanError(noms(m_th))) write('build/m_th.tex', make_SI(m_th_unc*1e3, r'\ampere\square\meter', 'e-3', figures=1)) # print(m_th) t_plot = np.linspace(np.amin(B), np.amax(B), 100) # plt.plot(t_plot*1e3, reg.reg_linear(t_plot, *noms(params))*1e5, 'b-', label='Methode 1') plt.plot(t_plot*1e3, reg.reg_linear(t_plot, np.mean(noms(m_th)), np.mean(noms(D)))*1e5, 'g-', label='Methode 2') # plt.plot(B * 1e3, noms(y)*1e5, 'rx', label='Messdaten') plt.errorbar(B * 1e3, noms(y) * 1e5, fmt='r.', yerr=stds(y) * 1e5, label='Messdaten') ## plt.xscale('log') # logarithmische x-Achse plt.xlim((t_plot[0]-1/np.size(B)*(t_plot[-1]-t_plot[0]))*1e3, (t_plot[-1]+1/np.size(B)*(t_plot[-1]-t_plot[0]))*1e3) plt.xlabel(r'$B \:/\: \SI{e-3}{\tesla}$') plt.ylabel(r'$\frac{4\pi^2 \Theta_\text{Gesamt}}{T^2} \:/\: \SI{e-5}{\kilogram\square\meter\per\square\second}$') plt.legend(loc='best') plt.tight_layout(pad=0, h_pad=1.08, w_pad=1.08) plt.savefig('build/zeta.pdf') # Berechnung des Erdmagnetfeldes B_Erde = (D_mitB-D_ohneB)/m write('build/B_Erde.tex', make_SI(B_Erde*1e6, r'\micro\tesla', figures=1))
print(stds(Falldauer_unc)) # Plot # plt.plot(1/T_2*1e3, noms(eta_gr_b_log), 'rx', label='Messdaten') # plt.plot(t * 1e3, U * 1e3, 'rx', label='Messdaten') plt.errorbar(1/T_2*1e3, noms(eta_gr_b_log), yerr=stds(eta_gr_b_log),fmt='r.', label='Messdaten') # plt.xscale('log') # logarithmische x-Achse # plt.xlim(t_plot[0] * 1e3, t_plot[-1] * 1e3) # t_plot = np.linspace(np.amin(1/T_2), np.amax(1/T_2), 10) t_plot = np.linspace(0.003, 0.00345, 10) # plt.xlim(t_plot[0]-1/np.size(T_2)*(t_plot[-1]-t_plot[0]), t_plot[-1]+1/np.size(T_2)*(t_plot[-1]-t_plot[0])) # print('Max') print(np.amin(1/T_2), np.amax(1/T_2)) plt.plot(t_plot * 1e3, reg.reg_linear(t_plot, *noms(params)), 'b-', label='Fit') plt.xlabel(r'$\frac{1}{T} \:/\: \SI{e-3}{\per\kelvin}$') plt.ylabel(r'$\text{ln}\left(\frac{\eta}{\si{\kilogram\meter\per\second}}\right)$') plt.legend(loc='best') plt.tight_layout(pad=0, h_pad=1.08, w_pad=1.08) plt.savefig('build/Plot1.pdf') print(Falldauer_roh[1]) write('build/Tabelle_b_1.tex', make_table([T[1:], Falldauer_roh[1:]],[0, 0])) # FULLTABLE write('build/Tabelle_b_1_texformat.tex', make_full_table( 'Messdaten Falldauer in Abhängigkeit der Temperatur.', 'table:b_1', 'build/Tabelle_b_1.tex', [], # Hier aufpassen: diese Zahlen bezeichnen diejenigen resultierenden Spaltennummern,
from sklearn.model_selection import train_test_split from sklearn.linear_model import Ridge import matplotlib.pyplot as plt dataset = datasets.load_boston() X_train, X_test, y_train, y_test = train_test_split(dataset['data'], dataset['target'], test_size=0.2, random_state=42) alpha_list = np.arange(0.05,1,0.09) score_list = [] for a in alpha_list: #model = reg.RidgeRegression(alpha) model = reg.RidgeRegression(0.1) model.set_params(alpha = a) model.fit(X_train, y_train) y_predict = model.predict(X_test) score = model.score(X_test,y_test) score_list.append(score) plt.plot(alpha_list,score_list,label = 'Ridge Regression') score_list_l = [] for alpha in alpha_list: model = reg.LinearRegression() model.fit(X_train, y_train) y_predict = model.predict(X_test) score = model.score(X_test,y_test)
plt.axhline(y=85, color='k', linestyle='--', label='85%') plt.xticks(np.arange(1, features + 1, 1)) plt.xlabel('Number of Components') plt.ylabel('Variance Explained') plt.legend() plt.show() ##################### ##################### ## Linear Regression ##################### linear = R.LinearRegression() X_train, X_test, y_train, y_test = data.getSplitData() linear.train(features, X_train, X_test, y_train, y_test, n_jobs=1, verbose=True, startIndex=1) linear.fit(X, y) #func = linear.function(columnNames=['D','E', 'F', 'G', 'L', 'P', 'U', 'AA', 'AB', 'AD'], featureStartIndex = 3) #func = linear.function(columnNames=['D','E', 'F', 'G', 'P','W','X','Y','AA', 'AB', 'AD'], featureStartIndex = 3) linear.function(columnNames=[ feature_columns[letter - ord('A')]
plt.figure(1) plt.subplot(221) plt.plot(shortData['beta0']) plt.title('Beta 0') plt.subplot(222) plt.plot(shortData['CNBrepo']) plt.title('CNB repo rate') plt.subplot(223) plt.plot(shortData['y10Yforecast']) plt.title('10Y Yield forecast') plt.subplot(224) plt.plot(shortData['PriborSpread']) plt.title('Implied forward 1Y') plt.show() ols = Regression.EstimateOLS(shortData, 'beta0 ~ y10Yforecast + ImpFwd1Y + CNBrepo') if True: diffsdata = data['2011-02-01':].diff()[1:len(data)] #print(diffsdata) ols = Regression.EstimateOLS(diffsdata, 'beta0 ~ y10Yforecast + CNBrepo', False) #print(ols.summary()) #plt.figure(1) #plt.plot(diffsdata['beta0']) #plt.plot(ols.fittedvalues) #plt.show() shortData['b0_fitted'] = Regression.getFittedLevels( ols.fittedvalues, shortData.ix[0, 'beta0'], 'FittedBeta0') plt.figure(1) plt.plot(shortData['beta0'])
def regres(filename):
    """Predict value 0 from the given file and return it as plain text.

    A ``Regression.Regres`` is built from ``filename`` and
    ``predictValue(0)`` is called on it. The result is converted with
    ``str`` and any leading or trailing square brackets are stripped.
    """
    prediction = Regression.Regres(filename).predictValue(0)
    return str(prediction).strip('[]')
from sklearn import datasets
from sklearn.model_selection import train_test_split
# import regression classes
import Regression as Reg

# Load the Boston housing data and hold out 20% of it for testing.
# NOTE(review): load_boston was removed from recent scikit-learn releases --
# confirm the pinned version.
dataset = datasets.load_boston()
X_train, X_test, y_train, y_test = train_test_split(dataset['data'], dataset['target'], test_size=0.2, random_state=42)
alpha = 0.1
linreg = Reg.LinearRegression()
ridreg = Reg.RidgeRegression()
ridreg.set_params(alpha=alpha)
models = [linreg, ridreg]
# Test scores, in the same order as ``models``.
model_scores = []
for model in models:
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    model_scores.append(score)
    print(str(type(model).__name__) + " has R^2 score of: " + str(score))
# The best model is the one at the position of the highest score.
best_model = models[model_scores.index(max(model_scores))]
print("The best model is " + str(type(best_model).__name__))
print("And params for the best model are: ")
print(best_model.get_params())
def squaredErrorChoose(x,y,maxOrder):
    """Pick the polynomial order with the lowest training error.

    For each order in 1..maxOrder a polynomial is trained on ``(x, y)`` and
    scored on the same data with ``meanSquaredError``.

    Returns:
        ``(lowest error, order that achieved it)``.
    """
    errors = []
    for order in range(1, maxOrder + 1):
        model = Regression.polyTrain(x, y, order)
        errors.append(meanSquaredError(x, y, model))
    return min(errors), (argmin(errors) + 1)
# Sandbox.py
# Ashish D'Souza and Stephen Brown
# July 26th, 2018
# (The ASCII-art banner that followed -- apparently reading "Satellite Data
# Project" -- was collapsed onto one line and is not reproduced.)

import netCDF4
import Regression

# One emissions grid per month of 2010, plus the coordinate variables.
data = []
lat_array = []
lon_array = []
for i in range(1, 13):
    # Two-digit, zero-padded month number used in the file name.
    string = "%02d" % i
    dataset = netCDF4.Dataset(
        "C:/Users/skillsusa/Downloads/CH4/CH4_flux_2010" + string + "01.nc",
        "r")
    variables = dataset.variables
    data.append(variables["emissions"][0])
    # Overwritten every iteration; the last file's coordinates are used.
    lat_array = variables["Lat"]
    lon_array = variables["Lon"]

# NOTE(review): the meaning of 13, 3 and [0.5, 0.667] is defined by
# Regression.predict, which is not visible here.
Regression.predict(data, lat_array, lon_array, 13, 3, [0.5, 0.667],
                   "C:/Users/skillsusa/Downloads/Map.html")
# Problem 2 -- Model Scoring -- for Homework 3 of CS107 # Author: Max Li from sklearn import datasets from sklearn.model_selection import train_test_split import Regression as reg dataset = datasets.load_boston() X_train, X_test, y_train, y_test = train_test_split(dataset['data'], dataset['target'], test_size=0.2, random_state=42) linear_model = reg.LinearRegression() ridge_model = reg.RidgeRegression() ridge_model.set_params(alpha=0.1) models = [linear_model, ridge_model] scores = [] for model in models: model.fit(X_train, y_train) score = model.score(X_test, y_test) scores.append(score) print("R-squared: " + str(score)) print(model.get_params())
##model_performance.py import numpy as np from sklearn import datasets from sklearn.model_selection import train_test_split import matplotlib.pyplot as plt import Regression as myReg dataset = datasets.load_boston() X_train, X_test, y_train, y_test = train_test_split(dataset['data'], dataset['target'], test_size=0.2, random_state=42) alpha = 0.1 olsreg = myReg.LinearRegression() rigreg = myReg.RidgeRegression() rigreg.set_params(alpha=0.1) models = [olsreg, rigreg] alpha_array = np.logspace(-2, 1, 10) score_array_ols = np.zeros(alpha_array.shape) score_array_rig = np.zeros(alpha_array.shape) cnt = 0 for alpha_i in alpha_array: for model in models: model.set_params(alpha=alpha_i) model.fit(X_train, y_train)
def fun():
    """Train and evaluate a univariate SGD regression of happiness on GDP.

    Steps: load the two columns from ``date.txt`` in the current directory,
    split 80/20 at random, normalise the inputs, plot the data, fit
    ``Regression.MySGDRegression``, plot the fitted line against the
    training and the test data, and print the test mean squared error
    computed by hand and with ``mean_squared_error``.
    """

    def reference_points(values, count):
        """Return ``count - 1`` evenly spaced x values starting at min(values)."""
        low = min(values)
        step = (max(values) - low) / count
        points = []
        current = low
        for _ in range(1, count):
            points.append(current)
            current += step
        return points

    seed(1)

    crtDir = os.getcwd()
    filePath = os.path.join(crtDir, 'date.txt')
    inputs, outputs = loadDataSingleFeature(filePath, 'Economy..GDP.per.Capita.', 'Happiness.Score')
    print('in: ', inputs[:5])
    print('out: ', outputs[:5])

    # Random 80/20 split; a set keeps the membership test O(1) per index.
    indexes = list(range(len(inputs)))
    trainSample = np.random.choice(indexes, int(0.8 * len(inputs)), replace=False)
    train_lookup = set(trainSample.tolist())
    testSample = [i for i in indexes if i not in train_lookup]

    trainInputs = [inputs[i] for i in trainSample]
    testInputs = [inputs[i] for i in testSample]
    trainOutputs = [outputs[i] for i in trainSample]
    testOutputs = [outputs[i] for i in testSample]

    norm = stdNorm()
    # NOTE(review): the result of this first call is discarded; presumably
    # it records statistics of the full input set on ``norm`` -- confirm.
    norm.statisticalNormalisation(trainInputs + testInputs)
    # Normalisation of train data, then of test data.
    trainInputs = norm.statisticalNormalisation(trainInputs)
    testInputs = norm.statisticalNormalisation(testInputs)

    plotDataHistogram(trainInputs + testInputs, 'Capita GDP')
    plotDataHistogram(trainOutputs + testOutputs, 'Happiness score')
    plotData2D(trainInputs + testInputs, trainOutputs + testOutputs)

    # fit() expects a matrix of noSamples x noFeatures.
    regressor = Regression.MySGDRegression()
    regressor.fit([[el] for el in trainInputs], trainOutputs)
    w0, w1 = regressor.intercept_, regressor.coef_[0]

    noOfPoints = 50

    # Fitted line over the range of the training inputs.
    train_grid = reference_points(trainInputs, noOfPoints)
    train_line = [w0 + w1 * el for el in train_grid]
    plot2DModel(trainInputs, trainOutputs, train_grid, train_line)

    computedTestOutputs = regressor.predict([[el] for el in testInputs])

    # Fitted line over the range of the test inputs. The y values are
    # recomputed for this grid; they used to be reused from the training
    # grid, which paired each x with the wrong y.
    test_grid = reference_points(testInputs, noOfPoints)
    test_line = [w0 + w1 * el for el in test_grid]
    plot2DModel(testInputs, testOutputs, test_grid, test_line)  # "predictions vs real test data"

    # Mean squared error between the predictions and the real outputs.
    error = sum(((t1 - t2)**2
                 for t1, t2 in zip(computedTestOutputs, testOutputs)), 0.0)
    error = error / len(testOutputs)
    print("prediction error (manual): ", error)

    error = mean_squared_error(testOutputs, computedTestOutputs)
    print("prediction error (tool): ", error)
meanY = float(Y.mean().values) X = dataset.drop(columns=['ERP', 'PRP', 'vendor name', 'model name']) # # Separation between train dataset and test dataset with train_frac index_separation = int(data_lenght * train_frac) Xtrain = X.iloc[:index_separation] Ytrain = Y.iloc[:index_separation] Xtest = X.iloc[index_separation:] Ytest = Y.iloc[index_separation:] return Xtrain, Ytrain, Xtest, Ytest, meanY # Preparing the values and initiating the regression class # ---------------------------------------------------------- X, Y, Xtest, Ytest, meanY = prepareValues(verbose=True) Regression = rd.Regression(X, Y, verbose=True, unified=False) # # Training the model with X and Y sets # # ------------------------------------- print(tc.WARNING + "--> Training our regression model..." + tc.ENDC) Regression.train_model() print(tc.OKGREEN + " Training phase of the model finished!" + tc.ENDC) print(tc.OKGREEN + " Output model of the training (beta) :" + tc.ENDC) print(Regression.beta) # # Training the model with X and Y sets # # ------------------------------------- print(tc.WARNING + "--> Testing the model with the last 20% of the dataset!" + tc.ENDC) average_error = Regression.test_model(Xtest, Ytest) print(tc.OKGREEN + " Average error :" + tc.ENDC, average_error)