def test_linear_regression(self):
    """Fit perfectly linear data and check the recovered model.

    The inputs satisfy ``y == x``, so the test expects the fit to
    reproduce ``y_test``, with a coefficient of 1, an intercept of 0 and
    a Pearson correlation of 1.
    """
    X_test = DataFrame([1.5, 2.5, 3.5])
    y_test = DataFrame([1.5, 2.5, 3.5])

    fit = linear_regression(X_test, y_test)

    assert_frame_equal(y_test, DataFrame(fit.predict(X_test)))
    self.assertEqual(round(fit.coef_, 2), 1.0)
    self.assertEqual(round(fit.intercept_, 2), 0.0)

    r, _ = pearsonr(X_test.values, y_test.values)
    # The correlation is a computed float: compare with a tolerance
    # instead of demanding bit-exact equality with 1.0.
    self.assertAlmostEqual(r, 1.0)
def hypothesisTesting():
    """Run the hypothesis-2 anomaly tests on the first pair with relation 1.

    Every ordered pair ``(i, j)`` of series in ``ResList`` is looked up in
    ``CsvRelations``.  For the first pair whose relation code is ``1`` the
    correlation, slope-based and linear-regression detectors are run, the
    pair is plotted, and a text report using ``<br>`` as line separator is
    returned.  If no pair has relation ``1`` the function falls off the
    end and returns ``None``.
    """
    for i in range(CSVcount):
        for j in range(CSVcount):
            csv1 = ResList["CSV" + str(i + 1)]
            csv2 = ResList["CSV" + str(j + 1)]

            # Relation codes 0, -1 (hypothesis 1), 2 and -2 are recognised
            # by the design but have no handling; only code 1 does work.
            if CsvRelations[i][j] != 1:
                continue

            # Correlation results
            anomalies_from_correlation = anomaliesFromWindowCorrelationWithConstantlag(
                csv1, csv2, window_size=15, maxlag=15,
                positive_correlation=True, pos=1, neg=1)

            # Slope-based detection works on the second field of each row.
            data1 = [row[1] for row in csv1]
            data2 = [row[1] for row in csv2]
            slope_based = slopeBasedDetection(data1, False, data2, False)
            anomalies_from_slope_based = anomalyDatesSlopeBaseddetetion(slope_based, csv1)

            # The second element of the regression result is not used here.
            lr_based, _ = linear_regression(data1, data2, 1)
            anomalies_from_lr = anomalies_from_linear_regression(lr_based, csv1)

            # Assemble the report piece by piece and join once at the end.
            report = [
                "Anomalies from Correlation test <br>",
                "Start Date End Date Correlation Value<br>",
            ]
            for point in anomalies_from_correlation:
                report.append(
                    " ".join([str(point[0]), str(point[1]), str(point[2])]) + "<br>")

            report.append("Anomalies from Slope Based test <br>")
            report.append("Start Date End Date Slope Value <br>")
            for point in anomalies_from_slope_based:
                report.append(
                    " ".join([str(point[0]), str(point[1]), str(point[2])]) + " <br>")

            report.append("Anomalies from Linear Regression test<br>")
            report.append("Date X Val Y Val Expected Y Val Difference <br>")
            for point in anomalies_from_lr:
                report.append(
                    " ".join([str(point[0]), str(point[1]), str(point[2]),
                              str(point[3]), str(point[4])]) + "<br>")

            resultString = "".join(report)
            plotGraph(csv1, csv2, anomalies_from_correlation)
            return resultString


# Hypothesis 1 Methods
# Correlation
def hypothesis4Testing(numOfFiles, *timeSeriesFileNames): if len(timeSeriesFileNames) != numOfFiles: print "Number of files mentioned do not match the specified files provided" return csvDataList = [] # 2D list storing data of each file for fileName in timeSeriesFileNames: with open(fileName, "rb") as f: reader = csv.reader(f) csvData = map(tuple, reader) csvDataList.append(csvData) centresList = [] testData = [] temp1 = [] for i in csvDataList: td = getColumnFromListOfTuples(i, 2) # wholesale price, indexing starts from 1 testData.append(convertListToFloat(td)) temp1 = getColumnFromListOfTuples(i, 0) temp2 = getColumnFromListOfTuples(i, 2) temp = zip(temp1, temp2) centresList.append(temp) # print "testData" + str(testData) avgTimeSeries = findAverageTimeSeries(testData) avgTimeSeries = zip(temp1, avgTimeSeries) # print "Average Time Series :::::: "+ str(avgTimeSeries) for i, c_list in enumerate(centresList): # CALL SLOPE BASED slopeBasedResult = slopeBased(c_list, False, avgTimeSeries, False) slopeBasedResult = mergeDates(slopeBasedResult) # Correlation correlationResult = anomaliesFromWindowCorrelationWithConstantlag(c_list, avgTimeSeries) correlationResult = mergeDates(correlationResult) # Linear Regression lrResult = linear_regression(avgTimeSeries, c_list, 1) lrResult = mergeDates(lrResult) result = intersection( 3, slopeBasedResult, "slope_based", correlationResult, "correlation", lrResult, "linear_regression" ) print "Anomalies fior time-series " + str(i) + " are:" for (a, b, c) in result: print str(a) + "," + str(b) + "," + str(c)
import numpy as np from matplotlib import pyplot as plt from linear_regression import linear_regression from sklearn import linear_model, datasets n_samples = 1000 n_outliers = 50 X = np.random.uniform(-10,10,1000) y = X X = X + X * np.random.normal(0, 0.2, 1000) X = X.reshape((1000, 1)) print X.shape, y.shape model = linear_regression() model.fit(X, y) pred = model.predict(X) print model.coef plt.scatter(X, y, color='gold', marker='.') plt.plot(X.reshape((1000)), pred) plt.grid(True) plt.show()
least_fold, param['alpha'], test="False")) avg_train_rmse = (sum(train_cost) / len(train_cost)) lr.plt.title("RMSE vs Iterations for " + str(model) + " regularization") lr.plt.xlabel('Iterations', fontsize=18) lr.plt.ylabel('RMSE', fontsize=18) lr.plt.plot(lr.np.linspace(0, iterations, len(train_cost)), train_cost, 'r') lr.plt.show() lr_model.cost_func_val(least_fold + 1) print('Test RMSE Error for ' + str(model) + " " + str(lr_model.Val_rmse)) if (__name__ == "__main__"): lr_model = lr.linear_regression() # filename=lr_model.convert_data_to_csv('./abalone.data') dataset = lr.pd.read_csv('./q1.csv') lr_model.find_vectors_k_fold(dataset, 5) least_val = 10**5 least_fold = 0 print("K-folds created") for i in range(5): # lr_model.find_vector(dataset,fold_count,5) lr_model.optimise_weight_normal(i) lr_model.cost_func_train(i + 1) lr_model.cost_func_val(i + 1) if lr_model.Val_rmse < least_val: least_val = lr_model.Val_rmse least_fold = i print("Choose Fold " + str(least_fold + 1))
# def calc_potential_energy (self, xx): # potential_energy=torch.dot(xx,torch.matmul(self.weight_matrix,xx)) # return potential_energy #Regular run #print("Potential") # #Amat = torch.FloatTensor([[-2, 0, 0, 0], # [0, -2, 0, 0], # [0, 0, -2, 0], # [0, 0, 0, -2]]) dim = 4 bias = True c = linear_regression(dim, lamb=0.1, bias=bias) c.generate_data(200, scale=1, noise_db=-np.inf) ps = lambda x: -20 * c.get_regularized_loss(x) w, b = c.get_ground_truth(lr=2) if bias: init_position = np.append(w.numpy(), b) else: init_position = w hmc = sampler(position_dim=dim + bias, step_size=0.02, potential_struct=ps, T=0.1, init_position=init_position) #sample,rej_cnt = hmc.main_hmc_loop(1000)
def stats_calc(t, res, err, flog):
    """Report residual statistics before and after removing a linear trend.

    The residuals are screened with ``elim_wrms`` and a line is fitted to
    the retained points with ``linear_regression``, using epochs counted
    from 2000.0.  Mean, standard deviation, WRMS, slope and intercept are
    written to ``flog``; the fitted trend is then subtracted and the same
    quantities are written a second time.

    Parameters
    ----------
    t : epochs; indexed with the selection returned by ``elim_wrms``.
    res, err : residuals and their uncertainties, handed to ``elim_wrms``.
    flog : open text stream that receives the report.

    Returns
    -------
    tuple
        ``(slope, intercept)`` of the first fit.
    """
    # Reference epoch for the trend.
    t0 = 2000.0

    resn, errn, mean, wrms, std, cond = elim_wrms(res, err)
    tn = t[cond]
    (slope, intercept), (slperr, _), _, _ = linear_regression(tn - t0, resn, errn)

    weighted_report = [
        "# weighted\n",
        "# Mean : %.2f\n" % mean,
        "# Std : %.2f\n" % std,
        "# WRMS : %.2f\n" % wrms,
        "# Slope : %.2f +/- %.2f\n" % (slope, slperr),
        "# Intercept : %.2f " % intercept,
    ]
    print(*weighted_report, file=flog)
    print("STAS_ALL ", mean, std, wrms, slope, slperr, file=flog)

    # Statistics after removing the linear trend.  Mean/Std/WRMS come from
    # re-screening the full detrended series, while the second fit reuses
    # the points retained by the first screening.
    detrended_all = res - slope * (t - t0)
    detrended_kept = resn - slope * (tn - t0)
    _, _, mean1, wrms1, std1, _ = elim_wrms(detrended_all, err)
    (slope1, intercept1), (slperr1, _), _, _ = linear_regression(
        tn - t0, detrended_kept, errn)

    detrended_report = [
        "# After removing linear trend:\n",
        "# Mean : %.2f\n" % mean1,
        "# Std : %.2f\n" % std1,
        "# WRMS : %.2f\n" % wrms1,
        "# Slope : %.2f +/- %.2f\n" % (slope1, slperr1),
        "# Intercept : %.2f\n" % intercept1,
    ]
    print(*detrended_report, file=flog)
    print("STAS_AFTER ", mean1, std1, wrms1, slope1, slperr1, file=flog)

    return slope, intercept
# Make the modules under 'unsupervised/' importable before importing them.
sys.path.append('unsupervised/')
import preprocessing, linear_regression, logistic_regression, decision_tree, svm, k_means

# Scores returned by each model's entry point, in the order the models run.
best_score = []
counter = 0.0

# Call Preprocess class to format the data
preprocess = preprocessing.Preprocess()
# Uncomment the line below if need to preprocess the dataframe on run
#preprocess.preprocess()

# Supervised learning methods
# NOTE(review): each assignment below rebinds the imported module name to
# an instance of its class, so the module itself is no longer reachable
# under that name after the line runs.

# Linear Regression
linear_regression = linear_regression.LinearReg()
best_score.append(linear_regression.linear_regression())

# Logistic Regression
logistic_regression = logistic_regression.LogisticReg()
best_score.append(logistic_regression.logistic_regression())

# Decision Tree
decision_tree = decision_tree.DeciTree()
best_score.append(decision_tree.decision_tree())

# Unsupervised learning methods
# NOTE(review): support vector machines are normally a supervised method;
# confirm this section heading is intended.

# Support Vector Machines
svm = svm.Svm()
best_score.append(svm.svmachines())

# K-means
daily_df = daily_gb.sort_values(by=['date'], ignore_index=True) state_dfs[state_code] = daily_df #%% Add US and states into single entity # to simplify plotting and calcuations entity_df_dict.update(state_dfs) entity_codes.extend(state_codes) entity_names.extend(list(state_names_dict.values())) entity_names_dict = dict(zip(entity_codes, entity_names)) #%% plots! for entity_code, entity_df in entity_df_dict.items(): plot_daily_data_cum(entity_df, title_str=entity_names_dict[entity_code], plot_trend=True) plot_daily_data_diff(entity_df, title_str=entity_names_dict[entity_code], plot_trend=False) #%% some regression tests for entity_code, entity_df in entity_df_dict.items(): df = entity_df[['datetime', 'positive']].copy().dropna() df = df[df['positive'] > 0].reset_index(drop=True) slope, intercept, r_value, std_err, y_hat = lr.linear_regression(df.index, np.log10(df['positive'])) days2double = doubling.days_to_double(slope) print(f"Days for positive cases in {entity_names_dict[entity_code]} to double => {days2double:0.2f}") df = entity_df[['datetime', 'death']].copy().dropna().reset_index(drop=True) slope, intercept, r_value, std_err, y_hat = lr.linear_regression(df.index, np.log10(df['death'])) days2double = doubling.days_to_double(slope) print(f"Days for deaths in {entity_names_dict[entity_code]} to double => {days2double:0.2f}") print()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from stand_functions import create_file, append_text, append_math, end_file

# Data file (in the 'data/' folder next to this script) and the positions
# of the columns used as x and y.
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data/')
data_file = 'offenes_experiment.csv'
x_col = 1
y_col = 3

data = pd.read_csv(os.path.join(data_dir, data_file), sep=',', header=0)
x_col_name = data.columns[x_col]
y_col_name = data.columns[y_col]
data, calculations = linear_regression(data, x_col_name, y_col_name)

# The 'Summe' and 'Mittelwert' rows are dropped before every plot.
summary_rows = ['Summe', 'Mittelwert']


def _measured(column):
    """Return ``data[column]`` without the two summary rows."""
    return data[column].drop(summary_rows)


# Measured points
plt.plot(_measured(x_col_name), _measured(y_col_name), 'rx')

# Regression line across the current x-range: y = a + b * x
axes = plt.gca()
x_vals = np.array(axes.get_xlim())
y_vals = calculations['a'] + calculations['b'] * x_vals
plt.plot(x_vals, y_vals, 'r--')

# Line through the calculated distances
plt.plot(_measured('Zeit (in s)'), _measured('berechnete Strecke (in m)'), 'bx-')

# Graphical enhancement
plt.xlabel('Zeit (in s)')
sum_3 = 0 for i in range(total_flavors): sum_1 += math.pow((predict[i] - actual[i]), 2) sum_2 += math.pow((predict[i]), 2) sum_3 += math.pow(actual[i], 2) score_1 = ( 1 - math.sqrt(sum_1 / total_flavors) / (math.sqrt(sum_2 / total_flavors) + math.sqrt(sum_3 / total_flavors))) return score_1 if __name__ == '__main__': history_data, future_data, sample_ps, sample_vm, dim_to_be_optimized, history_begin, predict_begin, predict_end, flavor_num = read_data( ) lse_model = linear_regression() predict = [] actual = [] for i in range(total_flavors): predict_list = [] # history_data[i] = avg_filter(history_data[i]) history_data[i] = get_pow(history_data[i], exponent) history_data[i] = batch_add(history_data[i], addition) x_train, y_train, x_last = create_dataset(history_data[i], 7, 1) x_train = gaussian_weighted(x_train) x_last = gaussian_weighted(x_last) lse_model.lse_fit(x_train, y_train) x_train.show() for j in range(predict_span):
] normal = pd.DataFrame({ 'x': [1, 2, 3, 4, 5, 6, 7], 'y': [1, 3, 2, 5, 3, 7, 5] }) normal.name = '' outlier = pd.DataFrame({ 'x': [1, 1, 5, 5, 25, 10], 'y': [1, 5, 1, 10, 10, 5] }) outlier.name = 'with outlier' dataframes = [normal, outlier] for df in dataframes: linear_regression(df) dfs = [df] * len(plots) pc = PlotContainer(plots, dfs) # now we want to successively create plots and overlay them over previous # ones for i in range(len(pc)): fname = 'Linear Regression {0} example part {1}'.format( pc.dfs[i].name, i) pc.graph(fname, directory='images', setup=shared_setup, start=0, stop=1 + i)
def test_linear_regression_can_learn_doubling(self):
    """Train on rows whose second entry is twice the first; expect 6 -> 12."""
    samples = np.array([[1.0, 2.0], [2.0, 4.0], [3.0, 6.0], [4.0, 8.0]])
    model = linr.linear_regression(samples)

    predicted = model.predict(np.array([[6.0]]))[0][0]

    self.assertAlmostEqual(predicted, 12.0, places=3)
run_linear_regression(100) # In[161]: run_linear_regression(100, 'noisy') # In[ ]: train_data = scipy.io.loadmat('data/poly_train.mat') test_data = scipy.io.loadmat('data/poly_test.mat') x_train = train_data['X'] y_train = train_data['y'] x_test = test_data['X_test'] y_test = test_data['y_test'] w = linear_regression(x_train, y_train) x_train = add_bias(x_train) x_test = add_bias(x_test) e_train = np.where(y_train * (w.T @ x_train) < 0)[0].shape[0] / len(y_train[0]) e_test = np.where(y_test * (w.T @ x_test) < 0)[0].shape[0] / len(y_test[0]) print('E_train is %f, E_test is %f.' % (e_train, e_test)) # In[171]: train_data = scipy.io.loadmat('data/poly_train.mat') test_data = scipy.io.loadmat('data/poly_test.mat') x_train = train_data['X'] y_train = train_data['y']
print "\n=== Naive Bayes CLassifier with Laplace Smoothing ===" c = NaiveBayesClassifier(SPAM, HAM, 1) result("SPAM", c.spam.p, 0.4) result("HAM", c.ham.p, 0.6) result("today|SPAM", c.spam.p_word("today"), 0.0476) result("today|HAM", c.ham.p_word("today"), 0.1111) result("SPAM|today is secret)", c.p_spam_given_phrase("today is secret"), 0.4858) from linear_regression import linear_regression, gaussian from scipy import matrix print "\n=== Linear Regression ===" x = [3, 4, 5, 6] y = [0, -1, -2, -3] (w0, w1), err = linear_regression(x, y) print "(w0=%.1f, w1=%.1f) err=%.2f" % (w0, w1, err) x = [2, 4, 6, 8] y = [2, 5, 5, 8] (w0, w1), err = linear_regression(x, y) print "(w0=%.1f, w1=%.1f) err=%.2f" % (w0, w1, err) x = matrix([[3], [4], [5], [6], [7]]) m, s = gaussian(x) print "m = %s" % str(m) print "s^2= %s" % str(s)
#!/usr/bin/env python3 # -*- coding: utf-8 -*- # File name: verify_codes.py """ Created on Tue Jan 9 15:26:08 2018 @author: Neo([email protected]) """ import numpy as np from scipy import stats from linear_regression import linear_regression # ----------------------------- FUNCTIONS ----------------------------- x = np.random.normal(0, 1, 100) y = 1.5 * x + 0.4 + np.random.normal(0, 0.5, 100) # err = np.ones_like(x) err = np.random.normal(0, 0.5, 100) pi = err**-2 slope, intercept, r_value, p_value, std_err = stats.linregress(x, y * pi) print(slope, intercept) par, err, outlier, cor = linear_regression(x, y, err) print(par) # --------------------------------- END --------------------------------
def test_linear_regression(self):
    """Fit four sample points; expect the pair (1.0, -0.95) back."""
    samples = ((0, -1), (1, 0.2), (2, 0.9), (3, 2.1))

    slope, intercept = linear_regression(samples)

    self.assertAlmostEqual(slope, 1.0)
    self.assertAlmostEqual(intercept, -0.95)
from linear_regression import linear_regression
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Widen pandas' display limits so the result table prints in full.
for option, limit in (('display.max_rows', 500),
                      ('display.max_columns', 500),
                      ('display.width', 1000)):
    pd.set_option(option, limit)

# Every column except "quality" is a covariate; "quality" is the target.
df = pd.read_csv("winequality-white.csv", sep=";")
covariates = df.drop(columns="quality").values
targets = df["quality"].values

beta, se_beta, lower_bounds, upper_bounds = linear_regression(
    covariates, targets)

# One row per coefficient: estimate, its bounds and its standard error.
result_table = pd.DataFrame.from_dict({
    "lower_bound_for_estimates": lower_bounds,
    "estimates": beta,
    "upper_bound_for_etimates": upper_bounds,
    "standard_errors": se_beta
})
print("Result table:")
display(result_table)

# Plot lower bounds, estimates and upper bounds on the same axes.
for series in (lower_bounds, beta, upper_bounds):
    plt.plot(series)
plt.title("Result plot")