def test_corr():
    "Test stats.corr"
    ds = datasets.get_uts()
    y = ds.eval("uts.x[:,:3]")
    x = ds.eval('Y.x')
    n_cases = len(y)
    df = n_cases - 2
    corr = stats.corr(y, x)
    p = stats.rtest_p(corr, df)
    for i in range(len(corr)):
        r_sp, p_sp = scipy.stats.pearsonr(y[:, i], x)
        assert corr[i] == pytest.approx(r_sp)
        assert p[i] == pytest.approx(p_sp)

    # NaN
    with warnings.catch_warnings():  # divide by 0
        warnings.simplefilter("ignore")
        assert stats.corr(np.arange(10), np.zeros(10)) == 0

    # perm
    y_perm = np.empty_like(y)
    for perm in permute_order(n_cases, 2):
        y_perm[perm] = y
        stats.corr(y, x, corr, perm)
        for i in range(len(corr)):
            r_sp, _ = scipy.stats.pearsonr(y_perm[:, i], x)
            assert corr[i] == pytest.approx(r_sp)
def test_corr():
    "Test stats.corr"
    ds = datasets.get_uts()
    y = ds.eval("uts.x[:,:3]")
    x = ds.eval('Y.x')
    n_cases = len(y)
    df = n_cases - 2
    corr = stats.corr(y, x)
    p = stats.rtest_p(corr, df)
    for i in xrange(len(corr)):
        r_sp, p_sp = scipy.stats.pearsonr(y[:, i], x)
        assert_almost_equal(corr[i], r_sp)
        assert_almost_equal(p[i], p_sp)

    # NaN
    r = stats.corr(np.arange(10), np.zeros(10))
    eq_(r, 0)

    # perm
    y_perm = np.empty_like(y)
    for perm in permute_order(n_cases, 2):
        y_perm[perm] = y
        stats.corr(y, x, corr, perm)
        for i in xrange(len(corr)):
            r_sp, _ = scipy.stats.pearsonr(y_perm[:, i], x)
            assert_almost_equal(corr[i], r_sp)
def test_corr():
    "Test stats.corr"
    ds = datasets.get_uts()
    y = ds.eval("uts.x[:,:3]")
    x = ds.eval('Y.x')
    n_cases = len(y)
    df = n_cases - 2
    corr = stats.corr(y, x)
    p = stats.rtest_p(corr, df)
    for i in range(len(corr)):
        r_sp, p_sp = scipy.stats.pearsonr(y[:, i], x)
        assert_almost_equal(corr[i], r_sp)
        assert_almost_equal(p[i], p_sp)

    # NaN
    with warnings.catch_warnings():  # divide by 0
        warnings.simplefilter("ignore")
        eq_(stats.corr(np.arange(10), np.zeros(10)), 0)

    # perm
    y_perm = np.empty_like(y)
    for perm in permute_order(n_cases, 2):
        y_perm[perm] = y
        stats.corr(y, x, corr, perm)
        for i in range(len(corr)):
            r_sp, _ = scipy.stats.pearsonr(y_perm[:, i], x)
            assert_almost_equal(corr[i], r_sp)
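# The tests above compare stats.corr against scipy.stats.pearsonr column by column.
# A minimal sketch of that column-wise Pearson r, and of the r-to-p conversion that
# stats.rtest_p presumably performs; this illustrates the quantity being tested, not
# the library's actual implementation.
import numpy as np
import scipy.stats

def columnwise_pearson(y, x):
    """Pearson r between x (shape (n,)) and every column of y (shape (n, k))."""
    ym = y - y.mean(axis=0)
    xm = x - x.mean()
    return ym.T.dot(xm) / np.sqrt((ym ** 2).sum(axis=0) * (xm ** 2).sum())

def rtest_p(r, df):
    """Two-sided p-value for Pearson r with df = n - 2 degrees of freedom."""
    t = r * np.sqrt(df / (1.0 - r ** 2))
    return 2 * scipy.stats.t.sf(np.abs(t), df)

# sanity check against scipy on random data
rng = np.random.RandomState(0)
y = rng.randn(20, 3)
x = rng.randn(20)
r = columnwise_pearson(y, x)
p = rtest_p(r, len(x) - 2)
for i in range(y.shape[1]):
    r_sp, p_sp = scipy.stats.pearsonr(y[:, i], x)
    assert np.allclose([r[i], p[i]], [r_sp, p_sp])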
def _corr_all_orig(self, pref):
    df = []
    for dim in self.myexp.dims:
        dim_data = load(pref='dis', exp=self.myexp.exp, suffix=dim)[dim]
        if dim_data.ndim == 3:
            dim_data = np.mean(dim_data, axis=0)
        for depth, model_name in self.myexp.models:
            self.myexp.set_model(model_name)
            dis = self.myexp.dissimilarity()
            layer = dis.keys()[-1]
            dis = dis[layer]
            corr = stats.corr(dis, dim_data, sel='upper')
            if self.myexp.bootstrap:
                print('bootstrapping stats...')
                bf = stats.bootstrap_resample(
                    dis, dim_data, func=stats.corr, ci=None, seed=0,
                    sel='upper', struct=self.dims[dim].ravel())
                for i, b in enumerate(bf):
                    df.append([dim, depth, model_name, layer, corr, i, b])
            else:
                df.append([dim, depth, model_name, layer, corr, 0, np.nan])
    df = pandas.DataFrame(df, columns=[
        'kind', 'depth', 'models', 'layer', 'correlation', 'iter', 'bootstrap'
    ])
    self.save(df, pref=pref)
    return df
def calc(names, data, item, r, d, model=RandomForestRegressor()):
    model = make_model(data, names, False, r, d, model)
    graph = util.sequence_dotbracket_to_graph(data[item][1], data[item][2])
    res = np.array(predict(model, graph))
    other = np.array(data[item][0])
    res, other = mask(res, other)
    value = corr(res, other)[0]
    #print '\t', len(data[item][1]), "\t", value
    return value
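# `mask` is a helper not shown above; from the call site it appears to align the
# prediction and reference vectors before pearsonr. A minimal sketch under the
# assumption that it simply drops positions where either value is missing:
import numpy as np

def mask(res, other):
    """Keep only positions where both arrays have finite values."""
    keep = np.isfinite(res) & np.isfinite(other)
    return res[keep], other[keep]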
def correlation_all(trees):
    from scipy.stats import pearsonr as corr
    # We are assuming each data set is normally distributed
    tscores_all = np.array([])
    rscores_all = np.array([])
    for tree in trees:
        tscores, rscores = tree.scores()
        tscores_all = np.concatenate((tscores_all, tscores))
        rscores_all = np.concatenate((rscores_all, rscores))
    return corr(tscores_all, rscores_all)
def corr(self):
    dis = self.dissimilarity()
    df = []
    nname = models.NICE_NAMES[self.model_name].lower()
    for dim in self.dims:
        dim_data = load(pref='dis', exp=self.exp, suffix=dim)
        if dim_data is None:
            name = self.model_name
            self.set_model(dim)
            dim_data = self.dissimilarity()
            self.set_model(name)
            if dim_data is None:
                raise Exception('dimension data %s cannot be obtained' % dim)
        dim_data = dim_data[dim]
        if dim_data.ndim == 3:
            dim_data = np.mean(dim_data, axis=0)
        struct = self.dims[dim] if self.exp in ['fonts', 'stefania'] else None
        if self.filter:
            dim_data = dim_data[self.sel][:, self.sel]
            struct = None
        for layer, data in dis.items():
            d = data[self.sel][:, self.sel] if self.filter else data
            corr = stats.corr(d, dim_data, sel='upper')
            if self.bootstrap:
                print('bootstrapping stats...')
                bf = stats.bootstrap_resample(d, dim_data, func=stats.corr,
                                              ci=None, seed=0, sel='upper',
                                              struct=struct)
                for i, b in enumerate(bf):
                    df.append([dim, nname, layer, corr, i, b])
            else:
                df.append([dim, nname, layer, corr, 0, np.nan])
    df = pandas.DataFrame(df, columns=[
        'kind', 'models', 'layer', 'correlation', 'iter', 'bootstrap'
    ])
    self.save(df, pref='corr')
    if self.task == 'run':
        self.plot_single(df, 'corr')
    return df
def get_corr(ts1, ts2, period):
    '''
    ts1 and ts2 are the two traces for computing the correlation
    period is how long the trace is in unit of day
    '''
    # resample the time-series to have equal length
    if len(ts1) == len(ts2):
        pass
    elif len(ts1) > len(ts2):
        ts1 = resample(ts1, len(ts2))
    else:
        ts2 = resample(ts2, len(ts1))

    # parameter for filter
    fs = float(len(ts1)) / (period * 24 * 3600)
    lowcut = 1.0 / (6 * 3600)   # 6hrs
    highcut = 1.0 / (20 * 60)   # 20mins
    out_ts1 = butter_bandpass_filter(ts1, lowcut, highcut, fs, order=3)
    out_ts2 = butter_bandpass_filter(ts2, lowcut, highcut, fs, order=3)

    # return the corr-coef
    return corr(out_ts1, out_ts2)[0]
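# `butter_bandpass_filter` is not shown above. A minimal sketch of the usual SciPy
# band-pass recipe it presumably follows (Butterworth design plus zero-phase
# filtering); the name and signature come from the call site, the use of filtfilt
# is an assumption.
from scipy.signal import butter, filtfilt

def butter_bandpass_filter(data, lowcut, highcut, fs, order=3):
    """Band-pass `data` between `lowcut` and `highcut` (Hz) at sampling rate `fs`."""
    nyq = 0.5 * fs  # Nyquist frequency
    b, a = butter(order, [lowcut / nyq, highcut / nyq], btype='band')
    return filtfilt(b, a, data)  # zero-phase filtering avoids phase lag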
def write_sig_data(round, clim_data, hindcast, ensemble):
    from scipy.stats import pearsonr as corr
    rho, p = corr(clim_data, hindcast)

    #_Write the round and model correlation
    line = '%i,%.2f,' % (round, rho)

    #_Get whole model hss and rpss, write to file.
    hss = HSS(clim_data, hindcast)
    rpss = RPSS(clim_data, ensemble)
    line = line + '%.2f,%.2f,' % (rpss['all'], hss['all'])

    #_Loop through driest/wettest 10/20/30 years and write information
    for n in [10, 20, 30]:
        hss = HSS(clim_data, hindcast, n, n)
        rpss = RPSS(clim_data, ensemble, n, n)
        line = line + '%.2f,%.2f,%.2f,%.2f,' % \
            (rpss['dry'], rpss['wet'], hss['dry'], hss['wet'])

    #_Remove the last comma and return to newline
    line = line[:-1] + '\n'
    return line
def test_ml(stock='F', forecast_out=5, month=None, day=None, year=2019,
            plot=False, volume=False):
    # Assume input day is valid trading day
    # Want to separate 1 percent of the data to forecast
    # Today info
    if (month == None or day == None):
        today = datetime.datetime.now()
        month = today.month
        day = today.day
    end_date = dt(year, month, day)
    trading_days = get_trading_days([2017, 2018, 2019])
    end_idx = np.where(end_date == trading_days)[0][0]
    end = trading_days[end_idx - forecast_out]
    new_start = trading_days[end_idx - forecast_out]
    new_end = trading_days[end_idx]

    # For prediction
    start = datetime.datetime(2016, 4, 1)
    df = read_data(stock, start, end)
    #df = web.DataReader(stock, 'yahoo', start, end)
    #print(df.index)
    df = read_data(stock, start, end)
    if (df.empty):
        #print("SHOULD BE EMPTY")
        return [0] * 10, "ERROR"
    df = df[df.index <= end]
    #print(df.tail(forecast_out))
    dfreg = df.loc[:, ['adjusted close', 'volume']]
    dfreg['HL_PCT'] = (df['high'] - df['low']) / df['adjusted close'] * 100.0
    dfreg['PCT_change'] = (df['adjusted close'] - df['open']) / df['open'] * 100.0

    # For volume testing
    if (volume):
        dfreg['adjusted close'] = dfreg['volume']

    dfreg['EMA'] = get_ema(dfreg, forecast_out)
    if (dfreg['EMA'].empty):
        return [0] * 10, "ERROR"
    dfreg['old close'] = dfreg['adjusted close']
    dfreg['adjusted close'] = dfreg['EMA']

    # For validation
    #print("NEW START: \t{}".format(new_start))
    #print("NEW END: \t{}".format(new_end))
    #print("VALIDATION START: {} END: {}\n".format(new_start, new_end))
    #new_df = web.DataReader(stock, 'yahoo', new_start, new_end)
    new_df = read_data(stock, new_start, new_end)
    #print("TESTING VALIDATION DATA")
    if (new_df.empty):
        return [0] * 10, "ERROR"
    #print(new_end)
    new_df = new_df[new_df.index <= new_end]
    #print(new_df)
    #exit(1)
    new_dfreg = new_df.loc[:, ['adjusted close', 'volume']]
    new_dfreg['HL_PCT'] = (new_df['high'] - new_df['low']) / new_df['adjusted close'] * 100.0
    new_dfreg['PCT_change'] = (new_df['adjusted close'] - new_df['open']) / new_df['open'] * 100.0

    # Drop missing value
    dfreg.fillna(value=-99999, inplace=True)
    new_dfreg.fillna(value=-99999, inplace=True)

    # Separating the label here, we want to predict the Adjclose
    forecast_col = 'adjusted close'
    dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)
    X = np.array(dfreg.drop(['label'], 1))

    # Scale X for linear regression
    X = preprocessing.scale(X)

    # Finally want late X and early X for model
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out]

    # Separate label and identify it as y
    y = np.array(dfreg['label'])
    y = y[:-forecast_out]

    # Training and testing sets
    X_train = X[:len(X) - forecast_out]
    X_test = X[len(X) - forecast_out:]
    y_train = y[:len(y) - forecast_out]
    y_test = y[len(y) - forecast_out:]

    # LinReg
    clfreg = LinearRegression(n_jobs=-1)
    # QuadReg2
    clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
    # QuadReg3
    clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
    # QuadReg4
    clfpoly4 = make_pipeline(PolynomialFeatures(4), Ridge())
    # QuadReg5
    clfpoly5 = make_pipeline(PolynomialFeatures(5), Ridge())
    # KNN Regression
    clfknn = KNeighborsRegressor(n_neighbors=2)
    # Bayesian Ridge
    clfbayr = BayesianRidge()
    # Neural Network
    clfmlp = MLPRegressor(hidden_layer_sizes=(100, 100, 100),
                          learning_rate='adaptive', solver='adam',
                          max_iter=5, verbose=False)
    # Random Forest Regressor
    clfrfr = RFR(n_estimators=15)
    # Support Vector Regressor
    clfsvr = SVR(gamma='auto')

    threads = []
    models = [
        clfreg, clfpoly2, clfpoly3, clfpoly4, clfpoly5, clfknn, clfbayr,
        clfrfr, clfsvr
    ]
    fits = [''] * len(models)
    for i in range(len(models)):
        process = Thread(target=fitting,
                         args=[models[i], X_train, y_train, fits, i],
                         name=stock)
        process.start()
        threads.append(process)
    for process in threads:
        process.join()

    start = time.time()
    try:
        reg_forecast = fits[0].predict(X_lately)
        poly2_forecast = fits[1].predict(X_lately)
        poly3_forecast = fits[2].predict(X_lately)
        poly4_forecast = fits[3].predict(X_lately)
        poly5_forecast = fits[4].predict(X_lately)
        try:
            knn_forecast = fits[5].predict(X_lately)
        except ValueError:
            #print("KNN ERROR: {}".format(stock))
            #print(X_lately)
            #print(X_lately.shape)
            knn_forecast = np.zeros(poly5_forecast.shape)
            #exit(1)
        bayr_forecast = fits[6].predict(X_lately)
        rfr_forecast = fits[7].predict(X_lately)
        svr_forecast = fits[8].predict(X_lately)
        mlp_forecast = fits[6].predict(X_lately)
    except AttributeError:
        #print("ISSUES WITH {}".format(stock))
        return [0] * 10, {}
        #print(fits)
        #print(threads)
        #print(X_train, y_train)
        #print(X, y)
        #print(stock)
        #print(dfreg)
        #exit(1)
    #mlp_forecast = clfmlp.predict(X_lately)

    # Set up dataframe
    dfreg['reg_forecast'] = np.nan
    dfreg['poly2_forecast'] = np.nan
    dfreg['poly3_forecast'] = np.nan
    dfreg['poly4_forecast'] = np.nan
    dfreg['poly5_forecast'] = np.nan
    dfreg['knn_forecast'] = np.nan
    dfreg['bayr_forecast'] = np.nan
    dfreg['mlp_forecast'] = np.nan
    dfreg['rfr_forecast'] = np.nan
    dfreg['svr_forecast'] = np.nan

    last_date = dfreg.iloc[-1].name
    last_unix = last_date
    next_unix = last_unix + datetime.timedelta(days=1)

    for i in zip(reg_forecast, poly2_forecast, poly3_forecast, poly4_forecast,
                 poly5_forecast, knn_forecast, bayr_forecast, mlp_forecast,
                 rfr_forecast, svr_forecast):
        next_date = next_unix
        next_unix += datetime.timedelta(days=1)
        dfreg.loc[next_date] = list(
            [np.nan for _ in range(len(dfreg.columns) - 10)] + list(i))

    #dfreg['mean_forecast'] = dfreg[['poly2_forecast', 'poly3_forecast']].mean(axis=1)
    #print(dfreg.tail(forecast_out+1))
    dfreg['mean_forecast'] = dfreg[[
        'reg_forecast', 'poly2_forecast', 'poly3_forecast', 'knn_forecast',
        'bayr_forecast',  # mlp_forecast,
        'rfr_forecast', 'svr_forecast'
    ]].mean(axis=1)

    as_list = dfreg.index.tolist()
    # I THINK THIS IS FIXED
    #print(as_list[-forecast_out-5:])
    #for asd in as_list[-forecast_out-1:]:
    #    print(asd)
    #print()
    #for asd in new_df.index.tolist():  #[:forecast_out]:
    #    print(asd)
    as_list[-forecast_out:] = new_df.index.tolist()[1:]
    try:
        dfreg.index = as_list
    except:
        print("DATA MISALIGNMENT FOR: {}".format(stock))
        #print(new_df)
        #print(dfreg.tail(forecast_out+1))
        #exit(1)
        return [0] * 10, {}
    #for asd in as_list[-forecast_out-5:]:
    #    print(asd)
    dfreg[-forecast_out:].index = new_df.index.tolist()[:forecast_out]
    #print(dfreg.tail(forecast_out+1))
    #return [None]*10, None
    #exit(1)

    #
    # Trying to do all combinations
    #
    forecasts = [
        'reg_forecast', 'poly2_forecast', 'poly3_forecast', 'poly4_forecast',
        'poly5_forecast', 'knn_forecast', 'bayr_forecast', 'rfr_forecast',
        'svr_forecast'
    ]

    if (plot):
        dfreg['old close'].tail(20).plot(figsize=(20, 12), lw=2)
        dfreg['adjusted close'].tail(20).plot(figsize=(20, 12), lw=2)
        dfreg['reg_forecast'].tail(20).plot(lw=0.5)
        dfreg['poly2_forecast'].tail(20).plot(lw=0.5)
        dfreg['poly3_forecast'].tail(20).plot(lw=0.5)
        dfreg['poly4_forecast'].tail(20).plot(lw=0.5)
        dfreg['poly5_forecast'].tail(20).plot(lw=0.5)
        dfreg['knn_forecast'].tail(20).plot(lw=0.5)
        dfreg['bayr_forecast'].tail(20).plot(lw=0.5)
        dfreg['mean_forecast'].tail(20).plot(c='k')
        #dfreg['mlp_forecast'].tail(20).plot()
        dfreg['rfr_forecast'].tail(20).plot(lw=0.5)
        dfreg['svr_forecast'].tail(20).plot(lw=0.5)

    new_dfreg['Actual close'] = new_df['adjusted close']
    if (plot):
        new_dfreg['Actual close'].tail(20).plot(c='g', lw=2)

    fit = np.polyfit([i for i in range(forecast_out)],
                     dfreg['mean_forecast'].values[-forecast_out:], deg=1)

    #print("CALCULATING CORRELATION BETWEEN METHOD AND ACTUAL")
    actual = new_dfreg['Actual close'].tail(forecast_out)

    highest_corr = 0
    best_comb = ''
    num_combs = 0
    correlations = []
    good_combinations = []
    #for j in range(1, 9):
    #    for comb in combinations(forecasts, j):
    #        num_combs += 1
    #        comb_dat = dfreg[[*list(comb)]].mean(axis=1).tail(forecast_out)
    #        new_correlation = corr(comb_dat, actual)[0]
    #        correlations.append(new_correlation)
    #        if (new_correlation > 0.4):
    #            good_combinations.append(comb)
    #        if (new_correlation > highest_corr):
    #            highest_corr = new_correlation
    #            best_comb = comb
    for comb in all_combinations:
        num_combs += 1
        comb_dat = dfreg[[*list(comb)]].mean(axis=1).tail(forecast_out)
        new_correlation = corr(comb_dat, actual)[0]
        correlations.append(new_correlation)
        if (new_correlation > 0.4):
            good_combinations.append(comb)
        if (new_correlation > highest_corr):
            highest_corr = new_correlation
            best_comb = comb

    reg_dat = dfreg['reg_forecast'].tail(forecast_out)
    reg_corr = corr(reg_dat, actual)
    #print("Linear Regression: {}".format(reg_corr))
    poly2_dat = dfreg['poly2_forecast'].tail(forecast_out)
    poly2_corr = corr(poly2_dat, actual)
    #print("Poly2: {}".format(poly2_corr))
    poly3_dat = dfreg['poly3_forecast'].tail(forecast_out)
    poly3_corr = corr(poly3_dat, actual)
    #print("Poly3: {}".format(poly3_corr))
    poly4_dat = dfreg['poly4_forecast'].tail(forecast_out)
    poly4_corr = corr(poly4_dat, actual)
    #print("Poly4: {}".format(poly4_corr))
    poly5_dat = dfreg['poly5_forecast'].tail(forecast_out)
    poly5_corr = corr(poly5_dat, actual)
    #print("Poly5: {}".format(poly5_corr))
    knn_dat = dfreg['knn_forecast'].tail(forecast_out)
    knn_corr = corr(knn_dat, actual)
    #print("K Nearest Neighbors: {}".format(knn_corr))
    bayr_dat = dfreg['bayr_forecast'].tail(forecast_out)
    bayr_corr = corr(bayr_dat, actual)
    #print("Bayesian: {}".format(bayr_corr))
    rfr_dat = dfreg['rfr_forecast'].tail(forecast_out)
    rfr_corr = corr(rfr_dat, actual)
    #print("Random Forest: {}".format(rfr_corr))
    svr_dat = dfreg['svr_forecast'].tail(forecast_out)
    svr_corr = corr(svr_dat, actual)
    #print("Support Vector: {}".format(svr_corr))
    mean_dat = dfreg['mean_forecast'].tail(forecast_out)
    mean_corr = corr(mean_dat, actual)

    if (plot):
        plt.legend(loc='best')
        plt.xlabel('Date')
        plt.ylabel('Price')
        plt.title(stock)
        plt.savefig("./test_plots/{1}_{2}/{0}_{1}_{2}_{3}".format(
            stock, month, day, forecast_out))
        plt.close()

    return (reg_corr[0], poly2_corr[0], poly3_corr[0], poly4_corr[0], poly5_corr[0],
            knn_corr[0], bayr_corr[0], rfr_corr[0], mean_corr[0], svr_corr[0]), good_combinations
def crossvalpcr(self, xval=True, debug=False):
    # Must set phase with bootcorr, and then use crossvalpcr, as it just uses the corr_grid attribute
    import numpy as np
    from numpy import array
    from scipy.stats import pearsonr as corr
    from scipy.stats import linregress
    from matplotlib import pyplot as plt
    from atmos_ocean_data import weightsst

    predictand = self.clim_data

    if self.corr_grid.mask.sum() >= len(self.sst.lat) * len(self.sst.lon) - 4:
        yhat = np.nan
        e = np.nan
        index = self.clim_data.index
        hindcast = pd.Series(data=yhat, index=index)
        error = pd.Series(data=e, index=index)
        self.correlation = np.nan
        self.hindcast = np.nan
        self.hindcast_error = np.nan
        self.flags['noSST'] = True
        return

    self.flags['noSST'] = False
    sstidx = self.corr_grid.mask == False
    n = len(predictand)
    yhat = np.zeros(n)
    e = np.zeros(n)
    idx = np.arange(n)
    params = []
    std_errs = []
    p_vals = []
    t_vals = []

    if not xval:
        rawSSTdata = weightsst(self.sst).data
        rawdata = rawSSTdata[:, sstidx]
        cvr = np.cov(rawdata.T)
        eigval, eigvec = np.linalg.eig(cvr)
        eigvalsort = np.argsort(eigval)[::-1]
        eigval = eigval[eigvalsort]
        eigval = np.real(eigval)
        ncomp = 1
        eof_1 = eigvec[:, :ncomp]  #_fv stands for Feature Vector, in this case EOF-1
        eof_1 = np.real(eof_1)
        pc_1 = eof_1.T.dot(rawdata.T).squeeze()
        self.pc1 = pc_1
        return pc_1

    for i in idx:
        test = idx == i
        train = idx != i
        rawSSTdata = weightsst(self.sst).data[train]
        droppedSSTdata = weightsst(self.sst).data[test]
        rawdata = rawSSTdata[:, sstidx]
        dropped_data = droppedSSTdata[:, sstidx].squeeze()
        #U, s, V = np.linalg.svd(rawdata)
        #pc_1 = V[0,:]  #_Rows of V are principal components
        #eof_1 = U[:,0].squeeze()  #_Columns are EOFS
        #EIGs = s**2  #_s is square root of eigenvalues
        cvr = np.cov(rawdata.T)
        eigval, eigvec = np.linalg.eig(cvr)
        eigvalsort = np.argsort(eigval)[::-1]
        eigval = eigval[eigvalsort]
        eigval = np.real(eigval)
        ncomp = 1
        eof_1 = eigvec[:, :ncomp]  #_fv stands for Feature Vector, in this case EOF-1
        eof_1 = np.real(eof_1)
        pc_1 = eof_1.T.dot(rawdata.T).squeeze()
        slope, intercept, r_value, p_value, std_err = linregress(pc_1, predictand[train])
        predictor = dropped_data.dot(eof_1)
        yhat[i] = slope * predictor + intercept
        e[i] = predictand[i] - yhat[i]
        params.append(slope)
        std_errs.append(std_err)
        p_vals.append(p_value)
        t_vals.append(slope / std_err)

    r, p = corr(predictand, yhat)
    index = self.clim_data.index
    hindcast = pd.Series(data=yhat, index=index)
    error = pd.Series(data=e, index=index)
    self.hindcast = hindcast
    self.hindcast_error = error
    self.correlation = round(r, 2)
    self.reg_stats = {'params': array(params),
                      'std_errs': array(std_errs),
                      't_vals': array(t_vals),
                      'p_vals': array(p_vals)}
    return
def crossvalpcr(self, fig=None, ax=None, phase='allyears', onlySST=False, debug=False):
    # Must set phase with bootcorr, and then use crossvalpcr, as it just uses the corr_grid attribute
    import numpy as np
    from scipy.stats import pearsonr as corr
    from scipy.stats import linregress
    from matplotlib import pyplot as plt
    """
    if fig == None:
        fig = plt.figure()
        ax = fig.add_subplot(111)
    """
    # Set up predictand from climate data
    predictand = self.clim_data[phase]
    #predictand = (predictand - predictand.mean())/predictand.std()
    #self.predictand[phase] = predictand

    # Get an index of every significantly correlated gridpoint for the predictor fields
    if self.corr_grid['sst'][phase].mask.sum() >= 16019:
        print ('No sig SST or SLP')
        yhat = np.zeros(len(self.clim_data[phase]))
        e = np.zeros(len(self.clim_data[phase]))
        index = self.clim_data[phase].index
        hindcast = pd.Series(data=yhat, index=index)
        error = pd.Series(data=e, index=index)
        self.hindcast[phase] = hindcast
        self.hindcast_error[phase] = error
        self.nosigSST = True
        return
    else:
        self.nosigSST = False

    sstidx = self.corr_grid['sst'][phase].mask == False
    # if self.corr_grid['slp'][phase].mask.sum() == 2664:
    #     onlySST = True
    if not onlySST:
        slpidx = self.corr_grid['slp'][phase].mask == False

    # Set up some empty variables
    n = len(predictand)
    yhat = np.zeros(n)
    e = np.zeros(n)
    idx = np.arange(n)

    for i in idx:
        test = idx == i
        train = idx != i
        rawSSTdata = self.sst[phase][train]
        rawSLPdata = self.slp[phase][train]
        droppedSSTdata = self.sst[phase][test]
        droppedSLPdata = self.slp[phase][test]
        if onlySST:
            rawdata = rawSSTdata[:, sstidx]  #.T
            dropped_data = droppedSSTdata[:, sstidx].squeeze()
        else:
            rawdata = np.concatenate((rawSSTdata[:, sstidx],
                                      rawSLPdata[:, slpidx]), axis=1)  #.T
            dropped_data = np.concatenate((droppedSSTdata[:, sstidx],
                                           droppedSLPdata[:, slpidx]), axis=1)  #.T.squeeze()
        #U, s, V = np.linalg.svd(rawdata)
        #pc_1 = V[0,:]  #_Rows of V are principal components
        #eof_1 = U[:,0].squeeze()  #_Columns are EOFS
        #EIGs = s**2  #_s is square root of eigenvalues
        cvr = np.cov(rawdata.T)
        eigval, eigvec = np.linalg.eig(cvr)
        eigvalsort = np.argsort(eigval)[::-1]
        eigval = eigval[eigvalsort]
        eigval = np.real(eigval)
        ncomp = 1
        eof_1 = eigvec[:, :ncomp]  #_fv stands for Feature Vector, in this case EOF-1
        eof_1 = np.real(eof_1)
        pc_1 = eof_1.T.dot(rawdata.T).squeeze()
        slope, intercept, r_value, p_value, std_err = linregress(pc_1, predictand[train])
        predictor = dropped_data.dot(eof_1)
        yhat[i] = slope * predictor + intercept
        e[i] = predictand[i] - yhat[i]

    c = corr(predictand, yhat)
    """
    ax.scatter(predictand, yhat)
    ax.set_title('%s, r = %f' % (phase, round(c[0],2)))
    ax.axis([0,15,0,15])
    """
    index = self.clim_data[phase].index
    hindcast = pd.Series(data=yhat, index=index)
    error = pd.Series(data=e, index=index)
    self.hindcast[phase] = hindcast
    self.hindcast_error[phase] = error
    return fig, ax
def bootcorr(self, n=100, fig=None, ax=None, field='sst',
             phase='allyears', corrconf=0.9, bootconf=0.9, cbloc='bottom',
             quick=False, debug=False, monte=False):
    from numpy import meshgrid, zeros, ma, isnan, linspace
    import time
    from random import sample

    if field == 'sst':
        fieldData = self.sst[phase]
    if field == 'slp':
        fieldData = self.slp[phase]
    clim_data = self.clim_data[phase]
    corrlevel = 1 - corrconf

    corr_grid = vcorr(X=fieldData, y=clim_data)
    n_yrs = len(clim_data)
    p_value = sig_test(corr_grid, n_yrs)

    # Mask insignificant gridpoints
    corr_grid = ma.masked_array(corr_grid, ~(p_value < corrlevel))
    # Mask land
    corr_grid = ma.masked_array(corr_grid, isnan(corr_grid))
    # Mask northern/southern ocean
    corr_grid.mask[self.lat[field] > 60] = True
    corr_grid.mask[self.lat[field] < -30] = True

    ### SAVE THE MASK TO FILTER THE BOOTSTRAP
    mask = corr_grid.mask

    if quick:
        self.corr_grid[field][phase] = corr_grid
        return

    ### SET UP INDICES FOR FIELD DATA ###
    ### INITIALIZE A NEW CORR GRID ###
    nlat = fieldData.shape[1]
    nlon = fieldData.shape[2]
    count = np.zeros((nlat, nlon))
    ntim = n
    dat = clim_data
    mask = corr_grid.mask
    if debug:
        print 'Starting %s' % phase
    for boot in xrange(ntim):
        if debug:
            print 'starting round %i' % boot
        ### SHUFFLE THE YEARS AND CREATE THE BOOT DATA ###
        idx = np.random.randint(0, len(dat) - 1, len(dat))
        bootdata = np.zeros((len(idx), nlat, nlon))
        bootdata[:] = fieldData[idx]
        bootvar = np.zeros((len(idx)))
        bootvar = dat[idx]
        corr_grid_boot = vcorr(X=bootdata, y=bootvar)
        n_yrs = len(bootvar)
        p_value = sig_test(corr_grid_boot, n_yrs)
        count[p_value <= corrlevel] += 1
        if debug:
            print 'Count max is %i' % count.max()
        # for lon, lat in zip(xx[~mask], yy[~mask]):
        #     c, p = corr(bootdata[:,lat,lon], bootvar)
        #     if p <= corrlevel:
        #         count[lat,lon] += 1

    ### GET THE ACTUAL CORRELATION AGAIN
    #_Mask insignificant values
    ### CREATE MASKED ARRAY USING THE COUNT AND BOOTCONF ATTRIBUTES
    corr_grid = np.ma.masked_array(corr_grid, count < bootconf * ntim)
    self.corr_grid[field][phase] = corr_grid

    if monte:
        n_phase = len(clim_data)
        n_total = len(self.clim_data['allyears'])
        count = np.zeros((nlat, nlon))
        field = self.sst[phase]
        for t in xrange(100):
            #print 'Starting monte round %i' % t
            idx = sample(xrange(n_total), n_phase)
            var = self.clim_data['allyears'][idx]
            for lon, lat in zip(xx[~mask], yy[~mask]):
                r, p = corr(var, field[:, lat, lon])
                if p <= (1 - corrconf):
                    #print 'Entering while loop for grid %i, %i' % (i, j)
                    x = 0
                    c2 = 0
                    while x < 100:
                        #print 'Monte round %i' % x
                        idx2 = np.random.randint(0, n_phase - 1, n_phase)
                        x += 1
                        r, p = corr(var[idx2], field[idx2, lat, lon])
                        if p <= (1 - corrconf):
                            c2 += 1
                    if c2 >= 80:
                        count[lat, lon] += 1
        print 'Count max is %.0f' % (count.max())
        self.monte_count[phase] = count.max()
        self.monte_grid[phase] = np.ma.masked_array(
            data=count, mask=self.corr_grid['sst'][phase].mask)
    return
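# `vcorr` and `sig_test` are helpers not shown above. A minimal sketch of what a
# vectorized grid correlation and its t-based significance test could look like,
# assuming X has shape (time, lat, lon) and y has shape (time,); the names and
# signatures are taken from the call sites, the internals are an assumption.
import numpy as np
from scipy.stats import t as t_dist

def vcorr(X, y):
    """Pearson r between y and every grid cell of X, computed in one pass."""
    Xm = X - X.mean(axis=0)
    ym = y - y.mean()
    cov = (Xm * ym[:, None, None]).sum(axis=0)
    denom = np.sqrt((Xm ** 2).sum(axis=0) * (ym ** 2).sum())
    return cov / denom  # shape (lat, lon)

def sig_test(r, n):
    """Two-sided p-value for Pearson r with n samples (t distribution, df = n - 2)."""
    t_stat = r * np.sqrt(n - 2) / np.sqrt(1.0 - r ** 2)
    return 2 * t_dist.sf(np.abs(t_stat), n - 2)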
def crossvalpcr(self, xval=True, debug=False):
    # Must set phase with bootcorr, and then use crossvalpcr, as it just uses the corr_grid attribute
    import numpy as np
    from numpy import array
    from scipy.stats import pearsonr as corr
    from scipy.stats import linregress
    from matplotlib import pyplot as plt
    from utils import weightsst

    predictand = self.clim_data

    if self.corr_grid.mask.sum() >= len(self.sst.lat) * len(self.sst.lon) - 4:
        yhat = np.nan
        e = np.nan
        #index = self.clim_data.index
        index = self.mei
        hindcast = pd.Series(data=yhat, index=index)
        error = pd.Series(data=e, index=index)
        self.correlation = np.nan
        self.hindcast = np.nan
        self.hindcast_error = np.nan
        self.flags['noSST'] = True
        return

    self.flags['noSST'] = False
    sstidx = self.corr_grid.mask == False
    n = len(predictand)
    yhat = np.zeros(n)
    e = np.zeros(n)
    idx = np.arange(n)
    params = []
    std_errs = []
    p_vals = []
    t_vals = []

    if not xval:
        rawSSTdata = weightsst(self.sst).data
        rawdata = rawSSTdata[:, sstidx]
        cvr = np.cov(rawdata.T)
        eigval, eigvec = np.linalg.eig(cvr)
        eigvalsort = np.argsort(eigval)[::-1]
        eigval = eigval[eigvalsort]
        eigval = np.real(eigval)
        ncomp = 1
        eof_1 = eigvec[:, :ncomp]  #_fv stands for Feature Vector, in this case EOF-1
        eof_1 = np.real(eof_1)
        pc_1 = eof_1.T.dot(rawdata.T).squeeze()
        slope, intercept, r, p, err = linregress(pc_1, predictand)
        yhat = slope * pc_1 + intercept
        self.pc1 = pc_1
        self.correlation = r
        self.hindcast = yhat
        return

    for i in idx:
        test = idx == i
        train = idx != i
        rawSSTdata = weightsst(self.sst).data[train]
        droppedSSTdata = weightsst(self.sst).data[test]
        rawdata = rawSSTdata[:, sstidx]
        dropped_data = droppedSSTdata[:, sstidx].squeeze()
        #U, s, V = np.linalg.svd(rawdata)
        #pc_1 = V[0,:]  #_Rows of V are principal components
        #eof_1 = U[:,0].squeeze()  #_Columns are EOFS
        #EIGs = s**2  #_s is square root of eigenvalues
        cvr = np.cov(rawdata.T)
        #print cvr.shape
        eigval, eigvec = np.linalg.eig(cvr)
        eigvalsort = np.argsort(eigval)[::-1]
        eigval = eigval[eigvalsort]
        eigval = np.real(eigval)
        ncomp = 1
        eof_1 = eigvec[:, :ncomp]  #_fv stands for Feature Vector, in this case EOF-1
        eof_1 = np.real(eof_1)
        pc_1 = eof_1.T.dot(rawdata.T).squeeze()
        slope, intercept, r_value, p_value, std_err = linregress(pc_1, predictand[train])
        predictor = dropped_data.dot(eof_1)
        yhat[i] = slope * predictor + intercept
        e[i] = predictand[i] - yhat[i]
        params.append(slope)
        std_errs.append(std_err)
        p_vals.append(p_value)
        t_vals.append(slope / std_err)

    r, p = corr(predictand, yhat)
    hindcast = yhat
    error = e
    self.hindcast = hindcast
    self.hindcast_error = error
    self.correlation = round(r, 2)
    self.reg_stats = {'params': array(params),
                      'std_errs': array(std_errs),
                      't_vals': array(t_vals),
                      'p_vals': array(p_vals)}
    return
def ksi_func(pp, tt):
    ppx, ppy = pp.T
    rat_x = corr(ppx, tt.flatten())[0] ** 2
    rat_y = corr(ppy, tt.flatten())[0] ** 2
    return 1. / rat_x + 1. / rat_y
plt.plot(range(1, ph + 1), tmp[::2], c='red', label='ask')
plt.plot(range(1, ph + 1), tmp[1::2], c='green', label='bid')
plt.yscale('log')
plt.xlabel('$h$')
plt.ylabel('Jarque-Bera statistic for $h$-step log returns')
for type in ['pdf', 'png']:
    plt.savefig(f'{path_save}/log_returns_normality.{type}',
                bbox_inches='tight', dpi=300)
plt.show()

# %% Correlation
tmp = res
cor_ask = np.array(
    [corr(res[:, i], res[:, 0])[0] for i in range(0, res.shape[1], 2)])
cor_bid = np.array(
    [corr(res[:, i], res[:, 1])[0] for i in range(1, res.shape[1], 2)])
cor_ask_bid = np.array(
    [corr(res[:, i], res[:, 1])[0] for i in range(0, res.shape[1], 2)])
cor_bid_ask = np.array(
    [corr(res[:, i], res[:, 0])[0] for i in range(1, res.shape[1], 2)])

plt.plot(range(0, ph), cor_ask, c='red', label='ask')
plt.plot(range(0, ph), cor_bid, c='green', label='bid')
plt.plot(range(0, ph), cor_ask_bid, c='red', label='ask2', linestyle='dashed')
plt.plot(range(0, ph), cor_bid_ask, c='green',
def crossvalpcr(self, xval=True, debug=False):
    # Must set phase with bootcorr, and then use crossvalpcr, as it just uses the corr_grid attribute
    import numpy as np
    import statsmodels.api as sm
    from numpy import array, ndarray, hstack, zeros, vstack
    from scipy.stats import pearsonr as corr
    from scipy.stats import linregress
    from matplotlib import pyplot as plt
    from atmos_ocean_data import weightsst

    predictand = self.clim_data.copy()
    pcs = self.pcs.copy()
    n = len(predictand)
    yhat = np.zeros(n)
    e = np.zeros(n)
    params = []
    std_errs = []
    p_vals = []
    t_vals = []

    rnd = 0
    ncomps, nt = pcs.shape
    selection_index = range(ncomps)
    xval_idx = np.arange(nt)
    best_score = 0.01
    scores = [0]
    overall_index = []
    final_pcs = zeros((nt))

    while best_score >= scores[rnd]:
        score = zeros((len(selection_index)))
        # print 'Round %i' % rnd
        for ind, index in enumerate(selection_index):
            # print 'Checking pc-%i: ' % index
            # print overall_index + [ind]
            data = pcs[overall_index + [ind]]
            # if rnd == 3: import pdb; pdb.set_trace()
            for i in xval_idx:
                test = xval_idx == i
                train = xval_idx != i
                rawdata = data.T[train]
                dropped_data = data.T[test]
                X = sm.add_constant(data.T[train])
                y = predictand[train]
                olsmod = sm.OLS(y, X)
                olsres = olsmod.fit()
                intercept = array([[0]])
                yhat[i] = olsres.predict(hstack((intercept, dropped_data)))
                e[i] = predictand[i] - yhat[i]
                # params.append(slope); std_errs.append(std_err); p_vals.append(p_value)
                # t_vals.append(slope/std_err)
            r, p = corr(predictand, yhat)
            score[ind] = abs(r)
            # print 'score is %.2f' % abs(r)
        if max(score) > best_score:
            best_score = max(score)
            best_loc = np.where(score == max(score))[0][0]
            best_pc = selection_index[best_loc]
            overall_index.append(best_pc)
            # print overall_index
            # print pcs.shape
            selection_index = np.delete(selection_index, best_loc)
            # print selection_index
            final_pcs = vstack((final_pcs, pcs[best_loc]))
            scores.append(best_score)
        else:
            break
        rnd += 1

    self.overall_index = overall_index

    ### NOW REBUILD BEST MODEL ###
    data = final_pcs[1:]
    #print 'Overall index is ', overall_index
    for i in xval_idx:
        test = xval_idx == i
        train = xval_idx != i
        rawdata = data.T[train]
        dropped_data = data.T[test]
        X = sm.add_constant(data.T[train])
        y = predictand[train]
        olsmod = sm.OLS(y, X)
        olsres = olsmod.fit()
        intercept = array([[0]])
        yhat[i] = olsres.predict(hstack((intercept, dropped_data)))
        e[i] = predictand[i] - yhat[i]
        # params.append(slope); std_errs.append(std_err); p_vals.append(p_value)
        # t_vals.append(slope/std_err)

    ### Okay, best first predictor has been obtained... Now need to keep adding
    r, p = corr(predictand, yhat)
    index = self.clim_data.index
    hindcast = pd.Series(data=yhat, index=index)
    error = pd.Series(data=e, index=index)
    self.hindcast = hindcast
    self.hindcast_error = error
    self.correlation, _ = corr(self.clim_data, self.hindcast)
    # self.reg_stats = {'params': array(params),
    #                   'std_errs': array(std_errs),
    #                   't_vals': array(t_vals),
    #                   'p_vals': array(p_vals)}
    return
def main():
    # To plot yost v theoretical
    # arrPitch = []
    # arrRoll = []
    # arrAx = []
    # arrAy = []

    # imu = int(input("Enter 0 for YOST IMU, 1 for MPU6050:\n"))
    # fileName = input("Enter the acceleration reading file path and name:\n")
    experiments = [
        "../Data/YOST_stewart_0degPitch_10sPeriod_test_1.txt",
        "../Data/MPU6050_stewart_0degPitch_10sPeriod_test_1.txt",
        "../Data/YOST_stewart_0degPitch_20sPeriod_test_2.txt",
        "../Data/MPU6050_stewart_0degPitch_20Period_test_2.txt",
        "../Data/YOST_stewart_20degPitch_20sPeriod_test_3.txt",
        "../Data/MPU6050_stewart_20degPitch_20Period_test_3.txt"
    ]
    plot = True
    displacements = []
    sigWaveHeights = []

    for i in range(0, 6):
        arrAz = []
        totalTime = 0
        imu = i % 2  # YOST => 0, MPU6050 => 1
        with open(experiments[i]) as f:
            # Valid files
            # YOST_stewart_0degPitch_10sPeriod_test_1.txt
            # YOST_stewart_0degPitch_20sPeriod_test_2.txt
            # YOST_stewart_20degPitch_20sPeriod_test_3.txt
            # MPU6050_stewart_0degPitch_10sPeriod_test_1.txt
            # MPU6050_stewart_0degPitch_20Period_test_2.txt
            # MPU6050_stewart_20degPitch_20Period_test_3.txt

            # If YOST IMU (imu = 0)
            # Data format: "%int(Month)/%int(Day)/%int(Year),%int(Hours):%int(Minutes):%float(Seconds),
            #               %float(OrientPitch),%float(OrientYaw),%float(OrientRoll),
            #               %float(CorrectedGyroX),%float(CorrectedGyroY),%float(CorrectedGyroZ),
            #               %float(CorrectedAccelX),%float(CorrectedAccelY),%float(CorrectedAccelZ),
            #               %float(CorrectedMagX),%float(CorrectedMagY),%float(CorrectedMagZ)"
            if (imu == 0):
                f.readline()  # Read in first line - this is the format
                # Get values from file
                startTime = 0
                endTime = 0
                for line in f:
                    row = line.split(',')
                    # Get start time
                    if (startTime == 0):
                        startTime = row[0].split(' ')[1]
                    # Get end time
                    endTime = row[0].split(' ')[1]
                    # Select relevant acceleration data - comment out if plotting yost v theoretical
                    row = row[7:10]
                    # Set upper bound of 0.5g Az
                    if (float(row[1]) > 0.5 * g):
                        row[1] = str(0.5 * -g)
                    arrAz.append(float(row[1]) * -g)  # comment out if plotting yost v theoretical

                    # This is also used to compare yost with the true signal
                    # arrAz.append(float(row[-5])*-g )
                    # arrAx.append(float(row[-6])*-g )
                    # arrAy.append(float(row[-4])*-g )
                    # arrPitch.append(float(row[1]))
                    # arrRoll.append( float(row[3]) )

                # Calculate the sampling frequency
                startTime = startTime.split(':')
                endTime = endTime.split(':')
                totalTime = []
                totalTime.append(float(endTime[0]) - float(startTime[0]))
                totalTime.append(float(endTime[1]) - float(startTime[1]))
                totalTime.append(float(endTime[2]) - float(startTime[2]))
                totalTime = totalTime[0] * 60 * 60 + totalTime[1] * 60 + totalTime[2]

            # Else MPU6050 (imu = 1)
            # Data format: "int(timeSinceStart ms), float(accelAx mg), float(accelAy mg), float(accelAz g)"
            else:
                startTime = -1
                endTime = 0
                for line in f:
                    # Format is: int ms, float ax, float ay, float az
                    row = line.split(',')
                    if (startTime == -1):
                        startTime = float(row[0]) * 10**-3
                    endTime = float(row[0]) * 10**-3
                    # Set upper bound of 0.5g Az
                    if (float(row[3]) > 0.5 * g):
                        row[1] = str(0.5 * -g)
                    # arrAx.append(float(row[1])*-g/1000 )
                    # arrAy.append(float(row[2])*-g/1000 )
                    arrAz.append(float(row[3]) * -g)
                totalTime = endTime - startTime

        fs = len(arrAz) / (totalTime)  # Sampling frequency
        fs = round(fs)  # Account for errors

        ## Debugging and graphing
        # print("Sampling rate = " + str(fs))
        # trueVerticalAcceleration(arrAx, arrAy, arrAz, arrPitch, arrRoll, fs)
        ## EndDebug

        # Condition signal:
        azFiltered = cond.condition(arrAz, fs, plot)

        # Calculate wave height time series
        eta, times = heightTimeSeries(azFiltered, fs, plot, plot, plot)

        # Resample to allow for comparison between the imus (has to have same amount of samples)
        eta180, times = sig.resample(eta, 180, t=times)
        if (plot):
            plt.plot(times, eta180, label="Resampled heights")
            plt.legend(loc='lower right')
            plt.show()
        displacements.append(eta180)

        ht = significantWaveHeight(eta)
        hs = spectralSignificantWaveHeight(eta, fs)
        sigWaveHeights.append((ht, hs))

    # print(displacements)
    h = 0.045
    c = 0.155
    f = 0.1
    t = np.arange(0, 90, 0.5)
    s = h * np.sin(2 * np.pi * f * t)

    for j in range(0, 6):
        if (j % 2 == 0):
            print("YOST Significant Wave Height (Ht, Hs) for test " +
                  str(round(j * 2 / 5)) + ": Ht=" +
                  '{:6f}'.format(sigWaveHeights[j][0] * 1000) + "mm Hs=" +
                  '{:6f}'.format(sigWaveHeights[j][1] * 1000))
        else:
            print("MPU6050 Significant Wave Height (Ht, Hs) for test " +
                  str(round(j * 2 / 5)) + ": Ht=" +
                  '{:6f}'.format(sigWaveHeights[j][0] * 1000) + "mm Hs=" +
                  '{:6f}'.format(sigWaveHeights[j][1] * 1000))
    print("Theoretical Significant Wave Height: " +
          '{:6f}'.format(significantWaveHeight(s) * 1000) + "mm")

    for k in range(0, 6, 2):
        print("Pearson correlation coefficient between IMUs for test " +
              str(int(k / 2)) + " is: " +
              '{:6f}'.format(abs(corr(displacements[k], displacements[k + 1])[0])))
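# `significantWaveHeight` and `spectralSignificantWaveHeight` are not shown above.
# A minimal sketch using the common definitions (Hs approximated as 4 * std of the
# surface elevation, and Hm0 = 4 * sqrt(m0) from the elevation spectrum); the real
# helpers in this project may differ.
import numpy as np
from scipy import signal

def significantWaveHeight(eta):
    """Approximate Hs from the surface-elevation time series (Hs ~ 4 * std)."""
    return 4.0 * np.std(eta)

def spectralSignificantWaveHeight(eta, fs):
    """Hm0 = 4 * sqrt(m0), with m0 the zeroth moment of the elevation spectrum."""
    f, Pxx = signal.welch(eta, fs=fs)
    m0 = np.trapz(Pxx, f)
    return 4.0 * np.sqrt(m0)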
        if(self.dimensionality(n + 1) >= dimensionality):
            return(n + 1)

if __name__ == "__main__":
    from scipy.stats import pearsonr as corr
    import matplotlib.pyplot as plt

    # Test dataset of random value
    m, n = 128, 10
    weightMatrix = np.random.randn(m, n)
    pcaObj = PCA(matrix=weightMatrix)

    # Check that we get back the same matrix we put in when we set the number
    # of selected components equal to a number of possible values (up to n)
    for i in [n, 5, 1]:
        filteredMatrix = pcaObj.filterMatrix(n=i)
        pcaObjFiltered = PCA(matrix=filteredMatrix)
        isMatrixSame = np.allclose(weightMatrix, filteredMatrix)
        r = corr(weightMatrix.flatten(), filteredMatrix.flatten())
        print("\nIs PCA Filtered Matrix same (n={})?: {}".format(n, isMatrixSame))
        print("Original matrix: {}".format(weightMatrix.flatten()))
        print("Filtered matrix: {}".format(filteredMatrix.flatten()))
        print("Pearson's Correlation: r = {:4.3f}".format(r[0]))

    print('Target Dimensionality at 50%: {}'.format(pcaObj.computeTargetPCs(0.5)))

    (fig, ax) = plt.subplots(nrows=2, ncols=1)
    ax[0].imshow(weightMatrix.T)
    ax[0].set_title('Original dimensionality: {:4.2f}'.format(pcaObj.dimensionality()))
    ax[1].imshow(filteredMatrix.T)
    ax[1].set_title('Filtered dimensionality: {:4.2f}'.format(pcaObjFiltered.dimensionality()))
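# `PCA.filterMatrix` is not shown above; from the test, reconstructing with all n
# components should return the original matrix (hence the np.allclose check). A
# minimal sketch of that kind of truncated-PCA reconstruction; the function name
# and standalone form are assumptions.
import numpy as np

def filter_matrix(matrix, n):
    """Project `matrix` onto its first `n` principal components and reconstruct."""
    mean = matrix.mean(axis=0)
    centered = matrix - mean
    U, s, Vt = np.linalg.svd(centered, full_matrices=False)
    approx = (U[:, :n] * s[:n]) @ Vt[:n]  # rank-n reconstruction
    return approx + mean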