class Simple:
    """Straight-line model whose slope and intercept come from a Theil-Sen fit.

    The ``c``/``d`` related members are no-op placeholders in this class.
    """

    def __init__(self, a, b, c, d):
        # The four arguments are accepted but not stored; the line is
        # determined entirely by fitting in ``update_a_b``.
        self.model = TheilSenRegressor()

    def update_a_b(self, x, y):
        """Refit the line on samples ``x`` (reshaped to one column) and ``y``."""
        column = x.reshape(-1, 1)
        self.model.fit(column, y)

    def set_c_d(self, c, d):
        """Accept ``c`` and ``d`` and do nothing with them."""
        return None

    def get_y(self, x):
        """Return the fitted model's predictions for the samples ``x``."""
        column = x.reshape(-1, 1)
        return self.model.predict(column)

    def get_likelihood(self, x, y):
        """Return the summed absolute residual scaled by ``1 / len(x)``."""
        # The scale is computed first, as in the original expression order.
        scale = 1 / float(x.shape[0])
        absolute_residuals = np.abs(y - self.get_y(x))
        return scale * np.sum(absolute_residuals)

    def to_string(self):
        """Return a short text with the fitted ``coef_`` and ``intercept_``."""
        template = "a:{}, b:{}"
        return template.format(self.model.coef_, self.model.intercept_)

    def get_a_b(self):
        """Return the tuple ``(coef_, intercept_)`` of the fitted model."""
        fitted = self.model
        return fitted.coef_, fitted.intercept_

    @staticmethod
    def var_to_weight(v):
        """Return the constant weight 1, whatever ``v`` is."""
        return 1

    @staticmethod
    def get_c_d(x, r):
        """Return ``(None, None)``; nothing is derived from ``x`` or ``r``."""
        return (None, None)
def robust_cor(x, y):
    """Return a signed, robust correlation estimate from a Theil-Sen fit.

    The value is ``sign(first coefficient) * sqrt(adjusted R^2)`` of a
    ``TheilSenRegressor`` fitted with ``random_state=42``. It is 0 when the
    adjusted R^2 is not positive.

    Parameters
    ----------
    x : sequence
        Either a flat sequence of samples for one predictor, or a list of
        lists that is transposed before fitting (so each inner list is
        presumably the samples of one predictor - confirm with callers).
    y : sequence
        Target values, one per sample.

    Returns
    -------
    list
        ``[corr_val, coef_, intercept_, breakdown_]`` where the last three
        are attributes of the fitted regressor.
    """
    if isinstance(x[0], list):
        # Transpose the list of lists so that rows become samples.
        x = list(map(list, zip(*x)))
    else:
        x = np.array(x).reshape(-1, 1)
    X = np.array(x)
    Y = np.array(y)

    theil_regr = TheilSenRegressor(random_state=42)
    theil_regr.fit(X, Y)

    # Use the converted array consistently; the raw ``y`` may be a list.
    res = theil_regr.predict(X) - Y
    tot_dev = Y - np.mean(Y)
    SSres = np.dot(res, res)
    SStot = np.dot(tot_dev, tot_dev)

    n_samples, n_features = X.shape
    dof = n_samples - n_features - 1
    if SStot == 0 or dof <= 0:
        # Adjusted R^2 is undefined for a constant target or when there are
        # no residual degrees of freedom. Report "no correlation" instead of
        # dividing by zero or returning a magnitude above 1.
        corr_val = 0
    else:
        adjR2 = 1 - (SSres / SStot) * (n_samples - 1) / dof
        sgn = np.sign(theil_regr.coef_)[0]
        corr_val = sgn * np.sqrt(adjR2) if adjR2 > 0 else 0

    return [
        corr_val,
        theil_regr.coef_,
        theil_regr.intercept_,
        theil_regr.breakdown_,
    ]
def getscore_getnext(df, days_ahead, coin):
    """Fit a Theil-Sen model on ``df`` and predict 'close' ``days_ahead`` rows on.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'close' column and a 'date' column; every other
        column is used as a feature. NOTE: ``df`` is modified in place
        (missing values filled, a 'label' column added, unlabeled rows
        dropped).
    days_ahead : int
        Number of rows to shift 'close' by to build the target. Expected to
        be a positive integer; with 0 the feature matrix would be empty.
    coin : object
        Unused by the current implementation.

    Returns
    -------
    tuple
        ``(confidence, futureval)``: the model's ``score`` on a random 15%
        hold-out split, and its prediction for the last feature row.
    """
    forecast_col = 'close'

    # Fill gaps in the inputs with a sentinel *before* the label is created,
    # so the only NaNs left afterwards are the unlabeled trailing rows.
    df.fillna(value=-99999, inplace=True)
    df['label'] = df[forecast_col].shift(-days_ahead)

    # ``axis`` must be a keyword: pandas 2 no longer accepts it positionally.
    X = np.array(df.drop(['label', 'date'], axis=1))
    X = preprocessing.scale(X)
    futureX = X[-1:]          # most recent row, which has no label yet
    X = X[:-days_ahead]       # rows that do have a label

    df.dropna(inplace=True)
    y = np.array(df['label'])

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.15)

    clf = TheilSenRegressor()
    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    futureval = clf.predict(futureX)
    return (confidence, futureval)
class Regressor(BaseEstimator):
    """Wrapper around one hard-wired scikit-learn regressor, fed sparse input.

    The model is chosen by ``self.regressorName``, which is fixed in
    ``__init__``; edit that value to switch models.
    """

    def __init__(self):
        self.regressorName = "linear"
        if self.regressorName == "rf":
            self.clf = RandomForestRegressor(
                n_estimators=30, max_depth=63, max_features=50, n_jobs=-1)
        elif self.regressorName == "gb":
            # ``loss='ls'`` and ``presort='auto'`` used to be passed here.
            # Both only restated defaults and were removed from newer
            # scikit-learn releases, so they are omitted.
            self.clf = GradientBoostingRegressor(
                alpha=0.9, init=None, max_depth=3, learning_rate=0.2,
                max_features=None, min_samples_leaf=1, min_samples_split=2,
                min_weight_fraction_leaf=0.0, n_estimators=2500,
                random_state=None, subsample=1.0, verbose=0, warm_start=True)
        elif self.regressorName == "ridge":
            # ``normalize=False`` and ``store_cv_values=False`` restated
            # defaults that newer scikit-learn no longer accepts.
            self.clf = RidgeCV(alphas=(0.01, 0.1), fit_intercept=True,
                               scoring=None, cv=5, gcv_mode=None)
        elif self.regressorName == "linear":
            self.clf = LinearRegression()
        elif self.regressorName == "lasso":
            self.clf = LassoCV(cv=10)
        elif self.regressorName == "svr":
            self.clf = SVR(kernel='rbf', C=0.2, gamma=0.01)
        elif self.regressorName == "knn":
            self.clf = neighbors.KNeighborsRegressor(
                1, weights='distance', n_jobs=-1)
        elif self.regressorName == "gauss":
            # Despite the key, this selects a Theil-Sen regressor.
            self.clf = TheilSenRegressor()

    def fit(self, X, y):
        """Train the selected regressor on ``X`` (converted to CSC) and ``y``."""
        X = csc_matrix(X)
        # A parenthesised single string prints the same on Python 2 and 3;
        # the bare print statement is a SyntaxError on Python 3.
        print("Training Algorithm")
        self.clf.fit(X, y)

    def predict(self, X):
        """Return predictions for ``X`` (converted to CSR)."""
        X = csr_matrix(X)
        print("Testing Algorithm")
        return self.clf.predict(X)

    def getRegressor(self):
        """Return the underlying scikit-learn estimator."""
        return self.clf

    def getRegressorName(self):
        """Return the key of the selected regressor."""
        return self.regressorName

    def getParamGrid(self):
        """Return the grid-search parameter grid for the selected regressor.

        Only "rf" and "gb" define a grid. Every other regressor gets an
        empty dict; previously this raised ``UnboundLocalError``, including
        for the "linear" model this class is configured with.
        """
        if self.regressorName == "rf":
            return {'max_features': [None]}
        if self.regressorName == "gb":
            return {'alpha': [0.9]}
        return {}
class Regressor(BaseEstimator):
    """Wrapper around one hard-wired scikit-learn regressor.

    The model is chosen by ``self.regressorName``, which is fixed in
    ``__init__``; edit that value to switch models.
    """

    def __init__(self):
        self.regressorName = "gb"
        if self.regressorName == "rf":
            self.clf = RandomForestRegressor(
                n_estimators=400, max_depth=63, max_features=50, n_jobs=-1)
        elif self.regressorName == "gb":
            # ``loss='ls'`` and ``presort='auto'`` used to be passed here.
            # Both only restated defaults and were removed from newer
            # scikit-learn releases (where they raise), so they are omitted.
            self.clf = GradientBoostingRegressor(
                alpha=0.9, init=None, max_depth=3, learning_rate=0.2,
                max_features=None, min_samples_leaf=1, min_samples_split=2,
                min_weight_fraction_leaf=0.0, n_estimators=2500,
                random_state=None, subsample=1.0, verbose=0, warm_start=True)
        elif self.regressorName == "ridge":
            # ``normalize=False`` and ``store_cv_values=False`` restated
            # defaults that newer scikit-learn no longer accepts.
            self.clf = RidgeCV(alphas=(0.01, 0.1), fit_intercept=True,
                               scoring=None, cv=5, gcv_mode=None)
        elif self.regressorName == "linear":
            # LinearRegression takes neither ``alpha`` nor ``max_iter``;
            # passing them raised TypeError as soon as this branch ran.
            # NOTE(review): if a regularised model was intended here,
            # choose the estimator explicitly.
            self.clf = LinearRegression()
        elif self.regressorName == "lasso":
            self.clf = LassoCV(cv=10)
        elif self.regressorName == "svr":
            self.clf = SVR(kernel='rbf', C=0.2, gamma=0.01)
        elif self.regressorName == "knn":
            self.clf = neighbors.KNeighborsRegressor(
                1, weights='distance', n_jobs=-1)
        elif self.regressorName == "gauss":
            # Despite the key, this selects a Theil-Sen regressor.
            self.clf = TheilSenRegressor()

    def fit(self, X, y):
        """Train the selected regressor on ``X`` and ``y``."""
        self.clf.fit(X, y)

    def predict(self, X):
        """Return the selected regressor's predictions for ``X``."""
        return self.clf.predict(X)

    def getRegressor(self):
        """Return the underlying scikit-learn estimator."""
        return self.clf

    def getRegressorName(self):
        """Return the key of the selected regressor."""
        return self.regressorName

    def getParamGrid(self):
        """Return the grid-search parameter grid for the selected regressor.

        Only "rf" and "gb" define a grid. Every other regressor gets an
        empty dict; previously this raised ``UnboundLocalError``.
        """
        if self.regressorName == "rf":
            return {'max_features': [None]}
        if self.regressorName == "gb":
            return {'alpha': [0.9]}
        return {}
def _fit_robust_line(shifts):
    """Fit a robust straight line to ``shifts`` and return it.

    The values are regressed against their positions ``0..len(shifts)-1``
    with a Theil-Sen estimator. The result is the fitted line evaluated at
    those same positions.
    """
    from sklearn.linear_model import TheilSenRegressor

    positions = np.arange(len(shifts)).reshape(-1, 1)
    robust_model = TheilSenRegressor()  # robust regression
    robust_model.fit(positions, shifts)
    return robust_model.predict(positions)
class _TheilSenRegressorImpl: def __init__(self, **hyperparams): self._hyperparams = hyperparams self._wrapped_model = Op(**self._hyperparams) def fit(self, X, y=None): if y is not None: self._wrapped_model.fit(X, y) else: self._wrapped_model.fit(X) return self def predict(self, X): return self._wrapped_model.predict(X)
def theilsen_regress_predict(var):
    """Fit a Theil-Sen line to a series and return the fitted values.

    Input:-
    var: 1-D array of values; it is regressed against the positions
         0..len(var)-1.

    Output:
    1-D array with the fitted line evaluated at every position. These are
    the regression *predictions*, not the regression coefficient.
    """
    # The estimator expects a 1-D target. The previous column-vector shape
    # held the same values but was not the shape the estimator documents.
    y = np.asarray(var).ravel()
    X = np.arange(len(y)).reshape(-1, 1)

    regressor = TheilSenRegressor()
    regressor.fit(X, y)
    return regressor.predict(X)
def test_less_samples_than_features():
    """Check Theil-Sen on a problem with fewer samples than features."""
    rng = np.random.RandomState(0)
    shape = (10, 20)  # (n_samples, n_features)
    X = rng.normal(size=shape)
    y = rng.normal(size=shape[0])

    # Without an intercept, Theil-Sen must fall back to Least Squares.
    ts_plain = TheilSenRegressor(fit_intercept=False, random_state=0)
    ts_plain.fit(X, y)
    least_squares = LinearRegression(fit_intercept=False)
    least_squares.fit(X, y)
    assert_array_almost_equal(ts_plain.coef_, least_squares.coef_, 12)

    # With an intercept the solution differs from Least Squares because the
    # intercept is calculated differently, so compare the predictions with
    # the training targets instead.
    ts_with_intercept = TheilSenRegressor(fit_intercept=True, random_state=0)
    ts_with_intercept.fit(X, y)
    assert_array_almost_equal(ts_with_intercept.predict(X), y, 12)
def test_less_samples_than_features():
    """Check Theil-Sen on a problem with fewer samples than features."""
    rng = np.random.RandomState(0)
    shape = (10, 20)  # (n_samples, n_features)
    X = rng.normal(size=shape)
    y = rng.normal(size=shape[0])

    # Without an intercept, Theil-Sen must fall back to Least Squares.
    ts_plain = TheilSenRegressor(fit_intercept=False, random_state=0)
    ts_plain.fit(X, y)
    least_squares = LinearRegression(fit_intercept=False)
    least_squares.fit(X, y)
    assert_array_almost_equal(ts_plain.coef_, least_squares.coef_, 12)

    # With an intercept the solution differs from Least Squares because the
    # intercept is calculated differently, so compare the predictions with
    # the training targets instead.
    ts_with_intercept = TheilSenRegressor(fit_intercept=True, random_state=0)
    ts_with_intercept.fit(X, y)
    assert_array_almost_equal(ts_with_intercept.predict(X), y, 12)
class r07522507_TheilSenRegressor(regression):
    """Theil-Sen regression configured from ``self.param``."""

    def trainAlgo(self):
        """Build the regressor from ``self.param`` and fit it.

        Features come from ``self.inputData['X']`` and targets from
        ``self.outputData['Y']``.
        """
        hyperparameter_names = (
            'fit_intercept',
            'copy_X',
            'max_subpopulation',
            'n_subsamples',
            'max_iter',
            'tol',
            'random_state',
            'verbose',
        )
        settings = {name: self.param[name] for name in hyperparameter_names}
        self.model = TheilSenRegressor(**settings)
        self.model.fit(self.inputData['X'], self.outputData['Y'])

    def predictAlgo(self):
        """Store the model's predictions for the input in ``self.result['Y']``."""
        predictions = self.model.predict(self.inputData['X'])
        self.result['Y'] = predictions
def train_and_return_model_replicas(self, host, port, username, password, appType, appNames, folderNames):
    """Fit total CPU utilisation against request count and report the RMSE.

    The data comes from ``self.getAndCombineAllDbs``. A Theil-Sen model is
    trained on a 67/33 split (``random_state=42``) with 'requests' as the
    only feature and ``pod_util_cpu_avg * num_pods`` as the target.

    ``appType`` is accepted but not used here.

    Returns
    -------
    tuple
        ``(regr, rms)``: the fitted regressor and the root-mean-squared
        error on the held-out split.
    """
    df = self.getAndCombineAllDbs(host, port, username, password, appNames, folderNames)
    df['total_cpu_util'] = df['pod_util_cpu_avg'] * df['num_pods']
    # Not used by the model below; kept because the frame comes from another
    # method and may be inspected elsewhere.
    df['total_mem_util'] = df['pod_util_mem_avg'] * df['num_pods']

    df_X = df[['requests']].values
    # 1-D target: the estimator expects a vector, not a one-column matrix.
    df_Y = df['total_cpu_util'].values
    X_train, X_test, y_train, y_test = train_test_split(
        df_X, df_Y, test_size=0.33, random_state=42)

    # The earlier version also scored the model against unrelated synthetic
    # ``make_regression`` data and threw the result away; that dead work is
    # removed.
    regr = TheilSenRegressor(random_state=0).fit(X_train, y_train)

    y_pred = regr.predict(X_test)
    rms = sqrt(mean_squared_error(y_test, y_pred))
    print('RMs score: %.2f' % rms)
    return regr, rms
fontsize=18) plt.show() # - from sklearn.linear_model import TheilSenRegressor # + lr.fit(X, y) # Entreno RANSAC theil_model = TheilSenRegressor(random_state=42).fit(X, y) # Datos predichos para graficar después line_X = np.arange(X.min(), X.max())[:, np.newaxis] line_y = lr.predict(line_X) line_y_theil = theil_model.predict(line_X) lw = 2 fig = plt.figure(figsize=(12, 6), dpi=100) plt.scatter(X, y, marker='.') plt.plot(line_X, line_y, color='navy', linewidth=lw, label='Lineal') plt.plot(line_X, line_y_theil, color='green', linewidth=lw, label='Theil Sen') plt.plot(line_X, line_y_ransac, color='tomato', linewidth=lw, label='RANSAC') plt.xlabel('X', weight="bold", fontsize=16) plt.ylabel('Y', weight="bold", fontsize=16) plt.text( -1, 300, "y = {:.2f}x + {:.2f}".format(lr.coef_[0], lr.intercept_),
# plt.show() # Theil-Sen estimator: # General info: https://en.wikipedia.org/wiki/Theil%E2%80%93Sen_estimator # Good ONLY for LINEAR REGRESSION # Sci-kit learn implementation: http://scikit-learn.org/stable/auto_examples/linear_model/plot_theilsen.html # Init the Theil-Sen estimator instance theil = TheilSenRegressor() # Fit with the Theil-Sen estimator theil.fit(x, line_data) # Get the fitted data result line_theil = theil.predict(x) # Plot Theil-Sen results plt.plot(x, line_theil, color='red', label='Theil-Sen') plt.legend(loc='lower right') plt.show() plt.clf() ################################### # Minimization - e.g. how to find a minimum of a function? def f1(x):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import TheilSenRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import classification_report, confusion_matrix

# Load the training and test sets; rows with missing values are dropped.
train = pd.read_csv("C:/Users/HP/Desktop/train (1).csv").dropna()
test = pd.read_csv("C:/Users/HP/Desktop/test (2).csv").dropna()

# Features are every column except the last; the target is the column at
# position 1.
# NOTE(review): these coincide only for a two-column file. With more columns
# the target would also appear among the features - confirm the CSV layout.
X_train = train.iloc[:, :-1].values
y_train = train.iloc[:, 1].values
X_test = test.iloc[:, :-1].values
y_test = test.iloc[:, 1].values

# TheilSen Regressor
model = TheilSenRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# For a regressor, ``score`` is the coefficient of determination (R^2),
# not a classification accuracy.
r2 = model.score(X_test, y_test)

# Fitted values over the training inputs.
plt.plot(X_train, model.predict(X_train), color='b')
plt.show()

# Printed once; the earlier script printed the same value twice.
print(r2)
def computeLR(data: pd.DataFrame, dimensions, record_id):
    """Flag which rows lie close to an iteratively refitted Theil-Sen line.

    The last column of ``data`` is the response and the preceding column(s)
    the predictor. Up to ten rounds are run: fit on the current inliers,
    compute every row's absolute residual, and keep as inliers the rows
    whose residual is at most five times the median inlier residual. The
    loop stops early once the inlier count stops changing.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. It is copied, so the caller's frame is not modified.
    dimensions, record_id
        Passed through unchanged to the returned ``LR`` record.

    Returns
    -------
    list
        A single ``LR`` whose ``output`` is a comma-separated 0/1 flag per
        row and whose ``info`` is a JSON string with the final threshold
        base (the median residual), coefficients and intercept.
    """
    # reg = LinearRegression()
    reg = TheilSenRegressor(random_state=1, max_subpopulation=50)

    values = data.values
    numDims = np.size(values, 1)
    X = values[:, 0:numDims - 1]
    Y = values[:, numDims - 1].reshape(-1, 1)

    ndf = data.copy(deep=True)
    ndf.reset_index(drop=True, inplace=True)
    ndf["X"] = X
    ndf["Y"] = Y
    ndf["Filter"] = True

    prev_length = 0
    m = 0
    for _ in range(10):
        curr_idx = ndf.index[ndf.loc[:, "Filter"]]  # type: ignore
        curr = ndf.iloc[curr_idx, :]
        # Stop when the inlier set is stable. Also stop when it is empty
        # (e.g. residuals were NaN), since fitting on no rows would raise.
        if curr.shape[0] == prev_length or curr.shape[0] == 0:
            break
        prev_length = curr.shape[0]

        x, y = curr["X"].values.reshape(-1, 1), curr["Y"].values
        reg.fit(x, y)

        residuals = abs(reg.predict(X) - ndf["Y"].values)
        inlier_residuals = abs(reg.predict(x) - y)
        m = np.median(inlier_residuals)

        # Inclusive bound: when the median residual is exactly 0, a strict
        # ``<`` rejects every row, including those on the line, and the next
        # round would then fit on an empty set and fail.
        ndf["Filter"] = residuals <= (5 * m)

    within = ndf["Filter"].astype(int)  # type: ignore
    coeffs = reg.coef_.tolist()
    intercept = reg.intercept_
    threshold = m  # type: ignore

    return [
        LR(
            dimensions=dimensions,
            output=",".join(map(str, within.tolist())),
            info=json.dumps({
                "threshold": threshold,
                "coeff": coeffs,
                "intercept": intercept,
                "type": "within",
            }),
            record_id=record_id,
        )
    ]
df=df[df["Fluid"]!="Oli"] df["frequency"]=df["frequency"]/60 df["frequency"]=df["frequency"].astype(float) #df=df[df["Serie"]!="B1.1"] df=df[df["Serie"]!="A1.1"] #df=df[df["Serie"]!="B1.2"] ###Power-Frequency correlation #Theil-Sen ts=TheilSenRegressor(fit_intercept=True) ts.fit(X=df[["frequency"]],y=df["power"]) df["ts-estimated"]=ts.predict(df[["frequency"]]) #Least-Squares lsq=LinearRegression() lsq.fit(X=df[["frequency"]],y=df["power"]) df["lsq-estimated"]=lsq.predict(df[["frequency"]]) print('Least Squares: P={}·n +{}, Rsq{}'.format(lsq.coef_, lsq.intercept_, lsq.score(X=df[["frequency"]],y=df["power"]))) print(mean_squared_error(df["power"],df["lsq-estimated"])) #Get confidence conf_max=[] conf_min=[] frequencydummy=[] for freq in df["frequency"].unique(): if freq != 1150/60 and freq != 1250/60: serie=df[df["frequency"]==freq]["power"]
def DumpTimestamps(video, vid_boundary_frames, output_fig_path, output_csv_path, input_vid_fname_stem, debug=True):
    """Decode per-frame hex timestamps, fit a line to them, and dump results.

    For every frame the first 14 values of ``video[frame, 0]`` are
    hex-encoded into a string, which is sliced into frame index, date, hour,
    minute, seconds and fractional seconds. A Theil-Sen line of time against
    frame number gives an outlier-resistant 'adj_time'. A figure and a CSV
    are written to disk.

    Parameters
    ----------
    video : array-like
        Indexed as ``video[:, 0, :14]``; presumably (frame, row, column)
        pixel data - confirm with the caller.
    vid_boundary_frames : iterable of int
        Frame numbers at which a new video part starts. They label the
        'video' column and draw dashed lines in the figure.
    output_fig_path, output_csv_path
        Destinations of the figure and the CSV (converted with ``str``).
    input_vid_fname_stem : str
        Used in the figure title.
    debug : bool
        Currently unused.

    Returns
    -------
    float
        Fitted time of the last frame minus fitted time of the first frame.
    """
    import pandas as pd
    import numpy as np

    # Convert pixel integers to two-digit hexadecimal strings (hex(1) is
    # '0x1': crop the '0x' and left-pad with a zero) and join them into one
    # long string per frame.
    all_timestamps = [
        "".join([hex(_)[2:].zfill(2) for _ in pixels])
        for pixels in video[:, 0, :14]
    ]
    df = pd.DataFrame(all_timestamps, columns=['raw'])

    def FormatTimestamp(s):
        """Insert separators into the raw digit string to make it readable."""
        tstr = list(s)
        tstr.insert(8, ' ')
        tstr.insert(13, '-')
        tstr.insert(16, '-')
        tstr.insert(19, ' ')
        tstr.insert(22, ':')
        tstr.insert(25, ':')
        tstr.insert(28, '.')
        return "".join(tstr)

    # Assign each frame in the concatenated video the video part it came from
    df['video'] = 1
    for i, frame_i in enumerate(vid_boundary_frames, start=2):
        rows = list(range(frame_i, len(video)))
        df.loc[rows, 'video'] = i

    # Parse timestamp
    df['timestamp'] = df['raw'].apply(FormatTimestamp)
    df['frame_index'] = df['raw'].str[:8]
    df['date'] = df['raw'].str[8:16]
    df['hour'] = df['raw'].str[16:18]
    df['min'] = df['raw'].str[18:20]
    df['rawsec'] = df['raw'].str[20:22]
    # Unparseable seconds become NaN and take the previous frame's value.
    # ``.ffill()`` replaces the deprecated ``fillna(method='ffill')``.
    df['sec'] = pd.to_numeric(df['rawsec'], errors='coerce').ffill().astype(int)

    # Adjust for seconds rolling over to the next minute: any frame whose
    # seconds value is below the FIRST frame's gets 60 added.
    # NOTE(review): the comparison is always against the first frame and adds
    # 60 at most once, so this looks correct only for recordings shorter
    # than a minute. Behaviour kept as is - confirm the intent.
    sec = df['sec'].values
    sec_copy = sec.copy()
    sec_copy[1:] = np.where(sec[1:] < sec[0], sec[1:] + 60, sec[1:])
    df['sec'] = sec_copy
    df['sec'] = df['sec'].astype(str)

    df['sec_fraction'] = df['raw'].str[22:].str.extract(r'^(\d+)')
    # Unparseable times take the next valid value.
    df['raw_time'] = pd.to_numeric(
        df['sec'] + '.' + df['sec_fraction'], errors='coerce').bfill()

    # Robust line fit of time against frame number.
    from sklearn.linear_model import TheilSenRegressor
    model = TheilSenRegressor()
    X = np.array(df.index).reshape(-1, 1)
    Y = df['raw_time'].values
    model.fit(X, Y)
    print("Fitting timestamps with slope and intercept.")
    # ``coef_`` is a one-element array; index it rather than calling
    # ``float`` on the array, which newer NumPy deprecates.
    print(
        f"Timestamp estimated coefficients: intercept={float(model.intercept_):0.2f}, slope={float(model.coef_[0])*1000:0.3f}ms/frame"
    )
    Y_pred = model.predict(X)
    df['adj_time'] = Y_pred

    df['delta t (ms)'] = (df['raw_time'] - df['raw_time'].shift()).fillna(0) * 1000
    df['delta t %-ile'] = df['delta t (ms)'].rank(pct=True)

    import matplotlib.pyplot as plt
    fig, ax1 = plt.subplots(dpi=300)
    df.plot(y=['raw_time', 'adj_time'], ax=ax1)
    ax1.set_ylabel("time (s)")
    ax1.set_xlabel("frame index")
    ax1.set_title(f'"{input_vid_fname_stem}" raw and adjusted timestamps')
    for x in vid_boundary_frames:
        ax1.axvline(x, linestyle='dashed', color='black')
    fig.savefig(str(output_fig_path))
    plt.close(fig)  # Explicitly close to free memory and avoid warning

    df.to_csv(str(output_csv_path))
    print(
        f"Wrote \"{str( output_fig_path )}\" and \"{str( output_csv_path ) }\" to disk"
    )
    return (df.loc[len(video) - 1, 'adj_time'] - df.loc[0, 'adj_time'])
def fix_ecg_peaks(ecg, plt=None):
    """Refine detected ECG beat positions by intersecting the two peak flanks.

    Works on a copy of ``ecg``. Each beat in ``ecg.ibeats`` is first moved
    to the local extreme within ``slopesize`` samples; the signal is
    inverted when the negative peaks are the larger ones. A Theil-Sen line
    is then fitted to the rising and the falling flank, and the beat time
    becomes the intersection of the two lines. If a flank is missing, the
    intersection is far from the maximum, or a flank is nearly flat, the
    plain local maximum is used instead.

    Parameters
    ----------
    ecg : object
        Must provide ``copy()``, ``fps``, ``x`` (signal), ``t`` (times) and
        ``ibeats`` (beat sample indices), as used below.
    plt : object or None
        Optional plotting target with ``plot``/``scatter``. When given,
        diagnostic plots are drawn.

    Returns
    -------
    object
        The copied ``ecg`` with ``tbeats`` (fixed times) and ``ibeats``
        (fixed integer indices) replaced.
    """
    ecg = ecg.copy()
    slopesize = int(ecg.fps / 45.0)

    def local_extreme(center, pick):
        """Index of the extreme chosen by ``pick`` within ``slopesize`` of ``center``."""
        lo = max(center - slopesize, 0)
        hi = min(center + slopesize, len(ecg.x) - 1)
        return lo + pick(ecg.x[lo:hi])

    # climb to maxima, and invert if necessary
    ecgidx = [local_extreme(i, np.argmax) for i in ecg.ibeats]
    # average detected beat amplitude
    beatheight = np.mean(ecg.x[ecgidx]) - np.mean(ecg.x)
    negecgidx = [local_extreme(i, np.argmin) for i in ecg.ibeats]
    # average detected beat amplitude in the other direction
    negbeatheight = np.mean(ecg.x[negecgidx]) - np.mean(ecg.x)
    if np.abs(negbeatheight) > np.abs(beatheight):
        # the other direction has "higher" peaks: invert signal
        ecg.x *= -1
        ecgidx = negecgidx

    if plt is not None:
        plt.plot(ecg.t, ecg.x)
        plt.scatter(ecg.t[ecg.ibeats], ecg.x[ecg.ibeats], 30, 'y')

    # Integer half-window. On Python 3 plain ``/`` yields a float, which
    # cannot be used as a slice bound or array index below.
    window = slopesize // 2
    fixed_indices, fixed_times = [], []

    # loop through and linearly interpolate peak flanks
    for i in ecgidx:
        up_start = i
        # make sure start is in trough, not still on peak / plateau
        while ecg.x[up_start] >= ecg.x[i] and up_start > i - slopesize:
            up_start -= 1
        up_start -= slopesize
        # climb past noise (need to go up)
        while ecg.x[up_start + 1] <= ecg.x[up_start] and up_start < i - 1:
            up_start += 1
        up_end = i + 2
        # climb past noise (need to go up)
        while ecg.x[up_end - 1] >= ecg.x[up_end] and up_end > i + 1:
            up_end -= 1
        upidx = np.arange(up_start, up_end)  # indices of upslope

        down_start = i
        down_end = i
        # make sure end is in trough, not still on peak / plateau
        while ecg.x[down_end] >= ecg.x[i] and down_end < i + slopesize:
            down_end += 1
        down_end += slopesize
        # climb past noise (need to go down)
        # NOTE(review): parentheses spell out Python's existing precedence
        # (``a or (b and c)``). The ``down_start < down_end`` bound therefore
        # does not guard the first condition - confirm this is intended.
        while (ecg.x[down_start + 1] >= ecg.x[down_start]
               or (ecg.x[down_start + 2] >= ecg.x[down_start]
                   and down_start < down_end)):
            down_start += 1
        # climb past noise (need to go down)
        while ecg.x[down_end - 1] <= ecg.x[down_end] and down_end > down_start:
            down_end -= 1
        downidx = np.arange(down_start, down_end)  # indices of downslope

        if len(ecg.t[upidx]) <= 1 or len(ecg.t[downidx]) <= 1:
            # one or both flanks missing. just use max
            reali = i
            bestt = ecg.t[i]
        else:
            # interpolate flanks
            model1 = TheilSenRegressor().fit(ecg.t[upidx].reshape(-1, 1),
                                             ecg.x[upidx])
            model2 = TheilSenRegressor().fit(ecg.t[downidx].reshape(-1, 1),
                                             ecg.x[downidx])
            k1, d1 = model1.coef_[0], model1.intercept_
            k2, d2 = model2.coef_[0], model2.intercept_
            angle1, angle2 = np.arctan(k1), np.arctan(k2)

            # obtain intersection point (noise robust peak)
            bestt = (d2 - d1) / (k1 - k2)
            if (np.abs(bestt - ecg.t[i]) > slopesize
                    or np.abs(angle1) < 0.1 or np.abs(angle2) < 0.1):
                # calculated intersection point is very far from max -
                # something went wrong - reset
                print(
                    "fix_ecg_peaks WARNING: fixed beat is very far from actual maximum, or slopes suspiciously unsteep. Taking actual maximum to be safe"
                )
                i = local_extreme(i, np.argmax)
                if plt is not None:
                    # Only for the diagnostic plot: the sample nearest to
                    # the rejected intersection time.
                    reali = i - window + np.argmin(
                        np.abs(ecg.t[(i - window):(i + window)] - bestt))
                    plt.scatter(bestt, ecg.x[reali], 200, 'y')
                    plt.scatter(ecg.t[i], ecg.x[i], 200, 'g')
                    plt.plot([bestt, ecg.t[i]], [ecg.x[reali], ecg.x[i]],
                             'r', linewidth=2)
                reali = i
                bestt = ecg.t[i]
            else:
                reali = i - window + np.argmin(
                    np.abs(ecg.t[(i - window):(i + window)] - bestt))

        # store fixed times and indices
        fixed_indices.append(reali)
        fixed_times.append(bestt)

        if plt is not None:
            # plot
            plt.plot(ecg.t[upidx], ecg.x[upidx], 'g')
            plt.plot(ecg.t[downidx], ecg.x[downidx], 'm')
            if len(upidx) > 1 and len(downidx) > 1:
                plt.plot(ecg.t[upidx],
                         model1.predict(ecg.t[upidx].reshape(-1, 1)), '--k')
                plt.plot(ecg.t[downidx],
                         model2.predict(ecg.t[downidx].reshape(-1, 1)), '--y')
            plt.scatter(ecg.t[reali], ecg.x[reali], 60, 'r')
            plt.scatter(bestt, ecg.x[reali], 90, 'k')

    ecg.tbeats = np.ravel(fixed_times)
    ecg.ibeats = np.ravel(fixed_indices).astype(int)
    return ecg
# Training matrix from the vectorised training records; the target is the
# CLOSE column cast to int (which truncates any fractional part).
X = vec.fit_transform(x_train).toarray()
Y = np.asarray(train.CLOSE)
Y = Y.astype('int')

# Pre-processing of the test data: same six columns, turned into one dict per row.
X_test = test[['HIGH', 'LOW', 'OPEN', 'TOTTRDQTY', 'TOTTRDVAL', 'TOTALTRADES']]
x_test = X_test.to_dict(orient='records')
# NOTE(review): a fresh DictVectorizer is fitted on the test records instead
# of reusing the one fitted on the training records. The column layout only
# matches if both sets have exactly the same keys - confirm.
vec = DictVectorizer()
x = vec.fit_transform(x_test).toarray()
y = np.asarray(test.CLOSE)
y = y.astype('int')

# Model: Theil-Sen regressor fitted on the training matrix.
clf = TheilSenRegressor()
clf.fit(X, Y)
# For scikit-learn regressors ``score`` is the R^2 coefficient of
# determination, although the message calls it an accuracy.
print("Accuracy of this Statistical Arbitrage model is: ", clf.score(x, y))
predict = clf.predict(x)
test['predict'] = predict

# Plotting: index both frames by date, then draw the train close, the test
# close and the predicted close.
train.index = train.Date
test.index = test.Date
train['CLOSE'].plot()
test['CLOSE'].plot()
test['predict'].plot()
plt.legend(loc='best')
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()
def fit_TheilSen(features_train, labels_train, features_pred):
    """Fit a Theil-Sen regressor and return its predictions.

    The model is trained on ``features_train``/``labels_train``. Its R^2 on
    that same training data is printed, and the predictions for
    ``features_pred`` are returned.
    """
    model = TheilSenRegressor()
    model.fit(features_train, labels_train)
    labels_pred = model.predict(features_pred)

    r2 = model.score(features_train, labels_train)
    # One pre-formatted string prints the same on Python 2 and 3; the bare
    # print statement is a SyntaxError on Python 3. The two spaces after the
    # colon reproduce the separator the print statement inserted.
    print("TheilSen - coefficient of determination R^2 of the prediction:  %s" % (r2,))
    return labels_pred
# author: David Ruddell
# contact: [email protected], [email protected]
import pandas as pd
from sklearn import svm
from sklearn.linear_model import TheilSenRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score

hyper_data = pd.read_csv('../Data/headers3mgperml.csv', sep=',')

# Features are columns 16 onward; columns 5 and 6 are two separate targets.
X = hyper_data.values[:, 16:]
y1 = hyper_data.values[:, 5]
y2 = hyper_data.values[:, 6]

# First target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y1, random_state=100, test_size=0.3)
clf = TheilSenRegressor()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Theil-Sen predicts continuous values. ``accuracy_score`` is a
# classification metric and is not meaningful on them, so report R^2.
print(r2_score(y_test, y_pred))

# Second target, evaluated on its own split (previously ``X_test`` from the
# first split was used here).
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X, y2, random_state=100, test_size=0.3)
clf2 = TheilSenRegressor()
clf2.fit(X_train2, y_train2)
y_pred2 = clf2.predict(X_test2)
print(r2_score(y_test2, y_pred2))
def main():
    """Back-test one-step-ahead quantity forecasts for product 38.

    Sales before 2018-03-01 are aggregated per (mes, cuatrimestre, anho)
    into mean/max/min price and summed quantity. A sliding window of 17
    periods is moved over the series. In each window the first 16 periods
    are used to pick, by cross-validated mean absolute error, the best of
    six regressors. That regressor is refitted and used to predict the 17th
    period, and the relative error is recorded. The errors and winning
    model names are printed and the errors are plotted.
    """
    df = getTableVidrieria()
    fechas = pd.to_datetime(df['fecha'], format='%Y-%m-%d')
    filtrado = df[(df['idproducto'] == 38) & (fechas < '2018-03-01')]

    # Named aggregation gives flat, already renamed columns. The nested-dict
    # renaming form used before was removed in pandas 1.0.
    agrupado = (
        filtrado.groupby(['mes', 'cuatrimestre', 'anho'])
        .agg(
            precioproducto_mean=('precioproducto', 'mean'),
            precioproducto_max=('precioproducto', 'max'),
            precioproducto_min=('precioproducto', 'min'),
            cantidad_sum=('cantidad', 'sum'),
        )
        .reset_index()
        .sort_values(by=['anho', 'mes'])
    )

    x = pd.DataFrame(agrupado, columns=['precioproducto_mean',
                                        'precioproducto_min',
                                        'precioproducto_max'])
    y = pd.DataFrame(agrupado, columns=['cantidad_sum'])

    # Candidate models as (display name, factory). Factories give every
    # cross-validation run and the final fit a fresh estimator.
    candidates = [
        ('Linear Regression', lambda: LinearRegression()),
        ('Theil-Sen Regression', lambda: TheilSenRegressor()),
        ('Gradient Boosting Regression',
         lambda: GradientBoostingRegressor(n_estimators=N_ESTIMATORS)),
        ('Extra Trees Regression',
         lambda: ExtraTreesRegressor(n_estimators=N_ESTIMATORS)),
        ('Ada Boost Regression',
         lambda: AdaBoostRegressor(n_estimators=N_ESTIMATORS)),
        ('Bagging Regression',
         lambda: BaggingRegressor(n_estimators=N_ESTIMATORS)),
    ]

    aList = []
    nameList = []
    # Window length. The original author's notes also mention windows of 22
    # and 15.
    ventana = 17
    # Number of window positions; 23 is hard-coded here.
    # NOTE(review): 23 is presumably the number of aggregated periods -
    # confirm before changing the date filter.
    hasta = 23 - ventana + 1
    CV = ventana - 1

    for i in range(0, hasta):
        x_new = x[i:(i + ventana)]
        y_new = y[i:(i + ventana)]
        x_train, x_test = x_new[:CV], x_new[CV:]
        y_train, y_test = y_new[:CV], y_new[CV:]
        # 1-D target for every estimator. The final fits used to receive a
        # one-column frame and their results needed two indexing styles.
        target = y_train.values.ravel()

        scores = [
            np.mean(cross_val_score(make(), x_train, target, cv=CV,
                                    scoring='neg_mean_absolute_error'))
            for _, make in candidates
        ]
        # Scores are negated errors, so the maximum is the best model.
        best_name, make_best = candidates[scores.index(max(scores))]

        model = make_best().fit(x_train, target)
        actual = y_test.iloc[0]['cantidad_sum']
        predicted = float(model.predict(x_test)[0])

        nameList.append(best_name)
        aList.append(np.absolute(actual - predicted) / actual)

    print(aList)
    print(nameList)
    print(np.var(aList))

    fig, ax = plt.subplots()
    data_line = ax.plot(aList, label='% Error', marker='o')
    mean_line = ax.plot([np.mean(aList)] * len(aList), label='Media',
                        linestyle='--')
    legend = ax.legend(loc='upper right')
    plt.show()
# Lasso Lars lassolars_reg = LassoLars() lassolars_reg.fit(X_train, Y_train) Y_pred = lassolars_reg.predict(X_test) lassolars_r2 = r2_score(Y_expected, Y_pred) lassolars_mse = mean_squared_error(Y_expected, Y_pred) print("Lasso Lars Regression\n", "R2: ", lassolars_r2, "MSE:", lassolars_mse) plot_prediction("Lasso Lars Regression", Y_pred, test['close']) # Theil Sen Regressor theil_reg = TheilSenRegressor() theil_reg.fit(X_train, Y_train) Y_pred = theil_reg.predict(X_test) theil_r2 = r2_score(Y_expected, Y_pred) theil_mse = mean_squared_error(Y_expected, Y_pred) print("Theil Sen Regression\n", "R2: ", theil_r2, "MSE:", theil_mse) plot_prediction("Theil Sen Regression", Y_pred, test['close']) # Bayesian Ridge bayesian_reg = BayesianRidge() bayesian_reg.fit(X_train, Y_train) Y_pred = bayesian_reg.predict(X_test) bayesian_r2 = r2_score(Y_expected, Y_pred) bayesian_mse = mean_squared_error(Y_expected, Y_pred) print("Bayesian Ridge Regression\n", "R2: ", bayesian_r2, "MSE:", bayesian_mse) plot_prediction("Bayesian Ridge Regression", Y_pred, test['close'])
# For every CSV in ``documents``: fit a Theil-Sen line of 'TimeOffset'
# against 'Time', draw records and fit in their own subplot (a 3-row grid),
# and print the prediction for the time returned by
# ``time_shift.get_time_offset()``.
# NOTE(review): the code uses both ``numpy`` and ``np``; both names must be
# imported by the surrounding file.
for document in range(0, len(documents)):
    plt.subplot(3, 1, document + 1)
    estimators = TheilSenRegressor()
    # Only the first 5001 rows of each file are used.
    result = pd.read_csv(documents[document]).iloc[:5001]
    timestamp = numpy.array(result['Time']).reshape(-1, 1)
    time_offset = result['TimeOffset']
    # Hour:minute tick labels; assumes 'Time' holds POSIX timestamps - TODO confirm.
    time_result = [
        datetime.datetime.fromtimestamp(each).strftime('%H:%M')
        for each in list(result['Time'])
    ]
    estimators.fit(timestamp, time_offset)
    # One x tick every 1000 records, labelled with that record's time.
    plt.xticks(
        range(0, len(time_result), 1000),
        [time_result[each] for each in range(0, len(time_result), 1000)])
    # y ticks every 0.25 between the floored minimum and floored maximum + 1.
    plt.yticks(
        np.arange(min(time_offset) // 1.0,
                  max(time_offset) // 1.0 + 1, 0.25))
    plt.plot(time_offset, label='NTP Records')
    plt.plot(estimators.predict(timestamp), label='Regression Result')
    time_predicted = time_shift.get_time_offset()
    # ``result`` is rebound here from the loaded table to the model's prediction.
    result = estimators.predict(numpy.array(time_predicted[0]).reshape(-1, 1))
    print('Prediction(Predicted, NTPlib):', result, time_predicted[1])
    # Slope scaled by the number of seconds in a day.
    print('Daily time shifting', estimators.coef_[0] * 60 * 60 * 24)
    plt.legend()
plt.show()
def quantify_beat(self, beatnumber):
    """Describe the downslope around one beat and flag clearly bad fits.

    The signal maximum before the beat and the minimum after it are located
    within a window derived from the mean inter-beat interval. A Theil-Sen
    line is fitted to the samples between them, and the beat is described
    relative to that line.

    Parameters:
        beatnumber: index into ``self.ibeats`` of the beat to examine.

    Returns:
        A 5-tuple ``(ok_slope_length, ok_slope_angle,
        beat_downslope_orthogonal_distance, beat_downslope_peak_distance,
        iscrap)``. ``iscrap`` is True when ``|r2|`` is below
        ``BeatQuality.MINIMUM_R2`` or the fraction of samples close to the
        fitted line is below ``BeatQuality.MINIMUM_LINEARITY``.
    """
    beatindex = self.ibeats[beatnumber]

    # approx expected ibi
    meanibi = np.mean(np.diff(self.tbeats))

    # downslope is less than half of full beat. look for peaks on either side
    downslopewindow = int((meanibi / 2.5) * self.fps)

    # Clamp the window start at 0: a negative start would make the slice wrap
    # around to the end of the signal for beats close to the beginning.
    windowstart = max(0, beatindex - downslopewindow)
    pre_beat = self.x[windowstart:beatindex]

    # pick preceding maximum
    try:
        maxindex = np.where(heartbeat_localmax(pre_beat))[0][-1]
    except Exception:
        # Best-effort fallback (e.g. IndexError when no local maximum was
        # found): use the global maximum of the window. Exception instead of
        # a bare except so KeyboardInterrupt/SystemExit still propagate.
        maxindex = np.argmax(pre_beat)
    peaki = windowstart + maxindex

    # double check we didn't go beyond prev. beat
    if beatnumber > 0 and peaki <= self.ibeats[beatnumber - 1]:
        searchstart = self.ibeats[beatnumber - 1] + downslopewindow
        peaki = searchstart + np.argmax(self.x[searchstart:beatindex])

    # pick succeeding minimum
    troughi = beatindex + np.argmin(
        self.x[beatindex:(beatindex + downslopewindow)])

    # double check we didn't go beyond next beat
    if (beatnumber < len(self.ibeats) - 1
            and troughi >= self.ibeats[beatnumber + 1]):
        troughi = beatindex + np.argmin(
            self.x[beatindex:(self.ibeats[beatnumber + 1] - 1)])

    # robust regression on downslope
    t_slope = self.t[peaki:troughi]
    x_slope = self.x[peaki:troughi]
    t_column = t_slope.reshape(-1, 1)
    downslopemodel = TheilSenRegressor().fit(t_column, x_slope)
    r2 = downslopemodel.score(t_column, x_slope)

    # count which points are close enough to prediction
    amplitude = self.x[peaki] - self.x[troughi]
    m, k = downslopemodel.coef_[0], downslopemodel.intercept_
    line_norm = np.sqrt(1 + m * m)
    point_to_line_distances = np.abs(k + m * t_slope - x_slope) / line_norm
    point_to_line_distance_percentages = (
        100.0 / amplitude * point_to_line_distances)
    ok_points = np.where(point_to_line_distance_percentages <
                         BeatQuality.ACCEPTED_DEVIATION_PERCENTAGE)[0]
    fraction_acceptable = 1.0 / (troughi - peaki) * len(ok_points)

    # numerically characterize non-crap portion of the slope
    # (the horizontal extent is the sample-index difference troughi - peaki)
    ok_slope_length = fraction_acceptable * np.sqrt(
        (troughi - peaki)**2 + (self.x[peaki] - self.x[troughi])**2)
    ok_slope_angle = np.arctan(m)

    # numerically characterize beat placement
    if ok_slope_length == 0:
        beat_downslope_orthogonal_distance = 0
        beat_downslope_peak_distance = 0
    else:
        beat_downslope_orthogonal_distance = 1.0 / ok_slope_length * (
            np.abs(k + m * self.t[beatindex] - self.x[beatindex]) / line_norm)
        beat_downslope_peak_distance = 1.0 / ok_slope_length * np.sqrt(
            (beatindex - peaki)**2 +
            (self.x[peaki] - self.x[beatindex])**2)

    # check if certain to be bad fit
    iscrap = False
    if (np.abs(r2) < BeatQuality.MINIMUM_R2
            or fraction_acceptable < BeatQuality.MINIMUM_LINEARITY):
        # Single pre-formatted string: prints the same text on Python 2 and 3.
        print("crap!  {} {} {}".format(beatnumber, r2, fraction_acceptable))
        iscrap = True

    return (ok_slope_length, ok_slope_angle,
            beat_downslope_orthogonal_distance, beat_downslope_peak_distance,
            iscrap)
xValues['medHighIncome'] = incomeDF['50to75k']
# xValues['highIncome'] = incomeDF['above75k'] # overspecified
xValues['P_married'] = marriageDF['marriedPercent']
xValues['P_noCar'] = carDF['TotalNoVehicle']
xValues['P_1Car'] = carDF['Total1Vehicle']
# xValues['P_2+Car'] = carDF['Total2orMoreVehicle'] # overspecified
xValues['P_homeOwner'] = homeOwnerDF['OWN']

# make sure all feature vectors are the same length
# for col in xValues.columns:
#     print(xValues[col].shape)
xValues.head(1)

# %%
# predict values with every fitted model
linPredictions = linModel.predict(xValues)
tsPredictions = tsModel.predict(xValues)
hrPredictions = hrModel.predict(xValues)
# bardPredictions = bardModel.predict(xValues)
brPredictions = brModel.predict(xValues)
# enPredictions = enModel.predict(xValues)
ridgePredictions = ridgeModel.predict(xValues)
# logPredictions = logModel.predict(xValues)

print('Features:')
print(xTrain.columns.values, '\n')

# Report coefficients and intercept of each model, in the original order.
for _label, _model in (
        ("Linear", linModel),
        ("TS", tsModel),
        ("HR", hrModel),
        # ("BARD", bardModel),
        ("BR", brModel),
        # ("EN", enModel),
        ("Ridge", ridgeModel),
):
    print(_label + " coefficients:", '\n', _model.coef_, '...Intercept:',
          _model.intercept_, '\n')
# 5.1.5.1 RANSAC regression
ransac = RANSACRegressor()
# train the algorithm on training data and predict using the testing data
pred_ransac = ransac.fit(X_train, y_train).predict(X_test)
y_predransac = ransac.predict(X_test)
# RANSACRegressor exposes no coef_/intercept_ of its own; the final model
# fitted on the inliers is available as estimator_.
# Each coefficient is paired with whatever iterating X yields (the column
# names if X is a DataFrame - TODO confirm).
print('Betas: ', list(zip(ransac.estimator_.coef_, X)))
print('Beta0: %.2f' % ransac.estimator_.intercept_)  #Beta0

# 5.1.5.2 Theil-Sen regression
ts = TheilSenRegressor()
# train the algorithm on training data and predict using the testing data
pred_ts = ts.fit(X_train, y_train).predict(X_test)
y_predts = ts.predict(X_test)
print('Betas: ', list(zip(ts.coef_, X)))
print('Beta0: %.2f' % ts.intercept_)  #Beta0

# 5.1.5.3 Huber regression
huber = HuberRegressor(alpha=0.0)
# train the algorithm on training data and predict using the testing data
pred_huber = huber.fit(X_train, y_train).predict(X_test)
y_predhuber = huber.predict(X_test)
print('Betas: ', list(zip(huber.coef_, X)))
print('Beta0: %.2f' % huber.intercept_)  #Beta0

"""# Regression Model selection

After calculating different regression models it is necessary to compare models and evaluate which is the best given the database.

- MAE
- MSE
"""
epsilons) # assumes 3 dims results.append({ 'epsilon': epsilon, 'num_boxes': len(all_floors), 'filled_boxes': count_boxes(points, all_floors, epsilons) }) return results if __name__ == "__main__": data = pd.read_csv('CapDimData.dat', header=None) data = get_capacity_dimension(data) print(data) y = [log(i['filled_boxes']) for i in data] x = [log(1 / i['epsilon']) for i in data] regressor = TheilSenRegressor(random_state=42) regressor.fit(np.array(x)[:, np.newaxis], y) print(regressor.coef_) plt.plot(x, y) plt.plot(x, [regressor.predict(xx) for xx in x], color='red') plt.xlabel('log(1/epsilon)') plt.ylabel('log(num boxes)') plt.legend(['Data', 'Fit: slope {:.2}'.format(regressor.coef_[0])]) plt.show() # 2 dims - slope 1.7
#########
# Theil sen model
from sklearn.linear_model import TheilSenRegressor

# Theil Sen Regressor Model
# Instantiate (fixed seed so the fit is reproducible)
ts_reg = TheilSenRegressor(random_state=508)

# Fit
ts_reg.fit(X_train, y_train)

# Predict
y_pred = ts_reg.predict(X_test)

# Score (score() of a regressor is R^2 on the given data)
y_score_ts = ts_reg.score(X_test, y_test)
print(y_score_ts)

#############
# Regression tree
from sklearn.tree import DecisionTreeRegressor

# Regression trees
# Instantiate
# The criterion is left at its default, which is squared error in every
# scikit-learn release: spelled 'mse' in older versions and 'squared_error'
# in newer ones, where the 'mse' spelling is no longer accepted.
tree_reg = DecisionTreeRegressor(min_samples_leaf=14, random_state=508)
def main():
    """Pick the best regressor per sliding window and plot its test error.

    Rows for product id 38 are grouped by (cuatrimestre, anho) and aggregated
    to mean/max/min price and summed quantity. For each of four windows of
    five consecutive periods, the first four rows are the training set and
    the fifth is the test row. Every candidate regressor is cross-validated
    on the training rows with negative mean absolute error. The best-scoring
    one is refitted and its absolute error on the test row is recorded.
    The errors, the chosen model names and the error variance are printed,
    and the errors are plotted with their mean.
    """
    df = getTableVidrieria()
    filtrado = df.loc[df['idproducto'] == 38]

    # A dict of function lists instead of nested renaming dicts: the nested
    # form raises SpecificationError on pandas >= 1.0. The two-level result
    # columns are flattened to '<column>_<function>' names.
    agrupado = filtrado.groupby(['cuatrimestre', 'anho']).agg({
        'precioproducto': ['mean', 'max', 'min'],
        'cantidad': ['sum'],
    })
    agrupado.columns = ['_'.join(col) for col in agrupado.columns]
    agrupado = agrupado.reset_index()
    agrupado = agrupado.sort_values(by=['anho', 'cuatrimestre'])

    x = agrupado[['cuatrimestre', 'precioproducto_min']]
    y = agrupado[['cantidad_sum']]

    # (display name, factory) pairs; a factory gives a fresh unfitted model
    # for every use. Order matters: ties go to the earliest candidate.
    candidates = (
        ('Linear Regression', LinearRegression),
        ('Theil-Sen Regression', TheilSenRegressor),
        ('Gradient Boosting Regression',
         lambda: GradientBoostingRegressor(n_estimators=N_ESTIMATORS)),
        ('Extra Trees Regression',
         lambda: ExtraTreesRegressor(n_estimators=N_ESTIMATORS)),
        ('Ada Boost Regression',
         lambda: AdaBoostRegressor(n_estimators=N_ESTIMATORS)),
        ('Bagging Regression',
         lambda: BaggingRegressor(n_estimators=N_ESTIMATORS)),
        # MLPRegressor was commented out in the original and stays disabled.
    )

    aList = []
    nameList = []
    for i in range(0, 4):
        x_new = x[i:(i + 5)]
        y_new = y[i:(i + 5)]
        x_train, x_test = x_new[:4], x_new[4:]
        y_train, y_test = y_new[:4], y_new[4:]
        # 1-D target for both cross-validation and the final fit.
        y_train_flat = y_train.values.ravel()

        scores = [
            np.mean(
                cross_val_score(make_model(),
                                x_train,
                                y_train_flat,
                                cv=CV,
                                scoring='neg_mean_absolute_error'))
            for _, make_model in candidates
        ]
        # Scores are negated errors, so the maximum is the best model.
        best_name, make_best = candidates[scores.index(max(scores))]

        model = make_best().fit(x_train, y_train_flat)
        nameList.append(best_name)
        aList.append(mean_absolute_error(y_test, model.predict(x_test)))

    print(aList)
    print(nameList)
    print(np.var(aList))

    fig, ax = plt.subplots()
    ax.plot(aList, label='Data', marker='o')
    ax.plot([np.mean(aList)] * len(aList), label='Mean', linestyle='--')
    ax.legend(loc='upper right')
    plt.show()