class NadarayaWatsonUNLR(UnivariateNonlinearRegressor):
    kernel: KernelReg
    bandwidth: float

    def __init__(
        self,
        bandwidth: float = 0.25,
        random_state: Optional[Union[int, np.random.RandomState]] = None,
    ):
        """Instantiates a kernel regression model.

        Args:
            bandwidth: affects the scale on which to locally average samples
            random_state: random state which affects sample bootstrapping
        """
        super().__init__(random_state)
        self.bandwidth = bandwidth

    def _fit_univariate(self, x: np.ndarray, y: np.ndarray, w: Optional[np.ndarray]) -> None:
        if w is not None:
            x, y = self.weighted_resampler(x, y, w)
        self.kernel = KernelReg(endog=y, exog=x, var_type="c", bw=[self.bandwidth])

    def predict(self, x: np.ndarray) -> np.ndarray:
        return self.kernel.fit(x)[0]

    def derivative(self, x: np.ndarray) -> np.ndarray:
        return self.kernel.fit(x)[1].ravel()
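# A minimal sketch (synthetic data; the names below are illustrative, not part
# of the class above) of the statsmodels behaviour the class relies on:
# KernelReg.fit(data_predict) returns a (conditional mean, marginal effects)
# tuple, which predict() and derivative() unpack respectively.
import numpy as np
from statsmodels.nonparametric.kernel_regression import KernelReg

rng = np.random.RandomState(0)
x_demo = np.sort(rng.uniform(0, 1, 200))
y_demo = np.sin(2 * np.pi * x_demo) + rng.normal(scale=0.1, size=200)

kr_demo = KernelReg(endog=y_demo, exog=x_demo, var_type="c", bw=[0.05])
mean, mfx = kr_demo.fit(x_demo)        # conditional mean and marginal effects
print(mean.shape, mfx.ravel().shape)   # both (200,)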
def FWHM(wave, pertdata, mode='data', imin=False, ll_bw='cv_ls'):
    """Measure full width at half maximum. Mode can be 'data', 'll', or 'lc'."""
    fwhms = []
    imins = []
    mvels = []
    LLEs = []
    for i in tqdm(range(pertdata.shape[0])):
        data = pertdata[i, :]
        if mode in ['ll', 'lc']:
            lle = KernelReg(data, wave, 'c', reg_type=mode, bw=ll_bw)
            data = lle.fit()[0]
            LLEs.append(data)
            print('LLE bandwidth: ', lle.bw[0], end="\r")
        iplwave = np.linspace(wave.min(), wave.max(), 1000)
        ipldata = np.interp(iplwave, wave, data)
        iplidx = np.where(ipldata > ipldata.max() / 2)[0]
        vmin, vmax = iplidx.min(), iplidx.max()
        fwhms.append(iplwave[vmax] - iplwave[vmin])
        if imin:
            imins.append(1 - data.max())
            mvels.append(iplwave[ipldata.argmax()])
    if imin:
        return np.array(fwhms), np.array(mvels), np.array(imins), np.array(LLEs)
    return np.array(fwhms)
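# A quick sanity check for FWHM() (synthetic data; assumes the numpy/tqdm
# imports used above are in place): a Gaussian profile with standard deviation
# sigma has FWHM = 2*sqrt(2*ln 2)*sigma ~= 2.3548*sigma.
wave_demo = np.linspace(-10, 10, 500)
sigma = 1.5
profile = np.exp(-wave_demo ** 2 / (2 * sigma ** 2))
fwhm_est = FWHM(wave_demo, profile[np.newaxis, :], mode='data')
print(fwhm_est[0], 2 * np.sqrt(2 * np.log(2)) * sigma)  # both ~3.53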
def __init__(self, summaryfile=None, inwave=None, indata=None, inerrs=None,
             inmask=None, smooth=None):
    self.data = indata
    self.wave = inwave
    self.errs = inerrs
    self.mask = inmask
    if summaryfile:
        self.open_summary(summaryfile)

    # Interpolate masked areas
    self.data[self.mask] = np.nan
    self.nonnanidx = np.where(~self.mask)[0]
    self.interp = np.interp(self.wave, self.wave[self.nonnanidx],
                            self.data[self.nonnanidx])
    self.interr = np.interp(self.wave, self.wave[self.nonnanidx],
                            self.errs[self.nonnanidx])

    if smooth == 'll':
        lle = KernelReg(self.interp, self.wave, 'c', bw=[10])
        mean, marg = lle.fit()
        del marg
        self.smoothed = mean
    elif smooth == 'box':
        # mode='same' keeps the output the same length as the input; the
        # original plain np.convolve call changed the length and never
        # assigned self.smoothed.
        self.smoothed = np.convolve(self.data, np.array([1, 1, 1]) / 3,
                                    mode='same')
    else:
        self.smoothed = self.data
    self._build_plot()
def dataSmoothing3(changes):
    length = len(changes)
    x = np.linspace(1, length, num=length, endpoint=True)
    y = np.array(changes)
    kr = KernelReg(y, x, 'c')
    r_fit = kr.r_squared()  # call on the instance, not the unbound class method
    #plt.figure(1)
    #plt.subplot(131)
    #plt.plot(x, y, 'go-')
    #plt.title("Original", fontsize=20)
    #plt.xlabel('Periods', fontsize=20)
    #plt.ylabel('Dockerfile Size', fontsize=20)
    #plt.grid(True)
    if length < 20:
        x1 = np.linspace(1, length, num=3 * length, endpoint=True)
    else:
        x1 = x
    y_pred, y_mfx = kr.fit(x1)  # fit() returns (conditional mean, marginal effects)
    #plt.subplot(132)
    #plt.plot(x1, y_pred, 'bo-')
    #plt.title("Smoothing", fontsize=20)
    #plt.xlabel('Periods', fontsize=20)
    #plt.ylabel('Dockerfile Size', fontsize=20)
    #plt.grid(True)
    #plt.show()
    ynew = dataResampling(y_pred)
    xnew = np.linspace(1, 20, 20, endpoint=False)
    #plt.subplot(133)
    #plt.plot(xnew, ynew, 'ro-')
    #plt.title("Resampling", fontsize=20)
    #plt.xlabel('Periods', fontsize=20)
    #plt.ylabel('Dockerfile Size', fontsize=20)
    #plt.grid(True)
    #plt.show()
    return ynew, r_fit
def integrated_calibration_index_mod(y, p):
    """Version using local (kernel) regression.

    TODO: statsmodels.nonparametric.kernel_regression.KernelReg is very slow.
    It might be better to reimplement this in C++ or similar.
    """
    ll = KernelReg(endog=y, exog=p, reg_type='ll', var_type='o')
    return mean_absolute_error(y, ll.fit()[0])
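# A hedged usage sketch (synthetic data; assumes numpy and sklearn's
# mean_absolute_error are imported as the function above requires). For
# reasonably well-calibrated probabilities the index should be close to zero.
# Note this can be slow: the default cv_ls bandwidth search is expensive.
import numpy as np

rng = np.random.default_rng(0)
p_demo = rng.uniform(0.05, 0.95, 200)           # predicted probabilities
y_demo = rng.binomial(1, p_demo).astype(float)  # outcomes drawn at those rates
print(integrated_calibration_index_mod(y_demo, p_demo))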
def get_fitted_values(week):
    # week - identifies which s_spotify values to take s_streams for
    # build the working_df the model will operate on
    working_df = pd.read_csv(get_paths()[1] + "all_spotify.csv")
    working_df = working_df.drop(working_df.columns[[0]], axis=1)

    # run the regression
    y = np.array(list(working_df["streams"]))
    x_r = np.array(list(working_df["rank"]))
    x_s = np.array(list(working_df["s_streams"]))

    # rule-of-thumb bandwidth for the continuous variable
    var_cont = (np.var(x_s)) ** 0.5
    b_c = var_cont * (len(y) ** (-1 / 5))
    print(b_c)
    # ordered discrete variable bandwidth
    b_o = len(y) ** (-2 / 5)
    print(b_o)

    reg_new = KernelReg(y, [x_r, x_s], var_type="oc", reg_type="ll",
                        bw=[b_o, b_c])
    df_of_needed_week = working_df[working_df["week_f_show"] == week]
    last_week_sstreams = df_of_needed_week["s_streams"][-1:].values[0]
    fit_values = reg_new.fit([[i for i in range(1, 201)],
                              [last_week_sstreams for h in range(1, 201)]])[0]
    return fit_values
def kreg_demo1(hs=None, fast=True, fun='hisj'):
    """Compare KRegression to KernelReg from statsmodels.nonparametric

    Examples
    --------
    >>> kreg_demo1()
    """
    N = 100
    # ei = np.random.normal(loc=0, scale=0.075, size=(N,))
    ei = np.array([
        -0.08508516, 0.10462496, 0.07694448, -0.03080661, 0.05777525,
        0.06096313, -0.16572389, 0.01838912, -0.06251845, -0.09186784,
        -0.04304887, -0.13365788, -0.0185279, -0.07289167, 0.02319097,
        0.06887854, -0.08938374, -0.15181813, 0.03307712, 0.08523183,
        -0.0378058, -0.06312874, 0.01485772, 0.06307944, -0.0632959,
        0.18963205, 0.0369126, -0.01485447, 0.04037722, 0.0085057,
        -0.06912903, 0.02073998, 0.1174351, 0.17599277, -0.06842139,
        0.12587608, 0.07698113, -0.0032394, -0.12045792, -0.03132877,
        0.05047314, 0.02013453, 0.04080741, 0.00158392, 0.10237899,
        -0.09069682, 0.09242174, -0.15445323, 0.09190278, 0.07138498,
        0.03002497, 0.02495252, 0.01286942, 0.06449978, 0.03031802,
        0.11754861, -0.02322272, 0.00455867, -0.02132251, 0.09119446,
        -0.03210086, -0.06509545, 0.07306443, 0.04330647, 0.078111,
        -0.04146907, 0.05705476, 0.02492201, -0.03200572, -0.02859788,
        -0.05893749, 0.00089538, 0.0432551, 0.04001474, 0.04888828,
        -0.17708392, 0.16478644, 0.1171006, 0.11664846, 0.01410477,
        -0.12458953, -0.11692081, 0.0413047, -0.09292439, -0.07042327,
        0.14119701, -0.05114335, 0.04994696, -0.09520663, 0.04829406,
        -0.01603065, -0.1933216, 0.19352763, 0.11819496, 0.04567619,
        -0.08348306, 0.00812816, -0.00908206, 0.14528945, 0.02901065])
    x = np.linspace(0, 1, N)

    va_1 = 0.3 ** 2
    va_2 = 0.7 ** 2
    y0 = np.exp(-x ** 2 / (2 * va_1)) + 1.3 * np.exp(-(x - 1) ** 2 / (2 * va_2))
    y = y0 + ei
    kernel = Kernel('gauss', fun=fun)
    hopt = kernel.hisj(x)
    kreg = KRegression(
        x, y, p=0, hs=hs, kernel=kernel, xmin=-2 * hopt, xmax=1 + 2 * hopt)
    if fast:
        # Note: assigning __call__ on the instance does not change kreg(x),
        # since special-method lookup goes through the type.
        kreg.__call__ = kreg.eval_grid_fast

    f = kreg(x, output='plot', title='Kernel regression', plotflag=1)
    plt.figure(0)
    f.plot(label='p=0')

    kreg.p = 1
    f1 = kreg(x, output='plot', title='Kernel regression', plotflag=1)
    f1.plot(label='p=1')
    # print(f1.data)
    plt.plot(x, y, '.', label='data')
    plt.plot(x, y0, 'k', label='True model')

    from statsmodels.nonparametric.kernel_regression import KernelReg
    kreg2 = KernelReg(y, x, 'c')
    y2 = kreg2.fit(x)
    plt.plot(x, y2[0], 'm', label='statsmodel')
    plt.legend()
def smooth_xy(x, y):
    x = np.squeeze(x)
    y = np.squeeze(y)
    # v = lowess(y, x, frac=.05)
    kernel_reg = KernelReg(y, x, var_type='c', reg_type='lc')
    kernel_reg.bw = np.asarray([.01])  # override the cross-validated bandwidth
    y = kernel_reg.fit(x)[0]
    return x, y
class local_stack:
    def __init__(self):
        pass

    def fit(self, X_train, y_train):
        N, p = X_train.shape
        self.kernel = KernelReg(y_train, X_train, var_type=p * 'c')

    def predict(self, X):
        return self.kernel.fit(X)[0]
def pred_from_loess(self, train_x, train_y, x_to_pred):
    """
    Trains a simple local regression and returns predictions.
    Despite the name, this is local-linear kernel regression via
    statsmodels KernelReg rather than classical LOESS.
    """
    kr_model = KernelReg(endog=train_y, exog=train_x, var_type='c',
                         bw=[self.bandwidth])
    return kr_model.fit(x_to_pred)[0]
def __init__(self, x, y, yerr=None):
    reg = KernelReg([y], [x], var_type='c', reg_type='ll')
    vals = reg.fit(x)[0]
    self.spline = interp.UnivariateSpline(x, vals, w=np.isfinite(vals),
                                          ext='const')
    # calculate RMS and normalize to stop normalization drifting
    xs = np.linspace(np.min(x), np.max(x), 1000)
    ys = self.spline(xs)
    self.rms = np.sqrt(np.sum(ys ** 2) / 1000)
class LocalRegression:
    def __init__(self):
        pass

    def fit(self, X_train, y_train):
        # By default KernelReg performs local linear regression ('ll') with
        # least-squares cross-validated bandwidths.
        self.regression = KernelReg(y_train, X_train, var_type='c')
        return self

    def predict(self, X_test):
        return self.regression.fit(X_test)[0]
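# A minimal usage sketch of the wrapper above (synthetic data; names are
# illustrative). No constructor arguments are needed: KernelReg's defaults
# handle regression type and bandwidth selection.
import numpy as np

rng = np.random.RandomState(1)
X_demo = rng.uniform(-2, 2, size=(150, 1))
y_demo = np.cos(X_demo[:, 0]) + rng.normal(scale=0.1, size=150)

model_demo = LocalRegression().fit(X_demo, y_demo)
print(model_demo.predict(X_demo[:5]))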
def calc_smooth(prices: pd.Series, *,
                bw: Union[np.ndarray, str] = 'cv_ls',
                a: float = None,
                use_array: bool = True) -> Union[pd.Series, np.ndarray]:
    """Smooth a price series with kernel regression (Nadaraya-Watson style;
    note that reg_type='ll' performs a local linear fit).

    Args:
        prices (pd.Series): price data
        bw (Union[np.ndarray, str]): either a user-specified bandwidth or the
            method for bandwidth selection. Defaults to 'cv_ls'.
        a (float, optional): the bandwidth scaling factor described in the
            paper. Defaults to None.
        use_array (bool, optional): if True return np.ndarray, otherwise
            pd.Series. Defaults to True.

    Returns:
        Union[pd.Series, np.ndarray]
    """
    if not isinstance(prices, pd.Series):
        raise ValueError('prices must be a pd.Series')
    idx = np.arange(len(prices))
    kr = KernelReg(prices.values, idx, var_type='c', reg_type='ll', bw=bw)
    if a is None:
        f = kr.fit(idx)[0]
    else:
        kr.bw = a * kr.bw  # the paper uses 0.3 * h
        f = kr.fit(idx)[0]
    if use_array:
        return f
    return pd.Series(data=f, index=prices.index)
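# Usage sketch for calc_smooth (synthetic random-walk "prices"; the fixed
# bandwidth is an assumption to keep the example fast, since bw='cv_ls'
# would cross-validate).
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
prices_demo = pd.Series(100 + rng.normal(0, 1, 250).cumsum())
smoothed = calc_smooth(prices_demo, bw=np.array([3.0]), use_array=False)
print(smoothed.head())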
def find_extrema(s, bw='cv_ls'):
    """
    Input:
        s: prices as pd.Series
        bw: bandwidth as str or array-like

    Returns:
        extrema: extrema of prices as pd.Series
        prices: with 0-based index as pd.Series
        smooth_extrema: extrema of smooth_prices as pd.Series
        smooth_prices: smoothed prices using kernel regression as pd.Series
    """
    # Copy series so we can replace index and perform non-parametric
    # kernel regression.
    prices = s.copy()
    prices = prices.reset_index()
    prices.columns = ['date', 'price']
    prices = prices['price']

    kr = KernelReg([prices.values], [prices.index.to_numpy()],
                   var_type='c', bw=bw)
    f = kr.fit([prices.index])

    # Use smoothed prices to determine local minima and maxima
    smooth_prices = pd.Series(data=f[0], index=prices.index)
    smooth_local_max = argrelextrema(smooth_prices.values, np.greater)[0]
    smooth_local_min = argrelextrema(smooth_prices.values, np.less)[0]
    local_max_min = np.sort(
        np.concatenate([smooth_local_max, smooth_local_min]))
    smooth_extrema = smooth_prices.loc[local_max_min]

    # Iterate over extrema arrays returning datetime of passed
    # prices array. Uses idxmax and idxmin to window for local extrema.
    price_local_max_dt = []
    for i in smooth_local_max:
        if (i > 1) and (i < len(prices) - 1):
            price_local_max_dt.append(prices.iloc[i - 2:i + 2].idxmax())

    price_local_min_dt = []
    for i in smooth_local_min:
        if (i > 1) and (i < len(prices) - 1):
            price_local_min_dt.append(prices.iloc[i - 2:i + 2].idxmin())

    maxima = pd.Series(prices.loc[price_local_max_dt])
    minima = pd.Series(prices.loc[price_local_min_dt])
    extrema = pd.concat([maxima, minima]).sort_index()

    # Return series for each with bar as index
    return extrema, prices, smooth_extrema, smooth_prices
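# Usage sketch for find_extrema (synthetic data; a fixed bandwidth is assumed
# so the example does not spend time on cross-validation).
import numpy as np
import pandas as pd

rng = np.random.default_rng(7)
s_demo = pd.Series(np.sin(np.linspace(0, 6 * np.pi, 300))
                   + rng.normal(0, 0.1, 300))
extrema, prices, smooth_extrema, smooth_prices = find_extrema(s_demo, bw=[5.0])
print(len(extrema), "extrema found")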
class KernelModelWrapper(object):
    def __init__(self):
        self.model = None
        self.variable_types = {}
        self.X_shape = None
        self.y_shape = None

    def fit(self, X, y, variable_types={}):
        self.X_shape = X.shape
        self.y_shape = y.shape
        if variable_types:
            variable_type_string = ''.join(
                [variable_types[col] for col in X.columns])
            self.model = KernelReg(y, X, variable_type_string, reg_type='ll')
        else:
            self.model = KernelReg(y, X, 'c' * X.shape[1], reg_type='ll')
        return self

    def predict(self, X):
        if X.shape != self.X_shape:
            raise Exception("Expected shape {}, received {}".format(
                self.X_shape, X.shape))
        return self.model.fit(X)[0]
def find_max_min(prices):
    """Get min and max of a series consisting of prices."""
    prices_ = prices.copy()
    prices_.index = np.linspace(1., len(prices_), len(prices_))
    kr = KernelReg([prices_.values], [prices_.index.values],
                   var_type='c', bw=[1.8])
    f = kr.fit([prices_.index.values])
    smooth_prices = pd.Series(data=f[0], index=prices.index)

    local_max = argrelextrema(smooth_prices.values, np.greater)[0]
    local_min = argrelextrema(smooth_prices.values, np.less)[0]

    price_local_max_dt = []
    for i in local_max:
        if (i > 1) and (i < len(prices) - 1):
            # idxmax/idxmin return index labels, which prices.loc expects
            # below (argmax would return an integer position in modern pandas)
            price_local_max_dt.append(prices.iloc[i - 2:i + 2].idxmax())

    price_local_min_dt = []
    for i in local_min:
        if (i > 1) and (i < len(prices) - 1):
            price_local_min_dt.append(prices.iloc[i - 2:i + 2].idxmin())

    prices.name = 'price'
    maxima = pd.DataFrame(prices.loc[price_local_max_dt])
    minima = pd.DataFrame(prices.loc[price_local_min_dt])
    max_min = pd.concat([maxima, minima]).sort_index()
    max_min.index.name = 'date'
    max_min = max_min.reset_index()
    max_min = max_min[~max_min.date.duplicated()]
    p = prices.reset_index()
    max_min['day_num'] = p[p['index'].isin(max_min.date)].index.values
    max_min = max_min.set_index('day_num').price
    return max_min
def find_max_min(prices):
    prices_ = prices.copy()
    prices_.index = np.linspace(1., len(prices_), len(prices_))
    # kr = KernelReg([prices_.values], [prices_.index.values], var_type='c', bw=[1.8, 1])
    # Small bandwidths capture local structure; large ones capture global structure!
    # bw is either a user-specified bandwidth or the method for bandwidth
    # selection. If a string, valid values are 'cv_ls' (least-squares
    # cross-validation) and 'aic' (AIC Hurvich bandwidth estimation).
    # Default is 'cv_ls'.
    kr = KernelReg([prices_.values], [prices_.index.values], var_type='c',
                   bw=[2])
    f = kr.fit([prices_.index.values])
    smooth_prices = pd.Series(data=f[0], index=prices.index)

    local_max = argrelextrema(smooth_prices.values, np.greater)[0]
    local_min = argrelextrema(smooth_prices.values, np.less)[0]

    price_local_max_dt = []
    for i in local_max:
        if (i > 1) and (i < len(prices) - 1):
            # idxmax/idxmin return index labels, which prices.loc expects below
            price_local_max_dt.append(prices.iloc[i - 2:i + 2].idxmax())

    price_local_min_dt = []
    for i in local_min:
        if (i > 1) and (i < len(prices) - 1):
            price_local_min_dt.append(prices.iloc[i - 2:i + 2].idxmin())

    prices.name = 'price'
    maxima = pd.DataFrame(prices.loc[price_local_max_dt])
    minima = pd.DataFrame(prices.loc[price_local_min_dt])
    max_min = pd.concat([maxima, minima]).sort_index()
    max_min.index.name = 'date'
    max_min = max_min.reset_index()
    max_min = max_min[~max_min.date.duplicated()]
    p = prices.reset_index()
    max_min['day_num'] = p[p['index'].isin(max_min.date)].index.values
    max_min = max_min.set_index('day_num').price
    return max_min
def estimator_nw(data, est_kwargs={}, **kwargs):
    from statsmodels.nonparametric.kernel_regression import KernelReg
    # http://www.statsmodels.org/dev/generated/statsmodels.nonparametric.kernel_density.EstimatorSettings.html
    from statsmodels.nonparametric.kernel_regression import EstimatorSettings

    k = len(data['x']['Train'].T)
    # n = len(data['x']['Train'])

    if 'reg_type' in est_kwargs.keys():
        reg_type = est_kwargs['reg_type']  # Allows for locally linear estimation
    else:
        reg_type = 'lc'  # Default is local constant (Nadaraya-Watson).

    # Estimate model
    nw = KernelReg(
        data['y']['Train'],
        data['x']['Train'],  # Fits regression
        var_type='c' * k,    # Continuous variables
        reg_type=reg_type,
        bw='aic',            # AIC Hurvich bandwidth selection; 'cv_ls' for least-squares CV
        defaults=EstimatorSettings(
            n_jobs=1,        # No parallelism
            efficient=True,
            randomize=True,  # bw estimation via random subsampling
            n_res=25,        # Number of resamples
            n_sub=50,        # Size of samples
        ),
    )
    betahat = np.array([])  # Nonparametric model has no coefficients

    # Extract results
    prob, mrgeff = {}, {}
    for split in ('Train', 'Test'):
        prob[split], mrgeff[split] = nw.fit(data_predict=data['x'][split])
    return betahat, prob, mrgeff
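# The `data` argument above is a nested dict of train/test splits; a minimal
# synthetic example of that assumed shape (names are illustrative):
import numpy as np

rng = np.random.RandomState(0)
X = rng.normal(size=(120, 2))
y = (X[:, 0] + 0.5 * X[:, 1] + rng.normal(scale=0.2, size=120) > 0).astype(float)
data_demo = {
    'x': {'Train': X[:80], 'Test': X[80:]},
    'y': {'Train': y[:80], 'Test': y[80:]},
}
betahat, prob, mrgeff = estimator_nw(data_demo)
print(prob['Test'][:5])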
class Surface:
    def __init__(self, f, f2, pts3d, left_pts, right_pts, oldpts3d,
                 safety_check=False):
        self.f = f
        self.f2 = f2
        self.safety_check = safety_check
        self.pts3d = np.matrix(pts3d)
        self.minimum = np.min(self.pts3d[:, 2])
        self.maximum = np.max(self.pts3d[:, 2])
        self.oldpts3d = oldpts3d
        self.left_pts = left_pts
        self.right_pts = right_pts
        pts2d = []
        ptsz = []
        f3 = open("../calibration_data/camera_matrix.p", "rb")
        self.cmat = pickle.load(f3)
        f3.close()
        for pt in pts3d:
            pts2d.append(pt[:2])
            ptsz.append(np.ceil(pt[2] * 1000000))
        self.neigh = KNeighborsClassifier(n_neighbors=2)
        self.neigh.fit(pts2d, ptsz)
        self.f = scipy.interpolate.Rbf(np.matrix(pts3d)[:, 0].ravel(),
                                       np.matrix(pts3d)[:, 1].ravel(),
                                       np.matrix(pts3d)[:, 2].ravel(),
                                       function='linear', epsilon=.1)
        pts3d = np.array(pts3d).T
        print(pts3d.shape)
        print(pts3d[:2, :].shape, pts3d[2, :].shape)
        self.f = KernelReg(pts3d[2, :], pts3d[:2, :], 'cc')

    def leftpixels_to_rframe(self, x, y):
        surf = self.f2
        left_pts = self.left_pts
        right_pts = self.right_pts
        pts3d = self.oldpts3d
        xin = np.array([a[0] for a in left_pts])
        bias = np.ones(len(xin))
        yin = np.array([a[1] for a in left_pts])
        xout = np.array([a[0] for a in pts3d])
        yout = np.array([a[1] for a in pts3d])
        A = np.vstack([xin, bias]).T
        m1, c1 = np.linalg.lstsq(A, xout)[0]
        A = np.vstack([yin, bias]).T
        m2, c2 = np.linalg.lstsq(A, yout)[0]
        xnew = m1 * x + c1
        ynew = m2 * y + c2
        cpoint = np.matrix([(xnew, ynew, self.f2(xnew, ynew))])
        pt = np.ones(4)
        pt[:3] = cpoint
        pred = self.cmat * np.matrix(pt).T
        return pred

    def query(self, x, y):
        temp = self.f.fit(np.array((x, y)))[0][0]
        if not self.safety_check:
            return (x, y, temp)
        if temp < self.minimum - 0.02:
            temp = self.query_knn(x, y)[2]
        elif temp > self.maximum + 0.02:
            temp = self.query_knn(x, y)[2]
        print('asdf', temp)
        return (x, y, temp)

    def query_knn(self, x, y):
        return (x, y, (self.neigh.predict([[x, y]]) / 1000000.0)[0])

    def visualize(self):
        fig = plt.figure()
        ax = fig.add_subplot(111)
        pts3d = np.matrix(self.pts3d)
        f = self.f
        a, b = np.ravel(np.min(pts3d, axis=0)), np.ravel(np.max(pts3d, axis=0))
        extra_range = 0.0
        # xnew = np.arange(a[0] - extra_range, b[0] + extra_range, 0.0001)
        # ynew = np.arange(a[1] - extra_range, b[1] + extra_range, 0.0001)
        X, Y = np.mgrid[a[0] + .05:b[0] - .05:100j, a[1]:b[1]:100j]
fairK = np.array((3, 5, 9, 15, 20, 25, 30, 35, 40, 45))
event_lengths = durs_run1_new / fairK
unique_event_lengths = np.unique(event_lengths)
x = event_lengths.ravel()
test_x = np.linspace(min(x), max(x), num=100)

smooth_wva = np.zeros((len(unique_event_lengths), len(ROI_data), nBoots))
opt_bw_holder = np.zeros((nBoots, len(ROI_data)))

for ROI in range(len(ROI_data)):
    for b in range(nBoots):
        opt_bw = 0
        y = ROI_data[ROI][:, :, b].ravel()
        KR = KernelReg(y, x, var_type='c')
        opt_bw += KR.bw / len(ROI_data)
        opt_bw_holder[b, ROI] = opt_bw
        y = ROI_data[ROI][:, :, b].ravel()
        KR = KernelReg(y, x, var_type='c', bw=opt_bw)
        smooth_wva[:, ROI, b] += KR.fit(unique_event_lengths)[0]

np.save(datadir + 'smooth_' + suffix + '_' + save_fn
        + '_auto_independent_bandwidths', smooth_wva)
np.save(datadir + 'smooth_' + suffix + '_' + save_fn
        + '_auto_independent_optimal_bandwidth', opt_bw_holder)
class CausalEffect(object):
    def __init__(self, X, causes, effects, admissable_set=[],
                 variable_types=None, expectation=False, density=True):
        """
        We want to calculate the causal effect of X on Y through
        back-door adjustment, P(Y|do(X)) = Sum( P(Y|X,Z)P(Z), Z), for
        some admissible set of control variables, Z.  First we calculate
        the conditional density P(Y|X,Z), then the density P(Z).  We find
        the support of Z so we can properly sum over it later.
        variable_types are a dictionary with the column name pointing to
        an element of set(['o', 'u', 'c']), for 'ordered', 'unordered
        discrete', or 'continuous'.
        """
        conditional_density_vars = causes + admissable_set
        self.causes = causes
        self.effects = effects
        self.admissable_set = admissable_set
        self.conditional_density_vars = conditional_density_vars

        if variable_types:
            self.variable_types = variable_types
            dep_type = [variable_types[var] for var in effects]
            indep_type = [variable_types[var]
                          for var in conditional_density_vars]
            density_types = [variable_types[var] for var in admissable_set]
        else:
            self.variable_types = self.__infer_variable_types(X)

        if 'c' not in variable_types.values():
            bw = 'cv_ml'
        else:
            bw = 'normal_reference'

        if admissable_set:
            self.density = KDEMultivariate(X[admissable_set],
                                           var_type=''.join(density_types),
                                           bw=bw)

        self.conditional_density = KDEMultivariateConditional(
            endog=X[effects],
            exog=X[conditional_density_vars],
            dep_type=''.join(dep_type),
            indep_type=''.join(indep_type),
            bw=bw)
        if expectation:
            self.conditional_expectation = KernelReg(
                X[effects].values,
                X[conditional_density_vars].values,
                ''.join(indep_type),
                bw='cv_ls')
        self.support = self.__get_support(X)

        self.discrete_variables = [
            variable for variable, var_type in self.variable_types.items()
            if var_type in ['o', 'u']]
        self.discrete_Z = list(
            set(self.discrete_variables).intersection(set(admissable_set)))
        self.continuous_variables = [
            variable for variable, var_type in self.variable_types.items()
            if var_type == 'c']
        self.continuous_Z = list(
            set(self.continuous_variables).intersection(set(admissable_set)))

    def __infer_variable_types(self, X):
        """
        fill this in later.
        """
        pass

    def __get_support(self, X):
        """
        find the smallest cube around which the densities are supported,
        allowing a little flexibility for variables with larger bandwidths.
        """
        data_support = {variable: (X[variable].min(), X[variable].max())
                        for variable in X.columns}
        variable_bandwidths = {
            variable: bw
            for variable, bw in zip(
                self.effects + self.conditional_density_vars,
                self.conditional_density.bw)}
        support = {}
        for variable in self.effects + self.conditional_density_vars:
            if self.variable_types[variable] == 'c':
                lower_support = (data_support[variable][0]
                                 - 10. * variable_bandwidths[variable])
                upper_support = (data_support[variable][1]
                                 + 10. * variable_bandwidths[variable])
                support[variable] = (lower_support, upper_support)
            else:
                support[variable] = data_support[variable]
        return support

    def integration_function(self, *args):
        # takes continuous z, discrete z, then x
        data = pd.DataFrame({
            k: [v] for k, v in zip(
                self.continuous_Z + self.discrete_Z + self.causes
                + self.effects, args)})
        conditional = self.conditional_density.pdf(
            exog_predict=data[self.conditional_density_vars].values[0],
            endog_predict=data[self.effects].values[0])
        density = self.density.pdf(data_predict=data[self.admissable_set])
        return conditional * density

    def expectation_integration_function(self, *args):
        data = pd.DataFrame({
            k: [v] for k, v in zip(
                self.continuous_Z + self.discrete_Z + self.causes, args)})
        conditional = self.conditional_expectation.fit(
            data_predict=data[self.conditional_density_vars].values)[0]
        density = self.density.pdf(data_predict=data[self.admissable_set])
        return conditional * density

    def pdf(self, x):
        """
        Currently, this does the whole sum/integral over the cube support
        of Z.  We may be able to improve this by taking into account how
        the joint and conditionals factorize, and/or finding a more
        efficient support.

        This should be reasonably fast for |Z| <= 2 or 3, and small enough
        discrete variable cardinalities.  It runs in O(n_1 n_2 ... n_k) in
        the cardinality of the discrete variables, |Z_1| = n_1, etc.  It
        likewise runs in O(V^n) for n continuous Z variables.  Factorizing
        the joint/conditional distributions in the sum could linearize the
        runtime.
        """
        causal_effect = 0.
        x = x[self.causes + self.effects]
        if self.discrete_Z:
            discrete_variable_ranges = [
                range(*(int(self.support[variable][0]),
                        int(self.support[variable][1]) + 1))
                for variable in self.discrete_Z]
            for z_vals in itertools.product(*discrete_variable_ranges):
                z_discrete = pd.DataFrame(
                    {k: [v] for k, v in zip(self.discrete_Z, z_vals)})
                if self.continuous_Z:
                    continuous_Z_ranges = [self.support[variable]
                                           for variable in self.continuous_Z]
                    args = z_discrete.join(x).values[0]
                    causal_effect += nquad(self.integration_function,
                                           continuous_Z_ranges,
                                           args=args)[0]
                else:
                    z_discrete = z_discrete[self.admissable_set]
                    exog_predictors = x.join(z_discrete)[
                        self.conditional_density_vars]
                    conditional = self.conditional_density.pdf(
                        exog_predict=exog_predictors,
                        endog_predict=x[self.effects])
                    density = self.density.pdf(data_predict=z_discrete)
                    dc = conditional * density
                    causal_effect += dc
            return causal_effect
        elif self.continuous_Z:
            continuous_Z_ranges = [self.support[var]
                                   for var in self.continuous_Z]
            causal_effect, error = nquad(self.integration_function,
                                         continuous_Z_ranges,
                                         args=tuple(x.values[0]))
            return causal_effect
        else:
            return self.conditional_density.pdf(
                exog_predict=x[self.causes],
                endog_predict=x[self.effects])

    def expected_value(self, x):
        """
        Currently, this does the whole sum/integral over the cube support
        of Z.  We may be able to improve this by taking into account how
        the joint and conditionals factorize, and/or finding a more
        efficient support.

        This should be reasonably fast for |Z| <= 2 or 3, and small enough
        discrete variable cardinalities.  It runs in O(n_1 n_2 ... n_k) in
        the cardinality of the discrete variables, |Z_1| = n_1, etc.  It
        likewise runs in O(V^n) for n continuous Z variables.  Factorizing
        the joint/conditional distributions in the sum could linearize the
        runtime.
        """
        causal_effect = 0.
        x = x[self.causes]
        if self.discrete_Z:
            discrete_variable_ranges = [
                range(*(int(self.support[variable][0]),
                        int(self.support[variable][1]) + 1))
                for variable in self.discrete_Z]
            for z_vals in itertools.product(*discrete_variable_ranges):
                z_discrete = pd.DataFrame(
                    {k: [v] for k, v in zip(self.discrete_Z, z_vals)})
                if self.continuous_Z:
                    continuous_Z_ranges = [self.support[variable]
                                           for variable in self.continuous_Z]
                    args = z_discrete.join(x).values[0]
                    causal_effect += nquad(
                        self.expectation_integration_function,
                        continuous_Z_ranges, args=args)[0]
                else:
                    z_discrete = z_discrete[self.admissable_set]
                    exog_predictors = x.join(z_discrete)[
                        self.conditional_density_vars]
                    causal_effect += (
                        self.conditional_expectation.fit(
                            data_predict=exog_predictors.values)[0]
                        * self.density.pdf(data_predict=z_discrete.values))
            return causal_effect
        elif self.continuous_Z:
            continuous_Z_ranges = [self.support[var]
                                   for var in self.continuous_Z]
            causal_effect, error = nquad(
                self.expectation_integration_function,
                continuous_Z_ranges, args=tuple(x.values[0]))
            return causal_effect
        else:
            return self.conditional_expectation.fit(
                data_predict=x[self.causes])[0]
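# A hedged usage sketch for CausalEffect (synthetic data; assumes the module
# imports pandas/numpy plus the statsmodels KDE classes referenced above).
# With a single continuous confounder z, expected_value integrates
# E[y | x, z] * p(z) over z. Note: bandwidth selection plus numerical
# integration makes this slow even at modest sample sizes.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
n = 150
z = rng.normal(size=n)
x_obs = z + rng.normal(scale=0.5, size=n)
y_obs = 2 * x_obs + z + rng.normal(scale=0.5, size=n)
X_df = pd.DataFrame({'x': x_obs, 'y': y_obs, 'z': z})

effect = CausalEffect(X_df, causes=['x'], effects=['y'],
                      admissable_set=['z'],
                      variable_types={'x': 'c', 'y': 'c', 'z': 'c'},
                      expectation=True)
print(effect.expected_value(pd.DataFrame({'x': [1.0]})))  # roughly 2.0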
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.nonparametric.kernel_regression import KernelReg

x = np.sort(np.random.rand(400) * 10 - 2)
y = (x**4 - 8 * (x**3) + 14 * (x**2) - 32 * x + 14
     + (np.random.rand(len(x)) - 0.5) * 50)
y_clean = x**4 - 8 * (x**3) + 14 * (x**2) - 32 * x + 14

reg = KernelReg(y, x, 'c')
mean, mfx = reg.fit()

plt.figure()
plt.scatter(x, y)
plt.plot(x, mean, color="red")
plt.plot(x, y_clean, color="green")
plt.show()
def model(self):
    # Time the modelling
    start_time = time.perf_counter()  # time.clock() was removed in Python 3.8

    # Extract dependent and independent variables
    y = self.df['impl_volatility'].values
    x = self.df[['strike_price', 'stock', 'T', 'riskfree']].values

    # Activate efficient bandwidth selection
    if self.bandwidth is None:
        self.efficient = True
        self.bandwidth = 'cv_ls'
        print('No predetermined bandwidth selected. '
              'Optimizing the bandwidth instead.')
    # Bandwidth defined by Scott D.W.
    elif self.bandwidth == 'bw_scott':
        self.bandwidth = bw_scott(x)
        # self.bandwidth = self.bandwidth*()
        print('Selected bandwidth: ', self.bandwidth)
    # Bandwidth defined by Silverman B.W.
    elif self.bandwidth == 'bw_silverman':
        self.bandwidth = bw_silverman(x)
        print('Selected bandwidth: ', self.bandwidth)
    # Or else pass your own bandwidth array through
    else:
        pass

    # Optimize the bandwidth selection if no other bandwidth selection
    # method is defined. See more on the statsmodels GitHub page:
    # https://github.com/statsmodels/statsmodels/blob/master/statsmodels/nonparametric/_kernel_base.py
    defaults = EstimatorSettings(efficient=self.efficient,
                                 randomize=False,
                                 n_sub=50,
                                 n_res=50,
                                 n_jobs=0,
                                 return_only_bw=True)

    # Preprocess the data for faster computation
    x = preprocessing.normalize(x)

    # Split the data into training and testing sets for in- and
    # out-of-sample testing
    xtrain, xtest, ytrain, ytest = train_test_split(x, y)

    # Define the regressor, with continuous variables and the bandwidth selection
    reg = KernelReg(endog=ytrain, exog=xtrain, var_type='cccc',
                    bw=self.bandwidth, defaults=defaults)

    # Fit the test data to get an out-of-sample prediction
    pred = reg.fit(xtest)[0]

    # Report the test results in the form of RMSE and out-of-sample R^2
    print('RMSE: ', np.sqrt(mean_squared_error(ytest, pred)))
    print('Out of Sample R^2 :', r2_score(ytest, pred))
    # print('In sample ', reg.r_squared())

    # Print the computing time
    print('Estimation time: ', time.perf_counter() - start_time, "seconds")
    return reg
eigen_solver="auto", tol=1e-9, max_iter=3000, n_jobs=-1) feature_coords = kpca.fit_transform((sim_mat**2) * -0.5) landfalls = np.array([float(h.made_landfall) for h in hurricane_list]) inds = np.argsort(feature_coords[:, 0]) feature_coords_sorted = feature_coords[inds] landfalls_sorted = landfalls[inds] vartypes = ''.join('c' * target_dim) reg = KernelReg(landfalls_sorted, feature_coords_sorted, vartypes) [mean, mfx] = reg.fit() # plt.figure() # plt.scatter(feature_coords_sorted[:,0], landfalls_sorted, color="green") # plt.plot(feature_coords_sorted[:,0], mean, color="red") # plt.show() cv_feature_coords = kpca.transform((data_matrix**2) * -0.5) # print cv_feature_coords [cv_mean, cv_mfx] = reg.fit(cv_feature_coords) # print cv_mean cv_predicted = np.zeros(m) cv_high_prob = np.zeros(m) + 0.5 num_high_prob = 0 thresh = 0.05
event_lengths = durs_run1_new / fairK
unique_event_lengths = np.unique(event_lengths)
x = event_lengths.ravel()
ROI_data = [a1_data, AG_data, prec_data, mpfc_data]
# ROI_data = [a1_data, AG_data, prec_data]
test_x = np.linspace(min(x), max(x), num=100)
smooth_wva = np.zeros((len(unique_event_lengths), len(ROI_data), nBoots))

for b in range(nBoots):
    # Optimize bandwidth
    opt_bw = 0
    for ROI in range(len(ROI_data)):
        y = ROI_data[ROI][:, :, b].ravel()
        KR = KernelReg(y, x, var_type='c')
        opt_bw += KR.bw / len(ROI_data)

    max_wva = np.zeros(len(ROI_data))
    for ROI in range(len(ROI_data)):
        y = ROI_data[ROI][:, :, b].ravel()
        KR = KernelReg(y, x, var_type='c', bw=opt_bw)
        max_wva[ROI] = np.argmax(KR.fit(test_x)[0])  # Find peak on fine grid
        smooth_wva[:, ROI, b] += KR.fit(unique_event_lengths)[0]

np.save(datadir + 'smooth_wva_split_merge_01_a1_prec_AG_bilmPFC', smooth_wva)
x4 = xax4
y4 = tweetatsec4

pyplot.xlabel('Second')
pyplot.ylabel('Total tweet')
pyplot.scatter(x, y, color='cyan')
pyplot.scatter(x2, y2, color='red')
pyplot.scatter(x3, y3, color='blue')
pyplot.scatter(x4, y4, color='green')

kr = KernelReg(y, x, 'o')
kr2 = KernelReg(y2, x2, 'o')
kr3 = KernelReg(y3, x3, 'o')
kr4 = KernelReg(y4, x4, 'o')

pyplot.plot(x, y, '+')
pyplot.plot(x2, y2, '+')
pyplot.plot(x3, y3, '+')
pyplot.plot(x4, y4, '+')

y_pred, y_mfx = kr.fit(x)  # fit() returns (conditional mean, marginal effects)
y2_pred, y2_mfx = kr2.fit(x2)
y3_pred, y3_mfx = kr3.fit(x3)
y4_pred, y4_mfx = kr4.fit(x4)

pyplot.plot(x, y_pred, 'cyan', label='twitter')
pyplot.plot(x2, y2_pred, 'red', label='facebook')
pyplot.plot(x3, y3_pred, 'blue', label='instagram')
pyplot.plot(x4, y4_pred, 'green', label='tumblr')
pyplot.legend(loc='upper right')
pyplot.show()
# compute average max wva across songs
mean_max_wva = np.mean(max_wvas)

# compute average event lengths using song durations divided by number of events
durs_run1_new = durs_run1[:, np.newaxis]
event_lengths = durs_run1_new / K_set
unique_event_lengths = np.unique(event_lengths)
x = event_lengths.ravel()
test_x = np.linspace(min(x), max(x), num=100)

y = ROI_WvA.ravel()
KR = KernelReg(y, x, var_type='c')
KR_w_bw = KernelReg(y, x, var_type='c', bw=KR.bw)
smooth_wva = KR_w_bw.fit(unique_event_lengths)[0]
max_wva = np.max(smooth_wva)

# compute roi's preferred event length in seconds
ROI_pref_sec = unique_event_lengths[np.argmax(smooth_wva)]

inputs = [ROI_WvA, smooth_wva, max_wva, mean_max_wva, ROI_pref_sec]
dct = {}
for i, j in zip(dict_names, inputs):
    dct.setdefault(i, []).append(j)

np.save(savedir + 'parcel' + roiNum + '_wva_data', dct)
def get_regularized_params(
    model_parameters,
    genes,
    genes_step1,
    genes_log10_gmean_step1,
    genes_log10_gmean,
    cell_attr,
    umi,
    batch_var=None,
    bw_adjust=3,
    gmean_eps=1,
    theta_regularization="od_factor",
    exclude_poisson=False,
    poisson_genes=None,
    method="theta_ml",
):
    model_parameters = model_parameters.copy()
    model_parameters_fit = pd.DataFrame(
        npy.nan, index=genes, columns=model_parameters.columns
    )
    """
    exog_predict = genes_log10_gmean  # .values
    for column in model_parameters.columns:
        if column == "theta":
            continue
        endog = model_parameters.loc[genes_step1, column].values
        exog_fit = genes_log10_gmean_step1  # .values
        bw = bwSJ(genes_log10_gmean_step1, bw_adjust=bw_adjust)  # .values)
        reg = KernelReg(endog=endog, exog=exog_fit, var_type="c",
                        reg_type="ll", bw=bw)
        model_parameters_fit[column] = reg.fit(exog_predict)[0]
    """
    x_points_df = pd.DataFrame({"gene_log10_gmean": genes_log10_gmean})
    x_points_df["min_gene_log10_gmean_step1"] = genes_log10_gmean_step1.min()
    x_points_df["x_points"] = npy.nanmax(x_points_df, axis=1)
    x_points_df["max_gene_log10_gmean_step1"] = npy.nanmax(genes_log10_gmean_step1)
    x_points_df["x_points"] = x_points_df[
        ["x_points", "max_gene_log10_gmean_step1"]
    ].min(1)
    x_points = x_points_df["x_points"].values
    for column in model_parameters.columns:
        if column == "theta":
            continue
        endog = model_parameters.loc[genes_step1, column].values
        exog_fit = genes_log10_gmean_step1  # .values
        if method == "glgmp":
            bw = bw_SJr(genes_log10_gmean_step1, bw_adjust=bw_adjust)
            params = ksmooth(genes_log10_gmean, genes_log10_gmean_step1,
                             endog, bw[0])
            index = model_parameters_fit.index.values[params["order"] - 1]
            model_parameters_fit.loc[index, column] = params["smoothed"]
        else:
            bw = bwSJ(genes_log10_gmean_step1, bw_adjust=bw_adjust)
            reg = KernelReg(endog=endog, exog=exog_fit, var_type="c",
                            reg_type="ll", bw=bw)
            fit = reg.fit(x_points)
            model_parameters_fit[column] = npy.squeeze(fit[0])
    if theta_regularization == "theta":
        theta = npy.power(10, (model_parameters["od_factor"]))
    else:
        theta = npy.power(10, genes_log10_gmean) / (
            npy.power(10, model_parameters_fit["od_factor"]) - 1
        )
    model_parameters_fit["theta"] = theta
    if exclude_poisson:
        # replace theta with inf for Poisson genes
        if poisson_genes is not None:
            model_parameters_fit.loc[poisson_genes, "theta"] = npy.inf
    return model_parameters_fit
for row in range(xin.shape[0]):
    xin_list.append(float(xin.iloc[row]))

yin = pd.read_csv('yin.csv', names=['y'])
yin_list = []
for row in range(yin.shape[0]):
    yin_list.append(float(yin.iloc[row]))

df = pd.concat([yin, xin], axis=1)

# Using statsmodels: note KernelReg takes (endog, exog), so here x is the
# response and y the predictor, fit with a local linear kernel and a fixed
# bandwidth of 3.2.
kde = KernelReg(x, y, var_type='c', reg_type='ll', bw=[3.2])
estimator = kde.fit(y)
estimator = np.reshape(estimator[0], df.shape[0])

plt.scatter(x, y)
plt.scatter(x, estimator, c='r')
plt.show()

# Using SKFDA
df_grid = skfda.FDataGrid(df)
bandwidth = np.arange(0.1, 5, 0.2)
llr = val.SmoothingParameterSearch(
    ks.LocalLinearRegressionSmoother(), bandwidth)
def selector(case):
    if case == 1:
        results_dir = create_results_directory('./results/paper/dtr_vs_xgb')
        x, y = load_boston(return_X_y=True)
        x = pd.DataFrame(x, columns=[
            'crime', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis',
            'rad', 'tax', 'ptratio', 'blacks', 'lstat'
        ])
        x = x[['rm', 'lstat']]
        df_all = x.copy()
        df_all['price'] = y

        # Plot 3D scatter
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(df_all['rm'], df_all['lstat'], df_all['price'])
        ax.view_init(30, 135)
        plt.savefig(f'{results_dir}/scatter.png')
        plt.close()

        # Decision tree regression
        dtr = DecisionTreeRegressor(max_depth=2)
        dtr.fit(x, y)
        plot_tree(dtr, impurity=False)
        plt.savefig(f'{results_dir}/dtr_visual.png')
        plt.close()

        x_min = x.min(axis=0)
        x_max = x.max(axis=0)
        rm_linspace = np.linspace(x_min['rm'], x_max['rm'], 100)
        lstat_linspace = np.linspace(x_min['lstat'], x_max['lstat'], 100)
        rm, lstat = np.meshgrid(rm_linspace, lstat_linspace)
        points = np.stack(map(np.ravel, (rm, lstat)), axis=1)
        z = dtr.predict(points).reshape(rm.shape)
        fig = plt.figure()
        ax = fig.gca(projection='3d')
        ax.plot_surface(rm, lstat, z, cmap=plt.cm.BuGn, linewidth=0.2, vmin=-50)
        ax.view_init(30, 135)
        plt.savefig(f'{results_dir}/dtr_prediction.png')
        plt.close()

        # Linear regression
        lr = LinearRegression().fit(x, y)
        z = lr.predict(points).reshape(rm.shape)
        fig = plt.figure()
        ax = fig.gca(projection='3d')
        ax.plot_surface(rm, lstat, z, cmap=plt.cm.BuGn, linewidth=0.2, vmin=-50)
        ax.view_init(30, 135)
        plt.savefig(f'{results_dir}/lr_prediction.png')
        plt.close()

        # Kernel regression
        kr = KernelReg(exog=x, endog=y, var_type='cc')
        z = kr.fit(points)[0].reshape(rm.shape)
        fig = plt.figure()
        ax = fig.gca(projection='3d')
        ax.plot_surface(rm, lstat, z, cmap=plt.cm.BuGn, linewidth=0.2, vmin=-50)
        ax.view_init(30, 135)
        plt.savefig(f'{results_dir}/kr_prediction.png')
        plt.close()

        # XGB
        hparams = {
            'seed': 42,
            'booster': 'gbtree',
            'learning_rate': 0.1,
            'objective': 'reg:squarederror',
            'verbosity': 0,
            'subsample': 1,
            'max_depth': 2,
            'colsample_bytree': 0.5,
        }
        dtrain = xgb.DMatrix(x.values, label=y)
        model = xgb.train(hparams, dtrain=dtrain, num_boost_round=100,
                          verbose_eval=False)
        z_xgb = model.predict(xgb.DMatrix(points)).reshape(rm.shape)
        fig = plt.figure()
        ax = fig.gca(projection='3d')
        ax.plot_surface(rm, lstat, z_xgb, cmap=plt.cm.BuGn, linewidth=0.2,
                        vmin=-50)
        ax.view_init(30, 135)
        plt.savefig(f'{results_dir}/xgb_prediction.png')
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 24 00:18:16 2019

KernelReg practice

@author: mbattley
"""
from statsmodels.nonparametric.kernel_regression import KernelReg
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 2 * np.pi, 100)
y = np.sin(x) + np.random.random(100) * 0.2

# The third parameter specifies the type of the variable x;
# 'c' stands for continuous
kr = KernelReg(y, x, 'c')
plt.plot(x, y, '+')
y_pred, y_mfx = kr.fit(x)  # fit() returns (conditional mean, marginal effects)
plt.plot(x, y_pred)
plt.show()