def get_linear_regression_results(x, y, intercept=False, rolling_window=-1):
    """
    :type x: pd.Series
    :type y: pd.Series
    """
    if rolling_window != -1:
        model = pd.ols(x=x, y=y, intercept=intercept,
                       window=rolling_window, window_type='rolling')
    else:
        model = pd.ols(x=x, y=y, intercept=intercept)
    # The type of 'beta' actually changes: DataFrame if rolling, else Series
    slope = model.beta['x'] if rolling_window != -1 else model.beta[0]
    const = model.beta['intercept'] if intercept else 0
    result = pd.concat([x, y], axis=1)
    # remove the start that cannot be fit using rolling window mode:
    if rolling_window != -1:
        result = result.iloc[rolling_window - 1:, :]
    result.loc[:, 'slope'] = slope
    result.loc[:, 'const'] = const
    result.loc[:, 'f(x)'] = result.iloc[:, 0] * slope + const
    result.loc[:, 'error'] = result.loc[:, 'f(x)'] - result.iloc[:, 1]
    return result
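# NOTE: pd.ols was removed in pandas 0.20. Below is a minimal sketch of the
# same computation with statsmodels (OLS plus RollingOLS); the helper name
# `get_linear_regression_results_sm` is an assumption, not part of the
# original code, and x/y are assumed to be named pd.Series.
import pandas as pd
import statsmodels.api as sm
from statsmodels.regression.rolling import RollingOLS


def get_linear_regression_results_sm(x, y, intercept=False, rolling_window=-1):
    exog = sm.add_constant(x) if intercept else x.to_frame()
    if rolling_window != -1:
        # params is a DataFrame: one row of coefficients per window end
        params = RollingOLS(y, exog, window=rolling_window).fit().params
    else:
        params = sm.OLS(y, exog, missing='drop').fit().params
    slope = params[x.name]
    const = params['const'] if intercept else 0
    result = pd.concat([x, y], axis=1)
    if rolling_window != -1:
        result = result.iloc[rolling_window - 1:, :]
    result['slope'] = slope
    result['const'] = const
    result['f(x)'] = result.iloc[:, 0] * slope + const
    result['error'] = result['f(x)'] - result.iloc[:, 1]
    return result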
def dd_plot(self):
    data = ps.merge(self.temp, self.degd, left_index=True, right_index=True,
                    how='inner')
    hdd = data[ps.notnull(data['HDD'])]
    cdd = data[ps.notnull(data['CDD'])]
    hday = hdd[hdd['temp'] < 60]
    cday = cdd[cdd['temp'] > 55]
    model = ps.ols(x=hday['temp'], y=hday['HDD'])
    mH, cH = model.beta.x, model.beta.intercept
    print(model)
    model = ps.ols(x=cday['temp'], y=cday['CDD'])
    mC, cC = model.beta.x, model.beta.intercept
    print(model)
    xH = hday['temp']
    xC = cday['temp']
    vs_temp = ps.DataFrame({
        'temp': hdd['temp'],
        'hdd scatter': hdd['HDD']
    }).merge(ps.DataFrame({
        'temp': cdd['temp'],
        'cdd scatter': cdd['CDD']
    }), on='temp', how='outer', suffixes=('', '')).merge(ps.DataFrame({
        'temp': xH,
        'hdd regression': line(xH, mH, cH)
    }), on='temp', how='outer', suffixes=('', '')).merge(ps.DataFrame({
        'temp': xC,
        'cdd regression': line(xC, mC, cC)
    }), on='temp', how='outer', suffixes=('', ''))
    data = data.fillna(0)
    vs_time = ps.DataFrame({
        'time': data.index,
        'HDD': data['HDD'],
        'CDD': data['CDD']
    })
    return vs_temp.set_index('temp'), vs_time.set_index('time')
def movingLR(fr_xzdf, fr_xydf, fr_xdf, window_period):
    ts_xz = fr_xzdf.reset_index()
    ts_xy = fr_xydf.reset_index()
    ts_x = fr_xdf.reset_index()
    tdelta = ts_xz['Time'] - ts_xz['Time'][0]
    tdelta = tdelta.astype('timedelta64[s]') / (60 * 60 * 24.)  # in days
    for col in fr_xzdf.columns:
        print(col)
        # if col != 7: continue
        ts_xz[col].plot()
        model = pd.ols(y=ts_xz[col], x=tdelta, window_type='rolling',
                       window=window_period, intercept=True)
        ts_xz[col] = np.round(model.beta.x, 3)
        ts_xz[col].plot()
        model = pd.ols(y=ts_xy[col], x=tdelta, window_type='rolling',
                       window=window_period, intercept=True)
        ts_xy[col] = np.round(model.beta.x, 3)
        model = pd.ols(y=ts_x[col], x=tdelta, window_type='rolling',
                       window=window_period, intercept=True)
        ts_x[col] = np.round(model.beta.x, 3)
    plt.show()
    ts_xz.index = ts_xz['Time']
    ts_xy.index = ts_xz['Time']
    ts_x.index = ts_xz['Time']
def _run_regressions(self):
    self.model_capm = pd.ols(y=self.est_data['RETX'],
                             x=self.est_data[['mkt']])
    self.model_ff3f = pd.ols(y=self.est_data['RETX'],
                             x=self.est_data[['mkt', 'smb', 'hml']])
    self.model_ff4f = pd.ols(y=self.est_data['RETX'],
                             x=self.est_data[['mkt', 'smb', 'hml', 'umd']])
    self._has_models = True
def half_life(y, x):
    """Calculate the mean-reversion half-life for a pair."""
    model = pd.ols(y=y, x=x, intercept=False)
    res = model.resid
    dres = res.shift(1)[1:] - res[1:]
    resmodel = pd.ols(y=dres, x=res, intercept=False)
    half = -np.log(2) / resmodel.beta[0]
    return half
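# A modern equivalent of half_life, assuming statsmodels in place of the
# removed pd.ols; the original's sign convention (lagged minus current
# residual) is preserved, and the index alignment that pd.ols did implicitly
# is done explicitly with .loc.
import numpy as np
import statsmodels.api as sm


def half_life_sm(y, x):
    res = sm.OLS(y, x, missing='drop').fit().resid
    dres = res.shift(1)[1:] - res[1:]
    resmodel = sm.OLS(dres, res.loc[dres.index]).fit()
    return -np.log(2) / resmodel.params.iloc[0]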
def degree_day_regression(df, x_opt='both'):
    '''
    Function that runs the weather normalization regression on energy use data

    df: dataframe that includes
        use per day (upd)
        heating degree days per day (hddpd)
        cooling degree days per day (cddpd)
    x_opt: options for the regression function
        'hdd': run regression with just heating degree days
        'cdd': run regression with just cooling degree days
        'both' (default): run regression with both heating and cooling degree days
    '''
    if x_opt == 'hdd':
        covar = {'HDD': df.hdd_per_day}
        results = pd.ols(y=df.use_per_day, x=covar)
        return pd.DataFrame([[results.beta[1], results.std_err[1],
                              results.beta[0], results.std_err[0],
                              results.r2, results.r2_adj, results.nobs]],
                            columns=['intercept', 'intercept_std_err',
                                     'HDD', 'HDD_std_err',
                                     'R2', 'R2_adj', 'N_reads'])
    elif x_opt == 'cdd':
        covar = {'CDD': df.cdd_per_day}
        results = pd.ols(y=df.use_per_day, x=covar)
        return pd.DataFrame([[results.beta[1], results.std_err[1],
                              results.beta[0], results.std_err[0],
                              results.r2, results.r2_adj, results.nobs]],
                            columns=['intercept', 'intercept_std_err',
                                     'CDD', 'CDD_std_err',
                                     'R2', 'R2_adj', 'N_reads'])
    elif x_opt == 'both':
        covar = {'CDD': df.cdd_per_day, 'HDD': df.hdd_per_day}
        results = pd.ols(y=df.use_per_day, x=covar)
        return pd.DataFrame([[results.beta[2], results.std_err[2],
                              results.beta[0], results.std_err[0],
                              results.beta[1], results.std_err[1],
                              results.r2, results.r2_adj, results.nobs]],
                            columns=['intercept', 'intercept_std_err',
                                     'CDD', 'CDD_std_err',
                                     'HDD', 'HDD_std_err',
                                     'R2', 'R2_adj', 'N_reads'])
def compute_hedge_ratios(self, **kwargs):
    rolling_beta_com = self.settings.get('rolling_beta_com')

    # Predict long term volatility move as a function of short
    fd = self.settings['fd']
    y = self.strat_data['cm_vol_fut_returns'][fd[1]]
    x = pd.DataFrame(index=self.strat_data['cm_vol_fut_returns'].index)
    x['front_return'] = self.strat_data['cm_vol_fut_returns'][fd[0]]
    x['front_level'] = self.strat_data['cm_vol_fut_prices'][fd[0]].shift(1)
    x['interaction'] = x['front_return'] * x['front_level']
    exog_vars = ['front_return', 'front_level', 'interaction']

    # OLS to start
    r1 = pd.ols(y=y, x=x)

    # Now use OLS residuals for WLS
    r2 = pd.ols(y=np.log(r1.resid ** 2), x=x[exog_vars])
    pred_sq_err = np.exp(r2.y_fitted)
    weights = 1. / pred_sq_err

    reg_data = x
    reg_data['endog'] = y
    reg_data['weights'] = weights
    reg_data = reg_data[np.isfinite(reg_data).all(axis=1)]
    reg_data = sm.add_constant(reg_data)
    r3 = sm.WLS(endog=reg_data['endog'],
                exog=reg_data[['const'] + exog_vars],
                weights=reg_data['weights']).fit()

    # Historical front/back betas
    self.calc['front_back_beta'] = (
        r3.params.front_return
        + x['front_level'] * r3.params.interaction)
    self.calc['rolling_front_back_beta'] = \
        y.ewm(com=rolling_beta_com).cov(x['front_return']
                                        .ewm(com=rolling_beta_com)) \
        / x['front_return'].ewm(com=rolling_beta_com).var()
    buff = 21
    self.calc['rolling_front_back_beta'].iloc[0:buff] \
        = self.calc['rolling_front_back_beta'].iloc[buff]
    self.calc['r1'] = r1
    self.calc['r2'] = r2
    self.calc['r3'] = r3
def regress():
    if True:
        pid = session["pid"]
        regs = []
        csvf = data[pid]
        reg = pd.DataFrame()
        outs = session['rout']
        inps = session['rinp']
        controls = session['rcont']
        count = 0
        for o in outs:
            r = []
            inputData = []
            controlData = []
            y = csvf[o]
            for i in inps:
                reg[i] = csvf[i]
            for control in controls:
                reg[control] = csvf[control]
            model = pd.ols(y=y, x=reg)
            formula = ("Measuring the impact of \"" + ', '.join(inps)
                       + "\" on \"" + o + "\" while controlling for variables"
                       " such as \"" + ', '.join(controls) + "\"")
            res = model.summary_as_matrix
            r.append(formula)
            r.append(round(model.r2_adj, 2))
            r.append(round(model.f_stat['f-stat'], 2))
            r.append(round(model.f_stat['p-value'], 5))
            r.append(model.df)
            r.append(model.nobs)
            for i in inps:
                idata = [i,
                         round(res.ix['beta'][i], 4),
                         round(res.ix['p-value'][i], 4),
                         round(res.ix['std err'][i], 2),
                         round(res.ix['t-stat'][i], 2)]
                inputData.append(idata)
            r.append(inputData)
            for c in controls:
                cdata = [c,
                         round(res.ix['beta'][c], 4),
                         round(res.ix['p-value'][c], 4),
                         round(res.ix['std err'][c], 2),
                         round(res.ix['t-stat'][c], 2)]
                controlData.append(cdata)
            r.append(controlData)
            regs.append(r)
            count += 1
        data["regs"] = regs
        return render_template("regression.html", regs=regs)
    else:
        return redirect(url_for("logout"))
def find_cointergrate_stocks(self, stockList):
    stocks_pair = {}
    price_df = history(g.adfTest_period, unit='1d', field='close',
                       security_list=stockList, df=True, skip_paused=False,
                       fq='pre')
    for i in range(len(stockList)):
        stock1 = stockList[i]
        stock1_price = price_df[stock1]
        for j in range(i + 1, len(stockList)):
            stock2 = stockList[j]
            stock2_price = price_df[stock2]
            combined_df = pd.concat([stock1_price, stock2_price], axis=1)
            print('combined_df is', combined_df)
            pair_df = combined_df.dropna()
            if len(pair_df) < 500:
                continue
            # use local names so the outer-loop prices are not clobbered
            s2_price = pair_df[stock2]
            s1_price = pair_df[stock1]
            # perform OLS on these two stocks
            model = pd.ols(y=s2_price, x=s1_price, intercept=True)
            spread = s2_price - s1_price * model.beta['x']
            spread = spread.dropna()
            spread = spread.values
            sta = sts.adfuller(spread, 1)
            if sta[1] < 0.05 and sta[0] < sta[4]['5%'] and model.beta['x'] > 0:
                stocks_pair[(stock1, stock2, model.beta[1], model.beta['x'],
                             np.std(spread), np.mean(spread))] = sta[0]
    rank = sorted(stocks_pair.items(), key=operator.itemgetter(1))
    return rank[:1]
def test_window_ols_full(ols_data):
    y, x = ols_data['y'], ols_data['x']
    res1 = _window_ols(y, x, window_type='full_sample')
    res2 = _window_ols(y, x)
    res3 = pd.ols(y=y, x=x, window_type='full_sample')
    assert_ols_equal(res1, res2)
    assert_ols_equal(res1, res3)
def PowerFit_CI(x, y, xspace=None, ax=plt, **kwargs):
    # put x and y in a dataframe so you can drop ones that don't match up
    datadf = pd.DataFrame.from_dict({'x': x, 'y': y}).dropna().apply(np.log10)
    regression = pd.ols(y=datadf['y'], x=datadf['x'])
    # Develop power function for x and y; x and y should be Series
    powfunc = powerfunction(x, y)
    a, b = powfunc['a'].values, powfunc['b'].values
    if xspace is None:
        xvals = np.linspace(0, x.max() * 1.2)
    else:
        xvals = xspace
    ypred = a * (xvals ** b)
    ax.plot(xvals, ypred, **kwargs)
    # Confidence intervals
    ci = .5
    a_cilo, a_ciup = (10 ** regression.sm_ols.conf_int(alpha=ci)[1][0],
                      10 ** regression.sm_ols.conf_int(alpha=ci)[1][1])
    b_cilo, b_ciup = (regression.sm_ols.conf_int(alpha=ci)[0][0],
                      regression.sm_ols.conf_int(alpha=ci)[0][1])
    ypred_cilo = a_cilo * (xvals ** b_cilo)
    ypred_ciup = a_ciup * (xvals ** b_ciup)
    ax.fill_between(xvals, ypred_cilo, ypred_ciup, alpha=0.5, **kwargs)
    plt.draw()
    return powfunc
def compute_exposures(factor_returns=None, target_returns=None):
    """
    :param factor_returns: pandas DataFrame indexed on date, with
        columns = factors
    :param target_returns: a Series indexed on date, or a DataFrame indexed
        on date with columns for the various time series you want
        exposures for
    :return: (coefs, t_stats, regressions)
    """
    if isinstance(target_returns, pd.Series):
        target_returns = pd.DataFrame(target_returns)
    regressions = dict()
    coefs = pd.DataFrame(index=target_returns.columns,
                         columns=factor_returns.columns)
    t_stats = pd.DataFrame(index=target_returns.columns,
                           columns=factor_returns.columns)
    for col in target_returns.columns:
        regressions[col] = pd.ols(y=target_returns[col], x=factor_returns)
        coefs.loc[col] = regressions[col].beta
        t_stats.loc[col] = regressions[col].t_stat
    return coefs, t_stats, regressions
def calculate_signals(self, event):
    """
    generate pair trading signal
    :param event:
    :return:
    """
    if event.type == 'MARKET' \
            and self.bars.get_current_bar_total_number(self.y_symbol) > self.look_back \
            and self.bars.get_current_bar_total_number(self.x_symbol) > self.look_back:
        y_bars = self.bars.get_latest_bars_values(
            self.y_symbol, "adj_close", N=self.look_back)
        x_bars = self.bars.get_latest_bars_values(
            self.x_symbol, "adj_close", N=self.look_back)
        # Use the pandas Ordinary Least Squares method to fit a rolling
        # linear regression between the two closing price time series
        model = pd.ols(y=y_bars, x=x_bars)
        # Construct the hedge ratio and eliminate the first
        # lookback-length empty/NaN period
        hedge_ratio = model.beta['x']
        # Create the spread and then a z-score of the spread
        spread = y_bars - hedge_ratio * x_bars
        zscore = (spread - np.mean(spread)) / np.std(spread)
        print(zscore)
def ar_periodogram(x, window='hanning', window_len=7):
    """
    Compute periodogram from data x, using prewhitening, smoothing and
    recoloring. The data is fitted to an AR(1) model for prewhitening,
    and the residuals are used to compute a first-pass periodogram with
    smoothing. The fitted coefficients are then used for recoloring.

    Parameters:
        * x is a NumPy array containing time series data
        * window is a string indicating window type
        * window_len is an odd integer

    See the periodogram function documentation for more details on the
    window arguments.
    """
    # === run regression === #
    x_current, x_lagged = x[1:], x[:-1]  # x_t and x_{t-1}
    x_current, x_lagged = Series(x_current), Series(x_lagged)  # pandas series
    results = ols(y=x_current, x=x_lagged, intercept=True, nw_lags=1)
    e_hat = results.resid.values
    phi = results.beta['x']
    # === compute periodogram on residuals === #
    w, I_w = periodogram(e_hat, window=window, window_len=window_len)
    # === recolor and return === #
    I_w = I_w / np.abs(1 - phi * np.exp(1j * w))**2
    return w, I_w
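# A sketch of the same AR(1) prewhitening fit without pd.ols, assuming
# statsmodels: nw_lags=1 maps to a HAC (Newey-West) covariance with
# maxlags=1, which changes standard errors but not the fitted phi itself.
import statsmodels.api as sm


def fit_ar1(x):
    """Return residuals and the AR(1) coefficient phi of a NumPy series x."""
    x_current, x_lagged = x[1:], x[:-1]
    results = sm.OLS(x_current, sm.add_constant(x_lagged)).fit(
        cov_type='HAC', cov_kwds={'maxlags': 1})
    return results.resid, results.params[1]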
def model(self, dd, city_code, business_class, per_meter):
    '''
    Regress usage data to heating and cooling degree days

    :param dd: heating and cooling degree days
    :param city_code: 'PRINCETON TWP' or 'PRINCETON BORO'
    :param business_class: a single business_class or a collection thereof -
        if a collection is given, the values are summed up
    :param per_meter: True for consumption per meter, False for aggregate
    :return: series of first differences
    '''
    data = ps.DataFrame(self.select(city_code, business_class, per_meter))
    data['year'] = data.index.map(lambda d: d.year)
    data = data[data['year'] > 2009]['usage']
    #
    # index degree days frame for correspondence to usage data
    #
    dd = dd.ix[data.index].fillna(0.0)
    #
    # regress to heating and cooling degree days
    #
    model = ps.ols(x=dd, y=data)
    print(city_code, business_class, per_meter)
    print(model)
    print('-' * 60)
    return model.beta.intercept, model.beta.HDD, model.beta.CDD
def ts_regrFn(df, dep, indep, min_periods, max_periods):
    if not max_periods:
        max_periods = len(df[dep])
    indx = df.index
    names = indx.names
    cols = [col + '_beta' for col in indep] + ['intercept']
    df = df.reset_index([0])
    X = df[indep + [dep]].dropna(how='any')
    if min(X.count()) >= min_periods:
        model = pd.ols(y=df[dep], x=df[indep], window_type='rolling',
                       window=max_periods, min_periods=min_periods)
        X = model.beta
        X = pd.merge(df, X, left_index=True, right_index=True, how='outer',
                     suffixes=['', '_beta'])
        X = X.reset_index()
        X = X.set_index(names)
        return X[cols]
    else:
        return DataFrame(nan, index=indx, columns=cols)
def regress_by_year(isi, in_sample, osi, out_sample):
    """
    Per-year significance regression: for each year, keep the regressors
    with p-value < .01, then fit and score a random forest on those columns.
    Note: `ys` is assumed to come from the enclosing scope.
    """
    d = {}
    for year in in_sample['yearID'].unique():
        no_yr = in_sample.columns.drop('yearID')
        d_too = {}
        is_yr = in_sample['yearID'] == year
        os_yr = out_sample['yearID'] == year
        ols = pandas.ols(x=in_sample.loc[is_yr, no_yr], y=ys[isi][is_yr])
        df = ols.summary_as_matrix
        is_sig = df.loc['p-value', df.loc['p-value', :] < .01].index
        if 'intercept' in is_sig:
            is_sig = is_sig.drop('intercept')
        clf = ensemble.RandomForestRegressor(n_estimators=15)
        clf.fit(in_sample.loc[is_yr, is_sig], ys[isi][is_yr])
        is_score = clf.score(in_sample.loc[is_yr, is_sig], ys[isi][is_yr])
        d_too['is-r2'] = is_score
        os_score = clf.score(out_sample.loc[os_yr, is_sig], ys[osi][os_yr])
        d_too['os-r2'] = os_score
        eps = ys[osi][os_yr].sub(clf.predict(out_sample.loc[os_yr, is_sig]))
        d_too['mae'] = eps.abs().sum() / (len(ys[osi][os_yr]) - 2.)
        d[year] = pandas.Series(d_too)
    return pandas.DataFrame(d).transpose()
def get_alpha_beta(self, bm_rets):
    if isinstance(bm_rets, pd.Series):
        bm = CumulativeRets(bm_rets)
    elif isinstance(bm_rets, CumulativeRets):
        bm = bm_rets
    else:
        raise ValueError('bm_rets must be a Series or CumulativeRets, '
                         'not %s' % (type(bm_rets)))
    bm_freq = guess_freq(bm_rets)
    if self.pds_per_year != bm.pds_per_year:
        tgt = {'B': 'dly', 'W': 'weekly', 'M': 'monthly', 'Q': 'quarterly',
               'A': 'annual'}.get(bm_freq, None)
        if tgt is None:
            raise ValueError('No mapping for handling benchmark with '
                             'frequency: %s' % bm_freq)
        tmp = getattr(self, tgt)
        y = tmp.rets
        y_ann = tmp.ltd_ann
    else:
        y = self.rets
        y_ann = self.ltd_ann
    x = bm.rets.truncate(y.index[0], y.index[-1])
    x_ann = bm.ltd_ann
    model = pd.ols(x=x, y=y)
    beta = model.beta[0]
    alpha = y_ann - beta * x_ann
    return pd.Series({'alpha': alpha, 'beta': beta}, name=bm_freq)
def mv_regression(xs, ys, in_sample_size):
    """
    Test a multi-variate regression creating the coefficients in sample
    and then using those coefficients to test the regression out of sample

    Args:
    -----
    - xs: `pandas.DataFrame` of the xs
    - ys: `pandas.Series` of the variable we're attempting to predict
    - in_sample_size: integer of the size of the `in sample` we want
      to use to train our regression

    Returns:
    --------
    float of the MSE or Mean Squared Error
    """
    isi, in_sample, osi, out_sample = create_in_out_samples(xs, in_sample_size)

    # run the regression and predict the new values
    ols = pandas.ols(x=in_sample, y=ys[isi])
    betas = ols.beta
    intercept = betas['intercept']
    betas = betas[betas.index != 'intercept']

    # make our prediction on out of sample
    pred = out_sample.dot(betas) + intercept
    eps = (pred - ys[osi]).apply(numpy.abs)
    mse = eps.sum() / (eps.shape[0] - 2)
    return mse
def fit(x, y, funcstr, *args, **kwargs):
    x = pandas.Series(array(x))
    y = pandas.Series(array(y))
    x, y = remove_nan(x, y)
    if funcstr == 'linear':
        result = fit(x, y, 'power', 1)
        result.type = 'linear'
    elif funcstr == 'quadratic':
        result = fit(x, y, 'power', 2)
        result.type = 'quadratic'
    elif funcstr == 'exponential':
        y2 = np.log(y)
        result = fit(x, y2, 'linear')
        result.params = [np.exp(result.params[1]), result.params[0]]
        p = result.params
        labelstr = 'y= %.4e exp(%.4e x)' % (p[0], p[1])
        result.label = labelstr
        result.type = 'exponential'
    elif funcstr == 'power':
        data = pandas.DataFrame({'x': x, 'y': y})
        power = args[0]
        keys = ['x']
        for i in range(power - 1):
            exponent = (i + 2)
            key = 'x%d' % exponent
            data[key] = x ** exponent
            keys.append(key)
        result2 = pandas.ols(y=data['y'], x=data[keys])
        keys.reverse()
        keys += ['intercept']
        p = [result2.beta[s] for s in keys]
        labelstr = 'y= '
        for i, pv in enumerate(p):
            pw = len(p) - i - 1
            if pw == 1:
                labelstr += '%.4e x + ' % (pv)
            elif pw == 0:
                labelstr += '%.4e + ' % (pv)
            else:
                labelstr += '%.4e x^%d + ' % (pv, pw)
        labelstr = labelstr[:-3]  # take off the last +
        result = Struct()
        result.params = p
        result.type = 'power'
        result.label = labelstr
        result.pandas_result = result2
    else:
        raise ValueError('Unknown fit name %s' % funcstr)
    return result
def pandas_rolling_ols(single_id_dataframe, date_column="AdjDate"):
    """
    Perform rolling ols and return the columns of date-based coefficients,
    t-stats, idiosyncratic vol, etc.
    """
    df = (
        single_id_dataframe
        .sort(date_column, ascending=True)
        .set_index(date_column)
    )
    try:
        ols_result = pandas.ols(
            y=df["TotalReturnMonthly"] - df["RiskFreeRate"],
            x=df["ExcessMarket"],
            window=60,
            min_periods=12,
            intercept=True
        )
        beta = ols_result.beta['x']
        beta.name = "Beta"
        beta_tstat = ols_result.t_stat['x']
        beta_tstat.name = "Beta_tstat"
        df = df.join(beta).join(beta_tstat)
    except Exception:
        df["Beta"] = np.NaN
        df["Beta_tstat"] = np.NaN
    return df
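# A sketch of the same rolling CAPM beta using statsmodels' RollingOLS
# (available since statsmodels 0.11), assuming the column names used above;
# expanding=True with min_nobs approximates the min_periods behaviour of
# pd.ols, and the helper name is hypothetical.
import statsmodels.api as sm
from statsmodels.regression.rolling import RollingOLS


def rolling_capm_beta(df):
    endog = df["TotalReturnMonthly"] - df["RiskFreeRate"]
    exog = sm.add_constant(df["ExcessMarket"])
    fit = RollingOLS(endog, exog, window=60, min_nobs=12,
                     expanding=True).fit()
    out = df.copy()
    out["Beta"] = fit.params["ExcessMarket"]
    out["Beta_tstat"] = fit.tvalues["ExcessMarket"]
    return out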
def calculate_spread_zscore(pairs, symbols, lookback=100):
    """Creates a hedge ratio between the two symbols by calculating
    a rolling linear regression with a defined lookback period. This
    is then used to create a z-score of the 'spread' between the two
    symbols based on a linear combination of the two."""
    # Use the pandas Ordinary Least Squares method to fit a rolling
    # linear regression between the two closing price time series
    #print "Fitting the rolling Linear Regression..."
    model = pd.ols(y=pairs['%s_close' % symbols[0].lower()],
                   x=pairs['%s_close' % symbols[1].lower()],
                   window=lookback)

    # Construct the hedge ratio and eliminate the first
    # lookback-length empty/NaN period
    pairs['hedge_ratio'] = model.beta['x']
    #pairs = pairs.dropna()

    # Create the spread and then a z-score of the spread
    #print "Creating the spread/zscore columns..."
    pairs['spread'] = (pairs['%s_close' % symbols[0].lower()]
                       - pairs['hedge_ratio'] * pairs['%s_close' % symbols[1].lower()])
    pairs['zscore'] = (pairs['spread'] - np.mean(pairs['spread'])) / np.std(pairs['spread'])
    return pairs
def calculate_spread_zscore(pairs, symbols, lookback=100):
    """Creates a hedge ratio between the two symbols by calculating
    a rolling linear regression with a defined lookback period. This
    is then used to create a z-score of the 'spread' between the two
    symbols based on a linear combination of the two."""
    # Use the pandas Ordinary Least Squares method to fit a rolling
    # linear regression between the two closing price time series
    print("Fitting the rolling Linear Regression...")
    model = pd.ols(y=pairs['%s_close' % symbols[0].lower()],
                   x=pairs['%s_close' % symbols[1].lower()],
                   window=lookback)

    # Construct the hedge ratio and eliminate the first
    # lookback-length empty/NaN period
    pairs['hedge_ratio'] = model.beta['x']
    pairs = pairs.dropna()

    # Create the spread and then a z-score of the spread
    print("Creating the spread/zscore columns...")
    pairs['spread'] = pairs['spy_close'] - pairs['hedge_ratio'] * pairs['iwm_close']
    # ********** this is biased! **********
    pairs['zscore'] = (pairs['spread'] - np.mean(pairs['spread'])) / np.std(
        pairs['spread'])
    return pairs
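# A minimal modern sketch of the rolling hedge ratio without pd.ols: the
# rolling OLS slope (with intercept) equals rolling cov(y, x) / var(x), so
# the plain pandas rolling API is enough. The '<symbol>_close' column
# convention follows the functions above; the helper name is hypothetical.
def rolling_hedge_ratio_zscore(pairs, symbols, lookback=100):
    y = pairs['%s_close' % symbols[0].lower()]
    x = pairs['%s_close' % symbols[1].lower()]
    pairs['hedge_ratio'] = y.rolling(lookback).cov(x) / x.rolling(lookback).var()
    pairs['spread'] = y - pairs['hedge_ratio'] * x
    pairs['zscore'] = ((pairs['spread'] - pairs['spread'].mean())
                       / pairs['spread'].std())
    return pairs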
def plot(request, c="population density"):
    indicator = VARIABLES_DICT[c]
    filename = join(settings.STATIC_ROOT, 'myapp/merged.csv')
    df = pd.read_csv(filename)
    plt.figure()  # needed, to avoid adding curves in plot
    lm = pd.ols(x=df[indicator], y=df['life expectancy'])
    plt.plot(df[indicator], df["life expectancy"], 'ro', color="blue")
    plt.plot(df[indicator], lm.y_fitted, 'r', linewidth=2)
    plt.tight_layout()
    plt.ylabel('life expectancy')
    plt.xlabel(indicator)
    plt.title('Regression between life expectancy and ' + indicator,
              fontsize=15)
    # write bytes instead of file.
    from io import BytesIO
    figfile = BytesIO()
    # this is where the color is used.
    try:
        plt.savefig(figfile, format='png')
    except ValueError:
        raise Http404("No such color")
    figfile.seek(0)  # rewind to beginning of file
    return HttpResponse(figfile.read(), content_type="image/png")
def compute_hedge_ratios(self, **kwargs):
    fd = self.settings['fd']
    rolling_beta_com = kwargs.get('rolling_beta_com', 21)

    # Trailing betas
    y = self.strat_data['cm_vol_fut_prices'][fd].diff(1)

    # Predict volatility move as a function of stuff
    x = pd.DataFrame(index=self.strat_data['index_fut_returns'].index)
    x['index_return'] = self.strat_data['index_fut_returns'][
        self.settings['index_fut_ticker']]
    x['vol_level'] = self.strat_data['cm_vol_fut_prices'][fd].shift(1)
    x['interaction'] = x['index_return'] * x['vol_level']
    exog_vars = ['index_return', 'vol_level', 'interaction']

    r1 = pd.ols(y=y, x=x)
    r2 = pd.ols(y=np.log(r1.resid ** 2), x=x[exog_vars])
    pred_sq_err = np.exp(r2.y_fitted)
    weights = 1. / pred_sq_err

    reg_data = x
    reg_data['endog'] = y
    reg_data['weights'] = weights
    reg_data = reg_data[np.isfinite(reg_data).all(axis=1)]
    reg_data = sm.add_constant(reg_data)
    r3 = sm.WLS(endog=reg_data['endog'],
                exog=reg_data[['const'] + exog_vars],
                weights=reg_data['weights']).fit()

    vol_level_grid = np.arange(10, 51)
    beta_df = pd.DataFrame(index=vol_level_grid, columns=['beta'])
    for i in range(0, len(vol_level_grid)):
        beta_df.loc[vol_level_grid[i], 'beta'] \
            = (1.0 / 100.0) * (r3.params.index_return
                               + vol_level_grid[i] * r3.params.interaction)

    # Historical spot-vol betas (vol points per 1%)
    self.calc['spot_vol_beta'] = (r3.params.index_return
                                  + x['vol_level'] * r3.params.interaction) / 100.0
    self.calc['rolling_spot_vol_beta'] = \
        y.ewm(com=rolling_beta_com).cov(x['index_return']
                                        .ewm(com=rolling_beta_com)) \
        / x['index_return'].ewm(com=rolling_beta_com).var()
def sig_veh_buy(data):
    """
    Extract the MMR columns that are valuable when compared to the
    VehBCost (i.e. what someone paid at the auction)

    ARGS:
        data: :class:`pandas.DataFrame` of the lemon training data

    RETURNS:
        :class:`pandas.DataFrame` of the significant MMR pairings divided
        by the 'VehBCost' or Vehicle Buy Cost
    """
    cols = ['MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice',
            'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice',
            'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice',
            'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice']
    to_use = {}
    # go through and construct a rough cut based on .95 p-value
    for col in cols:
        xs = data['VehBCost'].div(data[col])
        is_inf = numpy.isinf(xs)
        xs[is_inf] = numpy.nan
        ols = pandas.ols(x=xs, y=data['IsBadBuy'])
        if ols.p_value['x'] < .05:
            to_use[col] = xs

    is_sig = 1e-3
    not_parsimonious = True
    while not_parsimonious:
        # now trim down to the most parsimonious model
        buy_df = pandas.DataFrame(to_use)
        ols = pandas.ols(x=buy_df, y=data['IsBadBuy'])
        if any(ols.p_value > is_sig):
            for val in ols.p_value[ols.p_value > is_sig].index:
                try:
                    to_use.pop(val)
                except KeyError:
                    print("Intercept not significant")
        else:
            not_parsimonious = False
    return buy_df
def geneticAlgo(mutation_rate, pop_size):
    baseball = pd.read_table(
        "/Users/Ahmet/Box Sync/Classes/Vanderbilt/AdvancedStatisticalComputing/Bios8366/data/textbook/baseball.dat",
        sep=r'\s+')
    predictors = baseball.copy()
    logsalary = predictors.pop('salary').apply(np.log)
    nrows, ncols = predictors.shape
    iterations = 100
    aic_best = []
    best_solution = []
    aic_history = []

    # Initialize genotype
    current_gen = np.random.binomial(1, 0.5, pop_size * ncols).reshape(
        (pop_size, ncols))

    for i in range(iterations):
        # Get phenotype
        current_phe = [predictors[predictors.columns[g.astype(bool)]]
                       for g in current_gen]
        # Calculate AIC
        current_aic = np.array(
            [aic(pd.ols(y=logsalary, x=x)) for x in current_phe])
        # Get lowest AIC
        aic_best.append(current_aic[np.argmin(current_aic)])
        best_solution.append(current_gen[np.argmin(current_aic)])
        # Calculate fitness according to AIC rank
        fitness = calculate_fitness(current_aic)
        # Choose first parents according to fitness
        moms = np.random.choice(range(pop_size), size=int(pop_size / 2),
                                p=fitness)
        # Choose second parents randomly
        dads = np.random.choice(range(pop_size), size=int(pop_size / 2))
        next_gen = []
        for x, y in zip(current_gen[moms], current_gen[dads]):
            # Crossover
            cross = np.random.randint(0, ncols)
            child1 = np.r_[x[:cross], y[cross:]]
            child2 = np.r_[y[:cross], x[cross:]]
            # Mutate; both masks must be boolean (the original used a 0/1
            # integer mask for m2 and flipped child2 using child1's genes)
            m1 = np.random.binomial(1, mutation_rate, size=ncols).astype(bool)
            child1[m1] = abs(child1[m1] - 1)
            m2 = np.random.binomial(1, mutation_rate, size=ncols).astype(bool)
            child2[m2] = abs(child2[m2] - 1)
            next_gen += [child1, child2]
        # Increment generation
        current_gen = np.array(next_gen)
        # Store AIC values
        aic_history.append(current_aic)
    return aic_best
def annualBeta():
    '''
    Exercise. This function was in the handouts and looked very interesting,
    so I implemented it to see the inner workings. Its purpose is to
    calculate the annual beta of a stock.
    :return:
    '''
    def ret_f(ticker, startDate, endDate):
        p = finance.quotes_historical_yahoo(ticker, startDate, endDate,
                                            asobject=True, adjusted=True)
        return (p.aclose[1:] - p.aclose[0:-1]) / p.aclose[:-1]

    startDate = (1990, 1, 1)
    endDate = (2014, 12, 31)

    # Pandas Series for Oracle's data
    y0 = pd.Series(ret_f('ORCL', startDate, endDate))
    # Pandas Series for S&P500 data
    x0 = pd.Series(ret_f('^GSPC', startDate, endDate))

    # Historical date values of S&P500
    dateVal = finance.quotes_historical_yahoo('^GSPC', startDate, endDate,
                                              asobject=True,
                                              adjusted=True).date[0:-1]
    lag_year = dateVal[0].strftime("%Y")
    y1, x1, beta, index0 = [], [], [], []

    # Calculate beta for each year
    for i in range(1, len(dateVal)):
        year = dateVal[i].strftime("%Y")
        if year == lag_year:
            x1.append(x0[i])
            y1.append(y0[i])
        else:
            model = pd.ols(y=pd.Series(y1), x=pd.Series(x1))
            print(lag_year, round(model.beta[0], 4))
            beta.append(model.beta[0])
            index0.append(lag_year)
            x1 = []
            y1 = []
            lag_year = year

    # Plot the main graph
    plt.plot(beta, c='firebrick', label='ORCL Beta w.r.t S&P500')
    plt.hlines(y=1, xmin=0, xmax=25, label='Perfect Correlation', lw=2,
               color='steelblue')
    plt.legend()
    plt.show()
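# A compact sketch of the same per-year beta without the manual year loop,
# assuming a DataFrame `rets` with 'stock' and 'market' return columns on a
# DatetimeIndex (names are assumptions); np.polyfit(x, y, 1)[0] is the OLS
# slope, i.e. the beta.
import numpy as np
import pandas as pd


def annual_betas(rets):
    return rets.groupby(rets.index.year).apply(
        lambda g: np.polyfit(g['market'], g['stock'], 1)[0])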
def linearfunction(x, y, name='linear rating'):
    # put x and y in a dataframe so you can drop ones that don't match up
    datadf = pd.DataFrame.from_dict({'x': x, 'y': y}).dropna()
    datadf = datadf[datadf >= 0].dropna()  # verify data is valid (not inf)
    regression = pd.ols(y=datadf['y'], x=datadf['x'])
    pearson = pearson_r(datadf['x'], datadf['y'])[0]
    spearman = spearman_r(datadf['x'], datadf['y'])[0]
    coeffdf = pd.DataFrame({'a': [regression.beta[1]],
                            'b': [regression.beta[0]],
                            'r2': [regression.r2],
                            'rmse': [regression.rmse],
                            'pearson': [pearson],
                            'spearman': [spearman]},
                           index=[name])
    return coeffdf
def regress(df, index=pd.Index((u'slope', u'intercept'))):
    xcol = u'seed_cell_number_ml'
    ycol = u'signal'
    subdf = df.sort(columns=[xcol], axis=0)
    # ols = "ordinary least squares"
    ret = pd.ols(x=subdf[xcol][2:], y=subdf[ycol][2:]).beta
    ret.index = index
    return ret
def test_solve_rect(self):
    if not _have_statsmodels:
        raise nose.SkipTest("no statsmodels")
    b = Series(np.random.randn(N), self.frame.index)
    result = pmath.solve(self.frame, b)
    expected = ols(y=b, x=self.frame, intercept=False).beta
    self.assert_(np.allclose(result, expected))
def find_trend(series):
    import numpy as np
    ln = len(series)
    x = pd.Series(np.arange(ln))
    regression = pd.ols(y=series, x=x)
    return regression.beta[0]
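# An equivalent of find_trend without pd.ols, assuming scipy is available;
# linregress returns the slope (the trend per step) directly.
import numpy as np
from scipy import stats


def find_trend_scipy(series):
    slope, intercept, r, p, stderr = stats.linregress(
        np.arange(len(series)), series)
    return slope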
def calc_port_beta(port, mkt, window=20, min_periods=15):
    '''
    Given a Series of portfolio returns and a Series of market returns,
    compute the beta of the portfolio against the market, using the
    windowed approach.
    '''
    model = pd.ols(y=port, x=mkt, window_type='rolling', window=window,
                   min_periods=min_periods, intercept=True)
    return model.beta.intercept, model.beta.x
def sig_MMR(data):
    """
    Extract the MMR columns that are valuable based on the multivariate
    regression run by statsmodels

    ARGS:
        data: :class:`pandas.DataFrame` of the lemon training data

    RETURNS:
        :class:`pandas.DataFrame` of the significant MMR pairings
    """
    cols = ['MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice',
            'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice',
            'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice',
            'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice']
    to_use = {}
    pairs = itertools.combinations(range(len(cols)), 2)
    # go through and construct a rough cut based on .95 p-value
    for x, y in pairs:
        xs = data[cols[x]].div(data[cols[y]])
        is_inf = numpy.isinf(xs)
        xs[is_inf] = numpy.nan
        ols = pandas.ols(x=xs, y=data['IsBadBuy'])
        if ols.p_value['x'] < .05:
            to_use[str(x) + ',' + str(y)] = xs

    is_sig = 1e-3
    not_parsimonious = True
    while not_parsimonious:
        # now trim down to the most parsimonious model
        mmr_df = pandas.DataFrame(to_use)
        ols = pandas.ols(x=mmr_df, y=data['IsBadBuy'])
        if any(ols.p_value > is_sig):
            for val in ols.p_value[ols.p_value > is_sig].index:
                try:
                    to_use.pop(val)
                except KeyError:
                    print("Intercept not significant")
        else:
            not_parsimonious = False
    return mmr_df
def reg(df, index=pd.Index(('coeff', 'intercept'))):
    xcol = 'seed_cell_number_ml'
    ycol = 'signal'
    sdf = df.sort(columns=[xcol], axis=0)
    ls = pd.ols(x=sdf[xcol][2:], y=sdf[ycol][2:])
    ret = ls.beta
    ret.index = index
    return ret
def test_r2_adj(man_calcs, prices):
    log_rets = analyze.log_returns(prices).dropna()
    pandas_rsq = pandas.ols(x=log_rets['S&P 500'],
                            y=log_rets['VGTSX']).r2_adj
    analyze_rsq = analyze.r2_adj(benchmark=log_rets['S&P 500'],
                                 series=log_rets['VGTSX'])
    testing.assert_almost_equal(pandas_rsq, analyze_rsq)
def test_solve_rect(self):
    if not _have_statsmodels:
        raise nose.SkipTest("no statsmodels")
    b = Series(np.random.randn(N), self.frame.index)
    result = pmath.solve(self.frame, b)
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        expected = ols(y=b, x=self.frame, intercept=False).beta
    self.assertTrue(np.allclose(result, expected))
def get_rolling_beta(my_y, my_x, my_window=252):
    """
    given x, y, runs ols regression on a rolling window, outputs the beta
    coefficient series

    y -- your portfolio or asset
    x -- SPX
    window -- # days in the rolling period
    """
    model = pd.ols(y=my_y, x=my_x, window=my_window)
    return model.beta.x
def regression_line():
    '''Fit a line, using the powerful "ordinary least square" method of pandas'''
    # Get the data
    data = getData('altman_11_6.txt')
    df = pd.DataFrame(data, columns=['glucose', 'Vcf'])
    model = pd.ols(y=df['Vcf'], x=df['glucose'])
    print(model.summary)
def cv_ols(y, x, k=10):
    kfold = cross_validation.KFold(len(y), k)
    rmse = []
    for trIdx, teIdx in kfold:
        result = pd.ols(y=y[trIdx], x=x.loc[trIdx])
        predictions = result.predict(result.beta, x.loc[teIdx])
        error = sqrt(mean_squared_error(y[teIdx], predictions))
        rmse.append(error)
    # rmse is a plain list, so list.mean() would raise AttributeError
    return np.mean(rmse)
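# A sketch of the same cross-validated OLS with current APIs, assuming
# sklearn.model_selection and statsmodels and that y is a Series and x a
# DataFrame; it returns the mean of the per-fold RMSEs like the fixed
# version above.
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold


def cv_ols_sm(y, x, k=10):
    rmse = []
    for tr_idx, te_idx in KFold(n_splits=k).split(x):
        result = sm.OLS(y.iloc[tr_idx], sm.add_constant(x.iloc[tr_idx])).fit()
        predictions = result.predict(sm.add_constant(x.iloc[te_idx]))
        rmse.append(np.sqrt(mean_squared_error(y.iloc[te_idx], predictions)))
    return np.mean(rmse)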
def plots(self, temp, city_code, business_class, per_meter):
    '''
    Regress measured usage to temperatures in cold and hot months and
    generate x- and y-vectors for plotting this data set

    :param temp: temperature time series to regress to
    :param city_code: 'PRINCETON TWP' or 'PRINCETON BORO'
    :param business_class: a single business_class or a collection thereof -
        if a collection is given, the values are summed up
    :param per_meter: True for consumption per meter, False for aggregate
    :return: frame containing scatterplot and regression lines to temperature
    '''
    data = self.select(city_code, business_class, per_meter)
    #
    # split monthly average temperature series into two:
    #   cool => samples when temps are below basetemp
    #   warm => samples when temps are at or above basetemp
    # use the full time period over which any usage data exists
    #
    temp = temp.ix[self.usage.index]['temp']
    cool = temp[temp < self.basetemp]
    warm = temp[temp >= self.basetemp]
    #
    # regress separately for cool and warm months
    #
    model = ps.ols(x=cool, y=data[cool.index])
    m_cool, c_cool = model.beta.x, model.beta.intercept
    model = ps.ols(x=warm, y=data[warm.index])
    m_warm, c_warm = model.beta.x, model.beta.intercept
    vs_temp = ps.DataFrame(
        {'temp': temp, 'scatter': data[temp.index]}
    ).merge(
        ps.DataFrame(
            {'temp': cool, 'heating regression': line(cool, m_cool, c_cool)}),
        on='temp', how='outer', suffixes=('', '')
    ).merge(
        ps.DataFrame(
            {'temp': warm, 'cooling regression': line(warm, m_warm, c_warm)}),
        on='temp', how='outer', suffixes=('', ''))
    vs_time = data
    return vs_temp.set_index('temp'), vs_time
def equations(self):
    eqs = {}
    for col, ts in self.y.iteritems():
        model = pn.ols(y=ts, x=self.x, window=self._window,
                       window_type=self._window_type,
                       min_periods=self._min_periods)
        eqs[col] = model
    return eqs
def cointegration_test(self):
    """Cointegration check via an ADF test on the regression spread."""
    if len(self.contract1_Close) != 0 and len(self.contract2_Close) != 0:
        model = pd.ols(y=self.contract1_Close, x=self.contract2_Close,
                       intercept=True)
        # derive the spread series
        spread = self.contract1_Close - self.contract2_Close * model.beta["x"]
        spread = spread.dropna()  # drop missing data
        sta = sts.adfuller(spread, 1)  # run the ADF test
        result = sta[0]
        print(sta)
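# A sketch of the same check with statsmodels' Engle-Granger test, which
# bundles the hedge regression and the ADF test on the residual spread; the
# helper name and the 5% threshold mirror the function above and are
# assumptions.
from statsmodels.tsa.stattools import coint


def cointegration_test_sm(close1, close2):
    t_stat, p_value, crit_values = coint(close1, close2)
    print(t_stat, p_value, crit_values)
    return p_value < 0.05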
def year_based_significance_regression(file_path):
    """
    Run a year-based multivariate regression that uses only the significant
    variables as well as Random Forest Regression Trees to estimate the
    parameters

    Args:
    ------
    - file_path: string of the location of `baseball.csv`

    Returns:
    ---------
    - pandas.DataFrame of the in-sample and out-of-sample salary estimates
    """
    data = pandas.DataFrame.from_csv(file_path, index_col=None)
    data['age'] = data['yearID'] - data['birthYear']
    cols = ['G_batting', 'AB', 'R', 'H', 'X2B', 'X3B', 'HR', 'RBI', 'SB',
            'CS', 'BB', 'SO', 'IBB', 'HBP', 'SH', 'SF', 'GIDP', 'teamID',
            'salary', 'yearID', 'age']
    all_data = data[cols].copy()
    all_data.dropna(inplace=True)
    teams = pandas.get_dummies(all_data['teamID'])
    x_cols = all_data.columns[[c not in ['teamID', 'salary']
                               for c in all_data.columns]]
    xs = all_data[x_cols].join(teams)
    ys = all_data['salary']
    N = xs.shape[0]
    isi, in_sample, osi, out_sample = create_in_out_samples(xs, N // 2)
    d = {}
    for year in all_data['yearID'].unique():
        no_yr = in_sample.columns.drop('yearID')
        d_too = {}
        is_yr = in_sample['yearID'] == year
        os_yr = out_sample['yearID'] == year
        ols = pandas.ols(x=in_sample.loc[is_yr, no_yr], y=ys[isi][is_yr])
        df = ols.summary_as_matrix
        is_sig = df.loc['p-value', df.loc['p-value', :] < .01].index
        if 'intercept' in is_sig:
            is_sig = is_sig.drop('intercept')
        clf = ensemble.RandomForestRegressor(n_estimators=15)
        clf.fit(in_sample.loc[is_yr, is_sig], ys[isi][is_yr])
        is_score = clf.score(in_sample.loc[is_yr, is_sig], ys[isi][is_yr])
        d_too['is-r2'] = is_score
        os_score = clf.score(out_sample.loc[os_yr, is_sig], ys[osi][os_yr])
        d_too['os-r2'] = os_score
        eps = ys[osi][os_yr].sub(clf.predict(out_sample.loc[os_yr, is_sig]))
        d_too['mae'] = eps.abs().sum() / (len(ys[osi][os_yr]) - 2.)
        d[year] = pandas.Series(d_too)
    return pandas.DataFrame(d).transpose()
def simAnnealing(periods, cooling, tau_start, numChangeNeigh):
    baseball = pd.read_table(
        "/Users/Ahmet/Box Sync/Classes/Vanderbilt/AdvancedStatisticalComputing/Bios8366/data/textbook/baseball.dat",
        sep=r'\s+')
    predictors = baseball.copy()
    logsalary = predictors.pop('salary').apply(np.log)
    nrows, ncols = predictors.shape
    aic_values = []
    solution_current = solution_best = np.random.binomial(
        1, 0.5, ncols).astype(bool)
    solution_vars = predictors[predictors.columns[solution_current]]
    g = pd.ols(y=logsalary, x=solution_vars)
    aic_best = aic(g)
    aic_values.append(aic_best)

    # Cooling schedule
    tau = [tau_start * 0.9 ** i for i in range(periods)]

    for j in range(periods):
        for i in range(cooling[j]):
            # Random change in an n-neighborhood (inner loop variable renamed
            # to k so it does not shadow the stage counter i)
            flip = np.random.choice(ncols, numChangeNeigh, replace=False)
            for k in range(numChangeNeigh):
                solution_current[flip[k]] = not solution_current[flip[k]]
            solution_vars = predictors[predictors.columns[solution_current]]
            g = pd.ols(y=logsalary, x=solution_vars)
            aic_step = aic(g)
            alpha = min(1, np.exp((aic_values[-1] - aic_step) / tau[j]))
            if (aic_step < aic_values[-1]) or (np.random.uniform() < alpha):
                # Accept proposed solution
                aic_values.append(aic_step)
                if aic_step < aic_best:
                    # Replace previous best with this one
                    aic_best = aic_step
                    solution_best = solution_current.copy()
            else:
                # Revert solution
                for k in range(numChangeNeigh):
                    solution_current[flip[k]] = not solution_current[flip[k]]
                aic_values.append(aic_values[-1])
    return aic_values, aic_best, solution_best
def regression_line():
    """Fit a line, using the powerful "ordinary least square" method of pandas"""
    # Get the data
    data = getData("altman_11_6.txt", subDir=r"..\Data\data_altman")
    df = pd.DataFrame(data, columns=["glucose", "Vcf"])
    model = pd.ols(y=df["Vcf"], x=df["glucose"])
    print(model.summary)
    return model.f_stat["f-stat"]  # should be 4.4140184331462571
def equations(self):
    eqs = {}
    for col, ts in iteritems(self.y):
        # TODO: Remove in favor of statsmodels implementation
        model = pd.ols(y=ts, x=self.x, window=self._window,
                       window_type=self._window_type,
                       min_periods=self._min_periods)
        eqs[col] = model
    return eqs