Ejemplo n.º 1
0
def get_linear_regression_results(x, y, intercept=False, rolling_window=-1):
    """
    Regress ``y`` on ``x`` with ``pd.ols`` and tabulate the fit row by row.

    :type x: pd.Series
    :type y: pd.Series
    :param intercept: forwarded to ``pd.ols``; when falsy the constant
        used for the fitted line is 0.
    :param rolling_window: window length for a rolling fit; ``-1`` (the
        default) fits once on the whole sample.
    :return: DataFrame with ``x`` and ``y`` as the first two columns plus
        ``slope``, ``const``, ``f(x)`` (``x * slope + const``) and
        ``error`` (``f(x) - y``).  In rolling mode the first
        ``rolling_window - 1`` rows are dropped.
    """
    rolling = rolling_window != -1
    if rolling:
        model = pd.ols(x=x,
                       y=y,
                       intercept=intercept,
                       window=rolling_window,
                       window_type='rolling')
    else:
        model = pd.ols(x=x, y=y, intercept=intercept)

    # 'beta' is a DataFrame in rolling mode and a Series otherwise, hence
    # the two different lookups for the slope.
    slope = model.beta['x'] if rolling else model.beta[0]
    const = model.beta['intercept'] if intercept else 0

    result = pd.concat([x, y], axis=1)

    if rolling:
        # Drop the leading rows that cannot be fit with a full window.
        # Copy so the column assignments below write to an independent
        # frame instead of a slice of `result` (chained assignment).
        result = result.iloc[rolling_window - 1:, :].copy()

    result['slope'] = slope
    result['const'] = const
    result['f(x)'] = result.iloc[:, 0] * slope + const
    result['error'] = result['f(x)'] - result.iloc[:, 1]
    return result
Ejemplo n.º 2
0
    def dd_plot(self):
        """
        Build the two tables used to plot degree days.

        ``self.temp`` and ``self.degd`` are inner-joined on their indexes.
        Rows with a non-null ``HDD`` and ``temp < 60`` feed one regression
        of ``HDD`` on ``temp``; rows with a non-null ``CDD`` and
        ``temp > 55`` feed one regression of ``CDD`` on ``temp``.  Both
        fitted models are printed.

        :return: tuple ``(vs_temp, vs_time)``.  ``vs_temp`` is indexed by
            ``temp`` and holds the scatter and regression columns;
            ``vs_time`` is indexed by ``time`` and holds ``HDD`` and
            ``CDD`` with missing values replaced by 0.
        """
        data = ps.merge(self.temp,
                        self.degd,
                        left_index=True,
                        right_index=True,
                        how='inner')

        hdd = data[ps.notnull(data['HDD'])]
        cdd = data[ps.notnull(data['CDD'])]

        hday = hdd[hdd['temp'] < 60]
        cday = cdd[cdd['temp'] > 55]

        # print(...) with a single argument behaves the same on Python 2
        # and Python 3, unlike the bare print statement.
        model = ps.ols(x=hday['temp'], y=hday['HDD'])
        mH, cH = model.beta.x, model.beta.intercept
        print(model)
        model = ps.ols(x=cday['temp'], y=cday['CDD'])
        mC, cC = model.beta.x, model.beta.intercept
        print(model)

        xH = hday['temp']
        xC = cday['temp']

        # `line` is defined outside this method; presumably it evaluates
        # slope * x + intercept -- confirm against its definition.
        frames = [
            ps.DataFrame({'temp': hdd['temp'], 'hdd scatter': hdd['HDD']}),
            ps.DataFrame({'temp': cdd['temp'], 'cdd scatter': cdd['CDD']}),
            ps.DataFrame({'temp': xH, 'hdd regression': line(xH, mH, cH)}),
            ps.DataFrame({'temp': xC, 'cdd regression': line(xC, mC, cC)}),
        ]

        # Outer-merge the frames one after another on 'temp', in the same
        # order as the original nested call chain.
        vs_temp = frames[0]
        for frame in frames[1:]:
            vs_temp = vs_temp.merge(frame,
                                    on='temp',
                                    how='outer',
                                    suffixes=('', ''))

        data = data.fillna(0)
        vs_time = ps.DataFrame({
            'time': data.index,
            'HDD': data['HDD'],
            'CDD': data['CDD']
        })

        return vs_temp.set_index('temp'), vs_time.set_index('time')
Ejemplo n.º 3
0
def movingLR(fr_xzdf, fr_xydf, fr_xdf, window_period):
    """
    Replace every column of three frames by its rolling regression slope
    against elapsed time, plotting the first frame before and after.

    :param fr_xzdf: frame whose columns drive the loop; after
        ``reset_index()`` it must expose a ``Time`` column.
    :param fr_xydf: second frame, processed with the same column names.
    :param fr_xdf: third frame, processed with the same column names.
    :param window_period: ``window`` passed to the rolling ``pd.ols`` fit.

    NOTE(review): no return statement is visible in this block, so the
    three re-indexed frames are discarded when the function ends --
    confirm whether the caller expects them back.
    """
    ts_xz = fr_xzdf.reset_index()
    ts_xy = fr_xydf.reset_index()
    ts_x = fr_xdf.reset_index()

    # Elapsed time since the first row: seconds divided by 86400, i.e. days.
    tdelta = ts_xz['Time'] - ts_xz['Time'][0]
    tdelta = tdelta.astype('timedelta64[s]') / (60 * 60 * 24.)

    def _rolling_slope(series):
        """Return the rolling OLS slope of `series` on `tdelta`, rounded to 3 places."""
        model = pd.ols(y=series,
                       x=tdelta,
                       window_type='rolling',
                       window=window_period,
                       intercept=True)
        return np.round(model.beta.x, 3)

    for col in fr_xzdf.columns:
        # Single-argument print(...) works on both Python 2 and Python 3.
        print(col)
        ts_xz[col].plot()  # raw series
        ts_xz[col] = _rolling_slope(ts_xz[col])
        ts_xz[col].plot()  # rolling slope, drawn on the same axes

        ts_xy[col] = _rolling_slope(ts_xy[col])
        ts_x[col] = _rolling_slope(ts_x[col])

        plt.show()

    # All three frames are indexed by the 'Time' column of the first one.
    ts_xz.index = ts_xz['Time']
    ts_xy.index = ts_xz['Time']
    ts_x.index = ts_xz['Time']
Ejemplo n.º 4
0
 def _run_regressions(self):
     """Fit three OLS models of ``RETX`` on growing sets of factor columns.

     The fitted models are stored as ``model_capm`` (``mkt``),
     ``model_ff3f`` (``mkt``, ``smb``, ``hml``) and ``model_ff4f``
     (the same plus ``umd``); ``_has_models`` is set to True afterwards.
     """
     factor_sets = (
         ('model_capm', ['mkt']),
         ('model_ff3f', ['mkt', 'smb', 'hml']),
         ('model_ff4f', ['mkt', 'smb', 'hml', 'umd']),
     )
     for attr_name, factors in factor_sets:
         fitted = pd.ols(y=self.est_data['RETX'], x=self.est_data[factors])
         setattr(self, attr_name, fitted)
     self._has_models = True
Ejemplo n.º 5
0
def half_life(y, x):
    """Return ``-ln(2) / b`` for a pair, where ``b`` comes from two OLS fits.

    First ``y`` is regressed on ``x`` without intercept.  Then the
    difference "previous residual minus current residual" (first row
    dropped) is regressed on the residuals, again without intercept, and
    ``b`` is the first coefficient of that second fit.
    """
    pair_fit = pd.ols(y=y, x=x, intercept=False)
    residual = pair_fit.resid

    previous = residual.shift(1)
    change = previous[1:] - residual[1:]

    decay_fit = pd.ols(y=change, x=residual, intercept=False)
    decay_coef = decay_fit.beta[0]
    return -np.log(2) / decay_coef
Ejemplo n.º 6
0
def degree_day_regression(df, x_opt='both'):
    '''
    Run the weather normalization regression on energy use data.

    df: object exposing the attributes (DataFrame columns)
        use_per_day: the dependent variable
        hdd_per_day: heating degree days per day
        cdd_per_day: cooling degree days per day

    x_opt: which regressors to use
        'hdd': heating degree days only
        'cdd': cooling degree days only
        'both' (default): cooling and heating degree days

    Returns a one-row DataFrame with the columns 'intercept',
    'intercept_std_err', then '<NAME>' and '<NAME>_std_err' for each
    regressor (CDD before HDD), then 'R2', 'R2_adj' and 'N_reads'.

    Raises ValueError for any other x_opt (previously the function fell
    through and returned None).
    '''
    # Regressor labels per option, in output-column order, and the df
    # attribute each label is read from.
    regressors_by_option = {
        'hdd': ['HDD'],
        'cdd': ['CDD'],
        'both': ['CDD', 'HDD'],
    }
    source_attr = {'HDD': 'hdd_per_day', 'CDD': 'cdd_per_day'}

    if x_opt not in ('hdd', 'cdd', 'both'):
        raise ValueError(
            "x_opt must be 'hdd', 'cdd' or 'both', not %r" % (x_opt,))

    regressors = regressors_by_option[x_opt]
    covar = dict((name, getattr(df, source_attr[name])) for name in regressors)
    results = pd.ols(y=df.use_per_day, x=covar)

    # Look coefficients up by label rather than by position, so the output
    # does not depend on the order in which the model lays out its terms.
    values = []
    columns = []
    for label in ['intercept'] + regressors:
        values.extend([results.beta[label], results.std_err[label]])
        columns.extend([label, label + '_std_err'])
    values.extend([results.r2, results.r2_adj, results.nobs])
    columns.extend(['R2', 'R2_adj', 'N_reads'])

    return pd.DataFrame([values], columns=columns)
Ejemplo n.º 7
0
    def compute_hedge_ratios(self, **kwargs):
        """
        Estimate betas between the two series selected by ``settings['fd']``.

        Reads ``self.settings`` (``'rolling_beta_com'``, ``'fd'``) and
        ``self.strat_data`` (``'cm_vol_fut_returns'``,
        ``'cm_vol_fut_prices'``).  Writes to ``self.calc``:

        - ``'front_back_beta'``: from the weighted fit ``r3``
        - ``'rolling_front_back_beta'``: exponentially weighted
          covariance divided by exponentially weighted variance
        - ``'r1'``, ``'r2'``, ``'r3'``: the three fitted models

        ``**kwargs`` is accepted but not read in this method.
        Returns nothing.
        """

        # .get() yields None when the setting is absent.
        rolling_beta_com = self.settings.get('rolling_beta_com')

        # Predict long term volatility move as a function of short
        # fd[1] selects the dependent series, fd[0] the explanatory one.
        fd = self.settings['fd']
        y = self.strat_data['cm_vol_fut_returns'][fd[1]]
        x = pd.DataFrame(index=self.strat_data['cm_vol_fut_returns'].index)
        x['front_return'] = self.strat_data['cm_vol_fut_returns'][fd[0]]
        # Price level shifted by one row.
        x['front_level'] = self.strat_data['cm_vol_fut_prices'][fd[0]].shift(1)
        x['interaction'] = x['front_return'] * x['front_level']
        exog_vars = ['front_return', 'front_level', 'interaction']

        # OLS to start
        r1 = pd.ols(y=y, x=x)

        # Now use OLS residuals for WLS
        # Regress log squared residuals on the same regressors; the
        # exponentiated fit is the predicted squared error.
        r2 = pd.ols(y=np.log(r1.resid**2), x=x[exog_vars])
        pred_sq_err = np.exp(r2.y_fitted)

        weights = 1. / pred_sq_err
        # NOTE: reg_data is the same object as x, so the two column
        # assignments below also add 'endog' and 'weights' to x.
        reg_data = x
        reg_data['endog'] = y
        reg_data['weights'] = weights

        # Keep only rows where every column is finite, then add the
        # constant column referenced as 'const' below.
        reg_data = reg_data[np.isfinite(reg_data).all(axis=1)]
        reg_data = sm.add_constant(reg_data)

        r3 = sm.WLS(endog=reg_data['endog'],
                    exog=reg_data[['const'] + exog_vars],
                    weights=reg_data['weights']).fit()

        # Historical front/back betas
        # front_return coefficient plus level times interaction coefficient.
        self.calc['front_back_beta'] = (
            r3.params.front_return + x['front_level'] * r3.params.interaction)

        # Exponentially weighted cov(y, front_return) / var(front_return).
        # NOTE(review): cov() is handed an ewm object rather than the
        # underlying Series -- confirm this is accepted by the pandas
        # version in use.
        self.calc['rolling_front_back_beta'] = \
            y.ewm(com=rolling_beta_com).cov(x['front_return']
                                            .ewm(com=rolling_beta_com)) \
            / x['front_return'].ewm(com=rolling_beta_com).var()

        # Overwrite the first `buff` entries with the value at position `buff`.
        buff = 21
        self.calc['rolling_front_back_beta'].iloc[0:buff] \
            = self.calc['rolling_front_back_beta'].iloc[buff]

        self.calc['r1'] = r1
        self.calc['r2'] = r2
        self.calc['r3'] = r3
Ejemplo n.º 8
0
def regress():
	"""Run one OLS regression per selected outcome and render the results.

	Reads the project id, outcome names ('rout'), input names ('rinp') and
	control names ('rcont') from ``session`` and the project's table from
	``data[pid]``.  For every outcome a result list is built:

	    [description, adjusted R2, F statistic, F p-value, model.df,
	     model.nobs, input rows, control rows]

	where each input/control row is [name, beta, p-value, std err, t-stat].
	The list of results is stored in ``data["regs"]`` and passed to the
	"regression.html" template.

	NOTE(review): the original wrapped the body in ``if True:`` with an
	``else`` that redirected to "logout"; that branch could never run and
	has been dropped.  Restore a real guard here if one was intended.
	"""
	def stat_row(summary, name):
		"""Return [name, beta, p-value, std err, t-stat] for one regressor."""
		return [
			name,
			round(summary.loc['beta'][name], 4),
			round(summary.loc['p-value'][name], 4),
			round(summary.loc['std err'][name], 2),
			round(summary.loc['t-stat'][name], 2),
		]

	pid = session["pid"]
	regs = []
	csvf = data[pid]
	reg = pd.DataFrame()
	outs = session['rout']
	inps = session['rinp']
	controls = session['rcont']
	for o in outs:
		y = csvf[o]
		for i in inps:
			reg[i] = csvf[i]
		for control in controls:
			reg[control] = csvf[control]
		model = pd.ols(y=y, x=reg)
		formula = "Measuring the impact of \""+', '.join(inps)+"\" on \""+o+"\" while controlling for variables such as \""+', '.join(controls)+"\""
		# Label-based .loc replaces the removed .ix indexer; the rows of
		# the summary matrix are addressed by string labels.
		res = model.summary_as_matrix
		r = [
			formula,
			round(model.r2_adj, 2),
			round(model.f_stat['f-stat'], 2),
			round(model.f_stat['p-value'], 5),
			model.df,
			model.nobs,
			[stat_row(res, i) for i in inps],
			[stat_row(res, c) for c in controls],
		]
		regs.append(r)
		# Kept inside the loop so "regs" is only published once at least
		# one outcome has been processed, as before.
		data["regs"] = regs
	return render_template("regression.html", regs=regs)
Ejemplo n.º 9
0
 def find_cointergrate_stocks(self,stockList):
     """Return the best-ranked cointegrated pair among ``stockList``.

     Close prices are fetched with ``history`` for ``g.adfTest_period``
     daily bars.  For every pair (earlier stock first) with at least 500
     rows where both prices are present, the second price is regressed on
     the first with an intercept, and ``sts.adfuller`` is run on the
     spread ``price2 - price1 * slope``.  A pair is kept when the test
     p-value is below 0.05, the statistic is below its '5%' critical
     value and the slope is positive.

     :return: a list with at most one item, the entry with the smallest
         test statistic, shaped as
         ``((stock1, stock2, intercept, slope, std(spread), mean(spread)), statistic)``.
     """
     stocks_pair = {}
     price_df = history(g.adfTest_period, unit='1d', field='close', security_list=stockList, df=True, skip_paused=False, fq='pre')
     for i, stock1 in enumerate(stockList):
         stock1_price = price_df[stock1]
         for stock2 in stockList[i + 1:]:
             stock2_price = price_df[stock2]
             combined_df = pd.concat([stock1_price, stock2_price], axis=1)
             print ('combined_df is ', combined_df)
             combined_df = combined_df.dropna()
             if len(combined_df) < 500:
                 continue
             # Use separate names for the NaN-filtered series.  The
             # original reassigned stock1_price here, so rows dropped
             # because of one partner stayed dropped for every later
             # partner of the same stock1.
             aligned2 = combined_df[stock2]
             aligned1 = combined_df[stock1]
             model = pd.ols(y=aligned2, x=aligned1, intercept=True)   # perform ols on these two stocks
             slope = model.beta['x']
             spread = aligned2 - aligned1 * slope
             spread = spread.dropna()
             spread = spread.values
             sta = sts.adfuller(spread, 1)
             if sta[1] < 0.05 and sta[0] < sta[4]['5%'] and slope > 0:
                 # The intercept is looked up by label instead of position 1.
                 stocks_pair[(stock1, stock2, model.beta['intercept'], slope, np.std(spread), np.mean(spread))] = sta[0]
     rank = sorted(stocks_pair.items(), key=operator.itemgetter(1))
     return rank[:1]
Ejemplo n.º 10
0
def regress_by_year(isi, in_sample, osi, out_sample):
    """
    Fit a per-year random forest on the OLS-significant columns.

    For each distinct ``yearID`` in ``in_sample``: run ``pandas.ols`` on
    that year's rows (all columns except ``yearID``), keep the columns
    whose p-value is below 0.01 (never the intercept), fit a 15-tree
    ``RandomForestRegressor`` on them and record

    - ``is-r2``: the forest's score on that year's in-sample rows
    - ``os-r2``: its score on that year's out-of-sample rows
    - ``mae``: summed absolute out-of-sample error divided by ``n - 2``

    Returns a DataFrame with one row per year.

    NOTE(review): ``ys`` (indexed by ``isi`` / ``osi``) and the dict ``d``
    that collects the per-year results are not defined in this function;
    they are resolved from the enclosing scope.
    """
    years = in_sample['yearID'].unique()
    predictors = in_sample.columns.drop('yearID')
    for year in years:
        in_rows = in_sample['yearID'] == year
        out_rows = out_sample['yearID'] == year
        y_in = ys[isi][in_rows]

        summary = pandas.ols(x=in_sample.loc[in_rows, predictors],
                             y=y_in).summary_as_matrix
        p_values = summary.loc['p-value', :]
        significant = summary.loc['p-value', p_values < .01].index
        if 'intercept' in significant:
            significant = significant.drop('intercept')

        x_in = in_sample.loc[in_rows, significant]
        forest = ensemble.RandomForestRegressor(n_estimators=15)
        forest.fit(x_in, y_in)
        metrics = {'is-r2': forest.score(x_in, y_in)}

        x_out = out_sample.loc[out_rows, significant]
        y_out = ys[osi][out_rows]
        metrics['os-r2'] = forest.score(x_out, y_out)

        residuals = y_out.sub(forest.predict(x_out))
        metrics['mae'] = residuals.abs().sum() / (len(y_out) - 2.)

        d[year] = pandas.Series(metrics)

    return pandas.DataFrame(d).transpose()
Ejemplo n.º 11
0
def mv_regression(xs, ys, in_sample_size):
    """
    Fit a multi-variate regression in sample and score it out of sample.

    Args:
    -----
    - xs: `pandas.DataFrame` of the xs
    - ys: `pandas.Series` of the variable we're attempting to predict
    - in_sample_size: integer of the size of the `in sample` we want
      to use to train our regression

    Returns:
    ---------
    float: the summed absolute out-of-sample error divided by ``n - 2``.

    NOTE(review): the result is named ``mse`` but the errors are taken in
    absolute value, not squared -- confirm which metric is intended.
    """
    isi, in_sample, osi, out_sample = create_in_out_samples(xs, in_sample_size)

    # fit on the in-sample rows and split the coefficients
    coefficients = pandas.ols(x=in_sample, y=ys[isi]).beta
    intercept = coefficients['intercept']
    slopes = coefficients[coefficients.index != 'intercept']

    # predict the out-of-sample rows and measure the absolute error
    predicted = out_sample.dot(slopes) + intercept
    abs_error = (predicted - ys[osi]).apply(numpy.abs)
    n_obs = abs_error.shape[0]
    mse = abs_error.sum() / (n_obs - 2)

    return mse
Ejemplo n.º 12
0
def test_window_ols_full(ols_data):
    """Check three full-sample fits of the same data against each other.

    ``_window_ols`` with ``window_type='full_sample'`` is asserted equal
    to ``_window_ols`` called without a window type and to ``pd.ols``
    with ``window_type='full_sample'``.
    """
    y = ols_data['y']
    x = ols_data['x']
    explicit_full = _window_ols(y, x, window_type='full_sample')
    default_call = _window_ols(y, x)
    pandas_full = pd.ols(y=y, x=x, window_type='full_sample')
    assert_ols_equal(explicit_full, default_call)
    assert_ols_equal(explicit_full, pandas_full)
Ejemplo n.º 13
0
def PowerFit_CI(x, y, xspace=None, ax=plt, **kwargs):
    """
    Plot the power-law fit ``y = a * x**b`` with a shaded confidence band.

    :param x: Series of x values.
    :param y: Series of y values.
    :param xspace: x values to evaluate the curve at.  When None, an
        evenly spaced grid from 0 to ``1.2 * x.max()`` is used.
    :param ax: object providing ``plot`` and ``fill_between``
        (defaults to ``plt``).
    :param kwargs: forwarded to both ``ax.plot`` and ``ax.fill_between``.
    :return: the result of ``powerfunction(x, y)``, from which ``a`` and
        ``b`` are read.
    """
    # Put x and y in one frame so rows missing either value are dropped
    # together, then move to log10 space for the linear fit.
    datadf = pd.DataFrame.from_dict({
        'x': x,
        'y': y
    }).dropna().apply(np.log10)
    regression = pd.ols(y=datadf['y'], x=datadf['x'])

    # Power function coefficients come from the external helper.
    powfunc = powerfunction(x, y)  # x and y should be Series
    a, b = powfunc['a'].values, powfunc['b'].values

    # `is None` instead of `== None`: comparing an array with == yields an
    # element-wise result whose truth value is ambiguous.
    if xspace is None:
        xvals = np.linspace(0, x.max() * 1.2)
    else:
        xvals = xspace

    ypred = a * (xvals**b)
    ax.plot(xvals, ypred, **kwargs)

    # Confidence intervals: fetch the bounds once.  Row 1 is exponentiated
    # (base 10) to bound the multiplier `a`; row 0 bounds the exponent `b`.
    ci = .5
    bounds = regression.sm_ols.conf_int(alpha=ci)
    a_cilo, a_ciup = 10**bounds[1][0], 10**bounds[1][1]
    b_cilo, b_ciup = bounds[0][0], bounds[0][1]

    ypred_cilo = a_cilo * (xvals**b_cilo)
    ypred_ciup = a_ciup * (xvals**b_ciup)
    ax.fill_between(xvals, ypred_cilo, ypred_ciup, alpha=0.5, **kwargs)
    plt.draw()
    return powfunc
Ejemplo n.º 14
0
    def compute_exposures(factor_returns=None, target_returns=None):
        """
        Regress every target series on the factor returns.

        :param factor_returns: Pandas DataFrame indexed on date,
        with columns = factors
        :param target_returns: a series indexed on date, or a DataFrame
        indexed on date with one column per time series you want
        exposures for
        :return: tuple ``(coefs, t_stats, regressions)``: two DataFrames
        with one row per target and one column per factor, and a dict
        mapping each target column to its fitted ``pd.ols`` model
        """
        targets = target_returns
        if isinstance(targets, pd.Series):
            # a lone series becomes a one-column frame
            targets = pd.DataFrame(targets)

        factor_names = factor_returns.columns
        coefs = pd.DataFrame(index=targets.columns, columns=factor_names)
        t_stats = pd.DataFrame(index=targets.columns, columns=factor_names)

        regressions = {}
        for name in targets.columns:
            fitted = pd.ols(y=targets[name], x=factor_returns)
            regressions[name] = fitted
            coefs.loc[name] = fitted.beta
            t_stats.loc[name] = fitted.t_stat

        return coefs, t_stats, regressions
Ejemplo n.º 15
0
    def calculate_signals(self, event):
        """
        Print the z-score of the spread between the y and x symbols.

        Nothing happens unless ``event.type`` is ``'MARKET'`` and both
        symbols have more than ``self.look_back`` bars available.

        :param event: object with a ``type`` attribute
        :return: None
        """
        if event.type != 'MARKET':
            return
        if not (self.bars.get_current_bar_total_number(self.y_symbol) > self.look_back):
            return
        if not (self.bars.get_current_bar_total_number(self.x_symbol) > self.look_back):
            return

        y_bars = self.bars.get_latest_bars_values(
            self.y_symbol, "adj_close", N=self.look_back)
        x_bars = self.bars.get_latest_bars_values(
            self.x_symbol, "adj_close", N=self.look_back)

        # One OLS fit of y on x over the latest look_back bars (no window
        # argument is passed); its 'x' coefficient is the hedge ratio.
        fitted = pd.ols(y=y_bars, x=x_bars)
        hedge_ratio = fitted.beta['x']

        # Spread between the two series, standardised by its own mean
        # and standard deviation.
        spread = y_bars - hedge_ratio*x_bars
        centred = spread - np.mean(spread)
        zscore = centred/np.std(spread)

        print(zscore)
Ejemplo n.º 16
0
def ar_periodogram(x, window='hanning', window_len=7):
    """
    Compute a periodogram of x using prewhitening, smoothing and recoloring.

    An AR(1) regression of x_t on x_{t-1} (with intercept) is fitted, a
    smoothed periodogram of its residuals is computed, and the result is
    divided by |1 - phi * exp(i w)|^2, where phi is the fitted AR
    coefficient.

    Parameters:

        * x is a NumPy array containing time series data
        * window is a string indicating window type
        * window_len is an odd integer

    See the periodogram function documentation for more details on the window
    arguments.

    Returns the pair (w, I_w).
    """
    # --- AR(1) regression: x_t on x_{t-1} --- #
    lagged = Series(x[:-1])
    current = Series(x[1:])
    ar_fit = ols(y=current, x=lagged, intercept=True, nw_lags=1)
    residuals = ar_fit.resid.values
    phi = ar_fit.beta['x']

    # --- periodogram of the residuals --- #
    w, I_w = periodogram(residuals, window=window, window_len=window_len)

    # --- recolor with the fitted coefficient --- #
    recolor_factor = np.abs(1 - phi * np.exp(1j * w))**2
    return w, I_w / recolor_factor
Ejemplo n.º 17
0
    def model(self, dd, city_code, business_class, per_meter):
        '''
        Regress usage data to heating and cooling degree days

        :param dd: heating and cooling degree days
        :param city_code: 'PRINCETON TWP' or 'PRINCETON BORO'
        :param business_class: a single business_class or a collection thereof
            - if a collection is given, the values are summed up
        :param per_meter: True for consumption per meter, False for aggregate
        :return: tuple of the fitted coefficients
            ``(intercept, HDD, CDD)``
        '''
        data = ps.DataFrame(self.select(city_code, business_class, per_meter))
        data['year'] = data.index.map(lambda d: d.year)
        # only the 'usage' values of years after 2009 are regressed
        data = data[data['year'] > 2009]['usage']
        #
        # index degree days frame for correspondence to usage data;
        # entries left missing by the lookup are treated as 0.0
        #
        # NOTE(review): .ix is deprecated in later pandas releases; it is
        # kept here because this code already depends on a pandas version
        # that still provides ps.ols.
        dd = dd.ix[data.index].fillna(0.0)
        #
        # regress to heating and cooling degree days
        #
        model = ps.ols(x=dd, y=data)
        # print(...) with one pre-formatted string produces the same
        # output on Python 2 and Python 3.
        print('%s %s %s' % (city_code, business_class, per_meter))
        print(model)
        print('-' * 60)
        return model.beta.intercept, model.beta.HDD, model.beta.CDD
Ejemplo n.º 18
0
def ts_regrFn(df, dep, indep, min_periods, max_periods):
    """
    Rolling OLS of ``df[dep]`` on ``df[indep]``, returned on df's own index.

    :param df: frame whose index level 0 is moved into a column before
        fitting and restored afterwards.
    :param dep: name of the dependent column.
    :param indep: list of regressor column names.
    :param min_periods: minimum complete rows required; also passed to
        ``pd.ols``.
    :param max_periods: rolling window length; a falsy value means
        ``len(df[dep])``.
    :return: frame with the columns ``<regressor>_beta`` for every
        regressor plus ``intercept``.  When fewer than ``min_periods``
        complete rows exist, an all-NaN frame on the original index is
        returned instead.
    """
    if not max_periods:
        max_periods = len(df[dep])

    original_index = df.index
    index_names = original_index.names
    out_cols = [col + '_beta' for col in indep] + ['intercept']

    flat = df.reset_index([0])
    complete_rows = flat[indep + [dep]].dropna(how='any')
    if min(complete_rows.count()) < min_periods:
        # not enough data to fit anything
        return DataFrame(nan, index=original_index, columns=out_cols)

    rolling_fit = pd.ols(y=flat[dep],
                         x=flat[indep],
                         window_type='rolling',
                         window=max_periods,
                         min_periods=min_periods)
    # Coefficient columns collide with the regressor names, so they pick
    # up the '_beta' suffix in the merge.
    merged = pd.merge(flat,
                      rolling_fit.beta,
                      left_index=True,
                      right_index=True,
                      how='outer',
                      suffixes=['', '_beta'])
    merged = merged.reset_index().set_index(index_names)
    return merged[out_cols]
Ejemplo n.º 19
0
def regress_by_year(isi, in_sample, osi, out_sample):
    """
    Score a random forest per ``yearID`` using OLS-selected columns.

    Per year: ``pandas.ols`` is run on the in-sample rows of that year
    (``yearID`` excluded), the non-intercept columns with p-value < 0.01
    are kept, a ``RandomForestRegressor`` with 15 estimators is fitted on
    them, and three numbers are stored: ``is-r2`` (in-sample score),
    ``os-r2`` (out-of-sample score) and ``mae`` (summed absolute
    out-of-sample error over ``n - 2``).

    Returns the per-year results as a DataFrame, one row per year.

    NOTE(review): ``ys`` and ``d`` are free names here -- they must exist
    in the enclosing scope, and ``d`` is written to on every call.
    """
    for year in in_sample['yearID'].unique():
        regressors = in_sample.columns.drop('yearID')
        train_mask = in_sample['yearID'] == year
        test_mask = out_sample['yearID'] == year

        fit = pandas.ols(x=in_sample.loc[train_mask, regressors],
                         y=ys[isi][train_mask])
        pvals = fit.summary_as_matrix.loc['p-value', :]
        keep = pvals[pvals < .01].index
        keep = keep.drop('intercept') if 'intercept' in keep else keep

        train_x = in_sample.loc[train_mask, keep]
        train_y = ys[isi][train_mask]
        clf = ensemble.RandomForestRegressor(n_estimators=15)
        clf.fit(train_x, train_y)
        in_score = clf.score(train_x, train_y)

        test_x = out_sample.loc[test_mask, keep]
        test_y = ys[osi][test_mask]
        out_score = clf.score(test_x, test_y)
        errors = test_y.sub(clf.predict(test_x))

        d[year] = pandas.Series({
            'is-r2': in_score,
            'os-r2': out_score,
            'mae': errors.abs().sum() / (len(test_y) - 2.),
        })

    return pandas.DataFrame(d).transpose()
Ejemplo n.º 20
0
Archivo: ret.py Proyecto: xie3ge/tia
    def get_alpha_beta(self, bm_rets):
        """Return this object's alpha and beta against a benchmark.

        ``bm_rets`` may be a ``pd.Series`` (wrapped in ``CumulativeRets``)
        or a ``CumulativeRets``; anything else raises ``ValueError``.
        When the benchmark's ``pds_per_year`` differs from this object's,
        the attribute of ``self`` matching the benchmark frequency is used
        in place of ``self``; an unmapped frequency raises ``ValueError``.

        Beta is the first coefficient of an OLS fit of the returns on the
        benchmark returns truncated to the same date range; alpha is
        ``ltd_ann - beta * benchmark ltd_ann``.

        :return: ``pd.Series`` with the entries ``alpha`` and ``beta``,
            named after the guessed benchmark frequency.
        """
        if not isinstance(bm_rets, (pd.Series, CumulativeRets)):
            raise ValueError('bm_rets must be series or CumulativeRetPerformace not %s' % (type(bm_rets)))
        bm = CumulativeRets(bm_rets) if isinstance(bm_rets, pd.Series) else bm_rets

        bm_freq = guess_freq(bm_rets)
        if self.pds_per_year == bm.pds_per_year:
            source = self
        else:
            # pick the attribute of self sampled at the benchmark frequency
            attr_by_freq = {'B': 'dly', 'W': 'weekly', 'M': 'monthly', 'Q': 'quarterly', 'A': 'annual'}
            tgt = attr_by_freq.get(bm_freq, None)
            if tgt is None:
                raise ValueError('No mapping for handling benchmark with frequency: %s' % bm_freq)
            source = getattr(self, tgt)
        y = source.rets
        y_ann = source.ltd_ann

        first_date, last_date = y.index[0], y.index[-1]
        x = bm.rets.truncate(first_date, last_date)
        x_ann = bm.ltd_ann

        beta = pd.ols(x=x, y=y).beta[0]
        alpha = y_ann - beta * x_ann
        return pd.Series({'alpha': alpha, 'beta': beta}, name=bm_freq)
Ejemplo n.º 21
0
def mv_regression(xs, ys, in_sample_size):
    """
    Train a multi-variate OLS in sample, then score it out of sample.

    Args:
    -----
    - xs: `pandas.DataFrame` of the xs
    - ys: `pandas.Series` of the variable we're attempting to predict
    - in_sample_size: integer of the size of the `in sample` we want
      to use to train our regression

    Returns:
    ---------
    float: sum of the absolute out-of-sample errors divided by the
    number of out-of-sample rows minus 2.

    NOTE(review): although the local is called ``mse``, no squaring takes
    place; the value is built from absolute errors.
    """
    isi, in_sample, osi, out_sample = create_in_out_samples(xs, in_sample_size)

    # in-sample fit
    fitted = pandas.ols(x=in_sample, y=ys[isi])
    all_betas = fitted.beta
    intercept = all_betas['intercept']
    not_intercept = all_betas.index != 'intercept'
    betas = all_betas[not_intercept]

    # out-of-sample prediction and error
    pred = out_sample.dot(betas) + intercept
    deviation = pred - ys[osi]
    eps = deviation.apply(numpy.abs)
    denominator = eps.shape[0] - 2
    mse = eps.sum() / denominator

    return mse
def fit(x, y, funcstr, *args, **kwargs):
    """
    Fit ``y`` against ``x`` with the model named by ``funcstr``.

    Supported names:

    - ``'power'``: polynomial of degree ``args[0]`` fitted with
      ``pandas.ols``
    - ``'linear'`` / ``'quadratic'``: ``'power'`` with degree 1 / 2
    - ``'exponential'``: a linear fit of ``log(y)``; the parameters
      become ``[exp(offset), slope]``

    Returns a ``Struct`` with ``params`` (highest power first, intercept
    last, for the polynomial fits), ``type``, ``label`` (a printable
    formula) and ``pandas_result`` (the ``pandas.ols`` model).

    Raises ``ValueError`` for an unknown ``funcstr``.  ``**kwargs`` is
    accepted but not read.
    """
    x = pandas.Series(array(x))
    y = pandas.Series(array(y))

    x, y = remove_nan(x, y)

    if funcstr == 'linear' or funcstr == 'quadratic':
        degree = 1 if funcstr == 'linear' else 2
        result = fit(x, y, 'power', degree)
        result.type = 'linear' if degree == 1 else 'quadratic'
        return result

    if funcstr == 'exponential':
        result = fit(x, np.log(y), 'linear')
        # linear params are [slope, offset]; store [exp(offset), slope]
        slope, offset = result.params[0], result.params[1]
        result.params = [np.exp(offset), slope]
        p = result.params
        result.label = 'y= %.4e exp(%.4e x)' % (p[0], p[1])
        result.type = 'exponential'
        return result

    if funcstr != 'power':
        raise ValueError('Unknown fit name %s' % funcstr)

    power = args[0]
    data = pandas.DataFrame({'x': x, 'y': y})

    # one extra column per power from 2 up to `power`
    keys = ['x']
    for exponent in range(2, power + 1):
        key = 'x%d' % exponent
        data[key] = x**exponent
        keys.append(key)

    result2 = pandas.ols(y=data['y'], x=data[keys])

    # coefficients ordered from the highest power down to the intercept
    ordered_keys = keys[::-1] + ['intercept']
    p = [result2.beta[s] for s in ordered_keys]

    terms = []
    top = len(p) - 1
    for position, pv in enumerate(p):
        pw = top - position
        if pw == 1:
            terms.append('%.4e x' % (pv))
        elif pw == 0:
            terms.append('%.4e' % (pv))
        else:
            terms.append('%.4e x^%d' % (pv, pw))
    labelstr = 'y= ' + ' + '.join(terms)

    result = Struct()
    result.params = p
    result.type = 'power'
    result.label = labelstr
    result.pandas_result = result2
    return result
Ejemplo n.º 23
0
def test_window_ols_full(ols_data):
    """Assert that three ways of requesting a full-sample fit agree.

    The baseline is ``_window_ols(..., window_type='full_sample')``; it is
    compared with ``_window_ols`` called with no window type and with
    ``pd.ols(..., window_type='full_sample')``.
    """
    y, x = ols_data['y'], ols_data['x']
    baseline = _window_ols(y, x, window_type='full_sample')
    candidates = (
        _window_ols(y, x),
        pd.ols(y=y, x=x, window_type='full_sample'),
    )
    for candidate in candidates:
        assert_ols_equal(baseline, candidate)
Ejemplo n.º 24
0
def pandas_rolling_ols(single_id_dataframe,
                       date_column="AdjDate"):
    """
    Perform rolling ols and return the frame with date-based beta columns.

    The input is sorted by ``date_column`` (ascending) and indexed by it.
    ``TotalReturnMonthly - RiskFreeRate`` is regressed on ``ExcessMarket``
    with ``window=60`` and ``min_periods=12``; the 'x' coefficient and its
    t-stat are joined on as ``Beta`` and ``Beta_tstat``.

    This is best-effort: if the regression or the join fails for any
    reason, both columns are filled with NaN instead of raising.

    :param single_id_dataframe: frame holding the columns named above
        plus ``date_column``; it is not modified.
    :param date_column: name of the column to sort and index by.
    :return: the sorted, date-indexed frame with ``Beta`` and
        ``Beta_tstat`` added.
    """
    # sort_values replaces DataFrame.sort, which no longer exists in
    # current pandas.
    df = (
        single_id_dataframe
        .sort_values(date_column, ascending=True)
        .set_index(date_column)
    )

    try:
        ols_result = pandas.ols(
            y=df["TotalReturnMonthly"] - df["RiskFreeRate"],
            x=df["ExcessMarket"],
            window=60,
            min_periods=12,
            intercept=True
        )

        beta = ols_result.beta['x']
        beta.name = "Beta"
        beta_tstat = ols_result.t_stat['x']
        beta_tstat.name = "Beta_tstat"
        df = df.join(beta).join(beta_tstat)

    except Exception:
        # Deliberate fallback, but no longer swallows KeyboardInterrupt
        # or SystemExit the way a bare `except:` did.  np.nan is used
        # because the np.NaN alias was removed in NumPy 2.0.
        df["Beta"] = np.nan
        df["Beta_tstat"] = np.nan

    return df
Ejemplo n.º 25
0
def calculate_spread_zscore(pairs, symbols, lookback=100):
    """Add hedge ratio, spread and z-score columns to ``pairs``.

    The ``<symbol>_close`` column of ``symbols[0]`` is regressed on that
    of ``symbols[1]`` with ``pd.ols`` using ``window=lookback``.  The 'x'
    coefficient is stored as ``hedge_ratio``, the spread is
    ``y - hedge_ratio * x`` and the z-score standardises the spread with
    the mean and standard deviation of the whole spread column.

    ``pairs`` is modified in place and also returned.
    """
    y_col = '%s_close' % symbols[0].lower()
    x_col = '%s_close' % symbols[1].lower()

    # Regression between the two closing price series
    fitted = pd.ols(y=pairs[y_col], x=pairs[x_col], window=lookback)

    # Rows before the first available coefficient stay NaN (no dropna here)
    pairs['hedge_ratio'] = fitted.beta['x']

    # Spread, then its z-score
    pairs['spread'] = pairs[y_col] - pairs['hedge_ratio']*pairs[x_col]
    spread_mean = np.mean(pairs['spread'])
    spread_std = np.std(pairs['spread'])
    pairs['zscore'] = (pairs['spread'] - spread_mean)/spread_std

    return pairs
Ejemplo n.º 26
0
def calculate_spread_zscore(pairs, symbols, lookback=100):
    """Creates a hedge ratio between the two symbols by calculating
    a rolling linear regression with a defined lookback period. This
    is then used to create a z-score of the 'spread' between the two
    symbols based on a linear combination of the two.

    :param pairs: frame with one ``<symbol>_close`` column (lower-case
        symbol) per entry of ``symbols``; it receives a ``hedge_ratio``
        column in place.
    :param symbols: two symbols; the first is regressed on the second.
    :param lookback: window passed to ``pd.ols``.  This was previously
        ignored in favour of a hard-coded 100, which equals the default.
    :return: a NaN-free copy of ``pairs`` with ``hedge_ratio``,
        ``spread`` and ``zscore`` columns.
    """
    y_col = '%s_close' % symbols[0].lower()
    x_col = '%s_close' % symbols[1].lower()

    # Use the pandas Ordinary Least Squares method to fit a rolling
    # linear regression between the two closing price time series
    print("Fitting the rolling Linear Regression...")
    model = pd.ols(y=pairs[y_col],
                   x=pairs[x_col],
                   window=lookback)

    # Construct the hedge ratio and eliminate the first
    # lookback-length empty/NaN period
    pairs['hedge_ratio'] = model.beta['x']
    pairs = pairs.dropna()

    # Create the spread and then a z-score of the spread.  The columns
    # are taken from `symbols` instead of the hard-coded 'spy_close' /
    # 'iwm_close', so the spread matches the regression above.
    print("Creating the spread/zscore columns...")
    pairs['spread'] = pairs[y_col] - pairs['hedge_ratio'] * pairs[x_col]
    # NOTE: this is biased -- the mean and standard deviation are taken
    # over the whole spread column, not over a trailing window.
    pairs['zscore'] = (pairs['spread'] - np.mean(pairs['spread'])) / np.std(
        pairs['spread'])
    return pairs
Ejemplo n.º 27
0
def ar_periodogram(x, window='hanning', window_len=7):
    """
    Periodogram of x with AR(1) prewhitening, smoothing and recoloring.

    Steps: regress x_t on x_{t-1} with an intercept, take a smoothed
    periodogram of the regression residuals, then divide it by
    |1 - phi * exp(i w)|^2 using the fitted coefficient phi.

    Parameters:

        * x is a NumPy array containing time series data
        * window is a string indicating window type
        * window_len is an odd integer

    See the periodogram function documentation for more details on the window
    arguments.

    Returns (w, I_w): the frequencies and the recolored periodogram.
    """
    # === prewhiten: regress x_t on x_{t-1} === #
    results = ols(y=Series(x[1:]),
                  x=Series(x[:-1]),
                  intercept=True,
                  nw_lags=1)
    phi = results.beta['x']
    e_hat = results.resid.values

    # === first-pass periodogram on the residuals === #
    w, residual_spectrum = periodogram(e_hat,
                                       window=window,
                                       window_len=window_len)

    # === recolor === #
    transfer = 1 - phi * np.exp(1j * w)
    I_w = residual_spectrum / np.abs(transfer)**2
    return w, I_w
Ejemplo n.º 28
0
def plot(request, c="population density"):
    """
    Django view: return a PNG scatter plot of life expectancy against one
    indicator, with the fitted OLS line drawn on top.

    :param request: the incoming request (not inspected here)
    :param c: key into VARIABLES_DICT selecting the indicator column of
        'myapp/merged.csv'
    :return: HttpResponse with content type image/png
    :raises Http404: if saving the figure raises ValueError
    """
    # write bytes instead of file.
    from io import BytesIO

    indicator = VARIABLES_DICT[c]
    filename = join(settings.STATIC_ROOT, 'myapp/merged.csv')
    df = pd.read_csv(filename)

    fig = plt.figure()  # needed, to avoid adding curves in plot
    try:
        lm = pd.ols(x=df[indicator], y=df['life expectancy'])
        plt.plot(df[indicator], df["life expectancy"], 'ro', color="blue")
        plt.plot(df[indicator], lm.y_fitted, 'r', linewidth=2)
        plt.tight_layout()
        plt.ylabel('life expectancy')
        plt.xlabel(indicator)
        plt.title('Regression between ' + 'life expectancy and ' + indicator,
                  fontsize=15)

        figfile = BytesIO()

        # this is where the color is used.
        try:
            plt.savefig(figfile, format='png')
        except ValueError:
            raise Http404("No such color")
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # without this each request would leak one figure.
        plt.close(fig)

    return HttpResponse(figfile.getvalue(), content_type="image/png")
Ejemplo n.º 29
0
    def compute_hedge_ratios(self, **kwargs):
        """
        Estimate spot/vol betas and store them in self.calc.

        Reads self.settings['fd'], self.settings['index_fut_ticker'] and
        the 'cm_vol_fut_prices' / 'index_fut_returns' entries of
        self.strat_data.  Writes self.calc['spot_vol_beta'] and
        self.calc['rolling_spot_vol_beta'].  Nothing is returned.

        Keyword arguments:
            rolling_beta_com -- 'com' passed to every ewm() call used for
                the rolling beta (default 21)
        """

        fd = self.settings['fd']
        rolling_beta_com = kwargs.get('rolling_beta_com', 21)

        # Trailing betas
        # Dependent variable: first difference of the selected vol future
        y = self.strat_data['cm_vol_fut_prices'][fd].diff(1)

        # Predict volatility move as a function of stuff
        # Regressors: index return, previous-row vol level, and their product
        x = pd.DataFrame(index=self.strat_data['index_fut_returns'].index)
        x['index_return'] = self.strat_data['index_fut_returns'][
            self.settings['index_fut_ticker']]
        x['vol_level'] = self.strat_data['cm_vol_fut_prices'][fd].shift(1)
        x['interaction'] = x['index_return'] * x['vol_level']
        exog_vars = ['index_return', 'vol_level', 'interaction']

        # Step 1: plain OLS.  Step 2: regress the log squared residuals on
        # the same regressors; the exponentiated fit is the predicted
        # squared error used to build the WLS weights below.
        r1 = pd.ols(y=y, x=x)
        r2 = pd.ols(y=np.log(r1.resid**2), x=x[exog_vars])
        pred_sq_err = np.exp(r2.y_fitted)

        weights = 1. / pred_sq_err
        # NOTE: reg_data is the same object as x here, so the two column
        # assignments below also add 'endog' and 'weights' to x.
        reg_data = x
        reg_data['endog'] = y
        reg_data['weights'] = weights

        # Keep only rows where every column is finite, then add the
        # 'const' column that the WLS design matrix selects.
        reg_data = reg_data[np.isfinite(reg_data).all(axis=1)]
        reg_data = sm.add_constant(reg_data)

        # Step 3: weighted least squares with the estimated weights
        r3 = sm.WLS(endog=reg_data['endog'],
                    exog=reg_data[['const'] + exog_vars],
                    weights=reg_data['weights']).fit()

        # Beta evaluated on a grid of vol levels 10..50.
        # NOTE(review): beta_df is filled in but not read again or stored
        # anywhere within this method.
        vol_level_grid = np.arange(10, 51)
        beta_df = pd.DataFrame(index=vol_level_grid, columns=['beta'])
        for i in range(0, len(vol_level_grid)):
            beta_df.loc[vol_level_grid[i], 'beta'] \
                = (1.0 / 100.0) * (r3.params.index_return +
                                   vol_level_grid[i] * r3.params.interaction)

        # Historical spot-vol betas (vol points per 1%)
        self.calc['spot_vol_beta'] = (r3.params.index_return + x['vol_level'] *
                                      r3.params.interaction) / 100.0

        # Exponentially weighted covariance of y with the index return,
        # divided by the exponentially weighted variance of the index return.
        # NOTE(review): cov() is handed an ewm() object rather than a
        # Series -- confirm the pandas version in use accepts that.
        self.calc['rolling_spot_vol_beta'] = \
            y.ewm(com=rolling_beta_com).cov(x['index_return']
                .ewm(com=rolling_beta_com)) \
            / x['index_return'].ewm(com=rolling_beta_com).var()
Ejemplo n.º 30
0
def sig_veh_buy(data):
    """
    Extract the MMR columns that are valuable when compared to the
    VehBCost (i.e. what someone paid at the auction)

    Each MMR price column is turned into the ratio VehBCost / column.  A
    ratio passes the rough cut when its single-regressor fit against
    'IsBadBuy' has a p-value below .05.  The survivors are then regressed
    jointly, and every ratio whose p-value exceeds 1e-3 is dropped; this
    repeats until no ratio is dropped.

    ARGS:

        data: :class:`pandas.DataFrame` of the lemon training data

    RETURNS:

        :class:`pandas.DataFrame` holding the retained VehBCost / MMR
        ratio columns (empty when no ratio is retained)
    """

    cols = [
        'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice',
        'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice',
        'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice',
        'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice'
    ]

    to_use = {}
    # go through and construct a rough cut at the 5% significance level
    for col in cols:
        xs = data['VehBCost'].div(data[col])
        # infinite ratios are treated as missing
        xs[numpy.isinf(xs)] = numpy.nan
        fit = pandas.ols(x=xs, y=data['IsBadBuy'])
        if fit.p_value['x'] < .05:
            to_use[col] = xs

    is_sig = 1e-3
    while True:
        # now trim down to the most parsimonious model
        buy_df = pandas.DataFrame(to_use)
        if not to_use:
            # nothing left to regress on
            break
        fit = pandas.ols(x=buy_df, y=data['IsBadBuy'])
        weak = fit.p_value[fit.p_value > is_sig].index
        droppable = [name for name in weak if name in to_use]
        if len(droppable) < len(weak):
            # a weak term that is not one of our ratios (the intercept)
            print("Intercept not significant")
        if not droppable:
            # Stop here: previously a lone insignificant intercept left
            # to_use unchanged and the loop never terminated.
            break
        for name in droppable:
            del to_use[name]
    return buy_df
Ejemplo n.º 31
0
def geneticAlgo(mutation_rate, pop_size):
    """
    Genetic-algorithm variable selection for the baseball salary data.

    Each individual is a 0/1 vector over the predictor columns; its score
    is aic() of the OLS fit of log salary on the selected columns.  Every
    generation picks one parent per pair with probability given by
    calculate_fitness() and the other uniformly at random, applies
    one-point crossover and then flips each gene of both children with
    probability mutation_rate.  100 generations are run.

    :param mutation_rate: per-gene flip probability
    :param pop_size: number of individuals; each generation produces
        2 * int(pop_size / 2) children, so an odd value shrinks the
        population by one after the first generation
    :return: list with the lowest AIC of each generation
    """

    baseball = pd.read_table(
        "/Users/Ahmet/Box Sync/Classes/Vanderbilt/AdvancedStatisticalComputing/Bios8366/data/textbook/baseball.dat",
        sep=r'\s+')
    predictors = baseball.copy()
    logsalary = predictors.pop('salary').apply(np.log)
    ncols = predictors.shape[1]
    iterations = 100

    aic_best = []

    # Initialize genotype
    current_gen = np.random.binomial(1, 0.5, pop_size * ncols).reshape(
        (pop_size, ncols))

    for _ in range(iterations):

        # Get phenotype
        current_phe = [
            predictors[predictors.columns[g.astype(bool)]] for g in current_gen
        ]
        # Calculate AIC
        current_aic = np.array(
            [aic(pd.ols(y=logsalary, x=x)) for x in current_phe])
        # Get lowest AIC
        aic_best.append(current_aic[np.argmin(current_aic)])

        # Calculate fitness according to AIC rank
        fitness = calculate_fitness(current_aic)

        # Choose first parents according to fitness
        moms = np.random.choice(range(pop_size),
                                size=int(pop_size / 2),
                                p=fitness)
        # Choose second parents randomly
        dads = np.random.choice(range(pop_size), size=int(pop_size / 2))

        next_gen = []
        for mom, dad in zip(current_gen[moms], current_gen[dads]):
            # Crossover (np.r_ builds new arrays, parents stay untouched)
            cross = np.random.randint(0, ncols)
            children = (np.r_[mom[:cross], dad[cross:]],
                        np.r_[dad[:cross], mom[cross:]])
            # Mutate: each child gets its own boolean mask and flips its
            # own genes.  The second child used to be indexed with an
            # integer 0/1 array and overwritten with values of the first.
            for child in children:
                flip = np.random.binomial(1, mutation_rate,
                                          size=ncols).astype(bool)
                child[flip] = 1 - child[flip]
            next_gen.extend(children)

        # Increment generation
        current_gen = np.array(next_gen)
    return aic_best
Ejemplo n.º 32
0
def annualBeta():
    '''
    exercise.
    This function was in the handouts and looked very interesting.
    So I implemented it to see the inner workings.
    Purpose of this function is to calculate beta of a stock.

    Downloads ORCL and ^GSPC quotes for 1990-2014, regresses the ORCL
    returns on the ^GSPC returns separately for every calendar year,
    prints each (year, beta) pair and plots the betas.
    :return: None
    '''
    def ret_f(ticker, startDate, endDate):
        # simple returns from consecutive adjusted closes
        p = finance.quotes_historical_yahoo(ticker,
                                            startDate,
                                            endDate,
                                            asobject=True,
                                            adjusted=True)
        return ((p.aclose[1:] - p.aclose[0:-1]) / p.aclose[:-1])

    startDate = (1990, 1, 1)
    endDate = (2014, 12, 31)

    # Pandas Series for Oracle's Data
    y0 = pd.Series(ret_f('ORCL', startDate, endDate))

    # Pandas Series for S&P500 Data
    x0 = pd.Series(ret_f('^GSPC', startDate, endDate))

    # Historical Date values of S&P500
    dateVal = finance.quotes_historical_yahoo('^GSPC',
                                              startDate,
                                              endDate,
                                              asobject=True,
                                              adjusted=True).date[0:-1]
    lag_year = dateVal[0].strftime("%Y")
    y1, x1, beta = [], [], []

    def record_beta(year, ys, xs):
        # fit one year's worth of returns and remember its beta
        model = pd.ols(y=pd.Series(ys), x=pd.Series(xs))
        print(year, round(model.beta[0], 4))
        beta.append(model.beta[0])

    # Calculate Beta for each year
    for i in range(len(dateVal)):
        year = dateVal[i].strftime("%Y")
        if year != lag_year:
            record_beta(lag_year, y1, x1)
            x1, y1 = [], []
            lag_year = year
        # Every observation is kept, including the one on which the year
        # changes (it used to be discarded, as was observation 0).
        x1.append(x0[i])
        y1.append(y0[i])
    # The final year has no following year to trigger it; flush it here.
    if x1:
        record_beta(lag_year, y1, x1)

    # Plot the main graph
    plt.plot(beta, c='firebrick', label='ORCL Beta w.r.t S&P500')
    plt.hlines(y=1,
               xmin=0,
               xmax=25,
               label='Perfect Correlation',
               lw=2,
               color='steelblue')
    plt.legend()
    plt.show()
def linearfunction(x,y,name='linear rating'):
    """
    Fit y on x with pd.ols and return the fit statistics as a one-row frame.

    x and y are combined into one frame; rows with a missing value and rows
    where either value is negative are discarded before fitting.

    Returns a DataFrame indexed by [name] with columns 'a' (beta[1]),
    'b' (beta[0]), 'r2', 'rmse', 'pearson' and 'spearman' (the first
    element returned by pearson_r / spearman_r respectively).
    """
    # pair x with y and drop rows where either one is missing
    paired = pd.DataFrame.from_dict({'x':x,'y':y}).dropna()
    # entries below zero become NaN under the mask and are then dropped
    nonneg = paired[paired>=0].dropna()
    xs = nonneg['x']
    ys = nonneg['y']
    fit = pd.ols(y=ys,x=xs)
    pearson = pearson_r(xs,ys)[0]
    spearman = spearman_r(xs,ys)[0]
    stats = {'a':[fit.beta[1]],
             'b':[fit.beta[0]],
             'r2':[fit.r2],
             'rmse':[fit.rmse],
             'pearson':[pearson],
             'spearman':[spearman]}
    return pd.DataFrame(stats,index=[name])
Ejemplo n.º 34
0
 def regress(df, index=pd.Index((u'slope', u'intercept'))):
     """
     Regress 'signal' on 'seed_cell_number_ml' and return the coefficients.

     df is sorted by 'seed_cell_number_ml' and its first two rows are left
     out of the fit.  The coefficient Series of the ordinary-least-squares
     fit is relabelled with `index` before being returned.
     """
     x_name, y_name = u'seed_cell_number_ml', u'signal'
     ordered = df.sort(columns=[x_name], axis=0)
     # skip the two rows with the smallest x values
     xs = ordered[x_name][2:]
     ys = ordered[y_name][2:]
     coeffs = pd.ols(x=xs, y=ys).beta
     coeffs.index = index
     return coeffs
Ejemplo n.º 35
0
    def test_solve_rect(self):
        """pmath.solve(self.frame, b) must agree with the no-intercept
        OLS coefficients of b on self.frame."""
        if not _have_statsmodels:
            raise nose.SkipTest("no statsmodels")

        b = Series(np.random.randn(N), self.frame.index)
        result = pmath.solve(self.frame, b)
        expected = ols(y=b, x=self.frame, intercept=False).beta
        # assertTrue instead of the deprecated assert_ alias, which was
        # removed from unittest in Python 3.12
        self.assertTrue(np.allclose(result, expected))
Ejemplo n.º 36
0
def find_trend(series):
    """
    Return the first OLS coefficient of `series` regressed on 0..n-1.

    The regressor is a fresh Series of the integers 0..len(series)-1 with
    a default index.
    NOTE(review): the regressor does not reuse series.index -- confirm
    that callers pass a series whose index lines up with 0..n-1.
    """
    import numpy as np

    positions = np.arange(len(series))
    fit = pd.ols(y=series, x=pd.Series(positions))
    return fit.beta[0]
Ejemplo n.º 37
0
Archivo: hedge.py Proyecto: huabxu/tssa
def calc_port_beta(port, mkt, window=20, min_periods=15):
    '''
    Rolling-window regression of portfolio returns on market returns.

    port and mkt are Series of portfolio and market returns; window and
    min_periods are handed to pd.ols unchanged.  An intercept is included.

    Returns the pair (intercept, x) taken from the fitted beta, i.e. the
    intercept estimates first and the market betas second.
    '''
    rolling_fit = pd.ols(y=port,
                         x=mkt,
                         intercept=True,
                         window_type='rolling',
                         window=window,
                         min_periods=min_periods)
    alphas = rolling_fit.beta.intercept
    betas = rolling_fit.beta.x
    return alphas, betas
Ejemplo n.º 38
0
def sig_MMR(data):
    """
    Extract the ratios of MMR columns that are significant predictors of
    'IsBadBuy'.

    For every pair of MMR price columns the ratio first / second is
    formed.  A ratio passes the rough cut when its single-regressor fit
    against 'IsBadBuy' has a p-value below .05.  The survivors are then
    regressed jointly, and every ratio whose p-value exceeds 1e-3 is
    dropped; this repeats until no ratio is dropped.

    ARGS:

        data: :class:`pandas.DataFrame` of the lemon training data

    RETURNS:

        :class:`pandas.DataFrame` of the significant MMR pairings; each
        column is named 'i,j' after the positions of the two MMR columns
        (empty when no pairing is retained)
    """
    cols = [
        'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice',
        'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice',
        'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice',
        'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice'
    ]

    to_use = {}
    pairs = itertools.combinations(range(len(cols)), 2)
    # go through and construct a rough cut at the 5% significance level
    for x, y in pairs:
        xs = data[cols[x]].div(data[cols[y]])
        # infinite ratios are treated as missing
        xs[numpy.isinf(xs)] = numpy.nan
        fit = pandas.ols(x=xs, y=data['IsBadBuy'])
        if fit.p_value['x'] < .05:
            to_use[str(x) + ',' + str(y)] = xs

    is_sig = 1e-3
    while True:
        # now trim down to the most parsimonious model
        mmr_df = pandas.DataFrame(to_use)
        if not to_use:
            # nothing left to regress on
            break
        fit = pandas.ols(x=mmr_df, y=data['IsBadBuy'])
        weak = fit.p_value[fit.p_value > is_sig].index
        droppable = [name for name in weak if name in to_use]
        if len(droppable) < len(weak):
            # a weak term that is not one of our ratios (the intercept)
            print("Intercept not significant")
        if not droppable:
            # Stop here: previously a lone insignificant intercept left
            # to_use unchanged and the loop never terminated.
            break
        for name in droppable:
            del to_use[name]
    return mmr_df
Ejemplo n.º 39
0
 def reg(df, index=pd.Index(('coeff', 'intercept'))):
     """
     Fit 'signal' against 'seed_cell_number_ml' by ordinary least squares.

     Rows are ordered by 'seed_cell_number_ml' and the first two are
     excluded from the fit.  Returns the fitted beta Series with its
     labels replaced by `index`.
     """
     predictor, response = 'seed_cell_number_ml', 'signal'
     by_x = df.sort(columns=[predictor], axis=0)
     # drop the two leading rows before fitting
     fit = pd.ols(x=by_x[predictor][2:], y=by_x[response][2:])
     coefficients = fit.beta
     coefficients.index = index
     return coefficients
Ejemplo n.º 40
0
def find_trend(series):
    """
    Regress `series` on the integer sequence 0, 1, ..., n-1 with pd.ols
    and return element 0 of the fitted beta.

    NOTE(review): the integer regressor carries a default index rather
    than series.index -- confirm callers rely on that alignment.
    """
    import numpy as np

    n_obs = len(series)
    time_axis = pd.Series(np.arange(n_obs))
    return pd.ols(y=series, x=time_axis).beta[0]
Ejemplo n.º 41
0
    def test_solve_rect(self):
        """pmath.solve(self.frame, b) must agree with the no-intercept
        OLS coefficients of b on self.frame."""
        if not _have_statsmodels:
            raise nose.SkipTest

        b = Series(np.random.randn(N), self.frame.index)
        result = pmath.solve(self.frame, b)
        expected = ols(y=b, x=self.frame, intercept=False).beta
        # assertTrue instead of the deprecated assert_ alias, which was
        # removed from unittest in Python 3.12
        self.assertTrue(np.allclose(result, expected))
Ejemplo n.º 42
0
def sig_veh_buy(data):
    """
    Extract the MMR columns that are valuable when compared to the
    VehBCost (i.e. what someone paid at the auction)

    Each MMR price column is turned into the ratio VehBCost / column.  A
    ratio passes the rough cut when its single-regressor fit against
    'IsBadBuy' has a p-value below .05.  The survivors are then regressed
    jointly, and every ratio whose p-value exceeds 1e-3 is dropped; this
    repeats until no ratio is dropped.

    ARGS:

        data: :class:`pandas.DataFrame` of the lemon training data

    RETURNS:

        :class:`pandas.DataFrame` holding the retained VehBCost / MMR
        ratio columns (empty when no ratio is retained)
    """

    cols = ['MMRAcquisitionAuctionAveragePrice','MMRAcquisitionAuctionCleanPrice',
            'MMRAcquisitionRetailAveragePrice','MMRAcquisitonRetailCleanPrice',
            'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice',
            'MMRCurrentRetailAveragePrice','MMRCurrentRetailCleanPrice']

    to_use = {}
    # go through and construct a rough cut at the 5% significance level
    for col in cols:
        xs = data['VehBCost'].div(data[col])
        # infinite ratios are treated as missing
        xs[numpy.isinf(xs)] = numpy.nan
        fit = pandas.ols(x = xs, y = data['IsBadBuy'])
        if fit.p_value['x'] < .05:
            to_use[col] = xs

    is_sig = 1e-3
    while True:
        # now trim down to the most parsimonious model
        buy_df = pandas.DataFrame(to_use)
        if not to_use:
            # nothing left to regress on
            break
        fit = pandas.ols(x = buy_df, y = data['IsBadBuy'])
        weak = fit.p_value[fit.p_value > is_sig].index
        droppable = [name for name in weak if name in to_use]
        if len(droppable) < len(weak):
            # a weak term that is not one of our ratios (the intercept)
            print("Intercept not significant")
        if not droppable:
            # Stop here: previously a lone insignificant intercept left
            # to_use unchanged and the loop never terminated.
            break
        for name in droppable:
            del to_use[name]
    return buy_df
Ejemplo n.º 43
0
def test_r2_adj(man_calcs, prices):
    """analyze.r2_adj must match the adjusted R-squared that pandas.ols
    reports for VGTSX log returns regressed on S&P 500 log returns."""
    log_rets = analyze.log_returns(prices).dropna()
    benchmark = log_rets['S&P 500']
    fund = log_rets['VGTSX']

    # reference value from pandas
    pandas_rsq = pandas.ols(x = benchmark, y = fund).r2_adj
    # value under test
    analyze_rsq = analyze.r2_adj(benchmark = benchmark, series = fund)

    testing.assert_almost_equal(pandas_rsq, analyze_rsq)
Ejemplo n.º 44
0
    def test_solve_rect(self):
        """pmath.solve(self.frame, rhs) must agree with the no-intercept
        OLS coefficients; the ols call is expected to emit a
        FutureWarning."""
        if not _have_statsmodels:
            raise nose.SkipTest("no statsmodels")

        rhs = Series(np.random.randn(N), index=self.frame.index)
        solved = pmath.solve(self.frame, rhs)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            model = ols(y=rhs, x=self.frame, intercept=False)
            reference = model.beta
        matches = np.allclose(solved, reference)
        self.assertTrue(matches)
Ejemplo n.º 45
0
def get_rolling_beta(my_y, my_x, my_window=252):
    """
    Run an OLS regression of my_y on my_x over a rolling window and
    return the series of beta coefficients on x.

    my_y      -- your portfolio or asset
    my_x      -- SPX
    my_window -- # days rolling period
    """
    rolling_fit = pd.ols(y=my_y, x=my_x, window=my_window)
    coefficients = rolling_fit.beta
    return coefficients.x
Ejemplo n.º 46
0
def regression_line():
    '''Fit a line, using the powerful "ordinary least square" method of pandas

    Loads 'altman_11_6.txt' through getData, regresses 'Vcf' on 'glucose'
    and prints the model summary.  Returns None.
    '''

    # Get the data
    data = getData('altman_11_6.txt')

    df = pd.DataFrame(data, columns=['glucose', 'Vcf'])
    model = pd.ols(y=df['Vcf'], x=df['glucose'])
    # call form: same output on Python 2, and valid syntax on Python 3
    print(model.summary)
Ejemplo n.º 47
0
def cv_ols(y, x, k=10):
    """
    k-fold cross-validated RMSE of an OLS fit of y on x.

    For every fold produced by cross_validation.KFold(len(y), k) the model
    is fitted on the training rows and its predictions for the held-out
    rows are scored with the root mean squared error.

    :param y: response values, indexable by the fold index arrays
    :param x: regressors, indexable through .loc by the fold index arrays
    :param k: number of folds
    :return: mean of the per-fold RMSE values
    """
    kfold = cross_validation.KFold(len(y), k)
    rmse = []
    for trIdx, teIdx in kfold:
        result = pd.ols(y=y[trIdx], x=x.loc[trIdx])
        predictions = result.predict(result.beta, x.loc[teIdx])
        rmse.append(sqrt(mean_squared_error(y[teIdx], predictions)))
    # rmse is a plain list, which has no .mean(); average it explicitly
    return sum(rmse) / len(rmse)
Ejemplo n.º 48
0
    def test_solve_rect(self):
        """Compare pmath.solve against the coefficients of a no-intercept
        ols fit, which is expected to raise a FutureWarning."""
        if not _have_statsmodels:
            raise nose.SkipTest("no statsmodels")

        target = Series(np.random.randn(N), index=self.frame.index)
        actual = pmath.solve(self.frame, target)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            fitted = ols(y=target, x=self.frame, intercept=False)
            wanted = fitted.beta
        self.assertTrue(np.allclose(actual, wanted))
Ejemplo n.º 49
0
def degree_day_regression(df, x_opt='both'):
    '''
    Run the weather normalization regression on energy use data.

    df: dataframe that includes
        use per day (use_per_day)
        heating degree days per day (hdd_per_day)
        cooling degree days per day (cdd_per_day)

    x_opt: which regressors to use
        'hdd': heating degree days only
        'cdd': cooling degree days only
        'both' (default): cooling and heating degree days

    Returns a one-row DataFrame with columns 'intercept',
    'intercept_std_err', then '<NAME>' / '<NAME>_std_err' for each
    regressor, then 'R2', 'R2_adj' and 'N_reads'.  Any other x_opt value
    returns None.
    '''
    # (label, df attribute) per regressor, in output-column order
    if x_opt == 'hdd':
        terms = [('HDD', 'hdd_per_day')]
    elif x_opt == 'cdd':
        terms = [('CDD', 'cdd_per_day')]
    elif x_opt == 'both':
        terms = [('CDD', 'cdd_per_day'), ('HDD', 'hdd_per_day')]
    else:
        return None

    covar = dict((label, getattr(df, attr)) for label, attr in terms)
    results = pd.ols(y=df.use_per_day, x=covar)

    # coefficients are read by position: regressors first, intercept last
    intercept_pos = len(terms)
    row = [results.beta[intercept_pos], results.std_err[intercept_pos]]
    columns = ['intercept', 'intercept_std_err']
    for pos, (label, _) in enumerate(terms):
        row.extend([results.beta[pos], results.std_err[pos]])
        columns.extend([label, label + '_std_err'])
    row.extend([results.r2, results.r2_adj, results.nobs])
    columns.extend(['R2', 'R2_adj', 'N_reads'])

    return pd.DataFrame([row], columns=columns)
Ejemplo n.º 50
0
def sig_MMR(data):
    """
    Extract the ratios of MMR columns that are significant predictors of
    'IsBadBuy'.

    For every pair of MMR price columns the ratio first / second is
    formed.  A ratio passes the rough cut when its single-regressor fit
    against 'IsBadBuy' has a p-value below .05.  The survivors are then
    regressed jointly, and every ratio whose p-value exceeds 1e-3 is
    dropped; this repeats until no ratio is dropped.

    ARGS:

        data: :class:`pandas.DataFrame` of the lemon training data

    RETURNS:

        :class:`pandas.DataFrame` of the significant MMR pairings; each
        column is named 'i,j' after the positions of the two MMR columns
        (empty when no pairing is retained)
    """
    cols = ['MMRAcquisitionAuctionAveragePrice','MMRAcquisitionAuctionCleanPrice',
            'MMRAcquisitionRetailAveragePrice','MMRAcquisitonRetailCleanPrice',
            'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice',
            'MMRCurrentRetailAveragePrice','MMRCurrentRetailCleanPrice']

    to_use = {}
    pairs = itertools.combinations(range(len(cols)), 2)
    # go through and construct a rough cut at the 5% significance level
    for x, y in pairs:
        xs = data[cols[x]].div(data[cols[y]])
        # infinite ratios are treated as missing
        xs[numpy.isinf(xs)] = numpy.nan
        fit = pandas.ols(x = xs, y = data['IsBadBuy'])
        if fit.p_value['x'] < .05:
            to_use[str(x) + ',' + str(y)] = xs

    is_sig = 1e-3
    while True:
        # now trim down to the most parsimonious model
        mmr_df = pandas.DataFrame(to_use)
        if not to_use:
            # nothing left to regress on
            break
        fit = pandas.ols(x = mmr_df, y = data['IsBadBuy'])
        weak = fit.p_value[fit.p_value > is_sig].index
        droppable = [name for name in weak if name in to_use]
        if len(droppable) < len(weak):
            # a weak term that is not one of our ratios (the intercept)
            print("Intercept not significant")
        if not droppable:
            # Stop here: previously a lone insignificant intercept left
            # to_use unchanged and the loop never terminated.
            break
        for name in droppable:
            del to_use[name]
    return mmr_df
Ejemplo n.º 51
0
    def plots(self, temp, city_code, business_class, per_meter):
        '''
        Regress measured usage on temperature, separately below and at or
        above self.basetemp, and assemble the data needed for plotting.

        :param temp: temperature time series to regress to
        :param city_code: 'PRINCETON TWP' or 'PRINCETON BORO'
        :param business_class: a single business_class or a collection thereof
            - if a collection is given, the values are summed up
        :param per_meter: True for consumption per meter, False for aggregate
        :return: tuple (vs_temp, vs_time); vs_temp is a frame indexed by
            temperature holding the 'scatter', 'heating regression' and
            'cooling regression' columns, vs_time is the selected usage data
        '''
        usage = self.select(city_code, business_class, per_meter)

        # Restrict the temperatures to the dates of self.usage and split
        # them around the base temperature:
        #   cool => samples strictly below self.basetemp
        #   warm => samples at or above self.basetemp
        temp = temp.ix[self.usage.index]['temp']
        cool = temp[temp < self.basetemp]
        warm = temp[temp >= self.basetemp]

        # one regression per temperature regime
        cool_fit = ps.ols(x=cool, y=usage[cool.index])
        m_cool, c_cool = cool_fit.beta.x, cool_fit.beta.intercept
        warm_fit = ps.ols(x=warm, y=usage[warm.index])
        m_warm, c_warm = warm_fit.beta.x, warm_fit.beta.intercept

        # outer-join scatter data and both regression lines on temperature
        join_opts = dict(on='temp', how='outer', suffixes=('', ''))
        vs_temp = ps.DataFrame({'temp': temp, 'scatter': usage[temp.index]})
        heating = ps.DataFrame(
            {'temp': cool, 'heating regression': line(cool, m_cool, c_cool)})
        vs_temp = vs_temp.merge(heating, **join_opts)
        cooling = ps.DataFrame(
            {'temp': warm, 'cooling regression': line(warm, m_warm, c_warm)})
        vs_temp = vs_temp.merge(cooling, **join_opts)

        return vs_temp.set_index('temp'), usage
Ejemplo n.º 52
0
    def equations(self):
        """Fit one pn.ols model per column of self.y against self.x.

        Every fit uses the instance's window, window type and minimum
        period settings.  Returns a dict mapping column name to model.
        """
        return dict(
            (name, pn.ols(y=column,
                          x=self.x,
                          window=self._window,
                          window_type=self._window_type,
                          min_periods=self._min_periods))
            for name, column in self.y.iteritems())
 def cointegration_test(self):
     """Cointegration test, carried out mainly by means of an ADF test.

     Regresses contract1_Close on contract2_Close (with intercept),
     builds the spread contract1 - beta * contract2, runs sts.adfuller on
     it with a second argument of 1 and prints the result.  Does nothing
     when either price series is empty.
     """
     # Without data there is nothing to test.  The final print used to run
     # even in this case and failed because `sta` was never assigned.
     if len(self.contract1_Close) == 0 or len(self.contract2_Close) == 0:
         return
     model = pd.ols(y=self.contract1_Close, x=self.contract2_Close, intercept=True)
     # spread series
     spread = self.contract1_Close - self.contract2_Close * model.beta["x"]
     # remove missing data
     spread = spread.dropna()
     # run the ADF test
     sta = sts.adfuller(spread, 1)
     print(sta)
Ejemplo n.º 54
0
def year_based_significance_regression(file_path):
    """
    For every year, fit a random forest on the regressors that a
    multivariate OLS finds significant, and score it in and out of sample.

    The data are split once with create_in_out_samples.  Per year, an OLS
    of salary on all regressors except 'yearID' is run on the in-sample
    rows; regressors with p-value below .01 (the intercept excluded) are
    then fed to a 15-tree RandomForestRegressor.

    Args:
    ------
    - file_path: string of the location of `baseball.csv`

    Returns:
    ---------
    - pandas.DataFrame with one row per year and the entries 'is-r2'
      (in-sample score), 'os-r2' (out-of-sample score) and 'mae'
      (sum of absolute out-of-sample errors divided by n - 2)
    """

    data = pandas.DataFrame.from_csv(file_path, index_col = None)
    data['age'] = data['yearID'] - data['birthYear']
    cols = ['G_batting', 'AB', 'R', 'H', 'X2B', 'X3B', 'HR', 'RBI','SB',
            'CS', 'BB', 'SO', 'IBB', 'HBP', 'SH', 'SF', 'GIDP', 'teamID',
            'salary', 'yearID', 'age']

    all_data = data[cols].copy()
    all_data.dropna(inplace = True)
    teams = pandas.get_dummies(all_data['teamID'])
    # every column except the team label and the response is a regressor
    is_regressor = [name not in ['teamID', 'salary']
                    for name in all_data.columns]
    x_cols = all_data.columns[is_regressor]
    xs = all_data[x_cols].join(teams)
    ys = all_data['salary']
    N = xs.shape[0]
    isi, in_sample, osi, out_sample = create_in_out_samples(xs, N // 2)

    per_year = {}
    for year in all_data['yearID'].unique():
        features = in_sample.columns.drop('yearID')
        in_rows = in_sample['yearID'] == year
        out_rows = out_sample['yearID'] == year
        y_in = ys[isi][in_rows]

        # significance screen on the in-sample rows of this year
        fit = pandas.ols(x = in_sample.loc[in_rows, features], y = y_in)
        summary = fit.summary_as_matrix
        p_values = summary.loc['p-value', :]
        significant = p_values[p_values < .01].index
        if 'intercept' in significant:
            significant = significant.drop('intercept')

        x_in = in_sample.loc[in_rows, significant]
        forest = ensemble.RandomForestRegressor(n_estimators = 15)
        forest.fit(x_in, y_in)

        scores = {}
        scores['is-r2'] = forest.score(x_in, y_in)
        x_out = out_sample.loc[out_rows, significant]
        y_out = ys[osi][out_rows]
        scores['os-r2'] = forest.score(x_out, y_out)
        errors = y_out.sub(forest.predict(x_out))
        scores['mae'] = errors.abs().sum()/(len(y_out) - 2.)

        per_year[year] = pandas.Series(scores)

    return pandas.DataFrame(per_year).transpose()
Ejemplo n.º 55
0
def simAnnealing(periods,cooling,tau_start,numChangeNeigh):
    """
    Simulated-annealing variable selection for the baseball salary data.

    A solution is a boolean mask over the predictor columns and is scored
    with aic() of the OLS fit of log salary on the selected columns.  Each
    step flips numChangeNeigh distinct entries of the current mask; the
    proposal is accepted if it lowers the AIC, or otherwise with
    probability exp((previous AIC - proposed AIC) / tau).

    :param periods: number of temperature stages
    :param cooling: sequence giving the number of steps in each stage
    :param tau_start: initial temperature; stage j uses tau_start * 0.9**j
    :param numChangeNeigh: number of mask entries flipped per proposal
    :return: (aic_values, aic_best, solution_best) -- the AIC after every
        step, the lowest AIC seen, and the mask that achieved it
    """
    baseball = pd.read_table("/Users/Ahmet/Box Sync/Classes/Vanderbilt/AdvancedStatisticalComputing/Bios8366/data/textbook/baseball.dat", sep=r'\s+')
    predictors = baseball.copy()
    logsalary = predictors.pop('salary').apply(np.log)

    ncols = predictors.shape[1]

    aic_values = []
    solution_current = np.random.binomial(1, 0.5, ncols).astype(bool)
    # Independent copy: solution_current is flipped in place below, and
    # sharing one array let those flips overwrite the recorded best mask.
    solution_best = solution_current.copy()
    solution_vars = predictors[predictors.columns[solution_current]]
    g = pd.ols(y=logsalary, x=solution_vars)
    aic_best = aic(g)
    aic_values.append(aic_best)

    # Cooling schedule
    tau = [tau_start * 0.9**i for i in range(periods)]
    for j in range(periods):

        for _ in range(cooling[j]):

            # Random change n-neighborhood (indices are distinct)
            flip = np.random.choice(ncols,numChangeNeigh,replace = False)
            solution_current[flip] = ~solution_current[flip]
            solution_vars = predictors[predictors.columns[solution_current]]
            g = pd.ols(y=logsalary, x=solution_vars)
            aic_step = aic(g)
            alpha = min(1, np.exp((aic_values[-1] - aic_step)/tau[j]))

            if ((aic_step < aic_values[-1]) or (np.random.uniform() < alpha)):
                # Accept proposed solution
                aic_values.append(aic_step)
                if aic_step < aic_best:
                    # Replace previous best with this one
                    aic_best = aic_step
                    solution_best = solution_current.copy()
            else:
                # Revert solution
                solution_current[flip] = ~solution_current[flip]
                aic_values.append(aic_values[-1])
    return aic_values,aic_best,solution_best
Ejemplo n.º 56
0
def regression_line():
    """Fit 'Vcf' against 'glucose' with pandas' ordinary least squares.

    Loads the Altman data set through getData, prints the model summary
    and returns the model's "f-stat" entry.
    """

    # Load the data and label its two columns
    raw = getData("altman_11_6.txt", subDir=r"..\Data\data_altman")
    frame = pd.DataFrame(raw, columns=["glucose", "Vcf"])

    fit = pd.ols(y=frame["Vcf"], x=frame["glucose"])
    print(fit.summary)

    f_statistic = fit.f_stat["f-stat"]  # should be 4.4140184331462571
    return f_statistic
Ejemplo n.º 57
0
    def equations(self):
        """Return a dict mapping each column of self.y to its pd.ols fit
        against self.x, using the instance's window settings."""
        # TODO: Remove in favor of statsmodels implementation
        fitted = {}
        for name, column in iteritems(self.y):
            fitted[name] = pd.ols(y=column,
                                  x=self.x,
                                  window=self._window,
                                  window_type=self._window_type,
                                  min_periods=self._min_periods)
        return fitted