Example #1
    def run(self, results_x, results_z, attach=True):
        '''see class docstring (for now)'''
        if not np.allclose(results_x.model.endog, results_z.model.endog):
            raise ValueError('endogenous variables in models are not the same')
        nobs = results_x.model.endog.shape[0]
        x = results_x.model.exog
        z = results_z.model.exog
        sigma2_x = results_x.ssr / nobs
        sigma2_z = results_z.ssr / nobs
        yhat_x = results_x.fittedvalues
        yhat_z = results_z.fittedvalues
        res_dx = sm.OLS(yhat_x, z).fit()
        err_zx = res_dx.resid
        res_xzx = sm.OLS(err_zx, x).fit()
        err_xzx = res_xzx.resid

        sigma2_zx = sigma2_x + np.dot(err_zx.T, err_zx) / nobs
        c01 = nobs / 2. * (np.log(sigma2_z) - np.log(sigma2_zx))
        v01 = sigma2_x * np.dot(err_xzx.T, err_xzx) / sigma2_zx**2
        q = c01 / np.sqrt(v01)
        pval = 2 * stats.norm.sf(np.abs(q))

        if attach:
            self.res_dx = res_dx
            self.res_xzx = res_xzx
            self.c01 = c01
            self.v01 = v01
            self.q = q
            self.pvalue = pval
            self.dist = stats.norm

        return q, pval
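
A minimal usage sketch for the method above, assuming it lives on a comparison class such as the sandbox's CompareCox (the class name is an assumption, not shown in this snippet):

import numpy as np
import scikits.statsmodels.api as sm

np.random.seed(12345)
nobs = 100
x = sm.add_constant(np.random.randn(nobs, 2))  # regressors of the first model
z = sm.add_constant(np.random.randn(nobs, 2))  # regressors of the competing model
y = x.sum(1) + 0.5 * np.random.randn(nobs)     # shared endog, as run() requires
res_x = sm.OLS(y, x).fit()
res_z = sm.OLS(y, z).fit()
q, pval = CompareCox().run(res_x, res_z)  # CompareCox is an assumed wrapper class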
Example #2
def het_white(y, x, retres=False):
    '''Lagrange Multiplier Heteroscedasticity Test by White

    Notes
    -----
    Assumes x contains a constant (needed for counting the degrees of freedom).

    Open question: does the F-statistic make sense here, with the constant?

    References
    ----------
    Greene, section 11.4.1, 5th edition, p. 222
    '''
    x = np.asarray(x)
    y = np.asarray(y)**2
    if x.ndim == 1:
        raise ValueError(
            'x should have constant and at least one more variable')
    nobs, nvars0 = x.shape
    i0, i1 = np.triu_indices(nvars0)
    exog = x[:, i0] * x[:, i1]
    nobs, nvars = exog.shape
    assert nvars == nvars0 * (nvars0 - 1) / 2. + nvars0
    resols = sm.OLS(y, exog).fit()  # y was already squared above
    fval = resols.fvalue
    fpval = resols.f_pvalue
    lm = nobs * resols.rsquared
    # Note: degrees of freedom for LM test is nvars minus constant
    lmpval = stats.chi2.sf(lm, nvars - 1)
    return lm, lmpval, fval, fpval
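
A hedged usage sketch for het_white: fit OLS first, then pass the residuals together with the regressor matrix (which must include its constant column):

import numpy as np
import scikits.statsmodels.api as sm

np.random.seed(12345)
nobs = 200
x = np.column_stack((np.ones(nobs), np.random.randn(nobs)))
# error variance shifts with the regressor, i.e. heteroscedastic by design
y = x.sum(1) + (1 + (x[:, 1] > 0)) * np.random.randn(nobs)
resid = sm.OLS(y, x).fit().resid
lm, lmpval, fval, fpval = het_white(resid, x)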
Example #3
    def __init__(self,
                 y,
                 x,
                 intercept=True,
                 weights=None,
                 nw_lags=None,
                 nw_overlap=False):
        import scikits.statsmodels.api as sm
        self._x_orig = x
        self._y_orig = y
        self._weights_orig = weights
        self._intercept = intercept
        self._nw_lags = nw_lags
        self._nw_overlap = nw_overlap

        (self._y, self._x, self._weights, self._x_filtered, self._index,
         self._time_has_obs) = self._prepare_data()

        if self._weights is not None:
            self._x_trans = self._x.mul(np.sqrt(self._weights), axis=0)
            self._y_trans = self._y * np.sqrt(self._weights)
            self.sm_ols = sm.WLS(self._y.values,
                                 self._x.values,
                                 weights=self._weights.values).fit()
        else:
            self._x_trans = self._x
            self._y_trans = self._y
            self.sm_ols = sm.OLS(self._y.values, self._x.values).fit()
Example #4
    def run(self, results_x, results_z, attach=True):
        '''see class docstring (for now)'''
        if not np.allclose(results_x.model.endog, results_z.model.endog):
            raise ValueError('endogenous variables in models are not the same')
        nobs = results_x.model.endog.shape[0]
        y = results_x.model.endog
        x = results_x.model.exog
        z = results_z.model.exog
        #sigma2_x = results_x.ssr/nobs
        #sigma2_z = results_z.ssr/nobs
        yhat_x = results_x.fittedvalues
        #yhat_z = results_z.fittedvalues
        res_zx = sm.OLS(y, np.column_stack((yhat_x, z))).fit()
        self.res_zx = res_zx  #for testing
        tstat = res_zx.tvalues[0]
        pval = res_zx.pvalues[0]
        if attach:
            self.res_zx = res_zx
            self.dist = stats.t(res_zx.model.df_resid)
            self.teststat = tstat
            self.pvalue = pval

        return tstat, pval
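
A parallel sketch for this J-test variant, again assuming a small wrapper class (called CompareJ here, by analogy with Example #1) holds the method:

import numpy as np
import scikits.statsmodels.api as sm

np.random.seed(12345)
nobs = 100
x = sm.add_constant(np.random.randn(nobs, 2))
z = sm.add_constant(np.random.randn(nobs, 2))
y = x.sum(1) + 0.5 * np.random.randn(nobs)
tstat, pval = CompareJ().run(sm.OLS(y, x).fit(), sm.OLS(y, z).fit())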
Example #5
    def checkOLS(self, exog, endog, x, y):

        try:
            import scikits.statsmodels.api as sm
        except ImportError:
            import scikits.statsmodels as sm

        reference = sm.OLS(endog, sm.add_constant(exog)).fit()

        result = ols(y=y, x=x)

        assert_almost_equal(reference.params, result._beta_raw)
        assert_almost_equal(reference.df_model, result._df_model_raw)
        assert_almost_equal(reference.df_resid, result._df_resid_raw)
        assert_almost_equal(reference.fvalue, result._f_stat_raw[0])
        assert_almost_equal(reference.pvalues, result._p_value_raw)
        assert_almost_equal(reference.rsquared, result._r2_raw)
        assert_almost_equal(reference.rsquared_adj, result._r2_adj_raw)
        assert_almost_equal(reference.resid, result._resid_raw)
        assert_almost_equal(reference.bse, result._std_err_raw)
        assert_almost_equal(reference.t(), result._t_stat_raw)
        assert_almost_equal(reference.cov_params(), result._var_beta_raw)
        assert_almost_equal(reference.fittedvalues, result._y_fitted_raw)

        _check_non_raw_results(result)
Example #6
    def test_qqplot(self):
        #just test that it runs
        data = sm.datasets.longley.load()
        data.exog = sm.add_constant(data.exog)
        mod_fit = sm.OLS(data.endog, data.exog).fit()
        res = mod_fit.resid
        fig = sm.qqplot(res)
        plt.close(fig)
Example #7
    def setup(self):
        nsample = 100
        sig = 0.5
        x1 = np.linspace(0, 20, nsample)
        x2 = 5 + 3 * np.random.randn(nsample)
        X = np.c_[x1, x2, np.sin(0.5 * x1), (x2 - 5)**2, np.ones(nsample)]
        beta = [0.5, 0.5, 1, -0.04, 5.]
        y_true = np.dot(X, beta)
        y = y_true + sig * np.random.normal(size=nsample)
        exog0 = sm.add_constant(np.c_[x1, x2], prepend=False)
        res = sm.OLS(y, exog0).fit()

        self.res = res
Example #8
def regression_analysis(play_arr, dataFunction1, dataFunction2):

    totalBefore = []
    totalAfter = []
    for weekNum in range(10, 15):
        Before, After = regression_weekly(play_arr, weekNum, dataFunction1,
                                          dataFunction2)

        totalBefore = np.concatenate([totalBefore, Before])
        totalAfter = np.concatenate([totalAfter, After])

    slope, intercept, r_value, p_value, err = stats.linregress(
        totalBefore, totalAfter)
    results = sm.OLS(totalAfter, sm.add_constant(totalBefore)).fit()

    print results.summary()

    plt.plot(totalBefore, totalAfter, '.')
    X_plot = np.linspace(0, 1, 100)
    plt.plot(X_plot, X_plot * results.params[0] + results.params[1])
    plt.show()
Example #9
    def checkOLS(self, exog, endog, x, y):
        reference = sm.OLS(endog, sm.add_constant(exog, prepend=False)).fit()
        result = ols(y=y, x=x)

        # check that sparse version is the same
        sparse_result = ols(y=y.to_sparse(), x=x.to_sparse())
        _compare_ols_results(result, sparse_result)

        assert_almost_equal(reference.params, result._beta_raw)
        assert_almost_equal(reference.df_model, result._df_model_raw)
        assert_almost_equal(reference.df_resid, result._df_resid_raw)
        assert_almost_equal(reference.fvalue, result._f_stat_raw[0])
        assert_almost_equal(reference.pvalues, result._p_value_raw)
        assert_almost_equal(reference.rsquared, result._r2_raw)
        assert_almost_equal(reference.rsquared_adj, result._r2_adj_raw)
        assert_almost_equal(reference.resid, result._resid_raw)
        assert_almost_equal(reference.bse, result._std_err_raw)
        assert_almost_equal(reference.tvalues, result._t_stat_raw)
        assert_almost_equal(reference.cov_params(), result._var_beta_raw)
        assert_almost_equal(reference.fittedvalues, result._y_fitted_raw)

        _check_non_raw_results(result)
Example #10
    def fit_fixed_nfact(self, nfact):
        if not hasattr(self, 'factors_wconst'):
            self.calc_factors()
        return sm.OLS(self.endog, self.factors[:, :nfact + 1]).fit()
Example #11
# OLS non-linear curve but linear in parameters
# ---------------------------------------------

nsample = 50
sig = 0.5
x1 = np.linspace(0, 20, nsample)
X = np.c_[x1, np.sin(x1), (x1 - 5)**2, np.ones(nsample)]
beta = [0.5, 0.5, -0.02, 5.]
y_true = np.dot(X, beta)
y = y_true + sig * np.random.normal(size=nsample)

plt.figure()
plt.plot(x1, y, 'o', x1, y_true, 'b-')

res = sm.OLS(y, X).fit()
print res.params
print res.bse
#current bug predict requires call to model.results
#print res.model.predict
prstd, iv_l, iv_u = wls_prediction_std(res)
plt.plot(x1, res.fittedvalues, 'r--.')
plt.plot(x1, iv_u, 'r--')
plt.plot(x1, iv_l, 'r--')
plt.title('blue: true,   red: OLS')

print res.summary()

#OLS with dummy variables
#------------------------
Example #12
sexdummy = data2dummy(dta_used[:, 1])
factors = ['sex']
for k in factors:
    varsused[k][0] = data2dummy(varsused[k][0])

products = [('sex', 'age')]
for k in products:
    varsused[''.join(k)] = data2proddummy(np.c_[varsused[k[0]][0],
                                                varsused[k[1]][0]])

# make dictionary of variables with dummies as one variable
#vars_to_use = {name: data or dummy variables}

X_b0 = np.c_[sexdummy, dta_used[:, 2], np.ones((dta_used.shape[0], 1))]
y_b0 = dta_used[:, 0]
res_b0 = sm.OLS(y_b0, X_b0).fit()
print res_b0.params
print res_b0.ssr

anova_str0 = '''
ANOVA statistics (model sum of squares excludes constant)
Source    DF  Sum Squares   Mean Square    F Value    Pr > F
Model     %(df_model)i        %(ess)f       %(mse_model)f   %(fvalue)f %(f_pvalue)f
Error     %(df_resid)i     %(ssr)f       %(mse_resid)f
CTotal    %(nobs)i    %(uncentered_tss)f     %(mse_total)f

R squared  %(rsquared)f
'''

anova_str = '''
ANOVA statistics (model sum of squares includes constant)
Example #13
if __name__ == '__main__':
    #A: josef-pktd

    import scikits.statsmodels.api as sm
    from scikits.statsmodels.api import OLS
    #from scikits.statsmodels.datasets.longley import load
    from scikits.statsmodels.datasets.stackloss import load
    from scikits.statsmodels.iolib.table import (SimpleTable, default_txt_fmt,
                                                 default_latex_fmt,
                                                 default_html_fmt)
    import numpy as np

    data = load()
    data.exog = sm.tools.add_constant(data.exog)

    resols = sm.OLS(data.endog, data.exog).fit()

    print '\n OLS leave 1 out'
    for inidx, outidx in cross_val.LeaveOneOut(len(data.endog)):
        res = sm.OLS(data.endog[inidx], data.exog[inidx, :]).fit()
        print data.endog[outidx], res.model.predict(res.params,
                                                    data.exog[outidx, :]),
        print data.endog[outidx] - res.model.predict(res.params,
                                                     data.exog[outidx, :])

    print '\n OLS leave 2 out'
    resparams = []
    for inidx, outidx in cross_val.LeavePOut(len(data.endog), 2):
        res = sm.OLS(data.endog[inidx], data.exog[inidx, :]).fit()
        #print data.endog[outidx], res.model.predict(data.exog[outidx,:]),
        #print ((data.endog[outidx] - res.model.predict(data.exog[outidx,:]))**2).sum()
Example #14
#0: no rescaling, 1:demean, 2:standardize, 3:standardize and transform back
rescale_ratio = data.endog.std() / data.exog.std(0)
if rescale > 0:
    # rescaling
    data.endog -= data.endog.mean()
    data.exog -= data.exog.mean(0)
if rescale > 1:
    data.endog *= 1. / data.endog.std()
    #data.exog *= 1000./data.exog.var(0)
    data.exog /= data.exog.std(0)
    #rescale_ratio = data.exog.var(0)/data.endog.var()

#skip because mean has been removed, but dimension is hardcoded in table
data.exog = sm.tools.add_constant(data.exog)

ols_model = sm.OLS(data.endog, data.exog)
ols_results = ols_model.fit()

# the Longley dataset is well known to have high multicollinearity
# one way to find the condition number is as follows

#Find OLS parameters for model with one explanatory variable dropped

resparams = np.nan * np.ones((7, 7))
res = sm.OLS(data.endog, data.exog).fit()
resparams[:, 0] = res.params

indall = range(7)
for i in range(6):
    ind = indall[:]
    del ind[i]
Example #15
def inverse_model(sensor, num_sensors, inputcloudinessbins, num_bins, angle):

    #Get zone orientation and time for which the zone faces direct sun
    directsun = facadeorientation.orientation()
    zone_orientation = directsun[1]
    starttime = math.floor(float(directsun[2]) + float(directsun[3]) / 60)
    endtime = math.ceil(float(directsun[5]) + float(directsun[6]) / 60)
    #Get data from database
    connection = sqlite3.connect('data.db')
    cursor = connection.cursor()
    #define dicts to save data
    win_daylight = dict()
    win_sunangle = dict()
    win_hours = dict()
    newwin_daylight = dict()
    sensors = num_sensors
    cloudinessbins = inputcloudinessbins
    bins = num_bins
    cloudy = [
        'Clear', 'Partly Cloudy', 'Scattered Clouds', 'Mostly Cloudy',
        'Light Rain', 'Rain', 'Overcast', 'Heavy Rain', 'Fog', 'Haze'
    ]  #the number of cloudiness categories could be user defined
    n = 0
    getcloudiness = []
    getclouds = []
    while n + len(cloudy) / int(cloudinessbins) <= len(cloudy):
        cloudset = cloudy[n:n + len(cloudy) / int(cloudinessbins)]
        getclouds.append(cloudset)
        n = n + len(cloudy) / int(cloudinessbins)
    #for sensor in range(2,int(sensors)+2,1): #get values for each sensor in the room
    table = 'light' + str(sensor)
    daylight = dict()
    worklight = dict()
    sunangle = dict()
    hours = dict()
    newdaylight = dict()
    newsunangle = dict()
    coeffam = dict()
    constantam = dict()
    rvalueam = dict()
    sunangles = dict()
    sunanglerange = dict()
    for clouds in range(len(getclouds)):  #for each level of cloudiness
        win_daylight[clouds] = []
        win_sunangle[clouds] = []
        win_hours[clouds] = []
        hours[clouds] = []
        newwin_daylight[clouds] = []
        newdaylight[clouds] = []
        newsunangle[clouds] = []
        daylight[clouds] = []
        sunangle[clouds] = []
        sunanglerange[clouds] = []
        sunangles[clouds] = []
        clouded = getclouds[clouds]
        getcloudiness.append(clouded)
        y1 = []
        y2 = []
        #print clouded
        for count in range(len(clouded)):
            cursor.execute(
                'SELECT altitude, hour FROM light1 WHERE unixtime>=1.362175776e+12 AND unixtime<=1.362729428e+12 AND hour>=%s AND hour<=%s AND cloudiness="%s"'
                % (starttime, endtime, clouded[count]))
            y1.append(cursor.fetchall())
        n = 0
        while n <= len(cloudy) / int(cloudinessbins) - 1:
            for count in y1[n]:
                altitude = float(count[0])
                hour = int(count[1])
                if altitude >= 0:
                    win_sunangle[clouds].append(altitude)
                    win_hours[clouds].append(hour)
            n += 1
        for count in range(len(clouded)):
            cursor.execute(
                'SELECT altitude, hour FROM %s WHERE unixtime>=1.362175776e+12 AND unixtime<=1.362729428e+12 AND hour>=%s AND hour<=%s AND cloudiness="%s"'
                % (table, starttime, endtime, clouded[count]))
            y2.append(cursor.fetchall())
        n = 0
        while n <= len(cloudy) / int(cloudinessbins) - 1:
            for count in y2[n]:
                altitude = float(count[0])
                hour = int(count[1])
                if altitude >= 0:
                    sunangle[clouds].append(altitude)
                    hours[clouds].append(hour)
            n += 1
        if len(sunangle[clouds]) > len(win_sunangle[clouds]):
            datalength = len(win_sunangle[clouds])
        else:
            datalength = len(sunangle[clouds])
        win_daylight[clouds] = dict()
        daylight[clouds] = dict()
        coeffam[clouds] = dict()
        constantam[clouds] = dict()
        rvalueam[clouds] = dict()
        if len(sunangle[clouds]) > 1 and len(win_sunangle[clouds]) > 1:
            if max(sunangle[clouds]) >= max(win_sunangle[clouds]):
                if min(sunangle[clouds]) >= min(win_sunangle[clouds]):
                    sunanglerange[clouds] = np.arange(
                        math.floor(min(sunangle[clouds])),
                        math.ceil(max(win_sunangle[clouds])), int(bins))
                else:
                    sunanglerange[clouds] = np.arange(
                        math.floor(min(win_sunangle[clouds])),
                        math.ceil(max(win_sunangle[clouds])), int(bins))
            else:
                if min(sunangle[clouds]) >= min(win_sunangle[clouds]):
                    sunanglerange[clouds] = np.arange(
                        math.floor(min(sunangle[clouds])),
                        math.ceil(max(sunangle[clouds])), int(bins))
                else:
                    sunanglerange[clouds] = np.arange(
                        math.floor(min(win_sunangle[clouds])),
                        math.ceil(max(sunangle[clouds])), int(bins))
        #return sunanglerange[clouds]
        y3 = dict()
        y4 = dict()
        #for angle in range(len(sunanglerange[clouds])-1):
        #angle is taken from the function argument instead of this loop
        y3[angle] = []
        for count in range(len(clouded)):
            cursor.execute(
                'SELECT light FROM light1 WHERE unixtime>=1.362175776e+12 AND unixtime<=1.362729428e+12 AND hour>=%s AND hour<=%s AND cloudiness="%s" AND altitude>=%s AND altitude<=%s'
                % (starttime, endtime, clouded[count],
                   sunanglerange[clouds][angle],
                   sunanglerange[clouds][angle + 1]))
            y3[angle].append(cursor.fetchall())
        #print sunanglerange[clouds][angle],y3[angle],clouded
        win_daylight[clouds][angle] = []
        n = 0
        while n <= len(cloudy) / int(cloudinessbins) - 1:
            for count in y3[angle][n]:
                if float(count[0]) > 1:
                    actualdaylight = round(float(count[0]), 2)
                    win_daylight[clouds][angle].append(actualdaylight)
            n += 1
        #print len(win_daylight[clouds][angle]),' ',clouded,' ',sunanglerange[clouds][angle]
    #for angle in range(len(sunanglerange[clouds])-1):
        y4[angle] = []
        for count in range(len(clouded)):
            cursor.execute(
                'SELECT light FROM %s WHERE unixtime>=1.362175776e+12 AND unixtime<=1.362729428e+12 AND hour>=%s AND hour<=%s AND cloudiness="%s" AND altitude>=%s AND altitude<=%s'
                % (table, starttime, endtime, clouded[count],
                   sunanglerange[clouds][angle],
                   sunanglerange[clouds][angle + 1]))
            y4[angle].append(cursor.fetchall())
        daylight[clouds][angle] = []
        coeffam[clouds][angle] = []
        constantam[clouds][angle] = []
        rvalueam[clouds][angle] = []
        n = 0
        while n <= len(cloudy) / int(cloudinessbins) - 1:
            for count in y4[angle][n]:
                if float(count[0]) > 1:
                    actualworklight = round(float(count[0]), 2)
                    daylight[clouds][angle].append(actualworklight)
            n += 1
        #print len(daylight[clouds][angle]),' ',clouded,' ',sunanglerange[clouds][angle]
        if len(win_daylight[clouds][angle]) > len(daylight[clouds][angle]):
            finaldatalength = len(daylight[clouds][angle])
        else:
            finaldatalength = len(win_daylight[clouds][angle])

        if finaldatalength > 2:
            adata = vstack((win_daylight[clouds][angle][0:finaldatalength],
                            daylight[clouds][angle][0:finaldatalength]))
            realadata = adata.transpose()
            x = realadata[:, 0]
            y = realadata[:, 1]
            X = sm.add_constant(x)
            model = sm.OLS(y, X).fit()
            if len(model.params) > 1:
                coeffam[clouds][angle] = round(model.params[0], 3)
                constantam[clouds][angle] = round(model.params[1], 3)
                rvalueam[clouds][angle] = round(model.rsquared, 3)
        return [
            coeffam[clouds][angle], constantam[clouds][angle],
            rvalueam[clouds][angle], clouded, sunanglerange[clouds][angle]
        ]
Example #16
            while n <= len(cloudy) / int(cloudinessbins) - 1:
                for count in y4[angle][n]:
                    if float(count[0]) > 1:
                        actualworklight = round(float(count[0]), 2)
                        daylight[clouds][angle].append(actualworklight)
                n += 1
            #print len(daylight[clouds][angle]),' ',clouded,' ',sunanglerange[clouds][angle]
            if len(win_daylight[clouds][angle]) > len(daylight[clouds][angle]):
                finaldatalength = len(daylight[clouds][angle])
            else:
                finaldatalength = len(win_daylight[clouds][angle])
            #filename="coefficients_"+str(sunanglerange[clouds][angle])+'_'+str(clouds)+".txt"
            filename = "coefficients.txt"
            #savedata=open('C:\Users\chandrayee\Documents\GitHub\sensor-placement\\coefficients1_10_2sun\\'+filename,'a')
            ##PERFORM OLS
            if finaldatalength > 2:
                adata = vstack((win_daylight[clouds][angle][0:finaldatalength],
                                daylight[clouds][angle][0:finaldatalength]))
                realadata = adata.transpose()
                x = realadata[:, 0]
                y = realadata[:, 1]
                X = sm.add_constant(x)
                model = sm.OLS(y, X).fit()
                if len(model.params) > 1:
                    coeffam[clouds][angle] = round(model.params[0], 3)
                    constantam[clouds][angle] = round(model.params[1], 3)
                    rvalueam[clouds][angle] = round(model.rsquared, 3)
            print "The coeff, constant and rvalue are", coeffam[clouds][angle], constantam[clouds][angle], rvalueam[clouds][angle], "for", clouded, "and angle", sunanglerange[clouds][angle]
            data = str(coeffam[clouds][angle]) + '\t' + str(constantam[clouds][angle]) + '\t' + str(rvalueam[clouds][angle]) + '\t' + str(clouds) + '\t' + str(sunanglerange[clouds][angle]) + '\n'
            #savedata.write(data)
#savedata.close()
Example #17
pl.savefig('vzv_forest.pdf')

### @export 'OLS'
pl.figure()
import scikits.statsmodels.api as sm

Y = df['Parameter Value'].__array__()
X = .5 * (df['Age Start'] + df['Age End']).__array__()
pl.plot(X, Y, 'ks', label='Observed', mec='w', mew=1)

XX = sm.add_constant(X)
X_pred = pl.arange(65)
XX_pred = sm.add_constant(X_pred)

model = sm.OLS(Y, XX)
results = model.fit()
Y_pred = model.predict(XX_pred)

pl.plot(X_pred, Y_pred, 'k-', linewidth=2, label='Predicted by OLS')

Y = mc.logit(df['Parameter Value'].__array__())
model = sm.OLS(Y, XX)
results = model.fit()
Y_pred = model.predict(XX_pred)

pl.plot(X_pred,
        mc.invlogit(Y_pred),
        'k--',
        linewidth=2,
        label='Predicted by logit-transformed OLS')
Example #18
def acorr_lm(x, maxlag=None, autolag='AIC', store=False):
    '''Lagrange Multiplier tests for autocorrelation

    not checked yet, copied from unitroot_adf with adjustments
    check array shapes because of the addition of the constant.
    written/copied without reference
    This is not Breusch-Godfrey. BG adds lags of the residual to exog in the
    design matrix for the auxiliary regression with residuals as endog,
    see Greene 12.7.1.

    Notes
    -----
    If x is calculated as y^2 for a time series y, then this test corresponds
    to the Engle test for autoregressive conditional heteroscedasticity (ARCH).
    TODO: get details and verify

    '''

    x = np.asarray(x)
    nobs = x.shape[0]
    if maxlag is None:
        #for adf from Greene referencing Schwert 1989
        #nobs//4  #TODO: check default, or do AIC/BIC
        maxlag = int(np.ceil(12. * np.power(nobs / 100., 1 / 4.)))

    xdiff = np.diff(x)
    #
    xdall = lagmat(x[:-1, None], maxlag, trim='both')
    nobs = xdall.shape[0]
    xdall = np.c_[np.ones((nobs, 1)), xdall]
    xshort = x[-nobs:]

    if store: resstore = ResultsStore()

    if autolag:
        #search for lag length with highest information criteria
        #Note: I use the same number of observations to have comparable IC
        results = {}
        for mlag in range(1, maxlag):
            results[mlag] = sm.OLS(xshort, xdall[:, :mlag + 1]).fit()

        if autolag.lower() == 'aic':
            #the lowest information criterion is best
            icbest, icbestlag = min((v.aic, k) for k, v in results.iteritems())
        elif autolag.lower() == 'bic':
            icbest, icbestlag = min((v.bic, k) for k, v in results.iteritems())
        else:
            raise ValueError("autolag can only be None, 'AIC' or 'BIC'")

        #rerun ols with best ic
        xdall = lagmat(x[:, None], icbestlag, trim='forward')
        nobs = xdall.shape[0]
        xdall = np.c_[np.ones((nobs, 1)), xdall]
        xshort = x[-nobs:]
        usedlag = icbestlag
    else:
        usedlag = maxlag

    resols = sm.OLS(xshort, xdall[:, :usedlag + 1]).fit()
    fval = resols.fvalue
    fpval = resols.f_pvalue
    lm = nobs * resols.rsquared
    lmpval = stats.chi2.sf(lm, usedlag)
    # Note: degrees of freedom for LM test is nvars minus constant = usedlag

    if store:
        resstore.resols = resols
        resstore.usedlag = usedlag
        return fval, fpval, lm, lmpval, resstore
    else:
        return fval, fpval, lm, lmpval
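
A sketch of calling acorr_lm along the lines of the docstring's ARCH note: squared residuals as input give an Engle-type ARCH test (autolag=None keeps the lag length fixed):

import numpy as np
import scikits.statsmodels.api as sm

np.random.seed(12345)
nobs = 500
x = np.column_stack((np.ones(nobs), np.arange(nobs) / 100.))
y = x.sum(1) + np.random.randn(nobs)
resid = sm.OLS(y, x).fit().resid
fval, fpval, lm, lmpval = acorr_lm(resid**2, maxlag=4, autolag=None)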
Example #19
    def loglikeobs(self, params):
        beta = params[:-1]
        sigma = params[-1]
        xb = np.dot(self.exog, beta)
        return stats.norm.logpdf(self.endog, loc=xb, scale=sigma)


mod_norm2 = MygMLE(datal.endog, datal.exog)
#res_norm = mod_norm.fit(start_params=np.ones(datal.exog.shape[1]+1), method="nm", maxiter = 500)
res_norm2 = mod_norm2.fit(start_params=[1.] * datal.exog.shape[1] + [1],
                          method="nm",
                          maxiter=500)
print res_norm2.params

res2 = sm.OLS(datal.endog, datal.exog).fit()
start_params = np.hstack((res2.params, np.sqrt(res2.mse_resid)))
res_norm3 = mod_norm2.fit(start_params=start_params,
                          method="nm",
                          maxiter=500,
                          retall=0)
print start_params
print res_norm3.params
print res2.bse
#print res_norm3.bse   # not available
print 'llf', res2.llf, res_norm3.llf

bse = np.sqrt(np.diag(np.linalg.inv(res_norm3.model.hessian(
    res_norm3.params))))
res_norm3.model.score(res_norm3.params)
Example #20
import scikits.statsmodels.api as sm
from scikits.statsmodels.miscmodels import TLinearModel

#Example:
#np.random.seed(98765678)
nobs = 50
nvars = 6
df = 3
rvs = np.random.randn(nobs, nvars - 1)
data_exog = sm.add_constant(rvs)
xbeta = 0.9 + 0.1 * rvs.sum(1)
data_endog = xbeta + 0.1 * np.random.standard_t(df, size=nobs)
print 'variance of endog:', data_endog.var()
print 'true parameters:', [0.1] * (nvars - 1) + [0.9]

res_ols = sm.OLS(data_endog, data_exog).fit()
print '\nResults with ols'
print '----------------'
print res_ols.scale
print np.sqrt(res_ols.scale)
print res_ols.params
print res_ols.bse
kurt = stats.kurtosis(res_ols.resid)
df_fromkurt = 6. / kurt + 4
print 'df_fromkurt from ols residuals', df_fromkurt
print stats.t.stats(df_fromkurt, moments='mvsk')
print stats.t.stats(df, moments='mvsk')

modp = TLinearModel(data_endog, data_exog)
start_value = 0.1 * np.ones(data_exog.shape[1] + 2)
#start_value = np.zeros(data_exog.shape[1]+2)
Example #21
    x = np.ones((nobs, 2))
    x[:, 1] = np.arange(nobs) / 20.
    y = x.sum(1) + 1.01 * (1 + 1.5 * (x[:, 1] > 10)) * np.random.rand(nobs)
    print het_goldfeldquandt(y, x, 1)

    y = x.sum(1) + 1.01 * (1 + 0.5 * (x[:, 1] > 10)) * np.random.rand(nobs)
    print het_goldfeldquandt(y, x, 1)

    y = x.sum(1) + 1.01 * (1 - 0.5 * (x[:, 1] > 10)) * np.random.rand(nobs)
    print het_goldfeldquandt(y, x, 1)

    print het_breushpagan(y, x)
    print het_white(y, x)

    f, p = het_goldfeldquandt(y, x, 1)
    print f, p
    resgq = het_goldfeldquandt(y, x, 1, retres=True)
    print resgq

    #this is just a syntax check:
    print neweywestcov(y, x)

    resols1 = sm.OLS(y, x).fit()
    print neweywestcov(resols1.resid, x)
    print resols1.cov_params()
    print resols1.HC0_se
    print resols1.cov_HC0

    y = x.sum(1) + 10. * (1 - 0.5 * (x[:, 1] > 10)) * np.random.rand(nobs)
    print HetGoldfeldQuandt().run(y, x, 1, alternative='dec')
Example #22
import numpy as np
import scikits.statsmodels.api as sm

# create some data set

nsample = 50
sig = 0.25
x1 = np.linspace(0, 20, nsample)
X = np.c_[x1, np.sin(x1), (x1-5)**2, np.ones(nsample)]
beta = [0.5, 0.5, -0.02, 5.]
y_true = np.dot(X, beta)
y = y_true + sig * np.random.normal(size=nsample)

#setup and estimate the model

olsmod = sm.OLS(y, X)
olsres = olsmod.fit()
print olsres.params
print olsres.bse

# use predict method of model class, not in the results class
# (we had a discussion but it is still in the model)

ypred = olsmod.predict(X) # predict insample

# create a new sample of explanatory variables Xnew, predict and plot

x1n = np.linspace(20.5,25, 10)
Xnew = np.c_[x1n, np.sin(x1n), (x1n-5)**2, np.ones(10)]
ynewpred =  olsmod.predict(Xnew) # predict out of sample
print ypred
Example #23
else:
    y2 = 19 + 17 * x2 + 2 * np.random.randn(nobs)

x2 = sm.add_constant(x2)

# stack
x = np.concatenate((x1, x2), 0)
y = np.concatenate((y1, y2))
if example_groups == '2':
    groupind = (np.arange(2 * nobs) > nobs - 1).astype(int)
else:
    groupind = np.mod(np.arange(2 * nobs), 4)
    groupind.sort()
#x = np.column_stack((x,x*groupind[:,None]))

res1 = sm.OLS(y, x).fit()
skip = 8

rresid, rparams, rypred, rresid_standardized, rresid_scaled, rcusum, rcusumci = \
            recursive_olsresiduals(res1, skip)
print rcusum
print rresid_scaled[skip - 1:]

assert_almost_equal(rparams[-1], res1.params)

import matplotlib.pyplot as plt
plt.plot(rcusum)
plt.plot(rcusumci[0])
plt.plot(rcusumci[1])
plt.figure()
plt.plot(rresid)
Example #24
def het_goldfeldquandt2(y, x, idx, split=None, retres=False):
    '''test whether variance is the same in 2 subsamples

    Parameters
    ----------
    y : array_like
        endogenous variable
    x : array_like
        exogenous variable, regressors
    idx : integer
        column index of variable according to which observations are
        sorted for the split
    split : None or integer or float in interval (0, 1)
        index at which sample is split.
        If 0<split<1 then split is interpreted as fraction of the observations
        in the first sample
    retres : boolean
        if true, then an instance of a result class is returned,
        otherwise 2 numbers, fvalue and p-value, are returned

    Returns
    -------
    (fval, pval) or res
    fval : float
        value of the F-statistic
    pval : float
        p-value of the hypothesis that the variance in one subsample is larger
        than in the other subsample
    res : instance of result class
        The class instance is just a storage for the intermediate and final
        results that are calculated

    Notes
    -----

    TODO:
    add resultinstance - DONE
    maybe add drop-middle as option
    maybe allow for several breaks

    recommendation for users: use this function as pattern for more flexible
        split in tests, e.g. drop middle.

    can do Chow test for structural break in same way

    ran sanity check
    '''
    x = np.asarray(x)
    y = np.asarray(y)
    nobs, nvars = x.shape
    if split is None:
        split = nobs // 2
    elif (0 < split) and (split < 1):
        split = int(nobs * split)

    xsortind = np.argsort(x[:, idx])
    y = y[xsortind]
    x = x[xsortind, :]
    resols1 = sm.OLS(y[:split], x[:split]).fit()
    resols2 = sm.OLS(y[split:], x[split:]).fit()
    fval = resols1.mse_resid / resols2.mse_resid
    if fval > 1:
        fpval = stats.f.sf(fval, resols1.df_resid, resols2.df_resid)
        ordering = 'larger'
    else:
        fval = 1. / fval
        fpval = stats.f.sf(fval, resols2.df_resid, resols1.df_resid)
        ordering = 'smaller'

    if retres:
        res = ResultsStore()
        res.__doc__ = 'Test Results for Goldfeld-Quandt test of heterogeneity'
        res.fval = fval
        res.fpval = fpval
        res.df_fval = (resols2.df_resid, resols1.df_resid)
        res.resols1 = resols1
        res.resols2 = resols2
        res.ordering = ordering
        res.split = split
        #res.__str__
        res._str = '''The Goldfeld-Quandt test for null hypothesis that the
variance in the second subsample is %s than in the first subsample:
    F-statistic =%8.4f and p-value =%8.4f''' % (ordering, fval, fpval)

        return res
    else:
        return fval, fpval
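
A quick sketch for het_goldfeldquandt2, sorting on column 0 and splitting at the default midpoint:

import numpy as np
import scikits.statsmodels.api as sm

np.random.seed(12345)
nobs = 100
x = np.column_stack((np.arange(nobs) / 20., np.ones(nobs)))
# noise is roughly twice as large in the upper half of the sorted sample
y = x.sum(1) + (1 + (x[:, 0] > 2.5)) * np.random.randn(nobs)
fval, fpval = het_goldfeldquandt2(y, x, 0)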
Example #25
    #Note: full parameterization of dummies is orthogonal
    #np.eye(6)*10 in "large" example
    print(np.dot(dd_full.T, dd_full) == np.diag(dd_full.sum(0))).all()

    #check that transforms work
    #generate 3 data sets with the 3 different parameterizations

    effect_size = [1., 0.01][1]
    noise_scale = [0.001, 0.1][0]
    noise = noise_scale * np.random.randn(nobs)
    beta = effect_size * np.arange(1, 7)
    ydata_full = (dd_full * beta).sum(1) + noise
    ydata_dropl = (dd_dropl * beta).sum(1) + noise
    ydata_dropf = (dd_dropf * beta).sum(1) + noise

    resols_full_full = sm.OLS(ydata_full, dd_full).fit()
    resols_full_dropf = sm.OLS(ydata_full, dd_dropf).fit()
    params_f_f = resols_full_full.params
    params_f_df = resols_full_dropf.params

    resols_dropf_full = sm.OLS(ydata_dropf, dd_full).fit()
    resols_dropf_dropf = sm.OLS(ydata_dropf, dd_dropf).fit()
    params_df_f = resols_dropf_full.params
    params_df_df = resols_dropf_dropf.params

    tr_of = np.linalg.lstsq(dd_dropf, dd_full)[0]
    tr_fo = np.linalg.lstsq(dd_full, dd_dropf)[0]
    print np.dot(tr_fo, params_df_df) - params_df_f
    print np.dot(tr_of, params_f_f) - params_f_df

    transf_f_df = DummyTransform(dd_full, dd_dropf)
Example #26
    def run(self,
            y,
            x,
            idx=None,
            split=None,
            drop=None,
            alternative='increasing',
            attach=True):
        '''see class docstring'''
        x = np.asarray(x)
        y = np.asarray(y)  #**2
        nobs, nvars = x.shape
        if split is None:
            split = nobs // 2
        elif (0 < split) and (split < 1):
            split = int(nobs * split)
        if drop is None:
            start2 = split
        elif (0 < drop) and (drop < 1):
            start2 = split + int(nobs * drop)
        else:
            start2 = split + drop

        if idx is not None:
            xsortind = np.argsort(x[:, idx])
            y = y[xsortind]
            x = x[xsortind, :]
        resols1 = sm.OLS(y[:split], x[:split]).fit()
        resols2 = sm.OLS(y[start2:], x[start2:]).fit()
        fval = resols2.mse_resid / resols1.mse_resid
        #if fval>1:
        if alternative.lower() in ['i', 'inc', 'increasing']:
            fpval = stats.f.sf(fval, resols1.df_resid, resols2.df_resid)
            ordering = 'increasing'
        elif alternative.lower() in ['d', 'dec', 'decreasing']:
            fval = 1. / fval
            fpval = stats.f.sf(fval, resols2.df_resid, resols1.df_resid)
            ordering = 'decreasing'
        elif alternative.lower() in ['2', '2-sided', 'two-sided']:
            fpval_sm = stats.f.cdf(fval, resols2.df_resid, resols1.df_resid)
            fpval_la = stats.f.sf(fval, resols2.df_resid, resols1.df_resid)
            fpval = 2 * min(fpval_sm, fpval_la)
            ordering = 'two-sided'
        else:
            raise ValueError('invalid alternative')

        if attach:
            res = self
            res.__doc__ = 'Test Results for Goldfeld-Quandt test of heterogeneity'
            res.fval = fval
            res.fpval = fpval
            res.df_fval = (resols2.df_resid, resols1.df_resid)
            res.resols1 = resols1
            res.resols2 = resols2
            res.ordering = ordering
            res.split = split
            #res.__str__
            #TODO: check if string works
            res._str = '''The Goldfeld-Quandt test for null hypothesis that the
    variance in the second subsample is %s than in the first subsample:
        F-statistic =%8.4f and p-value =%8.4f''' % (ordering, fval, fpval)

        return fval, fpval, ordering
Example #27
dta = data.load()
gdp = np.log(dta.data['realgdp'])

from numpy import polynomial
from scipy import special

maxorder = 20
polybase = special.chebyt
polybase = special.legendre

t = np.linspace(-1, 1, len(gdp))

exog = np.column_stack([polybase(i)(t) for i in range(maxorder)])

fitted = [
    sm.OLS(gdp, exog[:, :maxr]).fit().fittedvalues
    for maxr in range(2, maxorder)
]

print(np.corrcoef(exog[:, 1:6], rowvar=0) * 10000).astype(int)

import matplotlib.pyplot as plt

plt.figure()
plt.plot(gdp, 'o')
for i in range(maxorder - 2):
    plt.plot(fitted[i])

plt.figure()
#plt.plot(gdp, 'o')
for i in range(maxorder - 4, maxorder - 2):
Example #28
    def fit_find_nfact(self, maxfact=None, skip_crossval=True, cv_iter=None):
        '''estimate the model and selection criteria for up to maxfact factors

        The selection criteria that are calculated are AIC, BIC, and R2_adj,
        and additionally the cross-validation prediction error sum of squares
        if `skip_crossval` is false. Cross-validation is not used by default
        because it can be time consuming to calculate.

        By default the cross-validation method is Leave-one-out on the full
        dataset. A different cross-validation sample can be specified as an
        argument to cv_iter.

        Results are attached in `results_find_nfact`
        '''
        #print 'OLS on Factors'
        if not hasattr(self, 'factors'):
            self.calc_factors()

        hasconst = self.hasconst
        if maxfact is None:
            maxfact = self.factors.shape[1] - hasconst

        if (maxfact + hasconst) < 1:
            raise ValueError(
                'nothing to do, number of factors (incl. constant) should ' +
                'be at least 1')

        #temporary safety
        maxfact = min(maxfact, 10)

        y0 = self.endog
        results = []
        #xred, fact, eva, eve  = pca(x0, keepdim=0, normalize=1)
        for k in range(1, maxfact + hasconst):  #k now includes the constant
            #xred, fact, eva, eve  = pca(x0, keepdim=k, normalize=1)
            # this is faster and same result
            fact = self.factors[:, :k]
            res = sm.OLS(y0, fact).fit()
            ##    print 'k =', k
            ##    print res.params
            ##    print 'aic:  ', res.aic
            ##    print 'bic:  ', res.bic
            ##    print 'llf:  ', res.llf
            ##    print 'R2    ', res.rsquared
            ##    print 'R2 adj', res.rsquared_adj

            if not skip_crossval:
                if cv_iter is None:
                    cv_iter = LeaveOneOut(len(y0))
                prederr2 = 0.
                for inidx, outidx in cv_iter:
                    res_l1o = sm.OLS(y0[inidx], fact[inidx, :]).fit()
                    #print data.endog[outidx], res.model.predict(data.exog[outidx,:]),
                    prederr2 += (y0[outidx] -
                                 res_l1o.model.predict(fact[outidx, :]))**2.
            else:
                prederr2 = np.nan

            results.append([k, res.aic, res.bic, res.rsquared_adj, prederr2])

        self.results_find_nfact = results = np.array(results)
        self.best_nfact = np.r_[(np.argmin(results[:, 1:3],
                                           0), np.argmax(results[:, 3], 0),
                                 np.argmin(results[:, -1], 0))]
Example #29
def het_breushpagan(resid, x, exog=None):
    '''Lagrange Multiplier Heteroscedasticity Test by Breusch-Pagan

    This tests the hypothesis that the residual variance does not depend on
    the variables in x, in the form

    :math: \sigma_i = \\sigma * f(\\alpha_0 + \\alpha z_i)

    Homoscedasticity implies that $\\alpha=0$


    Parameters
    ----------
    resid : arraylike, (nobs,)
        For the Breusch-Pagan test, this should be the residual of a
        regression. If an array is given in exog, then the residuals are
        calculated by an OLS regression of resid on exog. In this case resid
        should contain the dependent variable. Exog can be the same as x.
    x : array_like, (nobs, nvars)
        This contains variables that might create data dependent
        heteroscedasticity.

    Returns
    -------
    lm : float
        lagrange multiplier statistic
    lm_pvalue :float
        p-value of lagrange multiplier test
    fvalue : float
        f-statistic of the hypothesis that the error variance does not depend
        on x
    f_pvalue : float
        p-value for the f-statistic

    Notes
    -----
    Assumes x contains constant (for counting dof and calculation of R^2).
    In the general description of the LM test, Greene mentions that this test
    exaggerates the significance of results in small or moderately large
    samples. In this case the F-statistic is preferable.

    *Verification*

    Chisquare test statistic is exactly (<1e-13) the same result as bptest
    in R-stats with defaults (studentize=True).

    Implementation
    This is calculated using the generic formula for LM test using $R^2$
    (Greene, section 17.6) and not with the explicit formula
    (Greene, section 11.4.3).

    References
    ----------
    http://en.wikipedia.org/wiki/Breusch%E2%80%93Pagan_test
    Greene 5th edition
    Breusch, Pagan article

    '''
    if exog is not None:
        resid = sm.OLS(resid, exog).fit().resid

    x = np.asarray(x)
    y = np.asarray(resid)**2
    nobs, nvars = x.shape
    resols = sm.OLS(y, x).fit()
    fval = resols.fvalue
    fpval = resols.f_pvalue
    lm = nobs * resols.rsquared
    # Note: degrees of freedom for LM test is nvars minus constant
    return lm, stats.chi2.sf(lm, nvars - 1), fval, fpval
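
A short sketch for het_breushpagan with the residuals computed beforehand (x must contain the constant):

import numpy as np
import scikits.statsmodels.api as sm

np.random.seed(12345)
nobs = 200
x = np.column_stack((np.ones(nobs), np.random.randn(nobs)))
y = x.sum(1) + np.abs(x[:, 1]) * np.random.randn(nobs)  # variance depends on x
resid = sm.OLS(y, x).fit().resid
lm, lmpval, fval, fpval = het_breushpagan(resid, x)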
Example #30
    print 'mutualinfo_kde normed', mutualinfo_kde(y, x)
    print 'mutualinfo_kde       ', mutualinfo_kde(y, x, normed=False)
    mi_normed, (pyx2, py2, px2, binsy2, binsx2), mi_obs = \
               mutualinfo_binned(y, x, 5, normed=True)
    print 'mutualinfo_binned normed', mi_normed
    print 'mutualinfo_binned       ', mi_obs.sum()

    mi_normed, (pyx2, py2, px2, binsy2, binsx2), mi_obs = \
               mutualinfo_binned(y, x, 'auto', normed=True)
    print 'auto'
    print 'mutualinfo_binned normed', mi_normed
    print 'mutualinfo_binned       ', mi_obs.sum()

    ys = np.sort(y)
    xs = np.sort(x)
    by = ys[((nobs - 1) * np.array([0, 0.25, 0.4, 0.6, 0.75, 1])).astype(int)]
    bx = xs[((nobs - 1) * np.array([0, 0.25, 0.4, 0.6, 0.75, 1])).astype(int)]
    mi_normed, (pyx2, py2, px2, binsy2, binsx2), mi_obs = \
               mutualinfo_binned(y, x, (by,bx), normed=True)
    print 'quantiles'
    print 'mutualinfo_binned normed', mi_normed
    print 'mutualinfo_binned       ', mi_obs.sum()

    doplot = 1  #False
    if doplot:
        import matplotlib.pyplot as plt
        plt.plot(x, y, 'o')
        olsres = sm.OLS(y, exog).fit()
        plt.plot(x, olsres.fittedvalues)