Example #1
def prob_mv_grid(bins, cdf, axis=-1):
    '''helper function for probability of a rectangle grid in a multivariate distribution

    how does this generalize to more than 2 variates?

    bins : tuple
        tuple of bin edges, currently it is assumed that they broadcast
        correctly

    '''
    if not isinstance(bins, np.ndarray):
        bins = lmap(np.asarray, bins)
        n_dim = len(bins)
        bins_ = []
        #broadcast if binedges are 1d
        if all(lmap(np.ndim, bins) == np.ones(n_dim)):
            for d in range(n_dim):
                sl = [None]*n_dim
                sl[d] = slice(None)
                bins_.append(bins[d][tuple(sl)])  # tuple index; a list of slices is invalid in modern numpy
    else: #assume it is already correctly broadcasted
        n_dim = bins.shape[0]
        bins_ = bins

    print(len(bins))
    cdf_values = cdf(bins_)
    probs = cdf_values.copy()
    for d in range(n_dim):
        probs = np.diff(probs, axis=d)

    return probs
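
A minimal usage sketch (not from the source): for two independent Uniform(0, 1) variates the joint CDF factorizes as x * y, so the cell probabilities over bin edges covering the full support should sum to one. The lmap helper is assumed to be the list-returning map used throughout these snippets.

import numpy as np

lmap = lambda func, *args: list(map(func, *args))  # compat helper assumed by prob_mv_grid

def uniform2d_cdf(edges_bc):
    # receives the list of broadcast bin-edge arrays built inside prob_mv_grid
    x, y = edges_bc
    return np.clip(x, 0, 1) * np.clip(y, 0, 1)

edges = (np.linspace(0, 1, 5), np.linspace(0, 1, 3))
cell_probs = prob_mv_grid(edges, uniform2d_cdf)
print(cell_probs.sum())  # ~1.0, since the edges cover the full support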
Example #2
def date_range_str(start, end=None, length=None):
    """
    Returns a list of abbreviated date strings.

    Parameters
    ----------
    start : str
        The first abbreviated date, for instance, '1965q1' or '1965m1'
    end : str, optional
        The last abbreviated date if length is None.
    length : int, optional
        The length of the returned array if end is None.

    Returns
    -------
    date_range : list
        List of strings
    """
    flags = re.IGNORECASE | re.VERBOSE
    #_check_range_inputs(end, length, freq)
    start = start.lower()
    if re.search(_m_pattern, start, flags):
        annual_freq = 12
        split = 'm'
    elif re.search(_q_pattern, start, flags):
        annual_freq = 4
        split = 'q'
    elif re.search(_y_pattern, start, flags):
        annual_freq = 1
        start += 'a1'  # hack
        if end:
            end += 'a1'
        split = 'a'
    else:
        raise ValueError("Date %s not understood" % start)
    yr1, offset1 = lmap(int, start.replace(":", "").split(split))
    if end is not None:
        end = end.lower()
        yr2, offset2 = lmap(int, end.replace(":", "").split(split))
        length = (yr2 - yr1) * annual_freq + offset2
    elif length:
        yr2 = yr1 + length // annual_freq
        offset2 = length % annual_freq + (offset1 - 1)
    years = [str(yr) for yr in np.repeat(lrange(yr1 + 1, yr2), annual_freq)]
    # tack on first year
    years = [(str(yr1))] * (annual_freq + 1 - offset1) + years
    # tack on last year
    years = years + [(str(yr2))] * offset2
    if split != 'a':
        offset = np.tile(np.arange(1, annual_freq + 1),
                         yr2 - yr1 - 1).astype("a2")
        offset = np.r_[np.arange(offset1, annual_freq + 1).astype('a2'),
                       offset]
        offset = np.r_[offset, np.arange(1, offset2 + 1).astype('a2')]
        date_arr_range = [
            ''.join([i, split, asstr(j)]) for i, j in zip(years, offset)
        ]
    else:
        date_arr_range = years  # already a plain list of year strings
    return date_arr_range
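
A hedged usage sketch, assuming the module-level regex patterns (_q_pattern and friends) and the lmap/lrange compat helpers are in scope, as in statsmodels.tsa.base.datetools:

labels = date_range_str('1965q1', '1966q4')
# expected: ['1965q1', '1965q2', '1965q3', '1965q4',
#            '1966q1', '1966q2', '1966q3', '1966q4']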
Example #3
def prob_mv_grid(bins, cdf, axis=-1):
    '''helper function for probability of a rectangle grid in a multivariate distribution

    how does this generalize to more than 2 variates?

    bins : tuple
        tuple of bin edges, currently it is assumed that they broadcast
        correctly

    '''
    if not isinstance(bins, np.ndarray):
        bins = lmap(np.asarray, bins)
        n_dim = len(bins)
        bins_ = []
        #broadcast if binedges are 1d
        if all(lmap(np.ndim, bins) == np.ones(n_dim)):
            for d in range(n_dim):
                sl = [None] * n_dim
                sl[d] = slice(None)
                bins_.append(bins[d][tuple(sl)])  # tuple index; a list of slices is invalid in modern numpy
    else:  #assume it is already correctly broadcasted
        n_dim = bins.shape[0]
        bins_ = bins

    print(len(bins))
    cdf_values = cdf(bins_)
    probs = cdf_values.copy()
    for d in range(n_dim):
        probs = np.diff(probs, axis=d)

    return probs
Example #4
    def handle_missing(cls, endog, exog, missing, **kwargs):
        """
        This returns a dictionary with keys endog, exog and the keys of
        kwargs. It preserves Nones.
        """
        none_array_names = []

        if exog is not None:
            combined = (endog, exog)
            combined_names = ['endog', 'exog']
        else:
            combined = (endog,)
            combined_names = ['endog']
            none_array_names += ['exog']

        # deal with other arrays
        combined_2d = ()
        combined_2d_names = []
        if len(kwargs):
            for key, value_array in iteritems(kwargs):
                if value_array is None or value_array.ndim == 0:
                    none_array_names += [key]
                    continue
                # grab 1d arrays
                if value_array.ndim == 1:
                    combined += (value_array,)
                    combined_names += [key]
                elif value_array.squeeze().ndim == 1:
                    combined += (value_array,)
                    combined_names += [key]

                # grab 2d arrays that are _assumed_ to be symmetric
                elif value_array.ndim == 2:
                    combined_2d += (value_array,)
                    combined_2d_names += [key]
                else:
                    raise ValueError("Arrays with more than 2 dimensions "
                            "aren't yet handled")

        nan_mask = _nan_rows(*combined)
        if combined_2d:
            nan_mask = _nan_rows(*(nan_mask[:, None],) + combined_2d)

        if missing == 'raise' and np.any(nan_mask):
            raise MissingDataError("NaNs were encountered in the data")

        elif missing == 'drop':
            nan_mask = ~nan_mask
            drop_nans = lambda x: cls._drop_nans(x, nan_mask)
            drop_nans_2d = lambda x: cls._drop_nans_2d(x, nan_mask)
            combined = dict(zip(combined_names, lmap(drop_nans, combined)))
            if combined_2d:
                combined.update(dict(zip(combined_2d_names,
                                          lmap(drop_nans_2d, combined_2d))))
            if none_array_names:
                combined.update(dict(zip(none_array_names,
                                          [None] * len(none_array_names))))
            return combined, np.where(~nan_mask)[0].tolist()
        else:
            raise ValueError("missing option %s not understood" % missing)
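
A self-contained sketch of the row-wise NaN masking used above; nan_rows here is a stand-in for the module's _nan_rows helper, not its actual implementation.

import numpy as np

def nan_rows(*arrs):
    # True where any input array (1d or 2d) has a NaN in that row
    masks = [np.isnan(np.asarray(a).reshape(len(a), -1)).any(axis=1) for a in arrs]
    return np.logical_or.reduce(masks)

endog = np.array([1.0, np.nan, 3.0])
exog = np.array([[1.0, 2.0], [3.0, 4.0], [np.nan, 6.0]])
mask = nan_rows(endog, exog)        # array([False,  True,  True])
print(endog[~mask], exog[~mask])    # keeps only the fully observed row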
Example #5
def date_range_str(start, end=None, length=None):
    """
    Returns a list of abbreviated date strings.

    Parameters
    ----------
    start : str
        The first abbreviated date, for instance, '1965q1' or '1965m1'
    end : str, optional
        The last abbreviated date if length is None.
    length : int, optional
        The length of the returned array if end is None.

    Returns
    -------
    date_range : list
        List of strings
    """
    flags = re.IGNORECASE | re.VERBOSE
    #_check_range_inputs(end, length, freq)
    start = start.lower()
    if re.search(_m_pattern, start, flags):
        annual_freq = 12
        split = 'm'
    elif re.search(_q_pattern, start, flags):
        annual_freq = 4
        split = 'q'
    elif re.search(_y_pattern, start, flags):
        annual_freq = 1
        start += 'a1' # hack
        if end:
            end += 'a1'
        split = 'a'
    else:
        raise ValueError("Date %s not understood" % start)
    yr1, offset1 = lmap(int, start.replace(":","").split(split))
    if end is not None:
        end = end.lower()
        yr2, offset2 = lmap(int, end.replace(":","").split(split))
        length = (yr2 - yr1) * annual_freq + offset2
    elif length:
        yr2 = yr1 + length // annual_freq
        offset2 = length % annual_freq + (offset1 - 1)
    years = np.repeat(lrange(yr1+1, yr2), annual_freq).tolist()
    years = np.r_[[str(yr1)]*(annual_freq+1-offset1), years] # tack on first year
    years = np.r_[years, [str(yr2)]*offset2] # tack on last year
    if split != 'a':
        offset = np.tile(np.arange(1, annual_freq+1), yr2-yr1-1)
        offset = np.r_[np.arange(offset1, annual_freq+1).astype('a2'), offset]
        offset = np.r_[offset, np.arange(1,offset2+1).astype('a2')]
        date_arr_range = [''.join([i, split, asstr(j)]) for i,j in
                                                        zip(years, offset)]
    else:
        date_arr_range = years.tolist()
    return date_arr_range
Example #6
def test_plot_month():
    dta = sm.datasets.elnino.load_pandas().data
    dta['YEAR'] = dta.YEAR.astype(int).apply(str)
    dta = dta.set_index('YEAR').T.unstack()
    dates = lmap(lambda x: pd.tseries.tools.parse_time_string('1 '+' '.join(x))[0],
                 dta.index.values)

    # test dates argument
    fig = month_plot(dta.values, dates=dates, ylabel='el nino')
    plt.close(fig)

    # test with a TimeSeries DatetimeIndex with no freq
    dta.index = pd.DatetimeIndex(dates)
    fig = month_plot(dta)
    plt.close(fig)

    # w freq
    dta.index = pd.DatetimeIndex(dates, freq='MS')
    fig = month_plot(dta)
    plt.close(fig)

    # test with a TimeSeries PeriodIndex
    dta.index = pd.PeriodIndex(dates, freq='M')
    fig = month_plot(dta)
    plt.close(fig)
Example #7
def test_plot_month():
    dta = sm.datasets.elnino.load_pandas().data
    dta['YEAR'] = dta.YEAR.astype(int).apply(str)
    dta = dta.set_index('YEAR').T.unstack()
    dates = lmap(
        lambda x: pd.datetools.parse_time_string('1 ' + ' '.join(x))[0],
        dta.index.values)

    # test dates argument
    fig = month_plot(dta.values, dates=dates, ylabel='el nino')
    plt.close(fig)

    # test with a TimeSeries DatetimeIndex with no freq
    dta.index = pd.DatetimeIndex(dates)
    fig = month_plot(dta)
    plt.close(fig)

    # w freq
    dta.index = pd.DatetimeIndex(dates, freq='M')
    fig = month_plot(dta)
    plt.close(fig)

    # test with a TimeSeries PeriodIndex
    dta.index = pd.PeriodIndex(dates, freq='M')
    fig = month_plot(dta)
    plt.close(fig)
Example #8
    def setup_class(cls):
        XLISTEXOG2 = 'aget aget2 educyr actlim totchr'.split()

        endog_name = 'docvis'
        exog_names = 'private medicaid'.split() + XLISTEXOG2 + ['const']
        instrument_names = 'income ssiratio'.split() + XLISTEXOG2 + ['const']

        endog = DATA[endog_name]
        exog = DATA[exog_names]
        instrument = DATA[instrument_names]

        asarray = lambda x: np.asarray(x, float)
        endog, exog, instrument = lmap(asarray, [endog, exog, instrument])


        cls.bse_tol = [5e-6, 5e-7]
        q_tol = [0.04, 0]
        # compare to Stata default options, iterative GMM
        # with const at end
        start = OLS(np.log(endog+1), exog).fit().params
        nobs, k_instr = instrument.shape
        w0inv = np.dot(instrument.T, instrument) / nobs

        mod = gmm.NonlinearIVGMM(endog, exog, instrument, moment_exponential_add)
        res0 = mod.fit(start, maxiter=0, inv_weights=w0inv,
                        optim_method='bfgs', optim_args={'gtol':1e-8, 'disp': 0},
                        wargs={'centered':False})
        cls.res1 = res0

        from .results_gmm_poisson import results_addonestep as results
        cls.res2 = results
Example #9
    def dataset(self, as_dict=False):
        """
        Returns a Python generator object for iterating over the dataset.


        Parameters
        ----------
        as_dict : bool, optional
            If as_dict is True, yield each row of observations as a dict.
            If False, yields each row of observations as a list.

        Returns
        -------
        Generator object for iterating over the dataset.  Yields each row of
        observations as a list by default.

        Notes
        -----
        If missing_values is True during instantiation of StataReader then
        observations with _StataMissingValue(s) are not filtered and should
        be handled by your application.
        """

        try:
            self._file.seek(self._data_location)
        except Exception:
            pass

        if as_dict:
            vars = lmap(str, self.variables())
            for i in range(len(self)):
                yield dict(zip(vars, self._next()))
        else:
            for i in range(self._header['nobs']):
                yield self._next()
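
Hypothetical usage of the generator, assuming the legacy StataReader class that owns this method; the file name and process() are placeholders.

# rdr = StataReader(open('example.dta', 'rb'))
# first_row = next(rdr.dataset())          # one row as a list
# for row in rdr.dataset(as_dict=True):    # rows as {varname: value} dicts
#     process(row)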
Example #10
    def dataset(self, as_dict=False):
        """
        Returns a Python generator object for iterating over the dataset.


        Parameters
        ----------
        as_dict : bool, optional
            If as_dict is True, yield each row of observations as a dict.
            If False, yields each row of observations as a list.

        Returns
        -------
        Generator object for iterating over the dataset.  Yields each row of
        observations as a list by default.

        Notes
        -----
        If missing_values is True during instantiation of StataReader then
        observations with StataMissingValue(s) are not filtered and should
        be handled by your application.
        """

        try:
            self._file.seek(self._data_location)
        except Exception:
            pass

        if as_dict:
            vars = lmap(str, self.variables())
            for i in range(len(self)):
                yield dict(zip(vars, self._next()))
        else:
            for i in range(self._header['nobs']):
                yield self._next()
Example #11
    def setup_class(cls):
        # compare to Stata default options, twostep GMM
        XLISTEXOG2 = 'aget aget2 educyr actlim totchr'.split()

        endog_name = 'docvis'
        exog_names = 'private medicaid'.split() + XLISTEXOG2 + ['const']
        instrument_names = 'income medicaid ssiratio'.split() + XLISTEXOG2 + ['const']

        endog = DATA[endog_name]
        exog = DATA[exog_names]
        instrument = DATA[instrument_names]

        asarray = lambda x: np.asarray(x, float)
        endog, exog, instrument = lmap(asarray, [endog, exog, instrument])

        # Need to add all data into exog
        endog_ = np.zeros(len(endog))
        exog_ = np.column_stack((endog, exog))


        cls.bse_tol = [5e-6, 5e-7]
        # compare to Stata default options, iterative GMM
        # with const at end
        start = OLS(endog, exog).fit().params
        nobs, k_instr = instrument.shape
        w0inv = np.dot(instrument.T, instrument) / nobs

        mod = gmm.NonlinearIVGMM(endog_, exog_, instrument, moment_exponential_mult)
        res0 = mod.fit(start, maxiter=2, inv_weights=w0inv,
                        optim_method='bfgs', optim_args={'gtol':1e-8, 'disp': 0},
                        wargs={'centered':False}, has_optimal_weights=False)
        cls.res1 = res0

        from .results_gmm_poisson import results_multtwostep as results
        cls.res2 = results
Example #12
    def setup_class(cls):
        XLISTEXOG2 = 'aget aget2 educyr actlim totchr'.split()

        endog_name = 'docvis'
        exog_names = 'private medicaid'.split() + XLISTEXOG2 + ['const']
        instrument_names = 'income ssiratio'.split() + XLISTEXOG2 + ['const']

        endog = DATA[endog_name]
        exog = DATA[exog_names]
        instrument = DATA[instrument_names]

        asarray = lambda x: np.asarray(x, float)
        endog, exog, instrument = lmap(asarray, [endog, exog, instrument])


        cls.bse_tol = [5e-6, 5e-7]
        q_tol = [0.04, 0]
        # compare to Stata default options, iterative GMM
        # with const at end
        start = OLS(np.log(endog+1), exog).fit().params
        nobs, k_instr = instrument.shape
        w0inv = np.dot(instrument.T, instrument) / nobs

        mod = gmm.NonlinearIVGMM(endog, exog, instrument, moment_exponential_add)
        res0 = mod.fit(start, maxiter=0, inv_weights=w0inv,
                        optim_method='bfgs', optim_args={'gtol':1e-8, 'disp': 0},
                        wargs={'centered':False})
        cls.res1 = res0

        from .results_gmm_poisson import results_addonestep as results
        cls.res2 = results
Example #13
    def setup_class(cls):
        # compare to Stata default options, twostep GMM
        XLISTEXOG2 = 'aget aget2 educyr actlim totchr'.split()

        endog_name = 'docvis'
        exog_names = 'private medicaid'.split() + XLISTEXOG2 + ['const']
        instrument_names = 'income medicaid ssiratio'.split() + XLISTEXOG2 + ['const']

        endog = DATA[endog_name]
        exog = DATA[exog_names]
        instrument = DATA[instrument_names]

        asarray = lambda x: np.asarray(x, float)
        endog, exog, instrument = lmap(asarray, [endog, exog, instrument])

        # Need to add all data into exog
        endog_ = np.zeros(len(endog))
        exog_ = np.column_stack((endog, exog))


        cls.bse_tol = [5e-6, 5e-7]
        # compare to Stata default options, iterative GMM
        # with const at end
        start = OLS(endog, exog).fit().params
        nobs, k_instr = instrument.shape
        w0inv = np.dot(instrument.T, instrument) / nobs

        mod = gmm.NonlinearIVGMM(endog_, exog_, instrument, moment_exponential_mult)
        res0 = mod.fit(start, maxiter=2, inv_weights=w0inv,
                        optim_method='bfgs', optim_args={'gtol':1e-8, 'disp': 0},
                        wargs={'centered':False}, has_optimal_weights=False)
        cls.res1 = res0

        from .results_gmm_poisson import results_multtwostep as results
        cls.res2 = results
Example #14
def test_plot_quarter():
    dta = sm.datasets.macrodata.load_pandas().data
    dates = lmap(
        'Q'.join,
        zip(
            dta.year.astype(int).apply(str),
            dta.quarter.astype(int).apply(str)))
    # test dates argument
    quarter_plot(dta.unemp.values, dates)
    plt.close('all')

    # test with a DatetimeIndex with no freq
    parser = pd.datetools.parse_time_string
    dta.set_index(pd.DatetimeIndex((x[0] for x in map(parser, dates))),
                  inplace=True)
    quarter_plot(dta.unemp)
    plt.close('all')

    # w freq
    # see pandas #6631
    dta.index = pd.DatetimeIndex((x[0] for x in map(parser, dates)),
                                 freq='QS-Oct')
    quarter_plot(dta.unemp)
    plt.close('all')

    # w PeriodIndex
    dta.index = pd.PeriodIndex((x[0] for x in map(parser, dates)), freq='Q')
    quarter_plot(dta.unemp)
    plt.close('all')
Example #15
def anova_oneway(y, x, seq=0):
    # new version to match NIST
    # no generalization or checking of arguments, tested only for 1d
    yrvs = y[:,np.newaxis] #- min(y)
    #subtracting mean increases numerical accuracy for NIST test data sets
    xrvs = x[:,np.newaxis] - x.mean() #for 1d#- 1e12  trick for 'SmLs09.dat'

    meang, varg, xdevmeangr, countg = groupsstats_dummy(yrvs[:,:1], xrvs[:,:1])#, seq=0)
    #the following does not work as replacement
    #gcount, gmean , meanarr, withinvar, withinvararr = groupstatsbin(y, x)#, seq=0)
    sswn = np.dot(xdevmeangr.T,xdevmeangr)
    ssbn = np.dot((meang-xrvs.mean())**2, countg.T)
    nobs = yrvs.shape[0]
    ncat = meang.shape[1]
    dfbn = ncat - 1
    dfwn = nobs - ncat
    msb = ssbn/float(dfbn)
    msw = sswn/float(dfwn)
    f = msb/msw
    prob = stats.f.sf(f,dfbn,dfwn)
    R2 = (ssbn/(sswn+ssbn))  #R-squared
    resstd = np.sqrt(msw) #residual standard deviation
    #print(f, prob
    def _fix2scalar(z): # return number
        if np.shape(z) == (1, 1): return z[0,0]
        else: return z
    f, prob, R2, resstd = lmap(_fix2scalar, (f, prob, R2, resstd))
    return f, prob, R2, resstd
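
As a sanity check (not part of the source), the F statistic and p-value returned above should agree with scipy.stats.f_oneway applied to the per-group samples:

import numpy as np
from scipy import stats

y = np.array([1.0, 2.0, 3.0, 2.0, 4.0, 6.0])
x = np.array([0, 0, 0, 1, 1, 1])  # group labels
f_ref, p_ref = stats.f_oneway(*(y[x == g] for g in np.unique(x)))
print(f_ref, p_ref)  # compare against f, prob from anova_oneway(y, x)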
Example #16
def test_plot_quarter():
    dta = sm.datasets.macrodata.load_pandas().data
    dates = lmap('Q'.join, zip(dta.year.astype(int).apply(str),
                              dta.quarter.astype(int).apply(str)))
    # test dates argument
    quarter_plot(dta.unemp.values, dates)
    plt.close('all')

    # test with a DatetimeIndex with no freq
    parser = pd.tseries.tools.parse_time_string
    dta.set_index(pd.DatetimeIndex((x[0] for x in map(parser, dates))),
                  inplace=True)
    quarter_plot(dta.unemp)
    plt.close('all')

    # w freq
    # see pandas #6631
    dta.index = pd.DatetimeIndex((x[0] for x in map(parser, dates)),
                                   freq='QS-Oct')
    quarter_plot(dta.unemp)
    plt.close('all')

    # w PeriodIndex
    dta.index = pd.PeriodIndex((x[0] for x in map(parser, dates)),
                                   freq='Q')
    quarter_plot(dta.unemp)
    plt.close('all')
Example #17
def anova_oneway(y, x, seq=0):
    # new version to match NIST
    # no generalization or checking of arguments, tested only for 1d
    yrvs = y[:, np.newaxis]  #- min(y)
    #subtracting mean increases numerical accuracy for NIST test data sets
    xrvs = x[:, np.newaxis] - x.mean()  #for 1d#- 1e12  trick for 'SmLs09.dat'

    meang, varg, xdevmeangr, countg = groupsstats_dummy(
        yrvs[:, :1], xrvs[:, :1])  #, seq=0)  # noqa:F821  See GH#5756
    #the following does not work as replacement
    #from .try_catdata import groupsstats_dummy, groupstatsbin
    #gcount, gmean , meanarr, withinvar, withinvararr = groupstatsbin(y, x)#, seq=0)
    sswn = np.dot(xdevmeangr.T, xdevmeangr)
    ssbn = np.dot((meang - xrvs.mean())**2, countg.T)
    nobs = yrvs.shape[0]
    ncat = meang.shape[1]
    dfbn = ncat - 1
    dfwn = nobs - ncat
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    f = msb / msw
    prob = stats.f.sf(f, dfbn, dfwn)
    R2 = (ssbn / (sswn + ssbn))  #R-squared
    resstd = np.sqrt(msw)  #residual standard deviation

    #print(f, prob

    def _fix2scalar(z):  # return number
        if np.shape(z) == (1, 1): return z[0, 0]
        else: return z

    f, prob, R2, resstd = lmap(_fix2scalar, (f, prob, R2, resstd))
    return f, prob, R2, resstd
Example #18
def bootstrap(distr, args=(), nobs=200, nrep=100, value=None, batch_size=None):
    '''Monte Carlo (or parametric bootstrap) p-values for gof

    currently hardcoded for A^2 only

    assumes vectorized fit_vec method,
    builds and analyses (nobs, nrep) sample in one step

    rename function to something less generic

    this also works with nrep=1

    '''
    #signature similar to kstest ?
    #delegate to fn ?

    #rvs_kwds = {'size':(nobs, nrep)}
    #rvs_kwds.update(kwds)


    #it will be better to build a separate batch function that calls bootstrap
    #keep batch if value is true, but batch iterate from outside if stat is returned
    if batch_size is not None:
        if value is None:
            raise ValueError('using batching requires a value')
        n_batch = int(np.ceil(nrep/float(batch_size)))
        count = 0
        for irep in range(n_batch):
            rvs = distr.rvs(args, **{'size':(batch_size, nobs)})
            params = distr.fit_vec(rvs, axis=1)
            params = lmap(lambda x: np.expand_dims(x, 1), params)
            cdfvals = np.sort(distr.cdf(rvs, params), axis=1)
            stat = asquare(cdfvals, axis=1)
            count += (stat >= value).sum()
        return count / float(n_batch * batch_size)
    else:
        #rvs = distr.rvs(args, **kwds)  #extension to distribution kwds ?
        rvs = distr.rvs(args, **{'size':(nrep, nobs)})
        params = distr.fit_vec(rvs, axis=1)
        params = lmap(lambda x: np.expand_dims(x, 1), params)
        cdfvals = np.sort(distr.cdf(rvs, params), axis=1)
        stat = asquare(cdfvals, axis=1)
        if value is None:           #return all bootstrap results
            stat_sorted = np.sort(stat)
            return stat_sorted
        else:                       #calculate and return specific p-value
            return (stat >= value).mean()
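
The p-value logic above in miniature, independent of any distribution class: the Monte Carlo p-value is the share of simulated statistics at least as extreme as the observed value.

import numpy as np

sim_stats = np.array([0.2, 0.5, 1.1, 2.3, 0.9])  # made-up A^2 draws
observed = 1.0
print((sim_stats >= observed).mean())  # 0.4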
Example #19
 def _col_size(self, k=None):
     """Calculate size of a data record."""
     if len(self._col_sizes) == 0:
         self._col_sizes = lmap(lambda x: self._calcsize(x), self._header["typlist"])
      if k is None:
         return self._col_sizes
     else:
         return self._col_sizes[k]
Example #20
 def _col_size(self, k=None):
     """Calculate size of a data record."""
     if len(self._col_sizes) == 0:
         self._col_sizes = lmap(lambda x: self._calcsize(x),
                                self._header['typlist'])
     if k is None:
         return self._col_sizes
     else:
         return self._col_sizes[k]
Example #21
 def variables(self):
     """
     Returns a list of the dataset's StataVariables objects.
     """
     return lmap(_StataVariable, zip(lrange(self._header['nvar']),
         self._header['typlist'], self._header['varlist'],
         self._header['srtlist'],
         self._header['fmtlist'], self._header['lbllist'],
         self._header['vlblist']))
Example #22
 def variables(self):
     """
     Returns a list of the dataset's StataVariables objects.
     """
     return lmap(_StataVariable, zip(lrange(self._header['nvar']),
         self._header['typlist'], self._header['varlist'],
         self._header['srtlist'],
         self._header['fmtlist'], self._header['lbllist'],
         self._header['vlblist']))
Example #23
def test_panel_robust_cov():
    import pandas as pa
    import statsmodels.datasets.grunfeld as gr
    from .results.results_panelrobust import results as res_stata

    dtapa = gr.data.load_pandas()
    #Stata example/data seems to miss last firm
    dtapa_endog = dtapa.endog[:200]
    dtapa_exog = dtapa.exog[:200]
    res = OLS(dtapa_endog,
              add_constant(dtapa_exog[['value', 'capital']],
                           prepend=False)).fit()

    #time indicator in range(max Ti)
    time = np.asarray(dtapa_exog[['year']])
    time -= time.min()
    time = np.squeeze(time).astype(int)

    #sw.cov_nw_panel requires bounds instead of index
    tidx = [(i * 20, 20 * (i + 1)) for i in range(10)]

    #firm index in range(n_firms)
    firm_names, firm_id = np.unique(np.asarray(dtapa_exog[['firm']], 'S20'),
                                    return_inverse=True)

    #panel newey west standard errors
    cov = sw.cov_nw_panel(res, 0, tidx, use_correction='hac')
    #dropping numpy 1.4 soon
    #np.testing.assert_allclose(cov, res_stata.cov_pnw0_stata, rtol=1e-6)
    assert_almost_equal(cov, res_stata.cov_pnw0_stata, decimal=4)

    cov = sw.cov_nw_panel(res, 1, tidx, use_correction='hac')
    #np.testing.assert_allclose(cov, res_stata.cov_pnw1_stata, rtol=1e-6)
    assert_almost_equal(cov, res_stata.cov_pnw1_stata, decimal=4)

    cov = sw.cov_nw_panel(res, 4, tidx)  #check default
    #np.testing.assert_allclose(cov, res_stata.cov_pnw4_stata, rtol=1e-6)
    assert_almost_equal(cov, res_stata.cov_pnw4_stata, decimal=4)

    #cluster robust standard errors
    cov_clu = sw.cov_cluster(res, firm_id)
    assert_almost_equal(cov_clu, res_stata.cov_clu_stata, decimal=4)

    #cluster robust standard errors, non-int groups
    cov_clu = sw.cov_cluster(res, lmap(str, firm_id))
    assert_almost_equal(cov_clu, res_stata.cov_clu_stata, decimal=4)

    #Driscoll and Kraay panel robust standard errors
    rcov = sw.cov_nw_groupsum(res, 0, time, use_correction=0)
    assert_almost_equal(rcov, res_stata.cov_dk0_stata, decimal=4)

    rcov = sw.cov_nw_groupsum(res, 1, time, use_correction=0)
    assert_almost_equal(rcov, res_stata.cov_dk1_stata, decimal=4)

    rcov = sw.cov_nw_groupsum(res, 4, time)  #check default
    assert_almost_equal(rcov, res_stata.cov_dk4_stata, decimal=4)
Example #24
def test_panel_robust_cov():
    import pandas as pa
    import statsmodels.datasets.grunfeld as gr
    from .results.results_panelrobust import results as res_stata

    dtapa = gr.data.load_pandas()
    # Stata example/data seems to miss last firm
    dtapa_endog = dtapa.endog[:200]
    dtapa_exog = dtapa.exog[:200]
    res = OLS(dtapa_endog, add_constant(dtapa_exog[["value", "capital"]], prepend=False)).fit()

    # time indicator in range(max Ti)
    time = np.asarray(dtapa_exog[["year"]])
    time -= time.min()
    time = np.squeeze(time).astype(int)

    # sw.cov_nw_panel requires bounds instead of index
    tidx = [(i * 20, 20 * (i + 1)) for i in range(10)]

    # firm index in range(n_firms)
    firm_names, firm_id = np.unique(np.asarray(dtapa_exog[["firm"]], "S20"), return_inverse=True)

    # panel newey west standard errors
    cov = sw.cov_nw_panel(res, 0, tidx, use_correction="hac")
    # dropping numpy 1.4 soon
    # np.testing.assert_allclose(cov, res_stata.cov_pnw0_stata, rtol=1e-6)
    assert_almost_equal(cov, res_stata.cov_pnw0_stata, decimal=4)

    cov = sw.cov_nw_panel(res, 1, tidx, use_correction="hac")
    # np.testing.assert_allclose(cov, res_stata.cov_pnw1_stata, rtol=1e-6)
    assert_almost_equal(cov, res_stata.cov_pnw1_stata, decimal=4)

    cov = sw.cov_nw_panel(res, 4, tidx)  # check default
    # np.testing.assert_allclose(cov, res_stata.cov_pnw4_stata, rtol=1e-6)
    assert_almost_equal(cov, res_stata.cov_pnw4_stata, decimal=4)

    # cluster robust standard errors
    cov_clu = sw.cov_cluster(res, firm_id)
    assert_almost_equal(cov_clu, res_stata.cov_clu_stata, decimal=4)

    # cluster robust standard errors, non-int groups
    cov_clu = sw.cov_cluster(res, lmap(str, firm_id))
    assert_almost_equal(cov_clu, res_stata.cov_clu_stata, decimal=4)

    # Driscoll and Kraay panel robust standard errors
    rcov = sw.cov_nw_groupsum(res, 0, time, use_correction=0)
    assert_almost_equal(rcov, res_stata.cov_dk0_stata, decimal=4)

    rcov = sw.cov_nw_groupsum(res, 1, time, use_correction=0)
    assert_almost_equal(rcov, res_stata.cov_dk1_stata, decimal=4)

    rcov = sw.cov_nw_groupsum(res, 4, time)  # check default
    assert_almost_equal(rcov, res_stata.cov_dk4_stata, decimal=4)
Example #25
def data2proddummy(x):
    '''creates product dummy variables from 2 columns of 2d array

    drops last dummy variable, but not from each category
    singular with simple dummy variable but not with constant

    quickly written, no safeguards

    '''
    #brute force, assumes x is 2d
    #replace with encoding if possible
    groups = np.unique(lmap(tuple, x.tolist()))
    #includes singularity with additive factors
    return (x == groups[:, None, :]).all(-1).T.astype(int)[:, :-1]
Example #26
def data2proddummy(x):
    '''creates product dummy variables from 2 columns of 2d array

    drops last dummy variable, but not from each category
    singular with simple dummy variable but not with constant

    quickly written, no safeguards

    '''
    #brute force, assumes x is 2d
    #replace with encoding if possible
    groups = np.unique(lmap(tuple, x.tolist()))
    #includes singularity with additive factors
    return (x==groups[:,None,:]).all(-1).T.astype(int)[:,:-1]
Example #27
 def _next(self):
     typlist = self._header["typlist"]
     if self._has_string_data:
         data = [None] * self._header["nvar"]
         for i in range(len(data)):
             if isinstance(typlist[i], int):
                 data[i] = self._null_terminate(self._file.read(typlist[i]), self._encoding)
             else:
                 data[i] = self._unpack(typlist[i], self._file.read(self._col_size(i)))
         return data
     else:
         return lmap(
             lambda i: self._unpack(typlist[i], self._file.read(self._col_size(i))), lrange(self._header["nvar"])
         )
Example #28
 def _next(self):
     typlist = self._header['typlist']
     if self._has_string_data:
         data = [None]*self._header['nvar']
         for i in range(len(data)):
             if isinstance(typlist[i], int):
                 data[i] = self._null_terminate(self._file.read(typlist[i]),
                             self._encoding)
             else:
                 data[i] = self._unpack(typlist[i],
                         self._file.read(self._col_size(i)))
         return data
     else:
         return lmap(lambda i: self._unpack(typlist[i],
             self._file.read(self._col_size(i))),
             lrange(self._header['nvar']))
Example #29
 def variables(self):
     """
     Returns a list of the dataset's StataVariables objects.
     """
     return lmap(
         _StataVariable,
         zip(
             lrange(self._header["nvar"]),
             self._header["typlist"],
             self._header["varlist"],
             self._header["srtlist"],
             self._header["fmtlist"],
             self._header["lbllist"],
             self._header["vlblist"],
         ),
     )
Example #30
def dates_from_str(dates):
    """
    Takes a sequence of abbreviated date strings and returns a list of datetimes.

    Parameters
    ----------
    dates : array-like
        A sequence of abbreviated dates as string. For instance,
        '1996m1' or '1996Q1'. The datetime dates are at the end of the
        period.

    Returns
    -------
    date_list : array
        A list of datetime types.
    """
    return lmap(date_parser, dates)
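
Hedged usage, assuming the module-level date_parser used by statsmodels.tsa.base.datetools is in scope:

# dates_from_str(['1996m1', '1996m2', '1996m3'])
# -> a list of datetime objects, each at the end of its period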
Example #31
def dates_from_str(dates):
    """
    Takes a sequence of abbreviated date strings and returns a list of datetimes.

    Parameters
    ----------
    dates : array_like
        A sequence of abbreviated dates as string. For instance,
        '1996m1' or '1996Q1'. The datetime dates are at the end of the
        period.

    Returns
    -------
    date_list : array
        A list of datetime types.
    """
    return lmap(date_parser, dates)
Example #32
    def print_summary(self, stats, orientation='auto'):
        #TODO: need to specify a table formatting for the numbers, using default
        title = 'Summary Statistics'
        header = stats
        stubs = self.univariate['obs'][1]
        data = [[self.univariate[astat][2][col] for astat in stats]
                for col in range(len(self.univariate['obs'][2]))]

        if (orientation == 'varcols') or \
           (orientation == 'auto' and len(stubs) < len(header)):
            #swap rows and columns
            data = lmap(lambda *row: list(row), *data)
            header, stubs = stubs, header

        part_fmt = dict(data_fmts=["%#8.4g"] * (len(header) - 1))
        table = SimpleTable(data, header, stubs, title=title, txt_fmt=part_fmt)

        return table
Example #33
def test_plot_quarter(close_figures):
    dta = sm.datasets.macrodata.load_pandas().data
    dates = lmap('Q'.join, zip(dta.year.astype(int).apply(str),
                               dta.quarter.astype(int).apply(str)))
    # test dates argument
    quarter_plot(dta.unemp.values, dates)

    # test with a DatetimeIndex with no freq
    dta.set_index(pd.to_datetime(dates), inplace=True)
    quarter_plot(dta.unemp)

    # w freq
    # see pandas #6631
    dta.index = pd.DatetimeIndex(pd.to_datetime(dates), freq='QS-Oct')
    quarter_plot(dta.unemp)

    # w PeriodIndex
    dta.index = pd.PeriodIndex(pd.to_datetime(dates), freq='Q')
    quarter_plot(dta.unemp)
Example #34
def test_plot_quarter(close_figures):
    dta = sm.datasets.macrodata.load_pandas().data
    dates = lmap('Q'.join, zip(dta.year.astype(int).apply(str),
                               dta.quarter.astype(int).apply(str)))
    # test dates argument
    quarter_plot(dta.unemp.values, dates)

    # test with a DatetimeIndex with no freq
    dta.set_index(pd.to_datetime(dates), inplace=True)
    quarter_plot(dta.unemp)

    # w freq
    # see pandas #6631
    dta.index = pd.DatetimeIndex(pd.to_datetime(dates), freq='QS-Oct')
    quarter_plot(dta.unemp)

    # w PeriodIndex
    dta.index = pd.PeriodIndex(pd.to_datetime(dates), freq='Q')
    quarter_plot(dta.unemp)
Example #35
def summary_return(tables, return_fmt='text'):
    # join table parts then print
    if return_fmt == 'text':
        strdrop = lambda x: str(x).rsplit('\n', 1)[0]
        # convert to string drop last line
        return '\n'.join(lmap(strdrop, tables[:-1]) + [str(tables[-1])])
    elif return_fmt == 'tables':
        return tables
    elif return_fmt == 'csv':
        return '\n'.join(x.as_csv() for x in tables)
    elif return_fmt == 'latex':
        # TODO: insert \hline after updating SimpleTable
        table = copy.deepcopy(tables[0])
        for part in tables[1:]:
            table.extend(part)
        return table.as_latex_tabular()
    elif return_fmt == 'html':
        return "\n".join(table.as_html() for table in tables)
    else:
        raise ValueError('available output formats are text, csv, latex, html')
Example #36
 def _get_colwidths(self, output_format, **fmt_dict):
     """Return list, the calculated widths of each column."""
     output_format = get_output_format(output_format)
     fmt = self.output_formats[output_format].copy()
     fmt.update(fmt_dict)
     ncols = max(len(row) for row in self)
     request = fmt.get('colwidths')
      if request == 0:  # assume no extra space desired (e.g., CSV)
          return [0] * ncols
      elif request is None:  # assume no extra space desired (e.g., CSV)
         request = [0] * ncols
     elif isinstance(request, (int, long)):
         request = [request] * ncols
     elif len(request) < ncols:
         request = [request[i % len(request)] for i in range(ncols)]
     min_widths = []
     for col in zip(*self):
         maxwidth = max(len(c.format(0, output_format, **fmt)) for c in col)
         min_widths.append(maxwidth)
     result = lmap(max, min_widths, request)
     return result
Example #37
 def _get_colwidths(self, output_format, **fmt_dict):
     """Return list, the calculated widths of each column."""
     output_format = get_output_format(output_format)
     fmt = self.output_formats[output_format].copy()
     fmt.update(fmt_dict)
     ncols = max(len(row) for row in self)
     request = fmt.get('colwidths')
      if request == 0:  # assume no extra space desired (e.g., CSV)
          return [0] * ncols
      elif request is None:  # assume no extra space desired (e.g., CSV)
         request = [0] * ncols
     elif isinstance(request, (int, long)):
         request = [request] * ncols
     elif len(request) < ncols:
         request = [request[i % len(request)] for i in range(ncols)]
     min_widths = []
     for col in zip(*self):
         maxwidth = max(len(c.format(0, output_format, **fmt)) for c in col)
         min_widths.append(maxwidth)
     result = lmap(max, min_widths, request)
     return result
Example #38
def summary_return(tables, return_fmt='text'):
    # join table parts then print
    if return_fmt == 'text':
        strdrop = lambda x: str(x).rsplit('\n',1)[0]
        # convert to string drop last line
        return '\n'.join(lmap(strdrop, tables[:-1]) + [str(tables[-1])])
    elif return_fmt == 'tables':
        return tables
    elif return_fmt == 'csv':
        return '\n'.join(x.as_csv() for x in tables)
    elif return_fmt == 'latex':
        # TODO: insert \hline after updating SimpleTable
        table = copy.deepcopy(tables[0])
        del table[-1]
        for part in tables[1:]:
            table.extend(part)
        return table.as_latex_tabular()
    elif return_fmt == 'html':
        return "\n".join(table.as_html() for table in tables)
    else:
        raise ValueError('available output formats are text, csv, latex, html')
Example #39
    def print_summary(self, stats, orientation='auto'):
        #TODO: need to specify a table formatting for the numbers, using default
        title = 'Summary Statistics'
        header = stats
        stubs = self.univariate['obs'][1]
        data = [[self.univariate[astat][2][col] for astat in stats] for col in
                                range(len(self.univariate['obs'][2]))]

        if (orientation == 'varcols') or \
           (orientation == 'auto' and len(stubs) < len(header)):
            #swap rows and columns
            data = lmap(lambda *row: list(row), *data)
            header, stubs = stubs, header

        part_fmt = dict(data_fmts = ["%#8.4g"]*(len(header)-1))
        table = SimpleTable(data,
                            header,
                            stubs,
                            title=title,
                            txt_fmt = part_fmt)

        return table
Example #40
    import scikits.timeseries as ts
    d1 = ts.Date(year=1700, freq='A')
    #NOTE: have to have yearBegin offset for annual data until parser rewrite
    #should this be up to the user, or should it be done in TSM init?
    #NOTE: not anymore, it's end of year now
    ts_dr = ts.date_array(start_date=d1, length=len(sunspots.endog))
    pandas_dr = pandas.DateRange(start=d1.datetime,
                                 periods=len(sunspots.endog),
                                 timeRule='A@DEC')
    #pandas_dr = pandas_dr.shift(-1, pandas.datetools.yearBegin)

    dates = np.arange(1700, 1700 + len(sunspots.endog))
    dates = ts.date_array(dates, freq='A')
    #sunspots = pandas.TimeSeries(sunspots.endog, index=dates)

    #NOTE: pandas only does business days for dates it looks like
    import datetime
    dt_dates = np.asarray(
        lmap(datetime.datetime.fromordinal,
             ts_dr.toordinal().astype(int)))
    sunspots = pandas.TimeSeries(sunspots.endog, index=dt_dates)

    #NOTE: pandas can't handle pre-1900 dates
    mod = AR(sunspots, freq='A')
    res = mod.fit(method='mle', maxlag=9)

    # some data for an example in Box Jenkins
    IBM = np.asarray([460, 457, 452, 459, 462, 459, 463, 479, 493, 490.])
    w = np.diff(IBM)
    theta = .5
Example #41
def categorical(data, col=None, dictnames=False, drop=False, ):
    '''
    Returns a dummy matrix given an array of categorical variables.

    Parameters
    ----------
    data : array
        A structured array, recarray, or array.  This can be either
        a 1d vector of the categorical variable or a 2d array with
        the column specifying the categorical variable specified by the col
        argument.
    col : str, int, or None
        If data is a structured array or a recarray, `col` can be a string
        that is the name of the column that contains the variable.  For all
        arrays `col` can be an int that is the (zero-based) column index
        number.  `col` can only be None for a 1d array.  The default is None.
    dictnames : bool, optional
        If True, a dictionary mapping the column number to the categorical
        name is returned.  Used to have information about plain arrays.
    drop : bool
        Whether or not to keep the categorical variable in the returned matrix.

    Returns
    -------
    dummy_matrix, [dictnames, optional]
        A matrix of dummy (indicator/binary) float variables for the
        categorical data.  If dictnames is True, then the dictionary
        is returned as well.

    Notes
    -----
    This returns a dummy variable for every distinct category.  If a
    structured array or recarray is provided, each new variable is named
    old variable name - underscore - category name.  So if a variable
    'vote' had answers 'yes' or 'no', the returned array would have two
    new variables, 'vote_yes' and 'vote_no'.  There is currently
    no name checking.

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm

    Univariate examples

    >>> import string
    >>> string_var = [string.lowercase[0:5], string.lowercase[5:10],   \
                string.lowercase[10:15], string.lowercase[15:20],   \
                string.lowercase[20:25]]
    >>> string_var *= 5
    >>> string_var = np.asarray(sorted(string_var))
    >>> design = sm.tools.categorical(string_var, drop=True)

    Or for a numerical categorical variable

    >>> instr = np.floor(np.arange(10,60, step=2)/10)
    >>> design = sm.tools.categorical(instr, drop=True)

    With a structured array

    >>> num = np.random.randn(25,2)
    >>> struct_ar = np.zeros((25,1), dtype=[('var1', 'f4'),('var2', 'f4'),  \
                    ('instrument','f4'),('str_instr','a5')])
    >>> struct_ar['var1'] = num[:,0][:,None]
    >>> struct_ar['var2'] = num[:,1][:,None]
    >>> struct_ar['instrument'] = instr[:,None]
    >>> struct_ar['str_instr'] = string_var[:,None]
    >>> design = sm.tools.categorical(struct_ar, col='instrument', drop=True)

    Or

    >>> design2 = sm.tools.categorical(struct_ar, col='str_instr', drop=True)
    '''
    if isinstance(col, (list, tuple)):
        try:
            assert len(col) == 1
            col = col[0]
        except:
            raise ValueError("Can only convert one column at a time")

    # TODO: add a NameValidator function
    # catch recarrays and structured arrays
    if data.dtype.names or data.__class__ is np.recarray:
        if not col and np.squeeze(data).ndim > 1:
            raise IndexError("col is None and the input array is not 1d")
        if isinstance(col, int):
            col = data.dtype.names[col]
        if col is None and data.dtype.names and len(data.dtype.names) == 1:
            col = data.dtype.names[0]

        tmp_arr = np.unique(data[col])

        # if the cols are shape (#,) vs (#,1) need to add an axis and flip
        _swap = True
        if data[col].ndim == 1:
            tmp_arr = tmp_arr[:, None]
            _swap = False
        tmp_dummy = (tmp_arr == data[col]).astype(float)
        if _swap:
            tmp_dummy = np.squeeze(tmp_dummy).swapaxes(1, 0)

        if not tmp_arr.dtype.names:  # how do we get to this code path?
            tmp_arr = [asstr2(item) for item in np.squeeze(tmp_arr)]
        elif tmp_arr.dtype.names:
            tmp_arr = [asstr2(item) for item in np.squeeze(tmp_arr.tolist())]

        # prepend the varname and underscore, if col is numeric attribute
        # lookup is lost for recarrays...
        if col is None:
            try:
                col = data.dtype.names[0]
            except:
                col = 'var'
        # TODO: the above needs to be made robust because there could be many
        # var_yes, var_no variables for instance.
        tmp_arr = [col + '_' + item for item in tmp_arr]
        # TODO: test this for rec and structured arrays!!!

        if drop is True:
            if len(data.dtype) <= 1:
                if tmp_dummy.shape[0] < tmp_dummy.shape[1]:
                    tmp_dummy = np.squeeze(tmp_dummy).swapaxes(1, 0)
                dt = lzip(tmp_arr, [tmp_dummy.dtype.str]*len(tmp_arr))
                # preserve array type
                return np.array(lmap(tuple, tmp_dummy.tolist()),
                                dtype=dt).view(type(data))

            data = nprf.drop_fields(data, col, usemask=False,
                                    asrecarray=type(data) is np.recarray)
        data = nprf.append_fields(data, tmp_arr, data=tmp_dummy,
                                  usemask=False,
                                  asrecarray=type(data) is np.recarray)
        return data

    # handle ndarrays and catch array-like for an error
    elif data.__class__ is np.ndarray or not isinstance(data, np.ndarray):
        if not isinstance(data, np.ndarray):
            raise NotImplementedError("Array-like objects are not supported")

        if isinstance(col, int):
            offset = data.shape[1]          # need error catching here?
            tmp_arr = np.unique(data[:, col])
            tmp_dummy = (tmp_arr[:, np.newaxis] == data[:, col]).astype(float)
            tmp_dummy = tmp_dummy.swapaxes(1, 0)
            if drop is True:
                offset -= 1
                data = np.delete(data, col, axis=1).astype(float)
            data = np.column_stack((data, tmp_dummy))
            if dictnames is True:
                col_map = _make_dictnames(tmp_arr, offset)
                return data, col_map
            return data
        elif col is None and np.squeeze(data).ndim == 1:
            tmp_arr = np.unique(data)
            tmp_dummy = (tmp_arr[:, None] == data).astype(float)
            tmp_dummy = tmp_dummy.swapaxes(1, 0)
            if drop is True:
                if dictnames is True:
                    col_map = _make_dictnames(tmp_arr)
                    return tmp_dummy, col_map
                return tmp_dummy
            else:
                data = np.column_stack((data, tmp_dummy))
                if dictnames is True:
                    col_map = _make_dictnames(tmp_arr, offset=1)
                    return data, col_map
                return data
        else:
            raise IndexError("The index %s is not understood" % col)
Example #42
def lstsq(a, b, cond=None, overwrite_a=0, overwrite_b=0):
    """Compute least-squares solution to equation :m:`a x = b`

    Compute a vector x such that the 2-norm :m:`|b - a x|` is minimised.

    Parameters
    ----------
    a : array, shape (M, N)
    b : array, shape (M,) or (M, K)
    cond : float
        Cutoff for 'small' singular values; used to determine effective
        rank of a. Singular values smaller than rcond*largest_singular_value
        are considered zero.
    overwrite_a : boolean
        Discard data in a (may enhance performance)
    overwrite_b : boolean
        Discard data in b (may enhance performance)

    Returns
    -------
    x : array, shape (N,) or (N, K) depending on shape of b
        Least-squares solution
    residues : array, shape () or (1,) or (K,)
        Sums of residues, squared 2-norm for each column in :m:`b - a x`
        If rank of matrix a is < N or > M this is an empty array.
        If b was 1-d, this is an (1,) shape array, otherwise the shape is (K,)
    rank : integer
        Effective rank of matrix a
    s : array, shape (min(M,N),)
        Singular values of a. The condition number of a is abs(s[0]/s[-1]).

    Raises LinAlgError if computation does not converge

    """
    a1, b1 = lmap(asarray_chkfinite, (a, b))
    if a1.ndim != 2:
        raise ValueError('expected matrix')
    m, n = a1.shape
    if b1.ndim == 2:
        nrhs = b1.shape[1]
    else:
        nrhs = 1
    if m != b1.shape[0]:
        raise ValueError('incompatible dimensions')
    gelss, = get_lapack_funcs(('gelss',), (a1, b1))
    if n > m:
        # need to extend b matrix as it will be filled with
        # a larger solution matrix
        b2 = zeros((n, nrhs), dtype=gelss.dtype)
        if b1.ndim == 2:
            b2[:m, :] = b1
        else:
            b2[:m, 0] = b1
        b1 = b2
    overwrite_a = overwrite_a or (a1 is not a and not hasattr(a, '__array__'))
    overwrite_b = overwrite_b or (b1 is not b and not hasattr(b, '__array__'))
    if gelss.module_name[:7] == 'flapack':
        lwork = calc_lwork.gelss(gelss.prefix, m, n, nrhs)[1]
        v, x, s, rank, info = gelss(a1, b1, cond=cond, lwork=lwork,
                                    overwrite_a=overwrite_a,
                                    overwrite_b=overwrite_b)
    else:
        raise NotImplementedError('calling gelss from %s' %
                                  gelss.module_name)
    if info > 0:
        raise LinAlgError("SVD did not converge in Linear Least Squares")
    if info < 0:
        raise ValueError('illegal value in %d-th argument of '
                         'internal gelss' % -info)
    resids = asarray([], dtype=x.dtype)
    if n < m:
        x1 = x[:n]
        if rank == n:
            resids = sum(x[n:]**2, axis=0)
        x = x1
    return x, resids, rank, s
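
A minimal cross-check sketch, assuming the old-scipy LAPACK plumbing above (get_lapack_funcs, calc_lwork) is importable; the solution should match numpy's least-squares solver.

import numpy as np

A = np.array([[1.0, 1.0], [1.0, 2.0], [1.0, 3.0]])
b = np.array([1.0, 2.0, 2.0])
x, resids, rank, s = lstsq(A, b)
x_np, *_ = np.linalg.lstsq(A, b, rcond=None)
print(np.allclose(x, x_np))  # True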
Example #43
                     prepend=True)  # for R comparison
    #prepend=False)  # for Stata comparison

    Z = add_constant(griliches76_data[['expr', 'tenure', 'rns', 'smsa', \
                                       'D_67', 'D_68', 'D_69', 'D_70', 'D_71',
                                       'D_73', 'med', 'kww', 'age', 'mrt']])
    Y = griliches76_data['lw']

    return Y, X, Z


# use module global to load only once
yg_df, xg_df, zg_df = get_griliches76_data()

endog = np.asarray(yg_df, dtype=float)  # TODO: why is yg_df float32
exog, instrument = lmap(np.asarray, [xg_df, zg_df])

assert exog.dtype == np.float64
assert instrument.dtype == np.float64

# from R
#-----------------
varnames = np.array([
    "(Intercept)", "s", "iq", "expr", "tenure", "rns", "smsa", "D_67", "D_68",
    "D_69", "D_70", "D_71", "D_73"
])
params = np.array([
    4.03350989, 0.17242531, -0.00909883, 0.04928949, 0.04221709, -0.10179345,
    0.12611095, -0.05961711, 0.04867956, 0.15281763, 0.17443605, 0.09166597,
    0.09323977
])
Example #44
def genfromdta(fname,
               missing_flt=-999.,
               encoding=None,
               pandas=False,
               convert_dates=True):
    """
    Returns an ndarray or DataFrame from a Stata .dta file.

    Parameters
    ----------
    fname : str or filehandle
        Stata .dta file.
    missing_flt : numeric
        The numeric value to replace missing values with. Will be used for
        any numeric value.
    encoding : string, optional
        Used for Python 3 only. Encoding to use when reading the .dta file.
        Defaults to `locale.getpreferredencoding`
    pandas : bool
        Optionally return a DataFrame instead of an ndarray
    convert_dates : bool
        If convert_dates is True, then Stata formatted dates will be converted
        to datetime types according to the variable's format.
    """
    warnings.warn(
        "genfromdta is deprecated as of 0.10.0 and will be removed in a "
        "future version.  Use pandas.read_stata instead.", FutureWarning)

    if isinstance(fname, string_types):
        fhd = StataReader(open(fname, 'rb'),
                          missing_values=False,
                          encoding=encoding)
    elif not hasattr(fname, 'read'):
        raise TypeError("The input should be a string or a filehandle. "\
                        "(got %s instead)" % type(fname))
    else:
        fhd = StataReader(fname, missing_values=False, encoding=encoding)


#    validate_names = np.lib._iotools.NameValidator(excludelist=excludelist,
#                                    deletechars=deletechars,
#                                    case_sensitive=case_sensitive)

#TODO: This needs to handle the byteorder?
    header = fhd.file_headers()
    types = header['dtyplist']
    nobs = header['nobs']
    numvars = header['nvar']
    varnames = header['varlist']
    fmtlist = header['fmtlist']
    dataname = header['data_label']
    labels = header['vlblist']  # labels are thrown away unless DataArray
    # type is used
    data = np.zeros((nobs, numvars))
    stata_dta = fhd.dataset()

    dt = np.dtype(lzip(varnames, types))
    data = np.zeros((nobs), dtype=dt)  # init final array

    for rownum, line in enumerate(stata_dta):
        # doesn't handle missing value objects, just casts
        # None will only work without missing value object.
        if None in line:
            for i, val in enumerate(line):
                #NOTE: This will only be scalar types because missing strings
                # are empty not None in Stata
                if val is None:
                    line[i] = missing_flt
        data[rownum] = tuple(line)

    if pandas:
        from pandas import DataFrame
        data = DataFrame.from_records(data)
        if convert_dates:
            cols = np.where(lmap(lambda x: x in _date_formats, fmtlist))[0]
            for col in cols:
                i = col
                col = data.columns[col]
                data[col] = data[col].apply(_stata_elapsed_date_to_datetime,
                                            args=(fmtlist[i], ))
    elif convert_dates:
        #date_cols = np.where(map(lambda x : x in _date_formats,
        #                                                    fmtlist))[0]
        # make the dtype for the datetime types
        cols = np.where(lmap(lambda x: x in _date_formats, fmtlist))[0]
        dtype = data.dtype.descr
        dtype = [(sub_dtype[0], object) if i in cols else sub_dtype
                 for i, sub_dtype in enumerate(dtype)]
        data = data.astype(dtype)  # have to copy
        for col in cols:

            def convert(x):
                return _stata_elapsed_date_to_datetime(x, fmtlist[col])

            data[data.dtype.names[col]] = lmap(convert,
                                               data[data.dtype.names[col]])
    return data
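
Hypothetical usage; the file name is a placeholder, and the function itself points users to pandas.read_stata as the replacement.

# arr = genfromdta('example.dta')              # structured ndarray
# df = genfromdta('example.dta', pandas=True)  # DataFrame, dates converted
# df2 = pd.read_stata('example.dta')           # the recommended replacement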
Example #45
    print(shannonentropy(Y))

    p = [1e-5,1e-4,.001,.01,.1,.15,.2,.25,.3,.35,.4,.45,.5]

    plt.subplot(111)
    plt.ylabel("Information")
    plt.xlabel("Probability")
    x = np.linspace(0,1,100001)
    plt.plot(x, shannoninfo(x))
#    plt.show()

    plt.subplot(111)
    plt.ylabel("Entropy")
    plt.xlabel("Probability")
    x = np.linspace(0,1,101)
    plt.plot(x, lmap(shannonentropy, lzip(x,1-x)))
#    plt.show()

    # define a joint probability distribution
    # from Golan (2008) table 3.3
    w = np.array([[0,0,1./3],[1/9.,1/9.,1/9.],[1/18.,1/9.,1/6.]])
    # table 3.4
    px = w.sum(0)
    py = w.sum(1)
    H_X = shannonentropy(px)
    H_Y = shannonentropy(py)
    H_XY = shannonentropy(w)
    H_XgivenY = condentropy(px,py,w)
    H_YgivenX = condentropy(py,px,w)
# note that cross-entropy is not a distance measure as the following shows
    D_YX = logbasechange(2,np.e)*stats.entropy(px, py)
Example #46
def seasonal_decompose(x, model="additive", filt=None, freq=None):
    """
    Parameters
    ----------
    x : array-like
        Time series
    model : str {"additive", "multiplicative"}
        Type of seasonal component. Abbreviations are accepted.
    filt : array-like
        The filter coefficients for filtering out the seasonal component.
        The default is a symmetric moving average.
    freq : int, optional
        Frequency of the series. Must be used if x is not a pandas
        object with a timeseries index.

    Returns
    -------
    results : obj
        An object with seasonal, trend, and resid attributes.

    Notes
    -----
    This is a naive decomposition. More sophisticated methods should
    be preferred.

    The additive model is Y[t] = T[t] + S[t] + e[t]

    The multiplicative model is Y[t] = T[t] * S[t] * e[t]

    The seasonal component is first removed by applying a convolution
    filter to the data. The average of this smoothed series for each
    period is the returned seasonal component.

    See Also
    --------
    statsmodels.tsa.filters.convolution_filter
    """
    _pandas_wrapper, pfreq = _maybe_get_pandas_wrapper_freq(x)
    x = np.asanyarray(x).squeeze()
    nobs = len(x)

    if not np.all(np.isfinite(x)):
        raise ValueError("This function does not handle missing values")
    if model.startswith('m'):
        if np.any(x <= 0):
            raise ValueError("Multiplicative seasonality is not appropriate "
                             "for zero and negative values")

    if pfreq is not None:
        pfreq = freq_to_period(pfreq)
        if freq and pfreq != freq:
            raise ValueError("Inferred frequency of index and frequency "
                             "don't match. This function does not re-sample")
        else:
            freq = pfreq

    elif freq is None:
        raise ValueError("You must specify a freq or x must be a "
                         "pandas object with a timeseries index")


    if filt is None:
        if freq % 2 == 0:  # split weights at ends
            filt = np.array([.5] + [1] * (freq - 1) + [.5]) / freq
        else:
            filt = np.repeat(1./freq, freq)

    trend = convolution_filter(x, filt)

    # nan pad for conformability - convolve doesn't do it
    if model.startswith('m'):
        detrended = x / trend
    else:
        detrended = x - trend

    period_averages = seasonal_mean(detrended, freq)

    if model.startswith('m'):
        period_averages /= np.mean(period_averages)
    else:
        period_averages -= np.mean(period_averages)

    seasonal = np.tile(period_averages, nobs // freq + 1)[:nobs]

    if model.startswith('m'):
        resid = x / seasonal / trend
    else:
        resid = detrended - seasonal

    results = lmap(_pandas_wrapper, [seasonal, trend, resid, x])
    return DecomposeResult(seasonal=results[0], trend=results[1],
                           resid=results[2], observed=results[3])
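# A quick numeric check of the default filter built above: for an even
# period the half weights at both ends keep the moving average centered,
# and in both cases the weights sum to 1.
import numpy as np
for freq in (4, 5):
    if freq % 2 == 0:  # split weights at ends
        filt = np.array([.5] + [1] * (freq - 1) + [.5]) / freq
    else:
        filt = np.repeat(1. / freq, freq)
    print(freq, filt, filt.sum())  # filt.sum() == 1.0 in both cases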
Example #47
0
def seasonal_decompose(x,
                       model="additive",
                       filt=None,
                       freq=None,
                       two_sided=True,
                       extrapolate_trend=0):
    """
    Seasonal decomposition using moving averages

    Parameters
    ----------
    x : array-like
        Time series. If 2d, individual series are in columns.
    model : str {"additive", "multiplicative"}
        Type of seasonal component. Abbreviations are accepted.
    filt : array-like
        The filter coefficients for filtering out the seasonal component.
        The concrete moving average method used in filtering is determined by two_sided.
    freq : int, optional
        Frequency of the series. Must be used if x is not a pandas object.
        Overrides default periodicity of x if x is a pandas
        object with a timeseries index.
    two_sided : bool
        The moving average method used in filtering.
        If True (default), a centered moving average is computed using the filt.
        If False, the filter coefficients are for past values only.
    extrapolate_trend : int or 'freq', optional
        If set to > 0, the trend resulting from the convolution is
        linear least-squares extrapolated on both ends (or the single one
        if two_sided is False) considering this many (+1) closest points.
        If set to 'freq', use `freq` closest points. Setting this parameter
        results in no NaN values in trend or resid components.

    Returns
    -------
    results : obj
        An object with seasonal, trend, and resid attributes.

    Notes
    -----
    This is a naive decomposition. More sophisticated methods should
    be preferred.

    The additive model is Y[t] = T[t] + S[t] + e[t]

    The multiplicative model is Y[t] = T[t] * S[t] * e[t]

    The seasonal component is first removed by applying a convolution
    filter to the data. The average of this smoothed series for each
    period is the returned seasonal component.

    See Also
    --------
    statsmodels.tsa.filters.bk_filter.bkfilter
    statsmodels.tsa.filters.cf_filter.cffilter
    statsmodels.tsa.filters.hp_filter.hpfilter
    statsmodels.tsa.filters.convolution_filter
    """
    if freq is None:
        _pandas_wrapper, pfreq = _maybe_get_pandas_wrapper_freq(x)
    else:
        _pandas_wrapper = _maybe_get_pandas_wrapper(x)
        pfreq = None
    x = np.asanyarray(x).squeeze()
    nobs = len(x)

    if not np.all(np.isfinite(x)):
        raise ValueError("This function does not handle missing values")
    if model.startswith('m'):
        if np.any(x <= 0):
            raise ValueError("Multiplicative seasonality is not appropriate "
                             "for zero and negative values")

    if freq is None:
        if pfreq is not None:
            pfreq = freq_to_period(pfreq)
            freq = pfreq
        else:
            raise ValueError("You must specify a freq or x must be a "
                             "pandas object with a timeseries index with "
                             "a freq not set to None")

    if filt is None:
        if freq % 2 == 0:  # split weights at ends
            filt = np.array([.5] + [1] * (freq - 1) + [.5]) / freq
        else:
            filt = np.repeat(1. / freq, freq)

    nsides = int(two_sided) + 1
    trend = convolution_filter(x, filt, nsides)

    if extrapolate_trend == 'freq':
        extrapolate_trend = freq - 1

    if extrapolate_trend > 0:
        trend = _extrapolate_trend(trend, extrapolate_trend + 1)

    if model.startswith('m'):
        detrended = x / trend
    else:
        detrended = x - trend

    period_averages = seasonal_mean(detrended, freq)

    if model.startswith('m'):
        period_averages /= np.mean(period_averages, axis=0)
    else:
        period_averages -= np.mean(period_averages, axis=0)

    seasonal = np.tile(period_averages.T, nobs // freq + 1).T[:nobs]

    if model.startswith('m'):
        resid = x / seasonal / trend
    else:
        resid = detrended - seasonal

    results = lmap(_pandas_wrapper, [seasonal, trend, resid, x])
    return DecomposeResult(seasonal=results[0],
                           trend=results[1],
                           resid=results[2],
                           observed=results[3])
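# Usage sketch for the function defined above on a synthetic monthly series;
# the data construction is illustrative. With extrapolate_trend='freq' the
# trend, and hence the resid, has no NaN values at the boundaries.
import numpy as np
import pandas as pd
idx = pd.date_range('2000-01-31', periods=48, freq='M')
y = pd.Series(10 + 0.05 * np.arange(48)
              + np.sin(2 * np.pi * np.arange(48) / 12), index=idx)
res = seasonal_decompose(y, model='additive', freq=12,
                         extrapolate_trend='freq')
print(int(res.trend.isnull().sum()))  # 0: trend extrapolated at both ends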
Example #48
0
    ## Try with a pandas series
    import pandas
    import scikits.timeseries as ts
    d1 = ts.Date(year=1700, freq='A')
    #NOTE: have to have yearBegin offset for annual data until parser rewrite
    #should this be up to the user, or should it be done in TSM init?
    #NOTE: not anymore, it's end of year now
    ts_dr = ts.date_array(start_date=d1, length=len(sunspots.endog))
    pandas_dr = pandas.DateRange(start=d1.datetime,
                                 periods=len(sunspots.endog), timeRule='A@DEC')
    #pandas_dr = pandas_dr.shift(-1, pandas.datetools.yearBegin)

    dates = np.arange(1700, 1700 + len(sunspots.endog))
    dates = ts.date_array(dates, freq='A')
    #sunspots = pandas.Series(sunspots.endog, index=dates)

    #NOTE: it looks like pandas only does business days for dates
    import datetime
    dt_dates = np.asarray(lmap(datetime.datetime.fromordinal,
                              ts_dr.toordinal().astype(int)))
    sunspots = pandas.Series(sunspots.endog, index=dt_dates)

    #NOTE: pandas can't handle pre-1900 dates
    mod = AR(sunspots, freq='A')
    res = mod.fit(method='mle', maxlag=9)

# some data for an example in Box Jenkins
    IBM = np.asarray([460, 457, 452, 459, 462, 459, 463, 479, 493, 490.])
    w = np.diff(IBM)
    theta = .5
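    # The script above targets long-gone APIs (scikits.timeseries and
    # pandas.DateRange). A rough modern-pandas equivalent for the annual
    # index, assuming only a 1d array of annual observations from 1700 on:
    import numpy as np
    import pandas as pd
    endog = np.random.randn(309)  # stand-in for sunspots.endog
    idx = pd.date_range(start='1700', periods=len(endog), freq='A-DEC')
    series = pd.Series(endog, index=idx)  # pre-1900 dates are fine here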
Example #49
0
#for Python 3 compatibility

# Seemingly Unrelated Regressions (SUR) Model

# This example uses the subset of the Grunfeld data in Greene's Econometric
# Analysis Chapter 14 (5th Edition)

grun_data = sm.datasets.grunfeld.load()

firms = [
    'General Motors', 'Chrysler', 'General Electric', 'Westinghouse',
    'US Steel'
]
#for Python 3 compatibility
firms = lmap(asbytes, firms)

grun_exog = grun_data.exog
grun_endog = grun_data.endog

# Right now SUR takes a list of arrays
# The list alternates between the LHS and the RHS of each equation
# This is very likely to change
grun_sys = []
for i in firms:
    index = grun_exog['firm'] == i
    grun_sys.append(grun_endog[index])
    exog = grun_exog[index][['value', 'capital']].view(float).reshape(-1, 2)
    exog = sm.add_constant(exog, prepend=True)
    grun_sys.append(exog)
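# A sketch of actually fitting the assembled system, assuming SUR has been
# imported from statsmodels.sandbox.sysreg and accepts the alternating
# [endog, exog, endog, exog, ...] list described above; the sandbox API is
# experimental and may change.
grun_mod = SUR(grun_sys)
grun_res = grun_mod.fit()
print(grun_res.params)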
Example #50
0
def anova_lm(*args, **kwargs):
    """
    ANOVA table for one or more fitted linear models.

    Parameters
    ----------
    args : fitted linear model results instance
        One or more fitted linear models
    scale : float
        Estimate of variance. If None, it will be estimated from the largest
        model. Default is None.
    test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".
    typ : str or int {"I","II","III"} or {1,2,3}
        The type of ANOVA test to perform. See notes.
    robust : {None, "hc0", "hc1", "hc2", "hc3"}
        Use heteroscedasticity-corrected coefficient covariance matrix.
        If robust covariance is desired, it is recommended to use `hc3`.

    Returns
    -------
    anova : DataFrame
        A DataFrame containing the ANOVA table.

    Notes
    -----
    Model statistics are given in the order of args. Models must have
    been fit using the formula api.

    See Also
    --------
    model_results.compare_f_test, model_results.compare_lm_test

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> from statsmodels.formula.api import ols

    >>> moore = sm.datasets.get_rdataset("Moore", "car",
    ...                                  cache=True) # load data
    >>> data = moore.data
    >>> data = data.rename(columns={"partner.status" :
    ...                             "partner_status"}) # make name pythonic
    >>> moore_lm = ols('conformity ~ C(fcategory, Sum)*C(partner_status, Sum)',
    ...                 data=data).fit()

    >>> table = sm.stats.anova_lm(moore_lm, typ=2) # Type 2 ANOVA DataFrame
    >>> print(table)
    """
    typ = kwargs.get('typ', 1)

    ### Farm Out Single model ANOVA Type I, II, III, and IV ###

    if len(args) == 1:
        model = args[0]
        return anova_single(model, **kwargs)

    if typ not in [1, "I"]:
        raise ValueError("Multiple models only supported for type I. "
                         "Got type %s" % str(typ))

    ### COMPUTE ANOVA TYPE I ###

    # received multiple fitted models

    test = kwargs.get("test", "F")
    scale = kwargs.get("scale", None)
    n_models = len(args)

    model_formula = []
    pr_test = "Pr(>%s)" % test
    names = ['df_resid', 'ssr', 'df_diff', 'ss_diff', test, pr_test]
    table = DataFrame(np.zeros((n_models, 6)), columns=names)

    if not scale: # assume biggest model is last
        scale = args[-1].scale

    table["ssr"] = lmap(getattr, args, ["ssr"]*n_models)
    table["df_resid"] = lmap(getattr, args, ["df_resid"]*n_models)
    table.loc[table.index[1:], "df_diff"] = -np.diff(table["df_resid"].values)
    table["ss_diff"] = -table["ssr"].diff()
    if test == "F":
        table["F"] = table["ss_diff"] / table["df_diff"] / scale
        table[pr_test] = stats.f.sf(table["F"], table["df_diff"],
                             table["df_resid"])
        # for earlier scipy - stats.f.sf(np.nan, 10, 2) -> 0 not nan
        table[pr_test][table['F'].isnull()] = np.nan

    return table
Example #51
0
def anova_lm(*args, **kwargs):
    """
    Anova table for one or more fitted linear models.

    Parameters
    ----------
    args : fitted linear model results instance
        One or more fitted linear models
    scale : float
        Estimate of variance. If None, it will be estimated from the largest
        model. Default is None.
    test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".
    typ : str or int {"I","II","III"} or {1,2,3}
        The type of Anova test to perform. See notes.
    robust : {None, "hc0", "hc1", "hc2", "hc3"}
        Use heteroscedasticity-corrected coefficient covariance matrix.
        If robust covariance is desired, it is recommended to use `hc3`.

    Returns
    -------
    anova : DataFrame
        A DataFrame containing the ANOVA table.

    Notes
    -----
    Model statistics are given in the order of args. Models must have
    been fit using the formula api.

    See Also
    --------
    model_results.compare_f_test, model_results.compare_lm_test

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> from statsmodels.formula.api import ols
    >>> moore = sm.datasets.get_rdataset("Moore", "carData", cache=True) # load
    >>> data = moore.data
    >>> data = data.rename(columns={"partner.status" :
    ...                             "partner_status"}) # make name pythonic
    >>> moore_lm = ols('conformity ~ C(fcategory, Sum)*C(partner_status, Sum)',
    ...                 data=data).fit()
    >>> table = sm.stats.anova_lm(moore_lm, typ=2) # Type 2 Anova DataFrame
    >>> print(table)
    """
    typ = kwargs.get('typ', 1)

    ### Farm Out Single model Anova Type I, II, III, and IV ###

    if len(args) == 1:
        model = args[0]
        return anova_single(model, **kwargs)

    if typ not in [1, "I"]:
        raise ValueError("Multiple models only supported for type I. "
                         "Got type %s" % str(typ))

    test = kwargs.get("test", "F")
    scale = kwargs.get("scale", None)
    n_models = len(args)
    pr_test = "Pr(>%s)" % test
    names = ['df_resid', 'ssr', 'df_diff', 'ss_diff', test, pr_test]
    table = DataFrame(np.zeros((n_models, 6)), columns=names)

    if not scale:  # assume biggest model is last
        scale = args[-1].scale

    table["ssr"] = lmap(getattr, args, ["ssr"] * n_models)
    table["df_resid"] = lmap(getattr, args, ["df_resid"] * n_models)
    table.loc[table.index[1:], "df_diff"] = -np.diff(table["df_resid"].values)
    table["ss_diff"] = -table["ssr"].diff()
    if test == "F":
        table["F"] = table["ss_diff"] / table["df_diff"] / scale
        table[pr_test] = stats.f.sf(table["F"], table["df_diff"],
                                    table["df_resid"])
        # for earlier scipy - stats.f.sf(np.nan, 10, 2) -> 0 not nan
        table.loc[table['F'].isnull(), pr_test] = np.nan

    return table
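# A sketch of the multi-model path implemented above: a type-I F test
# between nested OLS fits. The data are synthetic and illustrative.
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
rng = np.random.RandomState(0)
df = pd.DataFrame({'x1': rng.randn(100), 'x2': rng.randn(100)})
df['y'] = 1 + 2 * df['x1'] + rng.randn(100)
small = ols('y ~ x1', data=df).fit()
big = ols('y ~ x1 + x2', data=df).fit()
print(sm.stats.anova_lm(small, big))  # one row per model: ssr, df_diff, F, Pr(>F)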
Example #52
0
    def handle_missing(cls, endog, exog, missing, **kwargs):
        """
        This returns a dictionary with keys endog, exog and the keys of
        kwargs. It preserves Nones.
        """
        none_array_names = []

        # patsy's already dropped NaNs in y/X
        missing_idx = kwargs.pop('missing_idx', None)

        if missing_idx is not None:
            # y, X already handled by patsy. add back in later.
            combined = ()
            combined_names = []
            if exog is None:
                none_array_names += ['exog']
        elif exog is not None:
            combined = (endog, exog)
            combined_names = ['endog', 'exog']
        else:
            combined = (endog,)
            combined_names = ['endog']
            none_array_names += ['exog']

        # deal with other arrays
        combined_2d = ()
        combined_2d_names = []
        if len(kwargs):
            for key, value_array in iteritems(kwargs):
                if value_array is None or value_array.ndim == 0:
                    none_array_names += [key]
                    continue
                # grab 1d arrays
                if value_array.ndim == 1:
                    combined += (np.asarray(value_array),)
                    combined_names += [key]
                elif value_array.squeeze().ndim == 1:
                    combined += (np.asarray(value_array),)
                    combined_names += [key]

                # grab 2d arrays that are _assumed_ to be symmetric
                elif value_array.ndim == 2:
                    combined_2d += (np.asarray(value_array),)
                    combined_2d_names += [key]
                else:
                    raise ValueError("Arrays with more than 2 dimensions "
                                     "aren't yet handled")

        if missing_idx is not None:
            nan_mask = missing_idx
            updated_row_mask = None
            if combined:  # there were extra arrays not handled by patsy
                combined_nans = _nan_rows(*combined)
                if combined_nans.shape[0] != nan_mask.shape[0]:
                    raise ValueError("Shape mismatch between endog/exog "
                                     "and extra arrays given to model.")
                # for going back and updating endog/exog
                updated_row_mask = combined_nans[~nan_mask]
                nan_mask |= combined_nans  # for updating extra arrays only
            if combined_2d:
                combined_2d_nans = _nan_rows(*combined_2d)
                if combined_2d_nans.shape[0] != nan_mask.shape[0]:
                    raise ValueError("Shape mismatch between endog/exog "
                                     "and extra 2d arrays given to model.")
                if updated_row_mask is not None:
                    updated_row_mask |= combined_2d_nans[~nan_mask]
                else:
                    updated_row_mask = combined_2d_nans[~nan_mask]
                nan_mask |= combined_2d_nans

        else:
            nan_mask = _nan_rows(*combined)
            if combined_2d:
                nan_mask = _nan_rows(*(nan_mask[:, None],) + combined_2d)

        if not np.any(nan_mask):  # no missing don't do anything
            combined = dict(zip(combined_names, combined))
            if combined_2d:
                combined.update(dict(zip(combined_2d_names, combined_2d)))
            if none_array_names:
                combined.update(dict(zip(none_array_names,
                                         [None] * len(none_array_names))))

            if missing_idx is not None:
                combined.update({'endog': endog})
                if exog is not None:
                    combined.update({'exog': exog})

            return combined, []

        elif missing == 'raise':
            raise MissingDataError("NaNs were encountered in the data")

        elif missing == 'drop':
            nan_mask = ~nan_mask
            drop_nans = lambda x: cls._drop_nans(x, nan_mask)
            drop_nans_2d = lambda x: cls._drop_nans_2d(x, nan_mask)
            combined = dict(zip(combined_names, lmap(drop_nans, combined)))

            if missing_idx is not None:
                if updated_row_mask is not None:
                    updated_row_mask = ~updated_row_mask
                    # update endog/exog with this new information
                    endog = cls._drop_nans(endog, updated_row_mask)
                    if exog is not None:
                        exog = cls._drop_nans(exog, updated_row_mask)

                combined.update({'endog': endog})
                if exog is not None:
                    combined.update({'exog': exog})

            if combined_2d:
                combined.update(dict(zip(combined_2d_names,
                                         lmap(drop_nans_2d, combined_2d))))
            if none_array_names:
                combined.update(dict(zip(none_array_names,
                                         [None] * len(none_array_names))))

            return combined, np.where(~nan_mask)[0].tolist()
        else:
            raise ValueError("missing option %s not understood" % missing)
Example #53
0
    def handle_missing(cls, endog, exog, missing, **kwargs):
        """
        This returns a dictionary with keys endog, exog and the keys of
        kwargs. It preserves Nones.
        """
        none_array_names = []

        # patsy's already dropped NaNs in y/X
        missing_idx = kwargs.pop('missing_idx', None)

        if missing_idx is not None:
            # y, X already handled by patsy. add back in later.
            combined = ()
            combined_names = []
            if exog is None:
                none_array_names += ['exog']
        elif exog is not None:
            combined = (endog, exog)
            combined_names = ['endog', 'exog']
        else:
            combined = (endog, )
            combined_names = ['endog']
            none_array_names += ['exog']

        # deal with other arrays
        combined_2d = ()
        combined_2d_names = []
        if len(kwargs):
            for key, value_array in iteritems(kwargs):
                if value_array is None or value_array.ndim == 0:
                    none_array_names += [key]
                    continue
                # grab 1d arrays
                if value_array.ndim == 1:
                    combined += (np.asarray(value_array), )
                    combined_names += [key]
                elif value_array.squeeze().ndim == 1:
                    combined += (np.asarray(value_array), )
                    combined_names += [key]

                # grab 2d arrays that are _assumed_ to be symmetric
                elif value_array.ndim == 2:
                    combined_2d += (np.asarray(value_array), )
                    combined_2d_names += [key]
                else:
                    raise ValueError("Arrays with more than 2 dimensions "
                                     "aren't yet handled")

        if missing_idx is not None:
            nan_mask = missing_idx
            updated_row_mask = None
            if combined:  # there were extra arrays not handled by patsy
                combined_nans = _nan_rows(*combined)
                if combined_nans.shape[0] != nan_mask.shape[0]:
                    raise ValueError("Shape mismatch between endog/exog "
                                     "and extra arrays given to model.")
                # for going back and updating endog/exog
                updated_row_mask = combined_nans[~nan_mask]
                nan_mask |= combined_nans  # for updating extra arrays only
            if combined_2d:
                combined_2d_nans = _nan_rows(*combined_2d)
                if combined_2d_nans.shape[0] != nan_mask.shape[0]:
                    raise ValueError("Shape mismatch between endog/exog "
                                     "and extra 2d arrays given to model.")
                if updated_row_mask is not None:
                    updated_row_mask |= combined_2d_nans[~nan_mask]
                else:
                    updated_row_mask = combined_2d_nans[~nan_mask]
                nan_mask |= combined_2d_nans

        else:
            nan_mask = _nan_rows(*combined)
            if combined_2d:
                nan_mask = _nan_rows(*(nan_mask[:, None], ) + combined_2d)

        if not np.any(nan_mask):  # no missing don't do anything
            combined = dict(zip(combined_names, combined))
            if combined_2d:
                combined.update(dict(zip(combined_2d_names, combined_2d)))
            if none_array_names:
                combined.update(
                    dict(zip(none_array_names,
                             [None] * len(none_array_names))))

            if missing_idx is not None:
                combined.update({'endog': endog})
                if exog is not None:
                    combined.update({'exog': exog})

            return combined, []

        elif missing == 'raise':
            raise MissingDataError("NaNs were encountered in the data")

        elif missing == 'drop':
            nan_mask = ~nan_mask
            drop_nans = lambda x: cls._drop_nans(x, nan_mask)
            drop_nans_2d = lambda x: cls._drop_nans_2d(x, nan_mask)
            combined = dict(zip(combined_names, lmap(drop_nans, combined)))

            if missing_idx is not None:
                if updated_row_mask is not None:
                    updated_row_mask = ~updated_row_mask
                    # update endog/exog with this new information
                    endog = cls._drop_nans(endog, updated_row_mask)
                    if exog is not None:
                        exog = cls._drop_nans(exog, updated_row_mask)

                combined.update({'endog': endog})
                if exog is not None:
                    combined.update({'exog': exog})

            if combined_2d:
                combined.update(
                    dict(
                        zip(combined_2d_names, lmap(drop_nans_2d,
                                                    combined_2d))))
            if none_array_names:
                combined.update(
                    dict(zip(none_array_names,
                             [None] * len(none_array_names))))

            return combined, np.where(~nan_mask)[0].tolist()
        else:
            raise ValueError("missing option %s not understood" % missing)
Example #54
0
def categorical(data, col=None, dictnames=False, drop=False):
    '''
    Returns a dummy matrix given an array of categorical variables.

    Parameters
    ----------
    data : array
        A structured array, recarray, array, Series or DataFrame.  This can be
        either a 1d vector of the categorical variable or a 2d array with
        the column specifying the categorical variable specified by the col
        argument.
    col : {str, int, None}
        If data is a DataFrame, col must be a column of data. If data is a
        Series, col must be either the name of the Series or None. If data is a
        structured array or a recarray, `col` can be a string that is the name
        of the column that contains the variable.  For all other
        arrays `col` can be an int that is the (zero-based) column index
        number.  `col` can only be None for a 1d array.  The default is None.
    dictnames : bool, optional
        If True, a dictionary mapping the column number to the categorical
        name is returned.  Used to have information about plain arrays.
    drop : bool
        Whether or not to keep the categorical variable in the returned matrix.

    Returns
    -------
    dummy_matrix, [dictnames, optional]
        A matrix of dummy (indicator/binary) float variables for the
        categorical data.  If dictnames is True, then the dictionary
        is returned as well.

    Notes
    -----
    This returns a dummy variable for EVERY distinct category.  If a
    structured array or recarray is provided, the name of each new variable
    is the old variable name, an underscore, and the category name.  So if
    the variable 'vote' had answers 'yes' or 'no', the returned array would
    have two new variables, 'vote_yes' and 'vote_no'.  There is currently
    no name checking.

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm

    Univariate examples

    >>> import string
    >>> string_var = [string.ascii_lowercase[0:5], \
                      string.ascii_lowercase[5:10], \
                      string.ascii_lowercase[10:15], \
                      string.ascii_lowercase[15:20],   \
                      string.ascii_lowercase[20:25]]
    >>> string_var *= 5
    >>> string_var = np.asarray(sorted(string_var))
    >>> design = sm.tools.categorical(string_var, drop=True)

    Or for a numerical categorical variable

    >>> instr = np.floor(np.arange(10,60, step=2)/10)
    >>> design = sm.tools.categorical(instr, drop=True)

    With a structured array

    >>> num = np.random.randn(25,2)
    >>> struct_ar = np.zeros((25,1), dtype=[('var1', 'f4'),('var2', 'f4'),  \
                    ('instrument','f4'),('str_instr','a5')])
    >>> struct_ar['var1'] = num[:,0][:,None]
    >>> struct_ar['var2'] = num[:,1][:,None]
    >>> struct_ar['instrument'] = instr[:,None]
    >>> struct_ar['str_instr'] = string_var[:,None]
    >>> design = sm.tools.categorical(struct_ar, col='instrument', drop=True)

    Or

    >>> design2 = sm.tools.categorical(struct_ar, col='str_instr', drop=True)
    '''
    # TODO: add a NameValidator function
    if isinstance(col, (list, tuple)):
        if len(col) == 1:
            col = col[0]
        else:
            raise ValueError("Can only convert one column at a time")
    if (not isinstance(data, (pd.DataFrame, pd.Series)) and
            not isinstance(col, (string_types, int)) and
            col is not None):
        raise TypeError('col must be a str, int or None')

    # Pull out a Series from a DataFrame if provided
    if isinstance(data, pd.DataFrame):
        if col is None:
            raise TypeError('col must be a str or int when using a DataFrame')
        elif col not in data:
            raise ValueError('Column \'{0}\' not found in data'.format(col))
        data = data[col]
        # Set col to None since we now have a Series
        col = None

    if isinstance(data, pd.Series):
        if col is not None and data.name != col:
            raise ValueError('data.name does not match col '
                             '\'{0}\''.format(col))
        data_cat = data.astype('category')
        dummies = pd.get_dummies(data_cat)
        col_map = {i: cat for i, cat in enumerate(data_cat.cat.categories) if
                   cat in dummies}
        if not drop:
            dummies.columns = list(dummies.columns)
            dummies = pd.concat([dummies, data], axis=1)
        if dictnames:
            return dummies, col_map
        return dummies
    # catch recarrays and structured arrays
    elif data.dtype.names or data.__class__ is np.recarray:
        if not col and np.squeeze(data).ndim > 1:
            raise IndexError("col is None and the input array is not 1d")
        if isinstance(col, (int, long)):
            col = data.dtype.names[col]
        if col is None and data.dtype.names and len(data.dtype.names) == 1:
            col = data.dtype.names[0]

        tmp_arr = np.unique(data[col])

        # if the cols are shape (#,) vs (#,1) need to add an axis and flip
        _swap = True
        if data[col].ndim == 1:
            tmp_arr = tmp_arr[:, None]
            _swap = False
        tmp_dummy = (tmp_arr == data[col]).astype(float)
        if _swap:
            tmp_dummy = np.squeeze(tmp_dummy).swapaxes(1, 0)

        if not tmp_arr.dtype.names:  # how do we get to this code path?
            tmp_arr = [asstr2(item) for item in np.squeeze(tmp_arr)]
        elif tmp_arr.dtype.names:
            tmp_arr = [asstr2(item) for item in np.squeeze(tmp_arr.tolist())]

        # prepend the varname and underscore, if col is numeric attribute
        # lookup is lost for recarrays...
        if col is None:
            try:
                col = data.dtype.names[0]
            except:
                col = 'var'
        # TODO: the above needs to be made robust because there could be many
        # var_yes, var_no variables, for instance.
        tmp_arr = [col + '_' + item for item in tmp_arr]
        # TODO: test this for rec and structured arrays!!!

        if drop is True:
            if len(data.dtype) <= 1:
                if tmp_dummy.shape[0] < tmp_dummy.shape[1]:
                    tmp_dummy = np.squeeze(tmp_dummy).swapaxes(1, 0)
                dt = lzip(tmp_arr, [tmp_dummy.dtype.str]*len(tmp_arr))
                # preserve array type
                return np.array(lmap(tuple, tmp_dummy.tolist()),
                                dtype=dt).view(type(data))

            data = nprf.drop_fields(data, col, usemask=False,
                                    asrecarray=type(data) is np.recarray)
        data = nprf.append_fields(data, tmp_arr, data=tmp_dummy,
                                  usemask=False,
                                  asrecarray=type(data) is np.recarray)
        return data

    # Catch array-like for an error
    elif not isinstance(data, np.ndarray):
        raise NotImplementedError("Array-like objects are not supported")
    else:
        if isinstance(col, (int, long)):
            offset = data.shape[1]          # need error catching here?
            tmp_arr = np.unique(data[:, col])
            tmp_dummy = (tmp_arr[:, np.newaxis] == data[:, col]).astype(float)
            tmp_dummy = tmp_dummy.swapaxes(1, 0)
            if drop is True:
                offset -= 1
                data = np.delete(data, col, axis=1).astype(float)
            data = np.column_stack((data, tmp_dummy))
            if dictnames is True:
                col_map = _make_dictnames(tmp_arr, offset)
                return data, col_map
            return data
        elif col is None and np.squeeze(data).ndim == 1:
            tmp_arr = np.unique(data)
            tmp_dummy = (tmp_arr[:, None] == data).astype(float)
            tmp_dummy = tmp_dummy.swapaxes(1, 0)
            if drop is True:
                if dictnames is True:
                    col_map = _make_dictnames(tmp_arr)
                    return tmp_dummy, col_map
                return tmp_dummy
            else:
                data = np.column_stack((data, tmp_dummy))
                if dictnames is True:
                    col_map = _make_dictnames(tmp_arr, offset=1)
                    return data, col_map
                return data
        else:
            raise IndexError("The index %s is not understood" % col)
Example #55
0
def lstsq(a, b, cond=None, overwrite_a=0, overwrite_b=0):
    """Compute least-squares solution to equation :m:`a x = b`

    Compute a vector x such that the 2-norm :m:`|b - a x|` is minimised.

    Parameters
    ----------
    a : array, shape (M, N)
    b : array, shape (M,) or (M, K)
    cond : float
        Cutoff for 'small' singular values; used to determine effective
        rank of a. Singular values smaller than cond * largest_singular_value
        are considered zero.
    overwrite_a : boolean
        Discard data in a (may enhance performance)
    overwrite_b : boolean
        Discard data in b (may enhance performance)

    Returns
    -------
    x : array, shape (N,) or (N, K) depending on shape of b
        Least-squares solution
    residues : array, shape () or (1,) or (K,)
        Sums of residues, squared 2-norm for each column in :m:`b - a x`
        If the rank of matrix a is < N or N > M, this is an empty array.
        If b was 1-d, this is a (1,) shaped array, otherwise the shape is (K,).
    rank : integer
        Effective rank of matrix a
    s : array, shape (min(M,N),)
        Singular values of a. The condition number of a is abs(s[0]/s[-1]).

    Raises LinAlgError if computation does not converge

    """
    a1, b1 = lmap(asarray_chkfinite, (a, b))
    if a1.ndim != 2:
        raise ValueError('expected matrix')
    m, n = a1.shape
    if b1.ndim == 2:
        nrhs = b1.shape[1]
    else:
        nrhs = 1
    if m != b1.shape[0]:
        raise ValueError('incompatible dimensions')
    gelss, = get_lapack_funcs(('gelss', ), (a1, b1))
    if n > m:
        # need to extend b matrix as it will be filled with
        # a larger solution matrix
        b2 = zeros((n, nrhs), dtype=gelss.dtype)
        if b1.ndim == 2:
            b2[:m, :] = b1
        else:
            b2[:m, 0] = b1
        b1 = b2
    overwrite_a = overwrite_a or (a1 is not a and not hasattr(a, '__array__'))
    overwrite_b = overwrite_b or (b1 is not b and not hasattr(b, '__array__'))

    if gelss.module_name[:7] == 'flapack':

        # get optimal work array
        work = gelss(a1, b1, lwork=-1)[4]
        lwork = int(work[0].real)  # plain int; np.int is removed in modern numpy
        v, x, s, rank, work, info = gelss(a1,
                                          b1,
                                          cond=cond,
                                          lwork=lwork,
                                          overwrite_a=overwrite_a,
                                          overwrite_b=overwrite_b)

    else:
        raise NotImplementedError('calling gelss from %s' % gelss.module_name)
    if info > 0:
        raise LinAlgError("SVD did not converge in Linear Least Squares")
    if info < 0:
        raise ValueError('illegal value in %d-th argument of '
                         'internal gelss' % -info)
    resids = asarray([], dtype=x.dtype)
    if n < m:
        x1 = x[:n]
        if rank == n:
            resids = np.sum(x[n:]**2, axis=0)
        x = x1
    return x, resids, rank, s
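# Usage sketch for the wrapper above: an overdetermined system built from a
# known solution, so the residuals are (numerically) zero and rank equals 2.
import numpy as np
a = np.array([[1., 0.], [0., 1.], [1., 1.]])
b = a.dot(np.array([2., -1.]))
x, resids, rank, s = lstsq(a, b)
print(x)              # ~[ 2., -1.]
print(rank, resids)   # 2, residual sum ~0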
Example #56
0
    print(shannonentropy(Y))

    p = [1e-5,1e-4,.001,.01,.1,.15,.2,.25,.3,.35,.4,.45,.5]

    plt.subplot(111)
    plt.ylabel("Information")
    plt.xlabel("Probability")
    x = np.linspace(0,1,100001)
    plt.plot(x, shannoninfo(x))
#    plt.show()

    plt.figure()  # new figure; otherwise subplot(111) reuses the axes above
    plt.subplot(111)
    plt.ylabel("Entropy")
    plt.xlabel("Probability")
    x = np.linspace(0,1,101)
    plt.plot(x, lmap(shannonentropy, lzip(x,1-x)))
#    plt.show()

    # define a joint probability distribution
    # from Golan (2008) table 3.3
    w = np.array([[0,0,1./3],[1/9.,1/9.,1/9.],[1/18.,1/9.,1/6.]])
    # table 3.4
    px = w.sum(0)
    py = w.sum(1)
    H_X = shannonentropy(px)
    H_Y = shannonentropy(py)
    H_XY = shannonentropy(w)
    H_XgivenY = condentropy(px,py,w)
    H_YgivenX = condentropy(py,px,w)
# note that relative entropy (KL divergence, which stats.entropy computes
# here) is not symmetric and hence not a distance measure, as the following shows
    D_YX = logbasechange(2,np.e)*stats.entropy(px, py)
Example #57
0
import statsmodels.api as sm
from statsmodels.sandbox.sysreg import *

#for Python 3 compatibility

# Seemingly Unrelated Regressions (SUR) Model

# This example uses the subset of the Grunfeld data in Greene's Econometric
# Analysis Chapter 14 (5th Edition)

grun_data = sm.datasets.grunfeld.load()

firms = ['General Motors', 'Chrysler', 'General Electric', 'Westinghouse',
        'US Steel']
#for Python 3 compatibility
firms = lmap(asbytes, firms)

grun_exog = grun_data.exog
grun_endog = grun_data.endog

# Right now SUR takes a list of arrays
# The list alternates between the LHS and the RHS of each equation
# This is very likely to change
grun_sys = []
for i in firms:
    index = grun_exog['firm'] == i
    grun_sys.append(grun_endog[index])
    exog = grun_exog[index][['value','capital']].view(float).reshape(-1,2)
    exog = sm.add_constant(exog, prepend=True)
    grun_sys.append(exog)
Example #58
0
    ## Try with a pandas series
    import pandas
    import scikits.timeseries as ts
    d1 = ts.Date(year=1700, freq='A')
    #NOTE: have to have yearBegin offset for annual data until parser rewrite
    #should this be up to the user, or should it be done in TSM init?
    #NOTE: not anymore, it's end of year now
    ts_dr = ts.date_array(start_date=d1, length=len(sunspots.endog))
    pandas_dr = pandas.DateRange(start=d1.datetime,
                                 periods=len(sunspots.endog), timeRule='A@DEC')
    #pandas_dr = pandas_dr.shift(-1, pandas.datetools.yearBegin)

    dates = np.arange(1700, 1700 + len(sunspots.endog))
    dates = ts.date_array(dates, freq='A')
    #sunspots = pandas.Series(sunspots.endog, index=dates)

    #NOTE: it looks like pandas only does business days for dates
    import datetime
    dt_dates = np.asarray(lmap(datetime.datetime.fromordinal,
                              ts_dr.toordinal().astype(int)))
    sunspots = pandas.Series(sunspots.endog, index=dt_dates)

    #NOTE: pandas can't handle pre-1900 dates
    mod = AR(sunspots, freq='A')
    res = mod.fit(method='mle', maxlag=9)

# some data for an example in Box Jenkins
    IBM = np.asarray([460, 457, 452, 459, 462, 459, 463, 479, 493, 490.])
    w = np.diff(IBM)
    theta = .5
Example #59
0
def fit_fr(self, data, *args, **kwds):
    '''estimate distribution parameters by MLE taking some parameters as fixed

    Parameters
    ----------
    data : array, 1d
        data for which the distribution parameters are estimated,
    args : list ? check
        starting values for optimization
    kwds :

      - 'frozen' : array_like
           values for frozen distribution parameters and, for elements with
           np.nan, the corresponding parameter will be estimated

    Returns
    -------
    argest : array
        estimated parameters


    Examples
    --------
    generate random sample
    >>> np.random.seed(12345)
    >>> x = stats.gamma.rvs(2.5, loc=0, scale=1.2, size=200)

    estimate all parameters
    >>> stats.gamma.fit(x)
    array([ 2.0243194 ,  0.20395655,  1.44411371])
    >>> stats.gamma.fit_fr(x, frozen=[np.nan, np.nan, np.nan])
    array([ 2.0243194 ,  0.20395655,  1.44411371])

    keep loc fixed, estimate shape and scale parameters
    >>> stats.gamma.fit_fr(x, frozen=[np.nan, 0.0, np.nan])
    array([ 2.45603985,  1.27333105])

    keep loc and scale fixed, estimate shape parameter
    >>> stats.gamma.fit_fr(x, frozen=[np.nan, 0.0, 1.0])
    array([ 3.00048828])
    >>> stats.gamma.fit_fr(x, frozen=[np.nan, 0.0, 1.2])
    array([ 2.57792969])

    estimate only scale parameter for fixed shape and loc
    >>> stats.gamma.fit_fr(x, frozen=[2.5, 0.0, np.nan])
    array([ 1.25087891])

    Notes
    -----
    self is an instance of a distribution class. This can be attached to
    scipy.stats.distributions.rv_continuous

    *Todo*

    * check if docstring is correct
    * more input checking, args is list ? might also apply to current fit method

    '''
    loc0, scale0 = lmap(kwds.get, ['loc', 'scale'], [0.0, 1.0])
    Narg = len(args)

    if Narg == 0 and hasattr(self, '_fitstart'):
        x0 = self._fitstart(data)
    elif Narg > self.numargs:
        raise ValueError("Too many input arguments.")
    else:
        args += (1.0,)*(self.numargs-Narg)
        # location and scale are at the end
        x0 = args + (loc0, scale0)

    if 'frozen' in kwds:
        frmask = np.array(kwds['frozen'])
        if len(frmask) != self.numargs+2:
            raise ValueError("Incorrect number of frozen arguments.")
        else:
            # keep starting values for not frozen parameters
            x0  = np.array(x0)[np.isnan(frmask)]
    else:
        frmask = None

    # print(x0)
    # print(frmask)
    return optimize.fmin(self.nnlf_fr, x0,
                args=(np.ravel(data), frmask), disp=0)
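# The doctest calls above (stats.gamma.fit_fr(...)) assume this function has
# been attached to the distribution framework first; a minimal sketch of
# doing so (nnlf_fr, the matching fixed-parameter objective, would need to
# be attached the same way):
from scipy import stats
stats.distributions.rv_continuous.fit_fr = fit_fr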