def select_order(self, maxlags=None, verbose=True): """ Compute lag order selections based on each of the available information criteria Parameters ---------- maxlags : int if None, defaults to 12 * (nobs/100.)**(1./4) verbose : bool, default True If True, print table of info criteria and selected orders Returns ------- selections : dict {info_crit -> selected_order} """ if maxlags is None: maxlags = int(round(12*(len(self.endog)/100.)**(1/4.))) ics = defaultdict(list) for p in range(maxlags + 1): # exclude some periods to same amount of data used for each lag # order result = self._estimate_var(p, offset=maxlags-p) for k, v in iteritems(result.info_criteria): ics[k].append(v) selected_orders = dict((k, mat(v).argmin()) for k, v in iteritems(ics)) if verbose: output.print_ic_table(ics, selected_orders) return selected_orders
def get_colwidths(self, output_format, **fmt_dict): """Return list, the widths of each column.""" call_args = [output_format] for k, v in sorted(iteritems(fmt_dict)): if isinstance(v, list): call_args.append((k, tuple(v))) elif isinstance(v, dict): call_args.append((k, tuple(sorted(iteritems(v))))) else: call_args.append((k, v)) key = tuple(call_args) try: return self._colwidths[key] except KeyError: self._colwidths[key] = self._get_colwidths(output_format, **fmt_dict) return self._colwidths[key]
def handle_missing(cls, endog, exog, missing, **kwargs): """ This returns a dictionary with keys endog, exog and the keys of kwargs. It preserves Nones. """ none_array_names = [] if exog is not None: combined = (endog, exog) combined_names = ['endog', 'exog'] else: combined = (endog,) combined_names = ['endog'] none_array_names += ['exog'] # deal with other arrays combined_2d = () combined_2d_names = [] if len(kwargs): for key, value_array in iteritems(kwargs): if value_array is None or value_array.ndim == 0: none_array_names += [key] continue # grab 1d arrays if value_array.ndim == 1: combined += (value_array,) combined_names += [key] elif value_array.squeeze().ndim == 1: combined += (value_array,) combined_names += [key] # grab 2d arrays that are _assumed_ to be symmetric elif value_array.ndim == 2: combined_2d += (value_array,) combined_2d_names += [key] else: raise ValueError("Arrays with more than 2 dimensions " "aren't yet handled") nan_mask = _nan_rows(*combined) if combined_2d: nan_mask = _nan_rows(*(nan_mask[:, None],) + combined_2d) if missing == 'raise' and np.any(nan_mask): raise MissingDataError("NaNs were encountered in the data") elif missing == 'drop': nan_mask = ~nan_mask drop_nans = lambda x: cls._drop_nans(x, nan_mask) drop_nans_2d = lambda x: cls._drop_nans_2d(x, nan_mask) combined = dict(zip(combined_names, lmap(drop_nans, combined))) if combined_2d: combined.update(dict(zip(combined_2d_names, lmap(drop_nans_2d, combined_2d)))) if none_array_names: combined.update(dict(zip(none_array_names, [None] * len(none_array_names)))) return combined, np.where(~nan_mask)[0].tolist() else: raise ValueError("missing option %s not understood" % missing)
def populate_wrapper(klass, wrapping): for meth, how in iteritems(klass._wrap_methods): if not hasattr(wrapping, meth): continue func = getattr(wrapping, meth) wrapper = make_wrapper(func, how) setattr(klass, meth, wrapper)
def _reduce_dict(count_dict, partial_key): """ Make partial sum on a counter dict. Given a match for the beginning of the category, it will sum each value. """ L = len(partial_key) count = sum(v for k, v in iteritems(count_dict) if k[:L] == partial_key) return count
def _normalize_data(data, index): """normalize the data to a dict with tuples of strings as keys right now it works with: 0 - dictionary (or equivalent mappable) 1 - pandas.Series with simple or hierarchical indexes 2 - numpy.ndarrays 3 - everything that can be converted to a numpy array 4 - pandas.DataFrame (via the _normalize_dataframe function) """ # if data is a dataframe we need to take a completely new road # before coming back here. Use the hasattr to avoid importing # pandas explicitly if hasattr(data, 'pivot') and hasattr(data, 'groupby'): data = _normalize_dataframe(data, index) index = None # can it be used as a dictionary? try: items = list(iteritems(data)) except AttributeError: # ok, I cannot use the data as a dictionary # Try to convert it to a numpy array, or die trying data = np.asarray(data) temp = OrderedDict() for idx in np.ndindex(data.shape): name = tuple(i for i in idx) temp[name] = data[idx] data = temp items = list(iteritems(data)) # make all the keys a tuple, even if simple numbers data = OrderedDict([_tuplify(k), v] for k, v in items) categories_levels = _categories_level(list(iterkeys(data))) # fill the void in the counting dictionary indexes = product(*categories_levels) contingency = OrderedDict([(k, data.get(k, 0)) for k in indexes]) data = contingency # reorder the keys order according to the one specified by the user # or if the index is None convert it into a simple list # right now it doesn't do any check, but can be modified in the future index = lrange(len(categories_levels)) if index is None else index contingency = OrderedDict() for key, value in iteritems(data): new_key = tuple(key[i] for i in index) contingency[new_key] = value data = contingency return data
def equations(self): eqs = {} for col, ts in iteritems(self.y): model = _window_ols(y=ts, x=self.x, window=self._window, window_type=self._window_type, min_periods=self._min_periods) eqs[col] = model return eqs
def get_lag(mod, endog, exog, start_lag, max_lag, method, model_args=()): results = {} # dict method = method.lower() for lag in range(start_lag, start_lag + max_lag + 1): results[lag] = mod(endog, exog[:, :lag], *model_args).fit() if method == "aic": best_inf_crit, best_lag = min( (v.aic, k) for k, v in iteritems(results)) # перебор по значениям из results return best_inf_crit, best_lag
def test_get_trendorder(): results = { 'c' : 1, 'nc' : 0, 'ct' : 2, 'ctt' : 3 } for t, trendorder in iteritems(results): assert(util.get_trendorder(t) == trendorder)
def select_order(self, maxlag, ic, trend='c', method='mle'): """ Select the lag order according to the information criterion. Parameters ---------- maxlag : int The highest lag length tried. See `AR.fit`. ic : str {'aic','bic','hqic','t-stat'} Criterion used for selecting the optimal lag length. See `AR.fit`. trend : str {'c','nc'} Whether to include a constant or not. 'c' - include constant. 'nc' - no constant. Returns ------- bestlag : int Best lag according to IC. """ endog = self.endog # make Y and X with same nobs to compare ICs Y = endog[maxlag:] self.Y = Y # attach to get correct fit stats X = self._stackX(maxlag, trend) # sets k_trend self.X = X k = self.k_trend # k_trend set in _stackX k = max(1, k) # handle if startlag is 0 results = {} if ic != 't-stat': for lag in range(k, maxlag+1): # have to reinstantiate the model to keep comparable models endog_tmp = endog[maxlag-lag:] fit = AR(endog_tmp).fit(maxlag=lag, method=method, full_output=0, trend=trend, maxiter=100, disp=0) results[lag] = eval('fit.'+ic) bestic, bestlag = min((res, k) for k, res in iteritems(results)) else: # choose by last t-stat. stop = 1.6448536269514722 # for t-stat, norm.ppf(.95) for lag in range(maxlag, k - 1, -1): # have to reinstantiate the model to keep comparable models endog_tmp = endog[maxlag - lag:] fit = AR(endog_tmp).fit(maxlag=lag, method=method, full_output=0, trend=trend, maxiter=35, disp=-1) bestlag = 0 if np.abs(fit.tvalues[-1]) >= stop: bestlag = lag break return bestlag
def equations(self): eqs = {} for col, ts in iteritems(self.y): # TODO: Remove in favor of statsmodels implemetation model = pd.ols(y=ts, x=self.x, window=self._window, window_type=self._window_type, min_periods=self._min_periods) eqs[col] = model return eqs
def _statistical_coloring(data): """evaluate colors from the indipendence properties of the matrix It will encounter problem if one category has all zeros """ data = _normalize_data(data, None) categories_levels = _categories_level(list(iterkeys(data))) Nlevels = len(categories_levels) total = 1.0 * sum(v for v in itervalues(data)) # count the proportion of observation # for each level that has the given name # at each level levels_count = [] for level_idx in range(Nlevels): proportion = {} for level in categories_levels[level_idx]: proportion[level] = 0.0 for key, value in iteritems(data): if level == key[level_idx]: proportion[level] += value proportion[level] /= total levels_count.append(proportion) # for each key I obtain the expected value # and it's standard deviation from a binomial distribution # under the hipothesys of independence expected = {} for key, value in iteritems(data): base = 1.0 for i, k in enumerate(key): base *= levels_count[i][k] expected[key] = base * total, np.sqrt(total * base * (1.0 - base)) # now we have the standard deviation of distance from the # expected value for each tile. We create the colors from this sigmas = dict((k, (data[k] - m) / s) for k, (m, s) in iteritems(expected)) props = {} for key, dev in iteritems(sigmas): red = 0.0 if dev < 0 else (dev / (1 + dev)) blue = 0.0 if dev > 0 else (dev / (-1 + dev)) green = (1.0 - red - blue) / 2.0 hatch = 'x' if dev > 2 else 'o' if dev < -2 else '' props[key] = {'color': [red, green, blue], 'hatch': hatch} return props
def coefs(self): """ Return dynamic regression coefficients as Panel """ data = {} for eq, result in iteritems(self.equations): data[eq] = result.beta panel = pd.Panel.fromDict(data) # Coefficient names become items return panel.swapaxes('items', 'minor')
def maxlag_selection(self): ''' 最大滞后阶数选择 :return: 不同准则下最优滞后阶数 ''' self.data_prepro() Y, Y_future, Y_exog, Y_exog_future = self.prepare_Y(0, -10) model = VAR(endog=Y, exog=Y_exog) ics = defaultdict(list) for p in range(self.maxlags + 1): result = model._estimate_var(p, offset=self.maxlags - p) for k, v in iteritems(result.info_criteria): ics[k].append(v) selected_orders = dict( (k, np.array(v).argmin()) for k, v in iteritems(ics)) t_str = get_ic_table(ics, selected_orders) t_str += '\n%s' % str(selected_orders) self.model = model self.InfoText.delete(0.0, tkinter.END) self.InfoText.insert(0.0, t_str)
def test_mosaic_very_complex(): # make a scattermatrix of mosaic plots to show the correlations between # each pair of variable in a dataset. Could be easily converted into a # new function that does this automatically based on the type of data key_name = ['gender', 'age', 'health', 'work'] key_base = (['male', 'female'], ['old', 'young'], ['healty', 'ill'], ['work', 'unemployed']) keys = list(product(*key_base)) data = OrderedDict(zip(keys, range(1, 1 + len(keys)))) props = {} props[('male', 'old')] = {'color': 'r'} props[('female', )] = {'color': 'pink'} L = len(key_base) fig, axes = pylab.subplots(L, L) for i in range(L): for j in range(L): m = set(range(L)).difference(set((i, j))) if i == j: axes[i, i].text(0.5, 0.5, key_name[i], ha='center', va='center') axes[i, i].set_xticks([]) axes[i, i].set_xticklabels([]) axes[i, i].set_yticks([]) axes[i, i].set_yticklabels([]) else: ji = max(i, j) ij = min(i, j) temp_data = OrderedDict([((k[ij], k[ji]) + tuple(k[r] for r in m), v) for k, v in iteritems(data)]) keys = list(iterkeys(temp_data)) for k in keys: value = _reduce_dict(temp_data, k[:2]) temp_data[k[:2]] = value del temp_data[k] mosaic(temp_data, ax=axes[i, j], axes_label=False, properties=props, gap=0.05, horizontal=i > j) pylab.suptitle('old males should look bright red, (plot 4 of 4)') #pylab.show() pylab.close('all')
def summary_model(results): """ Create a dict with information about the model """ def time_now(*args, **kwds): now = datetime.datetime.now() return now.strftime('%Y-%m-%d %H:%M') info = {} info['Model:'] = lambda x: x.model.__class__.__name__ info['Model Family:'] = lambda x: x.family.__class.__name__ info['Link Function:'] = lambda x: x.family.link.__class__.__name__ info['Dependent Variable:'] = lambda x: x.model.endog_names info['Date:'] = time_now info['No. Observations:'] = lambda x: "%#6d" % x.nobs info['Df Model:'] = lambda x: "%#6d" % x.df_model info['Df Residuals:'] = lambda x: "%#6d" % x.df_resid info['Converged:'] = lambda x: x.mle_retvals['converged'] info['No. Iterations:'] = lambda x: x.mle_retvals['iterations'] info['Method:'] = lambda x: x.method info['Norm:'] = lambda x: x.fit_options['norm'] info['Scale Est.:'] = lambda x: x.fit_options['scale_est'] info['Cov. Type:'] = lambda x: x.fit_options['cov'] rsquared_type = '' if results.k_constant else ' (uncentered)' info['R-squared' + rsquared_type + ':'] = lambda x: "%#8.3f" % x.rsquared info['Adj. R-squared' + rsquared_type + ':'] = lambda x: "%#8.3f" % x.rsquared_adj # noqa:E501 info['Pseudo R-squared:'] = lambda x: "%#8.3f" % x.prsquared info['AIC:'] = lambda x: "%8.4f" % x.aic info['BIC:'] = lambda x: "%8.4f" % x.bic info['Log-Likelihood:'] = lambda x: "%#8.5g" % x.llf info['LL-Null:'] = lambda x: "%#8.5g" % x.llnull info['LLR p-value:'] = lambda x: "%#8.5g" % x.llr_pvalue info['Deviance:'] = lambda x: "%#8.5g" % x.deviance info['Pearson chi2:'] = lambda x: "%#6.3g" % x.pearson_chi2 info['F-statistic:'] = lambda x: "%#8.4g" % x.fvalue info['Prob (F-statistic):'] = lambda x: "%#6.3g" % x.f_pvalue info['Scale:'] = lambda x: "%#8.5g" % x.scale out = {} for key, func in iteritems(info): try: out[key] = func(results) except (AttributeError, KeyError, NotImplementedError): # NOTE: some models do not have loglike defined (RLM), # so raise NotImplementedError pass return out
def test_adf_autolag(): #see issue #246 #this is mostly a unit test d2 = macrodata.load().data for k_trend, tr in enumerate(['nc', 'c', 'ct', 'ctt']): #[None:'nc', 0:'c', 1:'ct', 2:'ctt'] x = np.log(d2['realgdp']) xd = np.diff(x) #check exog adf3 = tsast.adfuller(x, maxlag=None, autolag='aic', regression=tr, store=True, regresults=True) st2 = adf3[-1] assert_equal(len(st2.autolag_results), 15 + 1) #+1 for lagged level for l, res in sorted(iteritems(st2.autolag_results))[:5]: lag = l - k_trend #assert correct design matrices in _autolag assert_equal(res.model.exog[-10:, k_trend], x[-11:-1]) assert_equal(res.model.exog[-1, k_trend + 1:], xd[-lag:-1][::-1]) #min-ic lag of dfgls in Stata is also 2, or 9 for maic with notrend assert_equal(st2.usedlag, 2) #same result with lag fixed at usedlag of autolag adf2 = tsast.adfuller(x, maxlag=2, autolag=None, regression=tr) assert_almost_equal(adf3[:2], adf2[:2], decimal=12) tr = 'c' #check maxlag with autolag adf3 = tsast.adfuller(x, maxlag=5, autolag='aic', regression=tr, store=True, regresults=True) assert_equal(len(adf3[-1].autolag_results), 5 + 1) adf3 = tsast.adfuller(x, maxlag=0, autolag='aic', regression=tr, store=True, regresults=True) assert_equal(len(adf3[-1].autolag_results), 0 + 1)
def summary_model(results): '''Create a dict with information about the model ''' def time_now(*args, **kwds): now = datetime.datetime.now() return now.strftime('%Y-%m-%d %H:%M') info = OrderedDict() info['Model:'] = lambda x: x.model.__class__.__name__ info['Model Family:'] = lambda x: x.family.__class.__name__ info['Link Function:'] = lambda x: x.family.link.__class__.__name__ info['Dependent Variable:'] = lambda x: x.model.endog_names info['Date:'] = time_now info['No. Observations:'] = lambda x: "%#6d" % x.nobs info['Df Model:'] = lambda x: "%#6d" % x.df_model info['Df Residuals:'] = lambda x: "%#6d" % x.df_resid info['Converged:'] = lambda x: x.mle_retvals['converged'] info['No. Iterations:'] = lambda x: x.mle_retvals['iterations'] info['Method:'] = lambda x: x.method info['Norm:'] = lambda x: x.fit_options['norm'] info['Scale Est.:'] = lambda x: x.fit_options['scale_est'] info['Cov. Type:'] = lambda x: x.fit_options['cov'] rsquared_type = '' if results.k_constant else ' (uncentered)' info['R-squared' + rsquared_type + ':'] = lambda x: "%#8.3f" % x.rsquared info['Adj. R-squared' + rsquared_type + ':'] = lambda x: "%#8.3f" % x.rsquared_adj info['Pseudo R-squared:'] = lambda x: "%#8.3f" % x.prsquared info['AIC:'] = lambda x: "%8.4f" % x.aic info['BIC:'] = lambda x: "%8.4f" % x.bic info['Log-Likelihood:'] = lambda x: "%#8.5g" % x.llf info['LL-Null:'] = lambda x: "%#8.5g" % x.llnull info['LLR p-value:'] = lambda x: "%#8.5g" % x.llr_pvalue info['Deviance:'] = lambda x: "%#8.5g" % x.deviance info['Pearson chi2:'] = lambda x: "%#6.3g" % x.pearson_chi2 info['F-statistic:'] = lambda x: "%#8.4g" % x.fvalue info['Prob (F-statistic):'] = lambda x: "%#6.3g" % x.f_pvalue info['Scale:'] = lambda x: "%#8.5g" % x.scale out = OrderedDict() for key, func in iteritems(info): try: out[key] = func(results) except (AttributeError, KeyError, NotImplementedError): # NOTE: some models don't have loglike defined (RLM), # so raise NotImplementedError pass return out
def _recode(x, levels): """ Recode categorial data to int factor. Parameters ---------- x : array-like array like object supporting with numpy array methods of categorially coded data. levels : dict mapping of labels to integer-codings Returns ------- out : instance numpy.ndarray """ from pandas import Series name = None index = None if isinstance(x, Series): name = x.name index = x.index x = x.values if x.dtype.type not in [np.str_, np.object_]: raise ValueError('This is not a categorial factor.' ' Array of str type required.') elif not isinstance(levels, dict): raise ValueError('This is not a valid value for levels.' ' Dict required.') elif not (np.unique(x) == np.unique(list(iterkeys(levels)))).all(): raise ValueError('The levels do not match the array values.') else: out = np.empty(x.shape[0], dtype=np.int) for level, coding in iteritems(levels): out[x == level] = coding if name: out = Series(out, name=name, index=index) return out
def _key_splitting(rect_dict, keys, values, key_subset, horizontal, gap): """ Given a dictionary where each entry is a rectangle, a list of key and value (count of elements in each category) it split each rect accordingly, as long as the key start with the tuple key_subset. The other keys are returned without modification. """ result = OrderedDict() L = len(key_subset) for name, (x, y, w, h) in iteritems(rect_dict): if key_subset == name[:L]: # split base on the values given divisions = _split_rect(x, y, w, h, values, horizontal, gap) for key, rect in zip(keys, divisions): result[name + (key, )] = rect else: result[name] = (x, y, w, h) return result
def test_multi_pvalcorrection(self): #test against R package multtest mt.rawp2adjp res_multtest = self.res2 pval0 = res_multtest[:,0] for k,v in iteritems(rmethods): if v[1] in self.methods: reject, pvalscorr = multipletests(pval0, alpha=self.alpha, method=v[1])[:2] assert_almost_equal(pvalscorr, res_multtest[:,v[0]], 15) assert_equal(reject, pvalscorr <= self.alpha) pvalscorr = np.sort(fdrcorrection(pval0, method='n')[1]) assert_almost_equal(pvalscorr, res_multtest[:,7], 15) pvalscorr = np.sort(fdrcorrection(pval0, method='i')[1]) assert_almost_equal(pvalscorr, res_multtest[:,6], 15)
def test_multi_pvalcorrection(self): #test against R package multtest mt.rawp2adjp res_multtest = self.res2 pval0 = res_multtest[:, 0] for k, v in iteritems(rmethods): if v[1] in self.methods: reject, pvalscorr = multipletests(pval0, alpha=self.alpha, method=v[1])[:2] assert_almost_equal(pvalscorr, res_multtest[:, v[0]], 15) assert_equal(reject, pvalscorr <= self.alpha) pvalscorr = np.sort(fdrcorrection(pval0, method='n')[1]) assert_almost_equal(pvalscorr, res_multtest[:, 7], 15) pvalscorr = np.sort(fdrcorrection(pval0, method='i')[1]) assert_almost_equal(pvalscorr, res_multtest[:, 6], 15)
def _key_splitting(rect_dict, keys, values, key_subset, horizontal, gap): """ Given a dictionary where each entry is a rectangle, a list of key and value (count of elements in each category) it split each rect accordingly, as long as the key start with the tuple key_subset. The other keys are returned without modification. """ result = OrderedDict() L = len(key_subset) for name, (x, y, w, h) in iteritems(rect_dict): if key_subset == name[:L]: # split base on the values given divisions = _split_rect(x, y, w, h, values, horizontal, gap) for key, rect in zip(keys, divisions): result[name + (key,)] = rect else: result[name] = (x, y, w, h) return result
def summary_model(results): '''Create a dict with information about the model ''' def time_now(**kwrds): now = datetime.datetime.now() return now.strftime('%Y-%m-%d %H:%M') info = OrderedDict() info['Model:'] = lambda x: x.model.__class__.__name__ info['Model Family:'] = lambda x: x.family.__class.__name__ info['Link Function:'] = lambda x: x.family.link.__class__.__name__ info['Dependent Variable:'] = lambda x: x.model.endog_names info['Date:'] = time_now() info['No. Observations:'] = lambda x: "%#6d" % x.nobs info['Df Model:'] = lambda x: "%#6d" % x.df_model info['Df Residuals:'] = lambda x: "%#6d" % x.df_resid info['Converged:'] = lambda x: x.mle_retvals['converged'] info['No. Iterations:'] = lambda x: x.mle_retvals['iterations'] info['Method:'] = lambda x: x.method info['Norm:'] = lambda x: x.fit_options['norm'] info['Scale Est.:'] = lambda x: x.fit_options['scale_est'] info['Cov. Type:'] = lambda x: x.fit_options['cov'] info['R-squared:'] = lambda x: "%#8.3f" % x.rsquared info['Adj. R-squared:'] = lambda x: "%#8.3f" % x.rsquared_adj info['Pseudo R-squared:'] = lambda x: "%#8.3f" % x.prsquared info['AIC:'] = lambda x: "%8.4f" % x.aic info['BIC:'] = lambda x: "%8.4f" % x.bic info['Log-Likelihood:'] = lambda x: "%#8.5g" % x.llf info['LL-Null:'] = lambda x: "%#8.5g" % x.llnull info['LLR p-value:'] = lambda x: "%#8.5g" % x.llr_pvalue info['Deviance:'] = lambda x: "%#8.5g" % x.deviance info['Pearson chi2:'] = lambda x: "%#6.3g" % x.pearson_chi2 info['F-statistic:'] = lambda x: "%#8.4g" % x.fvalue info['Prob (F-statistic):'] = lambda x: "%#6.3g" % x.f_pvalue info['Scale:'] = lambda x: "%#8.5g" % x.scale out = OrderedDict() for key in iteritems(info): try: out[key] = info[key](results) except: pass return out
def test_mosaic_very_complex(): # make a scattermatrix of mosaic plots to show the correlations between # each pair of variable in a dataset. Could be easily converted into a # new function that does this automatically based on the type of data key_name = ['gender', 'age', 'health', 'work'] key_base = (['male', 'female'], ['old', 'young'], ['healty', 'ill'], ['work', 'unemployed']) keys = list(product(*key_base)) data = OrderedDict(zip(keys, range(1, 1 + len(keys)))) props = {} props[('male', 'old')] = {'color': 'r'} props[('female',)] = {'color': 'pink'} L = len(key_base) fig, axes = pylab.subplots(L, L) for i in range(L): for j in range(L): m = set(range(L)).difference(set((i, j))) if i == j: axes[i, i].text(0.5, 0.5, key_name[i], ha='center', va='center') axes[i, i].set_xticks([]) axes[i, i].set_xticklabels([]) axes[i, i].set_yticks([]) axes[i, i].set_yticklabels([]) else: ji = max(i, j) ij = min(i, j) temp_data = OrderedDict([((k[ij], k[ji]) + tuple(k[r] for r in m), v) for k, v in iteritems(data)]) keys = list(iterkeys(temp_data)) for k in keys: value = _reduce_dict(temp_data, k[:2]) temp_data[k[:2]] = value del temp_data[k] mosaic(temp_data, ax=axes[i, j], axes_label=False, properties=props, gap=0.05, horizontal=i > j) pylab.suptitle('old males should look bright red, (plot 4 of 4)') #pylab.show() pylab.close('all')
def __init__(self, endog, exog, tree, paramsind): self.endog = endog self.datadict = exog self.tree = tree self.paramsind = paramsind self.branchsum = '' self.probs = {} self.probstxt = {} self.branchleaves = {} self.branchvalues = {} #just to keep track of returns by branches self.branchsums = {} self.bprobs = {} self.branches, self.leaves, self.branches_degenerate = getnodes(tree) self.nbranches = len(self.branches) #copied over but not quite sure yet #unique, parameter array names, #sorted alphabetically, order is/should be only internal self.paramsnames = (sorted(set([i for j in itervalues(paramsind) for i in j])) + ['tau_%s' % bname for bname in self.branches]) self.nparams = len(self.paramsnames) #mapping coefficient names to indices to unique/parameter array self.paramsidx = dict((name, idx) for (idx,name) in enumerate(self.paramsnames)) #mapping branch and leaf names to index in parameter array self.parinddict = dict((k, [self.paramsidx[j] for j in v]) for k,v in iteritems(self.paramsind)) self.recursionparams = 1. + np.arange(len(self.paramsnames)) #for testing that individual parameters are used in the right place self.recursionparams = np.zeros(len(self.paramsnames)) #self.recursionparams[2] = 1 self.recursionparams[-self.nbranches:] = 1 #values for tau's
def __init__(self, endog, exog, tree, paramsind): self.endog = endog self.datadict = exog self.tree = tree self.paramsind = paramsind self.branchsum = '' self.probs = {} self.probstxt = {} self.branchleaves = {} self.branchvalues = {} #just to keep track of returns by branches self.branchsums = {} self.bprobs = {} self.branches, self.leaves, self.branches_degenerate = getnodes(tree) self.nbranches = len(self.branches) #copied over but not quite sure yet #unique, parameter array names, #sorted alphabetically, order is/should be only internal self.paramsnames = ( sorted(set([i for j in itervalues(paramsind) for i in j])) + ['tau_%s' % bname for bname in self.branches]) self.nparams = len(self.paramsnames) #mapping coefficient names to indices to unique/parameter array self.paramsidx = dict( (name, idx) for (idx, name) in enumerate(self.paramsnames)) #mapping branch and leaf names to index in parameter array self.parinddict = dict((k, [self.paramsidx[j] for j in v]) for k, v in iteritems(self.paramsind)) self.recursionparams = 1. + np.arange(len(self.paramsnames)) #for testing that individual parameters are used in the right place self.recursionparams = np.zeros(len(self.paramsnames)) #self.recursionparams[2] = 1 self.recursionparams[-self.nbranches:] = 1 #values for tau's
def test_adf_autolag(): #see issue #246 #this is mostly a unit test d2 = macrodata.load().data for k_trend, tr in enumerate(['nc', 'c', 'ct', 'ctt']): #[None:'nc', 0:'c', 1:'ct', 2:'ctt'] x = np.log(d2['realgdp']) xd = np.diff(x) #check exog adf3 = tsast.adfuller(x, maxlag=None, autolag='aic', regression=tr, store=True, regresults=True) st2 = adf3[-1] assert_equal(len(st2.autolag_results), 15 + 1) #+1 for lagged level for l, res in sorted(iteritems(st2.autolag_results))[:5]: lag = l-k_trend #assert correct design matrices in _autolag assert_equal(res.model.exog[-10:,k_trend], x[-11:-1]) assert_equal(res.model.exog[-1,k_trend+1:], xd[-lag:-1][::-1]) #min-ic lag of dfgls in Stata is also 2, or 9 for maic with notrend assert_equal(st2.usedlag, 2) #same result with lag fixed at usedlag of autolag adf2 = tsast.adfuller(x, maxlag=2, autolag=None, regression=tr) assert_almost_equal(adf3[:2], adf2[:2], decimal=12) tr = 'c' #check maxlag with autolag adf3 = tsast.adfuller(x, maxlag=5, autolag='aic', regression=tr, store=True, regresults=True) assert_equal(len(adf3[-1].autolag_results), 5 + 1) adf3 = tsast.adfuller(x, maxlag=0, autolag='aic', regression=tr, store=True, regresults=True) assert_equal(len(adf3[-1].autolag_results), 0 + 1)
def __init__(self, **kwargs): for key, value in iteritems(kwargs): setattr(self, key, value)
def resid(self): data = {} for eq, result in iteritems(self.equations): data[eq] = result.resid return pd.DataFrame(data)
def r2(self): """Returns the r-squared values.""" data = dict((eq, r.r2) for eq, r in iteritems(self.equations)) return pd.DataFrame(data)
def nobs(self): # Stub, do I need this? data = dict((eq, r.nobs) for eq, r in iteritems(self.equations)) return pd.DataFrame(data)
def _autolag(mod, endog, exog, startlag, maxlag, method, modargs=(), fitargs=(), regresults=False): """ Returns the results for the lag length that maximizes the info criterion. Parameters ---------- mod : Model class Model estimator class endog : array-like nobs array containing endogenous variable exog : array-like nobs by (startlag + maxlag) array containing lags and possibly other variables startlag : int The first zero-indexed column to hold a lag. See Notes. maxlag : int The highest lag order for lag length selection. method : {'aic', 'bic', 't-stat'} aic - Akaike Information Criterion bic - Bayes Information Criterion t-stat - Based on last lag modargs : tuple, optional args to pass to model. See notes. fitargs : tuple, optional args to pass to fit. See notes. regresults : bool, optional Flag indicating to return optional return results Returns ------- icbest : float Best information criteria. bestlag : int The lag length that maximizes the information criterion. results : dict, optional Dictionary containing all estimation results Notes ----- Does estimation like mod(endog, exog[:,:i], *modargs).fit(*fitargs) where i goes from lagstart to lagstart+maxlag+1. Therefore, lags are assumed to be in contiguous columns from low to high lag length with the highest lag in the last column. """ #TODO: can tcol be replaced by maxlag + 2? #TODO: This could be changed to laggedRHS and exog keyword arguments if # this will be more general. results = {} method = method.lower() for lag in range(startlag, startlag + maxlag + 1): mod_instance = mod(endog, exog[:, :lag], *modargs) results[lag] = mod_instance.fit() if method == "aic": icbest, bestlag = min((v.aic, k) for k, v in iteritems(results)) elif method == "bic": icbest, bestlag = min((v.bic, k) for k, v in iteritems(results)) elif method == "t-stat": #stop = stats.norm.ppf(.95) stop = 1.6448536269514722 for lag in range(startlag + maxlag, startlag - 1, -1): icbest = np.abs(results[lag].tvalues[-1]) if np.abs(icbest) >= stop: bestlag = lag icbest = icbest break else: raise ValueError("Information Criterion %s not understood.") % method if not regresults: return icbest, bestlag else: return icbest, bestlag, results
def solve_power(self, **kwds): '''solve for any one of the parameters of a t-test for t-test the keywords are: effect_size, nobs, alpha, power exactly one needs to be ``None``, all others need numeric values *attaches* cache_fit_res : list Cache of the result of the root finding procedure for the latest call to ``solve_power``, mainly for debugging purposes. The first element is the success indicator, one if successful. The remaining elements contain the return information of the up to three solvers that have been tried. ''' #TODO: maybe use explicit kwds, # nicer but requires inspect? and not generic across tests # I'm duplicating this in the subclass to get informative docstring key = [k for k,v in iteritems(kwds) if v is None] #print kwds, key; if len(key) != 1: raise ValueError('need exactly one keyword that is None') key = key[0] if key == 'power': del kwds['power'] return self.power(**kwds) self._counter = 0 def func(x): kwds[key] = x fval = self._power_identity(**kwds) self._counter += 1 #print self._counter, if self._counter > 500: raise RuntimeError('possible endless loop (500 NaNs)') if np.isnan(fval): return np.inf else: return fval #TODO: I'm using the following so I get a warning when start_ttp is not defined try: start_value = self.start_ttp[key] except KeyError: start_value = 0.9 print('Warning: using default start_value for {0}'.format(key)) fit_kwds = self.start_bqexp[key] fit_res = [] #print vars() try: val, res = brentq_expanding(func, full_output=True, **fit_kwds) failed = False fit_res.append(res) except ValueError: failed = True fit_res.append(None) success = None if (not failed) and res.converged: success = 1 else: # try backup #TODO: check more cases to make this robust val, infodict, ier, msg = optimize.fsolve(func, start_value, full_output=True) #scalar #val = optimize.newton(func, start_value) #scalar fval = infodict['fvec'] fit_res.append(infodict) if ier == 1 and np.abs(fval) < 1e-4 : success = 1 else: #print infodict if key in ['alpha', 'power', 'effect_size']: val, r = optimize.brentq(func, 1e-8, 1-1e-8, full_output=True) #scalar success = 1 if r.converged else 0 fit_res.append(r) else: success = 0 if not success == 1: import warnings from statsmodels.tools.sm_exceptions import (ConvergenceWarning, convergence_doc) warnings.warn(convergence_doc, ConvergenceWarning) #attach fit_res, for reading only, should be needed only for debugging fit_res.insert(0, success) self.cache_fit_res = fit_res return val
from statsmodels.datasets import macrodata import statsmodels.tsa.stattools as tsa_stats # some example data mdata = macrodata.load_pandas().data mdata = mdata[['realgdp','realcons']].values data = mdata data = np.diff(np.log(data), axis=0) #R: lmtest:grangertest r_result = [0.243097, 0.7844328, 195, 2] #f_test gr = tsa_stats.grangercausalitytests(data[:,1::-1], 2, verbose=False) assert_almost_equal(r_result, gr[2][0]['ssr_ftest'], decimal=7) assert_almost_equal(gr[2][0]['params_ftest'], gr[2][0]['ssr_ftest'], decimal=7) lag = 2 print('\nTest Results for %d lags' % lag) print() print('\n'.join(['%-20s statistic: %f6.4 p-value: %f6.4' % (k, res[0], res[1]) for k, res in iteritems(gr[lag][0]) ])) print('\n Results for auxiliary restricted regression with two lags') print() print(gr[lag][1][0].summary()) print('\n Results for auxiliary unrestricted regression with two lags') print() print(gr[lag][1][1].summary())
def arma_order_select_ic(y, max_ar=4, max_ma=2, ic='bic', trend='c', model_kw={}, fit_kw={}): """ Returns information criteria for many ARMA models Parameters ---------- y : array-like Time-series data max_ar : int Maximum number of AR lags to use. Default 4. max_ma : int Maximum number of MA lags to use. Default 2. ic : str, list Information criteria to report. Either a single string or a list of different criteria is possible. trend : str The trend to use when fitting the ARMA models. model_kw : dict Keyword arguments to be passed to the ``ARMA`` model fit_kw : dict Keyword arguments to be passed to ``ARMA.fit``. Returns ------- obj : Results object Each ic is an attribute with a DataFrame for the results. The AR order used is the row index. The ma order used is the column index. The minimum orders are available as ``ic_min_order``. Examples -------- >>> from statsmodels.tsa.arima_process import arma_generate_sample >>> import statsmodels.api as sm >>> import numpy as np >>> arparams = np.array([.75, -.25]) >>> maparams = np.array([.65, .35]) >>> arparams = np.r_[1, -arparams] >>> maparam = np.r_[1, maparams] >>> nobs = 250 >>> np.random.seed(2014) >>> y = arma_generate_sample(arparams, maparams, nobs) >>> res = sm.tsa.arma_order_select_ic(y, ic=['aic', 'bic'], trend='nc') >>> res.aic_min_order >>> res.bic_min_order Notes ----- This method can be used to tentatively identify the order of an ARMA from process, provided that the time series is stationary and invertible. This function computes the full exact MLE estimate of each model and can be, therefore a little slow. An implementation using approximate estimates will be provided in the future. In the meantime, consider passing {method : 'css'} to fit_kw. """ from pandas import DataFrame ar_range = lrange(0, max_ar + 1) ma_range = lrange(0, max_ma + 1) if isinstance(ic, string_types): ic = [ic] elif not isinstance(ic, (list, tuple)): raise ValueError("Need a list or a tuple for ic if not a string.") results = np.zeros((len(ic), max_ar + 1, max_ma + 1)) for ar in ar_range: for ma in ma_range: if ar == 0 and ma == 0 and trend == 'nc': results[:, ar, ma] = np.nan continue mod = _safe_arma_fit(y, (ar, ma), model_kw, trend, fit_kw) if mod is None: results[:, ar, ma] = np.nan continue for i, criteria in enumerate(ic): results[i, ar, ma] = getattr(mod, criteria) dfs = [DataFrame(res, columns=ma_range, index=ar_range) for res in results] res = dict(zip(ic, dfs)) # add the minimums to the results dict min_res = {} for i, result in iteritems(res): mins = np.where(result.min().min() == result) min_res.update({i + '_min_order' : (mins[0][0], mins[1][0])}) res.update(min_res) return Bunch(**res)
def __init__(self, **kwargs): for key, value in iteritems(kwargs): setattr(self, key, value) self.nobs = len(self.observed)
def _autolag(mod, endog, exog, startlag, maxlag, method, modargs=(), fitargs=(), regresults=False): """ Returns the results for the lag length that maximimizes the info criterion. Parameters ---------- mod : Model class Model estimator class. modargs : tuple args to pass to model. See notes. fitargs : tuple args to pass to fit. See notes. lagstart : int The first zero-indexed column to hold a lag. See Notes. maxlag : int The highest lag order for lag length selection. method : str {"aic","bic","t-stat"} aic - Akaike Information Criterion bic - Bayes Information Criterion t-stat - Based on last lag Returns ------- icbest : float Best information criteria. bestlag : int The lag length that maximizes the information criterion. Notes ----- Does estimation like mod(endog, exog[:,:i], *modargs).fit(*fitargs) where i goes from lagstart to lagstart+maxlag+1. Therefore, lags are assumed to be in contiguous columns from low to high lag length with the highest lag in the last column. """ #TODO: can tcol be replaced by maxlag + 2? #TODO: This could be changed to laggedRHS and exog keyword arguments if # this will be more general. results = {} method = method.lower() for lag in range(startlag, startlag + maxlag + 1): mod_instance = mod(endog, exog[:, :lag], *modargs) results[lag] = mod_instance.fit() if method == "aic": icbest, bestlag = min((v.aic, k) for k, v in iteritems(results)) elif method == "bic": icbest, bestlag = min((v.bic, k) for k, v in iteritems(results)) elif method == "t-stat": #stop = stats.norm.ppf(.95) stop = 1.6448536269514722 for lag in range(startlag + maxlag, startlag - 1, -1): icbest = np.abs(results[lag].tvalues[-1]) if np.abs(icbest) >= stop: bestlag = lag icbest = icbest break else: raise ValueError("Information Criterion %s not understood.") % method if not regresults: return icbest, bestlag else: return icbest, bestlag, results
def handle_missing(cls, endog, exog, missing, **kwargs): """ This returns a dictionary with keys endog, exog and the keys of kwargs. It preserves Nones. """ none_array_names = [] # patsy's already dropped NaNs in y/X missing_idx = kwargs.pop('missing_idx', None) if missing_idx is not None: # y, X already handled by patsy. add back in later. combined = () combined_names = [] if exog is None: none_array_names += ['exog'] elif exog is not None: combined = (endog, exog) combined_names = ['endog', 'exog'] else: combined = (endog, ) combined_names = ['endog'] none_array_names += ['exog'] # deal with other arrays combined_2d = () combined_2d_names = [] if len(kwargs): for key, value_array in iteritems(kwargs): if value_array is None or value_array.ndim == 0: none_array_names += [key] continue # grab 1d arrays if value_array.ndim == 1: combined += (np.asarray(value_array), ) combined_names += [key] elif value_array.squeeze().ndim == 1: combined += (np.asarray(value_array), ) combined_names += [key] # grab 2d arrays that are _assumed_ to be symmetric elif value_array.ndim == 2: combined_2d += (np.asarray(value_array), ) combined_2d_names += [key] else: raise ValueError("Arrays with more than 2 dimensions " "aren't yet handled") if missing_idx is not None: nan_mask = missing_idx updated_row_mask = None if combined: # there were extra arrays not handled by patsy combined_nans = _nan_rows(*combined) if combined_nans.shape[0] != nan_mask.shape[0]: raise ValueError("Shape mismatch between endog/exog " "and extra arrays given to model.") # for going back and updated endog/exog updated_row_mask = combined_nans[~nan_mask] nan_mask |= combined_nans # for updating extra arrays only if combined_2d: combined_2d_nans = _nan_rows(combined_2d) if combined_2d_nans.shape[0] != nan_mask.shape[0]: raise ValueError("Shape mismatch between endog/exog " "and extra 2d arrays given to model.") if updated_row_mask is not None: updated_row_mask |= combined_2d_nans[~nan_mask] else: updated_row_mask = combined_2d_nans[~nan_mask] nan_mask |= combined_2d_nans else: nan_mask = _nan_rows(*combined) if combined_2d: nan_mask = _nan_rows(*(nan_mask[:, None], ) + combined_2d) if not np.any(nan_mask): # no missing don't do anything combined = dict(zip(combined_names, combined)) if combined_2d: combined.update(dict(zip(combined_2d_names, combined_2d))) if none_array_names: combined.update( dict(zip(none_array_names, [None] * len(none_array_names)))) if missing_idx is not None: combined.update({'endog': endog}) if exog is not None: combined.update({'exog': exog}) return combined, [] elif missing == 'raise': raise MissingDataError("NaNs were encountered in the data") elif missing == 'drop': nan_mask = ~nan_mask drop_nans = lambda x: cls._drop_nans(x, nan_mask) drop_nans_2d = lambda x: cls._drop_nans_2d(x, nan_mask) combined = dict(zip(combined_names, lmap(drop_nans, combined))) if missing_idx is not None: if updated_row_mask is not None: updated_row_mask = ~updated_row_mask # update endog/exog with this new information endog = cls._drop_nans(endog, updated_row_mask) if exog is not None: exog = cls._drop_nans(exog, updated_row_mask) combined.update({'endog': endog}) if exog is not None: combined.update({'exog': exog}) if combined_2d: combined.update( dict( zip(combined_2d_names, lmap(drop_nans_2d, combined_2d)))) if none_array_names: combined.update( dict(zip(none_array_names, [None] * len(none_array_names)))) return combined, np.where(~nan_mask)[0].tolist() else: raise ValueError("missing option %s not understood" % missing)
def solve_power(self, **kwds): '''solve for any one of the parameters of a t-test for t-test the keywords are: effect_size, nobs, alpha, power exactly one needs to be ``None``, all others need numeric values *attaches* cache_fit_res : list Cache of the result of the root finding procedure for the latest call to ``solve_power``, mainly for debugging purposes. The first element is the success indicator, one if successful. The remaining elements contain the return information of the up to three solvers that have been tried. ''' #TODO: maybe use explicit kwds, # nicer but requires inspect? and not generic across tests # I'm duplicating this in the subclass to get informative docstring key = [k for k, v in iteritems(kwds) if v is None] #print kwds, key if len(key) != 1: raise ValueError('need exactly one keyword that is None') key = key[0] if key == 'power': del kwds['power'] return self.power(**kwds) if kwds['effect_size'] == 0: import warnings from statsmodels.tools.sm_exceptions import HypothesisTestWarning warnings.warn('Warning: Effect size of 0 detected', HypothesisTestWarning) if key == 'power': return kwds['alpha'] if key == 'alpha': return kwds['power'] else: raise ValueError( 'Cannot detect an effect-size of 0. Try changing your effect-size.' ) self._counter = 0 def func(x): kwds[key] = x fval = self._power_identity(**kwds) self._counter += 1 #print self._counter, if self._counter > 500: raise RuntimeError('possible endless loop (500 NaNs)') if np.isnan(fval): return np.inf else: return fval #TODO: I'm using the following so I get a warning when start_ttp is not defined try: start_value = self.start_ttp[key] except KeyError: start_value = 0.9 import warnings from statsmodels.tools.sm_exceptions import ValueWarning warnings.warn( 'Warning: using default start_value for {0}'.format(key), ValueWarning) fit_kwds = self.start_bqexp[key] fit_res = [] #print vars() try: val, res = brentq_expanding(func, full_output=True, **fit_kwds) failed = False fit_res.append(res) except ValueError: failed = True fit_res.append(None) success = None if (not failed) and res.converged: success = 1 else: # try backup # TODO: check more cases to make this robust if not np.isnan(start_value): val, infodict, ier, msg = optimize.fsolve( func, start_value, full_output=True) #scalar #val = optimize.newton(func, start_value) #scalar fval = infodict['fvec'] fit_res.append(infodict) else: ier = -1 fval = 1 fit_res.append([None]) if ier == 1 and np.abs(fval) < 1e-4: success = 1 else: #print infodict if key in ['alpha', 'power', 'effect_size']: val, r = optimize.brentq(func, 1e-8, 1 - 1e-8, full_output=True) #scalar success = 1 if r.converged else 0 fit_res.append(r) else: success = 0 if not success == 1: import warnings from statsmodels.tools.sm_exceptions import (ConvergenceWarning, convergence_doc) warnings.warn(convergence_doc, ConvergenceWarning) #attach fit_res, for reading only, should be needed only for debugging fit_res.insert(0, success) self.cache_fit_res = fit_res return val
def handle_missing(cls, endog, exog, missing, **kwargs): """ This returns a dictionary with keys endog, exog and the keys of kwargs. It preserves Nones. """ none_array_names = [] # patsy's already dropped NaNs in y/X missing_idx = kwargs.pop('missing_idx', None) if missing_idx is not None: # y, X already handled by patsy. add back in later. combined = () combined_names = [] if exog is None: none_array_names += ['exog'] elif exog is not None: combined = (endog, exog) combined_names = ['endog', 'exog'] else: combined = (endog,) combined_names = ['endog'] none_array_names += ['exog'] # deal with other arrays combined_2d = () combined_2d_names = [] if len(kwargs): for key, value_array in iteritems(kwargs): if value_array is None or value_array.ndim == 0: none_array_names += [key] continue # grab 1d arrays if value_array.ndim == 1: combined += (np.asarray(value_array),) combined_names += [key] elif value_array.squeeze().ndim == 1: combined += (np.asarray(value_array),) combined_names += [key] # grab 2d arrays that are _assumed_ to be symmetric elif value_array.ndim == 2: combined_2d += (np.asarray(value_array),) combined_2d_names += [key] else: raise ValueError("Arrays with more than 2 dimensions " "aren't yet handled") if missing_idx is not None: nan_mask = missing_idx updated_row_mask = None if combined: # there were extra arrays not handled by patsy combined_nans = _nan_rows(*combined) if combined_nans.shape[0] != nan_mask.shape[0]: raise ValueError("Shape mismatch between endog/exog " "and extra arrays given to model.") # for going back and updated endog/exog updated_row_mask = combined_nans[~nan_mask] nan_mask |= combined_nans # for updating extra arrays only if combined_2d: combined_2d_nans = _nan_rows(combined_2d) if combined_2d_nans.shape[0] != nan_mask.shape[0]: raise ValueError("Shape mismatch between endog/exog " "and extra 2d arrays given to model.") if updated_row_mask is not None: updated_row_mask |= combined_2d_nans[~nan_mask] else: updated_row_mask = combined_2d_nans[~nan_mask] nan_mask |= combined_2d_nans else: nan_mask = _nan_rows(*combined) if combined_2d: nan_mask = _nan_rows(*(nan_mask[:, None],) + combined_2d) if not np.any(nan_mask): # no missing don't do anything combined = dict(zip(combined_names, combined)) if combined_2d: combined.update(dict(zip(combined_2d_names, combined_2d))) if none_array_names: combined.update(dict(zip(none_array_names, [None] * len(none_array_names)))) if missing_idx is not None: combined.update({'endog': endog}) if exog is not None: combined.update({'exog': exog}) return combined, [] elif missing == 'raise': raise MissingDataError("NaNs were encountered in the data") elif missing == 'drop': nan_mask = ~nan_mask drop_nans = lambda x: cls._drop_nans(x, nan_mask) drop_nans_2d = lambda x: cls._drop_nans_2d(x, nan_mask) combined = dict(zip(combined_names, lmap(drop_nans, combined))) if missing_idx is not None: if updated_row_mask is not None: updated_row_mask = ~updated_row_mask # update endog/exog with this new information endog = cls._drop_nans(endog, updated_row_mask) if exog is not None: exog = cls._drop_nans(exog, updated_row_mask) combined.update({'endog': endog}) if exog is not None: combined.update({'exog': exog}) if combined_2d: combined.update(dict(zip(combined_2d_names, lmap(drop_nans_2d, combined_2d)))) if none_array_names: combined.update(dict(zip(none_array_names, [None] * len(none_array_names)))) return combined, np.where(~nan_mask)[0].tolist() else: raise ValueError("missing option %s not understood" % missing)
def arma_order_select_ic(y, max_ar=4, max_ma=2, ic='bic', trend='c', model_kw={}, fit_kw={}): """ Returns information criteria for many ARMA models Parameters ---------- y : array-like Time-series data max_ar : int Maximum number of AR lags to use. Default 4. max_ma : int Maximum number of MA lags to use. Default 2. ic : str, list Information criteria to report. Either a single string or a list of different criteria is possible. trend : str The trend to use when fitting the ARMA models. model_kw : dict Keyword arguments to be passed to the ``ARMA`` model fit_kw : dict Keyword arguments to be passed to ``ARMA.fit``. Returns ------- obj : Results object Each ic is an attribute with a DataFrame for the results. The AR order used is the row index. The ma order used is the column index. The minimum orders are available as ``ic_min_order``. Examples -------- >>> from statsmodels.tsa.arima_process import arma_generate_sample >>> import statsmodels.api as sm >>> import numpy as np >>> arparams = np.array([.75, -.25]) >>> maparams = np.array([.65, .35]) >>> arparams = np.r_[1, -arparams] >>> maparam = np.r_[1, maparams] >>> nobs = 250 >>> np.random.seed(2014) >>> y = arma_generate_sample(arparams, maparams, nobs) >>> res = sm.tsa.arma_order_select_ic(y, ic=['aic', 'bic'], trend='nc') >>> res.aic_min_order >>> res.bic_min_order Notes ----- This method can be used to tentatively identify the order of an ARMA process, provided that the time series is stationary and invertible. This function computes the full exact MLE estimate of each model and can be, therefore a little slow. An implementation using approximate estimates will be provided in the future. In the meantime, consider passing {method : 'css'} to fit_kw. """ from pandas import DataFrame ar_range = lrange(0, max_ar + 1) ma_range = lrange(0, max_ma + 1) if isinstance(ic, string_types): ic = [ic] elif not isinstance(ic, (list, tuple)): raise ValueError("Need a list or a tuple for ic if not a string.") results = np.zeros((len(ic), max_ar + 1, max_ma + 1)) for ar in ar_range: for ma in ma_range: if ar == 0 and ma == 0 and trend == 'nc': results[:, ar, ma] = np.nan continue mod = _safe_arma_fit(y, (ar, ma), model_kw, trend, fit_kw) if mod is None: results[:, ar, ma] = np.nan continue for i, criteria in enumerate(ic): results[i, ar, ma] = getattr(mod, criteria) dfs = [DataFrame(res, columns=ma_range, index=ar_range) for res in results] res = dict(zip(ic, dfs)) # add the minimums to the results dict min_res = {} for i, result in iteritems(res): mins = np.where(result.min().min() == result) min_res.update({i + '_min_order' : (mins[0][0], mins[1][0])}) res.update(min_res) return Bunch(**res)
def set_options(self, **kwargs): options = "" for key, value in iteritems(kwargs): options += "{0}={1}\n".format(key, value) self.__dict__.update({key: value}) self.options = options
"g": ["p", "hincg"], "B3": [], "h": ["consth", "p", "h"], "top": [], } # unique, parameter array names, # sorted alphabetically, order is/should be only internal paramsnames = sorted(set([i for j in itervalues(paramsind) for i in j])) # mapping coefficient names to indices to unique/parameter array paramsidx = dict((name, idx) for (idx, name) in enumerate(paramsnames)) # mapping branch and leaf names to index in parameter array inddict = dict((k, [paramsidx[j] for j in v]) for k, v in iteritems(paramsind)) """ >>> paramsnames ['const2', 'consta', 'constb', 'conste', 'consth', 'constt', 'h', 'hince', 'hincf', 'hincg', 'p', 'time', 'x2', 'x22'] >>> parmasidx {'conste': 3, 'consta': 1, 'constb': 2, 'h': 6, 'time': 11, 'consth': 4, 'p': 10, 'constt': 5, 'const2': 0, 'x2': 12, 'x22': 13, 'hince': 7, 'hincg': 9, 'hincf': 8} >>> inddict {'a': [1, 10], 'c': [1, 10, 11], 'b': [2, 10], 'e': [3, 10, 7], 'd': [1, 10, 11], 'g': [10, 9], 'f': [5, 10, 8], 'h': [4, 10, 6], 'top': [], 'B22': [13], 'B21': [], 'B1': [], 'B2': [0, 12], 'B3': []} >>> paramsind {'a': ['consta', 'p'], 'c': ['consta', 'p', 'time'], 'b': ['constb', 'p'],