def pdf(self, endog_predict=None, exog_predict=None):
    r"""
    Evaluate the conditional probability density function.

    Parameters
    ----------
    endog_predict : array_like, optional
        Evaluation data for the dependent variables.  When omitted, the
        training data (``self.endog``) is used.
    exog_predict : array_like, optional
        Evaluation data for the independent variables.  When omitted, the
        training data (``self.exog``) is used.

    Returns
    -------
    pdf : array_like
        The value of the probability density at `endog_predict` and
        `exog_predict`, squeezed with ``np.squeeze``.

    Notes
    -----
    The formula for the conditional probability density is:

    .. math:: f(X|Y)=\frac{f(X,Y)}{f(Y)}

    with

    .. math:: f(X)=\prod_{s=1}^{q}h_{s}^{-1}k
                        \left(\frac{X_{is}-X_{js}}{h_{s}}\right)

    where :math:`k` is the appropriate kernel for each variable.
    """
    endog_eval = (self.endog if endog_predict is None
                  else _adjust_shape(endog_predict, self.k_dep))
    exog_eval = (self.exog if exog_predict is None
                 else _adjust_shape(exog_predict, self.k_indep))

    # Joint evaluation points: dependent columns first, then independent.
    joint_eval = np.column_stack((endog_eval, exog_eval))
    joint_types = self.dep_type + self.indep_type
    # The trailing bandwidths belong to the independent variables.
    exog_bw = self.bw[self.k_dep:]

    def _density_ratio(row):
        # Joint kernel estimate divided by the marginal one at this point.
        joint = gpke(self.bw, data=self.data,
                     data_predict=joint_eval[row, :],
                     var_type=joint_types)
        marginal = gpke(exog_bw, data=self.exog,
                        data_predict=exog_eval[row, :],
                        var_type=self.indep_type)
        return joint / marginal

    n_points = np.shape(joint_eval)[0]
    return np.squeeze([_density_ratio(row) for row in xrange(n_points)])
def _compute_lambda(self, Y, X):
    """
    Compute lambda, the main part of the test statistic.

    Parameters
    ----------
    Y : array_like
        Dependent variable; reshaped to a single column.
    X : array_like
        Regressors; reshaped to ``self.k_vars`` columns.

    Returns
    -------
    lam : float
        Sum of the squared marginal effects of the tested variables
        (``self.test_vars``), divided by the number of rows of `X`.
    """
    # The row count is taken from X as passed in, before any reshaping.
    n_obs = np.shape(X)[0]
    Y = _adjust_shape(Y, 1)
    X = _adjust_shape(X, self.k_vars)

    settings = EstimatorSettings(efficient=False)
    regression = KernelReg(Y, X, self.var_type, self.model.reg_type,
                           self.bw, defaults=settings)
    # fit() returns (mean, marginal effects); only the latter is needed.
    marginal_effects = regression.fit()[1]
    tested = np.reshape(marginal_effects[:, self.test_vars],
                        (n_obs, len(self.test_vars)))

    # The statistic is deliberately left unpivoted: the scale is fixed at
    # one instead of dividing by a standard error such as np.std(tested).
    scale = 1.
    return ((tested / scale) ** 2).sum() / float(n_obs)
def pdf(self, data_predict=None):
    r"""
    Evaluate the probability density function.

    Parameters
    ----------
    data_predict : array_like, optional
        Points to evaluate at.  When omitted, the training data
        (``self.data``) is used.

    Returns
    -------
    pdf_est : array_like
        Probability density function evaluated at `data_predict`,
        squeezed with ``np.squeeze``.

    Notes
    -----
    The probability density is given by the generalized product kernel
    estimator:

    .. math:: K_{h}(X_{i},X_{j}) =
        \prod_{s=1}^{q}h_{s}^{-1}k\left(\frac{X_{is}-X_{js}}{h_{s}}\right)
    """
    if data_predict is None:
        points = self.data
    else:
        points = _adjust_shape(data_predict, self.k_vars)

    n_points = np.shape(points)[0]
    # One kernel evaluation per point, scaled by the number of observations.
    densities = [
        gpke(self.bw, data=self.data, data_predict=points[row, :],
             var_type=self.var_type) / self.nobs
        for row in xrange(n_points)
    ]
    return np.squeeze(densities)
def fit(self, data_predict=None):
    """
    Return the mean and marginal effects at the `data_predict` points.

    Parameters
    ----------
    data_predict : array_like, optional
        Points at which to return the mean and marginal effects.  If not
        given, ``data_predict == exog``.

    Returns
    -------
    mean : ndarray
        The regression result for the mean (i.e. the actual curve), one
        value per evaluation point.
    mfx : ndarray
        The marginal effects, i.e. the partial derivatives of the mean,
        with one row per evaluation point and ``self.k_vars`` columns.
    """
    # Select the estimator registered for the configured regression type.
    estimator = self.est[self.reg_type]

    points = (self.exog if data_predict is None
              else _adjust_shape(data_predict, self.k_vars))
    n_points = np.shape(points)[0]

    mean = np.empty((n_points,))
    mfx = np.empty((n_points, self.k_vars))
    for row in xrange(n_points):
        # The estimator returns the mean first and the marginal effects
        # second for a single evaluation point.
        result = estimator(self.bw, self.endog, self.exog,
                           data_predict=points[row, :])
        mean[row] = result[0]
        mfx[row, :] = np.squeeze(result[1])

    return mean, mfx
def __init__(self, endog, exog, var_type, reg_type='ll', bw='cv_ls',
             defaults=None):
    """
    Set up the kernel regression and compute its bandwidth.

    Parameters
    ----------
    endog : array_like
        Dependent variable; reshaped to a single column.
    exog : array_like
        Independent variable(s); reshaped to ``len(var_type)`` columns.
    var_type : str
        Type specification of the independent variables, one entry per
        variable.
    reg_type : str, optional
        Key selecting the estimator: ``'lc'`` maps to
        ``_est_loc_constant`` and ``'ll'`` to ``_est_loc_linear``.
        Default is ``'ll'``.
    bw : optional
        Bandwidth specification handed to the bandwidth routine.  The
        selector names registered in ``self.bw_func`` are ``'cv_ls'`` and
        ``'aic'``.  Default is ``'cv_ls'``.
    defaults : EstimatorSettings, optional
        Estimation settings.  If None, a new ``EstimatorSettings()`` is
        created for this instance.
    """
    # Build the default settings per call: an instance created in the
    # signature would be evaluated once and shared by every estimator.
    if defaults is None:
        defaults = EstimatorSettings()

    self.var_type = var_type
    self.data_type = var_type  # same specification under a second name
    self.reg_type = reg_type
    self.k_vars = len(self.var_type)
    self.endog = _adjust_shape(endog, 1)
    self.exog = _adjust_shape(exog, self.k_vars)
    # Combined sample with the dependent variable in the first column.
    self.data = np.column_stack((self.endog, self.exog))
    self.nobs = np.shape(self.exog)[0]
    # Dispatch tables for bandwidth selectors and estimators.
    self.bw_func = dict(cv_ls=self.cv_loo, aic=self.aic_hurvich)
    self.est = dict(lc=self._est_loc_constant, ll=self._est_loc_linear)
    # NOTE(review): presumably this sets ``self.efficient`` (read just
    # below) from `defaults` -- confirm against `_set_defaults`.
    self._set_defaults(defaults)
    if not self.efficient:
        self.bw = self._compute_reg_bw(bw)
    else:
        self.bw = self._compute_efficient(bw)
def __init__(self, endog, exog, dep_type, indep_type, bw,
             defaults=None):
    """
    Set up the conditional density estimator and compute its bandwidth.

    Parameters
    ----------
    endog : array_like
        Dependent variable(s); reshaped to ``len(dep_type)`` columns.
    exog : array_like
        Independent variable(s); reshaped to ``len(indep_type)`` columns.
    dep_type : str
        Type specification of the dependent variables, one entry per
        variable.
    indep_type : str
        Type specification of the independent variables, one entry per
        variable.
    bw
        Bandwidth specification handed to the bandwidth routine.
    defaults : EstimatorSettings, optional
        Estimation settings.  If None, a new ``EstimatorSettings()`` is
        created for this instance.
    """
    # Build the default settings per call: an instance created in the
    # signature would be evaluated once and shared by every estimator.
    if defaults is None:
        defaults = EstimatorSettings()

    self.dep_type = dep_type
    self.indep_type = indep_type
    # Joint type specification: dependent variables first.
    self.data_type = dep_type + indep_type
    self.k_dep = len(self.dep_type)
    self.k_indep = len(self.indep_type)
    self.endog = _adjust_shape(endog, self.k_dep)
    self.exog = _adjust_shape(exog, self.k_indep)
    # k_dep is re-read from the reshaped data together with nobs.
    self.nobs, self.k_dep = np.shape(self.endog)
    self.data = np.column_stack((self.endog, self.exog))
    self.k_vars = np.shape(self.data)[1]
    # NOTE(review): presumably this sets ``self.efficient`` (read just
    # below) from `defaults` -- confirm against `_set_defaults`.
    self._set_defaults(defaults)
    if not self.efficient:
        self.bw = self._compute_bw(bw)
    else:
        self.bw = self._compute_efficient(bw)
def cdf(self, data_predict=None):
    r"""
    Evaluate the cumulative distribution function.

    Parameters
    ----------
    data_predict : array_like, optional
        Points to evaluate at.  When omitted, the training data
        (``self.data``) is used.

    Returns
    -------
    cdf_est : array_like
        The estimate of the cdf, squeezed with ``np.squeeze``.

    Notes
    -----
    See http://en.wikipedia.org/wiki/Cumulative_distribution_function
    For more details on the estimation see Ref. [5] in module docstring.

    The multivariate CDF for mixed data (continuous and ordered/unordered
    discrete) is estimated by:

    .. math::

        F(x^{c},x^{d})=n^{-1}\sum_{i=1}^{n}\left[G(
        \frac{x^{c}-X_{i}}{h})\sum_{u\leq x^{d}}L(X_{i}^{d},x_{i}^{d},
        \lambda)\right]

    where G() is the product kernel CDF estimator for the continuous
    and L() for the discrete variables.

    Used bandwidth is ``self.bw``.
    """
    if data_predict is None:
        points = self.data
    else:
        points = _adjust_shape(data_predict, self.k_vars)

    n_points = np.shape(points)[0]
    # Same product-kernel call as for the density, but with the CDF
    # variant of each kernel, scaled by the number of observations.
    estimates = [
        gpke(self.bw, data=self.data, data_predict=points[row, :],
             var_type=self.var_type,
             ckertype="gaussian_cdf",
             ukertype="aitchisonaitken_cdf",
             okertype='wangryzin_cdf') / self.nobs
        for row in xrange(n_points)
    ]
    return np.squeeze(estimates)
def __init__(self, data, var_type, bw=None, defaults=None):
    """
    Set up the multivariate density estimator and compute its bandwidth.

    Parameters
    ----------
    data : array_like
        Training data; reshaped to ``len(var_type)`` columns.
    var_type : str
        Type specification of the variables, one entry per variable.
    bw : optional
        Bandwidth specification handed to the bandwidth routine.
        Default is None.
    defaults : EstimatorSettings, optional
        Estimation settings.  If None, a new ``EstimatorSettings()`` is
        created for this instance.

    Raises
    ------
    ValueError
        If the number of observations is not larger than the number of
        variables.
    """
    # Build the default settings per call: an instance created in the
    # signature would be evaluated once and shared by every estimator.
    if defaults is None:
        defaults = EstimatorSettings()

    self.var_type = var_type
    self.k_vars = len(self.var_type)
    self.data = _adjust_shape(data, self.k_vars)
    self.data_type = var_type  # same specification under a second name
    # k_vars is re-read from the reshaped data together with nobs.
    self.nobs, self.k_vars = np.shape(self.data)
    if self.nobs <= self.k_vars:
        raise ValueError("The number of observations must be larger "
                         "than the number of variables.")
    # NOTE(review): presumably this sets ``self.efficient`` (read just
    # below) from `defaults` -- confirm against `_set_defaults`.
    self._set_defaults(defaults)
    if not self.efficient:
        self.bw = self._compute_bw(bw)
    else:
        self.bw = self._compute_efficient(bw)
def cdf(self, data_predict=None):
    r"""
    Evaluate the cumulative distribution function.

    Parameters
    ----------
    data_predict : array_like, optional
        Points to evaluate at.  If unspecified, the training data
        (``self.data``) is used.

    Returns
    -------
    cdf_est : array_like
        The estimate of the cdf, squeezed with ``np.squeeze``.

    Notes
    -----
    See http://en.wikipedia.org/wiki/Cumulative_distribution_function
    For more details on the estimation see Ref. [5] in module docstring.

    The multivariate CDF for mixed data (continuous and ordered/unordered
    discrete) is estimated by:

    .. math::

        F(x^{c},x^{d})=n^{-1}\sum_{i=1}^{n}\left[G(
        \frac{x^{c}-X_{i}}{h})\sum_{u\leq x^{d}}L(X_{i}^{d},x_{i}^{d},
        \lambda)\right]

    where G() is the product kernel CDF estimator for the continuous
    and L() for the discrete variables.

    Used bandwidth is ``self.bw``.
    """
    eval_points = (self.data if data_predict is None
                   else _adjust_shape(data_predict, self.k_vars))

    # CDF variants of the continuous, unordered and ordered kernels.
    cdf_kernels = dict(ckertype="gaussian_cdf",
                       ukertype="aitchisonaitken_cdf",
                       okertype='wangryzin_cdf')

    values = []
    for idx in xrange(np.shape(eval_points)[0]):
        kernel_value = gpke(self.bw, data=self.data,
                            data_predict=eval_points[idx, :],
                            var_type=self.var_type, **cdf_kernels)
        # Scale by the number of observations.
        values.append(kernel_value / self.nobs)

    return np.squeeze(values)
def censored(self, censor_val):
    """
    Sort the sample by the dependent variable and compute the weights
    used for censored data.

    Parameters
    ----------
    censor_val
        Value of the dependent variable that marks an observation as
        censored.

    Notes
    -----
    See pp. 341-344 in [1].

    The method sets the following attributes:

    - ``d`` : 1.0 where ``endog != censor_val`` and 0.0 otherwise, in
      sorted order.
    - ``sortix`` : indices that sort ``endog`` in ascending order.
    - ``sortix_rev`` : inverse permutation of ``sortix``.
    - ``endog``, ``exog`` : the data reordered by ``sortix``.
    - ``W_in`` : array of shape ``(nobs, 1)`` with, for ``i = 1..nobs``,

      ``W_in[i-1] = d[i-1] / (nobs - i + 1)
      * prod_{j<i} ((nobs - j) / (nobs - j + 1)) ** d[j-1]``
    """
    # see pp. 341-344 in [1]
    self.d = (self.endog != censor_val) * 1.
    ix = np.argsort(np.squeeze(self.endog))
    self.sortix = ix
    # Inverse permutation: position of each original row after sorting.
    self.sortix_rev = np.zeros(ix.shape, int)
    self.sortix_rev[ix] = np.arange(len(ix))
    self.endog = np.squeeze(self.endog[ix])
    self.endog = _adjust_shape(self.endog, 1)
    self.exog = np.squeeze(self.exog[ix])
    self.d = np.squeeze(self.d[ix])

    nobs = self.nobs
    d = self.d[:nobs]
    # Factors ((nobs - j) / (nobs - j + 1)) ** d[j-1] for j = 1..nobs-1.
    j = np.arange(1, nobs)
    factors = ((nobs - j) / (float(nobs) - j + 1)) ** d[:len(j)]
    # Running product over j < i.  A single cumulative product replaces
    # recomputing the whole product for every i; the factors are
    # multiplied in the same order, starting from the empty product 1.
    running = np.ones(nobs)
    running[1:] = np.cumprod(factors)
    # Denominator nobs - i + 1 for i = 1..nobs.
    remaining = float(nobs) - np.arange(1, nobs + 1) + 1
    self.W_in = (running * d / remaining).reshape(nobs, 1)
def fit(self, data_predict=None):
    """
    Return the mean and marginal effects at the `data_predict` points.

    Parameters
    ----------
    data_predict : array_like, optional
        Points at which to evaluate.  If not given, ``self.exog`` is
        used.

    Returns
    -------
    mean : ndarray
        First element returned by the estimator for each evaluation
        point.
    mfx : ndarray
        Second element returned by the estimator (the marginal effects)
        for each evaluation point, one row per point and ``self.k_vars``
        columns.
    """
    # Select the estimator registered for the configured regression type.
    estimator = self.est[self.reg_type]

    points = (self.exog if data_predict is None
              else _adjust_shape(data_predict, self.k_vars))
    n_points = np.shape(points)[0]

    mean = np.empty((n_points,))
    mfx = np.empty((n_points, self.k_vars))
    for row in xrange(n_points):
        # ``self.W_in`` is forwarded as observation weights; presumably
        # these are the weights computed by ``censored`` -- confirm.
        result = estimator(self.bw, self.endog, self.exog,
                           data_predict=points[row, :], W=self.W_in)
        mean[row] = result[0]
        mfx[row, :] = np.squeeze(result[1])

    return mean, mfx
def cdf(self, endog_predict=None, exog_predict=None):
    r"""
    Cumulative distribution function for the conditional density.

    Parameters
    ----------
    endog_predict : array_like, optional
        The evaluation dependent variables at which the cdf is estimated.
        If not specified the training dependent variables are used.
    exog_predict : array_like, optional
        The evaluation independent variables at which the cdf is
        estimated.  If not specified the training independent variables
        are used.

    Returns
    -------
    cdf_est : ndarray
        The estimate of the cdf, one value per row of the evaluation
        independent variables.

    Notes
    -----
    For more details on the estimation see [5], and p.181 in [1].

    The multivariate conditional CDF for mixed data (continuous and
    ordered/unordered discrete) is estimated by:

    .. math::

        F(y|x)=\frac{n^{-1}\sum_{i=1}^{n}G(\frac{y-Y_{i}}{h_{0}})
        W_{h}(X_{i},x)}{\widehat{\mu}(x)}

    where G() is the product kernel CDF estimator for the dependent (y)
    variable(s) and W() is the product kernel for the independent
    variable(s).  No kernel types are passed for the independent
    variables, so whatever kernels ``gpke`` uses by default apply there.
    """
    endog_eval = (self.endog if endog_predict is None
                  else _adjust_shape(endog_predict, self.k_dep))
    exog_eval = (self.exog if exog_predict is None
                 else _adjust_shape(exog_predict, self.k_indep))

    # Leading bandwidths belong to the dependent variables, the trailing
    # ones to the independent variables.
    endog_bw = self.bw[0:self.k_dep]
    exog_bw = self.bw[self.k_dep:]

    n_points = np.shape(exog_eval)[0]
    cdf_est = np.empty(n_points)
    for row in xrange(n_points):
        # Denominator: kernel estimate for the independent variables at
        # this point, scaled by the number of observations.
        marginal = np.squeeze(
            gpke(exog_bw, data=self.exog,
                 data_predict=exog_eval[row, :],
                 var_type=self.indep_type) / self.nobs)

        # Numerator terms, requested unsummed (``tosum=False``) so they
        # can be multiplied elementwise before summing over axis 0.
        endog_terms = gpke(endog_bw, data=self.endog,
                           data_predict=endog_eval[row, :],
                           var_type=self.dep_type,
                           ckertype="gaussian_cdf",
                           ukertype="aitchisonaitken_cdf",
                           okertype='wangryzin_cdf', tosum=False)
        exog_terms = gpke(exog_bw, data=self.exog,
                          data_predict=exog_eval[row, :],
                          var_type=self.indep_type, tosum=False)

        weighted_sum = (endog_terms * exog_terms).sum(axis=0)
        cdf_est[row] = weighted_sum / (self.nobs * marginal)

    return cdf_est