def penalized_wls(endog, exog, penalty_matrix, weights):
    """Fit weighted least squares with a quadratic penalty.

    The penalized problem is solved as a plain WLS fit on the augmented
    response, design and weights returned by ``make_augmented_matrix``.

    Parameters
    ----------
    endog : ndarray
        Response or endogenous variable.
    exog : ndarray
        Design matrix of exogenous or explanatory variables.
    penalty_matrix : ndarray, 2-dim square
        Penalty matrix for the quadratic penalization. It is multiplied by
        two before augmentation to match non-pirls fitting methods.
    weights : ndarray
        Weights for the WLS fit.

    Returns
    -------
    results : results instance of WLS
        The fitted ``lm.WLS`` results; ``params`` is flattened with
        ``ravel()`` before the results are returned.
    """
    # TODO: it is not understood why the penalty needs the factor of 2.
    doubled_penalty = 2 * penalty_matrix
    augmented = make_augmented_matrix(endog, exog, doubled_penalty, weights)
    aug_endog, aug_exog, aug_weights = augmented

    # TODO: use MinimalWLS during iterations, less overhead.
    # However, MinimalWLS does not return normalized_cov_params, which is
    # needed at the end of the iterations. The call would be
    # reg_tools._MinimalWLS(aug_endog, aug_exog, aug_weights).fit()
    fit_result = lm.WLS(aug_endog, aug_exog, aug_weights).fit()
    fit_result.params = fit_result.params.ravel()
    return fit_result
def null(self):
    """Return the fitted mean of the intercept-only (null) model.

    The null model regresses ``self._endog`` on a single column of ones.
    If the fitted model carries an ``offset`` and/or an ``exposure``
    attribute, the null model is a GLM of the same family that receives
    every one of those terms. Previously an ``exposure`` was silently
    dropped whenever an ``offset`` was also present. Without either term
    the null model is a WLS fit weighted by ``self._data_weights``.

    Returns
    -------
    ndarray
        ``mu`` of the null GLM fit, or ``fittedvalues`` of the null WLS fit.
    """
    endog = self._endog
    model = self.model
    exog = np.ones((len(endog), 1))

    # Forward every offset-like term the model defines; the two are not
    # mutually exclusive.
    glm_kwargs = {}
    if hasattr(model, 'offset'):
        glm_kwargs['offset'] = model.offset
    if hasattr(model, 'exposure'):
        glm_kwargs['exposure'] = model.exposure

    if glm_kwargs:
        return GLM(endog, exog, family=self.family, **glm_kwargs).fit().mu

    wls_model = lm.WLS(endog, exog, weights=self._data_weights)
    return wls_model.fit().fittedvalues
def null(self):
    """Return the fitted mean of the intercept-only (null) model.

    A constant-only design is fitted to ``self._endog``. When the model
    defines ``offset`` and/or ``exposure``, a GLM of ``self.family`` is
    fitted with those terms and its ``mu`` is returned. Otherwise a WLS
    fit weighted by ``self._data_weights`` is used and its
    ``fittedvalues`` are returned.
    """
    endog = self._endog
    model = self.model
    intercept_only = np.ones((len(endog), 1))

    # Only the offset-like attributes that actually exist are forwarded.
    extra_terms = {
        name: getattr(model, name)
        for name in ('offset', 'exposure')
        if hasattr(model, name)
    }

    if not extra_terms:
        null_fit = lm.WLS(endog, intercept_only,
                          weights=self._data_weights).fit()
        return null_fit.fittedvalues

    null_glm = GLM(endog, intercept_only, family=self.family, **extra_terms)
    return null_glm.fit().mu
def compute_QTL_gti_peaki(datapoint):
    """Return the WLS p-value of the genotype term for one datapoint.

    Parameters
    ----------
    datapoint : sequence of three arrays
        ``(peak_sample, gt_sample, weight_sample)``. Entries whose
        ``gt_sample`` value equals -1 are excluded from the fit.

    Returns
    -------
    float
        ``pvalues[1]`` of the weighted regression of the peak values on a
        constant plus the genotype values.
    """
    peak_sample, gt_sample, weight_sample = datapoint

    # Indices of the samples with a usable genotype (-1 marks the others).
    keep = np.where(gt_sample != -1)[0]

    response = np.array(peak_sample[keep]).astype(float)
    genotype = np.array(gt_sample[keep])
    obs_weights = np.array(weight_sample[keep])

    design = sm.add_constant(genotype)
    fitted = sm.WLS(response, design, weights=obs_weights).fit()
    return fitted.pvalues[1]
def added_variable_resids(results, focus_exog, resid_type=None,
                          use_glm_weights=True, fit_kwargs=None):
    """
    Residualize endog and one 'focus' exog column against the other columns.

    The model is refit without the focus column to obtain the endog
    residuals, and the focus column is regressed linearly on the remaining
    columns to obtain its residuals.

    Parameters
    ----------
    results : regression results instance
        A fitted model including the focus exog and all other predictors
        of interest.
    focus_exog : integer or string
        The column of results.model.exog, or a variable name, that is to
        be residualized against the other predictors.
    resid_type : string
        The type of residuals to use for the dependent variable. If None,
        uses `resid_deviance` for GLM/GEE and `resid` otherwise.
    use_glm_weights : bool
        Only used if the model is a GLM or GEE. If True, the residuals for
        the focus predictor are computed using WLS, with weights taken
        from ``model.family.weights(results.fittedvalues)`` (multiplied by
        ``model.data_weights`` when the model has that attribute). If
        False, unweighted regression is used.
    fit_kwargs : dict, optional
        Keyword arguments passed to fit when refitting the model; they
        override the default ``start_params``.

    Returns
    -------
    endog_resid : array-like
        The ``resid_type`` residuals of the model refit without the focus
        column.
    focus_exog_resid : array-like
        The residuals of the focus predictor.

    Raises
    ------
    ValueError
        If the model is not a GEE, GLM or OLS, if the refit does not
        converge, or if ``resid_type`` is not an attribute of the refit
        results.

    Notes
    -----
    The 'focus variable' residuals are always obtained using linear
    regression. Currently only GLM, GEE, and OLS models are supported.
    """
    model = results.model
    if not isinstance(model, (GEE, GLM, OLS)):
        raise ValueError(
            "model type %s not supported for added variable residuals" %
            model.__class__.__name__)

    glm_like = isinstance(model, (GEE, GLM))

    exog, endog = model.exog, model.endog
    focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model)
    focus_values = exog[:, focus_col]

    # Default residuals
    if resid_type is None:
        resid_type = "resid_deviance" if glm_like else "resid"

    # Every column except the focus one.
    other_cols = list(range(exog.shape[1]))
    other_cols.pop(focus_col)
    reduced_exog = exog[:, other_cols]
    reduced_start = results.params[other_cols]

    # Refit the same model class, with the same init keywords, on the
    # reduced design.
    reduced_model = model.__class__(endog, reduced_exog,
                                    **model._get_init_kwds())
    fit_args = {"start_params": reduced_start}
    if fit_kwargs is not None:
        fit_args.update(fit_kwargs)
    reduced_result = reduced_model.fit(**fit_args)

    if not reduced_result.converged:
        raise ValueError(
            "fit did not converge when calculating added variable residuals")

    try:
        endog_resid = getattr(reduced_result, resid_type)
    except AttributeError:
        raise ValueError("'%s' residual type not available" % resid_type)

    import statsmodels.regression.linear_model as lm

    if glm_like and use_glm_weights:
        weights = model.family.weights(results.fittedvalues)
        if hasattr(model, "data_weights"):
            weights = weights * model.data_weights
        focus_fit = lm.WLS(focus_values, reduced_exog, weights).fit()
    else:
        focus_fit = lm.OLS(focus_values, reduced_exog).fit()

    return endog_resid, focus_fit.resid
def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1',
        update_scale=True, conv='dev'):
    """
    Estimate the model by iteratively reweighted least squares (IRLS).

    Iteration continues until ``_check_convergence`` reports that the
    chosen criterion has converged to `tol` or `maxiter` has been reached.

    Parameters
    ----------
    maxiter : int
        The maximum number of iterations to try. Default is 50.
    tol : float
        The convergence tolerance of the estimate. Default is 1e-8.
    scale_est : string or HuberScale()
        'mad', 'stand_mad', or HuberScale(). Indicates the estimate to use
        for scaling the weights in the IRLS. The default is 'mad' (median
        absolute deviation). 'stand_mad' is the median absolute deviation
        standardized around the median, and HuberScale is Huber's
        proposal 2, which has optional keyword arguments d, tol, and
        maxiter for the tuning constant, the convergence tolerance, and
        the maximum number of iterations. See models.robust.scale for
        more information.
    init : string
        Method for the initial estimates of the parameters. Default is
        None, which means the least squares estimate is used; currently
        it is the only available choice.
    cov : string, optional
        'H1', 'H2', or 'H3' (case-insensitive). Indicates how the
        covariance matrix is estimated. Default is 'H1'. See
        rlm.RLMResults for more information.
    update_scale : bool
        If False, the scale estimate for the weights is held constant
        over the iteration; otherwise it is re-estimated after each fit.
        Default is True.
    conv : string
        Convergence criterion (case-insensitive): "coefs" (the
        coefficients), "weights" (the weights in the iteration),
        "sresid" (the standardized residuals), or "dev" (the
        un-normalized log-likelihood for the M estimator). Default is
        "dev".

    Returns
    -------
    results : object
        statsmodels.rlm.RLMresults, wrapped in ``RLMResultsWrapper``.

    Raises
    ------
    ValueError
        If `cov` or `conv` is not one of the accepted values.
    """
    cov_choice = cov.upper()
    if cov_choice not in ("H1", "H2", "H3"):
        raise ValueError("Covariance matrix %s not understood" % cov)
    self.cov = cov_choice

    conv = conv.lower()
    if conv not in ("weights", "coefs", "dev", "sresid"):
        raise ValueError("Convergence argument %s not understood"
                         % conv)
    self.scale_est = scale_est

    # The starting point is the unweighted least squares fit.
    current_fit = lm.WLS(self.endog, self.exog).fit()
    if not init:
        self.scale = self._estimate_scale(current_fit.resid)

    # `criterion` aliases the list inside `history` that tracks the chosen
    # convergence measure, so later history updates are visible through it.
    history = {'params': [np.inf], 'scale': []}
    tracked_key = {
        'coefs': 'params',
        'dev': 'deviance',
        'sresid': 'sresid',
        'weights': 'weights',
    }[conv]
    if tracked_key not in history:
        history[tracked_key] = [np.inf]
    criterion = history[tracked_key]

    # done one iteration so update
    history = self._update_history(current_fit, history, conv)
    iteration = 1
    converged = 0
    while not converged:
        self.weights = self.M.weights(current_fit.resid / self.scale)
        current_fit = lm.WLS(self.endog, self.exog,
                             weights=self.weights).fit()
        if update_scale is True:
            self.scale = self._estimate_scale(current_fit.resid)
        history = self._update_history(current_fit, history, conv)
        iteration += 1
        converged = _check_convergence(criterion, iteration, tol, maxiter)

    results = RLMResults(self, current_fit.params,
                         self.normalized_cov_params, self.scale)
    history['iteration'] = iteration
    results.fit_history = history
    results.fit_options = {
        'cov': cov_choice,
        'scale_est': scale_est,
        'norm': self.M.__class__.__name__,
        'conv': conv,
    }
    # norm is not changed in fit, no old state

    # doing the next causes exception
    # self.cov = self.scale_est = None #reset for additional fits
    # iteration and history could contain wrong state with repeated fit
    return RLMResultsWrapper(results)
# Use noise values copied from book (based on sds above). eta = [ -0.0023, -0.0728, 0.1104, 0.6076, -0.3034, -0.2237, 0.7407, -1.0, -3.0, -2.4653, -3.0 ] # Find observed values of x (with noise added). x = m * s + c + eta # Weighted Least Squares regression. # Find weightings w (discount) for each data point. vars0 = sds**2 w = 1 / vars0 # Un-comment next line for solution based on un-weighted regression. #w=ones(size(w)) ss = sm.add_constant(s) # Add column of 1s for regression. model = sm.WLS(x, ss, weights=w) results = model.fit() cest2, mest2 = results.params print('Estimated slope = %.3f,' % mest2) print(' estimated intercept = %.3f.' % cest2) # Make line xest2 based on fitted slope and intercept. s2 = arange(0, 13) xest2 = mest2 * s2 + cest2 # Plot fitted line xest, data points, and error bars. fig1 = plt.figure() plt.errorbar(s, x, yerr=sds, fmt='o', color='k') plt.plot(s, x, 'k*', s2, xest2, 'k--') plt.xlabel('Salary, $s$ (groats)') plt.ylabel('Height, $x$ (feet)')
def fit_model(data, model, sigma, fit_method='chisq', masking=None,
              mask_only=False, **kwargs):
    """Fit a single scale factor so that ``strength * model`` matches ``data``.

    Parameters
    ----------
    data, model, sigma : ndarray
        Observed values, model values and data uncertainties, indexed in
        parallel.
    fit_method : str
        Case-insensitive. ``'chisq'`` minimizes the reduced chi-square,
        ``'difference'`` minimizes the summed absolute difference, and
        ``'wls'`` regresses ``model`` on ``data`` with weights
        ``1 / sigma**2`` and returns the reciprocal of the fitted
        coefficient.
    masking : str, optional
        Semicolon-separated mask specifications, each a keyword followed
        immediately by a number:

        * ``middleP`` keeps data inside the ``PercentileInterval(P)``
          limits of ``data``;
        * ``minaltA`` keeps points with ``kwargs['altitude'] >= A``;
        * ``minsnrS`` keeps points with ``data / sigma > S``;
        * ``siglimitN`` re-fits once after dropping points whose
          normalized residual exceeds the limit.
    mask_only : bool
        If True, only the mask is computed and no fit is performed.
    **kwargs
        ``altitude`` is required when a ``minalt`` mask is requested.

    Returns
    -------
    strength, goodness, mask
        The fitted scale factor, the objective value at the minimum
        (``rsquared`` for ``'wls'``) and the boolean mask of points used.
        ``strength`` and ``goodness`` are None when ``mask_only`` is True.

    Raises
    ------
    InputError
        For an undefined mask type, a ``minalt`` mask without
        ``altitude``, or an undefined ``fit_method``.
    """
    # Both objectives read `mask` from the enclosing scope, so they always
    # use whatever mask is current when minimize_scalar calls them.
    def chisq(x):
        resid = data[mask] - x * model[mask]
        return np.sum(resid**2 / sigma[mask]**2) / (mask.sum() - 1)

    def difference(x):
        return np.sum(np.abs(data[mask] - x * model[mask]))

    mask = np.ones(len(data), dtype=bool)
    sigmalimit = None
    if masking is not None:
        for masktype in masking.split(';'):
            masktype = masktype.strip().lower()
            if masktype.startswith('middle'):
                # Estimate model strength (source rate) by fitting middle %
                perinterval = float(masktype[6:])
                interval = PercentileInterval(perinterval)
                lim = interval.get_limits(data)
                mask = mask & (data >= lim[0]) & (data <= lim[1])
            elif masktype.startswith('minalt'):
                if 'altitude' not in kwargs:
                    raise InputError('mathMB.fit_model',
                                     'Altitude not supplied.')
                minalt = float(masktype[6:])
                mask = mask & (kwargs['altitude'] >= minalt)
            elif masktype.startswith('minsnr'):
                min_snr = float(masktype[6:])
                mask = mask & (data / sigma > min_snr)
            elif masktype.startswith('siglimit'):
                # Parsed later, only if a fit is actually refined.
                sigmalimit = masktype
            else:
                raise InputError('MESSENGERdata.fit_model',
                                 f'masking = {masktype} not defined.')

    if mask_only:
        return None, None, mask

    if not np.any(mask):
        # No data points are included - just do a simple fit for show,
        # using every point, but report the (empty) mask that was requested.
        requested_mask = mask
        mask = np.ones_like(requested_mask)
        model_strength = minimize_scalar(difference)
        return model_strength.x, model_strength.fun, requested_mask

    # Normalize once so every branch agrees on the spelling. Previously
    # 'WLS' passed the case-insensitive guard, missed the case-sensitive
    # 'wls' branch and crashed in eval().
    method = fit_method.lower()
    objectives = {'chisq': chisq, 'difference': difference}

    if method == 'wls':
        # Weighted least squares fit
        result = lm.WLS(model[mask], data[mask], 1. / sigma[mask]**2).fit()
        if sigmalimit is not None:
            siglimit = float(sigmalimit[8:])
            diff = (data - model / result.params[0]) / sigma
            # NOTE(review): diff is already divided by sigma, yet it is
            # compared against siglimit * sigma. Confirm the intended units.
            mask = mask & (diff < siglimit * sigma)
            result = lm.WLS(model[mask], data[mask],
                            1. / sigma[mask]**2).fit()
        return 1. / result.params[0], result.rsquared, mask

    if method in objectives:
        objective = objectives[method]
        model_strength = minimize_scalar(objective)
        if sigmalimit is not None:
            siglimit = float(sigmalimit[8:])
            diff = (data - model_strength.x * model) / sigma
            # NOTE(review): same units question as in the 'wls' branch.
            mask = mask & (diff < siglimit * sigma)
            model_strength = minimize_scalar(objective)
        return model_strength.x, model_strength.fun, mask

    raise InputError('mathMB.fit_model',
                     f'fit_method = {fit_method} not defined.')
def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1',
        update_scale=True, conv='dev', start_params=None):
    """
    Fits the model using iteratively reweighted least squares.

    The IRLS routine runs until the specified objective converges to `tol`
    or `maxiter` has been reached.

    Parameters
    ----------
    conv : str
        Indicates the convergence criteria.
        Available options are "coefs" (the coefficients), "weights" (the
        weights in the iteration), "sresid" (the standardized residuals),
        and "dev" (the un-normalized log-likelihood for the M
        estimator). The default is "dev".
    cov : str, optional
        'H1', 'H2', or 'H3'
        Indicates how the covariance matrix is estimated. Default is 'H1'.
        See rlm.RLMResults for more information.
    init : str
        Specifies method for the initial estimates of the parameters.
        Default is None, which means that the least squares estimate
        is used. Currently it is the only available choice.
    maxiter : int
        The maximum number of iterations to try. Default is 50.
    scale_est : str or HuberScale()
        'mad' or HuberScale()
        Indicates the estimate to use for scaling the weights in the IRLS.
        The default is 'mad' (median absolute deviation). The other option
        is 'HuberScale' for Huber's proposal 2. Huber's proposal 2 has
        optional keyword arguments d, tol, and maxiter for specifying the
        tuning constant, the convergence tolerance, and the maximum number
        of iterations. See statsmodels.robust.scale for more information.
    tol : float
        The convergence tolerance of the estimate. Default is 1e-8.
    update_scale : Bool
        If `update_scale` is False then the scale estimate for the
        weights is held constant over the iteration. Otherwise, it
        is updated for each fit in the iteration. Default is True.
    start_params : array-like, optional
        Initial guess of the solution of the optimizer. If not provided,
        the initial parameters are computed using OLS. A scalar or
        length-1 value is treated as a one-element vector, so it is
        accepted for a single-column design and rejected with ValueError
        otherwise.

    Returns
    -------
    results : statsmodels.rlm.RLMresults
        Results instance

    Raises
    ------
    ValueError
        If `cov` or `conv` is not understood, or if `start_params` is not
        a 1-d array with one value per column of ``self.exog``.
    """
    import warnings

    cov_choice = cov.upper()
    if cov_choice not in ("H1", "H2", "H3"):
        raise ValueError("Covariance matrix %s not understood" % cov)
    self.cov = cov_choice

    conv = conv.lower()
    if conv not in ("weights", "coefs", "dev", "sresid"):
        raise ValueError("Convergence argument %s not understood" % conv)
    self.scale_est = scale_est

    if start_params is None:
        wls_results = lm.WLS(self.endog, self.exog).fit()
    else:
        # squeeze() turns a scalar or a length-1 vector into a 0-d array,
        # on which shape[0] raises IndexError. Promote it back to 1-d and
        # test ndim first so that every bad input yields ValueError.
        start_params = np.atleast_1d(
            np.asarray(start_params, dtype=np.double).squeeze())
        if (start_params.ndim != 1 or
                start_params.shape[0] != self.exog.shape[1]):
            raise ValueError('start_params must by a 1-d array with {0} '
                             'values'.format(self.exog.shape[1]))
        # Unit-weight helper used only to turn start_params into a results
        # object via its results() method.
        fake_wls = reg_tools._MinimalWLS(self.endog, self.exog,
                                         weights=np.ones_like(self.endog),
                                         check_weights=False)
        wls_results = fake_wls.results(start_params)

    if not init:
        self.scale = self._estimate_scale(wls_results.resid)

    # `criterion` aliases the list inside `history` that tracks the chosen
    # convergence measure.
    history = dict(params=[np.inf], scale=[])
    if conv == 'coefs':
        criterion = history['params']
    elif conv == 'dev':
        history.update(dict(deviance=[np.inf]))
        criterion = history['deviance']
    elif conv == 'sresid':
        history.update(dict(sresid=[np.inf]))
        criterion = history['sresid']
    elif conv == 'weights':
        history.update(dict(weights=[np.inf]))
        criterion = history['weights']

    # done one iteration so update
    history = self._update_history(wls_results, history, conv)
    iteration = 1
    converged = 0
    while not converged:
        if self.scale == 0.0:
            # Dividing the residuals by a zero scale is meaningless, so
            # stop iterating and keep the last fit.
            warnings.warn(
                'Estimated scale is 0.0 indicating that the most'
                ' last iteration produced a perfect fit of the '
                'weighted data.', ConvergenceWarning)
            break
        self.weights = self.M.weights(wls_results.resid / self.scale)
        wls_results = reg_tools._MinimalWLS(self.endog, self.exog,
                                            weights=self.weights,
                                            check_weights=True).fit()
        if update_scale is True:
            self.scale = self._estimate_scale(wls_results.resid)
        history = self._update_history(wls_results, history, conv)
        iteration += 1
        converged = _check_convergence(criterion, iteration, tol, maxiter)

    results = RLMResults(self, wls_results.params,
                         self.normalized_cov_params, self.scale)

    history['iteration'] = iteration
    results.fit_history = history
    results.fit_options = dict(cov=cov_choice, scale_est=scale_est,
                               norm=self.M.__class__.__name__, conv=conv)
    # norm is not changed in fit, no old state

    # doing the next causes exception
    # self.cov = self.scale_est = None #reset for additional fits
    # iteration and history could contain wrong state with repeated fit
    return RLMResultsWrapper(results)
def fit(self, start_params=None, maxiter=100, method='IRLS', tol=1e-8,
        scale=None):
    """
    Fits a generalized linear model for a given family.

    Parameters
    ----------
    start_params : array-like, optional
        Initial guess of the solution for the loglikelihood maximization.
        The default is family-specific and is given by the
        ``family.starting_mu(endog)``. If start_params is given then the
        initial linear predictor is ``np.dot(exog, start_params)`` plus
        the offset/exposure term, and the initial mean is its fitted
        value.
    maxiter : int, optional
        Default is 100.
    method : string
        Default is 'IRLS' for iteratively reweighted least squares. This
        is currently the only method available for GLM fit.
    tol : float
        Convergence tolerance. Default is 1e-8.
    scale : string or float, optional
        `scale` can be 'X2', 'dev', or a float
        The default value is None, which uses `X2` for Gamma, Gaussian,
        and Inverse Gaussian.
        `X2` is Pearson's chi-square divided by `df_resid`.
        The default is 1 for the Binomial and Poisson families.
        `dev` is the deviance divided by df_resid

    Returns
    -------
    GLMResultsWrapper
        Wrapper around the ``GLMResults`` built from the final WLS fit.

    Raises
    ------
    ValueError
        If the deviance at the starting values is NaN.
    PerfectSeparationError
        If the fitted mean reproduces a 1-d ``endog`` exactly.
    """
    endog = self.endog
    if endog.ndim > 1 and endog.shape[1] == 2:
        data_weights = endog.sum(1)  # weights are total trials
    else:
        data_weights = np.ones((endog.shape[0]))
    self.data_weights = data_weights
    # NOTE(review): data_weights is always 1-d at this point, so this
    # scalar branch looks unreachable; it is kept unchanged.
    if np.shape(self.data_weights) == () and self.data_weights > 1:
        self.data_weights = self.data_weights * np.ones((endog.shape[0]))
    self.scaletype = scale

    if isinstance(self.family, families.Binomial):
        # this checks what kind of data is given for Binomial.
        # family will need a reference to endog if this is to be removed
        # from preprocessing
        self.endog = self.family.initialize(self.endog)

    # Both terms enter the linear predictor additively, so a model that
    # has an offset AND an exposure needs their sum. The old `elif`
    # dropped the exposure whenever an offset was present.
    offset = 0
    if hasattr(self, 'offset'):
        offset = self.offset
    if hasattr(self, 'exposure'):
        offset = offset + self.exposure

    wlsexog = self.exog
    if start_params is None:
        mu = self.family.starting_mu(self.endog)
        eta = self.family.predict(mu)
    else:
        # The offset belongs in the starting linear predictor, because the
        # working response below subtracts it again.
        eta = np.dot(wlsexog, start_params) + offset
        mu = self.family.fitted(eta)

    dev = self.family.deviance(self.endog, mu)
    if np.isnan(dev):
        raise ValueError("The first guess on the deviance function "
                         "returned a nan. This could be a boundary "
                         " problem and should be reported.")

    # first guess on the deviance is assumed to be scaled by 1.
    # params are none to start, so they line up with the deviance
    history = dict(params=[None, start_params], deviance=[np.inf, dev])
    iteration = 0
    converged = 0
    criterion = history['deviance']
    while not converged:
        self.weights = data_weights * self.family.weights(mu)
        # Working response of the IRLS step, with the offset removed so
        # the regression estimates only exog's contribution.
        wlsendog = (eta + self.family.link.deriv(mu) * (self.endog - mu)
                    - offset)
        wls_results = lm.WLS(wlsendog, wlsexog, self.weights).fit()
        eta = np.dot(self.exog, wls_results.params) + offset
        mu = self.family.fitted(eta)
        history = self._update_history(wls_results, mu, history)
        self.scale = self.estimate_scale(mu)
        iteration += 1
        if endog.squeeze().ndim == 1 and np.allclose(mu - endog, 0):
            msg = "Perfect separation detected, results not available"
            raise PerfectSeparationError(msg)
        converged = _check_convergence(criterion, iteration, tol, maxiter)

    self.mu = mu
    glm_results = GLMResults(self, wls_results.params,
                             wls_results.normalized_cov_params, self.scale)
    history['iteration'] = iteration
    glm_results.fit_history = history
    return GLMResultsWrapper(glm_results)
def fit(self, start_params=None, maxiter=100, method='IRLS', tol=1e-8,
        scale=None, cov_type='nonrobust', cov_kwds=None, use_t=None,
        **kwargs):
    """
    Fit a generalized linear model for a given family by IRLS.

    Parameters
    ----------
    start_params : array-like, optional
        Initial guess of the solution for the loglikelihood maximization.
        The default is family-specific and is given by the
        ``family.starting_mu(endog)``. If start_params is given then the
        initial linear predictor is ``np.dot(exog, start_params)`` plus
        the combined offset/exposure term.
    maxiter : int, optional
        Default is 100. With ``maxiter == 0`` no WLS step is run; the
        scale is estimated at the starting values and the results are
        built from `start_params`.
    method : string
        Default is 'IRLS' for iteratively reweighted least squares. This
        is currently the only method available for GLM fit.
    tol : float
        Convergence tolerance. Default is 1e-8.
    scale : string or float, optional
        `scale` can be 'X2', 'dev', or a float
        The default value is None, which uses `X2` for Gamma, Gaussian,
        and Inverse Gaussian.
        `X2` is Pearson's chi-square divided by `df_resid`.
        The default is 1 for the Binomial and Poisson families.
        `dev` is the deviance divided by df_resid
    cov_type, cov_kwds, use_t
        Passed unchanged to ``GLMResults``.

    Returns
    -------
    GLMResultsWrapper
        Wrapper around the ``GLMResults`` built from the last WLS step.

    Raises
    ------
    ValueError
        If the deviance at the starting values is NaN.
    PerfectSeparationError
        If the fitted mean reproduces a 1-d ``endog`` exactly.

    Notes
    -----
    This method does not take any extra undocumented ``kwargs``.
    """
    raw_endog = self.endog
    two_column = raw_endog.ndim > 1 and raw_endog.shape[1] == 2
    if two_column:
        prior_weights = raw_endog.sum(1)  # weights are total trials
    else:
        prior_weights = np.ones((raw_endog.shape[0]))
    self.data_weights = prior_weights
    if np.shape(self.data_weights) == () and self.data_weights > 1:
        self.data_weights = (self.data_weights *
                             np.ones((raw_endog.shape[0])))
    self.scaletype = scale

    if isinstance(self.family, families.Binomial):
        # this checks what kind of data is given for Binomial.
        # family will need a reference to endog if this is to be removed
        # from preprocessing
        self.endog = self.family.initialize(self.endog)

    # Construct a combined offset/exposure term. Note that
    # exposure has already been logged if present.
    offset_exposure = getattr(self, 'offset', 0.)
    if hasattr(self, 'exposure'):
        offset_exposure = offset_exposure + self.exposure
    self._offset_exposure = offset_exposure

    design = self.exog
    if start_params is not None:
        linear_pred = np.dot(design, start_params) + offset_exposure
        mu = self.family.fitted(linear_pred)
    else:
        mu = self.family.starting_mu(self.endog)
        linear_pred = self.family.predict(mu)

    dev = self.family.deviance(self.endog, mu)
    if np.isnan(dev):
        raise ValueError("The first guess on the deviance function "
                         "returned a nan. This could be a boundary "
                         " problem and should be reported.")

    # first guess on the deviance is assumed to be scaled by 1.
    # params are none to start, so they line up with the deviance
    history = {'params': [None, start_params], 'deviance': [np.inf, dev]}
    criterion = history['deviance']

    # This special case is used to get the likelihood for a specific
    # params vector.
    if maxiter == 0:
        mu = self.family.fitted(linear_pred)
        self.scale = self.estimate_scale(mu)
        wls_results = lm.RegressionResults(self, start_params, None)

    iteration = 0
    for iteration in range(maxiter):
        self.weights = prior_weights * self.family.weights(mu)
        # Working response, with the offset/exposure term removed so the
        # regression estimates only exog's contribution.
        working_endog = (linear_pred
                         + self.family.link.deriv(mu) * (self.endog - mu)
                         - offset_exposure)
        wls_results = lm.WLS(working_endog, design, self.weights).fit()
        linear_pred = np.dot(self.exog, wls_results.params) + offset_exposure
        mu = self.family.fitted(linear_pred)
        history = self._update_history(wls_results, mu, history)
        self.scale = self.estimate_scale(mu)
        if raw_endog.squeeze().ndim == 1 and np.allclose(mu - raw_endog, 0):
            msg = "Perfect separation detected, results not available"
            raise PerfectSeparationError(msg)
        if _check_convergence(criterion, iteration, tol):
            break

    self.mu = mu
    glm_results = GLMResults(self, wls_results.params,
                             wls_results.normalized_cov_params,
                             self.scale, cov_type=cov_type,
                             cov_kwds=cov_kwds, use_t=use_t)
    history['iteration'] = iteration + 1
    glm_results.fit_history = history
    return GLMResultsWrapper(glm_results)
def fit(self, X, Y, sample_weight=None):
    """
    Fit the weighted model by iteratively reweighted least squares.

    Parameters
    ----------
    X : ndarray
        Design matrix. A 1-d array is treated as a single column. An
        intercept column is added via ``addIntercept`` when
        ``self.fit_intercept`` is true.
    Y : ndarray
        Response, either 1-d or a single column.
    sample_weight : ndarray, optional
        Sample weight vector with one entry per row of `X`. Defaults to
        all ones.

    Raises
    ------
    ValueError
        If ``self.penalty`` is neither None nor 'elasticnet', if
        ``self.max_iter`` is smaller than 1, or if the initial deviance
        is NaN.

    Notes
    -----
    On return the following attributes are set: ``n_samples``,
    ``n_features``, ``n_targets``, ``converged``, ``coef``,
    ``dispersion``, ``ll`` and, when ``self.est_sd`` is true, ``sd``.
    """
    # family is the glm family with link, the family is the same as in
    # the statsmodel
    if sample_weight is None:
        sample_weight = np.ones((X.shape[0],))
    assert X.shape[0] == sample_weight.shape[0]
    assert X.shape[0] == Y.shape[0]
    assert Y.ndim == 1 or Y.shape[1] == 1
    Y = Y.reshape(-1,)
    assert np.sum(sample_weight) > 0

    # Fail early and clearly. Previously an unsupported penalty or an
    # empty iteration range left `wls_results` / `converged` unbound and
    # surfaced as an UnboundLocalError further down.
    if self.penalty is not None and self.penalty != 'elasticnet':
        raise ValueError("penalty must be None or 'elasticnet', got %r"
                         % (self.penalty,))
    if self.max_iter < 1:
        raise ValueError("max_iter must be at least 1, got %r"
                         % (self.max_iter,))

    if X.ndim == 1:
        X = X.reshape(-1, 1)
    if self.fit_intercept:
        X = addIntercept(X)
    self.n_samples = X.shape[0]
    self.n_features = X.shape[1]
    self.n_targets = 1

    # start fitting using irls
    mu = self.fam.starting_mu(Y)
    lin_pred = self.fam.predict(mu)
    dev = self.fam.deviance_weighted(Y, mu, sample_weight)
    if np.isnan(dev):
        raise ValueError("The first guess on the deviance function "
                         "returned a nan. This could be a boundary "
                         " problem and should be reported.")

    converged = False
    for iteration in range(self.max_iter):
        weights = sample_weight * self.fam.weights(mu)
        # Working response of the IRLS step.
        wlsendog = lin_pred + self.fam.link.deriv(mu) * (Y - mu)
        working_model = lim.WLS(wlsendog, X, weights)
        if self.penalty is None:
            wls_results = working_model.fit(method=self.solver)
        else:
            wls_results = working_model.fit_regularized(
                alpha=self.reg, L1_wt=self.l1_ratio)
        lin_pred = np.dot(X, wls_results.params)
        mu = self.fam.fitted(lin_pred)
        if Y.squeeze().ndim == 1 and np.allclose(mu - Y, 0):
            msg = "Perfect separation detected, results not available"
            # NOTE(review): `Error` is not defined in this block;
            # presumably a project exception -- confirm it is in scope.
            raise Error(msg)
        dev_new = self.fam.deviance_weighted(Y, mu, sample_weight)
        converged = np.fabs(dev - dev_new) <= self.tol
        dev = dev_new
        if converged:
            break

    self.converged = converged
    self.coef = wls_results.params
    self.dispersion = self.estimate_dispersion(X, Y, mu, sample_weight)
    if self.est_sd:
        self.sd = self.estimate_sd(X, Y, mu, sample_weight, weights)
    self.ll = self.estimate_loglikelihood(Y, mu, sample_weight)