Example #1
def penalized_wls(endog, exog, penalty_matrix, weights):
    """weighted least squares with quadratic penalty

    Parameters
    ----------
    endog : ndarray
        response or endogenous variable
    exog : ndarray
        design matrix, matrix of exogenous or explanatory variables
    penalty_matrix : ndarray, 2-dim square
        Penalty matrix for quadratic penalization. Note: the penalty_matrix
        is multiplied by two to match non-PIRLS fitting methods.
    weights : ndarray
        weights for WLS

    Returns
    -------
    results : Results instance of WLS
    """
    y, x, s = endog, exog, penalty_matrix
    # TODO: I don't understand why I need 2 * s
    aug_y, aug_x, aug_weights = make_augmented_matrix(y, x, 2 * s, weights)
    wls_results = lm.WLS(aug_y, aug_x, aug_weights).fit()
    # TODO: use MinimalWLS during iterations, less overhead
    # However, MinimalWLS does not return normalized_cov_params
    #   which we need at the end of the iterations
    # call would be
    # wls_results = reg_tools._MinimalWLS(aug_y, aug_x, aug_weights).fit()
    wls_results.params = wls_results.params.ravel()

    return wls_results
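
The augmentation step relies on a standard identity: appending rows R with R'R = 2*S and zero responses makes the WLS objective pick up the extra b'(2S)b penalty term. A minimal self-contained sketch of that trick (the penalty matrix S here is a hypothetical ridge-style example):

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
n, k = 50, 3
x = rng.normal(size=(n, k))
y = x @ np.array([1.0, -2.0, 0.5]) + rng.normal(size=n)
s = 0.1 * np.eye(k)                       # hypothetical penalty matrix

r = np.linalg.cholesky(2 * s).T           # R with R'R = 2*S
aug_y = np.concatenate([y, np.zeros(k)])  # zero responses for penalty rows
aug_x = np.vstack([x, r])
aug_w = np.ones(n + k)                    # unit weights on penalty rows

res = sm.WLS(aug_y, aug_x, weights=aug_w).fit()
print(res.params)                         # shrunk toward zero vs. plain OLS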
Example #2
 def null(self):
     endog = self._endog
     model = self.model
     exog = np.ones((len(endog), 1))
     if hasattr(model, 'offset'):
         return GLM(endog, exog, offset=model.offset,
                    family=self.family).fit().mu
     elif hasattr(model, 'exposure'):
         return GLM(endog, exog, exposure=model.exposure,
                    family=self.family).fit().mu
     else:
         wls_model = lm.WLS(endog, exog, weights=self._data_weights)
         return wls_model.fit().fittedvalues
Example #3
 def null(self):
     endog = self._endog
     model = self.model
     exog = np.ones((len(endog), 1))
     kwargs = {}
     if hasattr(model, 'offset'):
         kwargs['offset'] = model.offset
     if hasattr(model, 'exposure'):
         kwargs['exposure'] = model.exposure
     if len(kwargs) > 0:
         return GLM(endog, exog, family=self.family, **kwargs).fit().mu
     else:
         wls_model = lm.WLS(endog, exog, weights=self._data_weights)
         return wls_model.fit().fittedvalues
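
Both variants reduce to an intercept-only fit of the response. For a plain Gaussian model with unit weights, the null fitted values are simply the response mean; a minimal sketch:

import numpy as np
import statsmodels.api as sm

y = np.array([1.0, 2.0, 4.0, 3.0])
exog = np.ones((len(y), 1))                  # intercept-only design
null_fit = sm.WLS(y, exog, weights=np.ones(len(y))).fit().fittedvalues
print(null_fit)                              # every entry equals y.mean() == 2.5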
Example #4
def compute_QTL_gti_peaki(datapoint):

    [peak_sample, gt_sample, weight_sample] = datapoint
    valid_samples = np.where(gt_sample != -1)[0]
    
    y = np.array(peak_sample[valid_samples])
    y = y.astype(float)
    x = np.array(gt_sample[valid_samples])
    x_weights = np.array(weight_sample[valid_samples])
   
    x = sm.add_constant(x)
    wls_model = sm.WLS(y, x, weights=x_weights)
    results = wls_model.fit()

    return results.pvalues[1]
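
A hypothetical call, assuming numpy and statsmodels.api are imported as np and sm (as the function requires); the peak, genotype, and weight vectors are made up:

import numpy as np
import statsmodels.api as sm

peak_sample = np.array([2.1, 3.4, 1.9, 4.2, 2.8, 3.0])
gt_sample = np.array([0, 1, -1, 2, 1, 0])        # -1 marks a missing genotype
weight_sample = np.array([1.0, 0.8, 0.5, 1.0, 0.9, 1.0])

pval = compute_QTL_gti_peaki([peak_sample, gt_sample, weight_sample])
print(pval)                                      # p-value for the genotype slope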
Example #5
def added_variable_resids(results,
                          focus_exog,
                          resid_type=None,
                          use_glm_weights=True,
                          fit_kwargs=None):
    """
    Residualize the endog variable and a 'focus' exog variable in a
    regression model with respect to the other exog variables.

    Parameters
    ----------
    results : regression results instance
        A fitted model including the focus exog and all other
        predictors of interest.
    focus_exog : integer or string
        The column of results.model.exog or a variable name that is
        to be residualized against the other predictors.
    resid_type : string
        The type of residuals to use for the dependent variable.  If
        None, uses `resid_deviance` for GLM/GEE and `resid` otherwise.
    use_glm_weights : bool
        Only used if the model is a GLM or GEE.  If True, the
        residuals for the focus predictor are computed using WLS, with
        the weights obtained from the IRLS calculations for fitting
        the GLM.  If False, unweighted regression is used.
    fit_kwargs : dict, optional
        Keyword arguments to be passed to fit when refitting the
        model.

    Returns
    -------
    endog_resid : array-like
        The residuals for the original exog
    focus_exog_resid : array-like
        The residuals for the focus predictor

    Notes
    -----
    The 'focus variable' residuals are always obtained using linear
    regression.

    Currently only GLM, GEE, and OLS models are supported.
    """

    model = results.model
    if not isinstance(model, (GEE, GLM, OLS)):
        raise ValueError(
            "model type %s not supported for added variable residuals" %
            model.__class__.__name__)

    exog = model.exog
    endog = model.endog

    focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model)

    focus_exog_vals = exog[:, focus_col]

    # Default residuals
    if resid_type is None:
        if isinstance(model, (GEE, GLM)):
            resid_type = "resid_deviance"
        else:
            resid_type = "resid"

    ii = range(exog.shape[1])
    ii = list(ii)
    ii.pop(focus_col)
    reduced_exog = exog[:, ii]
    start_params = results.params[ii]

    klass = model.__class__

    kwargs = model._get_init_kwds()
    new_model = klass(endog, reduced_exog, **kwargs)
    args = {"start_params": start_params}
    if fit_kwargs is not None:
        args.update(fit_kwargs)
    new_result = new_model.fit(**args)
    if not new_result.converged:
        raise ValueError(
            "fit did not converge when calculating added variable residuals")

    try:
        endog_resid = getattr(new_result, resid_type)
    except AttributeError:
        raise ValueError("'%s' residual type not available" % resid_type)

    import statsmodels.regression.linear_model as lm

    if isinstance(model, (GLM, GEE)) and use_glm_weights:
        weights = model.family.weights(results.fittedvalues)
        if hasattr(model, "data_weights"):
            weights = weights * model.data_weights
        lm_results = lm.WLS(focus_exog_vals, reduced_exog, weights).fit()
    else:
        lm_results = lm.OLS(focus_exog_vals, reduced_exog).fit()
    focus_exog_resid = lm_results.resid

    return endog_resid, focus_exog_resid
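
The double residualization this function performs can be checked directly with OLS; a minimal sketch of the underlying idea (not of this exact function), where the added-variable regression of one residual on the other recovers the focus coefficient:

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(1)
n = 200
z = rng.normal(size=(n, 2))                      # the "other" predictors
f = 0.5 * z[:, 0] + rng.normal(size=n)           # focus predictor
y = 1.0 + 2.0 * f + z @ np.array([1.0, -1.0]) + rng.normal(size=n)

other = sm.add_constant(z)
endog_resid = sm.OLS(y, other).fit().resid       # y residualized on the others
focus_resid = sm.OLS(f, other).fit().resid       # focus residualized likewise

print(sm.OLS(endog_resid, focus_resid).fit().params)  # close to 2.0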
Example #6
    def fit(self,
            maxiter=50,
            tol=1e-8,
            scale_est='mad',
            init=None,
            cov='H1',
            update_scale=True,
            conv='dev'):
        """
        Fits the model using iteratively reweighted least squares.

        The IRLS routine runs until the specified objective converges to `tol`
        or `maxiter` has been reached.

        Parameters
        ----------
        conv : string
            Indicates the convergence criteria.
            Available options are "coefs" (the coefficients), "weights" (the
            weights in the iteration), "sresid" (the standardized residuals),
            and "dev" (the un-normalized log-likelihood for the M
            estimator).  The default is "dev".
        cov : string, optional
            'H1', 'H2', or 'H3'
            Indicates how the covariance matrix is estimated.  Default is 'H1'.
            See rlm.RLMResults for more information.
        init : string
            Specifies method for the initial estimates of the parameters.
            Default is None, which means that the least squares estimate
            is used.  Currently it is the only available choice.
        maxiter : int
            The maximum number of iterations to try. Default is 50.
        scale_est : string or HuberScale()
            'mad', 'stand_mad', or HuberScale()
            Indicates the estimate to use for scaling the weights in the IRLS.
            The default is 'mad' (median absolute deviation). Other options
            are 'stand_mad' for the median absolute deviation standardized
            around the median and 'HuberScale' for Huber's proposal 2.
            Huber's proposal 2 has optional keyword arguments d, tol, and
            maxiter for specifying the tuning constant, the convergence
            tolerance, and the maximum number of iterations.
            See models.robust.scale for more information.
        tol : float
            The convergence tolerance of the estimate.  Default is 1e-8.
        update_scale : Bool
            If `update_scale` is False then the scale estimate for the
            weights is held constant over the iteration.  Otherwise, it
            is updated for each fit in the iteration.  Default is True.

        Returns
        -------
        results : object
            statsmodels.rlm.RLMresults
        """
        if cov.upper() not in ["H1", "H2", "H3"]:
            raise ValueError("Covariance matrix %s not understood" % cov)
        else:
            self.cov = cov.upper()
        conv = conv.lower()
        if conv not in ["weights", "coefs", "dev", "sresid"]:
            raise ValueError("Convergence argument %s not understood" % conv)
        self.scale_est = scale_est
        wls_results = lm.WLS(self.endog, self.exog).fit()
        if not init:
            self.scale = self._estimate_scale(wls_results.resid)

        history = dict(params=[np.inf], scale=[])
        if conv == 'coefs':
            criterion = history['params']
        elif conv == 'dev':
            history.update(dict(deviance=[np.inf]))
            criterion = history['deviance']
        elif conv == 'sresid':
            history.update(dict(sresid=[np.inf]))
            criterion = history['sresid']
        elif conv == 'weights':
            history.update(dict(weights=[np.inf]))
            criterion = history['weights']

        # done one iteration so update
        history = self._update_history(wls_results, history, conv)
        iteration = 1
        converged = 0
        while not converged:
            self.weights = self.M.weights(wls_results.resid / self.scale)
            wls_results = lm.WLS(self.endog, self.exog,
                                 weights=self.weights).fit()
            if update_scale is True:
                self.scale = self._estimate_scale(wls_results.resid)
            history = self._update_history(wls_results, history, conv)
            iteration += 1
            converged = _check_convergence(criterion, iteration, tol, maxiter)
        results = RLMResults(self, wls_results.params,
                             self.normalized_cov_params, self.scale)

        history['iteration'] = iteration
        results.fit_history = history
        results.fit_options = dict(cov=cov.upper(),
                                   scale_est=scale_est,
                                   norm=self.M.__class__.__name__,
                                   conv=conv)
        #norm is not changed in fit, no old state

        #doing the next causes exception
        #self.cov = self.scale_est = None #reset for additional fits
        #iteration and history could contain wrong state with repeated fit
        return RLMResultsWrapper(results)
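
A short usage sketch of this IRLS loop through the public RLM interface, on simulated heavy-tailed data:

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(2)
x = sm.add_constant(rng.normal(size=(100, 2)))
y = x @ np.array([1.0, 2.0, -1.0]) + rng.standard_t(df=3, size=100)

# Huber's T norm with MAD scale; convergence checked on the deviance.
res = sm.RLM(y, x, M=sm.robust.norms.HuberT()).fit(conv='dev', scale_est='mad')
print(res.params)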
Example #7
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

# s (salaries), sds (noise standard deviations), and the true slope m and
# intercept c are assumed to be defined earlier in the script.
# Use noise values copied from book (based on sds above).
eta = [
    -0.0023, -0.0728, 0.1104, 0.6076, -0.3034, -0.2237, 0.7407, -1.0, -3.0,
    -2.4653, -3.0
]
# Find observed values of x (with noise added).
x = m * s + c + eta

# Weighted Least Squares regression.
# Find weightings w (discount) for each data point.
vars0 = sds**2
w = 1 / vars0
# Un-comment next line for solution based on un-weighted regression.
# w = np.ones_like(w)
ss = sm.add_constant(s)  # Add column of 1s for regression.
model = sm.WLS(x, ss, weights=w)
results = model.fit()
cest2, mest2 = results.params
print('Estimated slope = %.3f,' % mest2)
print(' estimated intercept = %.3f.' % cest2)

# Make line xest2 based on fitted slope and intercept.
s2 = np.arange(0, 13)
xest2 = mest2 * s2 + cest2

# Plot fitted line xest2, data points, and error bars.
fig1 = plt.figure()
plt.errorbar(s, x, yerr=sds, fmt='o', color='k')
plt.plot(s, x, 'k*', s2, xest2, 'k--')
plt.xlabel('Salary, $s$ (groats)')
plt.ylabel('Height, $x$ (feet)')
Example #8
def fit_model(data,
              model,
              sigma,
              fit_method='chisq',
              masking=None,
              mask_only=False,
              **kwargs):
    def chisq(x):
        return np.sum((data[mask] - x * model[mask])**2 /
                      sigma[mask]**2) / (sum(mask) - 1)

    def difference(x):
        return np.sum(np.abs(data[mask] - x * model[mask]))

    mask = np.array([True for _ in data])
    sigmalimit = None
    if masking is not None:
        for masktype in masking.split(';'):
            masktype = masktype.strip().lower()
            if masktype.startswith('middle'):
                perinterval = float(masktype[6:])
                # Estimate model strength (source rate) by fitting middle %
                interval = PercentileInterval(perinterval)
                lim = interval.get_limits(data)
                mask = (mask & (data >= lim[0]) & (data <= lim[1]))
            elif (masktype.startswith('minalt')) and ('altitude' in kwargs):
                minalt = float(masktype[6:])
                mask = mask & (kwargs['altitude'] >= minalt)
            elif masktype.startswith('minalt'):
                raise InputError('mathMB.fit_model', 'Altitude not supplied.')
            elif masktype.startswith('minsnr'):
                minSNR = float(masktype[6:])
                snr = data / sigma
                mask = mask & (snr > minSNR)
            elif masktype.startswith('siglimit'):
                sigmalimit = masktype
            else:
                raise InputError('MESSENGERdata.fit_model',
                                 f'masking = {masktype} not defined.')
    else:
        pass

    if mask_only:
        return None, None, mask
    else:
        available_fitfunctions = ['chisq', 'difference', 'wls']
        if not np.any(mask):
            # No data points are included - just do a simple fit for show
            mask_ = mask.copy()
            mask[:] = True
            model_strength = minimize_scalar(difference)
            mask = mask_
            return model_strength.x, model_strength.fun, mask
        elif fit_method.lower() in available_fitfunctions:
            if fit_method == 'wls':
                # Weighted least squares fit
                wls_model = lm.WLS(model[mask], data[mask],
                                   1. / sigma[mask]**2)
                result = wls_model.fit()

                if sigmalimit is not None:
                    siglimit = float(sigmalimit[8:])
                    diff = (data - model / result.params[0]) / sigma
                    mask = mask & (diff < siglimit * sigma)
                    wls_model = lm.WLS(model[mask], data[mask],
                                       1. / sigma[mask]**2)
                    result = wls_model.fit()
                else:
                    pass
                return 1. / result.params[0], result.rsquared, mask
            else:
                model_strength = minimize_scalar(eval(fit_method.lower()))

                if sigmalimit is not None:
                    siglimit = float(sigmalimit[8:])
                    diff = (data - model_strength.x * model) / sigma
                    mask = mask & (diff < siglimit * sigma)
                    model_strength = minimize_scalar(eval(fit_method.lower()))
                else:
                    pass
                return model_strength.x, model_strength.fun, mask
        else:
            raise InputError('mathMB.fit_model',
                             f'fit_method = {fit_method} not defined.')
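
A hypothetical call, assuming the module-level imports the function relies on (scipy's minimize_scalar, astropy's PercentileInterval, and the lm/InputError helpers) are available; the data are made up:

import numpy as np

data = np.array([10.0, 12.0, 9.0, 11.0, 55.0])   # last point is an outlier
model = np.array([5.0, 6.0, 4.5, 5.5, 5.8])
sigma = np.ones_like(data)

# Fit the scaling factor on the middle 80% of the data only.
strength, fun, mask = fit_model(data, model, sigma,
                                fit_method='chisq',
                                masking='middle80')
print(strength, mask)                             # extreme points masked out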
Example #9
    def fit(self,
            maxiter=50,
            tol=1e-8,
            scale_est='mad',
            init=None,
            cov='H1',
            update_scale=True,
            conv='dev',
            start_params=None):
        """
        Fits the model using iteratively reweighted least squares.

        The IRLS routine runs until the specified objective converges to `tol`
        or `maxiter` has been reached.

        Parameters
        ----------
        conv : str
            Indicates the convergence criteria.
            Available options are "coefs" (the coefficients), "weights" (the
            weights in the iteration), "sresid" (the standardized residuals),
            and "dev" (the un-normalized log-likelihood for the M
            estimator).  The default is "dev".
        cov : str, optional
            'H1', 'H2', or 'H3'
            Indicates how the covariance matrix is estimated.  Default is 'H1'.
            See rlm.RLMResults for more information.
        init : str
            Specifies method for the initial estimates of the parameters.
            Default is None, which means that the least squares estimate
            is used.  Currently it is the only available choice.
        maxiter : int
            The maximum number of iterations to try. Default is 50.
        scale_est : str or HuberScale()
            'mad' or HuberScale()
            Indicates the estimate to use for scaling the weights in the IRLS.
            The default is 'mad' (median absolute deviation). The other option
            is 'HuberScale' for Huber's proposal 2. Huber's proposal 2 has
            optional keyword arguments d, tol, and maxiter for specifying the
            tuning constant, the convergence tolerance, and the maximum number
            of iterations. See statsmodels.robust.scale for more information.
        tol : float
            The convergence tolerance of the estimate.  Default is 1e-8.
        update_scale : Bool
            If `update_scale` is False then the scale estimate for the
            weights is held constant over the iteration.  Otherwise, it
            is updated for each fit in the iteration.  Default is True.
        start_params : array-like, optional
            Initial guess of the solution of the optimizer. If not provided,
            the initial parameters are computed using OLS.

        Returns
        -------
        results : statsmodels.rlm.RLMresults
            Results instance
        """
        if cov.upper() not in ["H1", "H2", "H3"]:
            raise ValueError("Covariance matrix %s not understood" % cov)
        else:
            self.cov = cov.upper()
        conv = conv.lower()
        if conv not in ["weights", "coefs", "dev", "sresid"]:
            raise ValueError("Convergence argument %s not understood" % conv)
        self.scale_est = scale_est

        if start_params is None:
            wls_results = lm.WLS(self.endog, self.exog).fit()
        else:
            start_params = np.asarray(start_params, dtype=np.double).squeeze()
            if (start_params.shape[0] != self.exog.shape[1]
                    or start_params.ndim != 1):
                raise ValueError('start_params must be a 1-d array with {0} '
                                 'values'.format(self.exog.shape[1]))
            fake_wls = reg_tools._MinimalWLS(self.endog,
                                             self.exog,
                                             weights=np.ones_like(self.endog),
                                             check_weights=False)
            wls_results = fake_wls.results(start_params)

        if not init:
            self.scale = self._estimate_scale(wls_results.resid)

        history = dict(params=[np.inf], scale=[])
        if conv == 'coefs':
            criterion = history['params']
        elif conv == 'dev':
            history.update(dict(deviance=[np.inf]))
            criterion = history['deviance']
        elif conv == 'sresid':
            history.update(dict(sresid=[np.inf]))
            criterion = history['sresid']
        elif conv == 'weights':
            history.update(dict(weights=[np.inf]))
            criterion = history['weights']

        # done one iteration so update
        history = self._update_history(wls_results, history, conv)
        iteration = 1
        converged = 0
        while not converged:
            if self.scale == 0.0:
                import warnings
                warnings.warn(
                    'Estimated scale is 0.0 indicating that the '
                    'last iteration produced a perfect fit of the '
                    'weighted data.', ConvergenceWarning)
                break
            self.weights = self.M.weights(wls_results.resid / self.scale)
            wls_results = reg_tools._MinimalWLS(self.endog,
                                                self.exog,
                                                weights=self.weights,
                                                check_weights=True).fit()
            if update_scale is True:
                self.scale = self._estimate_scale(wls_results.resid)
            history = self._update_history(wls_results, history, conv)
            iteration += 1
            converged = _check_convergence(criterion, iteration, tol, maxiter)
        results = RLMResults(self, wls_results.params,
                             self.normalized_cov_params, self.scale)

        history['iteration'] = iteration
        results.fit_history = history
        results.fit_options = dict(cov=cov.upper(),
                                   scale_est=scale_est,
                                   norm=self.M.__class__.__name__,
                                   conv=conv)
        # norm is not changed in fit, no old state

        # doing the next causes exception
        # self.cov = self.scale_est = None #reset for additional fits
        # iteration and history could contain wrong state with repeated fit
        return RLMResultsWrapper(results)
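
The start_params path added in this version can be exercised from the caller's side; a short sketch where the warm start is just the OLS solution:

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(3)
x = sm.add_constant(rng.normal(size=(100, 2)))
y = x @ np.array([1.0, 2.0, -1.0]) + rng.standard_t(df=3, size=100)

ols_start = sm.OLS(y, x).fit().params        # warm start for the IRLS loop
res = sm.RLM(y, x).fit(start_params=ols_start)
print(res.params)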
Example #10
    def fit(self, start_params=None, maxiter=100, method='IRLS', tol=1e-8,
            scale=None):
        """
        Fits a generalized linear model for a given family.

        Parameters
        ----------
        maxiter : int, optional
            Default is 100.
        method : string
            Default is 'IRLS' for iteratively reweighted least squares.  This
            is currently the only method available for GLM fit.
        scale : string or float, optional
            `scale` can be 'X2', 'dev', or a float
            The default value is None, which uses `X2` for Gamma, Gaussian,
            and Inverse Gaussian.
            `X2` is Pearson's chi-square divided by `df_resid`.
            The default is 1 for the Binomial and Poisson families.
            `dev` is the deviance divided by df_resid.
        tol : float
            Convergence tolerance.  Default is 1e-8.
        start_params : array-like, optional
            Initial guess of the solution for the loglikelihood maximization.
            The default is family-specific and is given by the
            ``family.starting_mu(endog)``. If start_params is given then the
            initial mean will be calculated as ``np.dot(exog, start_params)``.
        """
        endog = self.endog
        if endog.ndim > 1 and endog.shape[1] == 2:
            data_weights = endog.sum(1)  # weights are total trials
        else:
            data_weights = np.ones((endog.shape[0]))
        self.data_weights = data_weights
        if np.shape(self.data_weights) == () and self.data_weights > 1:
            self.data_weights = self.data_weights * np.ones((endog.shape[0]))
        self.scaletype = scale
        if isinstance(self.family, families.Binomial):
            # this checks what kind of data is given for Binomial.
            # family will need a reference to endog if this is to be removed
            # from preprocessing
            self.endog = self.family.initialize(self.endog)

        if hasattr(self, 'offset'):
            offset = self.offset
        elif hasattr(self, 'exposure'):
            offset = self.exposure
        else:
            offset = 0
        # TODO: would there ever be both an exposure and an offset?

        wlsexog = self.exog
        if start_params is None:
            mu = self.family.starting_mu(self.endog)
        else:
            mu = self.family.fitted(np.dot(wlsexog, start_params))
        eta = self.family.predict(mu)
        dev = self.family.deviance(self.endog, mu)
        if np.isnan(dev):
            raise ValueError("The first guess on the deviance function "
                             "returned a nan.  This could be a boundary "
                             " problem and should be reported.")

        # first guess on the deviance is assumed to be scaled by 1.
        # params are none to start, so they line up with the deviance
        history = dict(params=[None, start_params], deviance=[np.inf, dev])
        iteration = 0
        converged = 0
        criterion = history['deviance']
        while not converged:
            self.weights = data_weights*self.family.weights(mu)
            wlsendog = (eta + self.family.link.deriv(mu) * (self.endog-mu)
                        - offset)
            wls_results = lm.WLS(wlsendog, wlsexog, self.weights).fit()
            eta = np.dot(self.exog, wls_results.params) + offset
            mu = self.family.fitted(eta)
            history = self._update_history(wls_results, mu, history)
            self.scale = self.estimate_scale(mu)
            iteration += 1
            if endog.squeeze().ndim == 1 and np.allclose(mu - endog, 0):
                msg = "Perfect separation detected, results not available"
                raise PerfectSeparationError(msg)
            converged = _check_convergence(criterion, iteration, tol, maxiter)
        self.mu = mu
        glm_results = GLMResults(self, wls_results.params,
                                 wls_results.normalized_cov_params,
                                 self.scale)
        history['iteration'] = iteration
        glm_results.fit_history = history
        return GLMResultsWrapper(glm_results)
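
A usage sketch of this IRLS loop through the public GLM interface, on simulated Poisson data:

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(4)
x = sm.add_constant(rng.normal(size=(200, 2)))
y = rng.poisson(np.exp(x @ np.array([0.5, 0.3, -0.2])))

res = sm.GLM(y, x, family=sm.families.Poisson()).fit(method='IRLS')
print(res.params)                 # close to [0.5, 0.3, -0.2]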
Example #11
    def fit(self,
            start_params=None,
            maxiter=100,
            method='IRLS',
            tol=1e-8,
            scale=None,
            cov_type='nonrobust',
            cov_kwds=None,
            use_t=None,
            **kwargs):
        """
        Fits a generalized linear model for a given family.

        Parameters
        ----------
        maxiter : int, optional
            Default is 100.
        method : string
            Default is 'IRLS' for iteratively reweighted least squares.  This
            is currently the only method available for GLM fit.
        scale : string or float, optional
            `scale` can be 'X2', 'dev', or a float
            The default value is None, which uses `X2` for Gamma, Gaussian,
            and Inverse Gaussian.
            `X2` is Pearson's chi-square divided by `df_resid`.
            The default is 1 for the Binomial and Poisson families.
            `dev` is the deviance divided by df_resid.
        tol : float
            Convergence tolerance.  Default is 1e-8.
        start_params : array-like, optional
            Initial guess of the solution for the loglikelihood maximization.
            The default is family-specific and is given by the
            ``family.starting_mu(endog)``. If start_params is given then the
            initial mean will be calculated as ``np.dot(exog, start_params)``.

        Notes
        -----
        This method does not take any extra undocumented ``kwargs``.
        """
        endog = self.endog
        if endog.ndim > 1 and endog.shape[1] == 2:
            data_weights = endog.sum(1)  # weights are total trials
        else:
            data_weights = np.ones((endog.shape[0]))
        self.data_weights = data_weights
        if np.shape(self.data_weights) == () and self.data_weights > 1:
            self.data_weights = self.data_weights * np.ones((endog.shape[0]))
        self.scaletype = scale
        if isinstance(self.family, families.Binomial):
            # this checks what kind of data is given for Binomial.
            # family will need a reference to endog if this is to be removed from
            # preprocessing
            self.endog = self.family.initialize(self.endog)

        # Construct a combined offset/exposure term.  Note that
        # exposure has already been logged if present.
        offset_exposure = 0.
        if hasattr(self, 'offset'):
            offset_exposure = self.offset
        if hasattr(self, 'exposure'):
            offset_exposure = offset_exposure + self.exposure
        self._offset_exposure = offset_exposure

        wlsexog = self.exog
        if start_params is None:
            mu = self.family.starting_mu(self.endog)
            lin_pred = self.family.predict(mu)
        else:
            lin_pred = np.dot(wlsexog, start_params) + offset_exposure
            mu = self.family.fitted(lin_pred)
        dev = self.family.deviance(self.endog, mu)
        if np.isnan(dev):
            raise ValueError("The first guess on the deviance function "
                             "returned a nan.  This could be a boundary "
                             " problem and should be reported.")

        # first guess on the deviance is assumed to be scaled by 1.
        # params are none to start, so they line up with the deviance
        history = dict(params=[None, start_params], deviance=[np.inf, dev])
        converged = False
        criterion = history['deviance']
        # This special case is used to get the likelihood for a specific
        # params vector.
        if maxiter == 0:
            mu = self.family.fitted(lin_pred)
            self.scale = self.estimate_scale(mu)
            wls_results = lm.RegressionResults(self, start_params, None)
            iteration = 0
        for iteration in range(maxiter):
            self.weights = data_weights * self.family.weights(mu)
            wlsendog = (lin_pred + self.family.link.deriv(mu) *
                        (self.endog - mu) - offset_exposure)
            wls_results = lm.WLS(wlsendog, wlsexog, self.weights).fit()
            lin_pred = np.dot(self.exog, wls_results.params) + offset_exposure
            mu = self.family.fitted(lin_pred)
            history = self._update_history(wls_results, mu, history)
            self.scale = self.estimate_scale(mu)
            if endog.squeeze().ndim == 1 and np.allclose(mu - endog, 0):
                msg = "Perfect separation detected, results not available"
                raise PerfectSeparationError(msg)
            converged = _check_convergence(criterion, iteration, tol)
            if converged:
                break
        self.mu = mu

        glm_results = GLMResults(self,
                                 wls_results.params,
                                 wls_results.normalized_cov_params,
                                 self.scale,
                                 cov_type=cov_type,
                                 cov_kwds=cov_kwds,
                                 use_t=use_t)

        history['iteration'] = iteration + 1
        glm_results.fit_history = history
        return GLMResultsWrapper(glm_results)
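
The combined offset/exposure handling is visible from the caller's side; a sketch with a Poisson model and an exposure term (which GLM logs internally before folding it into the offset):

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(5)
x = sm.add_constant(rng.normal(size=(200, 1)))
exposure = rng.uniform(0.5, 2.0, size=200)
y = rng.poisson(exposure * np.exp(x @ np.array([0.2, 0.4])))

res = sm.GLM(y, x, family=sm.families.Poisson(), exposure=exposure).fit()
print(res.params)                 # close to [0.2, 0.4]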
Example #12
    def fit(self, X, Y, sample_weight=None):
        """
        fit the weighted model
        Parameters
        ----------
        X : design matrix
        Y : response matrix
        sample_weight: sample weight vector

        """
         # family is the glm family with link, the family is the same as in the statsmodel
        

        if sample_weight is None:
            sample_weight = np.ones((X.shape[0],))
        assert X.shape[0] == sample_weight.shape[0]
        assert X.shape[0] == Y.shape[0]
        assert Y.ndim == 1 or Y.shape[1] == 1
        Y = Y.reshape(-1,)

        sum_w = np.sum(sample_weight)
        assert sum_w > 0
        
        
        if X.ndim == 1:
            X = X.reshape(-1,1)
        if self.fit_intercept:
            X = addIntercept(X)   
        self.n_samples = X.shape[0]
        self.n_features = X.shape[1]
        self.n_targets = 1


        # start fitting using irls
        mu = self.fam.starting_mu(Y)
        lin_pred = self.fam.predict(mu)
        dev = self.fam.deviance_weighted(Y, mu, sample_weight)

        if np.isnan(dev):
            raise ValueError("The first guess on the deviance function "
                             "returned a nan.  This could be a boundary "
                             " problem and should be reported.")

        for iteration in range(self.max_iter):
            weights = sample_weight * self.fam.weights(mu)
            wlsendog = lin_pred + self.fam.link.deriv(mu) * (Y-mu)
            if self.penalty is None:
                wls_results = lim.WLS(wlsendog, X, weights).fit(method=self.solver)
            elif self.penalty == 'elasticnet':
                wls_results = lim.WLS(wlsendog, X, weights).fit_regularized(
                    alpha=self.reg, L1_wt=self.l1_ratio)


            lin_pred = np.dot(X, wls_results.params)
            mu = self.fam.fitted(lin_pred)

            if Y.squeeze().ndim == 1 and np.allclose(mu - Y, 0):
                msg = "Perfect separation detected, results not available"
                raise ValueError(msg)

            dev_new = self.fam.deviance_weighted(Y, mu, sample_weight)
            converged = np.fabs(dev - dev_new) <= self.tol
            dev = dev_new
            if converged:
                break
        
        self.converged = converged
        self.coef = wls_results.params
        self.dispersion = self.estimate_dispersion(X, Y, mu, sample_weight)
        if self.est_sd:
            self.sd = self.estimate_sd(X, Y, mu, sample_weight, weights)
        self.ll = self.estimate_loglikelihood(Y, mu, sample_weight)
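
A minimal sketch of a single elastic-net IRLS step of the kind this loop performs, written directly against the statsmodels family helpers the class mirrors (the alpha/L1_wt values are arbitrary):

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(6)
X = sm.add_constant(rng.normal(size=(100, 3)))
y = rng.poisson(np.exp(X @ np.array([0.1, 0.5, -0.3, 0.0])))

fam = sm.families.Poisson()
mu = fam.starting_mu(y)
lin_pred = fam.predict(mu)                     # link applied to mu
w = fam.weights(mu)                            # IRLS weights
z = lin_pred + fam.link.deriv(mu) * (y - mu)   # working response

step = sm.WLS(z, X, weights=w).fit_regularized(alpha=0.1, L1_wt=0.5)
print(step.params)                             # one penalized IRLS update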