예제 #1
0
    def _fit(self, X, y, sample_weight, relative_penalties):
        """Fit the elastic net path with the glmnet solver routines.

        Inputs are copied into float64 arrays before the solver is called:
        ``spelnet`` when ``X`` is sparse, ``elnet`` otherwise.  The solver's
        error flag is stored in ``jerr_`` and handed to
        ``_check_error_flag``; afterwards the returned arrays are trimmed to
        the number of lambdas the solver reported.

        Parameters
        ----------
        X : array or scipy sparse matrix
            Design matrix; ``X.shape[1]`` is treated as the feature count.
        y : array
            Response values, copied to a float64 array.
        sample_weight : array
            Observation weights, copied to a float64 array.
        relative_penalties : array or None
            Per-feature penalty factors; ``None`` means all ones.

        Returns
        -------
        self
            With ``n_lambda_``, ``intercept_path_``, ``lambda_path_``,
            ``jerr_`` and ``coef_path_`` set.
        """
        # A user-supplied path fixes the number of lambdas; the ratio is
        # pinned to 1.0 in that case.
        if self.lambda_path is None:
            n_lambda, min_lambda_ratio = self.n_lambda, self.min_lambda_ratio
        else:
            n_lambda, min_lambda_ratio = len(self.lambda_path), 1.0

        y_f64 = y.astype(dtype=np.float64, order='F', copy=True)
        weight_f64 = sample_weight.astype(dtype=np.float64, order='F',
                                          copy=True)

        n_features = X.shape[1]
        exclude_vars = 0

        if relative_penalties is None:
            relative_penalties = np.ones(n_features, dtype=np.float64,
                                         order='F')

        # Row 0 holds the lower limits, row 1 the upper limits.
        coef_bounds = np.empty((2, n_features), dtype=np.float64, order='F')
        coef_bounds[0] = self.lower_limits
        coef_bounds[1] = self.upper_limits

        # the glmnet docs suggest using a different algorithm for the case
        # of p >> n
        algo_flag = 2 if n_features > X.shape[0] else 1

        # This is a stopping criterion (nx)
        # R defaults to nx = num_features, and ne = num_features + 1
        max_features = (n_features if self.max_features is None
                        else self.max_features)

        # NOTE: the two routines take ``max_features`` at different positions
        # in their argument lists; the order below is kept exactly as is.
        if issparse(X):
            x_f64 = csc_matrix(X, dtype=np.float64, copy=True)
            solver_out = spelnet(
                algo_flag,
                self.alpha,
                x_f64.shape[0],
                x_f64.shape[1],
                x_f64.data,
                x_f64.indptr + 1,  # Fortran uses 1-based indexing
                x_f64.indices + 1,
                y_f64,
                weight_f64,
                exclude_vars,
                relative_penalties,
                coef_bounds,
                max_features,
                n_features + 1,
                min_lambda_ratio,
                self.lambda_path,
                self.tol,
                n_lambda,
                self.standardize,
                self.fit_intercept,
                self.max_iter)
        else:
            x_f64 = X.astype(dtype=np.float64, order='F', copy=True)
            solver_out = elnet(
                algo_flag,
                self.alpha,
                x_f64,
                y_f64,
                weight_f64,
                exclude_vars,
                relative_penalties,
                coef_bounds,
                n_features + 1,
                min_lambda_ratio,
                self.lambda_path,
                self.tol,
                max_features,
                n_lambda,
                self.standardize,
                self.fit_intercept,
                self.max_iter)

        # The sixth and eighth outputs (rsq, nlp) are not used here.
        (self.n_lambda_,
         self.intercept_path_,
         ca,
         ia,
         nin,
         _rsq,
         self.lambda_path_,
         _nlp,
         jerr) = solver_out

        # Keep the flag on the instance, then let the helper react to it
        # (the original comment says it raises RuntimeError when nonzero).
        self.jerr_ = jerr
        _check_error_flag(self.jerr_)

        self.lambda_path_ = self.lambda_path_[:self.n_lambda_]
        self.lambda_path_ = _fix_lambda_path(self.lambda_path_)

        # The solver hands back pre-allocated arrays; keep only the part that
        # corresponds to the lambdas actually found.
        n_found = self.n_lambda_
        self.intercept_path_ = self.intercept_path_[:n_found]
        self.coef_path_ = solns(x_f64.shape[1],
                                ca[:, :n_found],
                                ia,
                                nin[:n_found])

        return self
예제 #2
0
def elastic_net(X, y, rho, pos=True, thr=1.0e-4, weights=None, vp=None,
                isd=True, nlam=100, maxit=1000, intr=False, **kwargs):
    """
    Raw-output wrapper for elastic net linear regression.

    One path is fitted per column of ``y`` by calling ``_glmnet.elnet`` with
    fresh Fortran-ordered copies of ``X`` and of that column.

    Parameters
    ----------
    X : array_like
        Predictor matrix with ``X.shape[0]`` observations and ``X.shape[1]``
        predictors.
    y : array_like
        Target(s).  Input that is not 2-D gets a trailing axis of length 1;
        the caller's array is never modified.
    rho : float
        Passed to ``_glmnet.elnet`` as its first argument (presumably the
        elastic net mixing parameter -- confirm against the extension).
    pos : bool
        When true, coefficients are bounded to ``[0, 1e300]``; otherwise to
        ``[-1e300, 1e300]``.
    thr, isd, maxit, intr
        Forwarded unchanged to ``_glmnet.elnet``.
    weights : array_like or None
        One weight per observation; defaults to all ones.
    vp : array_like or None
        One relative penalty per predictor; defaults to all ones.
    nlam : int
        Number of lambdas to try; replaced by ``len(lambdas)`` when
        ``lambdas`` is given.
    **kwargs
        ``exclude`` : zero-based predictor indices to leave out of the fit.
        ``lambdas`` : user-specified lambda values (cannot be combined with
        ``flmin``).
        ``flmin`` : fraction of the largest lambda at which to stop
        (default 0.001).

    Returns
    -------
    tuple
        ``(lmu, a0, ca, ia, nin, rsq, alm, nlp, jerr)``.  ``a0``, ``ca``,
        ``ia`` and ``nin`` have one leading entry per target; ``lmu``,
        ``rsq``, ``alm``, ``nlp`` and ``jerr`` come from the last target
        fitted.  ``jerr`` is returned as-is and is not checked here.

    Raises
    ------
    ValueError
        For an unknown keyword, for ``lambdas`` combined with ``flmin``, or
        when ``y`` has no target columns.
    """
    X = np.asanyarray(X)
    y = np.asanyarray(y)

    if y.ndim != 2:
        # reshape() returns a new array object, so the array the caller
        # passed in keeps its original shape.
        y = y.reshape(y.shape + (1,))

    n_predictors = X.shape[1]
    n_targets = y.shape[1]
    if n_targets == 0:
        # Without this the loop below never runs and the scalar outputs
        # would be undefined at the return statement.
        raise ValueError("y must contain at least one target column")

    memlimit = n_predictors

    jd = np.zeros(1)        # X to exclude altogether from fitting
    ulam = None             # User-specified lambda values
    flmin = 0.001           # Fraction of largest lambda at which to stop

    # Row 0: lower bounds, row 1: upper bounds.
    box_constraints = np.zeros((2, n_predictors), order='F')
    box_constraints[1] = 1e300
    if not pos:
        box_constraints[0] = -1e300

    # ``nlam`` is a named parameter and therefore can never show up in
    # ``kwargs``; only the three options below are recognised.
    for keyword, value in kwargs.items():
        if keyword == 'exclude':
            # Add one since Fortran indices start at 1
            exclude = (np.asarray(value) + 1).tolist()
            jd = np.array([len(exclude)] + exclude)
        elif keyword == 'lambdas':
            if 'flmin' in kwargs:
                raise ValueError("Can't specify both lambdas & flmin keywords")
            ulam = np.asarray(value)
            flmin = 2.  # Pass flmin > 1.0 indicating to use the user-supplied.
            nlam = len(ulam)
        elif keyword == 'flmin':
            flmin = value
            ulam = None
        else:
            raise ValueError("Unknown keyword argument '%s'" % keyword)

    # Uniform weighting if no weights are specified.
    if weights is None:
        weights = np.ones(X.shape[0])
    else:
        weights = np.asarray(weights).copy()

    # Uniform penalties if none were specified.
    if vp is None:
        vp = np.ones(n_predictors)
    else:
        vp = np.asarray(vp).copy()

    # Per-target output buffers.
    a0 = np.zeros((n_targets, nlam), dtype=np.float64)
    ca = np.zeros((n_targets, n_predictors, nlam), dtype=np.float64)
    ia = np.zeros((n_targets, n_predictors), dtype=np.int32)
    nin = np.zeros((n_targets, nlam), dtype=np.int32)
    alm = np.zeros((nlam), dtype=np.float64)

    for idx in range(n_targets):
        # X/y is overwritten in the fortran function at every loop, so we
        # must copy it each time
        X_copy = X.copy(order='F')
        y_copy = y[:, idx].copy(order='F')
        lmu, a0[idx], ca[idx], ia[idx], nin[idx], rsq, alm[:], nlp, jerr = \
            _glmnet.elnet(rho, X_copy, y_copy, weights, jd, vp,
                          box_constraints, memlimit, flmin, ulam, thr,
                          nlam=nlam, isd=isd, maxit=maxit, intr=intr)

    return lmu, a0, ca, ia, nin, rsq, alm, nlp, jerr
예제 #3
0
def elastic_net(predictors,
                target,
                balance,
                memlimit=None,
                largest=None,
                **kwargs):
    """
    Raw-output wrapper for elastic net linear regression.

    Prepares the arguments, calls ``_glmnet.elnet`` and returns its nine
    outputs unchanged after checking the error flag.

    ``memlimit`` defaults to the number of predictor columns and ``largest``
    defaults to ``memlimit``; ``largest`` may not exceed ``memlimit``.

    Recognised keywords: ``overwrite_pred_ok``, ``overwrite_targ_ok``,
    ``threshold``, ``weights``, ``penalties``, ``standardize``, ``exclude``
    (zero-based indices), ``lambdas``, ``flmin`` and ``nlam``.  ``lambdas``
    cannot be combined with ``flmin`` or ``nlam``.

    Raises ``ValueError`` for bad arguments or for error codes 10000 and
    7777, ``MemoryError`` for any other nonzero code below 7777, and
    ``Exception`` for the remaining nonzero codes.
    """
    # Mandatory parameters
    predictors = np.asanyarray(predictors)
    target = np.asanyarray(target)

    # Largest allowable model sizes (memory / convergence).
    if memlimit is None:
        memlimit = predictors.shape[1]
    if largest is None:
        largest = memlimit
    if largest > memlimit:
        raise ValueError('Need largest <= memlimit')

    # Whether the solver may overwrite the caller's arrays.
    overwrite_pred_ok = False
    overwrite_targ_ok = False

    thr = _DEFAULT_THRESH     # minimum change in largest coefficient
    weights = None            # relative weight per observation
    vp = None                 # relative penalty per predictor (0 = none)
    isd = True                # standardize predictors first?
    jd = np.zeros(1)          # predictors excluded from the fit
    ulam = None               # user-specified lambda values
    flmin = _DEFAULT_FLMIN    # fraction of largest lambda at which to stop
    nlam = _DEFAULT_NLAM      # (maximum) number of lambdas to try

    for option, value in kwargs.items():
        if option == 'overwrite_pred_ok':
            overwrite_pred_ok = value
        elif option == 'overwrite_targ_ok':
            overwrite_targ_ok = value
        elif option == 'threshold':
            thr = value
        elif option == 'weights':
            weights = np.asarray(value).copy()
        elif option == 'penalties':
            vp = value.copy()
        elif option == 'standardize':
            # NOTE(review): parsed here but not forwarded to the solver call
            # below.
            isd = bool(value)
        elif option == 'exclude':
            # Fortran indices start at 1, hence the shift.
            shifted = (np.asarray(value) + 1).tolist()
            jd = np.array([len(shifted)] + shifted)
        elif option == 'lambdas':
            if 'flmin' in kwargs:
                raise ValueError("Can't specify both lambdas & flmin keywords")
            ulam = np.asarray(value)
            flmin = 2.  # flmin > 1.0 tells the solver to use ulam
            nlam = len(ulam)
        elif option == 'flmin':
            flmin = value
            ulam = None
        elif option == 'nlam':
            if 'lambdas' in kwargs:
                raise ValueError("Can't specify both lambdas & nlam keywords")
            nlam = value
        else:
            raise ValueError("Unknown keyword argument '%s'" % option)

    # A Fortran-contiguous predictor array would be overwritten by the
    # solver; copy it unless the caller allowed that.  Arrays in any other
    # layout get copied into Fortran order anyway.
    if np.isfortran(predictors) and not overwrite_pred_ok:
        predictors = predictors.copy(order='F')

    # A 1-D target is usually overwritten with its standardized version
    # unless it is copied first.
    if not overwrite_targ_ok:
        target = target.copy()

    # Uniform weights / penalties when none were given.
    if weights is None:
        weights = np.ones(predictors.shape[0])
    if vp is None:
        vp = np.ones(predictors.shape[1])

    # Call the Fortran wrapper.
    outputs = _glmnet.elnet(balance, predictors, target, weights, jd, vp,
                            memlimit, flmin, ulam, thr, nlam=nlam)
    lmu, a0, ca, ia, nin, rsq, alm, nlp, jerr = outputs

    # Error codes are documented in glmnet.f.
    if jerr == 10000:
        raise ValueError('cannot have max(vp) < 0.0')
    if jerr == 7777:
        raise ValueError('all used predictors have 0 variance')
    if jerr != 0:
        if jerr < 7777:
            raise MemoryError('elnet() returned error code %d' % jerr)
        raise Exception('unknown error: %d' % jerr)

    return lmu, a0, ca, ia, nin, rsq, alm, nlp, jerr
예제 #4
0
    def fit(self, X, y, col_names=None,
            lambdas=None, weights=None, rel_penalties=None,
            excl_preds=None, box_constraints=None,
            normalize=True, include_intercept=True):
        r'''Fit an elastic net model.

        Arguments:

          * X: The model matrix.  A n_obs * n_preds array.
          * y: The response.  A n_obs array.

        Optional Arguments:

          * col_names:
              Names for the columns of X.  If not given, the names are taken
              from X.design_info.column_names when X carries a design_info
              attribute, and otherwise generated as 'var_0', 'var_1', ...
          * lambdas:
              A user supplied list of lambdas, an elastic net will be fit for
              each lambda supplied.  If no array is passed, glmnet will generate
              its own array of lambdas equally spaced on a logarithmic scale
              between \lambda_max and \lambda_min.
          * weights:
               An n_obs array. Sample weights.
          * rel_penalties:
              An n_preds array. Relative penalty weights for the covariates.  If
              none is passed, all covariates are penalized equally.  If an array
              is passed, then a zero indicates an unpenalized parameter, and a 1
              a fully penalized parameter.  Otherwise all covariates receive an
              equal penalty.
          * excl_preds:
              An n_preds array, used to exclude covariates from the model. To
              exclude predictors, pass an array with a 1 in the first position,
              then a 1 in the i+1st position excludes the ith covariate from
              model fitting.  If no array is passed, all covariates in X are
              included in the model.
          * box_constraints:
              An array with dimension 2 * n_preds. Interval constraints on the
              fit coefficients.  The (0, i) entry is a lower bound on the ith
              covariate, and the (1, i) entry is an upper bound.  These must
              satisfy lower_bound <= 0 <= upper_bound.  If no array is passed,
              no box constraints are applied to the parameters.
          * normalize:
              Passed to the extension routine as its isd argument (presumably
              whether predictors are standardized before fitting; confirm
              against the extension module).
          * include_intercept:
              Passed to the extension routine as its intr argument (presumably
              whether an intercept is fit; confirm against the extension
              module).

        After fitting, the following attributes are set:

        Private attributes:

          * _n_fit_obs:
              The number of rows in the model matrix X.
          * _n_fit_params:
              The number of columns in the model matrix X.
          * _col_names:
              Names for the columns in the model matrix.  Used to display fit
              coefficients.
          * _out_n_lambdas:
              The number of lambdas associated with non-zero models (i.e.
              models with at least one non-zero parameter estimate) after
              fitting; for large enough lambda the models will become zero in
              the presence of an L1 regularizer.
          * _intercepts:
              A one dimensional array containing the intercept estimates for
              each value of lambda.  See the intercepts (no underscore)
              property for a public version.
          * _comp_coef:
              The fit parameter estimates in a compressed form.  This is a
              matrix with each row giving the estimates for a single
              coefficient for various values of \lambda.  The order of the rows
              does not correspond to the order of the coefficients as given in
              the design matrix used to fit the model, this association is
              given by the _p_comp_coef attribute.  Only estimates that are
              non-zero for some lambda are reported.
          * _p_comp_coef:
              A one dimensional integer array associating the coefficients in
              _comp_coef to columns in the model matrix.
          * _indices:
              The same information as _p_comp_coef, but zero indexed to be
              compatible with numpy arrays.
          * _n_comp_coef:
              The number of parameter estimates that are non-zero for some
              value of lambda.
          * _n_passes:
              The total number of passes over the data used to fit the model.
          * _error_flag:
              Error flag from the fortran code.

        Public Attributes:

          * r_sqs:
              An array of length _out_n_lambdas containing the r-squared
              statistic for each model.
          * out_lambdas:
              An array containing the lambda values associated with each fit
              model.

        Nothing is returned; all results are stored on the instance.
        '''
        self._check_if_unfit()
        # Convert to arrays if native python objects
        try:
            if not issparse(X):
                X = np.asanyarray(X)
            y = np.asanyarray(y)
        except ValueError:
            raise ValueError("X and y must be either numpy arrays, or "
                             "convertable to numpy arrays."
                  )
        # Grab the design info from patsy for later use, we are about to write
        # over this object in some cases.
        design_info = getattr(X, 'design_info', None)
        # Make a copy if we are not able to overwrite X with its standardized
        # version. Note that if X is not fortran contiguous, then it will be
        # copied anyway.
        if not issparse(X) and np.isfortran(X) and not self.overwrite_pred_ok:
            X = X.copy(order='F')
        # Make a copy if we are not able to overwrite y with its standardized
        # version.
        if not self.overwrite_targ_ok:
            y = y.copy()
        # Validate all the inputs.  The attributes read below (self.weights,
        # self.lambdas, ...) are presumably populated by these helpers.
        self._validate_matrix(X)
        self._validate_inputs(X, y)
        self._validate_lambdas(X, y, lambdas)
        self._validate_weights(X, y, weights)
        self._validate_rel_penalties(X, y, rel_penalties)
        self._validate_excl_preds(X, y, excl_preds)
        self._validate_box_constraints(X, y, box_constraints)
        # Setup is complete, call into the extension module.
        if not issparse(X):
            fit_out = _glmnet.elnet(
                self.alpha,
                X,
                y,
                self.weights,
                self.excl_preds,
                self.rel_penalties,
                self.box_constraints,
                self.max_vars_all,
                self.frac_lg_lambda,
                self.lambdas,
                self.threshold,
                nlam=self.n_lambdas,
                isd=normalize,
                intr=include_intercept
            )
        else:
            X.sort_indices()
            # Fortran arrays are 1 indexed.
            ind_ptrs = X.indptr + 1
            indices = X.indices + 1
            fit_out = _glmnet.spelnet(
                self.alpha,
                X.shape[0],
                X.shape[1],
                X.data,
                ind_ptrs,
                indices,
                y,
                self.weights,
                self.excl_preds,
                self.rel_penalties,
                self.box_constraints,
                self.max_vars_all,
                self.frac_lg_lambda,
                self.lambdas,
                self.threshold,
                nlam=self.n_lambdas,
                isd=normalize,
                intr=include_intercept
            )
        # Both routines return the same nine outputs in the same order.
        (self._out_n_lambdas,
         self._intercepts,
         self._comp_coef,
         self._p_comp_coef,
         self._n_comp_coef,
         self.r_sqs,
         self.out_lambdas,
         self._n_passes,
         self._error_flag) = fit_out
        self._check_errors()
        # The indexes into the predictor array are off by one due to fortran
        # convention differing from numpy's; this makes them indexes into the
        # numpy array.
        self._indices = np.trim_zeros(self._p_comp_coef, 'b') - 1
        # Keep some model metadata.
        self._n_fit_obs, self._n_fit_params = X.shape
        # Create a list of column names for the fit parameters, these can be
        # passed in, or attached to the matrix from patsy.  If none are found
        # we create generic ones.  Identity tests are used because comparing
        # an array of names with ``!= None`` is elementwise and has no single
        # truth value.
        if col_names is not None:
            self._col_names = col_names
        elif design_info is not None:
            self._col_names = design_info.column_names
        else:
            self._col_names = [
                'var_' + str(i) for i in range(self._n_fit_params)
            ]
예제 #5
0
    def fit(self,
            X,
            y,
            col_names=None,
            lambdas=None,
            weights=None,
            rel_penalties=None,
            excl_preds=None,
            box_constraints=None):
        r'''Fit an elastic net model.

        Arguments:

          * X: The model matrix.  A n_obs * n_preds array.
          * y: The response.  A n_obs array.

        Optional Arguments:

          * col_names:
              Names for the columns of X.  If not given, the names are taken
              from X.design_info.column_names when X carries a design_info
              attribute, and otherwise generated as 'var_0', 'var_1', ...
          * lambdas:
              A user supplied list of lambdas, an elastic net will be fit for
              each lambda supplied.  If no array is passed, glmnet will generate
              its own array of lambdas equally spaced on a logarithmic scale
              between \lambda_max and \lambda_min.
          * weights:
               An n_obs array. Sample weights.
          * rel_penalties:
              An n_preds array. Relative penalty weights for the covariates.  If
              none is passed, all covariates are penalized equally.  If an array
              is passed, then a zero indicates an unpenalized parameter, and a 1
              a fully penalized parameter.  Otherwise all covariates receive an
              equal penalty.
          * excl_preds:
              An n_preds array, used to exclude covariates from the model. To
              exclude predictors, pass an array with a 1 in the first position,
              then a 1 in the i+1st position excludes the ith covariate from
              model fitting.  If no array is passed, all covariates in X are
              included in the model.
          * box_constraints:
              An array with dimension 2 * n_preds. Interval constraints on the
              fit coefficients.  The (0, i) entry is a lower bound on the ith
              covariate, and the (1, i) entry is an upper bound.  These must
              satisfy lower_bound <= 0 <= upper_bound.  If no array is passed,
              no box constraints are applied to the parameters.

        After fitting, the following attributes are set:

        Private attributes:

          * _n_fit_obs:
              The number of rows in the model matrix X.
          * _n_fit_params:
              The number of columns in the model matrix X.
          * _col_names:
              Names for the columns in the model matrix.  Used to display fit
              coefficients.
          * _out_n_lambdas:
              The number of lambdas associated with non-zero models (i.e.
              models with at least one non-zero parameter estimate) after
              fitting; for large enough lambda the models will become zero in
              the presence of an L1 regularizer.
          * _intercepts:
              A one dimensional array containing the intercept estimates for
              each value of lambda.  See the intercepts (no underscore)
              property for a public version.
          * _comp_coef:
              The fit parameter estimates in a compressed form.  This is a
              matrix with each row giving the estimates for a single
              coefficient for various values of \lambda.  The order of the rows
              does not correspond to the order of the coefficients as given in
              the design matrix used to fit the model, this association is
              given by the _p_comp_coef attribute.  Only estimates that are
              non-zero for some lambda are reported.
          * _p_comp_coef:
              A one dimensional integer array associating the coefficients in
              _comp_coef to columns in the model matrix.
          * _indices:
              The same information as _p_comp_coef, but zero indexed to be
              compatible with numpy arrays.
          * _n_comp_coef:
              The number of parameter estimates that are non-zero for some
              value of lambda.
          * _n_passes:
              The total number of passes over the data used to fit the model.
          * _error_flag:
              Error flag from the fortran code.

        Public Attributes:

          * r_sqs:
              An array of length _out_n_lambdas containing the r-squared
              statistic for each model.
          * out_lambdas:
              An array containing the lambda values associated with each fit
              model.

        Nothing is returned; all results are stored on the instance.
        '''
        self._check_if_unfit()
        # Convert to arrays if native python objects
        try:
            if not issparse(X):
                X = np.asanyarray(X)
            y = np.asanyarray(y)
        except ValueError:
            raise ValueError("X and y must be either numpy arrays, or "
                             "convertable to numpy arrays.")
        # Grab the design info from patsy for later use, we are about to write
        # over this object in some cases.
        design_info = getattr(X, 'design_info', None)
        # Make a copy if we are not able to overwrite X with its standardized
        # version. Note that if X is not fortran contiguous, then it will be
        # copied anyway.
        if not issparse(X) and np.isfortran(X) and not self.overwrite_pred_ok:
            X = X.copy(order='F')
        # Make a copy if we are not able to overwrite y with its standardized
        # version.
        if not self.overwrite_targ_ok:
            y = y.copy()
        # Validate all the inputs.  The attributes read below (self.weights,
        # self.lambdas, ...) are presumably populated by these helpers.
        self._validate_matrix(X)
        self._validate_inputs(X, y)
        self._validate_lambdas(X, y, lambdas)
        self._validate_weights(X, y, weights)
        self._validate_rel_penalties(X, y, rel_penalties)
        self._validate_excl_preds(X, y, excl_preds)
        self._validate_box_constraints(X, y, box_constraints)
        # Setup is complete, call into the extension module.
        if not issparse(X):
            fit_out = _glmnet.elnet(self.alpha,
                                    X,
                                    y,
                                    self.weights,
                                    self.excl_preds,
                                    self.rel_penalties,
                                    self.box_constraints,
                                    self.max_vars_all,
                                    self.frac_lg_lambda,
                                    self.lambdas,
                                    self.threshold,
                                    nlam=self.n_lambdas)
        else:
            X.sort_indices()
            # Fortran arrays are 1 indexed.
            ind_ptrs = X.indptr + 1
            indices = X.indices + 1
            fit_out = _glmnet.spelnet(self.alpha,
                                      X.shape[0],
                                      X.shape[1],
                                      X.data,
                                      ind_ptrs,
                                      indices,
                                      y,
                                      self.weights,
                                      self.excl_preds,
                                      self.rel_penalties,
                                      self.box_constraints,
                                      self.max_vars_all,
                                      self.frac_lg_lambda,
                                      self.lambdas,
                                      self.threshold,
                                      nlam=self.n_lambdas)
        # Both routines return the same nine outputs in the same order.
        (self._out_n_lambdas, self._intercepts, self._comp_coef,
         self._p_comp_coef, self._n_comp_coef, self.r_sqs,
         self.out_lambdas, self._n_passes,
         self._error_flag) = fit_out
        self._check_errors()
        # The indexes into the predictor array are off by one due to fortran
        # convention differing from numpy's; this makes them indexes into the
        # numpy array.
        self._indices = np.trim_zeros(self._p_comp_coef, 'b') - 1
        # Keep some model metadata.
        self._n_fit_obs, self._n_fit_params = X.shape
        # Create a list of column names for the fit parameters, these can be
        # passed in, or attached to the matrix from patsy.  If none are found
        # we create generic ones.  Identity tests are used because comparing
        # an array of names with ``!= None`` is elementwise and has no single
        # truth value.
        if col_names is not None:
            self._col_names = col_names
        elif design_info is not None:
            self._col_names = design_info.column_names
        else:
            self._col_names = [
                'var_' + str(i) for i in range(self._n_fit_params)
            ]
예제 #6
0
    def fit(
            self,
            X,
            y,
            lambdas=None,
            weights=None,
            rel_penalties=None,
            excl_preds=None,
            box_constraints=None,
            intercept=True  # include intercept by default
    ):
        '''Fit an elastic net model.

        Arguments:

          * X: The predictors.  A n_obs * n_preds array, either dense
            (anything numpy can convert to an array) or a scipy sparse
            matrix.
          * y: The response.  A n_obs array.

        Optional Arguments:

          * lambdas: A user supplied list of lambdas, an elastic net will be
            fit for each lambda supplied.  If no array is passed, glmnet
            will generate its own array of lambdas.
          * weights: An n_obs array. Observation weights.
          * rel_penalties: An n_preds array. Relative penalty weights for the
            covariates.  If none is passed, all covariates are penalized
            equally.  If an array is passed, then a zero indicates an
            unpenalized parameter, and a 1 a fully penalized parameter.
          * excl_preds: An n_preds array, used to exclude covariates from
            the model. To exclude predictors, pass an array with a 1 in the
            first position, then a 1 in the i+1st position excludes the ith
            covariate from model fitting.
          * box_constraints: An array with dimension 2 * n_preds. Interval
            constraints on the fit coefficients.  The (0, i) entry
            is a lower bound on the ith covariate, and the (1, i) entry is
            an upper bound.
          * intercept: Whether to include an intercept in the model
            (default True).  Passed to the fortran wrapper as the integer
            flag intr.

        Raises:

          * ValueError: If X or y cannot be converted to a numpy array.
          * Whatever the _validate_* helpers and _check_errors raise (not
            defined in this method).

        After fitting, the following attributes are set:

        Private attributes:

          * _out_n_lambdas: The number of fit lambdas associated with non-zero
            models; for large enough lambdas the models will become zero in the
            presence of an L1 regularizer.
          * _intercepts: An array of length _out_n_lambdas.  The intercept for
            each model.
          * _comp_coef: The fit coefficients in a compressed form.  Only
            coefficients that are non-zero for some lambda are reported, and the
            association between these parameters and the predictors is given by
            the _p_comp_coef attribute.
          * _p_comp_coef: An array associating the coefficients in _comp_coef to
            columns in the predictor array.
          * _indicies: The same information as _p_comp_coef, but zero indexed to
            be compatible with numpy arrays.
          * _n_comp_coef: The number of coefficients that are non-zero for some
            value of lambda.
          * _n_passes: The total number of passes over the data used to fit the
            model.
          * _error_flag: Error flag from the fortran code.
          * _n_fit_obs, _n_fit_params: The shape of X used for the fit.

        Public Attributes:

          * r_sqs: An array of length _out_n_lambdas containing the r-squared
            statistic for each model.
          * out_lambdas: An array containing the lambda values associated with
            each fit model.
        '''
        # Convert to arrays if native python objects
        try:
            if not issparse(X):
                X = np.asanyarray(X)
            y = np.asanyarray(y)
        except ValueError:
            raise ValueError("X and y must be either numpy arrays, or "
                             "convertible to numpy arrays.")
        # Make a copy if we are not able to overwrite X with its standardized
        # version. Note that if X is not fortran contiguous, then it will be
        # copied anyway.  The contiguity flag is tested directly because
        # np.isfortran() is False for arrays that are both C and fortran
        # contiguous (e.g. a single column), which would skip the copy and
        # leave the caller's array exposed to being overwritten.
        if (not issparse(X) and X.flags.f_contiguous
                and not self.overwrite_pred_ok):
            X = X.copy(order='F')
        # The target array will usually be overwritten with its standardized
        # version, if this is not ok, we should copy.
        if not self.overwrite_targ_ok:
            y = y.copy()
        # Validate all the inputs.  The wrapper calls below read self.lambdas,
        # self.weights, self.rel_penalties, self.excl_preds and
        # self.box_constraints, which this method never assigns, so these
        # helpers presumably store the validated values on self -- confirm
        # against their definitions.
        self._validate_matrix(X)
        self._validate_inputs(X, y)
        self._validate_lambdas(X, y, lambdas)
        self._validate_weights(X, y, weights)
        self._validate_rel_penalties(X, y, rel_penalties)
        self._validate_excl_preds(X, y, excl_preds)
        self._validate_box_constraints(X, y, box_constraints)
        # Setup is complete, call the wrapper.
        if not issparse(X):
            fit_results = _glmnet.elnet(self.alpha,
                                        X,
                                        y,
                                        self.weights,
                                        self.excl_preds,
                                        self.rel_penalties,
                                        self.box_constraints,
                                        self.max_vars_all,
                                        self.frac_lg_lambda,
                                        self.lambdas,
                                        self.threshold,
                                        nlam=self.n_lambdas,
                                        intr=int(intercept))
        else:
            # NOTE: sort_indices() reorders the caller's sparse matrix in
            # place, regardless of overwrite_pred_ok.
            X.sort_indices()
            # Fortran arrays are 1 indexed.
            ind_ptrs = X.indptr + 1
            indices = X.indices + 1
            fit_results = _glmnet.spelnet(self.alpha,
                                          X.shape[0],
                                          X.shape[1],
                                          X.data,
                                          ind_ptrs,
                                          indices,
                                          y,
                                          self.weights,
                                          self.excl_preds,
                                          self.rel_penalties,
                                          self.box_constraints,
                                          self.max_vars_all,
                                          self.frac_lg_lambda,
                                          self.lambdas,
                                          self.threshold,
                                          nlam=self.n_lambdas,
                                          intr=int(intercept))
        # Both wrappers return the same nine outputs, in the same order.
        (self._out_n_lambdas, self._intercepts, self._comp_coef,
         self._p_comp_coef, self._n_comp_coef, self.r_sqs,
         self.out_lambdas, self._n_passes,
         self._error_flag) = fit_results
        self._check_errors()
        # Keep some model metadata
        self._n_fit_obs, self._n_fit_params = X.shape
        # The indexes into the predictor array are off by one due to fortran
        # convention, fix it up.  Trailing zeros are unused slots, so they are
        # trimmed before shifting to zero based indexes.
        self._indicies = np.trim_zeros(self._p_comp_coef, 'b') - 1
예제 #7
0
def elastic_net(predictors, target, balance, memlimit=None,
                largest=None, **kwargs):
    """
    Raw-output wrapper for elastic net linear regression.

    Parameters
    ----------
    predictors : array-like
        Predictor matrix; its second dimension supplies the default for
        `memlimit` and the length of the default penalty vector.
    target : array-like
        Response values.
    balance : float
        Passed unchanged as the first argument of `_glmnet.elnet`
        (presumably the elastic net mixing parameter -- confirm against
        the Fortran wrapper).
    memlimit : int, optional
        Largest allowable model size handed to the Fortran routine.
        Defaults to the number of predictor columns.
    largest : int, optional
        Defaults to `memlimit` and must not exceed it.
    **kwargs
        Optional settings; any other keyword raises ValueError:

        * overwrite_pred_ok -- allow `predictors` to be overwritten.
        * overwrite_targ_ok -- allow `target` to be overwritten.
        * threshold -- convergence threshold.
        * weights -- per-observation weights (copied).
        * penalties -- per-predictor relative penalties (copied).
        * standardize -- standardize inputs before fitting.
        * exclude -- zero-based indices of predictors to exclude.
        * lambdas -- user-supplied lambda values; cannot be combined with
          flmin or nlam.
        * flmin -- fraction of the largest lambda at which to stop.
        * nlam -- maximum number of lambdas to try.

    Returns
    -------
    tuple
        ``(lmu, a0, ca, ia, nin, rsq, alm, nlp, jerr)`` exactly as returned
        by `_glmnet.elnet`.

    Raises
    ------
    ValueError
        If `largest` exceeds `memlimit`, on an unknown or conflicting
        keyword, or when the Fortran code reports 10000 or 7777.
    MemoryError
        If the Fortran error flag is non-zero and below 7777.
    Exception
        For any other non-zero error flag.
    """

    # Mandatory parameters
    predictors = np.asanyarray(predictors)
    target = np.asanyarray(target)

    # Decide on largest allowable models for memory/convergence.
    memlimit = predictors.shape[1] if memlimit is None else memlimit

    # If largest isn't specified use memlimit.
    largest = memlimit if largest is None else largest

    # NOTE(review): `largest` is validated here but never forwarded to the
    # Fortran call below.
    if memlimit < largest:
        raise ValueError('Need largest <= memlimit')

    # Flags determining overwrite behavior
    overwrite_pred_ok = False
    overwrite_targ_ok = False

    thr = _DEFAULT_THRESH   # Minimum change in largest coefficient
    weights = None          # Relative weighting per observation case
    vp = None               # Relative penalties per predictor (0 = no penalty)
    # NOTE(review): `isd` is parsed from the 'standardize' keyword but is not
    # passed to _glmnet.elnet, so the keyword currently has no effect.
    isd = True              # Standardize input variables before proceeding?
    jd = np.zeros(1)        # Predictors to exclude altogether from fitting
    ulam = None             # User-specified lambda values
    flmin = _DEFAULT_FLMIN  # Fraction of largest lambda at which to stop
    nlam = _DEFAULT_NLAM    # The (maximum) number of lambdas to try.

    for keyword, value in kwargs.items():
        if keyword == 'overwrite_pred_ok':
            overwrite_pred_ok = value
        elif keyword == 'overwrite_targ_ok':
            overwrite_targ_ok = value
        elif keyword == 'threshold':
            thr = value
        elif keyword == 'weights':
            weights = np.asarray(value).copy()
        elif keyword == 'penalties':
            # Same treatment as weights, so any array-like is accepted.
            vp = np.asarray(value).copy()
        elif keyword == 'standardize':
            isd = bool(value)
        elif keyword == 'exclude':
            # Add one since Fortran indices start at 1
            exclude = (np.asarray(value) + 1).tolist()
            # First entry is the count, followed by the 1-based indices.
            jd = np.array([len(exclude)] + exclude)
        elif keyword == 'lambdas':
            if 'flmin' in kwargs:
                raise ValueError("Can't specify both lambdas & flmin keywords")
            ulam = np.asarray(value)
            flmin = 2.  # Pass flmin > 1.0 indicating to use the user-supplied.
            nlam = len(ulam)
        elif keyword == 'flmin':
            flmin = value
            ulam = None
        elif keyword == 'nlam':
            if 'lambdas' in kwargs:
                raise ValueError("Can't specify both lambdas & nlam keywords")
            nlam = value
        else:
            raise ValueError("Unknown keyword argument '%s'" % keyword)

    # If predictors is a Fortran contiguous array, it will be overwritten.
    # Decide whether we want this. If it's not Fortran contiguous it will
    # be copied into that form anyway so there's no chance of overwriting.
    # The flag is tested directly: np.isfortran() is False for arrays that
    # are both C and Fortran contiguous (e.g. a single column), which would
    # skip the protective copy.
    if predictors.flags.f_contiguous and not overwrite_pred_ok:
        # Might as well make it F-ordered to avoid ANOTHER copy.
        predictors = predictors.copy(order='F')

    # target being a 1-dimensional array will usually be overwritten
    # with the standardized version unless we take steps to copy it.
    if not overwrite_targ_ok:
        target = target.copy()

    # Uniform weighting if no weights are specified.
    if weights is None:
        weights = np.ones(predictors.shape[0])

    # Uniform penalties if none were specified.
    if vp is None:
        vp = np.ones(predictors.shape[1])

    # Call the Fortran wrapper.
    lmu, a0, ca, ia, nin, rsq, alm, nlp, jerr = \
        _glmnet.elnet(balance, predictors, target, weights, jd, vp,
                      memlimit, flmin, ulam, thr, nlam=nlam)

    # Check for errors, documented in glmnet.f.
    if jerr != 0:
        if jerr == 10000:
            raise ValueError('cannot have max(vp) < 0.0')
        elif jerr == 7777:
            raise ValueError('all used predictors have 0 variance')
        elif jerr < 7777:
            # NOTE(review): this branch also catches negative codes, which
            # glmnet.f reportedly treats as non-fatal (partial output) --
            # confirm before relying on MemoryError for them.
            raise MemoryError('elnet() returned error code %d' % jerr)
        else:
            raise Exception('unknown error: %d' % jerr)

    return lmu, a0, ca, ia, nin, rsq, alm, nlp, jerr
예제 #8
0
    def fit(self, X, y,
            lambdas=None, weights=None, rel_penalties=None,
            excl_preds=None, box_constraints=None,
            intercept=True # include intercept by default
            ):
        '''Fit an elastic net model.

        Arguments:

          * X: The predictors.  A n_obs * n_preds array, either dense
            (anything numpy can convert to an array) or a scipy sparse
            matrix.
          * y: The response.  A n_obs array.

        Optional Arguments:

          * lambdas: A user supplied list of lambdas, an elastic net will be
            fit for each lambda supplied.  If no array is passed, glmnet
            will generate its own array of lambdas.
          * weights: An n_obs array. Observation weights.
          * rel_penalties: An n_preds array. Relative penalty weights for the
            covariates.  If none is passed, all covariates are penalized
            equally.  If an array is passed, then a zero indicates an
            unpenalized parameter, and a 1 a fully penalized parameter.
          * excl_preds: An n_preds array, used to exclude covariates from
            the model. To exclude predictors, pass an array with a 1 in the
            first position, then a 1 in the i+1st position excludes the ith
            covariate from model fitting.
          * box_constraints: An array with dimension 2 * n_preds. Interval
            constraints on the fit coefficients.  The (0, i) entry
            is a lower bound on the ith covariate, and the (1, i) entry is
            an upper bound.
          * intercept: Whether to include an intercept in the model
            (default True).  Passed to the fortran wrapper as the integer
            flag intr.

        Raises:

          * ValueError: If X or y cannot be converted to a numpy array.
          * Whatever the _validate_* helpers and _check_errors raise (not
            defined in this method).

        After fitting, the following attributes are set:

        Private attributes:

          * _out_n_lambdas: The number of fit lambdas associated with non-zero
            models; for large enough lambdas the models will become zero in the
            presence of an L1 regularizer.
          * _intercepts: An array of length _out_n_lambdas.  The intercept for
            each model.
          * _comp_coef: The fit coefficients in a compressed form.  Only
            coefficients that are non-zero for some lambda are reported, and the
            association between these parameters and the predictors is given by
            the _p_comp_coef attribute.
          * _p_comp_coef: An array associating the coefficients in _comp_coef to
            columns in the predictor array.
          * _indicies: The same information as _p_comp_coef, but zero indexed to
            be compatible with numpy arrays.
          * _n_comp_coef: The number of coefficients that are non-zero for some
            value of lambda.
          * _n_passes: The total number of passes over the data used to fit the
            model.
          * _error_flag: Error flag from the fortran code.
          * _n_fit_obs, _n_fit_params: The shape of X used for the fit.

        Public Attributes:

          * r_sqs: An array of length _out_n_lambdas containing the r-squared
            statistic for each model.
          * out_lambdas: An array containing the lambda values associated with
            each fit model.
        '''
        # Convert to arrays if native python objects
        try:
            if not issparse(X):
                X = np.asanyarray(X)
            y = np.asanyarray(y)
        except ValueError:
            raise ValueError("X and y must be either numpy arrays, or "
                             "convertible to numpy arrays."
                  )
        # Make a copy if we are not able to overwrite X with its standardized
        # version. Note that if X is not fortran contiguous, then it will be
        # copied anyway.  The contiguity flag is checked rather than
        # np.isfortran(), which reports False for arrays that are both C and
        # fortran contiguous (e.g. a single column) and so would leave the
        # caller's array unprotected.
        dense_input = not issparse(X)
        if dense_input and X.flags.f_contiguous and not self.overwrite_pred_ok:
            X = X.copy(order='F')
        # The target array will usually be overwritten with its standardized
        # version, if this is not ok, we should copy.
        if not self.overwrite_targ_ok:
            y = y.copy()
        # Validate all the inputs.  The attributes read by the wrapper calls
        # below (self.lambdas, self.weights, self.rel_penalties,
        # self.excl_preds, self.box_constraints) are not assigned in this
        # method; presumably these helpers set them -- confirm against their
        # definitions.
        self._validate_matrix(X)
        self._validate_inputs(X, y)
        self._validate_lambdas(X, y, lambdas)
        self._validate_weights(X, y, weights)
        self._validate_rel_penalties(X, y, rel_penalties)
        self._validate_excl_preds(X, y, excl_preds)
        self._validate_box_constraints(X, y, box_constraints)
        # Setup is complete, call the wrapper.
        if dense_input:
            wrapper_output = _glmnet.elnet(
                self.alpha,
                X,
                y,
                self.weights,
                self.excl_preds,
                self.rel_penalties,
                self.box_constraints,
                self.max_vars_all,
                self.frac_lg_lambda,
                self.lambdas,
                self.threshold,
                nlam=self.n_lambdas,
                intr=int(intercept)
            )
        else:
            # NOTE: sort_indices() reorders the caller's sparse matrix in
            # place, regardless of overwrite_pred_ok.
            X.sort_indices()
            # Fortran arrays are 1 indexed.
            ind_ptrs = X.indptr + 1
            indices = X.indices + 1
            wrapper_output = _glmnet.spelnet(
                self.alpha,
                X.shape[0],
                X.shape[1],
                X.data,
                ind_ptrs,
                indices,
                y,
                self.weights,
                self.excl_preds,
                self.rel_penalties,
                self.box_constraints,
                self.max_vars_all,
                self.frac_lg_lambda,
                self.lambdas,
                self.threshold,
                nlam=self.n_lambdas,
                intr=int(intercept)
            )
        # Both wrappers return the same nine outputs, in the same order.
        (self._out_n_lambdas,
         self._intercepts,
         self._comp_coef,
         self._p_comp_coef,
         self._n_comp_coef,
         self.r_sqs,
         self.out_lambdas,
         self._n_passes,
         self._error_flag) = wrapper_output
        self._check_errors()
        # Keep some model metadata
        self._n_fit_obs, self._n_fit_params = X.shape
        # The indexes into the predictor array are off by one due to fortran
        # convention, fix it up.  Trailing zeros are unused slots, so they are
        # trimmed before shifting to zero based indexes.
        self._indicies = np.trim_zeros(self._p_comp_coef, 'b') - 1