Beispiel #1
0
    def _fit(self, X, y, sample_weight=None, relative_penalties=None):
        """Fit the regularization path by calling glmnet's (sp)lognet.

        The labels are encoded as an indicator matrix, the auxiliary arrays
        taken by the Fortran routine are assembled, the call is dispatched to
        ``splognet`` (sparse ``X``) or ``lognet`` (dense ``X``), and the
        returned path is trimmed and decompressed.

        Parameters
        ----------
        X : array or scipy sparse matrix
            Rows are samples, columns are features.
        y : numpy array
            Class labels; indexed as ``y[:, None]``, so it must already be
            an array.
        sample_weight : optional
            Accepted for interface compatibility; this method never reads it.
        relative_penalties : optional array
            Per-feature penalty factors. All ones when ``None``.

        Returns
        -------
        self
            With ``classes_``, ``n_lambda_``, ``lambda_path_``,
            ``intercept_path_``, ``coef_path_`` and ``jerr_`` set.

        Raises
        ------
        ValueError
            If ``y`` holds fewer than two distinct classes.
        """
        # A user-supplied lambda path dictates how many lambdas are fit; the
        # min-lambda ratio is then pinned to 1.0.
        user_path = self.lambda_path
        if user_path is None:
            n_lambda, min_lambda_ratio = self.n_lambda, self.min_lambda_ratio
        else:
            n_lambda, min_lambda_ratio = len(user_path), 1.0

        check_classification_targets(y)
        self.classes_ = np.unique(y)  # np.unique returns sorted values
        n_classes = len(self.classes_)
        if n_classes < 2:
            message = "Training data need to contain at least 2 classes."
            raise ValueError(message)

        # glmnet takes the labels as an (n_samples, n_classes) indicator
        # matrix. Comparing a column vector of labels against the row of
        # classes broadcasts to exactly that shape.
        labels = y[:, None]
        if n_classes == 2:
            # Binary case: classes_ is sorted, so the negative class would
            # land in column 0. Using != swaps the two columns so the fitted
            # model predicts the positive class.
            indicator = labels != self.classes_
        else:
            # Multinomial case: the whole matrix is used, order is kept.
            indicator = labels == self.classes_
        _y = indicator.astype(np.float64, order='F')

        n_features = X.shape[1]

        # Zero offsets, one row per sample. n_classes is still the true class
        # count here (2 for a binary problem).
        offset = np.zeros((X.shape[0], n_classes), dtype=np.float64, order='F')

        # A plain 0 is handed over as the exclusion argument -- presumably
        # "exclude nothing"; see doc.py to confirm.
        exclude_vars = 0

        # Relative penalty per feature. Useful when some variables must stay
        # in the model or are known to matter more than others, see
        # http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html
        if relative_penalties is None:
            relative_penalties = np.ones(n_features, dtype=np.float64,
                                         order='F')

        # Row 0 carries the lower limits, row 1 the upper limits.
        coef_bounds = np.empty((2, n_features), dtype=np.float64, order='F')
        coef_bounds[0] = self.lower_limits
        coef_bounds[1] = self.upper_limits

        # Stopping criterion (ne): one more than the feature count so the
        # final model may include every feature. R defaults to
        # nx = num_features and ne = num_features + 1. Ignored when the user
        # specifies lambda_path.
        stop_after = n_features + 1

        # For a binary problem glmnet is told there is a single class.
        # Otherwise it would return two coefficient sets equal in magnitude
        # and opposite in sign, each one half of the proper values.
        glmnet_n_classes = 1 if n_classes == 2 else n_classes

        # The argument order below is positional and dictated by the wrapped
        # Fortran routines (documented in doc.py); do not reorder.
        if issparse(X):
            _x = csc_matrix(X, dtype=np.float64, copy=True)
            n_rows, n_cols = _x.shape
            fit_out = splognet(
                self.alpha, n_rows, n_cols, glmnet_n_classes,
                _x.data,
                _x.indptr + 1,   # Fortran uses 1-based indexing
                _x.indices + 1,
                _y, offset, exclude_vars, relative_penalties, coef_bounds,
                stop_after, n_features,
                min_lambda_ratio, user_path, self.tol, n_lambda,
                self.standardize, self.fit_intercept, self.max_iter, 0)
        else:
            # glmnet needs float64 inputs and may overwrite them while
            # fitting, hence the explicit copy. Asking for Fortran layout up
            # front keeps the wrapper from making a second copy.
            _x = X.astype(dtype=np.float64, order='F', copy=True)
            fit_out = lognet(
                self.alpha, glmnet_n_classes, _x, _y, offset,
                exclude_vars, relative_penalties, coef_bounds,
                stop_after, min_lambda_ratio, user_path, self.tol,
                n_features, n_lambda,
                self.standardize, self.fit_intercept, self.max_iter, 0)

        # Slots 6, 7 and 9 (dev0, dev, nlp) are not kept.
        (self.n_lambda_, self.intercept_path_, ca, ia, nin,
         _, _, self.lambda_path_, _, jerr) = fit_out

        # The checker raises RuntimeError when the flag is nonzero.
        self.jerr_ = jerr
        _check_glmnet_error_flag(self.jerr_)

        # glmnet may return fewer lambdas than requested; cut the trailing
        # padding off every returned array so they all have n_lambda_ entries
        # along the lambda axis. The first lambda value is repaired as well.
        n_kept = self.n_lambda_
        self.lambda_path_ = _fix_lambda_path(self.lambda_path_[:n_kept])
        self.intercept_path_ = self.intercept_path_[:, :n_kept]

        # Decompress the coefficients (see doc.py). nin is the number of
        # coefficients per lambda.
        coefs = lsolns(X.shape[1], ca[:, :, :n_kept], ia, nin[:n_kept])

        # lsolns yields (n_features, n_classes, n_lambda); scikit-learn
        # models use (n_classes, n_features, n_lambda).
        self.coef_path_ = np.transpose(coefs, axes=(1, 0, 2))

        return self
Beispiel #2
0
    def fit(self,
            X,
            y,
            col_names=None,
            lambdas=None,
            weights=None,
            rel_penalties=None,
            excl_preds=None,
            box_constraints=None,
            offsets=None):
        r'''Fit a logistic or multinomial net model.

        Arguments:

          * X: The model matrix.  An n_obs * n_preds array, dense or scipy
            sparse.
          * y: The response.  This method accepts the response in two
            different configurations:

            - An n_obs * n_classes array.  In this case, each column in y must
              be of boolean (0, 1) type indicating whether the observation is
              or is not of a given class.
            - An n_obs array.  In this case the array must contain a discrete
              number of values, and is converted into the previous form before
              being passed to the model.

        Optional Arguments:

          * col_names:
              Names for the columns of X.  If none are passed, the names are
              taken from X.design_info.column_names when X carries a
              design_info attribute (as patsy design matrices do); otherwise
              the names 'var_0', 'var_1', ... are generated.
          * lambdas:
              A user supplied list of lambdas, an elastic net will be fit for
              each lambda supplied.  If no array is passed, glmnet will
              generate its own array of lambdas equally spaced on a
              logarithmic scale between \lambda_max and \lambda_min.
          * weights:
              An n_obs array. Sample weights. It is an error to pass a weights
              array to a logistic model.
          * rel_penalties:
              An n_preds array. Relative penalty weights for the covariates.
              If none is passed, all covariates are penalized equally.  If an
              array is passed, then a zero indicates an unpenalized parameter,
              and a 1 a fully penalized parameter.
          * excl_preds:
              An n_preds array, used to exclude covariates from the model. To
              exclude predictors, pass an array with a 1 in the first
              position, then a 1 in the i+1st position excludes the ith
              covariate from model fitting.  If no array is passed, all
              covariates in X are included in the model.
          * box_constraints:
              An array with dimension 2 * n_preds. Interval constraints on the
              fit coefficients.  The (0, i) entry is a lower bound on the ith
              covariate, and the (1, i) entry is an upper bound.  These must
              satisfy lower_bound <= 0 <= upper_bound.  If no array is passed,
              no box constraints are applied to the parameters.
          * offsets:
              A n_preds * n_classes array. Used as initial offsets for the
              model fitting.
              NOTE(review): glmnet offsets are normally one row per
              observation (n_obs * n_classes) -- confirm against
              _validate_offsets.

        After fitting, the following attributes are set:

        Private attributes:

          * _n_classes:
              The number of columns of the (possibly widened) response.
          * _n_fit_obs:
              The number of rows in the model matrix X.
          * _n_fit_params:
              The number of columns in the model matrix X.
          * _out_n_lambdas:
              The number of lambdas associated with non-zero models (i.e.
              models with at least one non-zero parameter estimate) after
              fitting; for large enough lambda the models will become zero in
              the presence of an L1 regularizer.
          * _intercepts:
              A one dimensional array containing the intercept estimates for
              each value of lambda.  See the intercepts (no underscore)
              property for a public version.
          * _comp_coef:
              The fit parameter estimates in a compressed form.  This is a
              matrix with each row giving the estimates for a single
              coefficient for various values of \lambda.  The order of the
              rows does not correspond to the order of the coefficients as
              given in the design matrix used to fit the model, this
              association is given by the _p_comp_coef attribute.  Only
              estimates that are non-zero for some lambda are reported.
          * _p_comp_coef:
              A one dimensional integer array associating the coefficients in
              _comp_coef to columns in the model matrix.
          * _indices:
              The same information as _p_comp_coef, but zero indexed to be
              compatible with numpy arrays.
          * _n_comp_coef:
              The number of parameter estimates that are non-zero for some
              value of lambda.
          * _n_passes:
              The total number of passes over the data used to fit the model.
          * _error_flag:
              Error flag from the fortran code.
          * _col_names:
              The column names associated with the fit parameters.

        Public Attributes:

          * null_dev:
              The deviance of the null (mean) model.
          * exp_dev:
              The deviance explained by the model.
          * out_lambdas:
              An array containing the lambda values associated with each fit
              model.
        '''
        self._check_if_unfit()
        self._check_y(y)
        self._check_weights(weights)
        # Convert native python objects to arrays.
        try:
            if not issparse(X):
                X = np.asanyarray(X)
            y = np.asanyarray(y)
        except ValueError:
            raise ValueError("X and y must be wither numpy arrays, or "
                             "convertable to numpy arrays.")
        # Fortran expects an n_obs * n_classes array for y.  If a one
        # dimensional array is passed, we construct an appropriate widening:
        # one indicator column per distinct value, in np.unique (sorted)
        # order.
        # TODO: Save class names as attribute.
        if len(y.shape) == 1:
            y_classes = np.unique(y)
            # column_stack must be given a real sequence; handing it a
            # generator is deprecated in NumPy and rejected by newer releases.
            indicator_columns = [y == c for c in y_classes]
            y = np.column_stack(indicator_columns).astype(np.float64)
        self._n_classes = y.shape[1]
        # Two class predictions are handled as a special case, as is usual
        # with logistic models, this is signaled to fortran by passing in a
        # 1 for nc (num classes).
        if self._n_classes == 2:
            f_n_classes = np.array([1])
        else:
            f_n_classes = np.array([self._n_classes])
        # Grab the design info from patsy for later use, we are about to
        # write over this object in some cases.
        design_info = getattr(X, 'design_info', None)
        # Make a copy if we are not able to overwrite X with its standardized
        # version. Note that if X is not fortran contiguous, then it will be
        # copied anyway.
        if not issparse(X) and np.isfortran(X) and not self.overwrite_pred_ok:
            X = X.copy(order='F')
        # Make a copy if we are not able to overwrite y with its standardized
        # version.
        if np.isfortran(y) and not self.overwrite_targ_ok:
            y = y.copy(order='F')
        # Validate all the inputs.  The wrapper calls below read
        # self.lambdas, self.rel_penalties, self.excl_preds,
        # self.box_constraints and self.offsets rather than the arguments of
        # this method -- presumably the _validate_* helpers store the
        # processed values there; their definitions are not visible here.
        self._validate_matrix(X)
        self._validate_inputs(X, y)
        self._validate_lambdas(X, y, lambdas)
        self._validate_rel_penalties(X, y, rel_penalties)
        self._validate_excl_preds(X, y, excl_preds)
        self._validate_box_constraints(X, y, box_constraints)
        self._validate_offsets(X, y, offsets)
        # Setup is complete, call the wrapper
        if not issparse(X):
            (self._out_n_lambdas, self._intercepts, self._comp_coef,
             self._p_comp_coef, self._n_comp_coef, self.null_dev, self.exp_dev,
             self.out_lambdas, self._n_passes,
             self._error_flag) = _glmnet.lognet(self.alpha,
                                                f_n_classes,
                                                X,
                                                y,
                                                self.offsets,
                                                self.excl_preds,
                                                self.rel_penalties,
                                                self.box_constraints,
                                                self.max_vars_all,
                                                self.frac_lg_lambda,
                                                self.lambdas,
                                                self.threshold,
                                                nlam=self.n_lambdas)
        else:
            X.sort_indices()
            # Fortran arrays are 1 indexed.
            ind_ptrs = X.indptr + 1
            indices = X.indices + 1
            # Call
            (self._out_n_lambdas, self._intercepts, self._comp_coef,
             self._p_comp_coef, self._n_comp_coef, self.null_dev, self.exp_dev,
             self.out_lambdas, self._n_passes,
             self._error_flag) = _glmnet.splognet(self.alpha,
                                                  X.shape[0],
                                                  X.shape[1],
                                                  f_n_classes,
                                                  X.data,
                                                  ind_ptrs,
                                                  indices,
                                                  y,
                                                  self.offsets,
                                                  self.excl_preds,
                                                  self.rel_penalties,
                                                  self.box_constraints,
                                                  self.max_vars_all,
                                                  self.frac_lg_lambda,
                                                  self.lambdas,
                                                  self.threshold,
                                                  nlam=self.n_lambdas)
        self._check_errors()
        # Keep some model metadata
        self._n_fit_obs, self._n_fit_params = X.shape
        # The indexes into the predictor array are off by one due to fortran
        # convention differing from numpy's, this makes them indexes into the
        # numpy array.  Trailing zeros (unused slots) are dropped first.
        self._indices = np.trim_zeros(self._p_comp_coef, 'b') - 1
        # Create a list of column names for the fit parameters, these can be
        # passed in, or attached to the matrix from patsy.  If none are found
        # we create our own generic ones.  Identity checks are used because
        # `!= None` on a numpy array compares element-wise and cannot be used
        # as a truth value.
        if col_names is not None:
            self._col_names = col_names
        elif design_info is not None:
            self._col_names = design_info.column_names
        else:
            self._col_names = [
                'var_' + str(i) for i in range(self._n_fit_params)
            ]
    def _fit(self, X, y, sample_weight=None, relative_penalties=None):
        """Fit the regularization path by calling glmnet's (sp)lognet.

        The labels are encoded as an indicator matrix (optionally scaled by
        the sample weights), the auxiliary arrays taken by the Fortran routine
        are assembled, the call is dispatched to ``splognet`` (sparse ``X``)
        or ``lognet`` (dense ``X``), and the returned path is trimmed and
        decompressed.

        Parameters
        ----------
        X : array or scipy sparse matrix
            Rows are samples, columns are features.
        y : numpy array
            Class labels; indexed as ``y[:, None]``, so it must already be
            an array.
        sample_weight : optional numpy array
            One weight per sample. Samples whose weight is not strictly
            positive are dropped; the indicator rows of the remaining samples
            are multiplied by their weight.
        relative_penalties : optional array
            Per-feature penalty factors. All ones when ``None``.

        Returns
        -------
        self
            With ``classes_``, ``n_lambda_``, ``lambda_path_``,
            ``intercept_path_``, ``coef_path_`` and ``jerr_`` set.

        Raises
        ------
        ValueError
            If ``y`` holds fewer than two distinct classes.
        """
        # A user-supplied lambda path dictates how many lambdas are fit; the
        # min-lambda ratio is then pinned to 1.0.
        user_path = self.lambda_path
        if user_path is None:
            n_lambda, min_lambda_ratio = self.n_lambda, self.min_lambda_ratio
        else:
            n_lambda, min_lambda_ratio = len(user_path), 1.0

        check_classification_targets(y)
        self.classes_ = np.unique(y)  # np.unique returns sorted values
        n_classes = len(self.classes_)
        if n_classes < 2:
            message = "Training data need to contain at least 2 classes."
            raise ValueError(message)

        # glmnet takes the labels as an (n_samples, n_classes) indicator
        # matrix. Comparing a column vector of labels against the row of
        # classes broadcasts to exactly that shape.
        labels = y[:, None]
        if n_classes == 2:
            # Binary case: classes_ is sorted, so the negative class would
            # land in column 0. Using != swaps the two columns so the fitted
            # model predicts the positive class.
            indicator = labels != self.classes_
        else:
            # Multinomial case: the whole matrix is used, order is kept.
            indicator = labels == self.classes_
        _y = indicator.astype(np.float64, order='F')

        # Sample weights, modelled on lognet.R from the R wrapper: keep only
        # the strictly positive weights and scale the matching indicator rows.
        if sample_weight is not None:
            keep = sample_weight > 0
            X = X[keep, :]
            _y = _y[keep, :] * np.expand_dims(sample_weight[keep], 1)

        n_features = X.shape[1]

        # Zero offsets, one row per remaining sample. n_classes is still the
        # true class count here (2 for a binary problem).
        offset = np.zeros((X.shape[0], n_classes), dtype=np.float64,
                          order='F')

        # A plain 0 is handed over as the exclusion argument -- presumably
        # "exclude nothing"; see doc.py to confirm.
        exclude_vars = 0

        # Relative penalty per feature. Useful when some variables must stay
        # in the model or are known to matter more than others, see
        # http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html
        if relative_penalties is None:
            relative_penalties = np.ones(n_features, dtype=np.float64,
                                         order='F')

        # Row 0 carries the lower limits, row 1 the upper limits.
        coef_bounds = np.empty((2, n_features), dtype=np.float64, order='F')
        coef_bounds[0] = self.lower_limits
        coef_bounds[1] = self.upper_limits

        # For a binary problem glmnet is told there is a single class.
        # Otherwise it would return two coefficient sets equal in magnitude
        # and opposite in sign, each one half of the proper values.
        glmnet_n_classes = 1 if n_classes == 2 else n_classes

        # Stopping criterion (nx). R defaults to nx = num_features and
        # ne = num_features + 1.
        configured = self.max_features
        max_features = n_features if configured is None else configured
        all_plus_one = n_features + 1

        # The argument order below is positional and dictated by the wrapped
        # Fortran routines (documented in doc.py); do not reorder.
        if issparse(X):
            _x = csc_matrix(X, dtype=np.float64, copy=True)
            n_rows, n_cols = _x.shape
            # NOTE(review): max_features is passed first and
            # X.shape[1] + 1 second here, whereas the dense call places
            # X.shape[1] + 1 in the earlier slot and max_features in the
            # later one. Whether this matches the (ne, nx) order of the
            # sparse wrapper cannot be told from this method -- verify
            # against doc.py.
            fit_out = splognet(
                self.alpha, n_rows, n_cols, glmnet_n_classes,
                _x.data,
                _x.indptr + 1,   # Fortran uses 1-based indexing
                _x.indices + 1,
                _y, offset, exclude_vars, relative_penalties, coef_bounds,
                max_features, all_plus_one,
                min_lambda_ratio, user_path, self.tol, n_lambda,
                self.standardize, self.fit_intercept, self.max_iter, 0)
        else:
            # glmnet needs float64 inputs and may overwrite them while
            # fitting, hence the explicit copy. Asking for Fortran layout up
            # front keeps the wrapper from making a second copy.
            _x = X.astype(dtype=np.float64, order='F', copy=True)
            fit_out = lognet(
                self.alpha, glmnet_n_classes, _x, _y, offset,
                exclude_vars, relative_penalties, coef_bounds,
                all_plus_one, min_lambda_ratio, user_path, self.tol,
                max_features, n_lambda,
                self.standardize, self.fit_intercept, self.max_iter, 0)

        # Slots 6, 7 and 9 (dev0, dev, nlp) are not kept.
        (self.n_lambda_, self.intercept_path_, ca, ia, nin,
         _, _, self.lambda_path_, _, jerr) = fit_out

        # The checker raises RuntimeError when the flag is nonzero.
        self.jerr_ = jerr
        _check_error_flag(self.jerr_)

        # glmnet may return fewer lambdas than requested; cut the trailing
        # padding off every returned array so they all have n_lambda_ entries
        # along the lambda axis. The first lambda value is repaired as well.
        n_kept = self.n_lambda_
        self.lambda_path_ = _fix_lambda_path(self.lambda_path_[:n_kept])
        self.intercept_path_ = self.intercept_path_[:, :n_kept]

        # Decompress the coefficients (see doc.py). nin is the number of
        # coefficients per lambda.
        coefs = lsolns(X.shape[1], ca[:, :, :n_kept], ia, nin[:n_kept])

        # lsolns yields (n_features, n_classes, n_lambda); scikit-learn
        # models use (n_classes, n_features, n_lambda).
        self.coef_path_ = np.transpose(coefs, axes=(1, 0, 2))

        return self
Beispiel #4
0
    def fit(self, X, y, col_names=None,
            lambdas=None, weights=None, rel_penalties=None,
            excl_preds=None, box_constraints=None, offsets=None,
            normalize=True,include_intercept=True):
        '''Fit a logistic or multinomial net model.


        Arguments:

          * X: The model matrix.  A n_obs * n_preds array.
          * y: The response.  This method accepts the response in two
            differnt configurations:

            - An n_obs * n_classes array.  In this case, each column in y must
              be of boolean (0, 1) type indicating whether the observation is
              or is not of a given class.
            - An n_obs array.  In this case the array must contain a discrete
              number of values, and is converted into the previous form before
              being passed to the model.

        Optional Arguments:

          * lambdas: 
              A user supplied list of lambdas, an elastic net will be fit for
              each lambda supplied.  If no array is passed, glmnet will generate
              its own array of lambdas equally spaced on a logaritmic scale 
              between \lambda_max and \lambda_min.
          * weights: 
               An n_obs array. Sample weights. It is an error to pass a weights
               array to a logistic model.
          * rel_penalties: 
              An n_preds array. Relative panalty weights for the covariates.  If
              none is passed, all covariates are penalized equally.  If an array
              is passed, then a zero indicates an unpenalized parameter, and a 1
              a fully penalized parameter.  Otherwise all covaraites recieve an
              equal penalty.
          * excl_preds: 
              An n_preds array, used to exclude covaraites from the model. To
              exclude predictors, pass an array with a 1 in the first position,
              then a 1 in the i+1st position excludes the ith covaraite from
              model fitting.  If no array is passed, all covaraites in X are 
              included in the model.
          * box_constraints: 
              An array with dimension 2 * n_obs. Interval constraints on the fit
              coefficients.  The (0, i) entry is a lower bound on the ith
              covariate, and the (1, i) entry is an upper bound.  These must 
              satisfy lower_bound <= 0 <= upper_bound.  If no array is passed,
              no box constraintes are allied to the parameters.
          * offsets: 
              A n_preds * n_classes array. Used as initial offsets for the
              model fitting. 

        After fitting, the following attributes are set:
        
        Private attributes:

          * _n_fit_obs:
              The number of rows in the model matrix X.
          * _n_fit_params:
              The number of columns in the model matrix X.
          * _out_n_lambdas: 
              The number of lambdas associated with non-zero models (i.e.
              models with at least one none zero parameter estiamte) after
              fitting; for large enough lambda the models will become zero in
              the presense of an L1 regularizer.
          * _intecepts: 
              A one dimensional array containing the intercept estiamtes for
              each value of lambda.  See the intercepts (no underscore) 
              property for a public version.
          * _comp_coef: 
              The fit parameter estiamtes in a compressed form.  This is a
              matrix with each row giving the estimates for a single
              coefficient for various values of \lambda.  The order of the rows
              does not correspond to the order of the coefficents as given in
              the design matrix used to fit the model, this association is
              given by the _p_comp_coef attribute.  Only estaimtes that are
              non-zero for some lambda are reported.
          * _p_comp_coef: 
              A one dimensional integer array associating the coefficients in
              _comp_coef to columns in the model matrix. 
          * _indices: 
              The same information as _p_comp_coef, but zero indexed to be
              compatable with numpy arrays.
          * _n_comp_coef: 
              The number of parameter estimates that are non-zero for some
              value of lambda.
          * _n_passes: 
              The total number of passes over the data used to fit the model.
          * _error_flag: 
              Error flag from the fortran code.

        Public Attributes:
          
          * null_dev: 
              The devaince of the null (mean) model.
          * exp_dev: 
              The devaince explained by the model.
          * out_lambdas: 
              An array containing the lambda values associated with each fit
              model.
        '''
        self._check_if_unfit()
        self._check_y(y)
        self._check_weights(weights)
        # Convert to arrays is native python objects
        try:
            if not issparse(X):
                X = np.asanyarray(X)
            y = np.asanyarray(y)
        except ValueError:
            raise ValueError("X and y must be wither numpy arrays, or "
                             "convertable to numpy arrays."
                  )
        # Fortran expects an n_obs * n_classes array for y.  If a one 
        # dimensional array is passed, we construct an appropriate widening. 
        # TODO: Save class names as attribute.
        y = np.asanyarray(y)
        if len(y.shape) == 1:
            y_classes = np.unique(y)
            y = np.float64(np.column_stack(y == c for c in y_classes))
        self._n_classes = y.shape[1]
        # Two class predictions are handled as a special case, as is usual 
        # with logistic models, this is signaled to fortran by passing in a
        # 1 for nc (num classes).
        if self._n_classes == 2:
            f_n_classes = np.array([1])
        else:
            f_n_classes = np.array([self._n_classes])
        # Grab the design info from patsy for later use, we are abbout to write
        # over this object in some cases.
        if hasattr(X, 'design_info'):
            design_info = X.design_info
        else:
            design_info = None
        # Make a copy if we are not able to overwrite X with its standardized 
        # version. Note that if X is not fortran contiguous, then it will be 
        # copied anyway.
        if not issparse(X) and np.isfortran(X) and not self.overwrite_pred_ok:
            X = X.copy(order='F')
        # Make a copy if we are not able to overwrite y with its standardized
        # version.
        if np.isfortran(y) and not self.overwrite_targ_ok:
            y = y.copy(order='F')
        # Validate all the inputs:
        self._validate_matrix(X)
        self._validate_inputs(X, y)
        self._validate_lambdas(X, y, lambdas)
        self._validate_rel_penalties(X, y, rel_penalties)
        self._validate_excl_preds(X, y, excl_preds)
        self._validate_box_constraints(X, y, box_constraints)
        self._validate_offsets(X, y, offsets)
        normalize         = int(normalize)         # probably not necessary
        include_intercept = int(include_intercept) # probably not necessary
        # Setup is complete, call the wrapper
        if not issparse(X):
            (self._out_n_lambdas,
            self._intercepts,
            self._comp_coef,
            self._p_comp_coef,
            self._n_comp_coef,
            self.null_dev,
            self.exp_dev,
            self.out_lambdas,
            self._n_passes,
            self._error_flag) = _glmnet.lognet(
                                    self.alpha, 
                                    f_n_classes,
                                    X,
                                    y, 
                                    self.offsets,
                                    self.excl_preds, 
                                    self.rel_penalties,
                                    self.box_constraints,
                                    self.max_vars_all, 
                                    self.frac_lg_lambda, 
                                    self.lambdas,
                                    self.threshold, 
                                    nlam=self.n_lambdas,
                                    isd= normalize,
                                    intr=include_intercept
                                )
        else:
            X.sort_indices()
            # Fortran arrays are 1 indexed.
            ind_ptrs = X.indptr + 1
            indices = X.indices + 1
            # Call
            (self._out_n_lambdas,
            self._intercepts,
            self._comp_coef,
            self._p_comp_coef,
            self._n_comp_coef,
            self.null_dev,
            self.exp_dev,
            self.out_lambdas,
            self._n_passes,
            self._error_flag) = _glmnet.splognet(
                                    self.alpha, 
                                    X.shape[0],
                                    X.shape[1],
                                    f_n_classes,
                                    X.data,
                                    ind_ptrs,
                                    indices,
                                    y, 
                                    self.offsets,
                                    self.excl_preds, 
                                    self.rel_penalties,
                                    self.box_constraints,
                                    self.max_vars_all, 
                                    self.frac_lg_lambda, 
                                    self.lambdas,
                                    self.threshold, 
                                    nlam=self.n_lambdas,
                                    isd= normalize,
                                    intr=include_intercept
                                )
        self._check_errors()
        # Keep some model metadata
        self._n_fit_obs, self._n_fit_params = X.shape
        # The indexes into the predictor array are off by one due to fortran
        # convention differing from numpys, this make them indexes into the the
        # numpy array. 
        self._indices = np.trim_zeros(self._p_comp_coef, 'b') - 1
        # Create a list of column names for the fit parameters, these can be
        # passed in, or attached to the matrix from patsy.  If none are found
        # we crate our own stupid ones.
        if col_names != None:
           self._col_names = col_names
        elif design_info != None:
            self._col_names = design_info.column_names
        else:
            self._col_names = [
                'var_' + str(i) for i in range(self._n_fit_params)
            ]
Beispiel #5
0
    def fit(self, X, y,
            lambdas=None, weights=None, rel_penalties=None,
            excl_preds=None, box_constraints=None, offsets=None):
        '''Fit a logistic or multinomial net model.

        Arguments:

          * X: The predictors.  A n_obs * n_preds array, or a scipy sparse
            matrix exposing `indptr`, `indices` and `data`.

          * y: The response.  This method accepts the response in two
            different configurations:

            - An n_obs * n_classes array.  In this case, each column in y must
              be a boolean flag indicating whether the observation is or is not
              of this class.
            - An n_obs array.  In this case the array must contain a discrete
              number of values, and is converted into the previous form before
              being passed to the model (one column per unique value, in the
              sorted order returned by np.unique).

        Optional Arguments:

          * lambdas: A user supplied list of lambdas, an elastic net will be
            fit for each lambda supplied.  If no array is passed, glmnet
            will generate its own array of lambdas.
          * weights: Not supported by this model; must be left as None.
          * rel_penalties: An n_preds array. Relative penalty weights for the
            covariates.  If none is passed, all covariates are penalized
            equally.  If an array is passed, then a zero indicates an
            unpenalized parameter, and a 1 a fully penalized parameter.
          * excl_preds: An n_preds array, used to exclude covariates from
            the model. To exclude predictors, pass an array with a 1 in the
            first position, then a 1 in the i+1st position excludes the ith
            covariate from model fitting.
          * box_constraints: An array with dimension 2 * n_preds. Interval
            constraints on the fit coefficients.  The (0, i) entry
            is a lower bound on the ith covariate, and the (1, i) entry is
            an upper bound.
          * offsets: A n_preds * n_classes array. Used as initial offsets for
            the model fitting.
            NOTE(review): the shape is as originally documented; it may be
            intended to be n_obs * n_classes -- confirm against
            _validate_offsets.

        Raises:

          * ValueError: if weights is not None, or if X or y cannot be
            converted to a numpy array.

        After fitting, the following attributes are set:

        Private attributes:

          * _out_n_lambdas: The number of fit lambdas associated with non-zero
            models; for large enough lambdas the models will become zero in the
            presence of an L1 regularizer.
          * _comp_coef: The fit coefficients in a compressed form.  Only
            coefficients that are non-zero for some lambda are reported, and
            the association between these parameters and the predictors is
            given by the _p_comp_coef attribute.
          * _p_comp_coef: An array associating the coefficients in _comp_coef to
            columns in the predictor array.
          * _indicies: The same information as _p_comp_coef, but zero indexed to
            be compatible with numpy arrays.
          * _n_comp_coef: The number of coefficients that are non-zero for some
            value of lambda.
          * _n_passes: The total number of passes over the data used to fit the
            model.
          * _error_flag: Error flag from the fortran code.

        Public Attributes:

          * null_dev: The deviance of the null model.
          * exp_dev: The deviance explained by the model.
          * out_lambdas: An array containing the lambda values associated with
            each fit model.
        '''
        if weights is not None:
            raise ValueError("LogisticNet cannot be fit with weights.")
        # Convert native python objects to arrays.  Sparse matrices are left
        # alone, they are handed to a dedicated fortran routine below.
        try:
            if not issparse(X):
                X = np.asanyarray(X)
            y = np.asanyarray(y)
        except ValueError:
            raise ValueError("X and y must be wither numpy arrays, or "
                             "convertable to numpy arrays."
                  )
        # Fortran expects an n_obs * n_classes array for y.  If a one
        # dimensional array is passed, we construct an appropriate widening.
        if y.ndim == 1:
            self.logistic = True
            y_classes = np.unique(y)
            # The indicator columns must be handed to column_stack as a real
            # sequence: generators were deprecated in NumPy 1.16 and are
            # rejected with a TypeError by current releases.
            y = np.column_stack(
                [y == c for c in y_classes]
            ).astype(np.float64)
        else:
            self.logistic = False
        # Count the number of classes in y.
        y_level_count = y.shape[1]
        # Two class predictions are handled as a special case, as is usual
        # with logistic models: a class count of 1 is passed to fortran.
        if y_level_count == 2:
            self.y_level_count = np.array([1])
        else:
            self.y_level_count = np.array([y_level_count])
        # Make a copy if we are not able to overwrite X with its standardized
        # version. Note that if X is not fortran contiguous, then it will be
        # copied anyway.
        if not issparse(X) and np.isfortran(X) and not self.overwrite_pred_ok:
            X = X.copy(order='F')
        # The target array will usually be overwritten with its standardized
        # version, if this is not ok, we should copy.
        if np.isfortran(y) and not self.overwrite_targ_ok:
            y = y.copy(order='F')
        # Validate all the inputs.  The fitting options passed to fortran
        # below are read back from attributes on self (presumably populated
        # by these validators -- confirm).
        self._validate_matrix(X)
        self._validate_inputs(X, y)
        self._validate_lambdas(X, y, lambdas)
        self._validate_weights(X, y, weights)
        self._validate_rel_penalties(X, y, rel_penalties)
        self._validate_excl_preds(X, y, excl_preds)
        self._validate_box_constraints(X, y, box_constraints)
        self._validate_offsets(X, y, offsets)
        # Setup is complete, call the wrapper.  Both routines return the same
        # ten values, so they are unpacked in one place afterwards.
        if not issparse(X):
            fit_results = _glmnet.lognet(
                self.alpha,
                self.y_level_count,
                X,
                y,
                self.offsets,
                self.excl_preds,
                self.rel_penalties,
                self.box_constraints,
                self.max_vars_all,
                self.frac_lg_lambda,
                self.lambdas,
                self.threshold,
                nlam=self.n_lambdas
            )
        else:
            # Fortran arrays are 1 indexed, shift the sparse index arrays.
            # NOTE(review): the index arrays are used as stored; if the
            # fortran routine needs sorted indices the caller must currently
            # provide them -- confirm.
            ind_ptrs = X.indptr + 1
            indices = X.indices + 1
            fit_results = _glmnet.splognet(
                self.alpha,
                X.shape[0],
                X.shape[1],
                self.y_level_count,
                X.data,
                ind_ptrs,
                indices,
                y,
                self.offsets,
                self.excl_preds,
                self.rel_penalties,
                self.box_constraints,
                self.max_vars_all,
                self.frac_lg_lambda,
                self.lambdas,
                self.threshold,
                nlam=self.n_lambdas
            )
        (self._out_n_lambdas,
         self._intercepts,
         self._comp_coef,
         self._p_comp_coef,
         self._n_comp_coef,
         self.null_dev,
         self.exp_dev,
         self.out_lambdas,
         self._n_passes,
         self._error_flag) = fit_results
        self._check_errors()
        # Keep some model metadata
        self._n_fit_obs, self._n_fit_params = X.shape
        # The indexes into the predictor array are off by one due to fortran
        # convention, fix it up.  Trailing zeros are unused slots in the
        # compressed index array and are dropped first.
        self._indicies = np.trim_zeros(self._p_comp_coef, 'b') - 1