def fit(self, x: ArrayLike): """Compute the minimum and maximum for scaling. Args: x: array-like of shape (n_samples, n_features) The data used to compute the per-feature minimum and maximum used for later scaling along the features axis. Returns: None. Updates the transformer with feature fitting parameters. """ x = np.array(x) self.mins_ = x.min(axis=0) self.maxs_ = x.max(axis=0) self.hinge_indices_ = np.linspace(self.mins_, self.maxs_, self.n_hinges_)
def _format_labels_and_dtypes(self, x: ArrayLike, categorical: list = None, labels: list = None) -> None:
    """Read input x data and lists of categorical data indices and band
    labels to format and store this info for later indexing.

    Args:
        x: array-like of shape (n_samples, n_features)
        categorical: indices indicating which x columns are categorical
        labels: covariate column labels. ignored if x is a pandas DataFrame
    """
    if isinstance(x, np.ndarray):
        nrows, ncols = x.shape
        if categorical is None:
            continuous = list(range(ncols))
        else:
            continuous = list(set(range(ncols)).difference(set(categorical)))
        self.labels_ = labels or make_band_labels(ncols)
        self.categorical_ = categorical
        self.continuous_ = continuous

    elif isinstance(x, pd.DataFrame):
        x.drop(["geometry"], axis=1, errors="ignore", inplace=True)
        self.labels_ = labels or list(x.columns)

        # store both pandas and numpy indexing of these values
        self.continuous_pd_ = list(x.select_dtypes(exclude="category").columns)
        self.categorical_pd_ = list(x.select_dtypes(include="category").columns)

        all_columns = list(x.columns)
        self.continuous_ = [all_columns.index(item) for item in self.continuous_pd_ if item in all_columns]
        if len(self.categorical_pd_) != 0:
            self.categorical_ = [all_columns.index(item) for item in self.categorical_pd_ if item in all_columns]
        else:
            self.categorical_ = None
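# Usage sketch (illustrative): how pandas dtype selection splits continuous
# from categorical columns, matching the DataFrame branch above. The column
# names here are hypothetical.
import pandas as pd

df = pd.DataFrame({
    "elevation": [120.5, 98.2, 301.0],
    "landcover": pd.Categorical(["forest", "urban", "forest"]),
})
continuous_cols = list(df.select_dtypes(exclude="category").columns)   # ["elevation"]
categorical_cols = list(df.select_dtypes(include="category").columns)  # ["landcover"]

# integer column indices are recovered by position, as in the method above
all_columns = list(df.columns)
categorical_idx = [all_columns.index(c) for c in categorical_cols]     # [1]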
def fit(self, x: ArrayLike): """Compute the minimum and maximum for scaling. Args: x: array-like of shape (n_samples, n_features) The data used to compute the per-feature minimum and maximum used for later scaling along the features axis. Returns: None. Updates the transformer with feature fitting parameters. """ self.estimators_ = [] x = np.array(x) if x.ndim == 1: estimator = OneHotEncoder(dtype=np.uint8, sparse=False) self.estimators_.append(estimator.fit(x.reshape(-1, 1))) else: nrows, ncols = x.shape for col in range(ncols): xsub = x[:, col].reshape(-1, 1) estimator = OneHotEncoder(dtype=np.uint8, sparse=False) self.estimators_.append(estimator.fit(xsub))
def transform(self, x: ArrayLike) -> np.ndarray:
    """One-hot encode covariates using the fitted per-column encoders.

    Args:
        x: array-like of shape (n_samples, n_features)
            Input data that will be transformed.

    Returns:
        ndarray with transformed data.
    """
    x = np.array(x)
    if x.ndim == 1:
        estimator = self.estimators_[0]
        return estimator.transform(x.reshape(-1, 1))
    else:
        class_data = []
        nrows, ncols = x.shape
        for col in range(ncols):
            xsub = x[:, col].reshape(-1, 1)
            estimator = self.estimators_[col]
            class_data.append(estimator.transform(xsub))
        return np.concatenate(class_data, axis=1)
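# Usage sketch (assumption: the fit/transform pair above belongs to a
# categorical one-hot transformer; the logic is reproduced inline here rather
# than via the class). Each column gets its own encoder, and the per-column
# one-hot blocks are concatenated along the feature axis.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

x_cat = np.array([[0, 2], [1, 2], [0, 3]])
encoded = []
for col in range(x_cat.shape[1]):
    # `sparse=False` mirrors the code above; scikit-learn >= 1.2 renames this
    # parameter to `sparse_output`
    enc = OneHotEncoder(dtype=np.uint8, sparse=False)
    encoded.append(enc.fit_transform(x_cat[:, col].reshape(-1, 1)))
out = np.concatenate(encoded, axis=1)
# out has 2 + 2 = 4 columns: two classes from column 0, two from column 1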
def fit(
    self,
    x: ArrayLike,
    y: ArrayLike,
    categorical: List[int] = None,
    labels: list = None,
    is_features: bool = False,
    feature_labels: list = None,
) -> None:
    """Trains a maxent model using a set of covariates and presence/background points.

    Args:
        x: array-like of shape (n_samples, n_features) with covariate data
        y: array-like of shape (n_samples,) with binary presence/background (1/0) values
        categorical: indices for which columns are categorical
        labels: covariate labels. ignored if x is a pandas DataFrame
        is_features: specify that x data has been transformed from covariates to features
        feature_labels: list of length n_features, with labels identifying each column's feature type
            with options ["linear", "quadratic", "product", "threshold", "hinge", "categorical"]
            must be set if `is_features=True`
    """
    # fit the feature transformer
    if is_features:
        features = x
        assert feature_labels is not None, "feature_labels must be set if is_features=True"
    else:
        self.transformer = _features.MaxentFeatureTransformer(
            feature_types=self.feature_types_,
            clamp=self.clamp_,
            n_hinge_features=self.n_hinge_features_,
            n_threshold_features=self.n_threshold_features_,
        )
        features = self.transformer.fit_transform(x, categorical=categorical, labels=labels)
        feature_labels = self.transformer.feature_names_

    # compute sample weights
    if self.weight_strategy_ == "balance":
        pbr = len(y) / y.sum()
    else:
        pbr = self.weight_strategy_
    self.weights_ = _features.compute_weights(y, pbr=pbr)

    # set feature regularization parameters
    self.regularization_ = _features.compute_regularization(
        y,
        features,
        feature_labels=feature_labels,
        beta_multiplier=self.beta_multiplier_,
        beta_threshold=self.beta_threshold_,
        beta_hinge=self.beta_hinge_,
        beta_categorical=self.beta_categorical_,
    )

    # get model lambda scores to initialize the glm
    self.lambdas_ = _features.compute_lambdas(y, self.weights_, self.regularization_)

    # model fitting
    self.initialize_model(lambdas=self.lambdas_)
    self.estimator.fit(
        features,
        y,
        sample_weight=self.weights_,
        relative_penalties=self.regularization_,
    )

    # get the beta values based on which lambda selection method to use
    if self.use_lambdas_ == "last":
        self.beta_scores_ = self.estimator.coef_path_[0, :, -1]
    elif self.use_lambdas_ == "best":
        self.beta_scores_ = self.estimator.coef_path_[0, :, self.estimator.lambda_max_inx_]

    # apply maxent-specific transformations
    raw = self.predict(features[y == 0], transform="raw", is_features=True)

    # alpha is a normalizing constant that ensures that f1(z) integrates (sums) to 1
    self.alpha_ = maxent_alpha(raw)

    # the distance from f(z) is the relative entropy of f1(z) WRT f(z)
    self.entropy_ = maxent_entropy(raw)
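# Usage sketch (assumption: the fit() method above belongs to a maxent
# estimator class; `MaxentModel` is a stand-in name, and the presence and
# background samples below are fabricated for illustration).
import numpy as np

rng = np.random.default_rng(0)
presence = rng.normal(loc=1.0, size=(50, 3))
background = rng.normal(loc=0.0, size=(500, 3))

x = np.vstack([presence, background])
y = np.concatenate([np.ones(50), np.zeros(500)])  # 1 = presence, 0 = background

model = MaxentModel()
model.fit(x, y)
# after fitting, model.beta_scores_ holds the selected glm coefficients, and
# model.alpha_ / model.entropy_ store the maxent normalization terms computed
# from the raw background predictions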