class BaseNeuralNetwork(six.with_metaclass(ABCMeta, BaseEstimator)): """Base class for neural networks. Warning: This class should not be used directly. Use derived classes instead. """ @abstractmethod def __init__(self, hidden_nodes=[10], activation='relu', algorithm='random_hill_climb', max_iters=100, bias=True, learning_rate=0.1, early_stopping=False, clip_max=1e+10, schedule=GeomDecay(), pop_size=200, mutation_prob=0.1, max_attempts=10, restarts=0, curve=True): self.hidden_nodes = hidden_nodes self.activation_dict = { 'identity': identity, 'relu': relu, 'sigmoid': sigmoid, 'tanh': tanh } self.activation = activation self.algorithm = algorithm self.max_iters = max_iters self.bias = bias self.learning_rate = learning_rate self.early_stopping = early_stopping self.clip_max = clip_max self.schedule = schedule self.pop_size = pop_size self.mutation_prob = mutation_prob self.max_attempts = max_attempts self.curve = curve self.restarts = restarts self.node_list = [] self.fitted_weights = [] self.loss = np.inf self.output_activation = None self.predicted_probs = [] self.fitness_curve = [] @abstractmethod def _is_classifier(self): pass def _validate(self): if (not isinstance(self.max_iters, int) and self.max_iters != np.inf and not self.max_iters.is_integer()) or (self.max_iters < 0): raise Exception("""max_iters must be a positive integer.""") if not isinstance(self.bias, bool): raise Exception("""bias must be True or False.""") if self.learning_rate <= 0: raise Exception("""learning_rate must be greater than 0.""") if not isinstance(self.early_stopping, bool): raise Exception("""early_stopping must be True or False.""") if self.clip_max <= 0: raise Exception("""clip_max must be greater than 0.""") if (not isinstance(self.max_attempts, int) and not self.max_attempts.is_integer()) or (self.max_attempts < 0): raise Exception("""max_attempts must be a positive integer.""") if self.pop_size < 0: raise Exception("""pop_size must be a positive integer.""") elif not isinstance(self.pop_size, int): if self.pop_size.is_integer(): self.pop_size = int(self.pop_size) else: raise Exception("""pop_size must be a positive integer.""") if (self.mutation_prob < 0) or (self.mutation_prob > 1): raise Exception("""mutation_prob must be between 0 and 1.""") if self.activation is None or self.activation not in self.activation_dict.keys( ): raise Exception("""Activation function must be one of: 'identity', 'relu', 'sigmoid' or 'tanh'.""") if self.algorithm not in [ 'random_hill_climb', 'simulated_annealing', 'genetic_alg', 'gradient_descent' ]: raise Exception("""Algorithm must be one of: 'random_hill_climb', 'simulated_annealing', 'genetic_alg', 'gradient_descent'.""" ) def fit(self, X, y=None, init_weights=None): """Fit neural network to data. Parameters ---------- X: array Numpy array containing feature dataset with each row representing a single observation. y: array Numpy array containing data labels. Length must be same as length of X. init_state: array, default: None Numpy array containing starting weights for algorithm. If :code:`None`, then a random state is used. 
""" self._validate() # Make sure y is an array and not a list y = np.array(y) # Convert y to 2D if necessary if len(np.shape(y)) == 1: y = np.reshape(y, [len(y), 1]) # Verify X and y are the same length if not np.shape(X)[0] == np.shape(y)[0]: raise Exception('The length of X and y must be equal.') # Determine number of nodes in each layer input_nodes = np.shape(X)[1] + self.bias output_nodes = np.shape(y)[1] node_list = [input_nodes] + self.hidden_nodes + [output_nodes] num_nodes = 0 for i in range(len(node_list) - 1): num_nodes += node_list[i] * node_list[i + 1] if init_weights is not None and len(init_weights) != num_nodes: raise Exception("""init_weights must be None or have length %d""" % (num_nodes, )) # Initialize optimization problem fitness = NetworkWeights(X, y, node_list, self.activation_dict[self.activation], self.bias, self._is_classifier(), learning_rate=self.learning_rate) problem = ContinuousOpt(num_nodes, fitness, maximize=False, min_val=-1 * self.clip_max, max_val=self.clip_max, step=self.learning_rate) fitness_curve = [] if self.algorithm == 'random_hill_climb': if init_weights is None: init_weights = np.random.uniform(-1, 1, num_nodes) if self.curve: fitted_weights, loss, fitness_curve = random_hill_climb( problem, max_attempts=self.max_attempts, max_iters=self.max_iters, restarts=self.restarts, init_state=init_weights, curve=self.curve) else: fitted_weights, loss = random_hill_climb( problem, max_attempts=self.max_attempts, max_iters=self.max_iters, restarts=self.restarts, init_state=init_weights, curve=self.curve) elif self.algorithm == 'simulated_annealing': if init_weights is None: init_weights = np.random.uniform(-1, 1, num_nodes) if self.curve: fitted_weights, loss, fitness_curve = simulated_annealing( problem, schedule=self.schedule, max_attempts=self.max_attempts, max_iters=self.max_iters, init_state=init_weights, curve=self.curve) else: fitted_weights, loss = simulated_annealing( problem, schedule=self.schedule, max_attempts=self.max_attempts, max_iters=self.max_iters, init_state=init_weights, curve=self.curve) elif self.algorithm == 'genetic_alg': if self.curve: fitted_weights, loss, fitness_curve = genetic_alg( problem, pop_size=self.pop_size, mutation_prob=self.mutation_prob, max_attempts=self.max_attempts, max_iters=self.max_iters, curve=self.curve) else: fitted_weights, loss = genetic_alg( problem, pop_size=self.pop_size, mutation_prob=self.mutation_prob, max_attempts=self.max_attempts, max_iters=self.max_iters, curve=self.curve) else: # Gradient descent case if init_weights is None: init_weights = np.random.uniform(-1, 1, num_nodes) if self.curve: fitted_weights, loss, fitness_curve = gradient_descent( problem, max_attempts=self.max_attempts, max_iters=self.max_iters, init_state=init_weights, curve=self.curve) else: fitted_weights, loss = gradient_descent( problem, max_attempts=self.max_attempts, max_iters=self.max_iters, init_state=init_weights, curve=self.curve) # Save fitted weights and node list self.node_list = node_list self.fitted_weights = fitted_weights self.loss = loss if self.curve: self.fitness_curve = fitness_curve self.output_activation = fitness.get_output_activation() return self def predict(self, X, y=None): """Use model to predict data labels for given feature array. Parameters ---------- X: array Numpy array containing feature dataset with each row representing a single observation. Returns ------- y_pred: array Numpy array containing predicted data labels. 
""" if not np.shape(X)[1] == (self.node_list[0] - self.bias): raise Exception("""The number of columns in X must equal %d""" % ((self.node_list[0] - self.bias), )) weights = unflatten_weights(self.fitted_weights, self.node_list) # Add bias column to inputs matrix, if required if self.bias: ones = np.ones([np.shape(X)[0], 1]) inputs = np.hstack((X, ones)) else: inputs = X # Pass data through network for i in range(len(weights)): # Multiple inputs by weights outputs = np.dot(inputs, weights[i]) # Transform outputs to get inputs for next layer (or final preds) if i < len(weights) - 1: inputs = self.activation_dict[self.activation](outputs) else: y_pred = self.output_activation(outputs) # For classifier, convert predicted probabilities to 0-1 labels if self._is_classifier(): self.predicted_probs = y_pred if self.node_list[-1] == 1: y_pred = np.round(y_pred).astype(int) else: zeros = np.zeros_like(y_pred) zeros[np.arange(len(y_pred)), np.argmax(y_pred, axis=1)] = 1 y_pred = zeros.astype(int) return y_pred
class BaseMixture(six.with_metaclass(ABCMeta, DensityMixin, BaseEstimator)): """Base class for mixture models. This abstract class specifies an interface for all mixture classes and provides basic common methods for mixture models. """ def __init__(self, n_components, tol, reg_covar, max_iter, n_init, init_params, random_state, warm_start, verbose, verbose_interval): self.n_components = n_components self.tol = tol self.reg_covar = reg_covar self.max_iter = max_iter self.n_init = n_init self.init_params = init_params self.random_state = random_state self.warm_start = warm_start self.verbose = verbose self.verbose_interval = verbose_interval def _check_initial_parameters(self, X): """Check values of the basic parameters. Parameters ---------- X : array-like, shape (n_samples, n_features) """ if self.n_components < 1: raise ValueError("Invalid value for 'n_components': %d " "Estimation requires at least one component" % self.n_components) if self.tol < 0.: raise ValueError("Invalid value for 'tol': %.5f " "Tolerance used by the EM must be non-negative" % self.tol) if self.n_init < 1: raise ValueError("Invalid value for 'n_init': %d " "Estimation requires at least one run" % self.n_init) if self.max_iter < 1: raise ValueError("Invalid value for 'max_iter': %d " "Estimation requires at least one iteration" % self.max_iter) if self.reg_covar < 0.: raise ValueError("Invalid value for 'reg_covar': %.5f " "regularization on covariance must be " "non-negative" % self.reg_covar) # Check all the parameters values of the derived class self._check_parameters(X) @abstractmethod def _check_parameters(self, X): """Check initial parameters of the derived class. Parameters ---------- X : array-like, shape (n_samples, n_features) """ pass def _initialize_parameters(self, X, random_state): """Initialize the model parameters. Parameters ---------- X : array-like, shape (n_samples, n_features) random_state : RandomState A random number generator instance. """ n_samples, _ = X.shape if self.init_params == 'kmeans': resp = np.zeros((n_samples, self.n_components)) label = cluster.KMeans(n_clusters=self.n_components, n_init=1, random_state=random_state).fit(X).labels_ resp[np.arange(n_samples), label] = 1 elif self.init_params == 'random': resp = random_state.rand(n_samples, self.n_components) resp /= resp.sum(axis=1)[:, np.newaxis] else: raise ValueError("Unimplemented initialization method '%s'" % self.init_params) self._initialize(X, resp) @abstractmethod def _initialize(self, X, resp): """Initialize the model parameters of the derived class. Parameters ---------- X : array-like, shape (n_samples, n_features) resp : array-like, shape (n_samples, n_components) """ pass def fit(self, X,bootw, y=None): """Estimate model parameters with the EM algorithm. The method fits the model ``n_init`` times and sets the parameters with which the model has the largest likelihood or lower bound. Within each trial, the method iterates between E-step and M-step for ``max_iter`` times until the change of likelihood or lower bound is less than ``tol``, otherwise, a ``ConvergenceWarning`` is raised. If ``warm_start`` is ``True``, then ``n_init`` is ignored and a single initialization is performed upon the first call. Upon consecutive calls, training starts where it left off. Parameters ---------- X : array-like, shape (n_samples, n_features) List of n_features-dimensional data points. Each row corresponds to a single data point. 
Returns ------- self """ self.fit_predict(X,bootw, y) return self def fit_predict(self, X, bootw,y=None): """Estimate model parameters using X and predict the labels for X. The method fits the model n_init times and sets the parameters with which the model has the largest likelihood or lower bound. Within each trial, the method iterates between E-step and M-step for `max_iter` times until the change of likelihood or lower bound is less than `tol`, otherwise, a `ConvergenceWarning` is raised. After fitting, it predicts the most probable label for the input data points. .. versionadded:: 0.20 Parameters ---------- X : array-like, shape (n_samples, n_features) List of n_features-dimensional data points. Each row corresponds to a single data point. Returns ------- labels : array, shape (n_samples,) Component labels. """ X = _check_X(X, self.n_components, ensure_min_samples=2) self._check_initial_parameters(X) # if we enable warm_start, we will have a unique initialisation do_init = not(self.warm_start and hasattr(self, 'converged_')) n_init = self.n_init if do_init else 1 max_lower_bound = -np.infty self.converged_ = False random_state = check_random_state(self.random_state) n_samples, _ = X.shape for init in range(n_init): self._print_verbose_msg_init_beg(init) if do_init: self._initialize_parameters(X, random_state) lower_bound = (-np.infty if do_init else self.lower_bound_) for n_iter in range(1, self.max_iter + 1): prev_lower_bound = lower_bound log_prob_norm, log_resp = self._e_step(X,bootw) self._m_step(X, log_resp,bootw) lower_bound = self._compute_lower_bound( log_resp, log_prob_norm) change = lower_bound - prev_lower_bound self._print_verbose_msg_iter_end(n_iter, change) if abs(change) < self.tol: self.converged_ = True break self._print_verbose_msg_init_end(lower_bound) if lower_bound > max_lower_bound: max_lower_bound = lower_bound best_params = self._get_parameters() best_n_iter = n_iter # Always do a final e-step to guarantee that the labels returned by # fit_predict(X) are always consistent with fit(X).predict(X) # for any value of max_iter and tol (and any random_state). _, log_resp = self._e_step(X,bootw) if not self.converged_: warnings.warn('Initialization %d did not converge. ' 'Try different init parameters, ' 'or increase max_iter, tol ' 'or check for degenerate data.' % (init + 1), ConvergenceWarning) self._set_parameters(best_params) self.n_iter_ = best_n_iter self.lower_bound_ = max_lower_bound return log_resp.argmax(axis=1) def _e_step(self, X,bootw): """E step. Parameters ---------- X : array-like, shape (n_samples, n_features) Returns ------- log_prob_norm : float Mean of the logarithms of the probabilities of each sample in X log_responsibility : array, shape (n_samples, n_components) Logarithm of the posterior probabilities (or responsibilities) of the point of each sample in X. """ log_prob_norm, log_resp = self._estimate_log_prob_resp(X) return np.mean(bootw*log_prob_norm), log_resp @abstractmethod def _m_step(self, X, log_resp, bootw): """M step. Parameters ---------- X : array-like, shape (n_samples, n_features) log_resp : array-like, shape (n_samples, n_components) Logarithm of the posterior probabilities (or responsibilities) of the point of each sample in X. """ pass @abstractmethod def _check_is_fitted(self): pass @abstractmethod def _get_parameters(self): pass @abstractmethod def _set_parameters(self, params): pass def score_samples(self, X): """Compute the weighted log probabilities for each sample. 
Parameters ---------- X : array-like, shape (n_samples, n_features) List of n_features-dimensional data points. Each row corresponds to a single data point. Returns ------- log_prob : array, shape (n_samples,) Log probabilities of each data point in X. """ self._check_is_fitted() X = _check_X(X, None, self.means_.shape[1]) return logsumexp(self._estimate_weighted_log_prob(X), axis=1) def score_lppd(self,X,y=None): return (self.score_samples(X)) def score(self, X,bootw, y=None): """Compute the per-sample average log-likelihood of the given data X. Parameters ---------- X : array-like, shape (n_samples, n_dimensions) List of n_features-dimensional data points. Each row corresponds to a single data point. Returns ------- log_likelihood : float Log likelihood of the Gaussian mixture given X. """ return (self.score_samples(X)*bootw).mean() def predict(self, X): """Predict the labels for the data samples in X using trained model. Parameters ---------- X : array-like, shape (n_samples, n_features) List of n_features-dimensional data points. Each row corresponds to a single data point. Returns ------- labels : array, shape (n_samples,) Component labels. """ self._check_is_fitted() X = _check_X(X, None, self.means_.shape[1]) return self._estimate_weighted_log_prob(X).argmax(axis=1) def predict_proba(self, X): """Predict posterior probability of each component given the data. Parameters ---------- X : array-like, shape (n_samples, n_features) List of n_features-dimensional data points. Each row corresponds to a single data point. Returns ------- resp : array, shape (n_samples, n_components) Returns the probability each Gaussian (state) in the model given each sample. """ self._check_is_fitted() X = _check_X(X, None, self.means_.shape[1]) _, log_resp = self._estimate_log_prob_resp(X) return np.exp(log_resp) def sample(self, n_samples=1): """Generate random samples from the fitted Gaussian distribution. Parameters ---------- n_samples : int, optional Number of samples to generate. Defaults to 1. Returns ------- X : array, shape (n_samples, n_features) Randomly generated sample y : array, shape (nsamples,) Component labels """ self._check_is_fitted() if n_samples < 1: raise ValueError( "Invalid value for 'n_samples': %d . The sampling requires at " "least one sample." % (self.n_components)) _, n_features = self.means_.shape rng = check_random_state(self.random_state) n_samples_comp = rng.multinomial(n_samples, self.weights_) if self.covariance_type == 'full': X = np.vstack([ rng.multivariate_normal(mean, covariance, int(sample)) for (mean, covariance, sample) in zip( self.means_, self.covariances_, n_samples_comp)]) elif self.covariance_type == "tied": X = np.vstack([ rng.multivariate_normal(mean, self.covariances_, int(sample)) for (mean, sample) in zip( self.means_, n_samples_comp)]) else: X = np.vstack([ mean + rng.randn(sample, n_features) * np.sqrt(covariance) for (mean, covariance, sample) in zip( self.means_, self.covariances_, n_samples_comp)]) y = np.concatenate([np.full(sample, j, dtype=int) for j, sample in enumerate(n_samples_comp)]) return (X, y) def _estimate_weighted_log_prob(self, X): """Estimate the weighted log-probabilities, log P(X | Z) + log weights. Parameters ---------- X : array-like, shape (n_samples, n_features) Returns ------- weighted_log_prob : array, shape (n_samples, n_component) """ return self._estimate_log_prob(X) + self._estimate_log_weights() @abstractmethod def _estimate_log_weights(self): """Estimate log-weights in EM algorithm, E[ log pi ] in VB algorithm. 
Returns ------- log_weight : array, shape (n_components, ) """ pass @abstractmethod def _estimate_log_prob(self, X): """Estimate the log-probabilities log P(X | Z). Compute the log-probabilities per each component for each sample. Parameters ---------- X : array-like, shape (n_samples, n_features) Returns ------- log_prob : array, shape (n_samples, n_component) """ pass def _estimate_log_prob_resp(self, X): """Estimate log probabilities and responsibilities for each sample. Compute the log probabilities, weighted log probabilities per component and responsibilities for each sample in X with respect to the current state of the model. Parameters ---------- X : array-like, shape (n_samples, n_features) Returns ------- log_prob_norm : array, shape (n_samples,) log p(X) log_responsibilities : array, shape (n_samples, n_components) logarithm of the responsibilities """ weighted_log_prob = self._estimate_weighted_log_prob(X) log_prob_norm = logsumexp(weighted_log_prob, axis=1) with np.errstate(under='ignore'): # ignore underflow log_resp = weighted_log_prob - log_prob_norm[:, np.newaxis] return log_prob_norm, log_resp def _print_verbose_msg_init_beg(self, n_init): """Print verbose message on initialization.""" if self.verbose == 1: print("Initialization %d" % n_init) elif self.verbose >= 2: print("Initialization %d" % n_init) self._init_prev_time = time() self._iter_prev_time = self._init_prev_time def _print_verbose_msg_iter_end(self, n_iter, diff_ll): """Print verbose message on initialization.""" if n_iter % self.verbose_interval == 0: if self.verbose == 1: print(" Iteration %d" % n_iter) elif self.verbose >= 2: cur_time = time() print(" Iteration %d\t time lapse %.5fs\t ll change %.5f" % ( n_iter, cur_time - self._iter_prev_time, diff_ll)) self._iter_prev_time = cur_time def _print_verbose_msg_init_end(self, ll): """Print verbose message on the end of iteration.""" if self.verbose == 1: print("Initialization converged: %s" % self.converged_) elif self.verbose >= 2: print("Initialization converged: %s\t time lapse %.5fs\t ll %.5f" % (self.converged_, time() - self._init_prev_time, ll))
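# Illustrative sketch (not part of the class above): the main departure from the stock
# scikit-learn mixture base class is the extra bootw argument, a length-n_samples array
# of non-negative (e.g. bootstrap) weights that scales each sample's contribution to the
# E-step lower bound and to score(). The weighting amounts to:
import numpy as np
from scipy.special import logsumexp

def weighted_mean_log_likelihood(weighted_log_prob, bootw):
    """weighted_log_prob: (n_samples, n_components) of log P(x | z) + log pi_z;
    bootw: (n_samples,) sample weights."""
    log_prob_norm = logsumexp(weighted_log_prob, axis=1)   # log p(x) per sample
    return np.mean(bootw * log_prob_norm)                  # as in _e_step() and score()

# With bootw = np.ones(n_samples) this reduces to the usual per-sample average
# log-likelihood of the unweighted EM lower bound.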
class BaseNB(six.with_metaclass(ABCMeta, BaseEstimator)): _estimator_type = "classifier" def __init__(self): self.is_fitted = False self.classes_ = None self.class_count_ = None # Properties @property def complement_class_count_(self): ''' Complement class count, i.e. number of occurrences of all the samples with all the classes except the given class c ''' from bayes.utils import get_complement_matrix size = len(self.class_count_) return self.class_count_.dot(get_complement_matrix(size)) @property def complement_class_log_proba_(self): ''' Complement class probability, i.e. logprob of occurrence of a sample, which does not belong to the given class c ''' all_samples_count = np.float64(np.sum(self.class_count_)) return np.log(self.complement_class_count_ / all_samples_count) @property def class_log_proba_(self): ''' Log probability of class occurrence ''' all_samples_count = np.float64(np.sum(self.class_count_)) return np.log(self.class_count_ / all_samples_count) # Fitting model def fit(self, X, y): ''' Fit model to given training set Parameters ---------- X : array-like, shape (n_samples, n_features) Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape (n_samples,) Target values. Returns ------- self : Naive Bayes estimator object Returns self. ''' self._reset() self._partial_fit(X, y) return self @abstractmethod def partial_fit(self, X, y, classes=None): """ Incremental fit on a batch of samples. Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. classes : array-like, shape = [n_classes], optional (default=None) List of all the classes that can possibly appear in the y vector. Must be provided at the first call to partial_fit, can be omitted in subsequent calls. Returns ------- self : object Returns self. """ @abstractmethod def _partial_fit(self, X, y, classes=None, first_partial_fit=None): '''''' @abstractmethod def predict(self, X): """ Perform classification on an array of test vectors X. Parameters ---------- X : array-like, shape = [n_samples, n_features] Unseen samples vector Returns ------- C : array, shape = [n_samples] Predicted target values for X """ def _update_complement_features(self, X, y_one_hot): ''' Compute complement features counts Parameters ---------- X: numpy array (n_samples, n_features) Matrix of input samples y_one_hot: numpy array (n_samples, n_classes) Binary matrix encoding input ''' # FIXME: complement_features nomenclature is incoherent if not self.is_fitted: self.complement_features = X.T.dot(np.logical_not(y_one_hot)) else: self.complement_features += X.T.dot(np.logical_not(y_one_hot)) def _update_features(self, X, y_one_hot): ''' Compute features counts Parameters ---------- X: numpy array (n_samples, n_features) Matrix of input samples y_one_hot: numpy array (n_samples, n_classes) Binary matrix encoding input ''' if not self.is_fitted: self.features_ = X.T.dot(y_one_hot) else: self.features_ += X.T.dot(y_one_hot) @abstractmethod def predict_log_proba(self, X): """ Return log-probability estimates for the test vector X. Parameters ---------- X : array-like, shape = [n_samples, n_features] Returns ------- C : array-like, shape = [n_samples, n_classes] Returns the log-probability of the samples for each class in the model. 
            The columns correspond to the classes in sorted order, as they
            appear in the attribute `classes_`.
        """

    @abstractmethod
    def _reset(self):
        ''''''

    def predict_proba(self, X):
        """
        Return probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        C : array-like, shape = [n_samples, n_classes]
            Returns the probability of the samples for each class in the
            model. The columns correspond to the classes in sorted order,
            as they appear in the attribute `classes_`.
        """
        # TODO: Handle float exponent error
        return np.exp(self.predict_log_proba(X))

    # Scores
    def accuracy_score(self, X, y):
        '''
        Return accuracy score

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        accuracy_score: float
            Accuracy on the given test set
        '''
        self._check_is_fitted()
        return accuracy_score(y, self.predict(X))

    # def f1_score(self, X, y):
    #     self._check_is_fitted()
    #     return f1_score(y, self.predict(X))
    #
    # def precision_score(self, X, y):
    #     self._check_is_fitted()
    #     return precision_score(y, self.predict(X))
    #
    # def recall_score(self, X, y):
    #     self._check_is_fitted()
    #     return recall_score(y, self.predict(X))
    #
    # def roc_auc_score(self, X, y):
    #     self._check_is_fitted()
    #     return roc_auc_score(y, self.predict(X))

    # Checking params & states
    def _check_is_fitted(self):
        if not self.is_fitted:
            raise NotFittedError

    def _check_alpha_param(self):
        if self.alpha == 0.0:
            warnings.warn(
                'Alpha should not be zero. It may cause division by zero',
                AlphaZeroWarning)

    def _not_implemented_yet(self, message):
        warnings.warn(NotImplementedYet(message))

    # def safe_mult(self, input_array, internal_array):
    #     if isinstance(input_array, csr_matrix):
    #         input_array = input_array.toarray()
    #     return input_array * internal_array

    def safe_matmult(self, input_array, internal_array):
        if isinstance(input_array, csr_matrix):
            input_array = input_array.toarray()
        return input_array.dot(internal_array.T)
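# Illustrative sketch (not part of the class above): _update_features and
# _update_complement_features accumulate per-class feature counts with a single matrix
# product against the one-hot label matrix. Hypothetical toy counts:
import numpy as np

X = np.array([[2, 0, 1],       # three samples with three token counts each
              [0, 1, 3],
              [1, 1, 0]])
y_one_hot = np.array([[1, 0],  # sample 0 -> class 0
                      [0, 1],  # sample 1 -> class 1
                      [1, 0]]) # sample 2 -> class 0

features = X.T.dot(y_one_hot)                    # (n_features, n_classes) class counts
complement = X.T.dot(np.logical_not(y_one_hot))  # counts over every *other* class
# features[:, 0] == [3, 1, 1] (class 0) and features[:, 1] == [0, 1, 3] (class 1)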
class BaseRandomNN(six.with_metaclass(ABCMeta, BaseEstimator)): """Base class for Random Neural Network classification and regression. Warning: This class should not be used directly. Use derived classes instead. """ @abstractmethod def __init__(self, n_hidden, activation, C, class_weight, weight_scale, batch_size, verbose, warm_start, random_state): self.C = C self.activation = activation self.class_weight = class_weight self.weight_scale = weight_scale self.batch_size = batch_size self.n_hidden = n_hidden self.verbose = verbose self.warm_start = warm_start self.random_state = random_state def _init_weights(self, n_features): """Initialize the parameter weights.""" rng = check_random_state(self.random_state) # Use the initialization method recommended by Glorot et al. weight_init_bound = np.sqrt(6. / (n_features + self.n_hidden)) self.coef_hidden_ = rng.uniform(-weight_init_bound, weight_init_bound, (n_features, self.n_hidden)) self.intercept_hidden_ = rng.uniform(-weight_init_bound, weight_init_bound, self.n_hidden) if self.weight_scale != 1: self.coef_hidden_ *= self.weight_scale self.intercept_hidden_ *= self.weight_scale def _compute_hidden_activations(self, X): """Compute the hidden activations.""" hidden_activations = safe_sparse_dot(X, self.coef_hidden_) hidden_activations += self.intercept_hidden_ # Apply the activation method activation = ACTIVATIONS[self.activation] hidden_activations = activation(hidden_activations) return hidden_activations def _fit(self, X, y, sample_weight=None, incremental=False): """Fit the model to the data X and target y.""" # Validate input params if self.n_hidden <= 0: raise ValueError("n_hidden must be > 0, got %s." % self.n_hidden) if self.C <= 0.0: raise ValueError("C must be > 0, got %s." % self.C) if self.activation not in ACTIVATIONS: raise ValueError("The activation %s is not supported. Supported " "activation are %s." % (self.activation, ACTIVATIONS)) # Initialize public attributes if not hasattr(self, 'classes_'): self.classes_ = None if not hasattr(self, 'coef_hidden_'): self.coef_hidden_ = None # Initialize private attributes if not hasattr(self, '_HT_H_accumulated'): self._HT_H_accumulated = None X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=np.float64, order="C", multi_output=True) # This outputs a warning when a 1d array is expected if y.ndim == 2 and y.shape[1] == 1: y = column_or_1d(y, warn=True) # Classification if isinstance(self, ClassifierMixin): self.label_binarizer_.fit(y) if self.classes_ is None or not incremental: self.classes_ = self.label_binarizer_.classes_ # if sample_weight is None: # sample_weight = compute_sample_weight(self.class_weight, # self.classes_, y) else: classes = self.label_binarizer_.classes_ if not np.all(np.in1d(classes, self.classes_)): raise ValueError("`y` has classes not in `self.classes_`." " `self.classes_` has %s. 'y' has %s." 
% (self.classes_, classes)) y = self.label_binarizer_.transform(y) # Ensure y is 2D if y.ndim == 1: y = np.reshape(y, (-1, 1)) n_samples, n_features = X.shape self.n_outputs_ = y.shape[1] # Step (1/2): Compute the hidden layer coefficients if (self.coef_hidden_ is None or (not incremental and not self.warm_start)): # Randomize and scale the input-to-hidden coefficients self._init_weights(n_features) # Step (2/2): Compute hidden-to-output coefficients if self.batch_size is None: # Run the least-square algorithm on the whole dataset batch_size = n_samples else: # Run the recursive least-square algorithm on mini-batches batch_size = self.batch_size batches = gen_batches(n_samples, batch_size) # (First time call) Run the least-square algorithm on batch 0 if not incremental or self._HT_H_accumulated is None: batch_slice = next(batches) H_batch = self._compute_hidden_activations(X[batch_slice]) # Get sample weights for the batch if sample_weight is None: sw = None else: sw = sample_weight[batch_slice] # beta_{0} = inv(H_{0}^T H_{0} + (1. / C) * I) * H_{0}.T y_{0} self.coef_output_ = ridge_regression(H_batch, y[batch_slice], 1. / self.C, sample_weight=sw).T # Initialize K if this is batch based or partial_fit if self.batch_size is not None or incremental: # K_{0} = H_{0}^T * W * H_{0} weighted_H_batch = _multiply_weights(H_batch, sw) self._HT_H_accumulated = safe_sparse_dot( H_batch.T, weighted_H_batch) if self.verbose: y_scores = self._decision_scores(X[batch_slice]) if self.batch_size is None: verbose_string = "Training mean squared error =" else: verbose_string = "Batch 0, Training mean squared error =" print("%s %f" % (verbose_string, mean_squared_error( y[batch_slice], y_scores, sample_weight=sw))) # Run the least-square algorithm on batch 1, 2, ..., n for batch, batch_slice in enumerate(batches): # Compute hidden activations H_{i} for batch i H_batch = self._compute_hidden_activations(X[batch_slice]) # Get sample weights (sw) for the batch if sample_weight is None: sw = None else: sw = sample_weight[batch_slice] weighted_H_batch = _multiply_weights(H_batch, sw) # Update K_{i+1} by H_{i}^T * W * H_{i} self._HT_H_accumulated += safe_sparse_dot(H_batch.T, weighted_H_batch) # Update beta_{i+1} by # K_{i+1}^{-1} * H_{i+1}^T * W * (y_{i+1} - H_{i+1} * beta_{i}) y_batch = y[batch_slice] - safe_sparse_dot(H_batch, self.coef_output_) weighted_y_batch = _multiply_weights(y_batch, sw) Hy_batch = safe_sparse_dot(H_batch.T, weighted_y_batch) # Update hidden-to-output coefficients regularized_HT_H = self._HT_H_accumulated.copy() regularized_HT_H.flat[::self.n_hidden + 1] += 1. / self.C # It is safe to use linalg.solve (instead of linalg.lstsq # which is slow) since it is highly unlikely that # regularized_HT_H is singular due to the random # projection of the first layer and 'C' regularization being # not dangerously large. self.coef_output_ += linalg.solve(regularized_HT_H, Hy_batch, sym_pos=True, overwrite_a=True, overwrite_b=True) if self.verbose: y_scores = self._decision_scores(X[batch_slice]) print("Batch %d, Training mean squared error = %f" % (batch + 1, mean_squared_error( y[batch_slice], y_scores, sample_weight=sw))) return self def fit(self, X, y, sample_weight=None): """Fit the model to the data X and target y. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data. y : array-like, shape (n_samples,) Target values. sample_weight : array-like, shape (n_samples,) Per-sample weights. Rescale C per sample. 
Higher weights force the classifier to put more emphasis on these points. Returns ------- self : returns a trained RandomNN usable for prediction. """ return self._fit(X, y, sample_weight=sample_weight, incremental=False) def partial_fit(self, X, y, sample_weight=None): """Fit the model to the data X and target y. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) Subset of training data. y : array-like, shape (n_samples,) Subset of target values. sample_weight : array-like, shape (n_samples,) Per-sample weights. Rescale C per sample. Higher weights force the classifier to put more emphasis on these points. Returns ------- self : returns a trained RandomNN usable for prediction. """ self._fit(X, y, sample_weight=sample_weight, incremental=True) return self def _decision_scores(self, X): """Predict using the RandomNN model Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data. Returns ------- y_pred : array-like, shape (n_samples,) or (n_samples, n_outputs) The predicted values. """ X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) if self.batch_size is None: hidden_activations = self._compute_hidden_activations(X) y_pred = safe_sparse_dot(hidden_activations, self.coef_output_) else: n_samples = X.shape[0] batches = gen_batches(n_samples, self.batch_size) y_pred = np.zeros((n_samples, self.n_outputs_)) for batch in batches: h_batch = self._compute_hidden_activations(X[batch]) y_pred[batch] = safe_sparse_dot(h_batch, self.coef_output_) return y_pred
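# Illustrative sketch (not part of the class above): in the full-batch case, _fit
# reduces to one ridge regression from random hidden activations H to the targets,
# beta = (H^T H + I / C)^(-1) H^T y. A compact standalone rendering with a tanh
# hidden layer and Glorot-style random weights (hypothetical toy setup):
import numpy as np

def random_nn_fit_predict(X, y, X_test, n_hidden=50, C=1.0, seed=0):
    """ELM-style closed-form fit and prediction (sketch of the non-batched path)."""
    rng = np.random.RandomState(seed)
    bound = np.sqrt(6.0 / (X.shape[1] + n_hidden))       # Glorot-style init bound
    W = rng.uniform(-bound, bound, (X.shape[1], n_hidden))
    b = rng.uniform(-bound, bound, n_hidden)

    H = np.tanh(X.dot(W) + b)                             # random hidden activations
    HtH = H.T.dot(H) + np.eye(n_hidden) / C               # regularized normal equations
    beta = np.linalg.solve(HtH, H.T.dot(y))               # hidden-to-output weights

    return np.tanh(X_test.dot(W) + b).dot(beta)           # decision scores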
class BaseWeightBoosting(six.with_metaclass(ABCMeta, BaseEnsemble)): """Base class for AdaBoost estimators. Warning: This class should not be used directly. Use derived classes instead. """ @abstractmethod def __init__(self, base_estimator, n_estimators=50, estimator_params=tuple(), learning_rate=1.): super(BaseWeightBoosting, self).__init__(base_estimator=base_estimator, n_estimators=n_estimators, estimator_params=estimator_params) self.learning_rate = learning_rate def fit(self, X, y, sample_weight=None): """Build a boosted classifier/regressor from the training set (X, y). Parameters ---------- X : array-like of shape = [n_samples, n_features] The training input samples. y : array-like of shape = [n_samples] The target values (integers that correspond to classes in classification, real numbers in regression). sample_weight : array-like of shape = [n_samples], optional Sample weights. If None, the sample weights are initialized to 1 / n_samples. Returns ------- self : object Returns self. """ # Check parameters if self.learning_rate <= 0: raise ValueError("learning_rate must be greater than zero") # Check data X, y = check_arrays(X, y, sparse_format="dense") if sample_weight is None: # Initialize weights to 1 / n_samples sample_weight = np.empty(X.shape[0], dtype=np.float) sample_weight[:] = 1. / X.shape[0] else: # Normalize existing weights sample_weight = np.copy(sample_weight) / sample_weight.sum() # Check that the sample weights sum is positive if sample_weight.sum() <= 0: raise ValueError("Attempting to fit with a non-positive " "weighted number of samples.") # Clear any previous fit results self.estimators_ = [] self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float) self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float) # Create argsorted X for fast tree induction X_argsorted = None if isinstance(self.base_estimator, BaseDecisionTree): X_argsorted = np.asfortranarray( np.argsort(X.T, axis=1).astype(np.int32).T) for iboost in xrange(self.n_estimators): # Boosting step sample_weight, estimator_weight, estimator_error = self._boost( iboost, X, y, sample_weight, X_argsorted=X_argsorted) # Early termination if sample_weight is None: break self.estimator_weights_[iboost] = estimator_weight self.estimator_errors_[iboost] = estimator_error # Stop if error is zero if estimator_error == 0: break sample_weight_sum = np.sum(sample_weight) # Stop if the sum of sample weights has become non-positive if sample_weight_sum <= 0: break if iboost < self.n_estimators - 1: # Normalize sample_weight /= sample_weight_sum return self def _check_fitted(self): if not hasattr(self, "estimators_"): raise ValueError("call fit first") @abstractmethod def _boost(self, iboost, X, y, sample_weight, X_argsorted=None): """Implement a single boost. Warning: This method needs to be overriden by subclasses. Parameters ---------- iboost : int The index of the current boost iteration. X : array-like of shape = [n_samples, n_features] The training input samples. y : array-like of shape = [n_samples] The target values (integers that correspond to classes). sample_weight : array-like of shape = [n_samples] The current sample weights. X_argsorted : array-like, shape = [n_samples, n_features] (optional) Each column of ``X_argsorted`` holds the row indices of ``X`` sorted according to the value of the corresponding feature in ascending order. The argument is supported to enable multiple decision trees to share the data structure and to avoid re-computation in tree ensembles. 
For maximum efficiency use dtype np.int32. Returns ------- sample_weight : array-like of shape = [n_samples] or None The reweighted sample weights. If None then boosting has terminated early. estimator_weight : float The weight for the current boost. If None then boosting has terminated early. error : float The classification error for the current boost. If None then boosting has terminated early. """ pass def staged_score(self, X, y): """Return staged scores for X, y. This generator method yields the ensemble score after each iteration of boosting and therefore allows monitoring, such as to determine the score on a test set after each boost. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training set. y : array-like, shape = [n_samples] Labels for X. Returns ------- z : float """ for y_pred in self.staged_predict(X): if isinstance(self, ClassifierMixin): yield accuracy_score(y, y_pred) else: yield r2_score(y, y_pred) @property def feature_importances_(self): """Return the feature importances (the higher, the more important the feature). Returns ------- feature_importances_ : array, shape = [n_features] """ if self.estimators_ is None or len(self.estimators_) == 0: raise ValueError("Estimator not fitted, " "call `fit` before `feature_importances_`.") try: norm = self.estimator_weights_.sum() return (sum(weight * clf.feature_importances_ for weight, clf in zip(self.estimator_weights_, self.estimators_)) / norm) except AttributeError: raise AttributeError("Unable to compute feature importances " "since base_estimator does not have a " "feature_importances_ attribute")
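# Illustrative sketch (not part of the class above): feature_importances_ is simply a
# weight-normalized average of each fitted estimator's own importances, assuming every
# base estimator exposes a feature_importances_ attribute.
import numpy as np

def boosted_feature_importances(estimator_weights, estimators):
    """Weighted average of per-estimator feature_importances_ (sketch)."""
    norm = np.asarray(estimator_weights).sum()
    return sum(weight * est.feature_importances_
               for weight, est in zip(estimator_weights, estimators)) / norm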
class BaseNB(six.with_metaclass(ABCMeta, BaseEstimator, ClassifierMixin)): """Abstract base class for naive Bayes estimators""" @abstractmethod def _joint_log_likelihood(self, X): """Compute the unnormalized posterior log probability of X I.e. ``log P(c) + log P(x|c)`` for all rows x of X, as an array-like of shape [n_classes, n_samples]. Input is passed to _joint_log_likelihood as-is by predict, predict_proba and predict_log_proba. """ def predict(self, X): """ Perform classification on an array of test vectors X. Parameters ---------- X : array-like, shape = [n_samples, n_features] Returns ------- C : array, shape = [n_samples] Predicted target values for X """ jll = self._joint_log_likelihood(X) return self.classes_[np.argmax(jll, axis=1)] def predict_log_proba(self, X): """ Return log-probability estimates for the test vector X. Parameters ---------- X : array-like, shape = [n_samples, n_features] Returns ------- C : array-like, shape = [n_samples, n_classes] Returns the log-probability of the samples for each class in the model. The columns correspond to the classes in sorted order, as they appear in the attribute `classes_`. """ jll = self._joint_log_likelihood(X) # normalize by P(x) = P(f_1, ..., f_n) log_prob_x = logsumexp(jll, axis=1) return jll - np.atleast_2d(log_prob_x).T def predict_proba(self, X): """ Return probability estimates for the test vector X. Parameters ---------- X : array-like, shape = [n_samples, n_features] Returns ------- C : array-like, shape = [n_samples, n_classes] Returns the probability of the samples for each class in the model. The columns correspond to the classes in sorted order, as they appear in the attribute `classes_`. """ probas = np.exp(self.predict_log_proba(X)) rowsum = np.sum(probas, axis=1) # if np.array_equal(rowsum, np.ones(rowsum.shape[0])): # print "rowsum are 1" # else: # print "rowsums are't 1" return probas / rowsum.reshape(rowsum.shape[0], 1)
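# Illustrative sketch (not part of the class above): predict_log_proba normalizes the
# joint log-likelihoods with a log-sum-exp over classes, so the extra row-sum division
# in predict_proba only corrects floating-point round-off. A numerically equivalent
# standalone rendering:
import numpy as np
from scipy.special import logsumexp

def posteriors_from_joint_log_likelihood(jll):
    """jll: (n_samples, n_classes) array of log P(c) + log P(x | c)."""
    log_prob_x = logsumexp(jll, axis=1, keepdims=True)   # log P(x)
    log_posterior = jll - log_prob_x                     # log P(c | x)
    return np.exp(log_posterior)                         # rows sum to 1 up to round-off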
class BinaryLogitBoost(with_metaclass(ABCMeta, BaseEnsemble)): """ Parameters ---------- Please refer to scikit-learn's boosting documentation : https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/ensemble/weight_boosting.py """ def __init__(self, base_estimator=None, n_estimators=50, random_state=None): super(BinaryLogitBoost, self).__init__(base_estimator=base_estimator, n_estimators=n_estimators) self.random_state = random_state def fit(self, X, y, sample_weight=None): y = self._validate_y(y) X, y = check_X_y(X, y, accept_sparse='csc', dtype=DTYPE) n_samples = X.shape[0] if sample_weight is None: # Initialize weights to 1 / n_samples sample_weight = np.empty(n_samples, dtype=np.float) sample_weight[:] = 1. / n_samples else: # Normalize existing weights sample_weight = sample_weight / sample_weight.sum(dtype=np.float64) # Check that the sample weights sum is positive if sample_weight.sum() <= 0: raise ValueError("Attempting to fit with a non-positive " "weighted number of samples.") # Check parameters self._validate_estimator() # Clear any previous fit results self.estimators_ = [] estimators = [] predictions = np.zeros(n_samples) p = 0.5 * np.ones(n_samples) for iboost in range(self.n_estimators): sample_weight = p * (1 - p) z = (y - p) / sample_weight estimator = self._make_estimator() try: estimator.set_params(random_state=self.random_state) except ValueError: pass estimator.fit(X, z, sample_weight=sample_weight) estimators.append(estimator) predictions += (1 / 2) * estimator.predict(X) p = 1 / (1 + np.exp(-2 * predictions)) self.estimators_ = estimators return self def predict(self, X): estimators = self.estimators_ predictions = sum([estimator.predict(X) for estimator in estimators]) return self.classes_.take(np.where(predictions > 0, 1, 0)) def predict_proba(self, X): n_samples = X.shape[0] proba = np.zeros((n_samples, 2)) # Binary classification estimators = self.estimators_ predictions = sum([estimator.predict(X) for estimator in estimators]) proba[:, 0] = 1 / (1 + np.exp(predictions)) proba[:, 1] = 1 - proba[:, 0] return proba def _validate_estimator(self): """Check the estimator and set the base_estimator_ attribute.""" super(BinaryLogitBoost, self)._validate_estimator( default=DecisionTreeRegressor(max_depth=3)) def _validate_y(self, y): y = column_or_1d(y, warn=True) check_classification_targets(y) self.classes_, y = np.unique(y, return_inverse=True) n_classes = len(self.classes_) if n_classes > 2: raise ValueError( "It's a binary classification algorithm. Use a dataset with only 2 classes to predict." ) return y
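# Illustrative sketch (not part of the class above): each fit iteration follows the
# binary LogitBoost recipe with Newton weights w = p(1 - p), working response
# z = (y - p) / w, a half-step update of the additive score, and a logistic refresh of
# p. A standalone rendering of one round, using shallow regression trees as in the
# class default (DecisionTreeRegressor(max_depth=3)):
import numpy as np
from sklearn.tree import DecisionTreeRegressor

def logitboost_round(X, y01, F, max_depth=3):
    """One LogitBoost iteration; y01 in {0, 1}, F is the current additive score."""
    p = 1.0 / (1.0 + np.exp(-2.0 * F))        # current probability estimates
    w = p * (1.0 - p)                         # Newton weights
    z = (y01 - p) / w                         # working response
    tree = DecisionTreeRegressor(max_depth=max_depth).fit(X, z, sample_weight=w)
    F = F + 0.5 * tree.predict(X)             # half-step, as in fit() above
    return F, tree

# Note: w underflows to 0 when p saturates; robust implementations clip p away from
# 0 and 1 before dividing.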
class RecommenderTestClass(six.with_metaclass(ABCMeta)): """An abstract base test class for algo test suites. All recommender algorithm test classes should inherit from this. """ @abstractmethod def test_simple_fit(self, *args, **kwargs): """Test a simple fit""" @abstractmethod def test_complex_fit(self, *args, **kwargs): """Test a more complex fit""" @abstractmethod def test_recommend_single(self, *args, **kwargs): """Test recommending for a single user.""" @abstractmethod def test_recommend_all(self, *args, **kwargs): """Test recommending for all users.""" @abstractmethod def test_serialize(self, *args, **kwargs): """Test serializing the algo.""" @staticmethod def _single_recommend_assertions(clf, train_data, test_data): # Simple recommendation operation recs = clf.recommend_for_user(0, test_data, n=5, filter_previously_rated=False) assert len(recs) == 5 # Create recommendations for everything, but filter out a single item n = train_data.shape[1] recs = clf.recommend_for_user(0, test_data, n=n, filter_previously_rated=False, filter_items=[1]) # Show that '1' is not in the recommendations mask = np.in1d([1], recs) # type: np.ndarray assert not mask.any() # Show we can also create recommendations with return_scores=True recs, scores = clf.recommend_for_user(0, test_data, n=5, return_scores=True) assert len(recs) == len(scores) == 5, (recs, scores) assert all(isinstance(arr, np.ndarray) for arr in (recs, scores)) @staticmethod def _all_recommend_assertions(clf, test_data): n = test_data.shape[1] recs = clf.recommend_for_all_users(test_data, n=n, return_scores=True, filter_previously_rated=True) # Show that it's a generator assert isinstance(recs, types.GeneratorType) first_recs, first_scores = next(recs) assert len(first_recs) == len(first_scores) # show no rated items in the recs rated = test_data[0, :].indices mask = np.in1d(rated, first_recs) # type: np.ndarray assert not mask.any() @staticmethod def _serialization_assertions(clf, train_data, test_data, tolerate_fail=False): pkl_location = "als.pkl" # Test persistence try: # Show we can serialize BEFORE it's fit joblib.dump(clf, pkl_location, compress=3) os.unlink(pkl_location) # NOW train clf.fit(train_data) # Get recommendations recs1 = clf.recommend_for_user(0, test_data, n=3, return_scores=False) # dump it, recommend again and show the internal state didn't # change while we were pickling it out joblib.dump(clf, pkl_location, compress=3) recs2 = clf.recommend_for_user(0, test_data, n=3, return_scores=False) # open it up and create more recommendations loaded = joblib.load(pkl_location) recs3 = loaded \ .recommend_for_user(0, test_data, n=3, return_scores=False) # Now show they're all the same if not tolerate_fail: assert_array_equal(recs1, recs2, err_msg="%s != %s" % (str(recs1), str(recs2))) assert_array_equal(recs1, recs3, err_msg="%s != %s" % (str(recs1), str(recs3))) finally: os.unlink(pkl_location) # If the model has an index saved somewhere, remove it also if hasattr(clf, "_model_key"): index_cache = os.path.join(RECLAB_CACHE, clf._model_key) if os.path.exists(index_cache): shutil.rmtree(index_cache)
class BaseBagging(with_metaclass(ABCMeta, BaseEnsemble)): """Base class for Bagging meta-estimator. Warning: This class should not be used directly. Use derived classes instead. """ @abstractmethod def __init__(self, base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=False, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=1, random_state=None, verbose=0, sampler='under', max_depth=None): super(BaseBagging, self).__init__(base_estimator=base_estimator, n_estimators=n_estimators) self.max_samples = max_samples self.max_features = max_features self.bootstrap = bootstrap self.bootstrap_features = bootstrap_features self.oob_score = oob_score self.warm_start = warm_start self.n_jobs = n_jobs self.random_state = random_state self.verbose = verbose self.sampler = sampler self.max_depth = max_depth def fit(self, X, y, max_depth=None, sample_weight=None): """Build a Bagging ensemble of estimators from the training set (X, y). Parameters ---------- X : {array-like, sparse matrix} of shape = [n_samples, n_features] The training input samples. Sparse matrices are accepted only if they are supported by the base estimator. y : array-like, shape = [n_samples] The target values (class labels in classification, real numbers in regression). sample_weight : array-like, shape = [n_samples] or None Sample weights. If None, then samples are equally weighted. Note that this is supported only if the base estimator supports sample weighting. Returns ------- self : object Returns self. """ return self._fit(X, y, self.max_samples, self.max_depth, sample_weight=sample_weight) def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): """Build a Bagging ensemble of estimators from the training set (X, y). Parameters ---------- X : {array-like, sparse matrix} of shape = [n_samples, n_features] The training input samples. Sparse matrices are accepted only if they are supported by the base estimator. y : array-like, shape = [n_samples] The target values (class labels in classification, real numbers in regression). max_samples : int or float, optional (default=None) Argument to use instead of self.max_samples. max_depth : int, optional (default=None) Override value used when constructing base estimator. Only supported if the base estimator has a max_depth parameter. sample_weight : array-like, shape = [n_samples] or None Sample weights. If None, then samples are equally weighted. Note that this is supported only if the base estimator supports sample weighting. Returns ------- self : object Returns self. 
""" random_state = check_random_state(self.random_state) # Convert data X, y = check_X_y(X, y, ['csr', 'csc']) if sample_weight is not None: sample_weight = check_array(sample_weight, ensure_2d=False) check_consistent_length(y, sample_weight) # Remap output n_samples, self.n_features_ = X.shape self._n_samples = n_samples y = self._validate_y(y) # Check parameters self._validate_estimator() if max_depth is not None: self.base_estimator_.max_depth = max_depth # Validate max_samples if max_samples is None: max_samples = self.max_samples elif not isinstance(max_samples, (numbers.Integral, np.integer)): max_samples = int(max_samples * X.shape[0]) if not (0 < max_samples <= X.shape[0]): raise ValueError("max_samples must be in (0, n_samples]") # Store validated integer row sampling value self._max_samples = max_samples # Validate max_features if isinstance(self.max_features, (numbers.Integral, np.integer)): max_features = self.max_features else: # float max_features = int(self.max_features * self.n_features_) if not (0 < max_features <= self.n_features_): raise ValueError("max_features must be in (0, n_features]") # Store validated integer feature sampling value self._max_features = max_features # Other checks if not self.bootstrap and self.oob_score: raise ValueError("Out of bag estimation only available" " if bootstrap=True") if self.warm_start and self.oob_score: raise ValueError("Out of bag estimate only available" " if warm_start=False") if hasattr(self, "oob_score_") and self.warm_start: del self.oob_score_ if not self.warm_start or not hasattr(self, 'estimators_'): # Free allocated memory, if any self.estimators_ = [] self.estimators_features_ = [] n_more_estimators = self.n_estimators - len(self.estimators_) if n_more_estimators < 0: raise ValueError('n_estimators=%d must be larger or equal to ' 'len(estimators_)=%d when warm_start==True' % (self.n_estimators, len(self.estimators_))) elif n_more_estimators == 0: warn("Warm-start fitting without increasing n_estimators does not " "fit new trees.") return self # Parallel loop n_jobs, n_estimators, starts = _partition_estimators( n_more_estimators, self.n_jobs) total_n_estimators = sum(n_estimators) # Advance random state to state after training # the first n_estimators if self.warm_start and len(self.estimators_) > 0: random_state.randint(MAX_INT, size=len(self.estimators_)) seeds = random_state.randint(MAX_INT, size=n_more_estimators) self._seeds = seeds # # Resample data in each bag # if self.sampler == 'under': # X_res, y_res = # else: # X_res, y_res = X, y all_results = Parallel(n_jobs=n_jobs, verbose=self.verbose)( delayed(_parallel_build_estimators)(n_estimators[i], self, X, y, sample_weight, seeds[starts[i]:starts[i + 1]], total_n_estimators, verbose=self.verbose) for i in range(n_jobs)) # Reduce self.estimators_ += list( itertools.chain.from_iterable(t[0] for t in all_results)) self.estimators_features_ += list( itertools.chain.from_iterable(t[1] for t in all_results)) if self.oob_score: self._set_oob_score(X, y) return self @abstractmethod def _set_oob_score(self, X, y): """Calculate out of bag predictions and score.""" def _validate_y(self, y): # Default implementation return column_or_1d(y, warn=True) def _get_estimators_indices(self): # Get drawn indices along both sample and feature axes for seed in self._seeds: # Operations accessing random_state must be performed identically # to those in `_parallel_build_estimators()` random_state = np.random.RandomState(seed) feature_indices, sample_indices = _generate_bagging_indices( 
                random_state, self.bootstrap_features, self.bootstrap,
                self.n_features_, self._n_samples, self._max_features,
                self._max_samples)

            yield feature_indices, sample_indices

    @property
    def estimators_samples_(self):
        """The subset of drawn samples for each base estimator.

        Returns a dynamically generated list of boolean masks identifying
        the samples used for fitting each member of the ensemble, i.e.,
        the in-bag samples.

        Note: the list is re-created at each call to the property in order
        to reduce the object memory footprint by not storing the sampling
        data. Thus fetching the property may be slower than expected.
        """
        sample_masks = []
        for _, sample_indices in self._get_estimators_indices():
            mask = indices_to_mask(sample_indices, self._n_samples)
            sample_masks.append(mask)

        return sample_masks
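# Illustrative sketch (not part of the class above): estimators_samples_ deliberately
# re-derives each in-bag mask from the stored per-estimator seed rather than caching
# index arrays. The helper below is a simplified stand-in for that regeneration step
# (hypothetical, not the actual _generate_bagging_indices logic):
import numpy as np

def sample_mask_from_seed(seed, n_samples, max_samples, bootstrap=True):
    """Re-create one estimator's row sample and boolean in-bag mask (sketch)."""
    rng = np.random.RandomState(seed)
    if bootstrap:
        indices = rng.randint(0, n_samples, max_samples)    # draw with replacement
    else:
        indices = rng.permutation(n_samples)[:max_samples]  # draw without replacement
    mask = np.zeros(n_samples, dtype=bool)
    mask[indices] = True
    return indices, mask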
class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)):
    """Mixin class for samplers with abstract method.

    Warning: This class should not be used directly. Use the derived
    classes instead.
    """

    _estimator_type = 'sampler'

    def fit(self, X, y):
        """Check inputs and statistics of the sampler.

        You should use ``fit_resample`` in all cases.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Data array.

        y : array-like, shape (n_samples,)
            Target array.

        Returns
        -------
        self : object
            Return the instance itself.
        """
        self._deprecate_ratio()
        X, y, _ = self._check_X_y(X, y)
        self.sampling_strategy_ = check_sampling_strategy(
            self.sampling_strategy, y, self._sampling_type)
        return self

    def fit_resample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {array-like, sparse matrix}, shape \
(n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : array-like, shape (n_samples_new,)
            The corresponding label of `X_resampled`.
        """
        self._deprecate_ratio()
        check_classification_targets(y)
        X, y, binarize_y = self._check_X_y(X, y)

        self.sampling_strategy_ = check_sampling_strategy(
            self.sampling_strategy, y, self._sampling_type)

        output = self._fit_resample(X, y)

        if binarize_y:
            y_sampled = label_binarize(output[1], np.unique(y))
            if len(output) == 2:
                return output[0], y_sampled
            return output[0], y_sampled, output[2]
        return output

    # define an alias for back-compatibility
    fit_sample = fit_resample

    @abstractmethod
    def _fit_resample(self, X, y):
        """Base method defined in each sampler to define the sampling
        strategy.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {ndarray, sparse matrix}, shape \
(n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new,)
            The corresponding label of `X_resampled`.
        """
        pass
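# Illustrative sketch (not part of the mixin above): concrete samplers only implement
# _fit_resample; validation, label binarization and the fit_sample alias come from the
# mixin. As a rough standalone example of what a _fit_resample body does, a naive
# random undersampler that trims every class to the minority class size:
import numpy as np

def naive_random_undersample(X, y, random_state=0):
    """Downsample every class to the size of the smallest class (sketch)."""
    rng = np.random.RandomState(random_state)
    classes, counts = np.unique(y, return_counts=True)
    n_min = counts.min()
    keep = np.concatenate([
        rng.choice(np.flatnonzero(y == c), size=n_min, replace=False)
        for c in classes
    ])
    return X[keep], y[keep]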
class _BaseFactorizationMachine(six.with_metaclass(ABCMeta, _BasePoly)): @abstractmethod def __init__(self, degree=2, loss='squared', n_components=2, alpha=1, beta=1, tol=1e-6, fit_lower='explicit', fit_linear=True, warm_start=False, init_lambdas='ones', max_iter=10000, verbose=False, random_state=None): self.degree = degree self.loss = loss self.n_components = n_components self.alpha = alpha self.beta = beta self.tol = tol self.fit_lower = fit_lower self.fit_linear = fit_linear self.warm_start = warm_start self.init_lambdas = init_lambdas self.max_iter = max_iter self.verbose = verbose self.random_state = random_state def _augment(self, X): # for factorization machines, we add a dummy column for each order. if self.fit_lower == 'augment': k = 2 if self.fit_linear else 1 for _ in range(self.degree - k): X = add_dummy_feature(X, value=1) return X def fit(self, X, y): """Fit factorization machine to training data. Parameters ---------- X : array-like or sparse, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : Estimator Returns self. """ if self.degree > 3: raise ValueError("FMs with degree >3 not yet supported.") X, y = self._check_X_y(X, y) X = self._augment(X) n_features = X.shape[1] # augmented X_col_norms = row_norms(X.T, squared=True) dataset = get_dataset(X, order="fortran") rng = check_random_state(self.random_state) loss_obj = self._get_loss(self.loss) if not (self.warm_start and hasattr(self, 'w_')): self.w_ = np.zeros(n_features, dtype=np.double) if self.fit_lower == 'explicit': n_orders = self.degree - 1 else: n_orders = 1 if not (self.warm_start and hasattr(self, 'P_')): self.P_ = 0.01 * rng.randn(n_orders, self.n_components, n_features) if not (self.warm_start and hasattr(self, 'lams_')): if self.init_lambdas == 'ones': self.lams_ = np.ones(self.n_components) elif self.init_lambdas == 'random_signs': self.lams_ = np.sign(rng.randn(self.n_components)) else: raise ValueError("Lambdas must be initialized as ones " "(init_lambdas='ones') or as random " "+/- 1 (init_lambdas='random_signs').") y_pred = self._get_output(X) converged = _cd_direct_ho(self.P_, self.w_, dataset, X_col_norms, y, y_pred, self.lams_, self.degree, self.alpha, self.beta, self.fit_linear, self.fit_lower == 'explicit', loss_obj, self.max_iter, self.tol, self.verbose) if not converged: warnings.warn("Objective did not converge. Increase max_iter.") return self def _get_output(self, X): y_pred = _poly_predict(X, self.P_[0, :, :], self.lams_, kernel='anova', degree=self.degree) if self.fit_linear: y_pred += safe_sparse_dot(X, self.w_) if self.fit_lower == 'explicit' and self.degree == 3: # degree cannot currently be > 3 y_pred += _poly_predict(X, self.P_[1, :, :], self.lams_, kernel='anova', degree=2) return y_pred def _predict(self, X): if not hasattr(self, "P_"): raise NotFittedError("Estimator not fitted.") X = check_array(X, accept_sparse='csc', dtype=np.double) X = self._augment(X) return self._get_output(X)
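# Illustrative sketch (not part of the class above): for degree 2, the _poly_predict
# call evaluates the ANOVA kernel, which has the usual factorization-machine closed
# form y(x) = <w, x> + sum_s lam_s * 0.5 * (<p_s, x>^2 - <p_s^2, x^2>). A dense NumPy
# rendering of that prediction (not the compiled polylearn kernel):
import numpy as np

def fm_predict_degree2(X, w, P, lams):
    """X: (n, d); w: (d,) linear weights; P: (k, d) component vectors; lams: (k,)."""
    linear = X.dot(w)
    Px = X.dot(P.T)                     # (n, k): <p_s, x> for every component s
    Px2 = (X ** 2).dot((P ** 2).T)      # (n, k): <p_s^2, x^2>
    pairwise = 0.5 * (Px ** 2 - Px2)    # degree-2 ANOVA kernel per component
    return linear + pairwise.dot(lams)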
class _BaseChain(six.with_metaclass(ABCMeta, BaseEstimator)): def __init__(self, base_estimator, order=None, cv=None, random_state=None): self.base_estimator = base_estimator self.order = order self.cv = cv self.random_state = random_state @abstractmethod def fit(self, X, Y): """Fit the model to data matrix X and targets Y. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data. Y : array-like, shape (n_samples, n_classes) The target values. Returns ------- self : object """ X, Y = check_X_y(X, Y, multi_output=True, accept_sparse=True, force_all_finite=False, dtype="object") random_state = check_random_state(self.random_state) check_array(X, accept_sparse=True, force_all_finite=False, dtype="object") self.order_ = self.order if self.order_ is None: self.order_ = np.array(range(Y.shape[1])) elif isinstance(self.order_, str): if self.order_ == 'random': self.order_ = random_state.permutation(Y.shape[1]) elif sorted(self.order_) != list(range(Y.shape[1])): raise ValueError("invalid order") self.estimators_ = [ clone(self.base_estimator) for _ in range(Y.shape[1]) ] if self.cv is None: Y_pred_chain = Y[:, self.order_] if sp.issparse(X): X_aug = sp.hstack((X, Y_pred_chain), format='lil') X_aug = X_aug.tocsr() else: X_aug = np.hstack((X, Y_pred_chain)) elif sp.issparse(X): Y_pred_chain = sp.lil_matrix((X.shape[0], Y.shape[1])) X_aug = sp.hstack((X, Y_pred_chain), format='lil') else: Y_pred_chain = np.zeros((X.shape[0], Y.shape[1])) X_aug = np.hstack((X, Y_pred_chain)) del Y_pred_chain for chain_idx, estimator in enumerate(self.estimators_): y = Y[:, self.order_[chain_idx]] estimator.fit(X_aug[:, :(X.shape[1] + chain_idx)], y) if self.cv is not None and chain_idx < len(self.estimators_) - 1: col_idx = X.shape[1] + chain_idx cv_result = cross_val_predict(self.base_estimator, X_aug[:, :col_idx], y=y, cv=self.cv) if sp.issparse(X_aug): X_aug[:, col_idx] = np.expand_dims(cv_result, 1) else: X_aug[:, col_idx] = cv_result return self def predict(self, X): """Predict on the data matrix X using the ClassifierChain model. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data. Returns ------- Y_pred : array-like, shape (n_samples, n_classes) The predicted values. """ X = check_array(X, accept_sparse=True, dtype="object") Y_pred_chain = np.zeros((X.shape[0], len(self.estimators_))) for chain_idx, estimator in enumerate(self.estimators_): previous_predictions = Y_pred_chain[:, :chain_idx] if sp.issparse(X): if chain_idx == 0: X_aug = X else: X_aug = sp.hstack((X, previous_predictions)) else: X_aug = np.hstack((X, previous_predictions)) Y_pred_chain[:, chain_idx] = estimator.predict(X_aug) inv_order = np.empty_like(self.order_) inv_order[self.order_] = np.arange(len(self.order_)) Y_pred = Y_pred_chain[:, inv_order] return Y_pred
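This base class underlies scikit-learn's public chain estimators; a brief usage sketch (assuming a reasonably recent scikit-learn) of `ClassifierChain` shows the fitted chain order and the per-label predictions discussed above.

from sklearn.datasets import make_multilabel_classification
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import ClassifierChain

X, Y = make_multilabel_classification(n_samples=100, n_classes=4, random_state=0)
chain = ClassifierChain(LogisticRegression(max_iter=1000), order='random', random_state=0)
chain.fit(X, Y)
print(chain.order_)            # a permutation of the label indices [0, 1, 2, 3]
print(chain.predict(X).shape)  # (100, 4)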
class MultiOutputEstimator( six.with_metaclass(ABCMeta, BaseEstimator, MetaEstimatorMixin)): @abstractmethod def __init__(self, estimator, n_jobs=None): self.estimator = estimator self.n_jobs = n_jobs @if_delegate_has_method('estimator') def partial_fit(self, X, y, classes=None, sample_weight=None): """Incrementally fit the model to data. Fit a separate model for each output variable. Parameters ---------- X : (sparse) array-like, shape (n_samples, n_features) Data. y : (sparse) array-like, shape (n_samples, n_outputs) Multi-output targets. classes : list of numpy arrays, shape (n_outputs) Each array is unique classes for one output in str/int Can be obtained by via ``[np.unique(y[:, i]) for i in range(y.shape[1])]``, where y is the target matrix of the entire dataset. This argument is required for the first call to partial_fit and can be omitted in the subsequent calls. Note that y doesn't need to contain all labels in `classes`. sample_weight : array-like, shape = (n_samples) or None Sample weights. If None, then samples are equally weighted. Only supported if the underlying regressor supports sample weights. Returns ------- self : object """ X, y = check_X_y(X, y, multi_output=True, accept_sparse=True) if y.ndim == 1: raise ValueError("y must have at least two dimensions for " "multi-output regression but has only one.") if (sample_weight is not None and not has_fit_parameter(self.estimator, 'sample_weight')): raise ValueError("Underlying estimator does not support" " sample weights.") first_time = not hasattr(self, 'estimators_') self.estimators_ = Parallel(n_jobs=self.n_jobs)( delayed(_partial_fit_estimator) (self.estimators_[i] if not first_time else self.estimator, X, y[:, i], classes[i] if classes is not None else None, sample_weight, first_time) for i in range(y.shape[1])) return self def fit(self, X, y, sample_weight=None): """ Fit the model to data. Fit a separate model for each output variable. Parameters ---------- X : (sparse) array-like, shape (n_samples, n_features) Data. y : (sparse) array-like, shape (n_samples, n_outputs) Multi-output targets. An indicator matrix turns on multilabel estimation. sample_weight : array-like, shape = (n_samples) or None Sample weights. If None, then samples are equally weighted. Only supported if the underlying regressor supports sample weights. Returns ------- self : object """ if not hasattr(self.estimator, "fit"): raise ValueError( "The base estimator should implement a fit method") X, y = check_X_y(X, y, multi_output=True, accept_sparse=True, dtype="object") if is_classifier(self): check_classification_targets(y) if y.ndim == 1: raise ValueError("y must have at least two dimensions for " "multi-output regression but has only one.") if (sample_weight is not None and not has_fit_parameter(self.estimator, 'sample_weight')): raise ValueError("Underlying estimator does not support" " sample weights.") self.estimators_ = Parallel(n_jobs=self.n_jobs)( delayed(_fit_estimator)(self.estimator, X, y[:, i], sample_weight) for i in range(y.shape[1])) return self def predict(self, X): """Predict multi-output variable using a model trained for each target variable. Parameters ---------- X : (sparse) array-like, shape (n_samples, n_features) Data. Returns ------- y : (sparse) array-like, shape (n_samples, n_outputs) Multi-output targets predicted across multiple predictors. Note: Separate models are generated for each predictor. 
""" check_is_fitted(self, 'estimators_') if not hasattr(self.estimator, "predict"): raise ValueError( "The base estimator should implement a predict method") X = check_array(X, accept_sparse=True, force_all_finite=False, dtype="object") y = Parallel(n_jobs=self.n_jobs)( delayed(parallel_helper)(e, 'predict', X) for e in self.estimators_) return np.asarray(y).T
class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)): """Mixin class for samplers with abstract method. Warning: This class should not be used directly. Use the derived classes instead. """ _estimator_type = 'sampler' def __init__(self, ratio='auto', random_state=None): """Initialize this object and its instance variables. Parameters ---------- ratio : str or float, optional (default='auto') If 'auto', the ratio will be defined automatically to balance the dataset. Otherwise, the ratio will correspond to the number of samples in the minority class over the number of samples in the majority class. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by np.random. Returns ------- None """ self.ratio = ratio self.random_state = random_state self.logger = logging.getLogger(__name__) def fit(self, X, y): """Find the class statistics before performing the sampling. Parameters ---------- X : ndarray, shape (n_samples, n_features) Matrix containing the data which have to be sampled. y : ndarray, shape (n_samples, ) Corresponding label for each sample in X. Returns ------- self : object, Return self. """ # Check the consistency of X and y X, y = check_X_y(X, y) self.min_c_ = None self.maj_c_ = None self.stats_c_ = {} self.X_shape_ = None if hasattr(self, 'ratio'): self._validate_ratio() if hasattr(self, 'size_ngh'): self._validate_size_ngh_deprecation() elif hasattr(self, 'k') and not hasattr(self, 'm'): self._validate_k_deprecation() elif hasattr(self, 'k') and hasattr(self, 'm'): self._validate_k_m_deprecation() self.logger.info('Compute class statistics ...') # Raise an error if there is only one class # if uniques.size == 1: # raise RuntimeError("Only one class detected, aborting...") # Raise a warning for the moment to be compatible with BaseEstimator self.logger.debug('The number of classes is %s', np.unique(y).size) self.logger.debug('Shall we raise a warning: %s', np.unique(y).size == 1) if np.unique(y).size == 1: warnings.simplefilter('always', UserWarning) warnings.warn('Only one class detected, something will go wrong') self.logger.debug('The warning should have been raised.') # Store the size of X to check at sampling time if we have the # same data self.X_shape_ = X.shape # Create a dictionary containing the class statistics self.stats_c_ = Counter(y) # Find the minority and majority classes self.min_c_ = min(self.stats_c_, key=self.stats_c_.get) self.maj_c_ = max(self.stats_c_, key=self.stats_c_.get) self.logger.info('%s classes detected: %s', np.unique(y).size, self.stats_c_) # Check if the ratio provided at initialisation makes sense if isinstance(self.ratio, float): if self.ratio < (self.stats_c_[self.min_c_] / self.stats_c_[self.maj_c_]): raise RuntimeError('The ratio requested at initialisation' ' should be greater than or equal to the' ' balancing ratio of the current data.') return self def sample(self, X, y): """Resample the dataset. Parameters ---------- X : ndarray, shape (n_samples, n_features) Matrix containing the data which have to be sampled. y : ndarray, shape (n_samples, ) Corresponding label for each sample in X. Returns ------- X_resampled : ndarray, shape (n_samples_new, n_features) The array containing the resampled data.
y_resampled : ndarray, shape (n_samples_new) The corresponding label of `X_resampled` """ # Check the consistency of X and y X, y = check_X_y(X, y) # Check that the data have been fitted if not hasattr(self, 'stats_c_'): raise RuntimeError('You need to fit the data first.') # Check that the size of the data is identical to the one seen at fitting if X.shape != self.X_shape_: raise RuntimeError('The data that you attempt to resample do not' ' seem to be the data fitted earlier. Use the' ' fitted data.') if hasattr(self, 'ratio'): self._validate_ratio() if hasattr(self, 'size_ngh'): self._validate_size_ngh_deprecation() elif hasattr(self, 'k') and not hasattr(self, 'm'): self._validate_k_deprecation() elif hasattr(self, 'k') and hasattr(self, 'm'): self._validate_k_m_deprecation() return self._sample(X, y) def fit_sample(self, X, y): """Fit the statistics and resample the data directly. Parameters ---------- X : ndarray, shape (n_samples, n_features) Matrix containing the data which have to be sampled. y : ndarray, shape (n_samples, ) Corresponding label for each sample in X. Returns ------- X_resampled : ndarray, shape (n_samples_new, n_features) The array containing the resampled data. y_resampled : ndarray, shape (n_samples_new) The corresponding label of `X_resampled` """ return self.fit(X, y).sample(X, y) def _validate_ratio(self): # The ratio corresponds to the number of samples in the minority class # over the number of samples in the majority class. Thus, the ratio # cannot be greater than 1.0 if isinstance(self.ratio, float): if self.ratio > 1: raise ValueError('Ratio cannot be greater than one.') elif self.ratio <= 0: raise ValueError('Ratio must be strictly positive.') elif isinstance(self.ratio, six.string_types): if self.ratio != 'auto': raise ValueError('Unknown string for the parameter ratio.') else: raise ValueError('Unknown parameter type for ratio.') def _validate_size_ngh_deprecation(self): "Private function to warn about the deprecation of size_ngh." # Announce deprecation if necessary if self.size_ngh is not None: warnings.warn('`size_ngh` will be replaced in version 0.4. Use' ' `n_neighbors` instead.', DeprecationWarning) self.n_neighbors = self.size_ngh def _validate_k_deprecation(self): """Private function to warn about deprecation of k in ADASYN""" if self.k is not None: warnings.warn('`k` will be replaced in version 0.4. Use' ' `n_neighbors` instead.', DeprecationWarning) self.n_neighbors = self.k def _validate_k_m_deprecation(self): """Private function to warn about deprecation of k and m""" if self.k is not None: warnings.warn('`k` will be replaced in version 0.4. Use' ' `k_neighbors` instead.', DeprecationWarning) self.k_neighbors = self.k if self.m is not None: warnings.warn('`m` will be replaced in version 0.4. Use' ' `m_neighbors` instead.', DeprecationWarning) self.m_neighbors = self.m @abstractmethod def _sample(self, X, y): """Resample the dataset. Parameters ---------- X : ndarray, shape (n_samples, n_features) Matrix containing the data which have to be sampled. y : ndarray, shape (n_samples, ) Corresponding label for each sample in X. Returns ------- X_resampled : ndarray, shape (n_samples_new, n_features) The array containing the resampled data.
y_resampled : ndarray, shape (n_samples_new) The corresponding label of `X_resampled` """ pass def __getstate__(self): """Prevent logger from being pickled.""" object_dictionary = self.__dict__.copy() del object_dictionary['logger'] return object_dictionary def __setstate__(self, dict): """Re-open the logger.""" logger = logging.getLogger(__name__) self.__dict__.update(dict) self.logger = logger
class RandomForestModel(six.with_metaclass(ABCMeta, BaseEnsemble)): @abstractmethod def __init__(self, base_estimator, n_estimators=100, estimator_params=tuple()): super(RandomForestModel, self).__init__( base_estimator=base_estimator, n_estimators=n_estimators, estimator_params=estimator_params) def get_model(self, X, y,sample_weight = None): # Validate or convert input data X = check_array(X, accept_sparse="csc", dtype=DTYPE) y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None) if issparse(X): # Pre-sort indices to avoid that each individual tree of the # ensemble sorts the indices. X.sort_indices() # Remap output n_samples, self.n_features_ = X.shape y = np.atleast_1d(y) if y.ndim == 1: y = np.reshape(y, (-1, 1)) self.n_outputs_ = y.shape[1] y, expanded_class_weight = self.check_y_class_wt(y) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) if expanded_class_weight is not None: sample_weight = expanded_class_weight # Check parameters self._validate_estimator() random_state = np.random.mtrand._rand # Free allocated memory, if any self.estimators_ = [] n_more_estimators = self.n_estimators - len(self.estimators_) trees = [] for i in range(n_more_estimators): tree = self._make_estimator(append=False, random_state=random_state) trees.append(tree) trees = Parallel(n_jobs=2, backend="threading")( delayed(build_trees)(t, X, y, sample_weight) for i, t in enumerate(trees)) # Collect newly grown trees self.estimators_.extend(trees) # Decapsulate classes_ attributes if hasattr(self, "classes_") and self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] return self def check_y_class_wt(self, y): return y, None def check_X(self, X): return self.estimators_[0]._validate_X_predict(X, check_input=True)
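`build_trees` is referenced but not defined in this excerpt; purely as an assumption, the sketch below shows what such a helper might look like: fit one tree on a bootstrap resample of the data and return it, so that `Parallel` can grow the trees independently.

import numpy as np
from sklearn.tree import DecisionTreeClassifier

def build_trees(tree, X, y, sample_weight=None, random_state=0):
    """Fit one tree on a bootstrap resample of (X, y) and return it (illustrative)."""
    rng = np.random.RandomState(random_state)
    n_samples = X.shape[0]
    indices = rng.randint(0, n_samples, n_samples)  # bootstrap draw with replacement
    sw = None if sample_weight is None else sample_weight[indices]
    tree.fit(X[indices], y[indices], sample_weight=sw)
    return tree

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])
fitted = build_trees(DecisionTreeClassifier(random_state=0), X, y)
print(fitted.predict(X))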
class BaseSearchCV(six.with_metaclass(ABCMeta, BaseEstimator, MetaEstimatorMixin)): """Base class for hyper parameter search with cross-validation.""" @abstractmethod def __init__(self, estimator, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score='raise'): self.scoring = scoring self.estimator = estimator self.n_jobs = n_jobs self.fit_params = fit_params if fit_params is not None else {} self.iid = iid self.refit = refit self.cv = cv self.verbose = verbose self.pre_dispatch = pre_dispatch self.error_score = error_score @property def _estimator_type(self): return self.estimator._estimator_type def score(self, X, y=None): """Returns the score on the given data, if the estimator has been refit. This uses the score defined by ``scoring`` where provided, and the ``best_estimator_.score`` method otherwise. Parameters ---------- X : array-like, shape = [n_samples, n_features] Input data, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] or [n_samples, n_output], optional Target relative to X for classification or regression; None for unsupervised learning. Returns ------- score : float Notes ----- * The long-standing behavior of this method changed in version 0.16. * It no longer uses the metric provided by ``estimator.score`` if the ``scoring`` parameter was set when fitting. """ if self.scorer_ is None: raise ValueError("No score function explicitly defined, " "and the estimator doesn't provide one %s" % self.best_estimator_) if self.scoring is not None and hasattr(self.best_estimator_, 'score'): warnings.warn("The long-standing behavior to use the estimator's " "score function in {0}.score has changed. The " "scoring parameter is now used." "".format(self.__class__.__name__), ChangedBehaviorWarning) return self.scorer_(self.best_estimator_, X, y) @if_delegate_has_method(delegate='estimator') def predict(self, X): """Call predict on the estimator with the best found parameters. Only available if ``refit=True`` and the underlying estimator supports ``predict``. Parameters ----------- X : indexable, length n_samples Must fulfill the input assumptions of the underlying estimator. """ return self.best_estimator_.predict(X) @if_delegate_has_method(delegate='estimator') def predict_proba(self, X): """Call predict_proba on the estimator with the best found parameters. Only available if ``refit=True`` and the underlying estimator supports ``predict_proba``. Parameters ----------- X : indexable, length n_samples Must fulfill the input assumptions of the underlying estimator. """ return self.best_estimator_.predict_proba(X) @if_delegate_has_method(delegate='estimator') def predict_log_proba(self, X): """Call predict_log_proba on the estimator with the best found parameters. Only available if ``refit=True`` and the underlying estimator supports ``predict_log_proba``. Parameters ----------- X : indexable, length n_samples Must fulfill the input assumptions of the underlying estimator. """ return self.best_estimator_.predict_log_proba(X) @if_delegate_has_method(delegate='estimator') def decision_function(self, X): """Call decision_function on the estimator with the best found parameters. Only available if ``refit=True`` and the underlying estimator supports ``decision_function``. Parameters ----------- X : indexable, length n_samples Must fulfill the input assumptions of the underlying estimator. 
""" return self.best_estimator_.decision_function(X) @if_delegate_has_method(delegate='estimator') def transform(self, X): """Call transform on the estimator with the best found parameters. Only available if the underlying estimator supports ``transform`` and ``refit=True``. Parameters ----------- X : indexable, length n_samples Must fulfill the input assumptions of the underlying estimator. """ return self.best_estimator_.transform(X) @if_delegate_has_method(delegate='estimator') def inverse_transform(self, Xt): """Call inverse_transform on the estimator with the best found parameters. Only available if the underlying estimator implements ``inverse_transform`` and ``refit=True``. Parameters ----------- Xt : indexable, length n_samples Must fulfill the input assumptions of the underlying estimator. """ return self.best_estimator_.transform(Xt) def _fit(self, X, y, labels, parameter_iterable): """Actual fitting, performing the search over parameters.""" estimator = self.estimator cv = check_cv(self.cv, y, classifier=is_classifier(estimator)) self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) X, y, labels = indexable(X, y, labels) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) n_splits = cv.get_n_splits(X, y, labels) if self.verbose > 0 and isinstance(parameter_iterable, Sized): n_candidates = len(parameter_iterable) print("Fitting {0} folds for each of {1} candidates, totalling" " {2} fits".format(n_splits, n_candidates, n_candidates * n_splits)) base_estimator = clone(self.estimator) pre_dispatch = self.pre_dispatch out = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, train, test, self.verbose, parameters, self.fit_params, return_parameters=True, error_score=self.error_score) for parameters in parameter_iterable for train, test in cv.split(X, y, labels)) # Out is a list of triplet: score, estimator, n_test_samples n_fits = len(out) scores = list() grid_scores = list() for grid_start in range(0, n_fits, n_splits): n_test_samples = 0 score = 0 all_scores = [] for this_score, this_n_test_samples, _, parameters in \ out[grid_start:grid_start + n_splits]: all_scores.append(this_score) if self.iid: this_score *= this_n_test_samples n_test_samples += this_n_test_samples score += this_score if self.iid: score /= float(n_test_samples) else: score /= float(n_splits) scores.append((score, parameters)) # TODO: shall we also store the test_fold_sizes? grid_scores.append(_CVScoreTuple( parameters, score, np.array(all_scores))) # Store the computed scores self.grid_scores_ = grid_scores # Find the best parameters by comparing on the mean validation score: # note that `sorted` is deterministic in the way it breaks ties best = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)[0] self.best_params_ = best.parameters self.best_score_ = best.mean_validation_score if self.refit: # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( **best.parameters) if y is not None: best_estimator.fit(X, y, **self.fit_params) else: best_estimator.fit(X, **self.fit_params) self.best_estimator_ = best_estimator return self
class RFClassifier(six.with_metaclass(ABCMeta, RandomForestModel, ClassifierMixin)): def __init__(self, base_estimator=DecisionTreeClassifier(), n_estimators=100): super(RFClassifier, self).__init__( base_estimator, n_estimators=n_estimators, estimator_params=("criterion", "max_depth", "min_samples_split", "min_samples_leaf", "min_weight_fraction_leaf", "max_features", "max_leaf_nodes", "min_impurity_decrease", "min_impurity_split")) self.criterion = "gini" self.max_depth = None self.min_samples_split = 2 self.min_samples_leaf = 1 self.min_weight_fraction_leaf = 0. self.max_features = "auto" self.max_leaf_nodes = None self.min_impurity_decrease = 0. self.min_impurity_split = None def check_y_class_wt(self, y): y = np.copy(y) self.classes_ = [] self.n_classes_ = [] y_store_unique_indices = np.zeros(y.shape, dtype=np.int) for k in range(self.n_outputs_): classes_k, y_store_unique_indices[:, k] = np.unique(y[:, k], return_inverse=True) self.classes_.append(classes_k) self.n_classes_.append(classes_k.shape[0]) y = y_store_unique_indices return y, None def get_predictions(self, X): proba = self.predict_proba(X) if self.n_outputs_ == 1: return self.classes_.take(np.argmax(proba, axis=1), axis=0) else: n_samples = proba[0].shape[0] predictions = np.zeros((n_samples, self.n_outputs_)) for k in range(self.n_outputs_): predictions[:, k] = self.classes_[k].take(np.argmax(proba[k], axis=1), axis=0) return predictions def predict_proba(self, X): # Check data X = self.check_X(X) # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, 2) # avoid storing the output of every estimator by summing them here all_proba = [np.zeros((X.shape[0], j), dtype=np.float64) for j in np.atleast_1d(self.n_classes_)] lock = threading.Lock() Parallel(n_jobs=n_jobs, backend="threading")( delayed(accumulate_prediction)(e.predict_proba, X, all_proba, lock) for e in self.estimators_) for proba in all_proba: proba /= len(self.estimators_) if len(all_proba) == 1: return all_proba[0] else: return all_proba
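`accumulate_prediction` is not defined in this excerpt; in spirit it adds one estimator's predicted probabilities into the shared `all_proba` buffers under a lock so the threading backend can sum in place. A hedged sketch of that idea:

import threading
import numpy as np

def accumulate_prediction(predict, X, out, lock):
    """Add predict(X) into the shared output buffers under `lock` (illustrative)."""
    prediction = predict(X)
    with lock:
        if len(out) == 1:
            out[0] += prediction
        else:
            for i in range(len(out)):
                out[i] += prediction[i]

# Toy usage: two fake "estimators" that return fixed class probabilities.
lock = threading.Lock()
all_proba = [np.zeros((2, 2))]
for proba in (np.array([[0.6, 0.4], [0.2, 0.8]]),
              np.array([[0.4, 0.6], [0.4, 0.6]])):
    accumulate_prediction(lambda X, p=proba: p, None, all_proba, lock)
print(all_proba[0] / 2)  # averaged probabilities; each row sums to 1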
class H2OBaseCrossValidator(six.with_metaclass(ABCMeta)): """Base class for H2O cross validation operations. All implementing subclasses should override ``get_n_splits`` and ``_iter_test_indices``. """ def __init__(self): pass def split(self, frame, y=None): """Generate indices to split data into training and test. Parameters ---------- frame : ``H2OFrame`` The h2o frame to split y : str, optional (default=None) The name of the column to stratify, if applicable. Returns ------- train : ndarray The training set indices for the split test : ndarray The testing set indices for that split """ frame = check_frame(frame, copy=False) indices = np.arange(frame.shape[0]) for test_index in self._iter_test_masks(frame, y): train_index = indices[np.logical_not(test_index)] test_index = indices[test_index] # h2o can't handle anything but lists... yield list(train_index), list(test_index) def _iter_test_masks(self, frame, y=None): """Generates boolean masks corresponding to the tests set. Parameters ---------- frame : H2OFrame The h2o frame to split y : string, optional (default=None) The column to stratify. Returns ------- test_mask : np.ndarray, shape=(n_samples,) The indices for the test split """ for test_index in self._iter_test_indices(frame, y): test_mask = np.zeros(frame.shape[0], dtype=np.bool) test_mask[test_index] = True yield test_mask def _iter_test_indices(self, frame, y=None): raise NotImplementedError('this method must be implemented by a subclass') @abstractmethod def get_n_splits(self): """Get the number of splits or folds for this instance of the cross validator. """ pass def __repr__(self): return _build_repr(self)
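To make the `_iter_test_indices` hook concrete, here is an illustrative generator (not the library's implementation) that yields plain K-fold test blocks over the frame's row indices; real subclasses would also handle shuffling and stratification.

import numpy as np

def iter_kfold_test_indices(n_obs, n_folds):
    """Yield contiguous, near-equal blocks of row indices, one block per fold."""
    indices = np.arange(n_obs)
    fold_sizes = np.full(n_folds, n_obs // n_folds, dtype=int)
    fold_sizes[:n_obs % n_folds] += 1  # spread the remainder over the first folds
    current = 0
    for size in fold_sizes:
        yield indices[current:current + size]
        current += size

print([fold.tolist() for fold in iter_kfold_test_indices(7, 3)])
# [[0, 1, 2], [3, 4], [5, 6]]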
class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)): """Mixin class for samplers with abstract method. Warning: This class should not be used directly. Use the derived classes instead. """ _estimator_type = 'sampler' def _check_X_y(self, X, y): """Private function to check that the X and y in fitting are the same as in sampling.""" X_hash, y_hash = hash_X_y(X, y) if self.X_hash_ != X_hash or self.y_hash_ != y_hash: raise RuntimeError("X and y need to be the same arrays that were fitted earlier.") def sample(self, X, y): """Resample the dataset. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- X_resampled : {ndarray, sparse matrix}, shape \ (n_samples_new, n_features) The array containing the resampled data. y_resampled : ndarray, shape (n_samples_new) The corresponding label of `X_resampled` """ # Check the consistency of X and y X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) check_is_fitted(self, 'ratio_') self._check_X_y(X, y) return self._sample(X, y) def fit_sample(self, X, y): """Fit the statistics and resample the data directly. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- X_resampled : {array-like, sparse matrix}, shape \ (n_samples_new, n_features) The array containing the resampled data. y_resampled : array-like, shape (n_samples_new,) The corresponding label of `X_resampled` """ return self.fit(X, y).sample(X, y) @abstractmethod def _sample(self, X, y): """Resample the dataset. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- X_resampled : {ndarray, sparse matrix}, shape \ (n_samples_new, n_features) The array containing the resampled data. y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` """ pass def __getstate__(self): """Prevent logger from being pickled.""" object_dictionary = self.__dict__.copy() del object_dictionary['logger'] return object_dictionary def __setstate__(self, dict): """Re-open the logger.""" logger = logging.getLogger(__name__) self.__dict__.update(dict) self.logger = logger
class H2OBaseShuffleSplit(six.with_metaclass(ABCMeta)): """Base class for H2OShuffleSplit and H2OStratifiedShuffleSplit. This is used for ``h2o_train_test_split`` in strategic train/test splits of H2OFrames. Implementing subclasses should override ``_iter_indices``. Parameters ---------- n_splits : int, optional (default=2) The number of folds or splits in the split test_size : float or int, optional (default=0.1) The ratio of observations for the test fold train_size : float or int, optional (default=None) The ratio of observations for the train fold random_state : int or RandomState, optional (default=None) The random state for duplicative purposes. """ def __init__(self, n_splits=2, test_size=0.1, train_size=None, random_state=None): _validate_shuffle_split_init(test_size, train_size) self.n_splits = n_splits self.test_size = test_size self.train_size = train_size self.random_state = random_state def split(self, frame, y=None): """Split the frame. Parameters ---------- frame : H2OFrame The frame to split y : string, optional (default=None) The column to stratify. """ for train, test in self._iter_indices(frame, y): yield train, test @abstractmethod def _iter_indices(self, frame, y): """Abstract method for iterating the indices. Parameters ---------- frame : H2OFrame The frame to split y : string, optional (default=None) The column to stratify. """ pass def get_n_splits(self): """Get the number of splits or folds for this instance of the shuffle split. """ return self.n_splits def __repr__(self): return _build_repr(self)
class StationaryCorrelation(with_metaclass(ABCMeta, object)): """ Base-class for stationary correlation models for Gaussian Processes. Stationary correlation models dependent only on the relative distance and not on the absolute positions of the respective datapoints. We can thus work internally solely on these distances. """ def __init__(self): pass def fit(self, X, nugget=10. * MACHINE_EPSILON): """ Fits the correlation model for training data X Parameters ---------- X : array_like, shape=(n_samples, n_features) An array of training datapoints at which observations were made, i.e., where the outputs y are known nugget : double or ndarray, optional The Gaussian Process nugget parameter The nugget is added to the diagonal of the assumed training covariance; in this way it acts as a Tikhonov regularization in the problem. In the special case of the squared exponential correlation function, the nugget mathematically represents the variance of the input values. Default assumes a nugget close to machine precision for the sake of robustness (nugget = 10. * MACHINE_EPSILON). """ self.X = X self.nugget = nugget self.n_samples = X.shape[0] # Calculate array with shape (n_eval, n_features) giving the # componentwise distances between locations x and x' at which the # correlation model should be evaluated. self.D, self.ij = l1_cross_differences(self.X) if (np.min(np.sum(self.D, axis=1)) == 0. and not isinstance(self, PureNugget)): raise Exception("Multiple input features cannot have the same" " value.") def __call__(self, theta, X=None): """ Compute correlation for given correlation parameter(s) theta. Parameters ---------- theta : array_like An array with giving the autocorrelation parameter(s). Dimensionality depends on the specific correlation model; often shape (1,) corresponds to an isotropic correlation model and shape (n_features,) to a anisotropic one. X : array_like, shape(n_eval, n_features) An array containing the n_eval query points whose correlation with the training datapoints shall be computed. If None, autocorrelation of the training datapoints is computed instead. Returns ------- r : array_like, shape=(n_eval, n_samples) if X != None (n_samples, n_samples) if X == None An array containing the values of the correlation model. """ theta = np.asarray(theta, dtype=np.float) if X is not None: # Get pairwise componentwise L1-differences to the input training # set d = X[:, np.newaxis, :] - self.X[np.newaxis, :, :] d = d.reshape((-1, X.shape[1])) else: # No external datapoints given; auto-correlation of training set # is used instead d = self.D if d.ndim > 1: n_features = d.shape[1] else: n_features = 1 # Compute the correlation for the respective correlation model (handled # by subclass) r = self._compute_corr(theta, d, n_features) if X is not None: # Convert to 2d matrix return r.reshape(-1, self.n_samples) else: # Auto-correlation computed only for upper triangular part of # matrix. Fill diagonal with 1+nugget and the lower triangular # by exploiting symmetry of matrix R = np.eye(self.n_samples) * (1. + self.nugget) R[self.ij[:, 0], self.ij[:, 1]] = r R[self.ij[:, 1], self.ij[:, 0]] = r return R def log_prior(self, theta): """ Returns the (log) prior probability of parameters theta. The prior is assumed to be uniform over the parameter space. NOTE: The returned quantity is an improper prior as its integral over the parameter space is not equal to 1. 
Parameters ---------- theta : array_like, shape=(1,) or (n_features,) An array with shape 1 (isotropic) or n_features (anisotropic) giving the autocorrelation parameter(s). Returns ------- log_p : float The (log) prior probability of parameters theta. An improper probability. """ return 0 @abstractmethod def _compute_corr(self, theta, d, n_features): """ Correlation for given pairwise, component-wise L1-differences.
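A sketch of the `_compute_corr` hook for the common squared-exponential case, where the correlation of a componentwise difference d is exp(-sum_l theta_l * d_l**2); the function name below is illustrative and not the library's.

import numpy as np

def squared_exponential_corr(theta, d, n_features):
    """r(d) = exp(-sum_l theta_l * d_l**2) for componentwise differences d."""
    theta = np.asarray(theta, dtype=float).reshape(1, -1)
    d = np.asarray(d, dtype=float).reshape(-1, n_features)
    return np.exp(-np.sum(theta * d ** 2, axis=1))

d = np.array([[0.0, 0.0], [0.5, 0.5], [1.0, 2.0]])
print(squared_exponential_corr([1.0], d, n_features=2))
# [1.0, exp(-0.5), exp(-5.0)]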
class _H2OBaseKFold(six.with_metaclass(ABCMeta, H2OBaseCrossValidator)): """Base class for KFold and Stratified KFold. Parameters ---------- n_folds : int The number of splits shuffle : bool Whether to shuffle indices random_state : int or RandomState The random state for the split """ @abstractmethod def __init__(self, n_folds, shuffle, random_state): if not isinstance(n_folds, numbers.Integral): raise ValueError('n_folds must be of Integral type. ' '%s of type %s was passed' % (n_folds, type(n_folds))) n_folds = int(n_folds) if n_folds <= 1: raise ValueError('k-fold cross-validation requires at least one ' 'train/test split by setting n_folds=2 or more') if shuffle not in [True, False]: raise TypeError('shuffle must be True or False. Got %s (type=%s)' % (str(shuffle), type(shuffle))) self.n_folds = n_folds self.shuffle = shuffle self.random_state = random_state @overrides(H2OBaseCrossValidator) def split(self, frame, y=None): """Split the frame. Parameters ---------- frame : H2OFrame The frame to split y : string, optional (default=None) The column to stratify. """ frame = check_frame(frame, copy=False) n_obs = frame.shape[0] if self.n_folds > n_obs: raise ValueError('Cannot have n_folds greater than n_obs') for train, test in super(_H2OBaseKFold, self).split(frame, y): yield train, test @overrides(H2OBaseCrossValidator) def get_n_splits(self): """Get the number of splits or folds. Returns ------- n_folds : int The number of folds """ return self.n_folds
class semiKMeans(six.with_metaclass(ABCMeta, BaseEstimator,ClusterMixin, TransformerMixin)): def __init__(self,maxiter=100,fixedprec=1e-9,verbose=False): self.maxiter=maxiter self.verbose=verbose self.fixedprec=fixedprec self.labels=None self.plattlr = None self.cluster_centers_=None def predict(self, X): """Predict the closest cluster each sample in X belongs to. In the vector quantization literature, `cluster_centers_` is called the code book and each value returned by `predict` is the index of the closest code in the code book. Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] New data to predict. Returns ------- labels : array, shape [n_samples,] Index of the cluster each sample belongs to. """ #check_is_fitted(self, 'cluster_centers_') X = self._check_test_data(X) x_squared_norms = row_norms(X, squared=True) return _labels_inertia(X, x_squared_norms, self.cluster_centers_)[0] def _check_test_data(self, X): X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES, warn_on_dtype=True) n_samples, n_features = X.shape expected_n_features = self.cluster_centers_.shape[1] if not n_features == expected_n_features: raise ValueError("Incorrect number of features. " "Got %d features, expected %d" % ( n_features, expected_n_features)) return X def fit_transform(self,texts,labels): # Initialize clusters with labeled data clust_names = [x for x, y in collections.Counter(labels).items() if y > 0] clust_names = sorted(clust_names)[1:] #print(clust_names) #centroids = np.zeros((len(clust_names),len(texts[1,:]))) centroids = np.zeros((len(clust_names),texts.shape[1])) for unique_name in clust_names: indices = [i for i, x in enumerate(labels) if x == unique_name] aux = texts[indices,:] #print(np.mean(aux,axis=0).shape) #print(centroids[clust_names.index(unique_name),:].shape) centroids[clust_names.index(unique_name),:] = np.mean(aux,axis=0) texts = mat(texts) centroids = mat(centroids) new_labels = labels cnt = 0 # Main loop while cnt<self.maxiter: cnt +=1 if self.verbose: print('Iter: '+str(cnt)) # Assign data to nearest centroid (cosine distance) dist = dot(texts,centroids.T)/linalg.norm(texts)/linalg.norm(centroids) n_lab = dist.argmax(axis=1) for ii in range(len(n_lab)): new_labels[ii] = clust_names[n_lab[ii,0]] # print [y for x, y in collections.Counter(new_labels).items() if y > 0] # Recalculate clusters new_centroids = np.zeros((len(clust_names),len(texts.T))) for unique_name in clust_names: indices = [i for i, x in enumerate(new_labels) if x == unique_name] if len(indices)>0: aux = texts[indices,:] new_centroids[clust_names.index(unique_name),:] = aux.mean(0) else: new_centroids[clust_names.index(unique_name),:] = centroids[clust_names.index(unique_name),:] # Check exit condition difference = np.power((centroids-new_centroids),2) if difference.sum()<self.fixedprec: break; else: self.labels = new_labels centroids = new_centroids self.cluster_centers_=new_centroids self.plattlr = LR() preds = self.predict(texts[labels!=-1,:]) self.plattlr.fit( preds.reshape( -1, 1 ), labels[labels!=-1]) return (labels) def predict_proba(self, X): """Compute probabilities of possible outcomes for samples in X. The model need to have probability information computed at training time: fit with attribute `probability` set to True. Parameters ---------- X : array-like, shape = [n_samples, n_features] Returns ------- T : array-like, shape = [n_samples, n_classes] Returns the probability of the sample for each class in the model. 
The columns correspond to the classes in sorted order, as they appear in the attribute `classes_`. """ preds = self.predict(X) return self.plattlr.predict_proba(preds.reshape(-1, 1))
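Note that the assignment step in `fit_transform` divides the dot-product matrix by two global (Frobenius) norms, so the per-row argmax is effectively taken over raw dot products rather than true cosine similarity. A short sketch of the per-row normalisation that cosine assignment usually needs:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

texts = np.array([[2.0, 0.0], [0.0, 3.0], [1.0, 3.0]])
centroids = np.array([[1.0, 0.0], [0.0, 1.0]])

sims = cosine_similarity(texts, centroids)  # shape (n_texts, n_centroids)
print(sims.argmax(axis=1))                  # nearest centroid per row: [0 1 1]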
class BaseSymbolic(six.with_metaclass(ABCMeta, BaseEstimator)): """Base class for symbolic regression / classification estimators. Warning: This class should not be used directly. Use derived classes instead. """ @abstractmethod def __init__(self, population_size=1000, hall_of_fame=None, n_components=None, generations=20, tournament_size=20, stopping_criteria=0.0, const_range=(-1., 1.), init_depth=(2, 6), init_method='half and half', function_set=('add', 'sub', 'mul', 'div'), metric='mean absolute error', parsimony_coefficient=0.001, p_crossover=0.9, p_subtree_mutation=0.01, p_hoist_mutation=0.01, p_point_mutation=0.01, p_point_replace=0.05, max_samples=1.0, warm_start=False, n_jobs=1, verbose=0, random_state=None): self.population_size = population_size self.hall_of_fame = hall_of_fame self.n_components = n_components self.generations = generations self.tournament_size = tournament_size self.stopping_criteria = stopping_criteria self.const_range = const_range self.init_depth = init_depth self.init_method = init_method self.function_set = function_set self.metric = metric self.parsimony_coefficient = parsimony_coefficient self.p_crossover = p_crossover self.p_subtree_mutation = p_subtree_mutation self.p_hoist_mutation = p_hoist_mutation self.p_point_mutation = p_point_mutation self.p_point_replace = p_point_replace self.max_samples = max_samples self.warm_start = warm_start self.n_jobs = n_jobs self.verbose = verbose self.random_state = random_state def _verbose_reporter(self, start_time=None, gen=None, population=None, fitness=None, length=None): """A report of the progress of the evolution process. Parameters ---------- start_time : float The start time for the current generation. gen : int The current generation (0 is the first naive random population). population : list The current population. fitness : list The current population's raw fitness. length : list The current population's lengths. """ if start_time is None: print('%4s|%-25s|%-42s|' % (' ', 'Population Average'.center(25), 'Best Individual'.center(42))) print('-' * 4 + ' ' + '-' * 25 + ' ' + '-' * 42 + ' ' + '-' * 10) header_fields = ('Gen', 'Length', 'Fitness', 'Length', 'Fitness', 'OOB Fitness', 'Time Left') print('%4s %8s %16s %8s %16s %16s %10s' % header_fields) else: # Estimate remaining time for run remaining_time = ((self.generations - gen - 1) * (time() - start_time) / float(gen + 1)) if remaining_time > 60: remaining_time = '{0:.2f}m'.format(remaining_time / 60.0) else: remaining_time = '{0:.2f}s'.format(remaining_time) # Find the current generation's best individual if self._metric.greater_is_better: best_program = population[np.argmax(fitness)] else: best_program = population[np.argmin(fitness)] oob_fitness = 'N/A' if self.max_samples < 1.0: oob_fitness = best_program.oob_fitness_ print('%4s %8s %16s %8s %16s %16s %10s' % (gen, np.round(np.mean(length), 2), np.mean(fitness), best_program.length_, best_program.raw_fitness_, oob_fitness, remaining_time)) def fit(self, X, y, sample_weight=None): """Fit the Genetic Program according to X, y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. sample_weight : array-like, shape = [n_samples], optional Weights applied to individual samples. Returns ------- self : object Returns self. 
""" random_state = check_random_state(self.random_state) # Check arrays X, y = check_X_y(X, y, y_numeric=True) _, self.n_features_ = X.shape hall_of_fame = self.hall_of_fame if hall_of_fame is None: hall_of_fame = self.population_size if hall_of_fame > self.population_size or hall_of_fame < 1: raise ValueError('hall_of_fame (%d) must be less than or equal to ' 'population_size (%d).' % (self.hall_of_fame, self.population_size)) n_components = self.n_components if n_components is None: n_components = hall_of_fame if n_components > hall_of_fame or n_components < 1: raise ValueError('n_components (%d) must be less than or equal to ' 'hall_of_fame (%d).' % (self.n_components, self.hall_of_fame)) self._function_set = [] for function in self.function_set: if isinstance(function, six.string_types): if function not in _function_map: raise ValueError('invalid function name %s found in ' '`function_set`.' % function) self._function_set.append(_function_map[function]) elif isinstance(function, _Function): self._function_set.append(function) else: raise ValueError('invalid type %s found in `function_set`.' % type(function)) if len(self._function_set) == 0: raise ValueError('No valid functions found in `function_set`.') # For point-mutation to find a compatible replacement node self._arities = {} for function in self._function_set: arity = function.arity self._arities[arity] = self._arities.get(arity, []) self._arities[arity].append(function) if isinstance(self.metric, _Fitness): self._metric = self.metric elif isinstance(self, RegressorMixin): if self.metric not in ('mean absolute error', 'mse', 'rmse'): raise ValueError('Unsupported metric: %s' % self.metric) else: self._metric = _fitness_map[self.metric] elif isinstance(self, TransformerMixin): if self.metric not in ('pearson', 'spearman'): raise ValueError('Unsupported metric: %s' % self.metric) else: self._metric = _fitness_map[self.metric] self._method_probs = np.array([ self.p_crossover, self.p_subtree_mutation, self.p_hoist_mutation, self.p_point_mutation ]) self._method_probs = np.cumsum(self._method_probs) if self._method_probs[-1] > 1: raise ValueError('The sum of p_crossover, p_subtree_mutation, ' 'p_hoist_mutation and p_point_mutation should ' 'total to 1.0 or less.') if self.init_method not in ('half and half', 'grow', 'full'): raise ValueError('Valid program initializations methods include ' '"grow", "full" and "half and half". Given %s.' 
% self.init_method) if (not isinstance(self.const_range, tuple) or len(self.const_range) != 2): raise ValueError('const_range should be a tuple with length two.') if (not isinstance(self.init_depth, tuple) or len(self.init_depth) != 2): raise ValueError('init_depth should be a tuple with length two.') if self.init_depth[0] > self.init_depth[1]: raise ValueError('init_depth should be in increasing numerical ' 'order: (min_depth, max_depth).') params = self.get_params() params['_metric'] = self._metric params['function_set'] = self._function_set params['arities'] = self._arities params['method_probs'] = self._method_probs if not self.warm_start or not hasattr(self, "_programs"): # Free allocated memory, if any self._programs = [] prior_generations = len(self._programs) n_more_generations = self.generations - prior_generations if n_more_generations < 0: raise ValueError('generations=%d must be larger or equal to ' 'len(_programs)=%d when warm_start==True' % (self.generations, len(self._programs))) elif n_more_generations == 0: fitness = [program.raw_fitness_ for program in self._programs[-1]] warn("Warm-start fitting without increasing n_estimators does not " "fit new trees.") if self.warm_start: # Generate and discard seeds that would have been produced on the # initial fit call. for i in range(len(self._programs)): _ = random_state.randint(MAX_INT, size=self.population_size) if self.verbose: # Print header fields self._verbose_reporter() start_time = time() for gen in range(prior_generations, self.generations): if gen == 0: parents = None else: parents = self._programs[gen - 1] # Parallel loop n_jobs, n_programs, starts = _partition_estimators( self.population_size, self.n_jobs) seeds = random_state.randint(MAX_INT, size=self.population_size) population = Parallel( n_jobs=n_jobs, verbose=int(self.verbose > 1))(delayed(_parallel_evolve)( n_programs[i], parents, X, y, sample_weight, seeds[starts[i]:starts[i + 1]], params) for i in range(n_jobs)) # Reduce, maintaining order across different n_jobs population = list(itertools.chain.from_iterable(population)) fitness = [program.raw_fitness_ for program in population] length = [program.length_ for program in population] parsimony_coefficient = None if self.parsimony_coefficient == 'auto': parsimony_coefficient = (np.cov(length, fitness)[1, 0] / np.var(length)) for program in population: program.fitness_ = program.fitness(parsimony_coefficient) self._programs.append(population) # Remove old programs that didn't make it into the new population. 
for old_gen in np.arange(gen, 0, -1): indices = [] for program in self._programs[old_gen]: if program is not None: for idx in program.parents: if 'idx' in idx: indices.append(program.parents[idx]) indices = set(indices) for idx in range(self.population_size): if idx not in indices: self._programs[old_gen - 1][idx] = None if self.verbose: self._verbose_reporter(start_time, gen, population, fitness, length) # Check for early stopping if self._metric.greater_is_better: best_fitness = fitness[np.argmax(fitness)] if best_fitness >= self.stopping_criteria: break else: best_fitness = fitness[np.argmin(fitness)] if best_fitness <= self.stopping_criteria: break if isinstance(self, RegressorMixin): # Find the best individual in the final generation self._program = self._programs[-1][np.argmin(fitness)] if isinstance(self, TransformerMixin): # Find the best individuals in the final generation fitness = np.array(fitness) hall_of_fame = fitness.argsort()[:self.hall_of_fame] evaluation = np.array([ gp.execute(X) for gp in [self._programs[-1][i] for i in hall_of_fame] ]) if self.metric == 'spearman': evaluation = np.apply_along_axis(rankdata, 1, evaluation) # Iteratively remove the worst individual of the worst pair with np.errstate(divide='ignore', invalid='ignore'): correlations = np.abs(np.corrcoef(evaluation)) np.fill_diagonal(correlations, 0.) components = list(range(self.hall_of_fame)) indices = list(range(self.hall_of_fame)) while len(components) > self.n_components: worst = np.unravel_index(np.argmax(correlations), correlations.shape) worst = worst[np.argmax(np.sum(correlations[worst, :], 1))] components.pop(worst) indices.remove(worst) correlations = correlations[:, indices][indices, :] indices = list(range(len(components))) self._best_programs = [ self._programs[-1][i] for i in hall_of_fame[components] ] return self
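The 'auto' parsimony coefficient used above, computed in isolation: the covariance between program length and raw fitness divided by the variance of the lengths (note that `np.cov` defaults to ddof=1 while `np.var` uses ddof=0, mirroring the code above).

import numpy as np

length = np.array([5.0, 9.0, 13.0, 21.0])  # program lengths
fitness = np.array([0.9, 0.7, 0.6, 0.4])   # raw fitness values

parsimony_coefficient = np.cov(length, fitness)[1, 0] / np.var(length)
print(round(parsimony_coefficient, 4))     # -0.04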
class BaseMultilayerPerceptron(six.with_metaclass(ABCMeta, BaseEstimator)): """Base class for MLP classification and regression. Warning: This class should not be used directly. Use derived classes instead. """ @abstractmethod def __init__(self, hidden_layer_sizes, activation, algorithm, alpha, batch_size, learning_rate, learning_rate_init, power_t, max_iter, loss, shuffle, random_state, tol, verbose, warm_start, momentum, nesterovs_momentum, early_stopping, validation_fraction, beta_1, beta_2, epsilon): self.activation = activation self.algorithm = algorithm self.alpha = alpha self.batch_size = batch_size self.learning_rate = learning_rate self.learning_rate_init = learning_rate_init self.power_t = power_t self.max_iter = max_iter self.loss = loss self.hidden_layer_sizes = hidden_layer_sizes self.shuffle = shuffle self.random_state = random_state self.tol = tol self.verbose = verbose self.warm_start = warm_start self.momentum = momentum self.nesterovs_momentum = nesterovs_momentum self.early_stopping = early_stopping self.validation_fraction = validation_fraction self.beta_1 = beta_1 self.beta_2 = beta_2 self.epsilon = epsilon def _unpack(self, packed_parameters): """Extract the coefficients and intercepts from packed_parameters.""" for i in range(self.n_layers_ - 1): start, end, shape = self._coef_indptr[i] self.coefs_[i] = np.reshape(packed_parameters[start:end], shape) start, end = self._intercept_indptr[i] self.intercepts_[i] = packed_parameters[start:end] def _forward_pass(self, activations, with_output_activation=True): """Perform a forward pass on the network by computing the values of the neurons in the hidden layers and the output layer. Parameters ---------- activations: list, length = n_layers - 1 The ith element of the list holds the values of the ith layer. with_output_activation : bool, default True If True, the output passes through the output activation function, which is either the softmax function or the logistic function """ hidden_activation = ACTIVATIONS[self.activation] # Iterate over the hidden layers for i in range(self.n_layers_ - 1): activations[i + 1] = safe_sparse_dot(activations[i], self.coefs_[i]) activations[i + 1] += self.intercepts_[i] # For the hidden layers if (i + 1) != (self.n_layers_ - 1): activations[i + 1] = hidden_activation(activations[i + 1]) # For the last layer if with_output_activation: output_activation = ACTIVATIONS[self.out_activation_] activations[i + 1] = output_activation(activations[i + 1]) return activations def _compute_loss_grad(self, layer, n_samples, activations, deltas, coef_grads, intercept_grads): """Compute the gradient of loss with respect to coefs and intercept for specified layer. This function does backpropagation for the specified one layer. """ coef_grads[layer] = safe_sparse_dot(activations[layer].T, deltas[layer]) coef_grads[layer] += (self.alpha * self.coefs_[layer]) coef_grads[layer] /= n_samples intercept_grads[layer] = np.mean(deltas[layer], 0) return coef_grads, intercept_grads def _loss_grad_lbfgs(self, packed_coef_inter, X, y, activations, deltas, coef_grads, intercept_grads): """Compute the MLP loss function and its corresponding derivatives with respect to the different parameters given in the initialization. Returned gradients are packed in a single vector so it can be used in l-bfgs Parameters ---------- packed_parameters : array-like A vector comprising the flattened coefficients and intercepts. X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data. 
y : array-like, shape (n_samples,) The target values. activations: list, length = n_layers - 1 The ith element of the list holds the values of the ith layer. deltas : list, length = n_layers - 1 The ith element of the list holds the difference between the activations of the i + 1 layer and the backpropagated error. More specifically, deltas are gradients of loss with respect to z in each layer, where z = wx + b is the value of a particular layer before passing through the activation function coef_grad : list, length = n_layers - 1 The ith element contains the amount of change used to update the coefficient parameters of the ith layer in an iteration. intercept_grads : list, length = n_layers - 1 The ith element contains the amount of change used to update the intercept parameters of the ith layer in an iteration. Returns ------- loss : float grad : array-like, shape (number of nodes of all layers,) """ self._unpack(packed_coef_inter) loss, coef_grads, intercept_grads = self._backprop( X, y, activations, deltas, coef_grads, intercept_grads) self.n_iter_ += 1 grad = _pack(coef_grads, intercept_grads) return loss, grad def _backprop(self, X, y, activations, deltas, coef_grads, intercept_grads): """Compute the MLP loss function and its corresponding derivatives with respect to each parameter: weights and bias vectors. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data. y : array-like, shape (n_samples,) The target values. activations: list, length = n_layers - 1 The ith element of the list holds the values of the ith layer. deltas : list, length = n_layers - 1 The ith element of the list holds the difference between the activations of the i + 1 layer and the backpropagated error. More specifically, deltas are gradients of loss with respect to z in each layer, where z = wx + b is the value of a particular layer before passing through the activation function coef_grad : list, length = n_layers - 1 The ith element contains the amount of change used to update the coefficient parameters of the ith layer in an iteration. intercept_grads : list, length = n_layers - 1 The ith element contains the amount of change used to update the intercept parameters of the ith layer in an iteration. 
Returns ------- loss : float coef_grads : list, length = n_layers - 1 intercept_grads : list, length = n_layers - 1 """ n_samples = X.shape[0] # Forward propagate activations = self._forward_pass(activations) # Get loss loss = LOSS_FUNCTIONS[self.loss](y, activations[-1]) # Add L2 regularization term to loss values = np.sum( np.array([np.dot(s.ravel(), s.ravel()) for s in self.coefs_])) loss += (0.5 * self.alpha) * values / n_samples # Backward propagate last = self.n_layers_ - 2 # The calculation of delta[last] here works with following # combinations of output activation and loss function: # sigmoid and binary cross entropy, softmax and categorical cross # entropy, and identity with squared loss diff = y - activations[-1] deltas[last] = -diff # Compute gradient for the last layer coef_grads, intercept_grads = self._compute_loss_grad( last, n_samples, activations, deltas, coef_grads, intercept_grads) # Iterate over the hidden layers for i in range(self.n_layers_ - 2, 0, -1): deltas[i - 1] = safe_sparse_dot(deltas[i], self.coefs_[i].T) derivative = DERIVATIVES[self.activation] deltas[i - 1] *= derivative(activations[i]) coef_grads, intercept_grads = self._compute_loss_grad( i - 1, n_samples, activations, deltas, coef_grads, intercept_grads) return loss, coef_grads, intercept_grads def _initialize(self, y, layer_units): # set all attributes, allocate weights etc for first call # Initialize parameters self.n_iter_ = 0 self.t_ = 0 self.n_outputs_ = y.shape[1] # Compute the number of layers self.n_layers_ = len(layer_units) # Output for regression if not isinstance(self, ClassifierMixin): self.out_activation_ = 'identity' # Output for multi class elif self.label_binarizer_.y_type_ == 'multiclass': self.out_activation_ = 'softmax' # Output for binary class and multi-label else: self.out_activation_ = 'logistic' if self.loss == 'log_loss': self.loss = 'binary_log_loss' # Initialize coefficient and intercept layers self.coefs_ = [] self.intercepts_ = [] for i in range(self.n_layers_ - 1): rng = check_random_state(self.random_state) coef_init, intercept_init = self._init_coef(layer_units[i], layer_units[i + 1], rng) self.coefs_.append(coef_init) self.intercepts_.append(intercept_init) if self.algorithm in _STOCHASTIC_ALGOS: self.loss_curve_ = [] self._no_improvement_count = 0 if self.early_stopping: self.validation_scores_ = [] self.best_validation_score_ = -np.inf else: self.best_loss_ = np.inf def _init_coef(self, fan_in, fan_out, rng): if self.activation == 'logistic': # Use the initialization method recommended by # Glorot et al. init_bound = np.sqrt(2. / (fan_in + fan_out)) elif self.activation == 'tanh': init_bound = np.sqrt(6. / (fan_in + fan_out)) elif self.activation == 'relu': init_bound = np.sqrt(6. / (fan_in + fan_out)) else: # this was caught earlier, just to make sure raise ValueError("Unknown activation function %s" % self.activation) coef_init = rng.uniform(-init_bound, init_bound, (fan_in, fan_out)) intercept_init = rng.uniform(-init_bound, init_bound, fan_out) return coef_init, intercept_init def _fit(self, X, y, incremental=False): # Make sure self.hidden_layer_sizes is a list hidden_layer_sizes = self.hidden_layer_sizes if not hasattr(hidden_layer_sizes, "__iter__"): hidden_layer_sizes = [hidden_layer_sizes] hidden_layer_sizes = list(hidden_layer_sizes) # Validate input parameters. self._validate_hyperparameters() if np.any(np.array(hidden_layer_sizes) <= 0): raise ValueError("hidden_layer_sizes must be > 0, got %s." 
% hidden_layer_sizes) X, y = self._validate_input(X, y, incremental) n_samples, n_features = X.shape # Ensure y is 2D if y.ndim == 1: y = y.reshape((-1, 1)) self.n_outputs_ = y.shape[1] layer_units = ([n_features] + hidden_layer_sizes + [self.n_outputs_]) if not hasattr(self, 'coefs_') or (not self.warm_start and not incremental): # First time training the model self._initialize(y, layer_units) # l-bfgs does not support mini-batches if self.algorithm == 'l-bfgs': batch_size = n_samples elif self.batch_size == 'auto': batch_size = min(200, n_samples) else: if self.batch_size < 1 or self.batch_size > n_samples: warnings.warn("Got `batch_size` less than 1 or larger than " "sample size. It is going to be clipped") batch_size = np.clip(self.batch_size, 1, n_samples) # Initialize lists activations = [X] activations.extend(np.empty((batch_size, n_fan_out)) for n_fan_out in layer_units[1:]) deltas = [np.empty_like(a_layer) for a_layer in activations] coef_grads = [np.empty((n_fan_in_, n_fan_out_)) for n_fan_in_, n_fan_out_ in zip(layer_units[:-1], layer_units[1:])] intercept_grads = [np.empty(n_fan_out_) for n_fan_out_ in layer_units[1:]] # Run the Stochastic optimization algorithm if self.algorithm in _STOCHASTIC_ALGOS: self._fit_stochastic(X, y, activations, deltas, coef_grads, intercept_grads, layer_units, incremental) # Run the LBFGS algorithm elif self.algorithm == 'l-bfgs': self._fit_lbfgs(X, y, activations, deltas, coef_grads, intercept_grads, layer_units) return self def _validate_hyperparameters(self): if not isinstance(self.shuffle, bool): raise ValueError("shuffle must be either True or False, got %s." % self.shuffle) if self.max_iter <= 0: raise ValueError("max_iter must be > 0, got %s." % self.max_iter) if self.alpha < 0.0: raise ValueError("alpha must be >= 0, got %s." % self.alpha) if (self.learning_rate in ["constant", "invscaling", "adaptive"] and self.learning_rate_init <= 0.0): raise ValueError("learning_rate_init must be > 0, got %s." % self.learning_rate) if self.momentum > 1 or self.momentum < 0: raise ValueError("momentum must be >= 0 and <= 1, got %s" % self.momentum) if not isinstance(self.nesterovs_momentum, bool): raise ValueError("nesterovs_momentum must be either True or False," " got %s." % self.nesterovs_momentum) if not isinstance(self.early_stopping, bool): raise ValueError("early_stopping must be either True or False," " got %s." % self.early_stopping) if self.validation_fraction < 0 or self.validation_fraction >= 1: raise ValueError("validation_fraction must be >= 0 and < 1, " "got %s" % self.validation_fraction) if self.beta_1 < 0 or self.beta_1 >= 1: raise ValueError("beta_1 must be >= 0 and < 1, got %s" % self.beta_1) if self.beta_2 < 0 or self.beta_2 >= 1: raise ValueError("beta_2 must be >= 0 and < 1, got %s" % self.beta_2) if self.epsilon <= 0.0: raise ValueError("epsilon must be > 0, got %s." % self.epsilon) # raise ValueError if not registered supported_activations = ['logistic', 'tanh', 'relu'] if self.activation not in supported_activations: raise ValueError("The activation '%s' is not supported. Supported " "activations are %s." % (self.activation, supported_activations)) if self.learning_rate not in ["constant", "invscaling", "adaptive"]: raise ValueError("learning rate %s is not supported. " % self.learning_rate) if self.algorithm not in _STOCHASTIC_ALGOS + ["l-bfgs"]: raise ValueError("The algorithm %s is not supported. 
" % self.algorithm) def _fit_lbfgs(self, X, y, activations, deltas, coef_grads, intercept_grads, layer_units): # Store meta information for the parameters self._coef_indptr = [] self._intercept_indptr = [] start = 0 # Save sizes and indices of coefficients for faster unpacking for i in range(self.n_layers_ - 1): n_fan_in, n_fan_out = layer_units[i], layer_units[i + 1] end = start + (n_fan_in * n_fan_out) self._coef_indptr.append((start, end, (n_fan_in, n_fan_out))) start = end # Save sizes and indices of intercepts for faster unpacking for i in range(self.n_layers_ - 1): end = start + layer_units[i + 1] self._intercept_indptr.append((start, end)) start = end # Run LBFGS packed_coef_inter = _pack(self.coefs_, self.intercepts_) if self.verbose is True or self.verbose >= 1: iprint = 1 else: iprint = -1 optimal_parameters, self.loss_, d = fmin_l_bfgs_b( x0=packed_coef_inter, func=self._loss_grad_lbfgs, maxfun=self.max_iter, iprint=iprint, pgtol=self.tol, args=(X, y, activations, deltas, coef_grads, intercept_grads)) self._unpack(optimal_parameters) def _fit_stochastic(self, X, y, activations, deltas, coef_grads, intercept_grads, layer_units, incremental): rng = check_random_state(self.random_state) if not incremental or not hasattr(self, '_optimizer'): params = self.coefs_ + self.intercepts_ if self.algorithm == 'sgd': self._optimizer = SGDOptimizer( params, self.learning_rate_init, self.learning_rate, self.momentum, self.nesterovs_momentum, self.power_t) elif self.algorithm == 'adam': self._optimizer = AdamOptimizer( params, self.learning_rate_init, self.beta_1, self.beta_2, self.epsilon) # early_stopping in partial_fit doesn't make sense early_stopping = self.early_stopping and not incremental if early_stopping: X, X_val, y, y_val = train_test_split( X, y, random_state=self.random_state, test_size=self.validation_fraction) if isinstance(self, ClassifierMixin): y_val = self.label_binarizer_.inverse_transform(y_val) else: X_val = None y_val = None n_samples = X.shape[0] if self.batch_size == 'auto': batch_size = min(200, n_samples) else: batch_size = np.clip(self.batch_size, 1, n_samples) try: for it in range(self.max_iter): X, y = shuffle(X, y, random_state=rng) accumulated_loss = 0.0 for batch_slice in gen_batches(n_samples, batch_size): activations[0] = X[batch_slice] batch_loss, coef_grads, intercept_grads = self._backprop( X[batch_slice], y[batch_slice], activations, deltas, coef_grads, intercept_grads) accumulated_loss += batch_loss * (batch_slice.stop - batch_slice.start) # update weights grads = coef_grads + intercept_grads self._optimizer.update_params(grads) self.n_iter_ += 1 self.loss_ = accumulated_loss / X.shape[0] self.t_ += n_samples self.loss_curve_.append(self.loss_) if self.verbose: print("Iteration %d, loss = %.8f" % (self.n_iter_, self.loss_)) # update no_improvement_count based on training loss or # validation score according to early_stopping self._update_no_improvement_count(early_stopping, X_val, y_val) # for learning rate that needs to be updated at iteration end self._optimizer.iteration_ends(self.t_) if self._no_improvement_count > 2: # not better than last two iterations by tol. # stop or decrease learning rate if early_stopping: msg = ("Validation score did not improve more than " "tol=%f for two consecutive epochs." % self.tol) else: msg = ("Training loss did not improve more than tol=%f" " for two consecutive epochs." 
% self.tol) is_stopping = self._optimizer.trigger_stopping( msg, self.verbose) if is_stopping: break else: self._no_improvement_count = 0 if incremental: break if self.n_iter_ == self.max_iter: warnings.warn('Stochastic Optimizer: Maximum iterations' ' reached and the optimization hasn\'t ' 'converged yet.' % (), ConvergenceWarning) except KeyboardInterrupt: pass if early_stopping: # restore best weights self.coefs_ = self._best_coefs self.intercepts_ = self._best_intercepts def _update_no_improvement_count(self, early_stopping, X_val, y_val): if early_stopping: # compute validation score, use that for stopping self.validation_scores_.append(self.score(X_val, y_val)) if self.verbose: print("Validation score: %f" % self.validation_scores_[-1]) # update best parameters # use validation_scores_, not loss_curve_ # let's hope no-one overloads .score with mse last_valid_score = self.validation_scores_[-1] if last_valid_score < (self.best_validation_score_ + self.tol): self._no_improvement_count += 1 else: self._no_improvement_count = 0 if last_valid_score > self.best_validation_score_: self.best_validation_score_ = last_valid_score self._best_coefs = [c.copy() for c in self.coefs_] self._best_intercepts = [i.copy() for i in self.intercepts_] else: if self.loss_curve_[-1] > self.best_loss_ - self.tol: self._no_improvement_count += 1 else: self._no_improvement_count = 0 if self.loss_curve_[-1] < self.best_loss_: self.best_loss_ = self.loss_curve_[-1] def fit(self, X, y): """Fit the model to data matrix X and target y. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data. y : array-like, shape (n_samples,) The target values. Returns ------- self : returns a trained MLP model. """ return self._fit(X, y, incremental=False) @property def partial_fit(self): """Fit the model to data matrix X and target y. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data. y : array-like, shape (n_samples,) The target values. Returns ------- self : returns a trained MLP model. """ if self.algorithm not in _STOCHASTIC_ALGOS: raise AttributeError("partial_fit is only available for stochastic" "optimization algorithms. %s is not" " stochastic" % self.algorithm) return self._partial_fit def _partial_fit(self, X, y, classes=None): return self._fit(X, y, incremental=True) def _decision_scores(self, X): """Predict using the trained model Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data. Returns ------- y_pred : array-like, shape (n_samples,) or (n_samples, n_outputs) The decision function of the samples for each class in the model. """ X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) # Make sure self.hidden_layer_sizes is a list hidden_layer_sizes = self.hidden_layer_sizes if not hasattr(hidden_layer_sizes, "__iter__"): hidden_layer_sizes = [hidden_layer_sizes] hidden_layer_sizes = list(hidden_layer_sizes) layer_units = [X.shape[1]] + hidden_layer_sizes + \ [self.n_outputs_] # Initialize layers activations = [X] for i in range(self.n_layers_ - 1): activations.append(np.empty((X.shape[0], layer_units[i + 1]))) # forward propagate self._forward_pass(activations, with_output_activation=False) y_pred = activations[-1] return y_pred
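# --- Illustrative sketch (not part of the class above) ---
# A minimal standalone NumPy sketch of the quantities _backprop works with for the
# identity-output / squared-loss pairing: the L2-regularized loss and the last-layer
# delta, deltas[last] = -(y - activations[-1]). Shapes and values are made up for
# illustration only; this does not call the estimator itself.
import numpy as np

def squared_loss_with_l2(y, y_pred, coefs, alpha):
    """Squared loss plus the 0.5 * alpha * sum(||W||^2) / n_samples penalty used above."""
    n_samples = y.shape[0]
    loss = ((y - y_pred) ** 2).mean() / 2
    l2 = 0.5 * alpha * sum(np.dot(W.ravel(), W.ravel()) for W in coefs) / n_samples
    return loss + l2

rng = np.random.RandomState(0)
y = rng.randn(5, 1)                          # targets, shape (n_samples, n_outputs)
y_pred = rng.randn(5, 1)                     # activations[-1] from a forward pass
coefs = [rng.randn(3, 4), rng.randn(4, 1)]   # per-layer weight matrices

loss = squared_loss_with_l2(y, y_pred, coefs, alpha=0.0001)
delta_last = -(y - y_pred)                   # gradient of the loss w.r.t. z of the output layer
print(loss, delta_last.shape)
# --- end sketch ---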
class IRAPSClassifier( six.with_metaclass(ABCMeta, _BaseFilter, BaseEstimator, RegressorMixin)): """ Extend the bases of both sklearn feature_selector and classifier. From sklearn BaseEstimator: get_params() set_params() From sklearn _BaseFilter: get_support() fit_transform(X) transform(X) From sklearn RegressorMixin: score(X, y): R2 New: predict(X) predict_label(X) get_signature() Properties: discretize_value Parameters ---------- iraps_core: object p_thres: float, threshold for p_values fc_thres: float, threshold for fold change or mean difference occurrence: float, occurrence rate selected by set of p_thres and fc_thres discretize: float, threshold of z_score to discretize target value memory: None, str or joblib.Memory object min_signature_features: int, the minimum number of features in a signature """ def __init__(self, iraps_core, p_thres=1e-4, fc_thres=0.1, occurrence=0.8, discretize=-1, memory=None, min_signature_features=1): self.iraps_core = iraps_core self.p_thres = p_thres self.fc_thres = fc_thres self.occurrence = occurrence self.discretize = discretize self.memory = memory self.min_signature_features = min_signature_features def fit(self, X, y): memory = check_memory(self.memory) cached_fit = memory.cache(_iraps_core_fit) iraps_core = clone(self.iraps_core) # allow pre-fitted iraps_core here if not hasattr(iraps_core, 'pvalues_'): iraps_core = cached_fit(iraps_core, X, y) self.iraps_core_ = iraps_core pvalues = as_float_array(iraps_core.pvalues_, copy=True) # why is np.nan here? pvalues[np.isnan(pvalues)] = np.finfo(pvalues.dtype).max fold_changes = as_float_array(iraps_core.fold_changes_, copy=True) fold_changes[np.isnan(fold_changes)] = 0.0 base_values = as_float_array(iraps_core.base_values_, copy=True) p_thres = self.p_thres fc_thres = self.fc_thres occurrence = self.occurrence mask_0 = np.zeros(pvalues.shape, dtype=np.int32) # mark p_values less than the threshold mask_0[pvalues <= p_thres] = 1 # mark fold_changes only when greater than the threshold mask_0[abs(fold_changes) < fc_thres] = 0 # count the occurrence and mask greater than the threshold counts = mask_0.sum(axis=0) occurrence_thres = int(occurrence * iraps_core.n_iter) mask = np.zeros(counts.shape, dtype=bool) mask[counts >= occurrence_thres] = 1 # generate signature fold_changes[mask_0 == 0] = 0.0 signature = fold_changes[:, mask].sum(axis=0) / counts[mask] signature = np.vstack((signature, base_values[:, mask].mean(axis=0))) # It's not clear whether min_signature_features could impact prediction # performance if signature is None\ or signature.shape[1] < self.min_signature_features: raise ValueError("The classifier got a None signature or the number " "of signature features is less than the minimum!") self.signature_ = np.asarray(signature) self.mask_ = mask # TODO: support other discretize methods: fixed value, upper # third quartile, etc. 
self.discretize_value = y.mean() + y.std() * self.discretize if iraps_core.negative_thres > iraps_core.positive_thres: self.less_is_positive = True else: self.less_is_positive = False return self def _get_support_mask(self): """ return the mask of feature selection indices """ check_is_fitted(self, 'mask_') return self.mask_ def get_signature(self): """ return the signature """ check_is_fitted(self, 'signature_') return self.signature_ def predict(self, X): """ compute the correlation coefficient with the IRAPS signature """ signature = self.get_signature() X = as_float_array(X) X_transformed = self.transform(X) - signature[1] corrcoef = np.array( [np.corrcoef(signature[0], e)[0][1] for e in X_transformed]) corrcoef[np.isnan(corrcoef)] = np.finfo(np.float32).min return corrcoef def predict_label(self, X, clf_cutoff=0.4): return self.predict(X) >= clf_cutoff
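# --- Illustrative sketch (not part of the class above) ---
# A hedged usage sketch of IRAPSClassifier wired to an IRAPSCore (defined later in
# this module). The synthetic data, injected signal, sample sizes, and thresholds
# are illustrative assumptions only; with purely random data (no informative
# features) fit() would raise because no feature passes the p-value / fold-change /
# occurrence filters.
import numpy as np

rng = np.random.RandomState(42)
y = rng.randn(300)                         # continuous response, roughly z-scored already
X = rng.randn(300, 50)
X[:, :5] += 2.0 * (y < -1)[:, None]        # inject a differential signal into 5 features

core = IRAPSCore(n_iter=100, positive_thres=-1, negative_thres=0,
                 n_jobs=1, random_state=42)
clf = IRAPSClassifier(core, p_thres=0.01, fc_thres=0.1,
                      occurrence=0.7, discretize=-1)
clf.fit(X, y)

signature = clf.get_signature()            # stacked fold changes and base values
scores = clf.predict(X)                    # correlation of each sample with the signature
labels = clf.predict_label(X, clf_cutoff=0.4)
# --- end sketch ---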
class BaseSpectral(six.with_metaclass(ABCMeta, BaseEstimator, BiclusterMixin)): """Base class for spectral biclustering.""" @abstractmethod def __init__(self, n_clusters=3, svd_method="randomized", n_svd_vecs=None, mini_batch=False, init="k-means++", n_init=10, n_jobs=1, random_state=None): self.n_clusters = n_clusters self.svd_method = svd_method self.n_svd_vecs = n_svd_vecs self.mini_batch = mini_batch self.init = init self.n_init = n_init self.n_jobs = n_jobs self.random_state = random_state def _check_parameters(self): legal_svd_methods = ('randomized', 'arpack') if self.svd_method not in legal_svd_methods: raise ValueError("Unknown SVD method: '{0}'. svd_method must be" " one of {1}.".format(self.svd_method, legal_svd_methods)) def fit(self, X): """Creates a biclustering for X. Parameters ---------- X : array-like, shape (n_samples, n_features) """ X = check_array(X, accept_sparse='csr', dtype=np.float64) check_array_ndim(X) self._check_parameters() self._fit(X) def _svd(self, array, n_components, n_discard): """Returns first `n_components` left and right singular vectors u and v, discarding the first `n_discard`. """ if self.svd_method == 'randomized': kwargs = {} if self.n_svd_vecs is not None: kwargs['n_oversamples'] = self.n_svd_vecs u, _, vt = randomized_svd(array, n_components, random_state=self.random_state, **kwargs) elif self.svd_method == 'arpack': u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs) if np.any(np.isnan(vt)): # some eigenvalues of A * A.T are negative, causing # sqrt() to be np.nan. This causes some vectors in vt # to be np.nan. _, v = eigsh(safe_sparse_dot(array.T, array), ncv=self.n_svd_vecs) vt = v.T if np.any(np.isnan(u)): _, u = eigsh(safe_sparse_dot(array, array.T), ncv=self.n_svd_vecs) assert_all_finite(u) assert_all_finite(vt) u = u[:, n_discard:] vt = vt[n_discard:] return u, vt.T def _k_means(self, data, n_clusters): if self.mini_batch: model = MiniBatchKMeans(n_clusters, init=self.init, n_init=self.n_init, random_state=self.random_state) else: model = KMeans(n_clusters, init=self.init, n_init=self.n_init, n_jobs=self.n_jobs, random_state=self.random_state) model.fit(data) centroid = model.cluster_centers_ labels = model.labels_ return centroid, labels
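# --- Illustrative sketch (not part of the class above) ---
# A standalone sketch of the singular-vector step in _svd: take the first
# `n_components` left/right singular vectors with randomized_svd (assumed importable
# from sklearn.utils.extmath, as used above) and discard the first `n_discard`,
# which is exactly the slicing _svd performs before returning.
import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
array = rng.rand(30, 20)
n_components, n_discard = 4, 1

u, _, vt = randomized_svd(array, n_components, random_state=0)
u = u[:, n_discard:]        # shape (30, n_components - n_discard)
v = vt[n_discard:].T        # shape (20, n_components - n_discard)
print(u.shape, v.shape)
# --- end sketch ---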
class IRAPSCore(six.with_metaclass(ABCMeta, BaseEstimator)): """ Base class of IRAPSClassifier From sklearn BaseEstimator: get_params() set_params() Parameters ---------- n_iter : int Number of stochastic sampling iterations. positive_thres : float z_score threshold to discretize positive target values negative_thres : float z_score threshold to discretize negative target values verbose : int 0 or greater, if not 0, print progress n_jobs : int, default=1 The number of CPUs to use to do the computation. pre_dispatch : int, or string. Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched than CPUs can process. This parameter can be: - None, in which case all the jobs are immediately created and spawned. Use this for lightweight and fast-running jobs, to avoid delays due to on-demand spawning of the jobs - An int, giving the exact number of total jobs that are spawned - A string, giving an expression as a function of n_jobs, as in '2*n_jobs' random_state : int or None """ def __init__(self, n_iter=1000, positive_thres=-1, negative_thres=0, verbose=0, n_jobs=1, pre_dispatch='2*n_jobs', random_state=None): """ IRAPS is geared towards general anomaly detection. It compares positive_thres with negative_thres and decides which portion is the positive target. e.g.: (positive_thres=-1, negative_thres=0) => positive = Z_score of target < -1 (positive_thres=1, negative_thres=0) => positive = Z_score of target > 1 Note: The positive targets here are always the abnormal minority group. """ self.n_iter = n_iter self.positive_thres = positive_thres self.negative_thres = negative_thres self.verbose = verbose self.n_jobs = n_jobs self.pre_dispatch = pre_dispatch self.random_state = random_state def fit(self, X, y): """ X: array-like (n_samples x n_features) y: 1-d array-like (n_samples) """ X, y = check_X_y(X, y, ['csr', 'csc'], multi_output=False) def _stochastic_sampling(X, y, random_state=None, positive_thres=-1, negative_thres=0): # each iteration selects a random-sized random subset of # training samples. This is somewhat different from the original # IRAPS method, but the effect is almost the same. SAMPLE_SIZE = [0.25, 0.75] n_samples = X.shape[0] if random_state is None: n_select = random.randint(int(n_samples * SAMPLE_SIZE[0]), int(n_samples * SAMPLE_SIZE[1])) index = random.sample(list(range(n_samples)), n_select) else: n_select = random.Random(random_state).randint( int(n_samples * SAMPLE_SIZE[0]), int(n_samples * SAMPLE_SIZE[1])) index = random.Random(random_state).sample( list(range(n_samples)), n_select) X_selected, y_selected = X[index], y[index] # Splitting by z_scores. y_selected = (y_selected - y_selected.mean()) / y_selected.std() if positive_thres < negative_thres: X_selected_positive = X_selected[y_selected < positive_thres] X_selected_negative = X_selected[y_selected > negative_thres] else: X_selected_positive = X_selected[y_selected > positive_thres] X_selected_negative = X_selected[y_selected < negative_thres] # For every iteration, at least 5 responders are selected if X_selected_positive.shape[0] < 5: warnings.warn("Warning: fewer than 5 positives were selected!") return # p_values _, p = ttest_ind(X_selected_positive, X_selected_negative, axis=0, equal_var=False) # fold_change == mean change? 
# TODO implement other normalization method positive_mean = X_selected_positive.mean(axis=0) negative_mean = X_selected_negative.mean(axis=0) mean_change = positive_mean - negative_mean # mean_change = np.select( # [positive_mean > negative_mean, # positive_mean < negative_mean], # [positive_mean / negative_mean, # -negative_mean / positive_mean]) # mean_change could be adjusted by power of 2 # mean_change = 2**mean_change \ # if mean_change>0 else -2**abs(mean_change) return p, mean_change, negative_mean parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=self.pre_dispatch) if self.random_state is None: res = parallel( delayed(_stochastic_sampling)( X, y, random_state=None, positive_thres=self.positive_thres, negative_thres=self.negative_thres) for i in range(self.n_iter)) else: res = parallel( delayed(_stochastic_sampling)( X, y, random_state=seed, positive_thres=self.positive_thres, negative_thres=self.negative_thres) for seed in range(self.random_state, self.random_state + self.n_iter)) res = [_ for _ in res if _] if len(res) < 50: raise ValueError("too few (%d) valid feature lists " "were generated!" % len(res)) pvalues = np.vstack([x[0] for x in res]) fold_changes = np.vstack([x[1] for x in res]) base_values = np.vstack([x[2] for x in res]) self.pvalues_ = np.asarray(pvalues) self.fold_changes_ = np.asarray(fold_changes) self.base_values_ = np.asarray(base_values) return self
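# --- Illustrative sketch (not part of the class above) ---
# A standalone numeric sketch of the core logic of one _stochastic_sampling
# iteration: z-score the targets, split samples into positive/negative groups by the
# two thresholds, then run Welch's t-test and take the per-feature mean difference.
# Data and thresholds are illustrative; the real method also subsamples the rows.
import numpy as np
from scipy.stats import ttest_ind

rng = np.random.RandomState(0)
X = rng.randn(100, 10)
y = rng.randn(100)
positive_thres, negative_thres = -1, 0

z = (y - y.mean()) / y.std()
if positive_thres < negative_thres:
    X_pos = X[z < positive_thres]          # abnormal minority group
    X_neg = X[z > negative_thres]
else:
    X_pos = X[z > positive_thres]
    X_neg = X[z < negative_thres]

_, p = ttest_ind(X_pos, X_neg, axis=0, equal_var=False)   # per-feature p-values
mean_change = X_pos.mean(axis=0) - X_neg.mean(axis=0)     # "fold change" used above
print(p.shape, mean_change.shape)
# --- end sketch ---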
class BaseBagging(with_metaclass(ABCMeta, BaseEnsemble)): """Base class for Bagging meta-estimator. Warning: This class should not be used directly. Use derived classes instead. """ @abstractmethod def __init__(self, base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=1, random_state=None, verbose=0): super(BaseBagging, self).__init__(base_estimator=base_estimator, n_estimators=n_estimators) self.max_samples = max_samples self.max_features = max_features self.bootstrap = bootstrap self.bootstrap_features = bootstrap_features self.oob_score = oob_score self.warm_start = warm_start self.n_jobs = n_jobs self.random_state = random_state self.verbose = verbose def fit(self, X, y): """Build a Bagging ensemble of estimators from the training set (X, y). Parameters ---------- X : {array-like, sparse matrix} of shape = [n_samples, n_features] The training input samples. Sparse matrices are accepted only if they are supported by the base estimator. y : array-like, shape = [n_samples] The target values (class labels in classification, real numbers in regression). Returns ------- self : object Returns self. """ random_state = check_random_state(self.random_state) # Convert data X, y = check_X_y(X, y, ['csr', 'csc']) # Remap output n_samples, self.n_features_ = X.shape y = self._validate_y(y) # Check parameters self._validate_estimator() if isinstance(self.max_samples, (numbers.Integral, np.integer)): max_samples = self.max_samples else: # float max_samples = int(self.max_samples * X.shape[0]) if not (0 < max_samples <= X.shape[0]): raise ValueError("max_samples must be in (0, n_samples]") if isinstance(self.max_features, (numbers.Integral, np.integer)): max_features = self.max_features else: # float max_features = int(self.max_features * self.n_features_) if not (0 < max_features <= self.n_features_): raise ValueError("max_features must be in (0, n_features]") if not self.bootstrap and self.oob_score: raise ValueError("Out of bag estimation only available" " if bootstrap=True") if self.warm_start and self.oob_score: raise ValueError("Out of bag estimate only available" " if warm_start=False") if hasattr(self, "oob_score_") and self.warm_start: del self.oob_score_ if not self.warm_start or len(self.estimators_) == 0: # Free allocated memory, if any self.estimators_ = [] self.estimators_samples_ = [] self.estimators_features_ = [] n_more_estimators = self.n_estimators - len(self.estimators_) if n_more_estimators < 0: raise ValueError('n_estimators=%d must be larger or equal to ' 'len(estimators_)=%d when warm_start==True' % (self.n_estimators, len(self.estimators_))) elif n_more_estimators == 0: warn("Warm-start fitting without increasing n_estimators does not " "fit new trees.") return self # Parallel loop n_jobs, n_estimators, starts = _partition_estimators( n_more_estimators, self.n_jobs) # Advance random state to state after training # the first n_estimators if self.warm_start and len(self.estimators_) > 0: random_state.randint(MAX_INT, size=len(self.estimators_)) seeds = random_state.randint(MAX_INT, size=n_more_estimators) all_results = Parallel(n_jobs=n_jobs, verbose=self.verbose)( # TEF: changed following call to balanced procedure: delayed(_parallel_build_balanced_estimators)( n_estimators[i], self, X, y, seeds[starts[i]:starts[i + 1]], verbose=self.verbose) for i in range(n_jobs)) # Reduce self.estimators_ += list( itertools.chain.from_iterable(t[0] for t in all_results)) 
self.estimators_samples_ += list( itertools.chain.from_iterable(t[1] for t in all_results)) self.estimators_features_ += list( itertools.chain.from_iterable(t[2] for t in all_results)) if self.oob_score: self._set_oob_score(X, y) return self @abstractmethod def _set_oob_score(self, X, y): """Calculate out of bag predictions and score.""" def _validate_y(self, y): # Default implementation return column_or_1d(y, warn=True)
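# --- Illustrative sketch (not part of the class above) ---
# A small sketch of how BaseBagging.fit resolves `max_samples` and `max_features`
# when they are given as floats (fractions) rather than ints. The helper name and
# example numbers are illustrative assumptions, not part of the class API.
import numbers
import numpy as np

def resolve_fraction(value, total):
    """Return an absolute count from either an int or a fraction of `total`."""
    if isinstance(value, (numbers.Integral, np.integer)):
        count = value
    else:                                    # float interpreted as a fraction
        count = int(value * total)
    if not (0 < count <= total):
        raise ValueError("value must resolve to a count in (0, total]")
    return count

n_samples, n_features = 150, 8
print(resolve_fraction(1.0, n_samples))      # 150 -> draw bootstrap over all samples
print(resolve_fraction(0.5, n_features))     # 4   -> half of the features per estimator
# --- end sketch ---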
class BaseDecisionTree(six.with_metaclass(ABCMeta, BaseEstimator, _LearntSelectorMixin)): """Base class for decision trees. Warning: This class should not be used directly. Use derived classes instead. """ @abstractmethod def __init__(self, criterion, splitter, max_depth, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_features, max_leaf_nodes, random_state, output_transformer): self.criterion = criterion self.splitter = splitter self.max_depth = max_depth self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_features = max_features self.random_state = random_state self.max_leaf_nodes = max_leaf_nodes self.output_transformer = output_transformer self.n_features_ = None self.n_outputs_ = None self.classes_ = None self.n_classes_ = None self.tree_ = None self.max_features_ = None self.output_transformer_ = None def fit(self, X, y, sample_weight=None, check_input=True): """Build a decision tree from the training set (X, y). Parameters ---------- X : array-like, shape = [n_samples, n_features] The training input samples. Use ``dtype=np.float32`` for maximum efficiency. y : array-like, shape = [n_samples] or [n_samples, n_outputs] The target values (class labels in classification, real numbers in regression). In the regression case, use ``dtype=np.float64`` and ``order='C'`` for maximum efficiency. sample_weight : array-like, shape = [n_samples] or None Sample weights. If None, then samples are equally weighted. Splits that would create child nodes with net zero or negative weight are ignored while searching for a split in each node. In the case of classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. check_input : boolean, (default=True) Allow to bypass several input checking. Don't use this parameter unless you know what you do. Returns ------- self : object Returns self. """ random_state = check_random_state(self.random_state) # Convert data if check_input: X = check_array(X, dtype=DTYPE) # Determine output settings n_samples, self.n_features_ = X.shape is_classification = isinstance(self, ClassifierMixin) y = np.atleast_1d(y) if y.ndim == 1: # reshape is necessary to preserve the data contiguity against vs # [:, np.newaxis] that does not. y = np.reshape(y, (-1, 1)) self.n_outputs_ = y.shape[1] if is_classification: y = np.copy(y) self.classes_ = [] self.n_classes_ = [] for k in xrange(self.n_outputs_): classes_k, y[:, k] = np.unique(y[:, k], return_inverse=True) self.classes_.append(classes_k) self.n_classes_.append(classes_k.shape[0]) else: self.classes_ = [None] * self.n_outputs_ self.n_classes_ = [1] * self.n_outputs_ self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) # Check parameters max_depth = ((2 ** 31) - 1 if self.max_depth is None else self.max_depth) max_leaf_nodes = (-1 if self.max_leaf_nodes is None else self.max_leaf_nodes) if isinstance(self.max_features, six.string_types): if self.max_features == "auto": if is_classification: max_features = max(1, int(np.sqrt(self.n_features_))) else: max_features = self.n_features_ elif self.max_features == "sqrt": max_features = max(1, int(np.sqrt(self.n_features_))) elif self.max_features == "log2": max_features = max(1, int(np.log2(self.n_features_))) else: raise ValueError( 'Invalid value for max_features. 
Allowed string ' 'values are "auto", "sqrt" or "log2".') elif self.max_features is None: max_features = self.n_features_ elif isinstance(self.max_features, (numbers.Integral, np.integer)): max_features = self.max_features else: # float if self.max_features > 0.0: max_features = max(1, int(self.max_features * self.n_features_)) else: max_features = 0 self.max_features_ = max_features if len(y) != n_samples: raise ValueError("Number of labels=%d does not match " "number of samples=%d" % (len(y), n_samples)) if self.min_samples_split <= 0: raise ValueError("min_samples_split must be greater than zero.") if self.min_samples_leaf <= 0: raise ValueError("min_samples_leaf must be greater than zero.") if not 0 <= self.min_weight_fraction_leaf <= 0.5: raise ValueError("min_weight_fraction_leaf must be in [0, 0.5]") if max_depth <= 0: raise ValueError("max_depth must be greater than zero. ") if not (0 < max_features <= self.n_features_): raise ValueError("max_features must be in (0, n_features]") if not isinstance(max_leaf_nodes, (numbers.Integral, np.integer)): raise ValueError("max_leaf_nodes must be an integral number but was " "%r" % max_leaf_nodes) if -1 < max_leaf_nodes < 2: raise ValueError(("max_leaf_nodes {0} must be either smaller than " "0 or larger than 1").format(max_leaf_nodes)) if sample_weight is not None: if (getattr(sample_weight, "dtype", None) != DOUBLE or not sample_weight.flags.contiguous): sample_weight = np.ascontiguousarray( sample_weight, dtype=DOUBLE) if len(sample_weight.shape) > 1: raise ValueError("Sample weights array has more " "than one dimension: %d" % len(sample_weight.shape)) if len(sample_weight) != n_samples: raise ValueError("Number of weights=%d does not match " "number of samples=%d" % (len(sample_weight), n_samples)) # Set min_weight_leaf from min_weight_fraction_leaf if self.min_weight_fraction_leaf != 0. and sample_weight is not None: min_weight_leaf = (self.min_weight_fraction_leaf * np.sum(sample_weight)) else: min_weight_leaf = 0. # Set min_samples_split sensibly min_samples_split = max(self.min_samples_split, 2 * self.min_samples_leaf) # Build tree criterion = self.criterion if not isinstance(criterion, Criterion): if is_classification: criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self.n_classes_) else: criterion = CRITERIA_REG[self.criterion](self.n_outputs_) splitter = self.splitter if not isinstance(self.splitter, Splitter): splitter = SPLITTERS[self.splitter](criterion, self.max_features_, self.min_samples_leaf, min_weight_leaf, random_state) #### Added to support transforming the output space if self.output_transformer is not None: self.output_transformer_ = clone(self.output_transformer) # Set a random_state on the transformer, if it's not already set try: self.output_transformer_.set_params( random_state=check_random_state(self.random_state)) except ValueError: # The transformer might not have a random_state parameter even # though the superclass does pass y_transf = self.output_transformer_.fit_transform(y) if y_transf.ndim == 1: # reshape is necessary to preserve the data contiguity against vs # [:, np.newaxis] that does not. 
y_transf = y_transf.reshape((-1, 1)) if (y_transf.dtype != DOUBLE or not y_transf.flags.contiguous): y_transf = np.ascontiguousarray(y_transf, dtype=DOUBLE) base_splitter = splitter splitter = SplitterTransformer(criterion, self.max_features_, self.min_samples_leaf, min_weight_leaf, random_state) splitter.set_output_space(base_splitter, y_transf) #### --- self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_) # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise if max_leaf_nodes < 0: builder = DepthFirstTreeBuilder(splitter, min_samples_split, self.min_samples_leaf, min_weight_leaf, max_depth) else: builder = BestFirstTreeBuilder(splitter, min_samples_split, self.min_samples_leaf, min_weight_leaf, max_depth, max_leaf_nodes) builder.build(self.tree_, X, y, sample_weight) if self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] return self def predict(self, X): """Predict class or regression value for X. For a classification model, the predicted class for each sample in X is returned. For a regression model, the predicted value based on X is returned. Parameters ---------- X : array-like of shape = [n_samples, n_features] The input samples. Returns ------- y : array of shape = [n_samples] or [n_samples, n_outputs] The predicted classes, or the predict values. """ if getattr(X, "dtype", None) != DTYPE or X.ndim != 2: X = check_array(X, dtype=DTYPE) n_samples, n_features = X.shape if self.tree_ is None: raise Exception("Tree not initialized. Perform a fit first") if self.n_features_ != n_features: raise ValueError("Number of features of the model must " " match the input. Model n_features is %s and " " input n_features is %s " % (self.n_features_, n_features)) proba = self.tree_.predict(X) # Classification if isinstance(self, ClassifierMixin): if self.n_outputs_ == 1: return self.classes_.take(np.argmax(proba, axis=1), axis=0) else: predictions = np.zeros((n_samples, self.n_outputs_)) for k in xrange(self.n_outputs_): predictions[:, k] = self.classes_[k].take( np.argmax(proba[:, k], axis=1), axis=0) return predictions # Regression else: if self.n_outputs_ == 1: return proba[:, 0] else: return proba[:, :, 0] @property def feature_importances_(self): """Return the feature importances. The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance. Returns ------- feature_importances_ : array, shape = [n_features] """ if self.tree_ is None: raise ValueError("Estimator not fitted, " "call `fit` before `feature_importances_`.") return self.tree_.compute_feature_importances()
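# --- Illustrative sketch (not part of the class above) ---
# A standalone sketch of the `max_features` resolution performed in
# BaseDecisionTree.fit above; the helper name and example values are illustrative
# assumptions, not part of the class.
import numbers
import numpy as np

def resolve_max_features(max_features, n_features, is_classification):
    if isinstance(max_features, str):
        if max_features == "auto":
            return (max(1, int(np.sqrt(n_features)))
                    if is_classification else n_features)
        if max_features == "sqrt":
            return max(1, int(np.sqrt(n_features)))
        if max_features == "log2":
            return max(1, int(np.log2(n_features)))
        raise ValueError('Invalid value for max_features. Allowed string '
                         'values are "auto", "sqrt" or "log2".')
    if max_features is None:
        return n_features
    if isinstance(max_features, (numbers.Integral, np.integer)):
        return max_features
    # float interpreted as a fraction of the available features
    return max(1, int(max_features * n_features)) if max_features > 0.0 else 0

print(resolve_max_features("auto", 64, is_classification=True))   # 8
print(resolve_max_features(0.25, 64, is_classification=False))    # 16
# --- end sketch ---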