Example #1
class BaseNeuralNetwork(six.with_metaclass(ABCMeta, BaseEstimator)):
    """Base class for neural networks.

    Warning: This class should not be used directly.
    Use derived classes instead.
    """
    @abstractmethod
    def __init__(self,
                 hidden_nodes=[10],
                 activation='relu',
                 algorithm='random_hill_climb',
                 max_iters=100,
                 bias=True,
                 learning_rate=0.1,
                 early_stopping=False,
                 clip_max=1e+10,
                 schedule=GeomDecay(),
                 pop_size=200,
                 mutation_prob=0.1,
                 max_attempts=10,
                 restarts=0,
                 curve=True):

        self.hidden_nodes = hidden_nodes

        self.activation_dict = {
            'identity': identity,
            'relu': relu,
            'sigmoid': sigmoid,
            'tanh': tanh
        }
        self.activation = activation

        self.algorithm = algorithm
        self.max_iters = max_iters
        self.bias = bias
        self.learning_rate = learning_rate
        self.early_stopping = early_stopping
        self.clip_max = clip_max
        self.schedule = schedule
        self.pop_size = pop_size
        self.mutation_prob = mutation_prob
        self.max_attempts = max_attempts
        self.curve = curve
        self.restarts = restarts

        self.node_list = []
        self.fitted_weights = []
        self.loss = np.inf
        self.output_activation = None
        self.predicted_probs = []
        self.fitness_curve = []

    @abstractmethod
    def _is_classifier(self):
        pass

    def _validate(self):
        if (not isinstance(self.max_iters, int) and self.max_iters != np.inf
                and not self.max_iters.is_integer()) or (self.max_iters < 0):
            raise Exception("""max_iters must be a positive integer.""")

        if not isinstance(self.bias, bool):
            raise Exception("""bias must be True or False.""")

        if self.learning_rate <= 0:
            raise Exception("""learning_rate must be greater than 0.""")

        if not isinstance(self.early_stopping, bool):
            raise Exception("""early_stopping must be True or False.""")

        if self.clip_max <= 0:
            raise Exception("""clip_max must be greater than 0.""")

        if (not isinstance(self.max_attempts, int) and
                not self.max_attempts.is_integer()) or (self.max_attempts < 0):
            raise Exception("""max_attempts must be a positive integer.""")

        if self.pop_size < 0:
            raise Exception("""pop_size must be a positive integer.""")
        elif not isinstance(self.pop_size, int):
            if self.pop_size.is_integer():
                self.pop_size = int(self.pop_size)
            else:
                raise Exception("""pop_size must be a positive integer.""")

        if (self.mutation_prob < 0) or (self.mutation_prob > 1):
            raise Exception("""mutation_prob must be between 0 and 1.""")

        if (self.activation is None
                or self.activation not in self.activation_dict):
            raise Exception("""Activation function must be one of: 'identity',
                    'relu', 'sigmoid' or 'tanh'.""")

        if self.algorithm not in ['random_hill_climb', 'simulated_annealing',
                                  'genetic_alg', 'gradient_descent']:
            raise Exception("""Algorithm must be one of: 'random_hill_climb',
                    'simulated_annealing', 'genetic_alg', 'gradient_descent'.""")

    def fit(self, X, y=None, init_weights=None):
        """Fit neural network to data.

        Parameters
        ----------
        X: array
            Numpy array containing feature dataset with each row
            representing a single observation.

        y: array
            Numpy array containing data labels. Length must be same as
            length of X.

        init_weights: array, default: None
            Numpy array containing starting weights for algorithm.
            If :code:`None`, then a random state is used.
        """
        self._validate()

        # Make sure y is an array and not a list
        y = np.array(y)

        # Convert y to 2D if necessary
        if len(np.shape(y)) == 1:
            y = np.reshape(y, [len(y), 1])

        # Verify X and y are the same length
        if not np.shape(X)[0] == np.shape(y)[0]:
            raise Exception('The length of X and y must be equal.')

        # Determine number of nodes in each layer
        input_nodes = np.shape(X)[1] + self.bias
        output_nodes = np.shape(y)[1]
        node_list = [input_nodes] + self.hidden_nodes + [output_nodes]

        num_nodes = 0

        for i in range(len(node_list) - 1):
            num_nodes += node_list[i] * node_list[i + 1]

        if init_weights is not None and len(init_weights) != num_nodes:
            raise Exception("""init_weights must be None or have length %d""" %
                            (num_nodes, ))

        # Initialize optimization problem
        fitness = NetworkWeights(X,
                                 y,
                                 node_list,
                                 self.activation_dict[self.activation],
                                 self.bias,
                                 self._is_classifier(),
                                 learning_rate=self.learning_rate)

        problem = ContinuousOpt(num_nodes,
                                fitness,
                                maximize=False,
                                min_val=-1 * self.clip_max,
                                max_val=self.clip_max,
                                step=self.learning_rate)

        fitness_curve = []

        if self.algorithm == 'random_hill_climb':
            if init_weights is None:
                init_weights = np.random.uniform(-1, 1, num_nodes)

            if self.curve:
                fitted_weights, loss, fitness_curve = random_hill_climb(
                    problem,
                    max_attempts=self.max_attempts,
                    max_iters=self.max_iters,
                    restarts=self.restarts,
                    init_state=init_weights,
                    curve=self.curve)
            else:
                fitted_weights, loss = random_hill_climb(
                    problem,
                    max_attempts=self.max_attempts,
                    max_iters=self.max_iters,
                    restarts=self.restarts,
                    init_state=init_weights,
                    curve=self.curve)

        elif self.algorithm == 'simulated_annealing':
            if init_weights is None:
                init_weights = np.random.uniform(-1, 1, num_nodes)

            if self.curve:
                fitted_weights, loss, fitness_curve = simulated_annealing(
                    problem,
                    schedule=self.schedule,
                    max_attempts=self.max_attempts,
                    max_iters=self.max_iters,
                    init_state=init_weights,
                    curve=self.curve)
            else:
                fitted_weights, loss = simulated_annealing(
                    problem,
                    schedule=self.schedule,
                    max_attempts=self.max_attempts,
                    max_iters=self.max_iters,
                    init_state=init_weights,
                    curve=self.curve)

        elif self.algorithm == 'genetic_alg':

            if self.curve:
                fitted_weights, loss, fitness_curve = genetic_alg(
                    problem,
                    pop_size=self.pop_size,
                    mutation_prob=self.mutation_prob,
                    max_attempts=self.max_attempts,
                    max_iters=self.max_iters,
                    curve=self.curve)
            else:
                fitted_weights, loss = genetic_alg(
                    problem,
                    pop_size=self.pop_size,
                    mutation_prob=self.mutation_prob,
                    max_attempts=self.max_attempts,
                    max_iters=self.max_iters,
                    curve=self.curve)

        else:  # Gradient descent case
            if init_weights is None:
                init_weights = np.random.uniform(-1, 1, num_nodes)

            if self.curve:
                fitted_weights, loss, fitness_curve = gradient_descent(
                    problem,
                    max_attempts=self.max_attempts,
                    max_iters=self.max_iters,
                    init_state=init_weights,
                    curve=self.curve)
            else:
                fitted_weights, loss = gradient_descent(
                    problem,
                    max_attempts=self.max_attempts,
                    max_iters=self.max_iters,
                    init_state=init_weights,
                    curve=self.curve)

        # Save fitted weights and node list
        self.node_list = node_list
        self.fitted_weights = fitted_weights
        self.loss = loss
        if self.curve:
            self.fitness_curve = fitness_curve
        self.output_activation = fitness.get_output_activation()

        return self

    def predict(self, X, y=None):
        """Use model to predict data labels for given feature array.

        Parameters
        ----------
        X: array
            Numpy array containing feature dataset with each row
            representing a single observation.

        Returns
        -------
        y_pred: array
            Numpy array containing predicted data labels.
        """
        if not np.shape(X)[1] == (self.node_list[0] - self.bias):
            raise Exception("""The number of columns in X must equal %d""" %
                            ((self.node_list[0] - self.bias), ))

        weights = unflatten_weights(self.fitted_weights, self.node_list)

        # Add bias column to inputs matrix, if required
        if self.bias:
            ones = np.ones([np.shape(X)[0], 1])
            inputs = np.hstack((X, ones))

        else:
            inputs = X

        # Pass data through network
        for i in range(len(weights)):
            # Multiply inputs by weights
            outputs = np.dot(inputs, weights[i])

            # Transform outputs to get inputs for next layer (or final preds)
            if i < len(weights) - 1:
                inputs = self.activation_dict[self.activation](outputs)
            else:
                y_pred = self.output_activation(outputs)

        # For classifier, convert predicted probabilities to 0-1 labels
        if self._is_classifier():
            self.predicted_probs = y_pred

            if self.node_list[-1] == 1:
                y_pred = np.round(y_pred).astype(int)
            else:
                zeros = np.zeros_like(y_pred)
                zeros[np.arange(len(y_pred)), np.argmax(y_pred, axis=1)] = 1
                y_pred = zeros.astype(int)

        return y_pred
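
A hedged usage sketch for the abstract base above: a concrete estimator only needs to supply _is_classifier (and, in practice, its own __init__). The subclass and the commented data variables below are illustrative assumptions, not part of the original snippet.

class SimpleNNClassifier(BaseNeuralNetwork):
    """Toy subclass: treat the network outputs as class probabilities."""

    def __init__(self, hidden_nodes=None, algorithm='random_hill_climb'):
        super(SimpleNNClassifier, self).__init__(
            hidden_nodes=hidden_nodes if hidden_nodes is not None else [10],
            algorithm=algorithm)

    def _is_classifier(self):
        return True  # predict() will round/argmax the output activations

# model = SimpleNNClassifier(hidden_nodes=[5], algorithm='simulated_annealing')
# labels = model.fit(X_train, y_train).predict(X_test)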
Example #2
class BaseMixture(six.with_metaclass(ABCMeta, DensityMixin, BaseEstimator)):
    """Base class for mixture models.

    This abstract class specifies an interface for all mixture classes and
    provides basic common methods for mixture models.
    """

    def __init__(self, n_components, tol, reg_covar,
                 max_iter, n_init, init_params, random_state, warm_start,
                 verbose, verbose_interval):
        self.n_components = n_components
        self.tol = tol
        self.reg_covar = reg_covar
        self.max_iter = max_iter
        self.n_init = n_init
        self.init_params = init_params
        self.random_state = random_state
        self.warm_start = warm_start
        self.verbose = verbose
        self.verbose_interval = verbose_interval

    def _check_initial_parameters(self, X):
        """Check values of the basic parameters.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
        """
        if self.n_components < 1:
            raise ValueError("Invalid value for 'n_components': %d "
                             "Estimation requires at least one component"
                             % self.n_components)

        if self.tol < 0.:
            raise ValueError("Invalid value for 'tol': %.5f "
                             "Tolerance used by the EM must be non-negative"
                             % self.tol)

        if self.n_init < 1:
            raise ValueError("Invalid value for 'n_init': %d "
                             "Estimation requires at least one run"
                             % self.n_init)

        if self.max_iter < 1:
            raise ValueError("Invalid value for 'max_iter': %d "
                             "Estimation requires at least one iteration"
                             % self.max_iter)

        if self.reg_covar < 0.:
            raise ValueError("Invalid value for 'reg_covar': %.5f "
                             "regularization on covariance must be "
                             "non-negative"
                             % self.reg_covar)

        # Check all the parameters values of the derived class
        self._check_parameters(X)

    @abstractmethod
    def _check_parameters(self, X):
        """Check initial parameters of the derived class.

        Parameters
        ----------
        X : array-like, shape  (n_samples, n_features)
        """
        pass

    def _initialize_parameters(self, X, random_state):
        """Initialize the model parameters.

        Parameters
        ----------
        X : array-like, shape  (n_samples, n_features)

        random_state : RandomState
            A random number generator instance.
        """
        n_samples, _ = X.shape

        if self.init_params == 'kmeans':
            resp = np.zeros((n_samples, self.n_components))
            label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
                                   random_state=random_state).fit(X).labels_
            resp[np.arange(n_samples), label] = 1
        elif self.init_params == 'random':
            resp = random_state.rand(n_samples, self.n_components)
            resp /= resp.sum(axis=1)[:, np.newaxis]
        else:
            raise ValueError("Unimplemented initialization method '%s'"
                             % self.init_params)

        self._initialize(X, resp)

    @abstractmethod
    def _initialize(self, X, resp):
        """Initialize the model parameters of the derived class.

        Parameters
        ----------
        X : array-like, shape  (n_samples, n_features)

        resp : array-like, shape (n_samples, n_components)
        """
        pass

    def fit(self, X, bootw, y=None):
        """Estimate model parameters with the EM algorithm.

        The method fits the model ``n_init`` times and sets the parameters with
        which the model has the largest likelihood or lower bound. Within each
        trial, the method iterates between E-step and M-step for ``max_iter``
        times until the change of likelihood or lower bound is less than
        ``tol``, otherwise, a ``ConvergenceWarning`` is raised.
        If ``warm_start`` is ``True``, then ``n_init`` is ignored and a single
        initialization is performed upon the first call. Upon consecutive
        calls, training starts where it left off.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.

        bootw : array-like, shape (n_samples,)
            Per-sample weights applied to the log-likelihood terms during
            the E-step and when scoring.

        Returns
        -------
        self
        """
        self.fit_predict(X, bootw, y)
        return self

    def fit_predict(self, X, bootw, y=None):
        """Estimate model parameters using X and predict the labels for X.

        The method fits the model n_init times and sets the parameters with
        which the model has the largest likelihood or lower bound. Within each
        trial, the method iterates between E-step and M-step for `max_iter`
        times until the change of likelihood or lower bound is less than
        `tol`, otherwise, a `ConvergenceWarning` is raised. After fitting, it
        predicts the most probable label for the input data points.

        .. versionadded:: 0.20

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.

        bootw : array-like, shape (n_samples,)
            Per-sample weights applied to the log-likelihood terms during
            the E-step.

        Returns
        -------
        labels : array, shape (n_samples,)
            Component labels.
        """
        X = _check_X(X, self.n_components, ensure_min_samples=2)
        self._check_initial_parameters(X)

        # if we enable warm_start, we will have a unique initialisation
        do_init = not(self.warm_start and hasattr(self, 'converged_'))
        n_init = self.n_init if do_init else 1

        max_lower_bound = -np.infty
        self.converged_ = False

        random_state = check_random_state(self.random_state)

        n_samples, _ = X.shape
        for init in range(n_init):
            self._print_verbose_msg_init_beg(init)

            if do_init:
                self._initialize_parameters(X, random_state)

            lower_bound = (-np.infty if do_init else self.lower_bound_)

            for n_iter in range(1, self.max_iter + 1):
                prev_lower_bound = lower_bound

                log_prob_norm, log_resp = self._e_step(X, bootw)
                self._m_step(X, log_resp, bootw)
                lower_bound = self._compute_lower_bound(
                    log_resp, log_prob_norm)

                change = lower_bound - prev_lower_bound
                self._print_verbose_msg_iter_end(n_iter, change)

                if abs(change) < self.tol:
                    self.converged_ = True
                    break

            self._print_verbose_msg_init_end(lower_bound)

            if lower_bound > max_lower_bound:
                max_lower_bound = lower_bound
                best_params = self._get_parameters()
                best_n_iter = n_iter

        # Always do a final e-step to guarantee that the labels returned by
        # fit_predict(X) are always consistent with fit(X).predict(X)
        # for any value of max_iter and tol (and any random_state).
        _, log_resp = self._e_step(X, bootw)

        if not self.converged_:
            warnings.warn('Initialization %d did not converge. '
                          'Try different init parameters, '
                          'or increase max_iter, tol '
                          'or check for degenerate data.'
                          % (init + 1), ConvergenceWarning)

        self._set_parameters(best_params)
        self.n_iter_ = best_n_iter
        self.lower_bound_ = max_lower_bound

        return log_resp.argmax(axis=1)

    def _e_step(self, X, bootw):
        """E step.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)

        bootw : array-like, shape (n_samples,)
            Per-sample weights applied to the per-sample log-likelihoods
            before averaging.

        Returns
        -------
        log_prob_norm : float
            Mean of the logarithms of the probabilities of each sample in X

        log_responsibility : array, shape (n_samples, n_components)
            Logarithm of the posterior probabilities (or responsibilities) of
            the point of each sample in X.
        """
        log_prob_norm, log_resp = self._estimate_log_prob_resp(X)
        return np.mean(bootw * log_prob_norm), log_resp

    @abstractmethod
    def _m_step(self, X, log_resp, bootw):
        """M step.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)

        log_resp : array-like, shape (n_samples, n_components)
            Logarithm of the posterior probabilities (or responsibilities) of
            the point of each sample in X.
        """
        pass

    @abstractmethod
    def _check_is_fitted(self):
        pass

    @abstractmethod
    def _get_parameters(self):
        pass

    @abstractmethod
    def _set_parameters(self, params):
        pass

    def score_samples(self, X):
        """Compute the weighted log probabilities for each sample.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.

        Returns
        -------
        log_prob : array, shape (n_samples,)
            Log probabilities of each data point in X.
        """
        self._check_is_fitted()
        X = _check_X(X, None, self.means_.shape[1])

        return logsumexp(self._estimate_weighted_log_prob(X), axis=1)

    def score_lppd(self, X, y=None):
        """Return the per-sample log probabilities of X (see score_samples)."""
        return self.score_samples(X)


    def score(self, X, bootw, y=None):
        """Compute the per-sample average log-likelihood of the given data X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_dimensions)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.

        bootw : array-like, shape (n_samples,)
            Per-sample weights used when averaging the per-sample
            log-likelihoods.

        Returns
        -------
        log_likelihood : float
            Log likelihood of the Gaussian mixture given X.
        """


        return (self.score_samples(X)*bootw).mean()

    def predict(self, X):
        """Predict the labels for the data samples in X using trained model.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.

        Returns
        -------
        labels : array, shape (n_samples,)
            Component labels.
        """
        self._check_is_fitted()
        X = _check_X(X, None, self.means_.shape[1])
        return self._estimate_weighted_log_prob(X).argmax(axis=1)

    def predict_proba(self, X):
        """Predict posterior probability of each component given the data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.

        Returns
        -------
        resp : array, shape (n_samples, n_components)
            Returns the probability each Gaussian (state) in
            the model given each sample.
        """
        self._check_is_fitted()
        X = _check_X(X, None, self.means_.shape[1])
        _, log_resp = self._estimate_log_prob_resp(X)
        return np.exp(log_resp)

    def sample(self, n_samples=1):
        """Generate random samples from the fitted Gaussian distribution.

        Parameters
        ----------
        n_samples : int, optional
            Number of samples to generate. Defaults to 1.

        Returns
        -------
        X : array, shape (n_samples, n_features)
            Randomly generated sample

        y : array, shape (n_samples,)
            Component labels

        """
        self._check_is_fitted()

        if n_samples < 1:
            raise ValueError(
                "Invalid value for 'n_samples': %d. The sampling requires at "
                "least one sample." % (n_samples))

        _, n_features = self.means_.shape
        rng = check_random_state(self.random_state)
        n_samples_comp = rng.multinomial(n_samples, self.weights_)

        if self.covariance_type == 'full':
            X = np.vstack([
                rng.multivariate_normal(mean, covariance, int(sample))
                for (mean, covariance, sample) in zip(
                    self.means_, self.covariances_, n_samples_comp)])
        elif self.covariance_type == "tied":
            X = np.vstack([
                rng.multivariate_normal(mean, self.covariances_, int(sample))
                for (mean, sample) in zip(
                    self.means_, n_samples_comp)])
        else:
            X = np.vstack([
                mean + rng.randn(sample, n_features) * np.sqrt(covariance)
                for (mean, covariance, sample) in zip(
                    self.means_, self.covariances_, n_samples_comp)])

        y = np.concatenate([np.full(sample, j, dtype=int)
                           for j, sample in enumerate(n_samples_comp)])

        return (X, y)

    def _estimate_weighted_log_prob(self, X):
        """Estimate the weighted log-probabilities, log P(X | Z) + log weights.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)

        Returns
        -------
        weighted_log_prob : array, shape (n_samples, n_component)
        """
        return self._estimate_log_prob(X) + self._estimate_log_weights()

    @abstractmethod
    def _estimate_log_weights(self):
        """Estimate log-weights in EM algorithm, E[ log pi ] in VB algorithm.

        Returns
        -------
        log_weight : array, shape (n_components, )
        """
        pass

    @abstractmethod
    def _estimate_log_prob(self, X):
        """Estimate the log-probabilities log P(X | Z).

        Compute the log-probabilities per each component for each sample.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)

        Returns
        -------
        log_prob : array, shape (n_samples, n_component)
        """
        pass

    def _estimate_log_prob_resp(self, X):
        """Estimate log probabilities and responsibilities for each sample.

        Compute the log probabilities, weighted log probabilities per
        component and responsibilities for each sample in X with respect to
        the current state of the model.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)

        Returns
        -------
        log_prob_norm : array, shape (n_samples,)
            log p(X)

        log_responsibilities : array, shape (n_samples, n_components)
            logarithm of the responsibilities
        """
        weighted_log_prob = self._estimate_weighted_log_prob(X)
        log_prob_norm = logsumexp(weighted_log_prob, axis=1)
        with np.errstate(under='ignore'):
            # ignore underflow
            log_resp = weighted_log_prob - log_prob_norm[:, np.newaxis]
        return log_prob_norm, log_resp

    def _print_verbose_msg_init_beg(self, n_init):
        """Print verbose message on initialization."""
        if self.verbose == 1:
            print("Initialization %d" % n_init)
        elif self.verbose >= 2:
            print("Initialization %d" % n_init)
            self._init_prev_time = time()
            self._iter_prev_time = self._init_prev_time

    def _print_verbose_msg_iter_end(self, n_iter, diff_ll):
        """Print verbose message on initialization."""
        if n_iter % self.verbose_interval == 0:
            if self.verbose == 1:
                print("  Iteration %d" % n_iter)
            elif self.verbose >= 2:
                cur_time = time()
                print("  Iteration %d\t time lapse %.5fs\t ll change %.5f" % (
                    n_iter, cur_time - self._iter_prev_time, diff_ll))
                self._iter_prev_time = cur_time

    def _print_verbose_msg_init_end(self, ll):
        """Print verbose message on the end of iteration."""
        if self.verbose == 1:
            print("Initialization converged: %s" % self.converged_)
        elif self.verbose >= 2:
            print("Initialization converged: %s\t time lapse %.5fs\t ll %.5f" %
                  (self.converged_, time() - self._init_prev_time, ll))
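
A hedged sketch of how the added bootw argument (per-sample weights folded into the E-step mean log-likelihood and into score) might be supplied. The weight construction and the concrete subclass mentioned in the comments are assumptions, not part of the original snippet.

import numpy as np

rng = np.random.RandomState(0)
n_samples = 500
X = rng.randn(n_samples, 2)                             # toy data
bootw = rng.dirichlet(np.ones(n_samples)) * n_samples   # non-negative, mean ~1

# With a concrete subclass implementing _e_step/_m_step (e.g. a weighted
# Gaussian mixture), the weighted fit and score would then be called as:
#     labels = model.fit_predict(X, bootw)
#     avg_ll = model.score(X, bootw)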
Example #3
class BaseNB(six.with_metaclass(ABCMeta, BaseEstimator)):

    _estimator_type = "classifier"

    def __init__(self):
        self.is_fitted = False
        self.classes_ = None
        self.class_count_ = None

    # Properties

    @property
    def complement_class_count_(self):
        '''
        Complement class count, i.e. the number of occurrences of samples in
        all classes except the given class c.
        '''
        from bayes.utils import get_complement_matrix
        size = len(self.class_count_)
        return self.class_count_.dot(get_complement_matrix(size))

    @property
    def complement_class_log_proba_(self):
        '''
        Complement class log-probability, i.e. the log-probability that a
        sample does not belong to the given class c.
        '''
        all_samples_count = np.float64(np.sum(self.class_count_))
        return np.log(self.complement_class_count_ / all_samples_count)

    @property
    def class_log_proba_(self):
        '''
        Log probability of class occurrence
        '''
        all_samples_count = np.float64(np.sum(self.class_count_))
        return np.log(self.class_count_ / all_samples_count)

    # Fitting model

    def fit(self, X, y):
        '''

        Fit model to given training set

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape (n_samples,)
            Target values.
        Returns
        -------
        self : Naive Bayes estimator object
            Returns self.
        '''
        self._reset()
        self._partial_fit(X, y)
        return self

    @abstractmethod
    def partial_fit(self, X, y, classes=None):
        """
        Incremental fit on a batch of samples.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.
        classes : array-like, shape = [n_classes], optional (default=None)
            List of all the classes that can possibly appear in the y vector.
            Must be provided at the first call to partial_fit, can be omitted
            in subsequent calls.

        Returns
        -------
        self : object
             Returns self.
        """

    @abstractmethod
    def _partial_fit(self, X, y, classes=None, first_partial_fit=None):
        ''''''

    @abstractmethod
    def predict(self, X):
        """
        Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Unseen samples vector
        Returns
        -------
        C : array, shape = [n_samples]
            Predicted target values for X

        """

    def _update_complement_features(self, X, y_one_hot):
        '''

        Compute complement feature counts

        Parameters
        ----------
        X: numpy array (n_samples, n_features)
            Matrix of input samples
        y_one_hot: numpy array (n_samples, n_classes)
            Binary matrix encoding input
        '''
        # FIXME: complement_features nomenclature is incoherent
        if not self.is_fitted:
            self.complement_features = X.T.dot(np.logical_not(y_one_hot))
        else:
            self.complement_features += X.T.dot(np.logical_not(y_one_hot))

    def _update_features(self, X, y_one_hot):
        '''

        Compute feature counts

        Parameters
        ----------
        X: numpy array (n_samples, n_features)
            Matrix of input samples
        y_one_hot: numpy array (n_samples, n_classes)
            Binary matrix encoding input
        '''
        if not self.is_fitted:
            self.features_ = X.T.dot(y_one_hot)
        else:
            self.features_ += X.T.dot(y_one_hot)

    @abstractmethod
    def predict_log_proba(self, X):
        """
        Return log-probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
        Returns
        -------
        C : array-like, shape = [n_samples, n_classes]
            Returns the log-probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute `classes_`.
        """

    @abstractmethod
    def _reset(self):
        ''''''

    def predict_proba(self, X):
        """
        Return probability estimates for the test vector X.
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
        Returns
        -------
        C : array-like, shape = [n_samples, n_classes]
            Returns the probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute `classes_`.
        """
        # TODO: Handle float exponent error
        return np.exp(self.predict_log_proba(X))

    # Scores

    def accuracy_score(self, X, y):
        '''

        Return accuracy score

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        accuracy_score: float
            Accuracy on the given test set

        '''
        self._check_is_fitted()
        return accuracy_score(y, self.predict(X))

    # def f1_score(self, X, y):
    #     self._check_is_fitted()
    #     return f1_score(y, self.predict(X))
    #
    # def precision_score(self, X, y):
    #     self._check_is_fitted()
    #     return precision_score(y, self.predict(X))
    #
    # def recall_score(self, X, y):
    #     self._check_is_fitted()
    #     return recall_score(y, self.predict(X))
    #
    # def roc_auc_score(self, X, y):
    #     self._check_is_fitted()
    #     return roc_auc_score(y, self.predict(X))

    # Checking params & states

    def _check_is_fitted(self):
        if not self.is_fitted:
            raise NotFittedError

    def _check_alpha_param(self):
        if self.alpha == 0.0:
            warnings.warn(
                'Alpha should not be zero. It may cause division by zero',
                AlphaZeroWarning)

    def _not_implemented_yet(self, message):
        warnings.warn(NotImplementedYet(message))

    # def safe_mult(self, input_array, internal_array):
    #     if isinstance(input_array, csr_matrix):
    #         input_array = input_array.toarray()
    #     return input_array * internal_array

    def safe_matmult(self, input_array, internal_array):
        if isinstance(input_array, csr_matrix):
            input_array = input_array.toarray()
        return input_array.dot(internal_array.T)
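
The _update_features / _update_complement_features helpers above accumulate per-class feature counts as X.T dotted with a one-hot label matrix. A tiny numeric sketch of that product (toy values only):

import numpy as np

X = np.array([[1, 0, 2],
              [0, 3, 1]])            # 2 samples, 3 features
y_one_hot = np.array([[1, 0],
                      [0, 1]])       # sample 0 -> class 0, sample 1 -> class 1

features = X.T.dot(y_one_hot)                    # counts per (feature, class)
complement = X.T.dot(np.logical_not(y_one_hot))  # counts for all other classes
# features   == [[1, 0], [0, 3], [2, 1]]
# complement == [[0, 1], [3, 0], [1, 2]]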
Example #4
class BaseRandomNN(six.with_metaclass(ABCMeta, BaseEstimator)):
    """Base class for Random Neural Network classification and regression.

    Warning: This class should not be used directly.
    Use derived classes instead.
    """
    @abstractmethod
    def __init__(self, n_hidden, activation, C, class_weight, weight_scale,
                 batch_size, verbose, warm_start, random_state):
        self.C = C
        self.activation = activation
        self.class_weight = class_weight
        self.weight_scale = weight_scale
        self.batch_size = batch_size
        self.n_hidden = n_hidden
        self.verbose = verbose
        self.warm_start = warm_start
        self.random_state = random_state

    def _init_weights(self, n_features):
        """Initialize the parameter weights."""
        rng = check_random_state(self.random_state)

        # Use the initialization method recommended by Glorot et al.
        weight_init_bound = np.sqrt(6. / (n_features + self.n_hidden))

        self.coef_hidden_ = rng.uniform(-weight_init_bound, weight_init_bound,
                                        (n_features, self.n_hidden))
        self.intercept_hidden_ = rng.uniform(-weight_init_bound,
                                             weight_init_bound, self.n_hidden)
        if self.weight_scale != 1:
            self.coef_hidden_ *= self.weight_scale
            self.intercept_hidden_ *= self.weight_scale

    def _compute_hidden_activations(self, X):
        """Compute the hidden activations."""

        hidden_activations = safe_sparse_dot(X, self.coef_hidden_)
        hidden_activations += self.intercept_hidden_

        # Apply the activation method
        activation = ACTIVATIONS[self.activation]
        hidden_activations = activation(hidden_activations)

        return hidden_activations

    def _fit(self, X, y, sample_weight=None, incremental=False):
        """Fit the model to the data X and target y."""
        # Validate input params
        if self.n_hidden <= 0:
            raise ValueError("n_hidden must be > 0, got %s." % self.n_hidden)
        if self.C <= 0.0:
            raise ValueError("C must be > 0, got %s." % self.C)
        if self.activation not in ACTIVATIONS:
            raise ValueError("The activation %s is not supported. Supported "
                             "activation are %s." %
                             (self.activation, ACTIVATIONS))

        # Initialize public attributes
        if not hasattr(self, 'classes_'):
            self.classes_ = None
        if not hasattr(self, 'coef_hidden_'):
            self.coef_hidden_ = None

        # Initialize private attributes
        if not hasattr(self, '_HT_H_accumulated'):
            self._HT_H_accumulated = None

        X, y = check_X_y(X,
                         y,
                         accept_sparse=['csr', 'csc', 'coo'],
                         dtype=np.float64,
                         order="C",
                         multi_output=True)

        # This outputs a warning when a 1d array is expected
        if y.ndim == 2 and y.shape[1] == 1:
            y = column_or_1d(y, warn=True)

        # Classification
        if isinstance(self, ClassifierMixin):
            self.label_binarizer_.fit(y)

            if self.classes_ is None or not incremental:
                self.classes_ = self.label_binarizer_.classes_
                # if sample_weight is None:
                #     sample_weight = compute_sample_weight(self.class_weight,
                #                                           self.classes_, y)
            else:
                classes = self.label_binarizer_.classes_
                if not np.all(np.in1d(classes, self.classes_)):
                    raise ValueError("`y` has classes not in `self.classes_`."
                                     " `self.classes_` has %s. 'y' has %s." %
                                     (self.classes_, classes))

            y = self.label_binarizer_.transform(y)

        # Ensure y is 2D
        if y.ndim == 1:
            y = np.reshape(y, (-1, 1))

        n_samples, n_features = X.shape
        self.n_outputs_ = y.shape[1]

        # Step (1/2): Compute the hidden layer coefficients
        if (self.coef_hidden_ is None
                or (not incremental and not self.warm_start)):
            # Randomize and scale the input-to-hidden coefficients
            self._init_weights(n_features)

        # Step (2/2): Compute hidden-to-output coefficients
        if self.batch_size is None:
            # Run the least-square algorithm on the whole dataset
            batch_size = n_samples
        else:
            # Run the recursive least-square algorithm on mini-batches
            batch_size = self.batch_size

        batches = gen_batches(n_samples, batch_size)

        # (First time call) Run the least-square algorithm on batch 0
        if not incremental or self._HT_H_accumulated is None:
            batch_slice = next(batches)
            H_batch = self._compute_hidden_activations(X[batch_slice])

            # Get sample weights for the batch
            if sample_weight is None:
                sw = None
            else:
                sw = sample_weight[batch_slice]

            # beta_{0} = inv(H_{0}^T H_{0} + (1. / C) * I) * H_{0}.T y_{0}
            self.coef_output_ = ridge_regression(H_batch,
                                                 y[batch_slice],
                                                 1. / self.C,
                                                 sample_weight=sw).T

            # Initialize K if this is batch based or partial_fit
            if self.batch_size is not None or incremental:
                # K_{0} = H_{0}^T * W * H_{0}
                weighted_H_batch = _multiply_weights(H_batch, sw)
                self._HT_H_accumulated = safe_sparse_dot(
                    H_batch.T, weighted_H_batch)

            if self.verbose:
                y_scores = self._decision_scores(X[batch_slice])

                if self.batch_size is None:
                    verbose_string = "Training mean squared error ="
                else:
                    verbose_string = "Batch 0, Training mean squared error ="

                print("%s %f" %
                      (verbose_string,
                       mean_squared_error(
                           y[batch_slice], y_scores, sample_weight=sw)))

        # Run the least-square algorithm on batch 1, 2, ..., n
        for batch, batch_slice in enumerate(batches):
            # Compute hidden activations H_{i} for batch i
            H_batch = self._compute_hidden_activations(X[batch_slice])

            # Get sample weights (sw) for the batch
            if sample_weight is None:
                sw = None
            else:
                sw = sample_weight[batch_slice]

            weighted_H_batch = _multiply_weights(H_batch, sw)

            # Update K_{i+1} by H_{i}^T * W * H_{i}
            self._HT_H_accumulated += safe_sparse_dot(H_batch.T,
                                                      weighted_H_batch)

            # Update beta_{i+1} by
            # K_{i+1}^{-1} * H_{i+1}^T * W * (y_{i+1} - H_{i+1} * beta_{i})
            y_batch = y[batch_slice] - safe_sparse_dot(H_batch,
                                                       self.coef_output_)

            weighted_y_batch = _multiply_weights(y_batch, sw)
            Hy_batch = safe_sparse_dot(H_batch.T, weighted_y_batch)

            # Update hidden-to-output coefficients
            regularized_HT_H = self._HT_H_accumulated.copy()
            regularized_HT_H.flat[::self.n_hidden + 1] += 1. / self.C

            # It is safe to use linalg.solve (instead of linalg.lstsq
            # which is slow) since it is highly unlikely that
            # regularized_HT_H is singular due to the random
            # projection of the first layer and 'C' regularization being
            # not dangerously large.
            self.coef_output_ += linalg.solve(regularized_HT_H,
                                              Hy_batch,
                                              sym_pos=True,
                                              overwrite_a=True,
                                              overwrite_b=True)
            if self.verbose:
                y_scores = self._decision_scores(X[batch_slice])
                print("Batch %d, Training mean squared error = %f" %
                      (batch + 1,
                       mean_squared_error(
                           y[batch_slice], y_scores, sample_weight=sw)))
        return self

    def fit(self, X, y, sample_weight=None):
        """Fit the model to the data X and target y.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.

        y : array-like, shape (n_samples,)
            Target values.

        sample_weight : array-like, shape (n_samples,)
            Per-sample weights. Rescale C per sample. Higher weights
            force the classifier to put more emphasis on these points.

        Returns
        -------
        self : returns a trained RandomNN usable for prediction.
        """
        return self._fit(X, y, sample_weight=sample_weight, incremental=False)

    def partial_fit(self, X, y, sample_weight=None):
        """Fit the model to the data X and target y.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Subset of training data.

        y : array-like, shape (n_samples,)
            Subset of target values.

        sample_weight : array-like, shape (n_samples,)
            Per-sample weights. Rescale C per sample. Higher weights
            force the classifier to put more emphasis on these points.

        Returns
        -------
        self : returns a trained RandomNN usable for prediction.
        """
        self._fit(X, y, sample_weight=sample_weight, incremental=True)

        return self

    def _decision_scores(self, X):
        """Predict using the RandomNN model

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.

        Returns
        -------
        y_pred : array-like, shape (n_samples,) or (n_samples, n_outputs)
            The predicted values.
        """
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        if self.batch_size is None:
            hidden_activations = self._compute_hidden_activations(X)
            y_pred = safe_sparse_dot(hidden_activations, self.coef_output_)
        else:
            n_samples = X.shape[0]
            batches = gen_batches(n_samples, self.batch_size)

            y_pred = np.zeros((n_samples, self.n_outputs_))
            for batch in batches:
                h_batch = self._compute_hidden_activations(X[batch])
                y_pred[batch] = safe_sparse_dot(h_batch, self.coef_output_)

        return y_pred
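
The first batch above is solved with the closed-form ridge solution noted in the comments, beta_0 = inv(H_0^T H_0 + (1/C) I) H_0^T y_0. A small standalone sketch of that step with toy shapes (not the estimator's actual ridge_regression call):

import numpy as np

rng = np.random.RandomState(0)
H0 = rng.randn(20, 5)            # hidden activations for the first batch
y0 = rng.randn(20, 1)            # targets for the first batch
C = 10.0

A = H0.T.dot(H0) + np.eye(5) / C            # H^T H + (1/C) I
beta0 = np.linalg.solve(A, H0.T.dot(y0))    # hidden-to-output coefficients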
Example #5
class BaseWeightBoosting(six.with_metaclass(ABCMeta, BaseEnsemble)):
    """Base class for AdaBoost estimators.

    Warning: This class should not be used directly. Use derived classes
    instead.
    """
    @abstractmethod
    def __init__(self,
                 base_estimator,
                 n_estimators=50,
                 estimator_params=tuple(),
                 learning_rate=1.):

        super(BaseWeightBoosting,
              self).__init__(base_estimator=base_estimator,
                             n_estimators=n_estimators,
                             estimator_params=estimator_params)

        self.learning_rate = learning_rate

    def fit(self, X, y, sample_weight=None):
        """Build a boosted classifier/regressor from the training set (X, y).

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The training input samples.

        y : array-like of shape = [n_samples]
            The target values (integers that correspond to classes in
            classification, real numbers in regression).

        sample_weight : array-like of shape = [n_samples], optional
            Sample weights. If None, the sample weights are initialized to
            1 / n_samples.

        Returns
        -------
        self : object
            Returns self.
        """
        # Check parameters
        if self.learning_rate <= 0:
            raise ValueError("learning_rate must be greater than zero")

        # Check data
        X, y = check_arrays(X, y, sparse_format="dense")

        if sample_weight is None:
            # Initialize weights to 1 / n_samples
            sample_weight = np.empty(X.shape[0], dtype=np.float64)
            sample_weight[:] = 1. / X.shape[0]
        else:
            # Normalize existing weights
            sample_weight = np.copy(sample_weight) / sample_weight.sum()

            # Check that the sample weights sum is positive
            if sample_weight.sum() <= 0:
                raise ValueError("Attempting to fit with a non-positive "
                                 "weighted number of samples.")

        # Clear any previous fit results
        self.estimators_ = []
        self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64)
        self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64)

        # Create argsorted X for fast tree induction
        X_argsorted = None

        if isinstance(self.base_estimator, BaseDecisionTree):
            X_argsorted = np.asfortranarray(
                np.argsort(X.T, axis=1).astype(np.int32).T)

        for iboost in range(self.n_estimators):
            # Boosting step
            sample_weight, estimator_weight, estimator_error = self._boost(
                iboost, X, y, sample_weight, X_argsorted=X_argsorted)

            # Early termination
            if sample_weight is None:
                break

            self.estimator_weights_[iboost] = estimator_weight
            self.estimator_errors_[iboost] = estimator_error

            # Stop if error is zero
            if estimator_error == 0:
                break

            sample_weight_sum = np.sum(sample_weight)

            # Stop if the sum of sample weights has become non-positive
            if sample_weight_sum <= 0:
                break

            if iboost < self.n_estimators - 1:
                # Normalize
                sample_weight /= sample_weight_sum

        return self

    def _check_fitted(self):
        if not hasattr(self, "estimators_"):
            raise ValueError("call fit first")

    @abstractmethod
    def _boost(self, iboost, X, y, sample_weight, X_argsorted=None):
        """Implement a single boost.

        Warning: This method needs to be overridden by subclasses.

        Parameters
        ----------
        iboost : int
            The index of the current boost iteration.

        X : array-like of shape = [n_samples, n_features]
            The training input samples.

        y : array-like of shape = [n_samples]
            The target values (integers that correspond to classes).

        sample_weight : array-like of shape = [n_samples]
            The current sample weights.

        X_argsorted : array-like, shape = [n_samples, n_features] (optional)
            Each column of ``X_argsorted`` holds the row indices of ``X``
            sorted according to the value of the corresponding feature
            in ascending order.
            The argument is supported to enable multiple decision trees
            to share the data structure and to avoid re-computation in
            tree ensembles. For maximum efficiency use dtype np.int32.

        Returns
        -------
        sample_weight : array-like of shape = [n_samples] or None
            The reweighted sample weights.
            If None then boosting has terminated early.

        estimator_weight : float
            The weight for the current boost.
            If None then boosting has terminated early.

        error : float
            The classification error for the current boost.
            If None then boosting has terminated early.
        """
        pass

    def staged_score(self, X, y):
        """Return staged scores for X, y.

        This generator method yields the ensemble score after each iteration of
        boosting and therefore allows monitoring, such as to determine the
        score on a test set after each boost.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training set.

        y : array-like, shape = [n_samples]
            Labels for X.

        Returns
        -------
        z : float
        """
        for y_pred in self.staged_predict(X):
            if isinstance(self, ClassifierMixin):
                yield accuracy_score(y, y_pred)
            else:
                yield r2_score(y, y_pred)

    @property
    def feature_importances_(self):
        """Return the feature importances (the higher, the more important the
           feature).

        Returns
        -------
        feature_importances_ : array, shape = [n_features]
        """
        if self.estimators_ is None or len(self.estimators_) == 0:
            raise ValueError("Estimator not fitted, "
                             "call `fit` before `feature_importances_`.")

        try:
            norm = self.estimator_weights_.sum()
            return (sum(weight * clf.feature_importances_ for weight, clf in
                        zip(self.estimator_weights_, self.estimators_)) / norm)

        except AttributeError:
            raise AttributeError("Unable to compute feature importances "
                                 "since base_estimator does not have a "
                                 "feature_importances_ attribute")
Example #6
class BaseNB(six.with_metaclass(ABCMeta, BaseEstimator, ClassifierMixin)):
    """Abstract base class for naive Bayes estimators"""
    @abstractmethod
    def _joint_log_likelihood(self, X):
        """Compute the unnormalized posterior log probability of X

        I.e. ``log P(c) + log P(x|c)`` for all rows x of X, as an array-like of
        shape [n_classes, n_samples].

        Input is passed to _joint_log_likelihood as-is by predict,
        predict_proba and predict_log_proba.
        """

    def predict(self, X):
        """
        Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        C : array, shape = [n_samples]
            Predicted target values for X
        """
        jll = self._joint_log_likelihood(X)
        return self.classes_[np.argmax(jll, axis=1)]

    def predict_log_proba(self, X):
        """
        Return log-probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        C : array-like, shape = [n_samples, n_classes]
            Returns the log-probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute `classes_`.
        """
        jll = self._joint_log_likelihood(X)
        # normalize by P(x) = P(f_1, ..., f_n)
        log_prob_x = logsumexp(jll, axis=1)
        return jll - np.atleast_2d(log_prob_x).T

    def predict_proba(self, X):
        """
        Return probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        C : array-like, shape = [n_samples, n_classes]
            Returns the probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute `classes_`.
        """
        probas = np.exp(self.predict_log_proba(X))
        rowsum = np.sum(probas, axis=1)
        # if np.array_equal(rowsum, np.ones(rowsum.shape[0])):
        #     print "rowsum are 1"
        # else:
        #     print "rowsums are't 1"
        return probas / rowsum.reshape(rowsum.shape[0], 1)
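
A hedged numeric sketch of the normalisation in predict_log_proba above: the joint log-likelihoods are shifted by logsumexp over classes so that each row exponentiates to a probability vector (toy values only).

import numpy as np
from scipy.special import logsumexp

jll = np.array([[-1.0, -2.0],
                [-0.5, -3.0]])     # log P(c) + log P(x|c) per sample and class
log_proba = jll - logsumexp(jll, axis=1, keepdims=True)
proba = np.exp(log_proba)          # each row now sums to 1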
Example #7
class BinaryLogitBoost(with_metaclass(ABCMeta, BaseEnsemble)):
    """
    Parameters
    ----------
    Please refer to scikit-learn's boosting documentation:
    https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/ensemble/weight_boosting.py
    """
    def __init__(self,
                 base_estimator=None,
                 n_estimators=50,
                 random_state=None):
        super(BinaryLogitBoost, self).__init__(base_estimator=base_estimator,
                                               n_estimators=n_estimators)
        self.random_state = random_state

    def fit(self, X, y, sample_weight=None):
        y = self._validate_y(y)

        X, y = check_X_y(X, y, accept_sparse='csc', dtype=DTYPE)
        n_samples = X.shape[0]

        if sample_weight is None:
            # Initialize weights to 1 / n_samples
            sample_weight = np.empty(n_samples, dtype=np.float64)
            sample_weight[:] = 1. / n_samples
        else:
            # Normalize existing weights
            sample_weight = sample_weight / sample_weight.sum(dtype=np.float64)

            # Check that the sample weights sum is positive
            if sample_weight.sum() <= 0:
                raise ValueError("Attempting to fit with a non-positive "
                                 "weighted number of samples.")

        # Check parameters
        self._validate_estimator()

        # Clear any previous fit results
        self.estimators_ = []

        estimators = []
        predictions = np.zeros(n_samples)

        p = 0.5 * np.ones(n_samples)

        for iboost in range(self.n_estimators):
            sample_weight = p * (1 - p)
            z = (y - p) / sample_weight

            estimator = self._make_estimator()

            try:
                estimator.set_params(random_state=self.random_state)
            except ValueError:
                pass

            estimator.fit(X, z, sample_weight=sample_weight)
            estimators.append(estimator)
            predictions += 0.5 * estimator.predict(X)

            p = 1 / (1 + np.exp(-2 * predictions))

        self.estimators_ = estimators

        return self

    def predict(self, X):
        estimators = self.estimators_
        predictions = sum([estimator.predict(X) for estimator in estimators])
        return self.classes_.take(np.where(predictions > 0, 1, 0))

    def predict_proba(self, X):
        n_samples = X.shape[0]
        proba = np.zeros((n_samples, 2))  # Binary classification

        estimators = self.estimators_

        predictions = sum([estimator.predict(X) for estimator in estimators])
        proba[:, 0] = 1 / (1 + np.exp(predictions))
        proba[:, 1] = 1 - proba[:, 0]

        return proba

    def _validate_estimator(self):
        """Check the estimator and set the base_estimator_ attribute."""
        super(BinaryLogitBoost, self)._validate_estimator(
            default=DecisionTreeRegressor(max_depth=3))

    def _validate_y(self, y):
        y = column_or_1d(y, warn=True)
        check_classification_targets(y)
        self.classes_, y = np.unique(y, return_inverse=True)
        n_classes = len(self.classes_)

        if n_classes > 2:
            raise ValueError(
                "It's a binary classification algorithm. Use a dataset with only 2 classes to predict."
            )

        return y
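
A hedged usage sketch for the class above, assuming a scikit-learn version whose BaseEnsemble still accepts base_estimator and assuming the module-level imports the class relies on (check_X_y, DTYPE, DecisionTreeRegressor, ...) are in scope; make_classification is used only to fabricate a toy binary problem.

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=10, random_state=0)

clf = BinaryLogitBoost(n_estimators=20, random_state=0).fit(X, y)
print(clf.predict(X)[:10])       # hard labels taken from clf.classes_
print(clf.predict_proba(X)[:3])  # two columns per sample, rows sum to 1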
Ejemplo n.º 8
0
class RecommenderTestClass(six.with_metaclass(ABCMeta)):
    """An abstract base test class for algo test suites.
    All recommender algorithm test classes should inherit from this.
    """
    @abstractmethod
    def test_simple_fit(self, *args, **kwargs):
        """Test a simple fit"""

    @abstractmethod
    def test_complex_fit(self, *args, **kwargs):
        """Test a more complex fit"""

    @abstractmethod
    def test_recommend_single(self, *args, **kwargs):
        """Test recommending for a single user."""

    @abstractmethod
    def test_recommend_all(self, *args, **kwargs):
        """Test recommending for all users."""

    @abstractmethod
    def test_serialize(self, *args, **kwargs):
        """Test serializing the algo."""

    @staticmethod
    def _single_recommend_assertions(clf, train_data, test_data):
        # Simple recommendation operation
        recs = clf.recommend_for_user(0,
                                      test_data,
                                      n=5,
                                      filter_previously_rated=False)
        assert len(recs) == 5

        # Create recommendations for everything, but filter out a single item
        n = train_data.shape[1]
        recs = clf.recommend_for_user(0,
                                      test_data,
                                      n=n,
                                      filter_previously_rated=False,
                                      filter_items=[1])

        # Show that '1' is not in the recommendations
        mask = np.in1d([1], recs)  # type: np.ndarray
        assert not mask.any()

        # Show we can also create recommendations with return_scores=True
        recs, scores = clf.recommend_for_user(0,
                                              test_data,
                                              n=5,
                                              return_scores=True)
        assert len(recs) == len(scores) == 5, (recs, scores)
        assert all(isinstance(arr, np.ndarray) for arr in (recs, scores))

    @staticmethod
    def _all_recommend_assertions(clf, test_data):
        n = test_data.shape[1]
        recs = clf.recommend_for_all_users(test_data,
                                           n=n,
                                           return_scores=True,
                                           filter_previously_rated=True)

        # Show that it's a generator
        assert isinstance(recs, types.GeneratorType)
        first_recs, first_scores = next(recs)
        assert len(first_recs) == len(first_scores)

        # show no rated items in the recs
        rated = test_data[0, :].indices
        mask = np.in1d(rated, first_recs)  # type: np.ndarray
        assert not mask.any()

    @staticmethod
    def _serialization_assertions(clf,
                                  train_data,
                                  test_data,
                                  tolerate_fail=False):
        pkl_location = "als.pkl"

        # Test persistence
        try:
            # Show we can serialize BEFORE it's fit
            joblib.dump(clf, pkl_location, compress=3)
            os.unlink(pkl_location)

            # NOW train
            clf.fit(train_data)

            # Get recommendations
            recs1 = clf.recommend_for_user(0,
                                           test_data,
                                           n=3,
                                           return_scores=False)

            # dump it, recommend again and show the internal state didn't
            # change while we were pickling it out
            joblib.dump(clf, pkl_location, compress=3)
            recs2 = clf.recommend_for_user(0,
                                           test_data,
                                           n=3,
                                           return_scores=False)

            # open it up and create more recommendations
            loaded = joblib.load(pkl_location)
            recs3 = loaded \
                .recommend_for_user(0, test_data, n=3,
                                    return_scores=False)

            # Now show they're all the same
            if not tolerate_fail:
                assert_array_equal(recs1,
                                   recs2,
                                   err_msg="%s != %s" %
                                   (str(recs1), str(recs2)))
                assert_array_equal(recs1,
                                   recs3,
                                   err_msg="%s != %s" %
                                   (str(recs1), str(recs3)))

        finally:
            os.unlink(pkl_location)

            # If the model has an index saved somewhere, remove it also
            if hasattr(clf, "_model_key"):
                index_cache = os.path.join(RECLAB_CACHE, clf._model_key)
                if os.path.exists(index_cache):
                    shutil.rmtree(index_cache)
Ejemplo n.º 9
0
class BaseBagging(with_metaclass(ABCMeta, BaseEnsemble)):
    """Base class for Bagging meta-estimator.
    Warning: This class should not be used directly. Use derived classes
    instead.
    """
    @abstractmethod
    def __init__(self,
                 base_estimator=None,
                 n_estimators=10,
                 max_samples=1.0,
                 max_features=1.0,
                 bootstrap=False,
                 bootstrap_features=False,
                 oob_score=False,
                 warm_start=False,
                 n_jobs=1,
                 random_state=None,
                 verbose=0,
                 sampler='under',
                 max_depth=None):
        super(BaseBagging, self).__init__(base_estimator=base_estimator,
                                          n_estimators=n_estimators)

        self.max_samples = max_samples
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.bootstrap_features = bootstrap_features
        self.oob_score = oob_score
        self.warm_start = warm_start
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        self.sampler = sampler
        self.max_depth = max_depth

    def fit(self, X, y, max_depth=None, sample_weight=None):
        """Build a Bagging ensemble of estimators from the training
           set (X, y).
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.
        y : array-like, shape = [n_samples]
            The target values (class labels in classification, real numbers in
            regression).
        max_depth : int, optional (default=None)
            If not None, overrides ``self.max_depth`` when fitting.
        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if the base estimator supports
            sample weighting.
        Returns
        -------
        self : object
            Returns self.
        """
        return self._fit(X,
                         y,
                         self.max_samples,
                         max_depth if max_depth is not None else self.max_depth,
                         sample_weight=sample_weight)

    def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None):
        """Build a Bagging ensemble of estimators from the training
           set (X, y).
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.
        y : array-like, shape = [n_samples]
            The target values (class labels in classification, real numbers in
            regression).
        max_samples : int or float, optional (default=None)
            Argument to use instead of self.max_samples.
        max_depth : int, optional (default=None)
            Override value used when constructing base estimator. Only
            supported if the base estimator has a max_depth parameter.
        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if the base estimator supports
            sample weighting.
        Returns
        -------
        self : object
            Returns self.
        """
        random_state = check_random_state(self.random_state)

        # Convert data
        X, y = check_X_y(X, y, ['csr', 'csc'])
        if sample_weight is not None:
            sample_weight = check_array(sample_weight, ensure_2d=False)
            check_consistent_length(y, sample_weight)

        # Remap output
        n_samples, self.n_features_ = X.shape
        self._n_samples = n_samples
        y = self._validate_y(y)

        # Check parameters
        self._validate_estimator()

        if max_depth is not None:
            self.base_estimator_.max_depth = max_depth

        # Validate max_samples
        if max_samples is None:
            max_samples = self.max_samples
        elif not isinstance(max_samples, (numbers.Integral, np.integer)):
            max_samples = int(max_samples * X.shape[0])

        if not (0 < max_samples <= X.shape[0]):
            raise ValueError("max_samples must be in (0, n_samples]")

        # Store validated integer row sampling value
        self._max_samples = max_samples

        # Validate max_features
        if isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            max_features = int(self.max_features * self.n_features_)

        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")

        # Store validated integer feature sampling value
        self._max_features = max_features

        # Other checks
        if not self.bootstrap and self.oob_score:
            raise ValueError("Out of bag estimation only available"
                             " if bootstrap=True")

        if self.warm_start and self.oob_score:
            raise ValueError("Out of bag estimate only available"
                             " if warm_start=False")

        if hasattr(self, "oob_score_") and self.warm_start:
            del self.oob_score_

        if not self.warm_start or not hasattr(self, 'estimators_'):
            # Free allocated memory, if any
            self.estimators_ = []
            self.estimators_features_ = []

        n_more_estimators = self.n_estimators - len(self.estimators_)

        if n_more_estimators < 0:
            raise ValueError('n_estimators=%d must be larger or equal to '
                             'len(estimators_)=%d when warm_start==True' %
                             (self.n_estimators, len(self.estimators_)))

        elif n_more_estimators == 0:
            warn("Warm-start fitting without increasing n_estimators does not "
                 "fit new trees.")
            return self

        # Parallel loop
        n_jobs, n_estimators, starts = _partition_estimators(
            n_more_estimators, self.n_jobs)
        total_n_estimators = sum(n_estimators)

        # Advance random state to state after training
        # the first n_estimators
        if self.warm_start and len(self.estimators_) > 0:
            random_state.randint(MAX_INT, size=len(self.estimators_))

        seeds = random_state.randint(MAX_INT, size=n_more_estimators)
        self._seeds = seeds

        # TODO: resample the data in each bag according to ``self.sampler``
        # (e.g. undersample when sampler == 'under'); currently the full
        # (X, y) is passed to every estimator.

        all_results = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(_parallel_build_estimators)(n_estimators[i],
                                                self,
                                                X,
                                                y,
                                                sample_weight,
                                                seeds[starts[i]:starts[i + 1]],
                                                total_n_estimators,
                                                verbose=self.verbose)
            for i in range(n_jobs))

        # Reduce
        self.estimators_ += list(
            itertools.chain.from_iterable(t[0] for t in all_results))
        self.estimators_features_ += list(
            itertools.chain.from_iterable(t[1] for t in all_results))

        if self.oob_score:
            self._set_oob_score(X, y)

        return self

    @abstractmethod
    def _set_oob_score(self, X, y):
        """Calculate out of bag predictions and score."""

    def _validate_y(self, y):
        # Default implementation
        return column_or_1d(y, warn=True)

    def _get_estimators_indices(self):
        # Get drawn indices along both sample and feature axes
        for seed in self._seeds:
            # Operations accessing random_state must be performed identically
            # to those in `_parallel_build_estimators()`
            random_state = np.random.RandomState(seed)
            feature_indices, sample_indices = _generate_bagging_indices(
                random_state, self.bootstrap_features, self.bootstrap,
                self.n_features_, self._n_samples, self._max_features,
                self._max_samples)

            yield feature_indices, sample_indices

    @property
    def estimators_samples_(self):
        """The subset of drawn samples for each base estimator.
        Returns a dynamically generated list of boolean masks identifying
        the samples used for fitting each member of the ensemble, i.e.,
        the in-bag samples.
        Note: the list is re-created at each call to the property in order
        to reduce the object memory footprint by not storing the sampling
        data. Thus fetching the property may be slower than expected.
        """
        sample_masks = []
        for _, sample_indices in self._get_estimators_indices():
            mask = indices_to_mask(sample_indices, self._n_samples)
            sample_masks.append(mask)

        return sample_masks
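
A standalone sketch (an illustration, not the estimator itself) of the max_samples / max_features validation in _fit above: integers are taken as absolute counts, floats as fractions of the total.

import numbers
import numpy as np

def _resolve(value, total):
    if isinstance(value, (numbers.Integral, np.integer)):
        resolved = value
    else:  # assumed to be a float fraction
        resolved = int(value * total)
    if not 0 < resolved <= total:
        raise ValueError("value must be in (0, total]")
    return resolved

print(_resolve(0.5, 1000))  # 500 rows drawn per bag
print(_resolve(10, 20))     # 10 features drawn per bag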
Ejemplo n.º 10
0
class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)):
    """Mixin class for samplers with abstract method.

    Warning: This class should not be used directly. Use the derived classes
    instead.
    """

    _estimator_type = 'sampler'

    def fit(self, X, y):
        """Check inputs and statistics of the sampler.

        You should use ``fit_resample`` in all cases.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Data array.

        y : array-like, shape (n_samples,)
            Target array.

        Returns
        -------
        self : object
            Return the instance itself.

        """
        self._deprecate_ratio()
        X, y, _ = self._check_X_y(X, y)
        self.sampling_strategy_ = check_sampling_strategy(
            self.sampling_strategy, y, self._sampling_type)
        return self

    def fit_resample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {array-like, sparse matrix}, shape \
(n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : array-like, shape (n_samples_new,)
            The corresponding label of `X_resampled`.

        """
        self._deprecate_ratio()

        check_classification_targets(y)
        X, y, binarize_y = self._check_X_y(X, y)

        self.sampling_strategy_ = check_sampling_strategy(
            self.sampling_strategy, y, self._sampling_type)

        output = self._fit_resample(X, y)

        if binarize_y:
            y_sampled = label_binarize(output[1], np.unique(y))
            if len(output) == 2:
                return output[0], y_sampled
            return output[0], y_sampled, output[2]
        return output

    #  define an alias for back-compatibility
    fit_sample = fit_resample

    @abstractmethod
    def _fit_resample(self, X, y):
        """Base method defined in each sampler to defined the sampling
        strategy.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {ndarray, sparse matrix}, shape \
(n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new,)
            The corresponding label of `X_resampled`

        """
        pass
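
A standalone, NumPy-only sketch of the contract a concrete _fit_resample must honour: take (X, y) and return the resampled pair. The undersampling strategy below is a naive stand-in, not imblearn's implementation.

import numpy as np

def _toy_fit_resample(X, y, random_state=0):
    # Randomly undersample every class down to the minority class size.
    rng = np.random.RandomState(random_state)
    classes, counts = np.unique(y, return_counts=True)
    n_min = counts.min()
    keep = np.hstack([
        rng.choice(np.flatnonzero(y == c), size=n_min, replace=False)
        for c in classes])
    return X[keep], y[keep]

X = np.arange(20).reshape(10, 2)
y = np.array([0] * 7 + [1] * 3)
X_res, y_res = _toy_fit_resample(X, y)
print(np.bincount(y_res))  # [3 3]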
Ejemplo n.º 11
0
class _BaseFactorizationMachine(six.with_metaclass(ABCMeta, _BasePoly)):

    @abstractmethod
    def __init__(self, degree=2, loss='squared', n_components=2, alpha=1,
                 beta=1, tol=1e-6, fit_lower='explicit', fit_linear=True,
                 warm_start=False, init_lambdas='ones', max_iter=10000,
                 verbose=False, random_state=None):
        self.degree = degree
        self.loss = loss
        self.n_components = n_components
        self.alpha = alpha
        self.beta = beta
        self.tol = tol
        self.fit_lower = fit_lower
        self.fit_linear = fit_linear
        self.warm_start = warm_start
        self.init_lambdas = init_lambdas
        self.max_iter = max_iter
        self.verbose = verbose
        self.random_state = random_state

    def _augment(self, X):
        # for factorization machines, we add a dummy column for each order.

        if self.fit_lower == 'augment':
            k = 2 if self.fit_linear else 1
            for _ in range(self.degree - k):
                X = add_dummy_feature(X, value=1)
        return X

    def fit(self, X, y):
        """Fit factorization machine to training data.

        Parameters
        ----------
        X : array-like or sparse, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : Estimator
            Returns self.
        """
        if self.degree > 3:
            raise ValueError("FMs with degree >3 not yet supported.")

        X, y = self._check_X_y(X, y)
        X = self._augment(X)
        n_features = X.shape[1]  # augmented
        X_col_norms = row_norms(X.T, squared=True)
        dataset = get_dataset(X, order="fortran")
        rng = check_random_state(self.random_state)
        loss_obj = self._get_loss(self.loss)

        if not (self.warm_start and hasattr(self, 'w_')):
            self.w_ = np.zeros(n_features, dtype=np.double)

        if self.fit_lower == 'explicit':
            n_orders = self.degree - 1
        else:
            n_orders = 1

        if not (self.warm_start and hasattr(self, 'P_')):
            self.P_ = 0.01 * rng.randn(n_orders, self.n_components, n_features)

        if not (self.warm_start and hasattr(self, 'lams_')):
            if self.init_lambdas == 'ones':
                self.lams_ = np.ones(self.n_components)
            elif self.init_lambdas == 'random_signs':
                self.lams_ = np.sign(rng.randn(self.n_components))
            else:
                raise ValueError("Lambdas must be initialized as ones "
                                 "(init_lambdas='ones') or as random "
                                 "+/- 1 (init_lambdas='random_signs').")

        y_pred = self._get_output(X)

        converged = _cd_direct_ho(self.P_, self.w_, dataset, X_col_norms, y,
                                  y_pred, self.lams_, self.degree, self.alpha,
                                  self.beta, self.fit_linear,
                                  self.fit_lower == 'explicit', loss_obj,
                                  self.max_iter, self.tol, self.verbose)
        if not converged:
            warnings.warn("Objective did not converge. Increase max_iter.")

        return self

    def _get_output(self, X):
        y_pred = _poly_predict(X, self.P_[0, :, :], self.lams_, kernel='anova',
                               degree=self.degree)

        if self.fit_linear:
            y_pred += safe_sparse_dot(X, self.w_)

        if self.fit_lower == 'explicit' and self.degree == 3:
            # degree cannot currently be > 3
            y_pred += _poly_predict(X, self.P_[1, :, :], self.lams_,
                                    kernel='anova', degree=2)

        return y_pred

    def _predict(self, X):
        if not hasattr(self, "P_"):
            raise NotFittedError("Estimator not fitted.")
        X = check_array(X, accept_sparse='csc', dtype=np.double)
        X = self._augment(X)
        return self._get_output(X)
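
A NumPy sketch of the degree-2 case of _get_output above: the linear term plus the ANOVA kernel, using the standard factorization-machine identity. This mirrors, but is not, the compiled _poly_predict routine the class delegates to.

import numpy as np

def fm_degree2_predict(X, w, P, lams):
    # sum_{i<j} p_i p_j x_i x_j = 0.5 * ((X @ p)**2 - (X**2) @ (p**2))
    XP = X @ P.T                    # (n_samples, n_components)
    XsqPsq = (X ** 2) @ (P.T ** 2)  # (n_samples, n_components)
    anova = 0.5 * (XP ** 2 - XsqPsq)
    return X @ w + anova @ lams

rng = np.random.RandomState(0)
X = rng.randn(5, 4)
w = rng.randn(4)
P = 0.01 * rng.randn(3, 4)   # (n_components, n_features), like P_[0]
lams = np.ones(3)
print(fm_degree2_predict(X, w, P, lams).shape)  # (5,)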
Ejemplo n.º 12
0
class _BaseChain(six.with_metaclass(ABCMeta, BaseEstimator)):
    def __init__(self, base_estimator, order=None, cv=None, random_state=None):
        self.base_estimator = base_estimator
        self.order = order
        self.cv = cv
        self.random_state = random_state

    @abstractmethod
    def fit(self, X, Y):
        """Fit the model to data matrix X and targets Y.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.
        Y : array-like, shape (n_samples, n_classes)
            The target values.

        Returns
        -------
        self : object
        """
        X, Y = check_X_y(X,
                         Y,
                         multi_output=True,
                         accept_sparse=True,
                         force_all_finite=False,
                         dtype="object")

        random_state = check_random_state(self.random_state)
        check_array(X,
                    accept_sparse=True,
                    force_all_finite=False,
                    dtype="object")
        self.order_ = self.order
        if self.order_ is None:
            self.order_ = np.array(range(Y.shape[1]))
        elif isinstance(self.order_, str):
            if self.order_ == 'random':
                self.order_ = random_state.permutation(Y.shape[1])
        elif sorted(self.order_) != list(range(Y.shape[1])):
            raise ValueError("invalid order")

        self.estimators_ = [
            clone(self.base_estimator) for _ in range(Y.shape[1])
        ]

        if self.cv is None:
            Y_pred_chain = Y[:, self.order_]
            if sp.issparse(X):
                X_aug = sp.hstack((X, Y_pred_chain), format='lil')
                X_aug = X_aug.tocsr()
            else:
                X_aug = np.hstack((X, Y_pred_chain))

        elif sp.issparse(X):
            Y_pred_chain = sp.lil_matrix((X.shape[0], Y.shape[1]))
            X_aug = sp.hstack((X, Y_pred_chain), format='lil')

        else:
            Y_pred_chain = np.zeros((X.shape[0], Y.shape[1]))
            X_aug = np.hstack((X, Y_pred_chain))

        del Y_pred_chain

        for chain_idx, estimator in enumerate(self.estimators_):
            y = Y[:, self.order_[chain_idx]]
            estimator.fit(X_aug[:, :(X.shape[1] + chain_idx)], y)
            if self.cv is not None and chain_idx < len(self.estimators_) - 1:
                col_idx = X.shape[1] + chain_idx
                cv_result = cross_val_predict(self.base_estimator,
                                              X_aug[:, :col_idx],
                                              y=y,
                                              cv=self.cv)
                if sp.issparse(X_aug):
                    X_aug[:, col_idx] = np.expand_dims(cv_result, 1)
                else:
                    X_aug[:, col_idx] = cv_result

        return self

    def predict(self, X):
        """Predict on the data matrix X using the ClassifierChain model.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.

        Returns
        -------
        Y_pred : array-like, shape (n_samples, n_classes)
            The predicted values.

        """
        X = check_array(X, accept_sparse=True, dtype="object")
        Y_pred_chain = np.zeros((X.shape[0], len(self.estimators_)))
        for chain_idx, estimator in enumerate(self.estimators_):
            previous_predictions = Y_pred_chain[:, :chain_idx]
            if sp.issparse(X):
                if chain_idx == 0:
                    X_aug = X
                else:
                    X_aug = sp.hstack((X, previous_predictions))
            else:
                X_aug = np.hstack((X, previous_predictions))
            Y_pred_chain[:, chain_idx] = estimator.predict(X_aug)

        inv_order = np.empty_like(self.order_)
        inv_order[self.order_] = np.arange(len(self.order_))
        Y_pred = Y_pred_chain[:, inv_order]

        return Y_pred
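
A tiny numeric sketch of the order bookkeeping in predict above: the chain produces columns in the order given by order_, and inv_order maps them back to the original label order.

import numpy as np

order_ = np.array([2, 0, 1])            # label 2 is predicted first, etc.
Y_pred_chain = np.array([[1, 0, 1],     # columns follow order_
                         [0, 1, 0]])

inv_order = np.empty_like(order_)
inv_order[order_] = np.arange(len(order_))
print(inv_order)                        # [1 2 0]
print(Y_pred_chain[:, inv_order])       # columns follow labels 0, 1, 2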
Ejemplo n.º 13
0
class MultiOutputEstimator(
        six.with_metaclass(ABCMeta, BaseEstimator, MetaEstimatorMixin)):
    @abstractmethod
    def __init__(self, estimator, n_jobs=None):
        self.estimator = estimator
        self.n_jobs = n_jobs

    @if_delegate_has_method('estimator')
    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """Incrementally fit the model to data.
        Fit a separate model for each output variable.

        Parameters
        ----------
        X : (sparse) array-like, shape (n_samples, n_features)
            Data.

        y : (sparse) array-like, shape (n_samples, n_outputs)
            Multi-output targets.

        classes : list of numpy arrays, shape (n_outputs)
            Each array is unique classes for one output in str/int
            Can be obtained via
            ``[np.unique(y[:, i]) for i in range(y.shape[1])]``, where y is the
            target matrix of the entire dataset.
            This argument is required for the first call to partial_fit
            and can be omitted in the subsequent calls.
            Note that y doesn't need to contain all labels in `classes`.

        sample_weight : array-like, shape = (n_samples) or None
            Sample weights. If None, then samples are equally weighted.
            Only supported if the underlying regressor supports sample
            weights.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, multi_output=True, accept_sparse=True)

        if y.ndim == 1:
            raise ValueError("y must have at least two dimensions for "
                             "multi-output regression but has only one.")

        if (sample_weight is not None
                and not has_fit_parameter(self.estimator, 'sample_weight')):
            raise ValueError("Underlying estimator does not support"
                             " sample weights.")

        first_time = not hasattr(self, 'estimators_')

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_partial_fit_estimator)
            (self.estimators_[i] if not first_time else self.estimator, X,
             y[:, i], classes[i] if classes is not None else None,
             sample_weight, first_time) for i in range(y.shape[1]))
        return self

    def fit(self, X, y, sample_weight=None):
        """ Fit the model to data.
        Fit a separate model for each output variable.

        Parameters
        ----------
        X : (sparse) array-like, shape (n_samples, n_features)
            Data.

        y : (sparse) array-like, shape (n_samples, n_outputs)
            Multi-output targets. An indicator matrix turns on multilabel
            estimation.

        sample_weight : array-like, shape = (n_samples) or None
            Sample weights. If None, then samples are equally weighted.
            Only supported if the underlying regressor supports sample
            weights.

        Returns
        -------
        self : object
        """

        if not hasattr(self.estimator, "fit"):
            raise ValueError(
                "The base estimator should implement a fit method")

        X, y = check_X_y(X,
                         y,
                         multi_output=True,
                         accept_sparse=True,
                         dtype="object")

        if is_classifier(self):
            check_classification_targets(y)

        if y.ndim == 1:
            raise ValueError("y must have at least two dimensions for "
                             "multi-output regression but has only one.")

        if (sample_weight is not None
                and not has_fit_parameter(self.estimator, 'sample_weight')):
            raise ValueError("Underlying estimator does not support"
                             " sample weights.")

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_estimator)(self.estimator, X, y[:, i], sample_weight)
            for i in range(y.shape[1]))
        return self

    def predict(self, X):
        """Predict multi-output variable using a model
         trained for each target variable.

        Parameters
        ----------
        X : (sparse) array-like, shape (n_samples, n_features)
            Data.

        Returns
        -------
        y : (sparse) array-like, shape (n_samples, n_outputs)
            Multi-output targets predicted across multiple predictors.
            Note: Separate models are generated for each predictor.
        """
        check_is_fitted(self, 'estimators_')
        if not hasattr(self.estimator, "predict"):
            raise ValueError(
                "The base estimator should implement a predict method")

        X = check_array(X,
                        accept_sparse=True,
                        force_all_finite=False,
                        dtype="object")

        y = Parallel(n_jobs=self.n_jobs)(
            delayed(parallel_helper)(e, 'predict', X)
            for e in self.estimators_)

        return np.asarray(y).T
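
A standalone sketch of the one-estimator-per-output pattern used above, without the Parallel machinery: clone the base estimator for each target column, fit each clone independently, then stack the per-column predictions.

import numpy as np
from sklearn.base import clone
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
X = rng.randn(50, 3)
Y = X @ rng.randn(3, 2) + 0.1 * rng.randn(50, 2)  # two output columns

base = LinearRegression()
estimators = [clone(base).fit(X, Y[:, i]) for i in range(Y.shape[1])]
Y_pred = np.asarray([est.predict(X) for est in estimators]).T
print(Y_pred.shape)  # (50, 2)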
Ejemplo n.º 14
0
class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)):
    """Mixin class for samplers with abstact method.

    Warning: This class should not be used directly. Use the derived classes
    instead.
    """

    _estimator_type = 'sampler'

    def __init__(self, ratio='auto', random_state=None):
        """Initialize this object and its instance variables.

        Parameters
        ----------
        ratio : str or float, optional (default='auto')
            If 'auto', the ratio will be defined automatically to balance
            the dataset. Otherwise, the ratio corresponds to the number
            of samples in the minority class over the number of samples
            in the majority class.

        random_state : int, RandomState instance or None, optional (default=None)
            If int, random_state is the seed used by the random number
            generator; if RandomState instance, random_state is the random
            number generator; if None, the random number generator is the
            RandomState instance used by np.random.

        Returns
        -------
        None

        """

        self.ratio = ratio
        self.random_state = random_state
        self.logger = logging.getLogger(__name__)

    def fit(self, X, y):
        """Find the classes statistics before to perform sampling.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        self : object,
            Return self.

        """

        # Check the consistency of X and y
        X, y = check_X_y(X, y)

        self.min_c_ = None
        self.maj_c_ = None
        self.stats_c_ = {}
        self.X_shape_ = None

        if hasattr(self, 'ratio'):
            self._validate_ratio()

        if hasattr(self, 'size_ngh'):
            self._validate_size_ngh_deprecation()
        elif hasattr(self, 'k') and not hasattr(self, 'm'):
            self._validate_k_deprecation()
        elif hasattr(self, 'k') and hasattr(self, 'm'):
            self._validate_k_m_deprecation()

        self.logger.info('Compute classes statistics ...')

        # Raise an error if there is only one class
        # if uniques.size == 1:
        #     raise RuntimeError("Only one class detected, aborting...")
        # Raise a warning for the moment to be compatible with BaseEstimator
        self.logger.debug('The number of classes is %s', np.unique(y).size)
        self.logger.debug('Shall we raise a warning: %s',
                          np.unique(y).size == 1)
        if np.unique(y).size == 1:
            warnings.simplefilter('always', UserWarning)
            warnings.warn('Only one class detected, something may go wrong')
            self.logger.debug('The warning should have been raised.')

        # Store the size of X to check at sampling time if we have the
        # same data
        self.X_shape_ = X.shape

        # Create a dictionary containing the class statistics
        self.stats_c_ = Counter(y)

        # Find the minority and majority classes
        self.min_c_ = min(self.stats_c_, key=self.stats_c_.get)
        self.maj_c_ = max(self.stats_c_, key=self.stats_c_.get)

        self.logger.info('%s classes detected: %s',
                         np.unique(y).size, self.stats_c_)

        # Check if the ratio provided at initialisation makes sense
        if isinstance(self.ratio, float):
            if self.ratio < (self.stats_c_[self.min_c_] /
                             self.stats_c_[self.maj_c_]):
                raise RuntimeError('The ratio requested at initialisation'
                                   ' should be greater than or equal to the'
                                   ' balancing ratio of the current data.')

        return self

    def sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        """

        # Check the consistency of X and y
        X, y = check_X_y(X, y)

        # Check that the data have been fitted
        if not hasattr(self, 'stats_c_'):
            raise RuntimeError('You need to fit the data first.')

        # Check that the size of the data is identical to that seen at fit time
        if X.shape != self.X_shape_:
            raise RuntimeError('The data you are attempting to resample do not'
                               ' seem to be the data fitted earlier. Use the'
                               ' fitted data.')

        if hasattr(self, 'ratio'):
            self._validate_ratio()

        if hasattr(self, 'size_ngh'):
            self._validate_size_ngh_deprecation()
        elif hasattr(self, 'k') and not hasattr(self, 'm'):
            self._validate_k_deprecation()
        elif hasattr(self, 'k') and hasattr(self, 'm'):
            self._validate_k_m_deprecation()

        return self._sample(X, y)

    def fit_sample(self, X, y):
        """Fit the statistics and resample the data directly.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        """

        return self.fit(X, y).sample(X, y)

    def _validate_ratio(self):
        # The ratio corresponds to the number of samples in the minority class
        # over the number of samples in the majority class. Thus, the ratio
        # cannot be greater than 1.0
        if isinstance(self.ratio, float):
            if self.ratio > 1:
                raise ValueError('Ratio cannot be greater than one.')
            elif self.ratio <= 0:
                raise ValueError('Ratio has to be strictly positive.')

        elif isinstance(self.ratio, six.string_types):
            if self.ratio != 'auto':
                raise ValueError('Unknown string for the parameter ratio.')
        else:
            raise ValueError('Unknown parameter type for ratio.')

    def _validate_size_ngh_deprecation(self):
        "Private function to warn about the deprecation about size_ngh."

        # Announce deprecation if necessary
        if self.size_ngh is not None:
            warnings.warn('`size_ngh` will be replaced in version 0.4. Use'
                          ' `n_neighbors` instead.', DeprecationWarning)
            self.n_neighbors = self.size_ngh

    def _validate_k_deprecation(self):
        """Private function to warn about deprecation of k in ADASYN"""
        if self.k is not None:
            warnings.warn('`k` will be replaced in version 0.4. Use'
                          ' `n_neighbors` instead.', DeprecationWarning)
            self.n_neighbors = self.k

    def _validate_k_m_deprecation(self):
        """Private function to warn about deprecation of k in ADASYN"""
        if self.k is not None:
            warnings.warn('`k` will be replaced in version 0.4. Use'
                          ' `k_neighbors` instead.', DeprecationWarning)
            self.k_neighbors = self.k

        if self.m is not None:
            warnings.warn('`m` will be replaced in version 0.4. Use'
                          ' `m_neighbors` instead.', DeprecationWarning)
            self.m_neighbors = self.m

    @abstractmethod
    def _sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`
        """
        pass

    def __getstate__(self):
        """Prevent logger from being pickled."""
        object_dictionary = self.__dict__.copy()
        del object_dictionary['logger']
        return object_dictionary

    def __setstate__(self, dict):
        """Re-open the logger."""
        logger = logging.getLogger(__name__)
        self.__dict__.update(dict)
        self.logger = logger
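
A minimal sketch of the __getstate__/__setstate__ pattern above: drop the logger before pickling and re-open it on unpickling. _LoggedSampler is a stand-in class, not one of the real samplers.

import logging
import pickle

class _LoggedSampler:
    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.ratio = 'auto'

    def __getstate__(self):
        # Remove the logger so the instance can be pickled.
        state = self.__dict__.copy()
        del state['logger']
        return state

    def __setstate__(self, state):
        # Restore state and re-open the logger.
        self.__dict__.update(state)
        self.logger = logging.getLogger(__name__)

restored = pickle.loads(pickle.dumps(_LoggedSampler()))
print(restored.ratio, restored.logger.name)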
Ejemplo n.º 15
0
class RandomForestModel(six.with_metaclass(ABCMeta, BaseEnsemble)):
    
    @abstractmethod
    def __init__(self,
                 base_estimator,
                 n_estimators=100,
                 estimator_params=tuple()):
        super(RandomForestModel, self).__init__(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            estimator_params=estimator_params)

    def get_model(self, X, y, sample_weight=None):
        # Validate or convert input data
        X = check_array(X, accept_sparse="csc", dtype=DTYPE)
        y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
        if issparse(X):
            # Pre-sort indices to avoid that each individual tree of the
            # ensemble sorts the indices.
            X.sort_indices()

        # Remap output
        n_samples, self.n_features_ = X.shape
        y = np.atleast_1d(y)
        
        if y.ndim == 1:
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]
        y, expanded_class_weight = self.check_y_class_wt(y)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        if expanded_class_weight is not None:
            sample_weight = expanded_class_weight

        # Check parameters
        self._validate_estimator()

        random_state = np.random.mtrand._rand

        # Free allocated memory, if any
        self.estimators_ = []
        n_more_estimators = self.n_estimators - len(self.estimators_)

        trees = []
        for i in range(n_more_estimators):
            tree = self._make_estimator(append=False,
                                        random_state=random_state)
            trees.append(tree)

        trees = Parallel(n_jobs=2,
                         backend="threading")(
            delayed(build_trees)(t, X, y, sample_weight)
            for i, t in enumerate(trees))

        # Collect newly grown trees
        self.estimators_.extend(trees)

        # Decapsulate classes_ attributes
        if hasattr(self, "classes_") and self.n_outputs_ == 1:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        return self

    def check_y_class_wt(self, y):
        return y, None
    
    def check_X(self, X):
        return self.estimators_[0]._validate_X_predict(X, check_input=True)
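
One plausible shape for the `build_trees` helper that get_model above assumes (an assumption, not the original helper), together with the joblib call pattern it is used in: fit a single tree and return it.

from joblib import Parallel, delayed
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

def build_trees(tree, X, y, sample_weight=None):
    # Fit one estimator and hand it back to the parallel collector.
    tree.fit(X, y, sample_weight=sample_weight)
    return tree

X, y = make_classification(n_samples=100, n_features=5, random_state=0)
trees = Parallel(n_jobs=2, backend="threading")(
    delayed(build_trees)(DecisionTreeClassifier(random_state=i), X, y)
    for i in range(4))
print(len(trees))  # 4 fitted trees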
Ejemplo n.º 16
0
class BaseSearchCV(six.with_metaclass(ABCMeta, BaseEstimator,
                                      MetaEstimatorMixin)):
    """Base class for hyper parameter search with cross-validation."""

    @abstractmethod
    def __init__(self, estimator, scoring=None,
                 fit_params=None, n_jobs=1, iid=True,
                 refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs',
                 error_score='raise'):

        self.scoring = scoring
        self.estimator = estimator
        self.n_jobs = n_jobs
        self.fit_params = fit_params if fit_params is not None else {}
        self.iid = iid
        self.refit = refit
        self.cv = cv
        self.verbose = verbose
        self.pre_dispatch = pre_dispatch
        self.error_score = error_score

    @property
    def _estimator_type(self):
        return self.estimator._estimator_type

    def score(self, X, y=None):
        """Returns the score on the given data, if the estimator has been refit.

        This uses the score defined by ``scoring`` where provided, and the
        ``best_estimator_.score`` method otherwise.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Input data, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        Returns
        -------
        score : float

        Notes
        -----
         * The long-standing behavior of this method changed in version 0.16.
         * It no longer uses the metric provided by ``estimator.score`` if the
           ``scoring`` parameter was set when fitting.

        """
        if self.scorer_ is None:
            raise ValueError("No score function explicitly defined, "
                             "and the estimator doesn't provide one %s"
                             % self.best_estimator_)
        if self.scoring is not None and hasattr(self.best_estimator_, 'score'):
            warnings.warn("The long-standing behavior to use the estimator's "
                          "score function in {0}.score has changed. The "
                          "scoring parameter is now used."
                          "".format(self.__class__.__name__),
                          ChangedBehaviorWarning)
        return self.scorer_(self.best_estimator_, X, y)

    @if_delegate_has_method(delegate='estimator')
    def predict(self, X):
        """Call predict on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator supports
        ``predict``.

        Parameters
        -----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        return self.best_estimator_.predict(X)

    @if_delegate_has_method(delegate='estimator')
    def predict_proba(self, X):
        """Call predict_proba on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator supports
        ``predict_proba``.

        Parameters
        -----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        return self.best_estimator_.predict_proba(X)

    @if_delegate_has_method(delegate='estimator')
    def predict_log_proba(self, X):
        """Call predict_log_proba on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator supports
        ``predict_log_proba``.

        Parameters
        -----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        return self.best_estimator_.predict_log_proba(X)

    @if_delegate_has_method(delegate='estimator')
    def decision_function(self, X):
        """Call decision_function on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator supports
        ``decision_function``.

        Parameters
        -----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        return self.best_estimator_.decision_function(X)

    @if_delegate_has_method(delegate='estimator')
    def transform(self, X):
        """Call transform on the estimator with the best found parameters.

        Only available if the underlying estimator supports ``transform`` and
        ``refit=True``.

        Parameters
        -----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        return self.best_estimator_.transform(X)

    @if_delegate_has_method(delegate='estimator')
    def inverse_transform(self, Xt):
        """Call inverse_transform on the estimator with the best found parameters.

        Only available if the underlying estimator implements
        ``inverse_transform`` and ``refit=True``.

        Parameters
        -----------
        Xt : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        return self.best_estimator_.inverse_transform(Xt)

    def _fit(self, X, y, labels, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""

        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y, labels = indexable(X, y, labels)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
        n_splits = cv.get_n_splits(X, y, labels)

        if self.verbose > 0 and isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
                                  train, test, self.verbose, parameters,
                                  self.fit_params, return_parameters=True,
                                  error_score=self.error_score)
          for parameters in parameter_iterable
          for train, test in cv.split(X, y, labels))

        # out is a list of 4-tuples: (score, n_test_samples, scoring_time, parameters)
        n_fits = len(out)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_splits):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in \
                    out[grid_start:grid_start + n_splits]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_splits)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
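
A numeric sketch of the score aggregation in _fit above: with iid=True each fold's score is weighted by its number of test samples, otherwise folds are averaged uniformly.

import numpy as np

fold_scores = np.array([0.80, 0.90, 0.70])
fold_sizes = np.array([100, 80, 20])

iid_score = np.sum(fold_scores * fold_sizes) / fold_sizes.sum()
uniform_score = fold_scores.mean()
print(iid_score, uniform_score)  # roughly 0.83 vs 0.80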
Ejemplo n.º 17
0
class RFClassifier(six.with_metaclass(ABCMeta, RandomForestModel,
                                          ClassifierMixin)):

    def __init__(self,
                 base_estimator=DecisionTreeClassifier(),
                 n_estimators=100):

        super(RFClassifier, self).__init__(
            base_estimator,
            n_estimators=n_estimators,
            estimator_params=("criterion", "max_depth", "min_samples_split",
                              "min_samples_leaf", "min_weight_fraction_leaf",
                              "max_features", "max_leaf_nodes",
                              "min_impurity_decrease", "min_impurity_split"))

        self.criterion = "gini"
        self.max_depth = None
        self.min_samples_split = 2
        self.min_samples_leaf = 1
        self.min_weight_fraction_leaf = 0.
        self.max_features = "auto"
        self.max_leaf_nodes = None
        self.min_impurity_decrease = 0.
        self.min_impurity_split = None

    def check_y_class_wt(self, y):
        y = np.copy(y)

        self.classes_ = []
        self.n_classes_ = []

        y_store_unique_indices = np.zeros(y.shape, dtype=int)
        for k in range(self.n_outputs_):
            classes_k, y_store_unique_indices[:, k] = np.unique(y[:, k], return_inverse=True)
            self.classes_.append(classes_k)
            self.n_classes_.append(classes_k.shape[0])
        y = y_store_unique_indices

        return y, None
    
    def get_predictions(self, X):
        proba = self.predict_proba(X)

        if self.n_outputs_ == 1:
            return self.classes_.take(np.argmax(proba, axis=1), axis=0)

        else:
            n_samples = proba[0].shape[0]
            predictions = np.zeros((n_samples, self.n_outputs_))

            for k in range(self.n_outputs_):
                predictions[:, k] = self.classes_[k].take(np.argmax(proba[k],
                                                                    axis=1),
                                                          axis=0)

            return predictions

    def predict_proba(self, X):
        # Check data
        X = self.check_X(X)

        # Assign chunk of trees to jobs
        n_jobs, _, _ = _partition_estimators(self.n_estimators, 2)

        # avoid storing the output of every estimator by summing them here
        all_proba = [np.zeros((X.shape[0], j), dtype=np.float64)
                     for j in np.atleast_1d(self.n_classes_)]
        lock = threading.Lock()
        Parallel(n_jobs=n_jobs, backend="threading")(
            delayed(accumulate_prediction)(e.predict_proba, X, all_proba, lock)
            for e in self.estimators_)

        for proba in all_proba:
            proba /= len(self.estimators_)

        if len(all_proba) == 1:
            return all_proba[0]
        else:
            return all_proba
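
A plausible reconstruction (an assumption, not the original code) of the accumulate_prediction helper used in predict_proba above: add one estimator's probabilities into the shared accumulator while holding the lock.

import threading
import numpy as np

def accumulate_prediction(predict, X, out, lock):
    # `out` is a list of arrays, one per output, shared across threads.
    proba = predict(X)
    with lock:
        if len(out) == 1:
            out[0] += proba
        else:
            for i in range(len(out)):
                out[i] += proba[i]

# Toy check with two fake "estimators" that predict uniform probabilities.
X = np.zeros((4, 2))
out = [np.zeros((4, 3))]
lock = threading.Lock()
for _ in range(2):
    accumulate_prediction(lambda X: np.full((4, 3), 1.0 / 3), X, out, lock)
print(out[0][0])  # sums of two uniform predictions, before dividing by 2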
Ejemplo n.º 18
0
class H2OBaseCrossValidator(six.with_metaclass(ABCMeta)):
    """Base class for H2O cross validation operations.
    All implementing subclasses should override ``get_n_splits``
    and ``_iter_test_indices``.
    """

    def __init__(self):
        pass

    def split(self, frame, y=None):
        """Generate indices to split data into training and test.

        Parameters
        ----------

        frame : ``H2OFrame``
            The h2o frame to split

        y : str, optional (default=None)
            The name of the column to stratify, if applicable.

        Returns
        -------

        train : ndarray
            The training set indices for the split

        test : ndarray
            The testing set indices for that split
        """

        frame = check_frame(frame, copy=False)
        indices = np.arange(frame.shape[0])
        for test_index in self._iter_test_masks(frame, y):
            train_index = indices[np.logical_not(test_index)]
            test_index = indices[test_index]

            # h2o can't handle anything but lists...
            yield list(train_index), list(test_index)

    def _iter_test_masks(self, frame, y=None):
        """Generates boolean masks corresponding to the tests set.

        Parameters
        ----------

        frame : H2OFrame
            The h2o frame to split

        y : string, optional (default=None)
            The column to stratify.

        Returns
        -------

        test_mask : np.ndarray, shape=(n_samples,)
            The boolean mask identifying the test split
        """
        for test_index in self._iter_test_indices(frame, y):
            test_mask = np.zeros(frame.shape[0], dtype=bool)
            test_mask[test_index] = True
            yield test_mask

    def _iter_test_indices(self, frame, y=None):
        raise NotImplementedError('this method must be implemented by a subclass')

    @abstractmethod
    def get_n_splits(self):
        """Get the number of splits or folds for
        this instance of the cross validator.
        """
        pass

    def __repr__(self):
        return _build_repr(self)
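
# Hypothetical illustration (not part of the library above): the smallest concrete
# subclass only needs `_iter_test_indices` and `get_n_splits`; `split` and the mask
# generation are inherited from H2OBaseCrossValidator.
class _EveryOtherRowSplit(H2OBaseCrossValidator):
    """Toy splitter that uses the even-numbered rows as its single test fold."""

    def _iter_test_indices(self, frame, y=None):
        frame = check_frame(frame, copy=False)
        yield np.arange(0, frame.shape[0], 2)

    def get_n_splits(self):
        return 1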
Example No. 19
class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)):
    """Mixin class for samplers with abstract method.

    Warning: This class should not be used directly. Use the derived classes
    instead.
    """

    _estimator_type = 'sampler'

    def _check_X_y(self, X, y):
        """Private function to check that the X and y used in sampling are the
        same as those used in fitting."""
        X_hash, y_hash = hash_X_y(X, y)
        if self.X_hash_ != X_hash or self.y_hash_ != y_hash:
            raise RuntimeError("X and y need to be the same arrays that were "
                               "fitted earlier.")

    def sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {ndarray, sparse matrix}, shape \
(n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new,)
            The corresponding label of `X_resampled`

        """

        # Check the consistency of X and y
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])

        check_is_fitted(self, 'ratio_')
        self._check_X_y(X, y)

        return self._sample(X, y)

    def fit_sample(self, X, y):
        """Fit the statistics and resample the data directly.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {array-like, sparse matrix}, shape \
(n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : array-like, shape (n_samples_new,)
            The corresponding label of `X_resampled`

        """

        return self.fit(X, y).sample(X, y)

    @abstractmethod
    def _sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {ndarray, sparse matrix}, shape \
(n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new,)
            The corresponding label of `X_resampled`

        """
        pass

    def __getstate__(self):
        """Prevent logger from being pickled."""
        object_dictionary = self.__dict__.copy()
        del object_dictionary['logger']
        return object_dictionary

    def __setstate__(self, state):
        """Re-open the logger."""
        logger = logging.getLogger(__name__)
        self.__dict__.update(state)
        self.logger = logger
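
# Hypothetical illustration (not part of the library above): a do-nothing sampler showing
# the contract SamplerMixin expects from subclasses -- `fit` has to set the `ratio_`,
# `X_hash_` and `y_hash_` attributes that `sample` checks, and `_sample` does the work.
class _IdentitySampler(SamplerMixin):
    def fit(self, X, y):
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
        self.X_hash_, self.y_hash_ = hash_X_y(X, y)
        self.ratio_ = 'all'  # placeholder; real samplers store a sampling-ratio mapping
        self.logger = logging.getLogger(__name__)
        return self

    def _sample(self, X, y):
        # identity resampling: return the data unchanged
        return X, y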
Example No. 20
class H2OBaseShuffleSplit(six.with_metaclass(ABCMeta)):
    """Base class for H2OShuffleSplit and H2OStratifiedShuffleSplit. This
    is used for ``h2o_train_test_split`` in strategic train/test splits of
    H2OFrames. Implementing subclasses should override ``_iter_indices``.

    Parameters
    ----------

    n_splits : int, optional (default=2)
        The number of folds or splits in the split

    test_size : float or int, optional (default=0.1)
        The ratio of observations for the test fold

    train_size : float or int, optional (default=None)
        The ratio of observations for the train fold   

    random_state : int or RandomState, optional (default=None)
        The random state for reproducibility.
    """

    def __init__(self, n_splits=2, test_size=0.1, train_size=None, random_state=None):
        _validate_shuffle_split_init(test_size, train_size)
        self.n_splits = n_splits
        self.test_size = test_size
        self.train_size = train_size
        self.random_state = random_state

    def split(self, frame, y=None):
        """Split the frame.

        Parameters
        ----------

        frame : H2OFrame
            The frame to split

        y : string, optional (default=None)
            The column to stratify.
        """
        for train, test in self._iter_indices(frame, y):
            yield train, test

    @abstractmethod
    def _iter_indices(self, frame, y):
        """Abstract method for iterating the indices.

        Parameters
        ----------

        frame : H2OFrame
            The frame to split

        y : string, optional (default=None)
            The column to stratify.
        """
        pass

    def get_n_splits(self):
        """Get the number of splits or folds for
        this instance of the shuffle split.
        """
        return self.n_splits

    def __repr__(self):
        return _build_repr(self)
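
# Hypothetical illustration (not part of the library above): a concrete shuffle split only
# has to implement `_iter_indices`. The sketch assumes `test_size` is a float fraction and
# `random_state` is an int seed or None.
class _SimpleShuffleSplit(H2OBaseShuffleSplit):
    def _iter_indices(self, frame, y=None):
        n_obs = frame.shape[0]
        n_test = int(np.ceil(self.test_size * n_obs))
        rng = np.random.RandomState(self.random_state)
        for _ in range(self.n_splits):
            permutation = rng.permutation(n_obs)
            yield permutation[n_test:], permutation[:n_test]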
Example No. 21
class StationaryCorrelation(with_metaclass(ABCMeta, object)):
    """ Base-class for stationary correlation models for Gaussian Processes.

    Stationary correlation models depend only on the relative distance and not
    on the absolute positions of the respective datapoints. We can thus work
    internally solely on these distances.
    """
    def __init__(self):
        pass

    def fit(self, X, nugget=10. * MACHINE_EPSILON):
        """ Fits the correlation model for training data X

        Parameters
        ----------
        X : array_like, shape=(n_samples, n_features)
            An array of training datapoints at which observations were made,
            i.e., where the outputs y are known
        nugget : double or ndarray, optional
            The Gaussian Process nugget parameter
            The nugget is added to the diagonal of the assumed training
            covariance; in this way it acts as a Tikhonov regularization in
            the problem.  In the special case of the squared exponential
            correlation function, the nugget mathematically represents the
            variance of the input values. Default assumes a nugget close to
            machine precision for the sake of robustness
            (nugget = 10. * MACHINE_EPSILON).
        """
        self.X = X
        self.nugget = nugget
        self.n_samples = X.shape[0]

        # Calculate array with shape (n_eval, n_features) giving the
        # componentwise distances between locations x and x' at which the
        # correlation model should be evaluated.
        self.D, self.ij = l1_cross_differences(self.X)
        if (np.min(np.sum(self.D, axis=1)) == 0.
                and not isinstance(self, PureNugget)):
            raise Exception("Multiple input features cannot have the same"
                            " value.")

    def __call__(self, theta, X=None):
        """ Compute correlation for given correlation parameter(s) theta.

        Parameters
        ----------
        theta : array_like
            An array giving the autocorrelation parameter(s).
            Dimensionality depends on the specific correlation model; often
            shape (1,) corresponds to an isotropic correlation model and shape
            (n_features,) to an anisotropic one.

        X : array_like, shape(n_eval, n_features)
            An array containing the n_eval query points whose correlation with
            the training datapoints shall be computed. If None, autocorrelation
            of the training datapoints is computed instead.

        Returns
        -------
        r : array_like, shape=(n_eval, n_samples) if X != None
                              (n_samples, n_samples) if X == None
            An array containing the values of the correlation model.
        """
        theta = np.asarray(theta, dtype=float)
        if X is not None:
            # Get pairwise componentwise L1-differences to the input training
            # set
            d = X[:, np.newaxis, :] - self.X[np.newaxis, :, :]
            d = d.reshape((-1, X.shape[1]))
        else:
            # No external datapoints given; auto-correlation of training set
            # is used instead
            d = self.D

        if d.ndim > 1:
            n_features = d.shape[1]
        else:
            n_features = 1

        # Compute the correlation for the respective correlation model (handled
        # by subclass)
        r = self._compute_corr(theta, d, n_features)

        if X is not None:
            # Convert to 2d matrix
            return r.reshape(-1, self.n_samples)
        else:
            # Auto-correlation computed only for upper triangular part of
            # matrix. Fill diagonal with 1+nugget and the lower triangular
            # by exploiting symmetry of matrix
            R = np.eye(self.n_samples) * (1. + self.nugget)
            R[self.ij[:, 0], self.ij[:, 1]] = r
            R[self.ij[:, 1], self.ij[:, 0]] = r
            return R

    def log_prior(self, theta):
        """ Returns the (log) prior probability of parameters theta.

        The prior is assumed to be uniform over the parameter space.
        NOTE: The returned quantity is an improper prior as its integral over
              the parameter space is not equal to 1.

        Parameters
        ----------
        theta : array_like, shape=(1,) or (n_features,)
            An array with shape 1 (isotropic) or n_features (anisotropic)
            giving the autocorrelation parameter(s).

        Returns
        -------
        log_p : float
            The (log) prior probability of parameters theta. An improper
            probability.
        """
        return 0

    @abstractmethod
    def _compute_corr(self, theta, d, n_features):
        """ Correlation for given pairwise, component-wise L1-differences.
Example No. 22
class _H2OBaseKFold(six.with_metaclass(ABCMeta, H2OBaseCrossValidator)):
    """Base class for KFold and Stratified KFold.
    
    Parameters
    ----------

    n_folds : int
        The number of splits

    shuffle : bool
        Whether to shuffle indices

    random_state : int or RandomState
        The random state for the split
    """

    @abstractmethod
    def __init__(self, n_folds, shuffle, random_state):
        if not isinstance(n_folds, numbers.Integral):
            raise ValueError('n_folds must be of Integral type. '
                             '%s of type %s was passed' % (n_folds, type(n_folds)))

        n_folds = int(n_folds)
        if n_folds <= 1:
            raise ValueError('k-fold cross-validation requires at least one '
                             'train/test split by setting n_folds=2 or more')

        if shuffle not in [True, False]:
            raise TypeError('shuffle must be True or False. Got %s (type=%s)'
                            % (str(shuffle), type(shuffle)))

        self.n_folds = n_folds
        self.shuffle = shuffle
        self.random_state = random_state

    @overrides(H2OBaseCrossValidator)
    def split(self, frame, y=None):
        """Split the frame.

        Parameters
        ----------

        frame : H2OFrame
            The frame to split

        y : string, optional (default=None)
            The column to stratify.
        """
        frame = check_frame(frame, copy=False)
        n_obs = frame.shape[0]

        if self.n_folds > n_obs:
            raise ValueError('Cannot have n_folds greater than n_obs')

        for train, test in super(_H2OBaseKFold, self).split(frame, y):
            yield train, test

    @overrides(H2OBaseCrossValidator)
    def get_n_splits(self):
        """Get the number of splits or folds.

        Returns
        -------

        n_folds : int
            The number of folds
        """
        return self.n_folds
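
# Hypothetical illustration (not part of the library above): a plain (unstratified) K-fold
# on top of _H2OBaseKFold only needs `_iter_test_indices`; shuffling is omitted for brevity.
class _PlainH2OKFold(_H2OBaseKFold):
    def __init__(self, n_folds=3, shuffle=False, random_state=None):
        super(_PlainH2OKFold, self).__init__(n_folds, shuffle, random_state)

    def _iter_test_indices(self, frame, y=None):
        # split the row indices into n_folds contiguous chunks
        for fold in np.array_split(np.arange(frame.shape[0]), self.n_folds):
            yield fold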
Example No. 23
class semiKMeans(six.with_metaclass(ABCMeta, BaseEstimator, ClusterMixin, TransformerMixin)):
    def __init__(self, maxiter=100, fixedprec=1e-9, verbose=False):
        self.maxiter = maxiter
        self.verbose = verbose
        self.fixedprec = fixedprec
        self.labels = None
        self.plattlr = None
        self.cluster_centers_ = None

    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to.

        In the vector quantization literature, `cluster_centers_` is called
        the code book and each value returned by `predict` is the index of
        the closest code in the code book.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data to predict.

        Returns
        -------
        labels : array, shape [n_samples,]
            Index of the cluster each sample belongs to.
        """
        # check_is_fitted(self, 'cluster_centers_')

        X = self._check_test_data(X)
        x_squared_norms = row_norms(X, squared=True)
        return _labels_inertia(X, x_squared_norms, self.cluster_centers_)[0]

    def _check_test_data(self, X):
        X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES,
                        warn_on_dtype=True)
        n_samples, n_features = X.shape
        expected_n_features = self.cluster_centers_.shape[1]
        if not n_features == expected_n_features:
            raise ValueError("Incorrect number of features. "
                             "Got %d features, expected %d" % (
                                 n_features, expected_n_features))

        return X

    def fit_transform(self, texts, labels):
        # Initialize clusters with labeled data
        clust_names = [x for x, y in collections.Counter(labels).items() if y > 0]
        clust_names = sorted(clust_names)[1:]
        #print(clust_names)
        #centroids = np.zeros((len(clust_names),len(texts[1,:])))
        centroids = np.zeros((len(clust_names),texts.shape[1]))

        for unique_name in clust_names:
            indices = [i for i, x in enumerate(labels) if x == unique_name]

            aux = texts[indices,:]
            #print(np.mean(aux,axis=0).shape)
            #print(centroids[clust_names.index(unique_name),:].shape)
            centroids[clust_names.index(unique_name),:] = np.mean(aux,axis=0)

        texts = mat(texts)
        centroids = mat(centroids)
        new_labels = labels

        cnt = 0
        # Main loop
        while cnt < self.maxiter:
            cnt += 1

            if self.verbose:
                print('Iter: '+str(cnt))

            # Assign data to nearest centroid (cosine distance)
            dist = dot(texts,centroids.T)/linalg.norm(texts)/linalg.norm(centroids)
            n_lab = dist.argmax(axis=1)

            for ii in range(len(n_lab)):
                new_labels[ii] = clust_names[n_lab[ii,0]]

    #        print [y for x, y in collections.Counter(new_labels).items() if y > 0]

            # Recalculate clusters
            new_centroids = np.zeros((len(clust_names),len(texts.T)))
            for unique_name in clust_names:
                indices = [i for i, x in enumerate(new_labels) if x == unique_name]

                if len(indices)>0:
                    aux = texts[indices,:]
                    new_centroids[clust_names.index(unique_name),:] = aux.mean(0)
                else:
                    new_centroids[clust_names.index(unique_name),:] = centroids[clust_names.index(unique_name),:]

            # Check exit condition
            difference = np.power((centroids-new_centroids),2)

            if difference.sum() < self.fixedprec:
                break
            else:
                self.labels = new_labels
                centroids = new_centroids

        self.cluster_centers_ = new_centroids
        self.plattlr = LR()
        preds = self.predict(texts[labels != -1, :])
        self.plattlr.fit(preds.reshape(-1, 1), labels[labels != -1])

        return labels

    def predict_proba(self, X):
        """Compute probabilities of possible outcomes for samples in X.

        Probabilities come from the Platt-scaling logistic regression fitted on
        the labelled samples during ``fit_transform``, so that method must be
        called before ``predict_proba``.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        T : array-like, shape = [n_samples, n_classes]
            Returns the probability of the sample for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute `classes_`.
        """


        preds = self.predict(X)
        return self.plattlr.predict_proba(preds.reshape(-1, 1))
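
# Standalone sketch (not the class's exact arithmetic): the assignment step in
# fit_transform is described as a cosine distance, i.e. each row receives the label of
# the centroid with the largest cosine similarity. With explicit row normalisation:
def _assign_by_cosine(texts, centroids):
    texts_n = texts / np.linalg.norm(texts, axis=1, keepdims=True)
    cents_n = centroids / np.linalg.norm(centroids, axis=1, keepdims=True)
    # one row per sample, one column per centroid; argmax picks the closest centroid
    return np.argmax(np.dot(texts_n, cents_n.T), axis=1)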
Example No. 24
class BaseSymbolic(six.with_metaclass(ABCMeta, BaseEstimator)):
    """Base class for symbolic regression / classification estimators.

    Warning: This class should not be used directly.
    Use derived classes instead.
    """
    @abstractmethod
    def __init__(self,
                 population_size=1000,
                 hall_of_fame=None,
                 n_components=None,
                 generations=20,
                 tournament_size=20,
                 stopping_criteria=0.0,
                 const_range=(-1., 1.),
                 init_depth=(2, 6),
                 init_method='half and half',
                 function_set=('add', 'sub', 'mul', 'div'),
                 metric='mean absolute error',
                 parsimony_coefficient=0.001,
                 p_crossover=0.9,
                 p_subtree_mutation=0.01,
                 p_hoist_mutation=0.01,
                 p_point_mutation=0.01,
                 p_point_replace=0.05,
                 max_samples=1.0,
                 warm_start=False,
                 n_jobs=1,
                 verbose=0,
                 random_state=None):

        self.population_size = population_size
        self.hall_of_fame = hall_of_fame
        self.n_components = n_components
        self.generations = generations
        self.tournament_size = tournament_size
        self.stopping_criteria = stopping_criteria
        self.const_range = const_range
        self.init_depth = init_depth
        self.init_method = init_method
        self.function_set = function_set
        self.metric = metric
        self.parsimony_coefficient = parsimony_coefficient
        self.p_crossover = p_crossover
        self.p_subtree_mutation = p_subtree_mutation
        self.p_hoist_mutation = p_hoist_mutation
        self.p_point_mutation = p_point_mutation
        self.p_point_replace = p_point_replace
        self.max_samples = max_samples
        self.warm_start = warm_start
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.random_state = random_state

    def _verbose_reporter(self,
                          start_time=None,
                          gen=None,
                          population=None,
                          fitness=None,
                          length=None):
        """A report of the progress of the evolution process.

        Parameters
        ----------
        start_time : float
            The start time for the current generation.

        gen : int
            The current generation (0 is the first naive random population).

        population : list
            The current population.

        fitness : list
            The current population's raw fitness.

        length : list
            The current population's lengths.
        """
        if start_time is None:
            print('%4s|%-25s|%-42s|' % (' ', 'Population Average'.center(25),
                                        'Best Individual'.center(42)))
            print('-' * 4 + ' ' + '-' * 25 + ' ' + '-' * 42 + ' ' + '-' * 10)
            header_fields = ('Gen', 'Length', 'Fitness', 'Length', 'Fitness',
                             'OOB Fitness', 'Time Left')
            print('%4s %8s %16s %8s %16s %16s %10s' % header_fields)

        else:
            # Estimate remaining time for run
            remaining_time = ((self.generations - gen - 1) *
                              (time() - start_time) / float(gen + 1))
            if remaining_time > 60:
                remaining_time = '{0:.2f}m'.format(remaining_time / 60.0)
            else:
                remaining_time = '{0:.2f}s'.format(remaining_time)

            # Find the current generation's best individual
            if self._metric.greater_is_better:
                best_program = population[np.argmax(fitness)]
            else:
                best_program = population[np.argmin(fitness)]

            oob_fitness = 'N/A'
            if self.max_samples < 1.0:
                oob_fitness = best_program.oob_fitness_

            print('%4s %8s %16s %8s %16s %16s %10s' %
                  (gen, np.round(np.mean(length),
                                 2), np.mean(fitness), best_program.length_,
                   best_program.raw_fitness_, oob_fitness, remaining_time))

    def fit(self, X, y, sample_weight=None):
        """Fit the Genetic Program according to X, y.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target values.

        sample_weight : array-like, shape = [n_samples], optional
            Weights applied to individual samples.

        Returns
        -------
        self : object
            Returns self.
        """
        random_state = check_random_state(self.random_state)

        # Check arrays
        X, y = check_X_y(X, y, y_numeric=True)
        _, self.n_features_ = X.shape

        hall_of_fame = self.hall_of_fame
        if hall_of_fame is None:
            hall_of_fame = self.population_size
        if hall_of_fame > self.population_size or hall_of_fame < 1:
            raise ValueError('hall_of_fame (%d) must be less than or equal to '
                             'population_size (%d).' %
                             (self.hall_of_fame, self.population_size))
        n_components = self.n_components
        if n_components is None:
            n_components = hall_of_fame
        if n_components > hall_of_fame or n_components < 1:
            raise ValueError('n_components (%d) must be less than or equal to '
                             'hall_of_fame (%d).' %
                             (self.n_components, self.hall_of_fame))

        self._function_set = []
        for function in self.function_set:
            if isinstance(function, six.string_types):
                if function not in _function_map:
                    raise ValueError('invalid function name %s found in '
                                     '`function_set`.' % function)
                self._function_set.append(_function_map[function])
            elif isinstance(function, _Function):
                self._function_set.append(function)
            else:
                raise ValueError('invalid type %s found in `function_set`.' %
                                 type(function))
        if len(self._function_set) == 0:
            raise ValueError('No valid functions found in `function_set`.')

        # For point-mutation to find a compatible replacement node
        self._arities = {}
        for function in self._function_set:
            arity = function.arity
            self._arities[arity] = self._arities.get(arity, [])
            self._arities[arity].append(function)

        if isinstance(self.metric, _Fitness):
            self._metric = self.metric
        elif isinstance(self, RegressorMixin):
            if self.metric not in ('mean absolute error', 'mse', 'rmse'):
                raise ValueError('Unsupported metric: %s' % self.metric)
            else:
                self._metric = _fitness_map[self.metric]
        elif isinstance(self, TransformerMixin):
            if self.metric not in ('pearson', 'spearman'):
                raise ValueError('Unsupported metric: %s' % self.metric)
            else:
                self._metric = _fitness_map[self.metric]

        self._method_probs = np.array([
            self.p_crossover, self.p_subtree_mutation, self.p_hoist_mutation,
            self.p_point_mutation
        ])
        self._method_probs = np.cumsum(self._method_probs)

        if self._method_probs[-1] > 1:
            raise ValueError('The sum of p_crossover, p_subtree_mutation, '
                             'p_hoist_mutation and p_point_mutation should '
                             'total to 1.0 or less.')

        if self.init_method not in ('half and half', 'grow', 'full'):
            raise ValueError('Valid program initialization methods include '
                             '"grow", "full" and "half and half". Given %s.' %
                             self.init_method)

        if (not isinstance(self.const_range, tuple)
                or len(self.const_range) != 2):
            raise ValueError('const_range should be a tuple with length two.')

        if (not isinstance(self.init_depth, tuple)
                or len(self.init_depth) != 2):
            raise ValueError('init_depth should be a tuple with length two.')
        if self.init_depth[0] > self.init_depth[1]:
            raise ValueError('init_depth should be in increasing numerical '
                             'order: (min_depth, max_depth).')

        params = self.get_params()
        params['_metric'] = self._metric
        params['function_set'] = self._function_set
        params['arities'] = self._arities
        params['method_probs'] = self._method_probs

        if not self.warm_start or not hasattr(self, "_programs"):
            # Free allocated memory, if any
            self._programs = []

        prior_generations = len(self._programs)
        n_more_generations = self.generations - prior_generations

        if n_more_generations < 0:
            raise ValueError('generations=%d must be larger or equal to '
                             'len(_programs)=%d when warm_start==True' %
                             (self.generations, len(self._programs)))
        elif n_more_generations == 0:
            fitness = [program.raw_fitness_ for program in self._programs[-1]]
            warn("Warm-start fitting without increasing n_estimators does not "
                 "fit new trees.")

        if self.warm_start:
            # Generate and discard seeds that would have been produced on the
            # initial fit call.
            for i in range(len(self._programs)):
                _ = random_state.randint(MAX_INT, size=self.population_size)

        if self.verbose:
            # Print header fields
            self._verbose_reporter()
            start_time = time()

        for gen in range(prior_generations, self.generations):

            if gen == 0:
                parents = None
            else:
                parents = self._programs[gen - 1]

            # Parallel loop
            n_jobs, n_programs, starts = _partition_estimators(
                self.population_size, self.n_jobs)
            seeds = random_state.randint(MAX_INT, size=self.population_size)

            population = Parallel(
                n_jobs=n_jobs,
                verbose=int(self.verbose > 1))(delayed(_parallel_evolve)(
                    n_programs[i], parents, X, y, sample_weight,
                    seeds[starts[i]:starts[i + 1]], params)
                                               for i in range(n_jobs))

            # Reduce, maintaining order across different n_jobs
            population = list(itertools.chain.from_iterable(population))

            fitness = [program.raw_fitness_ for program in population]
            length = [program.length_ for program in population]

            parsimony_coefficient = None
            if self.parsimony_coefficient == 'auto':
                parsimony_coefficient = (np.cov(length, fitness)[1, 0] /
                                         np.var(length))
            for program in population:
                program.fitness_ = program.fitness(parsimony_coefficient)

            self._programs.append(population)

            # Remove old programs that didn't make it into the new population.
            for old_gen in np.arange(gen, 0, -1):
                indices = []
                for program in self._programs[old_gen]:
                    if program is not None:
                        for idx in program.parents:
                            if 'idx' in idx:
                                indices.append(program.parents[idx])
                indices = set(indices)
                for idx in range(self.population_size):
                    if idx not in indices:
                        self._programs[old_gen - 1][idx] = None

            if self.verbose:
                self._verbose_reporter(start_time, gen, population, fitness,
                                       length)

            # Check for early stopping
            if self._metric.greater_is_better:
                best_fitness = fitness[np.argmax(fitness)]
                if best_fitness >= self.stopping_criteria:
                    break
            else:
                best_fitness = fitness[np.argmin(fitness)]
                if best_fitness <= self.stopping_criteria:
                    break

        if isinstance(self, RegressorMixin):
            # Find the best individual in the final generation
            self._program = self._programs[-1][np.argmin(fitness)]

        if isinstance(self, TransformerMixin):
            # Find the best individuals in the final generation
            fitness = np.array(fitness)
            hall_of_fame = fitness.argsort()[:self.hall_of_fame]
            evaluation = np.array([
                gp.execute(X)
                for gp in [self._programs[-1][i] for i in hall_of_fame]
            ])
            if self.metric == 'spearman':
                evaluation = np.apply_along_axis(rankdata, 1, evaluation)

            # Iteratively remove the worst individual of the worst pair
            with np.errstate(divide='ignore', invalid='ignore'):
                correlations = np.abs(np.corrcoef(evaluation))
            np.fill_diagonal(correlations, 0.)
            components = list(range(self.hall_of_fame))
            indices = list(range(self.hall_of_fame))
            while len(components) > self.n_components:
                worst = np.unravel_index(np.argmax(correlations),
                                         correlations.shape)
                worst = worst[np.argmax(np.sum(correlations[worst, :], 1))]
                components.pop(worst)
                indices.remove(worst)
                correlations = correlations[:, indices][indices, :]
                indices = list(range(len(components)))
            self._best_programs = [
                self._programs[-1][i] for i in hall_of_fame[components]
            ]

        return self
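
# Illustrative sketch (not part of the class above): the cumulative `_method_probs`
# vector built in `fit` is how a single uniform draw chooses which genetic operation
# produces each child; any remaining probability mass falls through to reproduction.
def _pick_method(method_probs, random_state):
    op = random_state.uniform()
    for name, bound in zip(('crossover', 'subtree', 'hoist', 'point'), method_probs):
        if op < bound:
            return name
    return 'reproduction'  # copy a parent unchanged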
Example No. 25
class BaseMultilayerPerceptron(six.with_metaclass(ABCMeta, BaseEstimator)):
    """Base class for MLP classification and regression.

    Warning: This class should not be used directly.
    Use derived classes instead.
    """

    @abstractmethod
    def __init__(self, hidden_layer_sizes, activation, algorithm,
                 alpha, batch_size, learning_rate, learning_rate_init, power_t,
                 max_iter, loss, shuffle, random_state, tol, verbose,
                 warm_start, momentum, nesterovs_momentum, early_stopping,
                 validation_fraction, beta_1, beta_2, epsilon):
        self.activation = activation
        self.algorithm = algorithm
        self.alpha = alpha
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.learning_rate_init = learning_rate_init
        self.power_t = power_t
        self.max_iter = max_iter
        self.loss = loss
        self.hidden_layer_sizes = hidden_layer_sizes
        self.shuffle = shuffle
        self.random_state = random_state
        self.tol = tol
        self.verbose = verbose
        self.warm_start = warm_start
        self.momentum = momentum
        self.nesterovs_momentum = nesterovs_momentum
        self.early_stopping = early_stopping
        self.validation_fraction = validation_fraction
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon

    def _unpack(self, packed_parameters):
        """Extract the coefficients and intercepts from packed_parameters."""
        for i in range(self.n_layers_ - 1):
            start, end, shape = self._coef_indptr[i]
            self.coefs_[i] = np.reshape(packed_parameters[start:end], shape)

            start, end = self._intercept_indptr[i]
            self.intercepts_[i] = packed_parameters[start:end]

    def _forward_pass(self, activations, with_output_activation=True):
        """Perform a forward pass on the network by computing the values
        of the neurons in the hidden layers and the output layer.

        Parameters
        ----------
        activations: list, length = n_layers - 1
            The ith element of the list holds the values of the ith layer.

        with_output_activation : bool, default True
            If True, the output passes through the output activation
            function, which is either the softmax function or the
            logistic function
        """
        hidden_activation = ACTIVATIONS[self.activation]
        # Iterate over the hidden layers
        for i in range(self.n_layers_ - 1):
            activations[i + 1] = safe_sparse_dot(activations[i],
                                                 self.coefs_[i])
            activations[i + 1] += self.intercepts_[i]

            # For the hidden layers
            if (i + 1) != (self.n_layers_ - 1):
                activations[i + 1] = hidden_activation(activations[i + 1])

        # For the last layer
        if with_output_activation:
            output_activation = ACTIVATIONS[self.out_activation_]
            activations[i + 1] = output_activation(activations[i + 1])

        return activations

    def _compute_loss_grad(self, layer, n_samples, activations, deltas,
                           coef_grads, intercept_grads):
        """Compute the gradient of loss with respect to coefs and intercept for
        specified layer.

        This function does backpropagation for the specified one layer.
        """
        coef_grads[layer] = safe_sparse_dot(activations[layer].T,
                                            deltas[layer])
        coef_grads[layer] += (self.alpha * self.coefs_[layer])
        coef_grads[layer] /= n_samples

        intercept_grads[layer] = np.mean(deltas[layer], 0)

        return coef_grads, intercept_grads

    def _loss_grad_lbfgs(self, packed_coef_inter, X, y, activations, deltas,
                         coef_grads, intercept_grads):
        """Compute the MLP loss function and its corresponding derivatives
        with respect to the different parameters given in the initialization.

        Returned gradients are packed in a single vector so it can be used
        in l-bfgs

        Parameters
        ----------
        packed_coef_inter : array-like
            A vector comprising the flattened coefficients and intercepts.

        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.

        y : array-like, shape (n_samples,)
            The target values.

        activations: list, length = n_layers - 1
            The ith element of the list holds the values of the ith layer.

        deltas : list, length = n_layers - 1
            The ith element of the list holds the difference between the
            activations of the i + 1 layer and the backpropagated error.
            More specifically, deltas are gradients of loss with respect to z
            in each layer, where z = wx + b is the value of a particular layer
            before passing through the activation function

        coef_grad : list, length = n_layers - 1
            The ith element contains the amount of change used to update the
            coefficient parameters of the ith layer in an iteration.

        intercept_grads : list, length = n_layers - 1
            The ith element contains the amount of change used to update the
            intercept parameters of the ith layer in an iteration.

        Returns
        -------
        loss : float
        grad : array-like, shape (number of nodes of all layers,)

        """
        self._unpack(packed_coef_inter)
        loss, coef_grads, intercept_grads = self._backprop(
            X, y, activations, deltas, coef_grads, intercept_grads)
        self.n_iter_ += 1
        grad = _pack(coef_grads, intercept_grads)
        return loss, grad

    def _backprop(self, X, y, activations, deltas, coef_grads,
                  intercept_grads):
        """Compute the MLP loss function and its corresponding derivatives
        with respect to each parameter: weights and bias vectors.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.

        y : array-like, shape (n_samples,)
            The target values.

        activations: list, length = n_layers - 1
             The ith element of the list holds the values of the ith layer.

        deltas : list, length = n_layers - 1
            The ith element of the list holds the difference between the
            activations of the i + 1 layer and the backpropagated error.
            More specifically, deltas are gradients of loss with respect to z
            in each layer, where z = wx + b is the value of a particular layer
            before passing through the activation function

        coef_grad : list, length = n_layers - 1
            The ith element contains the amount of change used to update the
            coefficient parameters of the ith layer in an iteration.

        intercept_grads : list, length = n_layers - 1
            The ith element contains the amount of change used to update the
            intercept parameters of the ith layer in an iteration.

        Returns
        -------
        loss : float
        coef_grads : list, length = n_layers - 1
        intercept_grads : list, length = n_layers - 1
        """
        n_samples = X.shape[0]

        # Forward propagate
        activations = self._forward_pass(activations)

        # Get loss
        loss = LOSS_FUNCTIONS[self.loss](y, activations[-1])
        # Add L2 regularization term to loss
        values = np.sum(
            np.array([np.dot(s.ravel(), s.ravel()) for s in self.coefs_]))
        loss += (0.5 * self.alpha) * values / n_samples

        # Backward propagate
        last = self.n_layers_ - 2

        # The calculation of delta[last] here works with following
        # combinations of output activation and loss function:
        # sigmoid and binary cross entropy, softmax and categorical cross
        # entropy, and identity with squared loss
        diff = y - activations[-1]
        deltas[last] = -diff

        # Compute gradient for the last layer
        coef_grads, intercept_grads = self._compute_loss_grad(
            last, n_samples, activations, deltas, coef_grads, intercept_grads)

        # Iterate over the hidden layers
        for i in range(self.n_layers_ - 2, 0, -1):
            deltas[i - 1] = safe_sparse_dot(deltas[i], self.coefs_[i].T)
            derivative = DERIVATIVES[self.activation]
            deltas[i - 1] *= derivative(activations[i])

            coef_grads, intercept_grads = self._compute_loss_grad(
                i - 1, n_samples, activations, deltas, coef_grads,
                intercept_grads)

        return loss, coef_grads, intercept_grads

    def _initialize(self, y, layer_units):
        # set all attributes, allocate weights etc for first call
        # Initialize parameters
        self.n_iter_ = 0
        self.t_ = 0
        self.n_outputs_ = y.shape[1]

        # Compute the number of layers
        self.n_layers_ = len(layer_units)

        # Output for regression
        if not isinstance(self, ClassifierMixin):
            self.out_activation_ = 'identity'
        # Output for multi class
        elif self.label_binarizer_.y_type_ == 'multiclass':
            self.out_activation_ = 'softmax'
        # Output for binary class and multi-label
        else:
            self.out_activation_ = 'logistic'
            if self.loss == 'log_loss':
                self.loss = 'binary_log_loss'

        # Initialize coefficient and intercept layers
        self.coefs_ = []
        self.intercepts_ = []

        # create the RNG once so successive layers do not reuse the same seed
        rng = check_random_state(self.random_state)
        for i in range(self.n_layers_ - 1):
            coef_init, intercept_init = self._init_coef(layer_units[i],
                                                        layer_units[i + 1],
                                                        rng)
            self.coefs_.append(coef_init)
            self.intercepts_.append(intercept_init)

        if self.algorithm in _STOCHASTIC_ALGOS:
            self.loss_curve_ = []
            self._no_improvement_count = 0
            if self.early_stopping:
                self.validation_scores_ = []
                self.best_validation_score_ = -np.inf
            else:
                self.best_loss_ = np.inf

    def _init_coef(self, fan_in, fan_out, rng):
        if self.activation == 'logistic':
            # Use the initialization method recommended by
            # Glorot et al.
            init_bound = np.sqrt(2. / (fan_in + fan_out))
        elif self.activation == 'tanh':
            init_bound = np.sqrt(6. / (fan_in + fan_out))
        elif self.activation == 'relu':
            init_bound = np.sqrt(6. / (fan_in + fan_out))
        else:
            # this was caught earlier, just to make sure
            raise ValueError("Unknown activation function %s" %
                             self.activation)

        coef_init = rng.uniform(-init_bound, init_bound, (fan_in, fan_out))
        intercept_init = rng.uniform(-init_bound, init_bound, fan_out)
        return coef_init, intercept_init

    def _fit(self, X, y, incremental=False):
        # Make sure self.hidden_layer_sizes is a list
        hidden_layer_sizes = self.hidden_layer_sizes
        if not hasattr(hidden_layer_sizes, "__iter__"):
            hidden_layer_sizes = [hidden_layer_sizes]
        hidden_layer_sizes = list(hidden_layer_sizes)

        # Validate input parameters.
        self._validate_hyperparameters()
        if np.any(np.array(hidden_layer_sizes) <= 0):
            raise ValueError("hidden_layer_sizes must be > 0, got %s." %
                             hidden_layer_sizes)

        X, y = self._validate_input(X, y, incremental)
        n_samples, n_features = X.shape

        # Ensure y is 2D
        if y.ndim == 1:
            y = y.reshape((-1, 1))

        self.n_outputs_ = y.shape[1]

        layer_units = ([n_features] + hidden_layer_sizes +
                       [self.n_outputs_])

        if not hasattr(self, 'coefs_') or (not self.warm_start and not
                                           incremental):
            # First time training the model
            self._initialize(y, layer_units)

        # l-bfgs does not support mini-batches
        if self.algorithm == 'l-bfgs':
            batch_size = n_samples
        elif self.batch_size == 'auto':
            batch_size = min(200, n_samples)
        else:
            if self.batch_size < 1 or self.batch_size > n_samples:
                warnings.warn("Got `batch_size` less than 1 or larger than "
                              "sample size. It is going to be clipped")
            batch_size = np.clip(self.batch_size, 1, n_samples)

        # Initialize lists
        activations = [X]
        activations.extend(np.empty((batch_size, n_fan_out))
                           for n_fan_out in layer_units[1:])
        deltas = [np.empty_like(a_layer) for a_layer in activations]

        coef_grads = [np.empty((n_fan_in_, n_fan_out_)) for n_fan_in_,
                      n_fan_out_ in zip(layer_units[:-1],
                                        layer_units[1:])]

        intercept_grads = [np.empty(n_fan_out_) for n_fan_out_ in
                           layer_units[1:]]

        # Run the Stochastic optimization algorithm
        if self.algorithm in _STOCHASTIC_ALGOS:
            self._fit_stochastic(X, y, activations, deltas, coef_grads,
                                 intercept_grads, layer_units, incremental)

        # Run the LBFGS algorithm
        elif self.algorithm == 'l-bfgs':
            self._fit_lbfgs(X, y, activations, deltas, coef_grads,
                            intercept_grads, layer_units)
        return self

    def _validate_hyperparameters(self):
        if not isinstance(self.shuffle, bool):
            raise ValueError("shuffle must be either True or False, got %s." %
                             self.shuffle)
        if self.max_iter <= 0:
            raise ValueError("max_iter must be > 0, got %s." % self.max_iter)
        if self.alpha < 0.0:
            raise ValueError("alpha must be >= 0, got %s." % self.alpha)
        if (self.learning_rate in ["constant", "invscaling", "adaptive"] and
                self.learning_rate_init <= 0.0):
            raise ValueError("learning_rate_init must be > 0, got %s." %
                             self.learning_rate_init)
        if self.momentum > 1 or self.momentum < 0:
            raise ValueError("momentum must be >= 0 and <= 1, got %s" %
                             self.momentum)
        if not isinstance(self.nesterovs_momentum, bool):
            raise ValueError("nesterovs_momentum must be either True or False,"
                             " got %s." % self.nesterovs_momentum)
        if not isinstance(self.early_stopping, bool):
            raise ValueError("early_stopping must be either True or False,"
                             " got %s." % self.early_stopping)
        if self.validation_fraction < 0 or self.validation_fraction >= 1:
            raise ValueError("validation_fraction must be >= 0 and < 1, "
                             "got %s" % self.validation_fraction)
        if self.beta_1 < 0 or self.beta_1 >= 1:
            raise ValueError("beta_1 must be >= 0 and < 1, got %s" %
                             self.beta_1)
        if self.beta_2 < 0 or self.beta_2 >= 1:
            raise ValueError("beta_2 must be >= 0 and < 1, got %s" %
                             self.beta_2)
        if self.epsilon <= 0.0:
            raise ValueError("epsilon must be > 0, got %s." % self.epsilon)

        # raise ValueError if not registered
        supported_activations = ['logistic', 'tanh', 'relu']
        if self.activation not in supported_activations:
            raise ValueError("The activation '%s' is not supported. Supported "
                             "activations are %s." % (self.activation,
                                                      supported_activations))
        if self.learning_rate not in ["constant", "invscaling", "adaptive"]:
            raise ValueError("learning rate %s is not supported. " %
                             self.learning_rate)
        if self.algorithm not in _STOCHASTIC_ALGOS + ["l-bfgs"]:
            raise ValueError("The algorithm %s is not supported. " %
                             self.algorithm)

    def _fit_lbfgs(self, X, y, activations, deltas, coef_grads,
                   intercept_grads, layer_units):
        # Store meta information for the parameters
        self._coef_indptr = []
        self._intercept_indptr = []
        start = 0

        # Save sizes and indices of coefficients for faster unpacking
        for i in range(self.n_layers_ - 1):
            n_fan_in, n_fan_out = layer_units[i], layer_units[i + 1]

            end = start + (n_fan_in * n_fan_out)
            self._coef_indptr.append((start, end, (n_fan_in, n_fan_out)))
            start = end

        # Save sizes and indices of intercepts for faster unpacking
        for i in range(self.n_layers_ - 1):
            end = start + layer_units[i + 1]
            self._intercept_indptr.append((start, end))
            start = end

        # Run LBFGS
        packed_coef_inter = _pack(self.coefs_,
                                  self.intercepts_)

        if self.verbose is True or self.verbose >= 1:
            iprint = 1
        else:
            iprint = -1

        optimal_parameters, self.loss_, d = fmin_l_bfgs_b(
            x0=packed_coef_inter,
            func=self._loss_grad_lbfgs,
            maxfun=self.max_iter,
            iprint=iprint,
            pgtol=self.tol,
            args=(X, y, activations, deltas, coef_grads, intercept_grads))

        self._unpack(optimal_parameters)

    def _fit_stochastic(self, X, y, activations, deltas, coef_grads,
                        intercept_grads, layer_units, incremental):
        rng = check_random_state(self.random_state)

        if not incremental or not hasattr(self, '_optimizer'):
            params = self.coefs_ + self.intercepts_

            if self.algorithm == 'sgd':
                self._optimizer = SGDOptimizer(
                    params, self.learning_rate_init, self.learning_rate,
                    self.momentum, self.nesterovs_momentum, self.power_t)
            elif self.algorithm == 'adam':
                self._optimizer = AdamOptimizer(
                    params, self.learning_rate_init, self.beta_1, self.beta_2,
                    self.epsilon)

        # early_stopping in partial_fit doesn't make sense
        early_stopping = self.early_stopping and not incremental
        if early_stopping:
            X, X_val, y, y_val = train_test_split(
                X, y, random_state=self.random_state,
                test_size=self.validation_fraction)
            if isinstance(self, ClassifierMixin):
                y_val = self.label_binarizer_.inverse_transform(y_val)
        else:
            X_val = None
            y_val = None

        n_samples = X.shape[0]

        if self.batch_size == 'auto':
            batch_size = min(200, n_samples)
        else:
            batch_size = np.clip(self.batch_size, 1, n_samples)

        try:
            for it in range(self.max_iter):
                X, y = shuffle(X, y, random_state=rng)
                accumulated_loss = 0.0
                for batch_slice in gen_batches(n_samples, batch_size):
                    activations[0] = X[batch_slice]
                    batch_loss, coef_grads, intercept_grads = self._backprop(
                        X[batch_slice], y[batch_slice], activations, deltas,
                        coef_grads, intercept_grads)
                    accumulated_loss += batch_loss * (batch_slice.stop -
                                                      batch_slice.start)

                    # update weights
                    grads = coef_grads + intercept_grads
                    self._optimizer.update_params(grads)

                self.n_iter_ += 1
                self.loss_ = accumulated_loss / X.shape[0]

                self.t_ += n_samples
                self.loss_curve_.append(self.loss_)
                if self.verbose:
                    print("Iteration %d, loss = %.8f" % (self.n_iter_,
                                                         self.loss_))

                # update no_improvement_count based on training loss or
                # validation score according to early_stopping
                self._update_no_improvement_count(early_stopping, X_val, y_val)

                # for learning rate that needs to be updated at iteration end
                self._optimizer.iteration_ends(self.t_)

                if self._no_improvement_count > 2:
                    # not better than last two iterations by tol.
                    # stop or decrease learning rate
                    if early_stopping:
                        msg = ("Validation score did not improve more than "
                               "tol=%f for two consecutive epochs." % self.tol)
                    else:
                        msg = ("Training loss did not improve more than tol=%f"
                               " for two consecutive epochs." % self.tol)

                    is_stopping = self._optimizer.trigger_stopping(
                        msg, self.verbose)
                    if is_stopping:
                        break
                    else:
                        self._no_improvement_count = 0

                if incremental:
                    break

                if self.n_iter_ == self.max_iter:
                    warnings.warn('Stochastic Optimizer: Maximum iterations '
                                  'reached and the optimization hasn\'t '
                                  'converged yet.', ConvergenceWarning)
        except KeyboardInterrupt:
            pass

        if early_stopping:
            # restore best weights
            self.coefs_ = self._best_coefs
            self.intercepts_ = self._best_intercepts

    def _update_no_improvement_count(self, early_stopping, X_val, y_val):
        if early_stopping:
            # compute validation score, use that for stopping
            self.validation_scores_.append(self.score(X_val, y_val))

            if self.verbose:
                print("Validation score: %f" % self.validation_scores_[-1])
            # update best parameters
            # use validation_scores_, not loss_curve_
            # let's hope no-one overloads .score with mse
            last_valid_score = self.validation_scores_[-1]

            if last_valid_score < (self.best_validation_score_ +
                                   self.tol):
                self._no_improvement_count += 1
            else:
                self._no_improvement_count = 0

            if last_valid_score > self.best_validation_score_:
                self.best_validation_score_ = last_valid_score
                self._best_coefs = [c.copy() for c in self.coefs_]
                self._best_intercepts = [i.copy()
                                         for i in self.intercepts_]
        else:
            if self.loss_curve_[-1] > self.best_loss_ - self.tol:
                self._no_improvement_count += 1
            else:
                self._no_improvement_count = 0
            if self.loss_curve_[-1] < self.best_loss_:
                self.best_loss_ = self.loss_curve_[-1]

    def fit(self, X, y):
        """Fit the model to data matrix X and target y.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.

        y : array-like, shape (n_samples,)
            The target values.

        Returns
        -------
        self : returns a trained MLP model.
        """
        return self._fit(X, y, incremental=False)

    @property
    def partial_fit(self):
        """Fit the model to data matrix X and target y.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.

        y : array-like, shape (n_samples,)
            The target values.

        Returns
        -------
        self : returns a trained MLP model.
        """
        if self.algorithm not in _STOCHASTIC_ALGOS:
            raise AttributeError("partial_fit is only available for stochastic"
                                 "optimization algorithms. %s is not"
                                 " stochastic" % self.algorithm)
        return self._partial_fit

    def _partial_fit(self, X, y, classes=None):
        return self._fit(X, y, incremental=True)

    def _decision_scores(self, X):
        """Predict using the trained model

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.

        Returns
        -------
        y_pred : array-like, shape (n_samples,) or (n_samples, n_outputs)
            The decision function of the samples for each class in the model.
        """
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        # Make sure self.hidden_layer_sizes is a list
        hidden_layer_sizes = self.hidden_layer_sizes
        if not hasattr(hidden_layer_sizes, "__iter__"):
            hidden_layer_sizes = [hidden_layer_sizes]
        hidden_layer_sizes = list(hidden_layer_sizes)

        layer_units = [X.shape[1]] + hidden_layer_sizes + \
            [self.n_outputs_]

        # Initialize layers
        activations = [X]

        for i in range(self.n_layers_ - 1):
            activations.append(np.empty((X.shape[0],
                                         layer_units[i + 1])))
        # forward propagate
        self._forward_pass(activations, with_output_activation=False)
        y_pred = activations[-1]

        return y_pred
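A minimal standalone sketch of the no-improvement bookkeeping that _update_no_improvement_count implements above; tol and n_iter_no_change are hypothetical stand-ins for the estimator's settings, not its actual attributes.

import numpy as np

def should_stop(loss_curve, best_loss, no_improvement_count,
                tol=1e-4, n_iter_no_change=2):
    """Return (stop, best_loss, no_improvement_count) after one epoch."""
    last_loss = loss_curve[-1]
    if last_loss > best_loss - tol:
        no_improvement_count += 1   # no meaningful improvement this epoch
    else:
        no_improvement_count = 0    # reset the counter on real improvement
    best_loss = min(best_loss, last_loss)
    return no_improvement_count >= n_iter_no_change, best_loss, no_improvement_count

# The loss stalls after the second epoch, so training would stop.
losses, best, count = [], np.inf, 0
for loss in [1.0, 0.5, 0.5, 0.5]:
    losses.append(loss)
    stop, best, count = should_stop(losses, best, count)
print(stop)  # True: no improvement greater than tol for two consecutive epochs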
Example No. 26
class IRAPSClassifier(
        six.with_metaclass(ABCMeta, _BaseFilter, BaseEstimator,
                           RegressorMixin)):
    """
    Extend the bases of both sklearn feature_selector and classifier.
    From sklearn BaseEstimator:
        get_params()
        set_params()
    From sklearn _BaseFilter:
        get_support()
        fit_transform(X)
        transform(X)
    From sklearn RegressorMixin:
        score(X, y): R2
    New:
        predict(X)
        predict_label(X)
        get_signature()
    Properties:
        discretize_value

    Parameters
    ----------
    iraps_core: object
    p_thres: float, threshold for p_values
    fc_thres: float, threshold for fold change or mean difference
    occurrence: float, occurrence rate selected by set of p_thres and fc_thres
    discretize: float, threshold of z_score to discretize target value
    memory: None, str or joblib.Memory object
    min_signature_features: int, the minimum number of features in a signature
    """
    def __init__(self,
                 iraps_core,
                 p_thres=1e-4,
                 fc_thres=0.1,
                 occurrence=0.8,
                 discretize=-1,
                 memory=None,
                 min_signature_features=1):
        self.iraps_core = iraps_core
        self.p_thres = p_thres
        self.fc_thres = fc_thres
        self.occurrence = occurrence
        self.discretize = discretize
        self.memory = memory
        self.min_signature_features = min_signature_features

    def fit(self, X, y):
        memory = check_memory(self.memory)
        cached_fit = memory.cache(_iraps_core_fit)
        iraps_core = clone(self.iraps_core)
        # allow pre-fitted iraps_core here
        if not hasattr(iraps_core, 'pvalues_'):
            iraps_core = cached_fit(iraps_core, X, y)
        self.iraps_core_ = iraps_core

        pvalues = as_float_array(iraps_core.pvalues_, copy=True)
        # NaN p-values may appear here; replace them with the largest
        # representable value
        pvalues[np.isnan(pvalues)] = np.finfo(pvalues.dtype).max

        fold_changes = as_float_array(iraps_core.fold_changes_, copy=True)
        fold_changes[np.isnan(fold_changes)] = 0.0

        base_values = as_float_array(iraps_core.base_values_, copy=True)

        p_thres = self.p_thres
        fc_thres = self.fc_thres
        occurrence = self.occurrence

        mask_0 = np.zeros(pvalues.shape, dtype=np.int32)
        # mark p_values less than or equal to the threshold
        mask_0[pvalues <= p_thres] = 1
        # keep the mark only where |fold_change| >= the threshold
        mask_0[abs(fold_changes) < fc_thres] = 0

        # count the occurrence and mask greater than the threshold
        counts = mask_0.sum(axis=0)
        occurrence_thres = int(occurrence * iraps_core.n_iter)
        mask = np.zeros(counts.shape, dtype=bool)
        mask[counts >= occurrence_thres] = 1

        # generate signature
        fold_changes[mask_0 == 0] = 0.0
        signature = fold_changes[:, mask].sum(axis=0) / counts[mask]
        signature = np.vstack((signature, base_values[:, mask].mean(axis=0)))
        # It's not clear whether min_size could impact prediction
        # performance
        if signature is None\
                or signature.shape[1] < self.min_signature_features:
            raise ValueError("The classifier got a None signature or the number "
                             "of signature features is less than the minimum!")

        self.signature_ = np.asarray(signature)
        self.mask_ = mask
        # TODO: support other discretization methods: fixed value, upper
        # third quartile, etc.
        self.discretize_value = y.mean() + y.std() * self.discretize
        if iraps_core.negative_thres > iraps_core.positive_thres:
            self.less_is_positive = True
        else:
            self.less_is_positive = False

        return self

    def _get_support_mask(self):
        """
        return mask of feature selection indices
        """
        check_is_fitted(self, 'mask_')

        return self.mask_

    def get_signature(self):
        """
        return signature
        """
        check_is_fitted(self, 'signature_')

        return self.signature_

    def predict(self, X):
        """
        compute the correlation coefficient with the IRAPS signature
        """
        signature = self.get_signature()

        X = as_float_array(X)
        X_transformed = self.transform(X) - signature[1]
        corrcoef = np.array(
            [np.corrcoef(signature[0], e)[0][1] for e in X_transformed])
        corrcoef[np.isnan(corrcoef)] = np.finfo(np.float32).min

        return corrcoef

    def predict_label(self, X, clf_cutoff=0.4):
        return self.predict(X) >= clf_cutoff
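A minimal sketch of the signature-correlation scoring that predict() and predict_label() perform, assuming a hypothetical already-fitted signature and a feature-selected input matrix (in practice these come from fit() and transform()).

import numpy as np

signature = np.array([[0.8, -0.5, 1.2],   # row 0: averaged fold changes of the selected features
                      [0.1,  0.0, 0.3]])  # row 1: baseline (negative-group) means
X_selected = np.random.RandomState(0).normal(size=(5, 3))  # already feature-selected samples

centered = X_selected - signature[1]
scores = np.array([np.corrcoef(signature[0], row)[0, 1] for row in centered])
scores[np.isnan(scores)] = np.finfo(np.float32).min  # guard against zero-variance rows
labels = scores >= 0.4                               # predict_label with the default cutoff
print(scores.shape, labels)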
Example No. 27
class BaseSpectral(six.with_metaclass(ABCMeta, BaseEstimator, BiclusterMixin)):
    """Base class for spectral biclustering."""
    @abstractmethod
    def __init__(self,
                 n_clusters=3,
                 svd_method="randomized",
                 n_svd_vecs=None,
                 mini_batch=False,
                 init="k-means++",
                 n_init=10,
                 n_jobs=1,
                 random_state=None):
        self.n_clusters = n_clusters
        self.svd_method = svd_method
        self.n_svd_vecs = n_svd_vecs
        self.mini_batch = mini_batch
        self.init = init
        self.n_init = n_init
        self.n_jobs = n_jobs
        self.random_state = random_state

    def _check_parameters(self):
        legal_svd_methods = ('randomized', 'arpack')
        if self.svd_method not in legal_svd_methods:
            raise ValueError("Unknown SVD method: '{0}'. svd_method must be"
                             " one of {1}.".format(self.svd_method,
                                                   legal_svd_methods))

    def fit(self, X):
        """Creates a biclustering for X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)

        """
        X = check_array(X, accept_sparse='csr', dtype=np.float64)
        check_array_ndim(X)
        self._check_parameters()
        self._fit(X)

    def _svd(self, array, n_components, n_discard):
        """Returns first `n_components` left and right singular
        vectors u and v, discarding the first `n_discard`.

        """
        if self.svd_method == 'randomized':
            kwargs = {}
            if self.n_svd_vecs is not None:
                kwargs['n_oversamples'] = self.n_svd_vecs
            u, _, vt = randomized_svd(array,
                                      n_components,
                                      random_state=self.random_state,
                                      **kwargs)

        elif self.svd_method == 'arpack':
            u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs)
            if np.any(np.isnan(vt)):
                # some eigenvalues of A * A.T are negative, causing
                # sqrt() to be np.nan. This causes some vectors in vt
                # to be np.nan.
                _, v = eigsh(safe_sparse_dot(array.T, array),
                             ncv=self.n_svd_vecs)
                vt = v.T
            if np.any(np.isnan(u)):
                _, u = eigsh(safe_sparse_dot(array, array.T),
                             ncv=self.n_svd_vecs)

        assert_all_finite(u)
        assert_all_finite(vt)
        u = u[:, n_discard:]
        vt = vt[n_discard:]
        return u, vt.T

    def _k_means(self, data, n_clusters):
        if self.mini_batch:
            model = MiniBatchKMeans(n_clusters,
                                    init=self.init,
                                    n_init=self.n_init,
                                    random_state=self.random_state)
        else:
            model = KMeans(n_clusters,
                           init=self.init,
                           n_init=self.n_init,
                           n_jobs=self.n_jobs,
                           random_state=self.random_state)
        model.fit(data)
        centroid = model.cluster_centers_
        labels = model.labels_
        return centroid, labels
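The _svd() helper above dispatches between two back-ends. The sketch below shows both calls on a small dense matrix; n_components and n_discard are illustrative values, not ones the class would necessarily choose.

import numpy as np
from scipy.sparse.linalg import svds
from sklearn.utils.extmath import randomized_svd

A = np.random.RandomState(0).normal(size=(20, 15))
n_components, n_discard = 4, 1

# 'randomized': fast approximate truncated SVD
u, _, vt = randomized_svd(A, n_components, random_state=0)

# 'arpack': iterative solver; singular values come back in ascending order
u2, _, vt2 = svds(A, k=n_components)

# In both branches the first n_discard vectors are dropped before k-means.
u, vt = u[:, n_discard:], vt[n_discard:]
print(u.shape, vt.T.shape)  # (20, 3) (15, 3)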
Example No. 28
class IRAPSCore(six.with_metaclass(ABCMeta, BaseEstimator)):
    """
    Base class of IRAPSClassifier
    From sklearn BaseEstimator:
        get_params()
        set_params()

    Parameters
    ----------
    n_iter : int
        number of stochastic sampling iterations

    positive_thres : float
        z_score threshold to discretize positive target values

    negative_thres : float
        z_score threshold to discretize negative target values

    verbose : int
        0 or greater; if not 0, print progress

    n_jobs : int, default=1
        The number of CPUs to use to do the computation.

    pre_dispatch : int, or string.
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:
            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs
            - An int, giving the exact number of total jobs that are
              spawned
            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    random_state : int or None
    """
    def __init__(self,
                 n_iter=1000,
                 positive_thres=-1,
                 negative_thres=0,
                 verbose=0,
                 n_jobs=1,
                 pre_dispatch='2*n_jobs',
                 random_state=None):
        """
        IRAPS turns towards general anomaly detection.
        It compares positive_thres with negative_thres
        and decides which portion is the positive target.
        e.g.:
        (positive_thres=-1, negative_thres=0)
                 => positive = Z_score of target < -1
        (positive_thres=1, negative_thres=0)
                 => positive = Z_score of target > 1

        Note: The positive target here is always the
            abnormal minority group.
        """
        self.n_iter = n_iter
        self.positive_thres = positive_thres
        self.negative_thres = negative_thres
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.pre_dispatch = pre_dispatch
        self.random_state = random_state

    def fit(self, X, y):
        """
        X: array-like (n_samples x n_features)
        y: 1-d array-like (n_samples)
        """
        X, y = check_X_y(X, y, ['csr', 'csc'], multi_output=False)

        def _stochastic_sampling(X,
                                 y,
                                 random_state=None,
                                 positive_thres=-1,
                                 negative_thres=0):
            # each iteration selects a random-sized random subset of the
            # training samples. This is somewhat different from the original
            # IRAPS method, but the effect is almost the same.
            SAMPLE_SIZE = [0.25, 0.75]
            n_samples = X.shape[0]

            if random_state is None:
                n_select = random.randint(int(n_samples * SAMPLE_SIZE[0]),
                                          int(n_samples * SAMPLE_SIZE[1]))
                index = random.sample(list(range(n_samples)), n_select)
            else:
                n_select = random.Random(random_state).randint(
                    int(n_samples * SAMPLE_SIZE[0]),
                    int(n_samples * SAMPLE_SIZE[1]))
                index = random.Random(random_state).sample(
                    list(range(n_samples)), n_select)

            X_selected, y_selected = X[index], y[index]

            # Splitting by z_scores.
            y_selected = (y_selected - y_selected.mean()) / y_selected.std()
            if positive_thres < negative_thres:
                X_selected_positive = X_selected[y_selected < positive_thres]
                X_selected_negative = X_selected[y_selected > negative_thres]
            else:
                X_selected_positive = X_selected[y_selected > positive_thres]
                X_selected_negative = X_selected[y_selected < negative_thres]

            # Require at least 5 positives (responders) in this iteration;
            # otherwise discard the draw.
            if X_selected_positive.shape[0] < 5:
                warnings.warn("Fewer than 5 positives were selected!")
                return

            # p_values
            _, p = ttest_ind(X_selected_positive,
                             X_selected_negative,
                             axis=0,
                             equal_var=False)

            # fold_change == mean change?
            # TODO: implement other normalization methods
            positive_mean = X_selected_positive.mean(axis=0)
            negative_mean = X_selected_negative.mean(axis=0)
            mean_change = positive_mean - negative_mean
            # mean_change = np.select(
            #       [positive_mean > negative_mean,
            #           positive_mean < negative_mean],
            #       [positive_mean / negative_mean,
            #           -negative_mean / positive_mean])
            # mean_change could be adjusted by power of 2
            # mean_change = 2**mean_change \
            #       if mean_change>0 else -2**abs(mean_change)

            return p, mean_change, negative_mean

        parallel = Parallel(n_jobs=self.n_jobs,
                            verbose=self.verbose,
                            pre_dispatch=self.pre_dispatch)
        if self.random_state is None:
            res = parallel(
                delayed(_stochastic_sampling)(
                    X,
                    y,
                    random_state=None,
                    positive_thres=self.positive_thres,
                    negative_thres=self.negative_thres)
                for i in range(self.n_iter))
        else:
            res = parallel(
                delayed(_stochastic_sampling)(
                    X,
                    y,
                    random_state=seed,
                    positive_thres=self.positive_thres,
                    negative_thres=self.negative_thres)
                for seed in range(self.random_state, self.random_state +
                                  self.n_iter))
        res = [_ for _ in res if _]
        if len(res) < 50:
            raise ValueError("too few (%d) valid feature lists "
                             "were generated!" % len(res))
        pvalues = np.vstack([x[0] for x in res])
        fold_changes = np.vstack([x[1] for x in res])
        base_values = np.vstack([x[2] for x in res])

        self.pvalues_ = np.asarray(pvalues)
        self.fold_changes_ = np.asarray(fold_changes)
        self.base_values_ = np.asarray(base_values)

        return self
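One iteration of the stochastic sampling performed inside fit() can be reproduced by hand as below. The data is synthetic and the subset sizes and thresholds merely mirror the defaults, so this is only an illustration of the per-draw computation.

import numpy as np
from scipy.stats import ttest_ind

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 30))
y = rng.normal(size=200)

# pick a random subset of between 25% and 75% of the samples
n_select = rng.randint(50, 150)
idx = rng.choice(200, n_select, replace=False)
Xs, ys = X[idx], y[idx]

# discretize by z-score with positive_thres=-1 < negative_thres=0
z = (ys - ys.mean()) / ys.std()
pos, neg = Xs[z < -1], Xs[z > 0]

_, p = ttest_ind(pos, neg, axis=0, equal_var=False)  # per-feature p-values
mean_change = pos.mean(axis=0) - neg.mean(axis=0)    # per-feature mean difference
print(p.shape, mean_change.shape)                    # (30,) (30,)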
Example No. 29
class BaseBagging(with_metaclass(ABCMeta, BaseEnsemble)):
    """Base class for Bagging meta-estimator.

    Warning: This class should not be used directly. Use derived classes
    instead.
    """
    @abstractmethod
    def __init__(self,
                 base_estimator=None,
                 n_estimators=10,
                 max_samples=1.0,
                 max_features=1.0,
                 bootstrap=True,
                 bootstrap_features=False,
                 oob_score=False,
                 warm_start=False,
                 n_jobs=1,
                 random_state=None,
                 verbose=0):
        super(BaseBagging, self).__init__(base_estimator=base_estimator,
                                          n_estimators=n_estimators)

        self.max_samples = max_samples
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.bootstrap_features = bootstrap_features
        self.oob_score = oob_score
        self.warm_start = warm_start
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose

    def fit(self, X, y):
        """Build a Bagging ensemble of estimators from the training
           set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.

        y : array-like, shape = [n_samples]
            The target values (class labels in classification, real numbers in
            regression).


        Returns
        -------
        self : object
            Returns self.
        """
        random_state = check_random_state(self.random_state)

        # Convert data
        X, y = check_X_y(X, y, ['csr', 'csc'])

        # Remap output
        n_samples, self.n_features_ = X.shape
        y = self._validate_y(y)

        # Check parameters
        self._validate_estimator()

        if isinstance(self.max_samples, (numbers.Integral, np.integer)):
            max_samples = self.max_samples
        else:  # float
            max_samples = int(self.max_samples * X.shape[0])

        if not (0 < max_samples <= X.shape[0]):
            raise ValueError("max_samples must be in (0, n_samples]")

        if isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            max_features = int(self.max_features * self.n_features_)

        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")

        if not self.bootstrap and self.oob_score:
            raise ValueError("Out of bag estimation only available"
                             " if bootstrap=True")

        if self.warm_start and self.oob_score:
            raise ValueError("Out of bag estimate only available"
                             " if warm_start=False")

        if hasattr(self, "oob_score_") and self.warm_start:
            del self.oob_score_

        if not self.warm_start or len(self.estimators_) == 0:
            # Free allocated memory, if any
            self.estimators_ = []
            self.estimators_samples_ = []
            self.estimators_features_ = []

        n_more_estimators = self.n_estimators - len(self.estimators_)

        if n_more_estimators < 0:
            raise ValueError('n_estimators=%d must be larger or equal to '
                             'len(estimators_)=%d when warm_start==True' %
                             (self.n_estimators, len(self.estimators_)))

        elif n_more_estimators == 0:
            warn("Warm-start fitting without increasing n_estimators does not "
                 "fit new trees.")
            return self

        # Parallel loop
        n_jobs, n_estimators, starts = _partition_estimators(
            n_more_estimators, self.n_jobs)

        # Advance random state to state after training
        # the first n_estimators
        if self.warm_start and len(self.estimators_) > 0:
            random_state.randint(MAX_INT, size=len(self.estimators_))

        seeds = random_state.randint(MAX_INT, size=n_more_estimators)

        all_results = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            # TEF: changed following call to balanced procedure:
            delayed(_parallel_build_balanced_estimators)(
                n_estimators[i],
                self,
                X,
                y,
                seeds[starts[i]:starts[i + 1]],
                verbose=self.verbose) for i in range(n_jobs))

        # Reduce
        self.estimators_ += list(
            itertools.chain.from_iterable(t[0] for t in all_results))
        self.estimators_samples_ += list(
            itertools.chain.from_iterable(t[1] for t in all_results))
        self.estimators_features_ += list(
            itertools.chain.from_iterable(t[2] for t in all_results))

        if self.oob_score:
            self._set_oob_score(X, y)

        return self

    @abstractmethod
    def _set_oob_score(self, X, y):
        """Calculate out of bag predictions and score."""

    def _validate_y(self, y):
        # Default implementation
        return column_or_1d(y, warn=True)
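For reference, the int-or-float resolution that fit() applies to max_samples and max_features can be written as a small standalone helper; the names below are local stand-ins, not estimator API.

import numbers
import numpy as np

def resolve(value, total):
    """An int is taken as-is; a float is read as a fraction of `total`."""
    if isinstance(value, (numbers.Integral, np.integer)):
        resolved = value
    else:
        resolved = int(value * total)
    if not 0 < resolved <= total:
        raise ValueError("value must resolve into (0, total]")
    return resolved

print(resolve(0.5, 100), resolve(10, 100))  # 50 10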
Example No. 30
class BaseDecisionTree(six.with_metaclass(ABCMeta, BaseEstimator,
                                          _LearntSelectorMixin)):
    """Base class for decision trees.

    Warning: This class should not be used directly.
    Use derived classes instead.
    """

    @abstractmethod
    def __init__(self,
                 criterion,
                 splitter,
                 max_depth,
                 min_samples_split,
                 min_samples_leaf,
                 min_weight_fraction_leaf,
                 max_features,
                 max_leaf_nodes,
                 random_state,
                 output_transformer):
        self.criterion = criterion
        self.splitter = splitter
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.random_state = random_state
        self.max_leaf_nodes = max_leaf_nodes
        self.output_transformer = output_transformer

        self.n_features_ = None
        self.n_outputs_ = None
        self.classes_ = None
        self.n_classes_ = None

        self.tree_ = None
        self.max_features_ = None
        self.output_transformer_ = None

    def fit(self, X, y, sample_weight=None, check_input=True):
        """Build a decision tree from the training set (X, y).

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. Use ``dtype=np.float32`` for maximum
            efficiency.

        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (class labels in classification, real numbers in
            regression). In the regression case, use ``dtype=np.float64`` and
            ``order='C'`` for maximum efficiency.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node. In the case of
            classification, splits are also ignored if they would result in any
            single class carrying a negative weight in either child node.

        check_input : boolean, (default=True)
            Allow to bypass several input checking.
            Don't use this parameter unless you know what you do.

        Returns
        -------
        self : object
            Returns self.
        """
        random_state = check_random_state(self.random_state)

        # Convert data
        if check_input:
            X = check_array(X, dtype=DTYPE)

        # Determine output settings
        n_samples, self.n_features_ = X.shape
        is_classification = isinstance(self, ClassifierMixin)

        y = np.atleast_1d(y)

        if y.ndim == 1:
            # np.reshape is used instead of [:, np.newaxis] because it
            # preserves data contiguity.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        if is_classification:
            y = np.copy(y)

            self.classes_ = []
            self.n_classes_ = []

            for k in xrange(self.n_outputs_):
                classes_k, y[:, k] = np.unique(y[:, k], return_inverse=True)
                self.classes_.append(classes_k)
                self.n_classes_.append(classes_k.shape[0])

        else:
            self.classes_ = [None] * self.n_outputs_
            self.n_classes_ = [1] * self.n_outputs_

        self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        # Check parameters
        max_depth = ((2 ** 31) - 1 if self.max_depth is None
                     else self.max_depth)
        max_leaf_nodes = (-1 if self.max_leaf_nodes is None
                          else self.max_leaf_nodes)

        if isinstance(self.max_features, six.string_types):
            if self.max_features == "auto":
                if is_classification:
                    max_features = max(1, int(np.sqrt(self.n_features_)))
                else:
                    max_features = self.n_features_
            elif self.max_features == "sqrt":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            else:
                raise ValueError(
                    'Invalid value for max_features. Allowed string '
                    'values are "auto", "sqrt" or "log2".')
        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            if self.max_features > 0.0:
                max_features = max(1, int(self.max_features * self.n_features_))
            else:
                max_features = 0

        self.max_features_ = max_features

        if len(y) != n_samples:
            raise ValueError("Number of labels=%d does not match "
                             "number of samples=%d" % (len(y), n_samples))
        if self.min_samples_split <= 0:
            raise ValueError("min_samples_split must be greater than zero.")
        if self.min_samples_leaf <= 0:
            raise ValueError("min_samples_leaf must be greater than zero.")
        if not 0 <= self.min_weight_fraction_leaf <= 0.5:
            raise ValueError("min_weight_fraction_leaf must in [0, 0.5]")
        if max_depth <= 0:
            raise ValueError("max_depth must be greater than zero. ")
        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")
        if not isinstance(max_leaf_nodes, (numbers.Integral, np.integer)):
            raise ValueError("max_leaf_nodes must be integral number but was "
                             "%r" % max_leaf_nodes)
        if -1 < max_leaf_nodes < 2:
            raise ValueError(("max_leaf_nodes {0} must be either smaller than "
                              "0 or larger than 1").format(max_leaf_nodes))

        if sample_weight is not None:
            if (getattr(sample_weight, "dtype", None) != DOUBLE or
                    not sample_weight.flags.contiguous):
                sample_weight = np.ascontiguousarray(
                    sample_weight, dtype=DOUBLE)
            if len(sample_weight.shape) > 1:
                raise ValueError("Sample weights array has more "
                                 "than one dimension: %d" %
                                 len(sample_weight.shape))
            if len(sample_weight) != n_samples:
                raise ValueError("Number of weights=%d does not match "
                                 "number of samples=%d" %
                                 (len(sample_weight), n_samples))

        # Set min_weight_leaf from min_weight_fraction_leaf
        if self.min_weight_fraction_leaf != 0. and sample_weight is not None:
            min_weight_leaf = (self.min_weight_fraction_leaf *
                               np.sum(sample_weight))
        else:
            min_weight_leaf = 0.

        # Set min_samples_split sensibly
        min_samples_split = max(self.min_samples_split,
                                2 * self.min_samples_leaf)

        # Build tree
        criterion = self.criterion
        if not isinstance(criterion, Criterion):
            if is_classification:
                criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
                                                         self.n_classes_)
            else:
                criterion = CRITERIA_REG[self.criterion](self.n_outputs_)

        splitter = self.splitter
        if not isinstance(self.splitter, Splitter):
            splitter = SPLITTERS[self.splitter](criterion,
                                                self.max_features_,
                                                self.min_samples_leaf,
                                                min_weight_leaf,
                                                random_state)

        #### Added to support transforming the output space
        if self.output_transformer is not None:
            self.output_transformer_ = clone(self.output_transformer)

            # Set a random_state on the transformer, if it is not already set
            try:
                self.output_transformer_.set_params(
                    random_state=check_random_state(self.random_state))
            except ValueError:
                # The subclass might not expose a random_state parameter
                # even though the superclass does
                pass

            y_transf = self.output_transformer_.fit_transform(y)
            if y_transf.ndim == 1:
                # np.reshape is used instead of [:, np.newaxis] because it
                # preserves data contiguity.
                y_transf = y_transf.reshape((-1, 1))

            if (y_transf.dtype != DOUBLE or not y_transf.flags.contiguous):
                y_transf = np.ascontiguousarray(y_transf, dtype=DOUBLE)

            base_splitter = splitter

            splitter = SplitterTransformer(criterion,
                                           self.max_features_,
                                           self.min_samples_leaf,
                                           min_weight_leaf,
                                           random_state)
            splitter.set_output_space(base_splitter, y_transf)

        #### ---

        self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_)

        # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
        if max_leaf_nodes < 0:
            builder = DepthFirstTreeBuilder(splitter, min_samples_split,
                                            self.min_samples_leaf,
                                            min_weight_leaf,
                                            max_depth)
        else:
            builder = BestFirstTreeBuilder(splitter, min_samples_split,
                                           self.min_samples_leaf,
                                           min_weight_leaf,
                                           max_depth,
                                           max_leaf_nodes)

        builder.build(self.tree_, X, y, sample_weight)

        if self.n_outputs_ == 1:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        return self

    def predict(self, X):
        """Predict class or regression value for X.

        For a classification model, the predicted class for each sample in X is
        returned. For a regression model, the predicted value based on X is
        returned.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------
        y : array of shape = [n_samples] or [n_samples, n_outputs]
            The predicted classes, or the predict values.
        """
        if getattr(X, "dtype", None) != DTYPE or X.ndim != 2:
            X = check_array(X, dtype=DTYPE)

        n_samples, n_features = X.shape

        if self.tree_ is None:
            raise Exception("Tree not initialized. Perform a fit first")

        if self.n_features_ != n_features:
            raise ValueError("Number of features of the model must "
                             " match the input. Model n_features is %s and "
                             " input n_features is %s "
                             % (self.n_features_, n_features))

        proba = self.tree_.predict(X)

        # Classification
        if isinstance(self, ClassifierMixin):
            if self.n_outputs_ == 1:
                return self.classes_.take(np.argmax(proba, axis=1), axis=0)

            else:
                predictions = np.zeros((n_samples, self.n_outputs_))

                for k in xrange(self.n_outputs_):
                    predictions[:, k] = self.classes_[k].take(
                        np.argmax(proba[:, k], axis=1),
                        axis=0)

                return predictions

        # Regression
        else:
            if self.n_outputs_ == 1:
                return proba[:, 0]

            else:
                return proba[:, :, 0]

    @property
    def feature_importances_(self):
        """Return the feature importances.

        The importance of a feature is computed as the (normalized) total
        reduction of the criterion brought by that feature.
        It is also known as the Gini importance.

        Returns
        -------
        feature_importances_ : array, shape = [n_features]
        """
        if self.tree_ is None:
            raise ValueError("Estimator not fitted, "
                             "call `fit` before `feature_importances_`.")

        return self.tree_.compute_feature_importances()
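The max_features resolution performed in fit() above is easy to misread; the sketch below reproduces the same branching as a standalone helper with illustrative inputs.

import numpy as np

def resolve_max_features(max_features, n_features, is_classification):
    if isinstance(max_features, str):
        if max_features == "auto":
            return max(1, int(np.sqrt(n_features))) if is_classification else n_features
        if max_features == "sqrt":
            return max(1, int(np.sqrt(n_features)))
        if max_features == "log2":
            return max(1, int(np.log2(n_features)))
        raise ValueError('Allowed string values are "auto", "sqrt" or "log2".')
    if max_features is None:
        return n_features
    if isinstance(max_features, (int, np.integer)):
        return max_features
    return max(1, int(max_features * n_features)) if max_features > 0.0 else 0

print(resolve_max_features("auto", 64, True),   # 8  (sqrt for classification)
      resolve_max_features("log2", 64, False),  # 6
      resolve_max_features(0.25, 64, True))     # 16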