Example #1
 def fit(self, X, y, input_checks=True):
     """
     Build the classifier on the training set (X, y).

     Parameters
     ----------
     X : array-like or sparse matrix of shape = [n_instances, n_columns]
         The training input samples.  If a Pandas data frame is passed, column 0 is extracted.
     y : array-like, shape = [n_instances]
         The class labels.
     input_checks : boolean
         Whether to validate the X and y parameters.

     Returns
     -------
     self : object
     """
     if input_checks:
         validate_X_y(X, y)
     self.X = dataset_properties.positive_dataframe_indices(X)
     self.random_state = check_random_state(self.random_state)
     # setup label encoding
     self.label_encoder = LabelEncoder()
     self.label_encoder.fit(y)
     self.classes_ = self.label_encoder.classes_
     self.y = self.label_encoder.transform(y)
     if self.distance_measure is None:
         if self.get_distance_measure is None:
             self.get_distance_measure = self.setup_distance_measure(self)
         self.distance_measure = self.get_distance_measure(self)
     self.X_exemplar, self.y_exemplar = self.pick_exemplars(self)
     return self
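
The label-encoding block above is the stock scikit-learn pattern: fit a LabelEncoder on the raw labels, keep its classes_ for later decoding, and train on the integer-encoded targets. A minimal standalone sketch (plain scikit-learn, nothing sktime-specific assumed):

    from sklearn.preprocessing import LabelEncoder

    le = LabelEncoder()
    le.fit(["cat", "dog", "cat", "bird"])
    print(le.classes_)                    # ['bird' 'cat' 'dog'] (sorted)
    print(le.transform(["dog", "bird"]))  # [2 0]
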
Example #2
 def fit(self, X, y, input_checks=True):
     """
     Build the classifier on the training set (X, y).

     Parameters
     ----------
     X : array-like or sparse matrix of shape = [n_instances, n_columns]
         The training input samples.  If a Pandas data frame is passed, column 0 is extracted.
     y : array-like, shape = [n_instances]
         The class labels.
     input_checks : boolean
         Whether to validate the X and y parameters.

     Returns
     -------
     self : object
     """
     if input_checks:
         validate_X_y(X, y)
     self.X = dataset_properties.positive_dataframe_indices(X)
     self.random_state = check_random_state(self.random_state)
     # setup label encoding
     self.label_encoder = LabelEncoder()
     self.label_encoder.fit(y)
     self.classes_ = self.label_encoder.classes_
     self.y = self.label_encoder.transform(y)
     if self.distance_measure is None:
         if self.get_distance_measure is None:
             self.get_distance_measure = self.setup_distance_measure_getter(self)
         self.distance_measure = self.get_distance_measure(self)
     if self.n_jobs > 1 or self.n_jobs < 0:
         parallel = Parallel(self.n_jobs)
         self.trees = parallel(delayed(self._fit_tree)(X, y, index, self.random_state.randint(0, self.n_trees))
                               for index in range(self.n_trees))
     else:
         self.trees = [self._fit_tree(X, y, index, self.random_state.randint(0, self.n_trees))
                       for index in range(self.n_trees)]
     return self
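
The n_jobs branch above uses joblib's Parallel/delayed idiom: wrap the per-tree worker in delayed, generate one call per tree, and let Parallel dispatch the calls across workers. A minimal sketch with a hypothetical stand-in for _fit_tree (the real method also receives X and y):

    from joblib import Parallel, delayed

    def build_tree(index, seed):  # hypothetical stand-in for the per-tree work
        return {"index": index, "seed": seed}

    seeds = [11, 42, 7]
    trees = Parallel(n_jobs=2)(
        delayed(build_tree)(i, s) for i, s in enumerate(seeds)
    )
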
Example #3
    def fit(self, X, y):
        """Perform a shapelet transform then builds a random forest. Contract default for ST is 5 hours
        ----------
        X : array-like or sparse matrix of shape = [n_instances,series_length] or shape = [n_instances,n_columns]
            The training input samples.  If a Pandas data frame is passed it must have a single column (i.e. univariate
            classification. RISE has no bespoke method for multivariate classification as yet.
        y : array-like, shape =  [n_instances]    The class labels.

        Returns
        -------
        self : object
         """

        validate_X_y(X, y)
        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        self.classifier.fit(X, y)

        #        self.shapelet_transform.fit(X,y)
        #        print("Shapelet Search complete")
        #        self.st_X =self.shapelet_transform.transform(X)
        #        print("Transform complete")
        #        X = np.asarray([a.values for a in X.iloc[:, 0]])
        #        self.classifier.fit(X,y)
        #       print("Build classifier complete")
        return self
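
The classes_ line above leans on sklearn.utils.multiclass.class_distribution, whose first return value holds the sorted class labels for each output column; for a single output it matches np.unique(y). A quick check:

    import numpy as np
    from sklearn.utils.multiclass import class_distribution

    y = np.array(["b", "a", "b", "c"])
    classes, n_classes, class_prior = class_distribution(y.reshape(-1, 1))
    print(classes[0])    # ['a' 'b' 'c'], same as np.unique(y)
    print(n_classes[0])  # 3
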
Example #4
 def fit(self, X, y, input_checks=True):
     """
     Build the classifier on the training set (X, y).

     Parameters
     ----------
     X : array-like or sparse matrix of shape = [n_instances, n_columns]
         The training input samples.  If a Pandas data frame is passed, column 0 is extracted.
     y : array-like, shape = [n_instances]
         The class labels.
     input_checks : boolean
         Whether to validate the X and y parameters.

     Returns
     -------
     self : object
     """
     if input_checks:
         validate_X_y(X, y)
     self.X = dataset_properties.positive_dataframe_indices(X)
     self.random_state = check_random_state(self.random_state)
     if self.find_stump is None:
         self.find_stump = best_of_n_stumps(self.n_stump_evaluations)
     # setup label encoding
     if self.label_encoder is None:
         self.label_encoder = LabelEncoder()
         self.label_encoder.fit(y)
         y = self.label_encoder.transform(y)
     self.y = y
     self.classes_ = self.label_encoder.classes_
     if self.distance_measure is None:
         if self.get_distance_measure is None:
             self.get_distance_measure = self.setup_distance_measure(self)
         self.distance_measure = self.get_distance_measure(self)
     self.stump = self.find_stump(self)
     n_branches = len(self.stump.y_exemplar)
     self.branches = [None] * n_branches
     if self.depth < self.max_depth:
         for index in range(n_branches):
             sub_y = self.stump.y_branches[index]
             if not self.is_leaf(sub_y):
                 sub_tree = ProximityTree(
                     random_state=self.random_state,
                     get_exemplars=self.get_exemplars,
                     distance_measure=self.distance_measure,
                     setup_distance_measure=self.setup_distance_measure,
                     get_distance_measure=self.get_distance_measure,
                     get_gain=self.get_gain,
                     is_leaf=self.is_leaf,
                     verbosity=self.verbosity,
                     max_depth=self.max_depth,
                     n_jobs=self.n_jobs)
                 sub_tree.label_encoder = self.label_encoder
                 sub_tree.depth = self.depth + 1
                 self.branches[index] = sub_tree
                 sub_X = self.stump.X_branches[index]
                 sub_tree.fit(sub_X, sub_y)
     return self
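
Several of these fit methods funnel self.random_state through sklearn.utils.check_random_state, which accepts None, an int seed, or an existing RandomState and always returns a RandomState instance, so downstream code can call methods like randint unconditionally:

    from sklearn.utils import check_random_state

    rs = check_random_state(42)    # int seed -> seeded RandomState
    same = check_random_state(rs)  # an existing RandomState passes through
    print(rs is same)              # True
    print(rs.randint(0, 10))       # reproducible draw
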
Example #5
    def _set_oob_score(self, X, y):
        """Compute out-of-bag score"""
        validate_X_y(X, y)
        check_X_is_univariate(X)

        n_classes_ = self.n_classes_
        n_samples = y.shape[0]

        oob_decision_function = []
        oob_score = 0.0
        predictions = [
            np.zeros((n_samples, n_classes_[k]))
            for k in range(self.n_outputs_)
        ]

        n_samples_bootstrap = _get_n_samples_bootstrap(n_samples,
                                                       self.max_samples)

        for estimator in self.estimators_:
            unsampled_indices = _generate_unsampled_indices(
                estimator.random_state, n_samples, n_samples_bootstrap)
            p_estimator = estimator.predict_proba(X.iloc[unsampled_indices, :])

            if self.n_outputs_ == 1:
                p_estimator = [p_estimator]

            for k in range(self.n_outputs_):
                predictions[k][unsampled_indices, :] += p_estimator[k]

        for k in range(self.n_outputs_):
            if (predictions[k].sum(axis=1) == 0).any():
                warn("Some inputs do not have OOB scores. "
                     "This probably means too few trees were used "
                     "to compute any reliable oob estimates.")

            decision = (predictions[k] /
                        predictions[k].sum(axis=1)[:, np.newaxis])
            oob_decision_function.append(decision)
            oob_score += np.mean(y[:, k] == np.argmax(predictions[k], axis=1),
                                 axis=0)

        if self.n_outputs_ == 1:
            self.oob_decision_function_ = oob_decision_function[0]
        else:
            self.oob_decision_function_ = oob_decision_function

        self.oob_score_ = oob_score / self.n_outputs_
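
The decision function above is a row-wise normalisation of accumulated per-class probability votes; samples that were never out-of-bag have all-zero rows, which is exactly what the warning flags (and what would otherwise divide by zero). A small numpy sketch, guarded here for the zero rows:

    import numpy as np

    votes = np.array([[0.9, 0.1],   # sample with OOB votes
                      [0.0, 0.0],   # sample never left out of the bootstrap
                      [0.4, 1.6]])
    row_sums = votes.sum(axis=1)[:, np.newaxis]
    decision = np.divide(votes, row_sums,
                         out=np.zeros_like(votes), where=row_sums != 0)
    print(decision)
    print(np.argmax(votes, axis=1))  # per-sample predicted class index
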
Example #6
    def fit(self, X, y, sample_weight=None):
        """Build a forest of trees from the training set (X, y).

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The training input samples. Internally, its dtype will be converted
            to ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csc_matrix``.
        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (class labels in classification, real numbers in
            regression).
        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node. In the case of
            classification, splits are also ignored if they would result in any
            single class carrying a negative weight in either child node.

        Returns
        -------
        self : object
        """
        validate_X_y(X, y)

        # Validate or convert input data
        if sample_weight is not None:
            sample_weight = check_array(sample_weight, ensure_2d=False)
        if issparse(X):
            # Pre-sort indices to avoid having each individual tree of the
            # ensemble sort them.
            X.sort_indices()

        # Remap output
        self.n_features_ = X.shape[1] if X.ndim == 2 else 1

        y = np.atleast_1d(y)
        if y.ndim == 2 and y.shape[1] == 1:
            warn(
                "A column-vector y was passed when a 1d array was"
                " expected. Please change the shape of y to "
                "(n_samples,), for example using ravel().",
                DataConversionWarning,
                stacklevel=2)

        if y.ndim == 1:
            # np.reshape preserves data contiguity, which [:, np.newaxis]
            # does not guarantee.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        y, expanded_class_weight = self._validate_y_class_weight(y)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        if expanded_class_weight is not None:
            if sample_weight is not None:
                sample_weight = sample_weight * expanded_class_weight
            else:
                sample_weight = expanded_class_weight

        # Check parameters
        self._validate_estimator()

        if not self.bootstrap and self.oob_score:
            raise ValueError("Out of bag estimation only available"
                             " if bootstrap=True")

        random_state = check_random_state(self.random_state)

        if not self.warm_start or not hasattr(self, "estimators_"):
            # Free allocated memory, if any
            self.estimators_ = []

        n_more_estimators = self.n_estimators - len(self.estimators_)

        if n_more_estimators < 0:
            raise ValueError('n_estimators=%d must be larger than or equal to '
                             'len(estimators_)=%d when warm_start==True' %
                             (self.n_estimators, len(self.estimators_)))

        elif n_more_estimators == 0:
            warn("Warm-start fitting without increasing n_estimators does not "
                 "fit new trees.")
        else:
            if self.warm_start and len(self.estimators_) > 0:
                # We draw from the random state to get the random state we
                # would have got if we hadn't used a warm_start.
                random_state.randint(MAX_INT, size=len(self.estimators_))

            trees = [
                self._make_estimator(append=False, random_state=random_state)
                for i in range(n_more_estimators)
            ]

            # Parallel loop: for standard random forests, the threading
            # backend is preferred as the Cython code for fitting the trees
            # internally releases the Python GIL, making threading more
            # efficient than multiprocessing. Here, however, where whole
            # pipelines are fitted in parallel, multiprocessing is the more
            # efficient choice.
            trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
                delayed(_parallel_build_trees)(t,
                                               self,
                                               X,
                                               y,
                                               sample_weight,
                                               i,
                                               len(trees),
                                               verbose=self.verbose,
                                               class_weight=self.class_weight)
                for i, t in enumerate(trees))

            # Collect newly grown trees
            self.estimators_.extend(trees)

        if self.oob_score:
            self._set_oob_score(X, y)

        return self
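
The warm-start bookkeeping above mirrors scikit-learn's forests: with warm_start=True, a later fit call keeps the existing estimators_ and only grows the difference up to the new n_estimators. A minimal illustration with a plain RandomForestClassifier (synthetic data, illustrative parameters):

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X, y = make_classification(n_samples=100, random_state=0)
    clf = RandomForestClassifier(n_estimators=10, warm_start=True, random_state=0)
    clf.fit(X, y)
    print(len(clf.estimators_))  # 10
    clf.n_estimators = 25        # next fit adds only the 15 new trees
    clf.fit(X, y)
    print(len(clf.estimators_))  # 25
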