Beispiel #1
0
    def transform(self, X, y=None):
        """
            Compute the matrix profile of every series in a dataset.

            Parameters
            ----------
                X: pandas.DataFrame
                   Time series dataset; checked to be univariate.
                y: ignored
                   Not used by this method.

            Returns
            -------
                Xt: pandas.DataFrame
                    One row per input instance; each row holds the output
                    of ``stomp_self`` for that instance with window
                    length ``self.m``.
        """
        # Validate estimator state and input before doing any work
        self.check_is_fitted()
        check_X(X, enforce_univariate=True)

        # The row count is read from the input before it is tabularised
        n_rows = X.shape[0]

        # Flatten the nested frame into one row per series
        X = Tabularizer().fit_transform(X)

        # Each row is wrapped in a 2d array before being passed on
        profiles = (
            stomp_self(np.array([X.iloc[row]]), self.m)
            for row in range(n_rows)
        )
        return pd.DataFrame(profiles)
Beispiel #2
0
    def predict_proba(self, X, input_checks=True, **kwargs):
        """
        Find probability estimates for each class for all cases in X.

        The estimate is the unweighted mean of the outputs of every model
        listed in ``self.skdl_models``.

        Parameters
        ----------
        X : numpy array or pandas DataFrame
            The input samples.
            A DataFrame must have exactly one column whose cells are pandas
            Series; otherwise an exception is thrown, since this classifier
            does not yet have multivariate capability.
            A 2d array gets a trailing axis of length one appended before
            it is handed to the models.
        input_checks: boolean
            whether to check the X parameter
        **kwargs
            Forwarded unchanged to every model's ``predict`` call.

        Returns
        -------
        output : array of shape = [n_instances, n_classes] of probabilities

        Raises
        ------
        TypeError
            If X is a DataFrame with more than one column, or whose cells
            are not pandas Series.
        """
        if input_checks:
            check_X(X)

        if isinstance(X, pd.DataFrame):
            if X.shape[1] > 1 or not isinstance(X.iloc[0, 0], pd.Series):
                # Closing parenthesis added: the original message was
                # emitted with an unbalanced "(".
                raise TypeError(
                    "Input should either be a 2d numpy array, or a pandas "
                    "dataframe with a single column of Series objects "
                    "(networks cannot yet handle multivariate problems)")
            else:
                X = np.asarray([a.values for a in X.iloc[:, 0]])

        if len(X.shape) == 2:
            # add a dimension to make it multivariate with one dimension
            X = X.reshape((X.shape[0], X.shape[1], 1))

        probs = np.zeros((X.shape[0], self.nb_classes))

        for skdl_model in self.skdl_models:
            if self.keep_in_memory:
                # entries are objects carrying the fitted keras model
                keras_model = skdl_model.model
            else:
                # entries are file stems; the model is reloaded from disk
                keras_model = keras.models.load_model(
                    Path(self.model_save_directory) / (skdl_model + ".hdf5"))

            # keras models' predict is same as what we/sklearn means by
            # predict_proba, i.e. give prob distributions
            probs = probs + keras_model.predict(X, **kwargs)

            if not self.keep_in_memory:
                # release the reloaded model before loading the next one
                del keras_model
                gc.collect()
                keras.backend.clear_session()

        probs = probs / len(self.skdl_models)

        # check if binary classification
        if probs.shape[1] == 1:
            # single output column: expand to [1 - p, p] so that the
            # first column is probability of class 0 and second of class 1
            probs = np.hstack([1 - probs, probs])
        return probs
def test_check_X_enforce_min_columns():
    """check_X and check_X_y raise when fewer columns than required exist."""
    X, y = make_classification_problem(n_columns=2)
    # Same expectation for both validators, so drive them from one table
    cases = ((check_X, (X,)), (check_X_y, (X, y)))
    for validator, positional in cases:
        with pytest.raises(ValueError, match=r"columns"):
            validator(*positional, enforce_min_columns=3)
def test_check_X_enforce_univariate():
    """check_X and check_X_y raise on two-column data when univariate."""
    X, y = make_classification_problem(n_columns=2)
    # Same expectation for both validators, so drive them from one table
    cases = ((check_X, (X,)), (check_X_y, (X, y)))
    for validator, positional in cases:
        with pytest.raises(ValueError, match=r"univariate"):
            validator(*positional, enforce_univariate=True)
def test_check_enforce_min_instances():
    """check_X, check_X_y and check_y raise when instances are too few."""
    X, y = make_classification_problem(n_instances=3)
    # Same expectation for all three validators, in the original order
    cases = (
        (check_X, (X,)),
        (check_X_y, (X, y)),
        (check_y, (y,)),
    )
    for validator, positional in cases:
        with pytest.raises(ValueError, match=r"instance"):
            validator(*positional, enforce_min_instances=4)
Beispiel #6
0
    def _set_oob_score(self, X, y):
        """Compute the out-of-bag score and decision function.

        Stores the results in ``self.oob_decision_function_`` and
        ``self.oob_score_``.
        """
        check_X_y(X, y)
        check_X(X, enforce_univariate=True)

        class_counts = self.n_classes_
        n_samples = y.shape[0]

        # One accumulator of summed class probabilities per output
        votes = [
            np.zeros((n_samples, class_counts[k]))
            for k in range(self.n_outputs_)
        ]

        n_samples_bootstrap = _get_n_samples_bootstrap(n_samples,
                                                       self.max_samples)

        for pipeline in self.estimators_:
            # The random state is read from the last step of each estimator
            last_step = pipeline.steps[-1][1]
            oob_rows = _generate_unsampled_indices(
                last_step.random_state, n_samples, n_samples_bootstrap)
            proba = pipeline.predict_proba(X.iloc[oob_rows, :])

            # Normalise to a list so single- and multi-output share code
            if self.n_outputs_ == 1:
                proba = [proba]

            for k in range(self.n_outputs_):
                votes[k][oob_rows, :] += proba[k]

        decisions = []
        score_total = 0.0
        for k in range(self.n_outputs_):
            row_totals = votes[k].sum(axis=1)
            if (row_totals == 0).any():
                warn("Some inputs do not have OOB scores. "
                     "This probably means too few trees were used "
                     "to compute any reliable oob estimates.")

            decisions.append(votes[k] / row_totals[:, np.newaxis])
            hits = y[:, k] == np.argmax(votes[k], axis=1)
            score_total += np.mean(hits, axis=0)

        # A single output is exposed as a bare array rather than a list
        self.oob_decision_function_ = (
            decisions[0] if self.n_outputs_ == 1 else decisions
        )
        self.oob_score_ = score_total / self.n_outputs_
Beispiel #7
0
    def transform(self, X, y=None):
        """Locate runs of ``self.value`` in every series of X.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_samples, n_columns]
            Nested dataframe with time-series in cells; checked to be
            univariate.
        y : ignored

        Returns
        -------
        Xt : pandas DataFrame
          Two columns, ``<col>_<value>_starts`` and
          ``<col>_<value>_lengths``, holding one array per input series.
          The same arrays are also kept on ``self._starts`` and
          ``self._lengths``.
        """

        # input checks
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True, coerce_to_pandas=True)

        # name of the single input column, reused for the output names
        column_name = X.columns[0]

        self._starts = []
        self._lengths = []

        for series in X.iloc[:, 0]:
            values = np.asarray(series)

            # nan and inf targets need dedicated tests; anything else is
            # matched by equality
            if np.isnan(self.value):
                matches = np.isnan(values)
            elif np.isinf(self.value):
                matches = np.isinf(values)
            else:
                matches = values == self.value
            flags = np.where(matches, 1, 0)

            # zero-pad both ends so runs touching the edges still produce
            # a rising and a falling transition
            steps = np.diff(np.hstack([0, flags, 0]))

            run_starts = np.where(steps == 1)[0]
            run_ends = np.where(steps == -1)[0]
            run_lengths = run_ends - run_starts

            # keep only runs of at least min_length points
            long_enough = run_lengths >= self.min_length
            self._starts.append(run_starts[long_enough])
            self._lengths.append(run_lengths[long_enough])

        # put into dataframe
        value_label = "nan" if np.isnan(self.value) else str(self.value)
        column_prefix = "%s_%s" % (column_name, value_label)

        Xt = pd.DataFrame()
        Xt["%s_starts" % column_prefix] = pd.Series(self._starts)
        Xt["%s_lengths" % column_prefix] = pd.Series(self._lengths)
        return Xt
Beispiel #8
0
    def fit(self, X, y=None):
        """Calculate word breakpoints, using ``_igb`` or ``_mcb``.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The class labels. Required when ``self.igb`` is set.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If ``alphabet_size`` is outside 2..4, ``word_length`` is
            outside 1..16, or ``igb`` is set and y is None.
        """

        # Hyper-parameters are validated before the data is touched
        alphabet_too_small = self.alphabet_size < 2
        if alphabet_too_small or self.alphabet_size > 4:
            raise ValueError(
                "Alphabet size must be an integer between 2 and 4")

        word_too_short = self.word_length < 1
        if word_too_short or self.word_length > 16:
            raise ValueError("Word length must be an integer between 1 and 16")

        if self.igb and y is None:
            raise ValueError(
                "Class values must be provided for information gain binning")

        X = check_X(X, enforce_univariate=True)
        X = tabularize(X, return_array=True)

        self.n_instances, self.series_length = X.shape

        # Binning strategy depends on the igb flag
        if self.igb:
            self.breakpoints = self._igb(X, y)
        else:
            self.breakpoints = self._mcb(X)

        self._is_fitted = True
        return self
Beispiel #9
0
    def _transform_words(self, X):
        """Merge the SFA word bags of all columns and window sizes.

        Returns a list with one dict per instance of X. Keys are the words
        produced by ``MUSE.shift_left``; values are taken from the SFA
        bags.
        """
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=False, coerce_to_pandas=True)

        if self.use_first_order_differences:
            X = self.add_first_order_differences(X)

        merged_bags = [dict() for _ in range(len(X))]

        # On each dimension, perform SFA
        for dim, col in enumerate(self.col_names):
            dim_data = from_nested_to_3d_numpy(X[[col]])

            for w, window_size in enumerate(self.window_sizes[dim]):

                # SFA transform; the first element holds the bags
                bag = self.SFA_transformers[dim][w].transform(dim_data)[0]

                # Words from different windows and dimensions are merged
                # into a single bag-of-patterns; shift_left receives the
                # dimension index and window size so they stay distinct
                highest = np.int32(self.highest_bits[dim])
                for j in range(len(bag)):
                    target = merged_bags[j]
                    for key, value in bag[j].items():
                        word = MUSE.shift_left(key, highest, dim,
                                               self.highest_dim_bit,
                                               window_size)
                        target[word] = value

        return merged_bags
Beispiel #10
0
    def predict(self, X):
        """Predict regression target for X.

        The prediction is the sum of the per-estimator predictions divided
        by the number of estimators.

        Parameters
        ----------
        X : input samples
            Checked to be univariate by ``check_X`` and then passed through
            ``self._validate_X_predict``.

        Returns
        -------
        y : array
            The predicted values.
        """
        self.check_is_fitted()

        # Validate the data
        X = check_X(X, enforce_univariate=True)
        X = self._validate_X_predict(X)

        # Only the job count from the partitioning is used here
        n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

        # One delayed predict call per fitted estimator
        tasks = (
            delayed(tree.predict)(X, check_input=True)
            for tree in self.estimators_
        )
        per_tree = Parallel(n_jobs=n_jobs, verbose=self.verbose)(tasks)

        return np.sum(per_tree, axis=0) / len(self.estimators_)
Beispiel #11
0
    def transform(self, X, y=None):
        """
        Transform X by padding every series with ``self._create_pad``.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, n_columns]
            Nested dataframe with time-series in cells.
        y : ignored

        Returns
        -------
        Xt : pandas DataFrame
            One row per instance, built from the padded series.

        Raises
        ------
        ValueError
            If the longest series in X exceeds ``self.pad_length_``.
        """
        self.check_is_fitted()
        X = check_X(X)

        n_instances, _ = X.shape

        # One array of cell values (series) per row
        arr = [X.iloc[i, :].values for i in range(n_instances)]

        max_length = _get_max_length(arr)

        if max_length > self.pad_length_:
            # Adjacent literals instead of a backslash continuation inside
            # the string, which embedded the source indentation in the
            # message text.
            raise ValueError(
                "Error: max_length of series "
                "is greater than the one found when fit or set.")

        pad = [
            pd.Series([self._create_pad(series) for series in out])
            for out in arr
        ]

        return pd.DataFrame(pad)
Beispiel #12
0
    def predict(self, X):
        """Predict a label per instance by nearest neighbour on word bags.

        Each transformed test bag is compared with every bag in
        ``self.transformed_data`` using ``boss_distance``; exact ties are
        broken by a draw from the random state.

        Returns a numpy array with one entry of ``self.class_vals`` per
        instance.
        """
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True)

        rng = check_random_state(self.random_state)

        # Only the first column of the transformer output is used
        test_bags = self.transformer.transform(X).iloc[:, 0]

        predicted = []
        for test_bag in test_bags:
            best_dist = sys.float_info.max
            label = None

            for n, train_bag in enumerate(self.transformed_data):
                # best_dist is handed to boss_distance as its third argument
                dist = boss_distance(test_bag, train_bag, best_dist)

                closer = dist < best_dist
                if closer or (dist == best_dist and rng.random() < 0.5):
                    best_dist = dist
                    label = self.class_vals[n]

            predicted.append(label)

        return np.array(predicted)
Beispiel #13
0
    def predict_proba(self, X):
        """
        Find probability estimates for each class for all cases in X.

        Parameters
        ----------
        X : input samples
            Checked to be univariate and coerced to a numpy array, after
            which axis 1 is squeezed out. The resulting series length must
            equal ``self.series_length``.

        Returns
        -------
        output : nd.array of shape = (n_instances, n_classes)
            Summed per-estimator probabilities divided by
            ``self.n_estimators``.

        Raises
        ------
        TypeError
            If the series length differs from the one stored on the
            estimator.
        """
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)
        X = X.squeeze(1)

        _, test_length = X.shape
        if test_length != self.series_length:
            raise TypeError(
                " ERROR number of attributes in the train does not match "
                "that in the test data")

        # One task per estimator, each paired with its own intervals
        tasks = (
            delayed(_predict_proba_for_estimator)(X, self.estimators_[i],
                                                  self.intervals_[i])
            for i in range(self.n_estimators)
        )
        y_probas = Parallel(n_jobs=self.n_jobs)(tasks)

        # Per-class divisor: n_estimators repeated n_classes times
        divisor = np.ones(self.n_classes) * self.n_estimators
        return np.sum(y_probas, axis=0) / divisor
Beispiel #14
0
    def fit(self, X, y=None):
        """
        Fit transformer, deriving the interval indices.

        Parameters
        ----------
        X : pandas DataFrame of shape [n_samples, n_features]
            Input data; checked to be univariate.
        y : pandas Series, shape (n_samples, ...), optional
            Not used by this method.

        Returns
        -------
        self : an instance of self.

        Raises
        ------
        ValueError
            If ``self.intervals`` is neither a numpy array nor an integer.
        """
        X = check_X(X, enforce_univariate=True)

        self.input_shape_ = X.shape

        # Retrieve time-series indexes from each column.
        self._time_index = get_time_index(X)

        requested = self.intervals
        if isinstance(requested, np.ndarray):
            # explicit intervals are used as given
            self.intervals_ = requested
        elif is_int(requested):
            # an integer means: split the time index into that many parts
            self.intervals_ = np.array_split(self._time_index, requested)
        else:
            raise ValueError(
                f"Intervals must be either an integer, an array with start "
                f"and end points, but found: {self.intervals}")

        self._is_fitted = True
        return self
Beispiel #15
0
    def transform(self, X, y=None):
        """
        Apply ``_perform_paa_along_dim`` to every column of X.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, n_dims]
            Nested dataframe with multivariate time-series in cells.
        y : ignored

        Returns
        -------
        dims: Pandas data frame with first dimension in column zero,
              second in column one etc.
        """
        # Check the data
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=False)

        col_names = X.columns

        # The length of the first cell is what the parameters are
        # checked against
        num_atts = len(X.iloc[0, 0])
        self._check_parameters(num_atts)

        # Transform each dimension on its own single-column frame
        per_dim = [
            self._perform_paa_along_dim(pd.DataFrame(X[col]))
            for col in col_names
        ]

        # Stitch the dimensions back together under the original names
        result = pd.concat(per_dim, axis=1, sort=False)
        result.columns = col_names

        return result
Beispiel #16
0
    def predict_proba(self, X):
        """Predict class probabilities for X.

        The result is the sum of the per-estimator ``predict_proba``
        outputs divided by the number of estimators.

        Parameters
        ----------
        X : input samples
            Checked to be univariate by ``check_X`` and then passed through
            ``self._validate_X_predict``.

        Returns
        -------
        p : array
            The averaged class probabilities of the input samples.
        """
        # Check state and data
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True)
        X = self._validate_X_predict(X)

        # Only the job count from the partitioning is used here
        n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

        # One delayed predict_proba call per fitted estimator
        tasks = (delayed(tree.predict_proba)(X) for tree in self.estimators_)
        all_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose)(tasks)

        return np.sum(all_proba, axis=0) / len(self.estimators_)
Beispiel #17
0
    def transform(self, X, y=None):
        """Concatenate multivariate panel data into one univariate column.

        The input is flattened to a 2d array and then converted back to
        nested format.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_samples, n_features]
            Nested dataframe with time-series in cells. Any non-DataFrame
            input is handed to the 3d-numpy converter.
        y : ignored

        Returns
        -------
        Xt : pandas DataFrame
          Result of ``from_2d_array_to_nested`` on the flattened data.
        """
        self.check_is_fitted()
        X = check_X(X)

        # Choose the flattening routine that matches the container type
        if isinstance(X, pd.DataFrame):
            flatten = from_nested_to_2d_array
        else:
            flatten = from_3d_numpy_to_2d_array

        return from_2d_array_to_nested(flatten(X))
Beispiel #18
0
    def predict(self, X):
        """Predict a label per instance by highest histogram intersection.

        Each transformed test bag is compared with every bag in
        ``self.transformed_data``; exact ties are broken by a draw from the
        random state.

        Returns a numpy array with one entry of ``self.class_vals`` per
        instance.
        """
        self.check_is_fitted()

        # Only pandas input is validated and tabularised; anything else is
        # handed to the transformer unchanged
        if isinstance(X, (pd.Series, pd.DataFrame)):
            X = check_X(X, enforce_univariate=True)
            X = tabularize(X, return_array=True)

        rng = check_random_state(self.random_state)

        # The first element of the transformer output holds the bags
        test_bags = self.transformer.transform(X)[0]

        predicted = []
        for test_bag in test_bags:
            best_sim = -1
            label = None

            for n, train_bag in enumerate(self.transformed_data):
                sim = histogram_intersection(test_bag, train_bag)

                better = sim > best_sim
                if better or (sim == best_sim and rng.random() < 0.5):
                    best_sim = sim
                    label = self.class_vals[n]

            predicted.append(label)

        return np.array(predicted)
Beispiel #19
0
    def transform(self, X, y=None):
        """
        Transform univariate time series with the fitted ``self.pca``.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_samples, 1]
            Nested dataframe with univariate time-series in cells.
        y : ignored

        Returns
        -------
        Xt : pandas DataFrame
          Nested frame with the index and column names of X. The tabular
          intermediate keeps only the first ``n_components_`` column
          labels of the tabularised input.
        """
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True)

        # Flatten to tabular form and project with the fitted PCA
        Xtab = tabularize(X)
        projected = self.pca.transform(Xtab)
        kept_columns = Xtab.columns[:self.pca.n_components_]
        Xpca = pd.DataFrame(data=projected,
                            index=Xtab.index,
                            columns=kept_columns)

        # Back-transform into time series data format
        Xt = detabularise(Xpca, index=X.index)
        Xt.columns = X.columns
        return Xt
Beispiel #20
0
    def predict_proba(self, X):
        """Return class probabilities from ``self.clf`` for the bags of X.

        X is checked to be univariate, turned into word bags, vectorised
        with ``self.vectorizer`` and passed to ``self.clf.predict_proba``.
        """
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True)

        word_bags = self._transform_words(X)
        features = self.vectorizer.transform(word_bags)
        return self.clf.predict_proba(features)
    def predict_proba(self, X):
        """
        Find probability estimates for each class for all cases in X.

        Per-tree distributions are summed and then l1-normalised in place.

        Parameters
        ----------
        X : input samples
            Checked to be univariate by ``check_X``; an exception is raised
            otherwise, since this classifier does not yet have multivariate
            capability.

        Returns
        -------
        output : array of shape = [n_instances, n_classes] of probabilities
        """
        X = check_X(X, enforce_univariate=True)
        X = dataset_properties.negative_dataframe_indices(X)

        # Any n_jobs other than 0 or 1 goes through joblib
        use_joblib = self.n_jobs > 1 or self.n_jobs < 0
        if use_joblib:
            runner = Parallel(self.n_jobs)
            per_tree = runner(
                delayed(self._predict_proba_tree)(X, tree)
                for tree in self.trees)
        else:
            per_tree = [
                self._predict_proba_tree(X, tree) for tree in self.trees
            ]

        # Stack the per-tree outputs and sum over the tree axis
        distributions = np.sum(np.array(per_tree), axis=0)
        normalize(distributions, copy=False, norm='l1')
        return distributions
Beispiel #22
0
    def transform(self, X, y=None):
        """
        Fit a clone of ``self.forecaster`` per row and collect parameters.

        Parameters
        ----------
        X : pd.DataFrame
            Univariate time series to transform.
        y : pd.DataFrame, optional (default=None)
            Not used by this method.

        Returns
        -------
        y_hat : pd.DataFrame
            Extracted parameters; columns are parameter values, index is
            the index of X.
        """
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True)
        param_names = self._check_param_names(self.param_names)
        n_instances = X.shape[0]

        def _extract_from_row(model, series, names):
            # Fit on a single series and flatten the requested parameters
            model.fit(series)
            fitted = model.get_fitted_params()
            return np.hstack([fitted[name] for name in names])

        # One task per row, each with a fresh clone of the forecaster
        tasks = (
            delayed(_extract_from_row)(clone(self.forecaster), X.iloc[i, 0],
                                       param_names)
            for i in range(n_instances)
        )
        extracted_params = Parallel(n_jobs=self.n_jobs)(tasks)

        return pd.DataFrame(extracted_params,
                            index=X.index,
                            columns=param_names)
Beispiel #23
0
    def predict(self, X):
        """Predict an integer label per instance by nearest neighbour.

        Each transformed test bag is compared with every bag in
        ``self.transformed_data`` using ``boss_distance``; exact ties are
        broken by a draw from the random state.

        Returns an integer numpy array with one entry per row of X.
        """
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True)

        rng = check_random_state(self.random_state)

        # Output is preallocated as integers, one slot per instance
        predicted = np.zeros(X.shape[0], dtype=np.int_)

        # Bags arrive as Series in the first column; compare them as dicts
        transformed = self.transformer.transform(X)
        test_bags = [cell.to_dict() for cell in transformed.iloc[:, 0]]

        for row, test_bag in enumerate(test_bags):
            best_dist = sys.float_info.max
            label = -1

            for n, train_bag in enumerate(self.transformed_data):
                # best_dist is handed to boss_distance as its third argument
                dist = boss_distance(test_bag, train_bag, best_dist)

                closer = dist < best_dist
                if closer or (dist == best_dist and rng.random() < 0.5):
                    best_dist = dist
                    label = self.class_vals[n]

            predicted[row] = label

        return predicted
Beispiel #24
0
    def transform(self, X, y=None):
        """Apply ``self._resize_col`` to every column of X.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_samples, n_features]
            Nested dataframe with time-series in cells.
        y : ignored

        Returns
        -------
        pandas DataFrame : result of ``X.apply(self._resize_col)``
        """
        self.check_is_fitted()
        # Validation only; the checked return value is not used
        check_X(X)
        resized = X.apply(self._resize_col)
        return resized
Beispiel #25
0
    def _transform_words(self, X):
        """Merge the SFA word bags of all window sizes into one per instance.

        Returns a list with one dict per instance of X, keyed by words that
        carry the window size in their low bits.
        """
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)

        merged_bags = [dict() for _ in range(len(X))]
        for w, window_size in enumerate(self.window_sizes):

            # SFA transform; the first element holds the bags
            bag = self.SFA_transformers[w].transform(X)[0]

            # Words from different window sizes are merged into a single
            # bag-of-patterns; the window size is folded into each word
            for j in range(len(bag)):
                target = merged_bags[j]
                for key, value in bag[j].items():
                    if isinstance(key, tuple):
                        # tuple keys: pack both parts, then make room for
                        # the window size with a 3-bit shift
                        packed = (key[0] << self.highest_bit) | key[1]
                        word = (packed << 3) | window_size
                    else:
                        word = WEASEL.shift_left(key, self.highest_bit,
                                                 window_size)

                    target[word] = value

        return merged_bags
Beispiel #26
0
    def fit(self, X, y=None):
        """
        Fit transformer by settling on the lower bound ``lower_``.

        Parameters
        ----------
        X : pandas DataFrame of shape [n_samples, n_features]
            Input data
        y : pandas Series, shape (n_samples, ...), optional
            Not used by this method.

        Returns
        -------
        self : an instance of self.
        """
        X = check_X(X, coerce_to_pandas=True)

        if self.lower is not None:
            # an explicit bound takes precedence over the data
            self.lower_ = self.lower
        else:
            # otherwise derive it from the rows of X
            n_instances, _ = X.shape
            rows = [X.iloc[i, :].values for i in range(n_instances)]
            self.lower_ = self.get_min_length(rows)

        self._is_fitted = True
        return self
Beispiel #27
0
    def transform(self, X, y=None):
        """
        Transform univariate time series with the fitted ``self.pca``.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_samples, 1]
            Nested dataframe with univariate time-series in cells; coerced
            to a numpy array, after which axis 1 is squeezed out.
        y : ignored

        Returns
        -------
        Xt : pandas DataFrame
          Nested frame built from the PCA-transformed values.
        """
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)
        X = X.squeeze(1)

        # Project with the fitted PCA and wrap the result in a frame
        projected = self.pca.transform(X)
        Xpca = pd.DataFrame(data=projected)

        # Back-transform into time series data format
        return from_2d_array_to_nested(Xpca)
    def predict_proba(self, X):
        """
        Find probability estimates for each class for all cases in X.

        Each distance d to an exemplar is mapped to 1 / (1 + d) and the
        result is l1-normalised in place.

        Parameters
        ----------
        X : input samples
            Checked to be univariate by ``check_X``; an exception is raised
            otherwise, since this classifier does not yet have multivariate
            capability.

        Returns
        -------
        output : array of shape = [n_instances, n_classes] of probabilities
        """
        X = check_X(X, enforce_univariate=True)
        X = dataset_properties.negative_dataframe_indices(X)

        distances = self.distance_to_exemplars(X)

        # An array of ones, not a scalar, as in the elementwise add/divide
        ones = np.ones(distances.shape)
        distributions = ones / (distances + ones)

        normalize(distributions, copy=False, norm='l1')
        return distributions
Beispiel #29
0
    def fit(self, X, y=None):
        """Fit by resolving the feature-extraction parameters.

        Parameters
        ----------
        X : pd.DataFrame
            nested pandas DataFrame of shape [n_samples, n_columns]
        y : pd.Series or np.array
            Target variable; not used by this method.

        Returns
        -------
        self : an instance of self
        """
        # Validation only; the checked return value is not used
        check_X(X)

        extraction_params = self._get_extraction_params()
        self.default_fc_parameters_ = extraction_params

        self._is_fitted = True
        return self
 def find_closest_exemplar_indices(self, X):
     """
     Find the closest exemplar index for each instance in a dataframe.

     :param X: the dataframe containing instances
     :return: 1d numpy array of indices, one for each instance,
     reflecting the index of the closest exemplar as chosen by
     ``comparison.arg_min``
     """
     # todo make checks optional and propogate from forest downwards
     check_X(X)
     n_instances = X.shape[0]
     distances = self.distance_to_exemplars(X)
     closest = np.empty(X.shape[0], dtype=int)
     for row in range(n_instances):
         # the random state is handed to arg_min with the row's distances
         closest[row] = comparison.arg_min(distances[row],
                                           self.random_state)
     return closest