Example 1
    def _fit(self, X, y=None):
        """
        Fit transformer, generating random interval indices.

        Parameters
        ----------
        X : pandas DataFrame of shape [n_samples, n_features]
            Input data
        y : pandas Series, shape (n_samples, ...), optional
            Targets for supervised learning.

        Returns
        -------
        self : RandomIntervalFeatureExtractor
            This estimator
        """
        # We use composition rather than inheritance here, because this transformer
        # has a different transform type (returns tabular) compared to the
        # RandomIntervalSegmenter (returns panel).
        self._interval_segmenter = RandomIntervalSegmenter(
            self.n_intervals, self.min_length, self.max_length, self.random_state
        )
        self._interval_segmenter.fit(X, y)
        self.intervals_ = self._interval_segmenter.intervals_
        self.input_shape_ = self._interval_segmenter.input_shape_
        self._time_index = self._interval_segmenter._time_index
        return self
Example 2
def test_different_implementations():
    random_state = 1233
    X_train, y_train = make_classification_problem()

    # Compare with chained transformations.
    tran1 = RandomIntervalSegmenter(n_intervals=1, random_state=random_state)
    tran2 = SeriesToPrimitivesRowTransformer(
        FunctionTransformer(func=np.mean, validate=False),
        check_transformer=False,
    )
    A = tran2.fit_transform(tran1.fit_transform(X_train))

    tran = RandomIntervalFeatureExtractor(n_intervals=1,
                                          features=[np.mean],
                                          random_state=random_state)
    B = tran.fit_transform(X_train)

    np.testing.assert_array_almost_equal(A, B)
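The two pipelines agree only because both segmenters draw their interval boundaries from identically seeded generators. A minimal sketch of that seeding contract, assuming the transformers resolve random_state via scikit-learn's check_random_state:

from sklearn.utils import check_random_state

rng_a = check_random_state(1233)
rng_b = check_random_state(1233)
# Same integer seed, same draws, hence identical interval boundaries above.
assert rng_a.randint(0, 100) == rng_b.randint(0, 100)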
Example 3
def tsf_benchmarking():
    for i, dataset in enumerate(benchmark_datasets):
        print(f"{i} problem = {dataset}")
        tsf = ib.TimeSeriesForest(n_estimators=100)
        exp.run_experiment(
            overwrite=False,
            problem_path=data_dir,
            results_path=results_dir,
            cls_name="PythonTSF",
            classifier=tsf,
            dataset=dataset,
            train_file=False,
        )
        steps = [
            ("segment", RandomIntervalSegmenter(n_intervals="sqrt")),
            (
                "transform",
                FeatureUnion(
                    [
                        (
                            "mean",
                            make_row_transformer(
                                FunctionTransformer(func=np.mean, validate=False)
                            ),
                        ),
                        (
                            "std",
                            make_row_transformer(
                                FunctionTransformer(func=np.std, validate=False)
                            ),
                        ),
                        (
                            "slope",
                            make_row_transformer(
                                FunctionTransformer(func=_slope, validate=False)
                            ),
                        ),
                    ]
                ),
            ),
            ("clf", DecisionTreeClassifier()),
        ]
        base_estimator = Pipeline(steps)
        tsf = TimeSeriesForestClassifier(estimator=base_estimator, n_estimators=100)
        exp.run_experiment(
            overwrite=False,
            problem_path=data_dir,
            results_path=results_dir,
            cls_name="PythonTSFComposite",
            classifier=tsf,
            dataset=dataset,
            train_file=False,
        )
Example 4
def test_output_format_dim(n_timepoints, n_instances, n_intervals):
    X = _make_nested_from_array(
        np.ones(n_timepoints), n_instances=n_instances, n_columns=1
    )

    trans = RandomIntervalSegmenter(n_intervals=n_intervals)
    Xt = trans.fit_transform(X)

    # Check number of rows and output type.
    assert isinstance(Xt, pd.DataFrame)
    assert Xt.shape[0] == X.shape[0]

    # Check number of generated intervals/columns.
    if n_intervals != "random":
        if n_intervals == "sqrt":
            assert Xt.shape[1] == np.maximum(1, int(np.sqrt(n_timepoints)))
        elif n_intervals == "log":
            assert Xt.shape[1] == np.maximum(1, int(np.log(n_timepoints)))
        elif np.issubdtype(type(n_intervals), np.floating):
            assert Xt.shape[1] == np.maximum(1, int(n_timepoints * n_intervals))
        elif np.issubdtype(type(n_intervals), np.integer):
            assert Xt.shape[1] == n_intervals
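For reference, the column-count rules the test asserts can be collected into one small helper. This is a sketch mirroring the assertions above, not any library internals; "random" is omitted because the test skips it:

import numpy as np

def expected_n_columns(n_timepoints, n_intervals):
    # Mirrors the branch structure of test_output_format_dim.
    if n_intervals == "sqrt":
        return max(1, int(np.sqrt(n_timepoints)))
    if n_intervals == "log":
        return max(1, int(np.log(n_timepoints)))
    if isinstance(n_intervals, float):
        return max(1, int(n_timepoints * n_intervals))
    if isinstance(n_intervals, int):
        return n_intervals
    raise ValueError(f"unexpected n_intervals: {n_intervals}")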
Example 5
def rise_benchmarking():
    for i, dataset in enumerate(benchmark_datasets):
        print(f"{i} problem = {dataset}")
        rise = fb.RandomIntervalSpectralForest(n_estimators=100)
        exp.run_experiment(
            overwrite=True,
            problem_path=data_dir,
            results_path=results_dir,
            cls_name="PythonRISE",
            classifier=rise,
            dataset=dataset,
            train_file=False,
        )
        steps = [
            ("segment", RandomIntervalSegmenter(n_intervals=1, min_length=5)),
            (
                "transform",
                FeatureUnion(
                    [
                        (
                            "acf",
                            make_row_transformer(
                                FunctionTransformer(func=acf_coefs, validate=False)
                            ),
                        ),
                        (
                            "ps",
                            make_row_transformer(
                                FunctionTransformer(func=powerspectrum, validate=False)
                            ),
                        ),
                    ]
                ),
            ),
            ("tabularise", Tabularizer()),
            ("clf", DecisionTreeClassifier()),
        ]
        base_estimator = Pipeline(steps)
        rise = TimeSeriesForestClassifier(estimator=base_estimator, n_estimators=100)
        exp.run_experiment(
            overwrite=True,
            problem_path=data_dir,
            results_path=results_dir,
            cls_name="PythonRISEComposite",
            classifier=rise,
            dataset=dataset,
            train_file=False,
        )
Example 6
def test_different_pipelines():
    """Compare with transformer pipeline using TSFeatureUnion."""
    random_state = 1233
    X_train, y_train = make_classification_problem()
    steps = [
        (
            "segment",
            RandomIntervalSegmenter(n_intervals=1, random_state=random_state),
        ),
        (
            "transform",
            FeatureUnion([
                (
                    "mean",
                    SeriesToPrimitivesRowTransformer(
                        FunctionTransformer(func=np.mean, validate=False),
                        check_transformer=False,
                    ),
                ),
                (
                    "std",
                    SeriesToPrimitivesRowTransformer(
                        FunctionTransformer(func=np.std, validate=False),
                        check_transformer=False,
                    ),
                ),
                (
                    "slope",
                    SeriesToPrimitivesRowTransformer(
                        FunctionTransformer(func=_slope, validate=False),
                        check_transformer=False,
                    ),
                ),
            ]),
        ),
    ]
    pipe = Pipeline(steps)
    a = pipe.fit_transform(X_train)
    tran = RandomIntervalFeatureExtractor(
        n_intervals=1,
        features=[np.mean, np.std, _slope],
        random_state=random_state,
    )
    b = tran.fit_transform(X_train)
    np.testing.assert_array_equal(a, b)
    np.testing.assert_array_equal(pipe.steps[0][1].intervals_, tran.intervals_)
Example 7
def test_FeatureUnion_pipeline():
    # pipeline with segmentation plus multiple feature extraction

    steps = [
        ("segment", RandomIntervalSegmenter(n_intervals=1)),
        (
            "transform",
            FeatureUnion([("mean", mean_transformer),
                          ("std", std_transformer)]),
        ),
        ("clf", DecisionTreeClassifier()),
    ]
    clf = Pipeline(steps)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    assert y_pred.shape[0] == y_test.shape[0]
    np.testing.assert_array_equal(np.unique(y_pred), np.unique(y_test))
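mean_transformer and std_transformer are module-level fixtures elided from this excerpt; a plausible definition, mirroring the transformers spelled out in test_different_pipelines above:

mean_transformer = SeriesToPrimitivesRowTransformer(
    FunctionTransformer(func=np.mean, validate=False), check_transformer=False
)
std_transformer = SeriesToPrimitivesRowTransformer(
    FunctionTransformer(func=np.std, validate=False), check_transformer=False
)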
Example 8
def test_equivalent_model_specifications(n_intervals, n_estimators):
    """Test composable TSF vs an equivalent model."""
    random_state = 1234
    X_train, y_train = load_unit_test(split="train")
    X_test, y_test = load_unit_test(split="test")

    # Due to tie-breaking/floating point rounding in the final decision tree
    # classifier, the results depend on the exact column order of the input
    # data.

    # Compare pipeline predictions outside of ensemble.
    steps = [
        (
            "segment",
            RandomIntervalSegmenter(n_intervals=n_intervals,
                                    random_state=random_state),
        ),
        (
            "transform",
            FeatureUnion([("mean", mean_transformer),
                          ("std", std_transformer)]),
        ),
        ("clf", DecisionTreeClassifier(random_state=random_state)),
    ]
    clf1 = Pipeline(steps)
    clf1.fit(X_train, y_train)
    a = clf1.predict(X_test)

    steps = [
        (
            "transform",
            RandomIntervalFeatureExtractor(
                n_intervals=n_intervals,
                features=[np.mean, np.std],
                random_state=random_state,
            ),
        ),
        ("clf", DecisionTreeClassifier(random_state=random_state)),
    ]
    clf2 = Pipeline(steps)
    clf2.fit(X_train, y_train)
    b = clf2.predict(X_test)
    np.testing.assert_array_equal(a, b)
Example 9
def set_classifier(cls, resampleId):
    """
    Basic way of determining the classifier to build. To differentiate settings just and another elif. So, for example, if
    you wanted tuned TSF, you just pass TuneTSF and set up the tuning mechanism in the elif.
    This may well get superceded, it is just how e have always done it
    :param cls: String indicating which classifier you want
    :return: A classifier.

    """
    if cls.lower() == "pf":
        return pf.ProximityForest(random_state=resampleId)
    elif cls.lower() == "pt":
        return pf.ProximityTree(random_state=resampleId)
    elif cls.lower() == "ps":
        return pf.ProximityStump(random_state=resampleId)
    elif cls.lower() == "rise":
        return fb.RandomIntervalSpectralForest(random_state=resampleId)
    elif cls.lower() == "tsf":
        return ib.TimeSeriesForest(random_state=resampleId)
    elif cls.lower() == "boss":
        return db.BOSSEnsemble()
    elif cls.lower() == "st":
        return st.ShapeletTransformClassifier(time_contract_in_mins=1500)
    elif cls.lower() == "dtw":
        return nn.KNeighborsTimeSeriesClassifier(metric="dtw")
    elif cls.lower() == "ee" or cls.lower() == "elasticensemble":
        return dist.ElasticEnsemble()
    elif cls.lower() == "shapedtw_raw":
        return ShapeDTW(subsequence_length=30,
                        shape_descriptor_function="raw",
                        metric_params=None)
    elif cls.lower() == "shapedtw_dwt":
        return ShapeDTW(
            subsequence_length=30,
            shape_descriptor_function="dwt",
            metric_params={"num_levels_dwt": 3},
        )
    elif cls.lower() == "shapedtw_paa":
        return ShapeDTW(
            subsequence_length=30,
            shape_descriptor_function="paa",
            metric_params={"num_intervals_paa": 5},
        )
    elif cls.lower() == "shapedtw_slope":
        return ShapeDTW(
            subsequence_length=30,
            shape_descriptor_function="slope",
            metric_params={"num_intervals_slope": 5},
        )
    elif cls.lower() == "shapedtw_hog1d":
        return ShapeDTW(
            subsequence_length=30,
            shape_descriptor_function="hog1d",
            metric_params={
                "num_bins_hog1d": 8,
                "num_intervals_hog1d": 2,
                "scaling_factor_hog1d": 0.1,
            },
        )
    elif cls.lower() == "tsfcomposite":
        # It defaults to TSF
        return ensemble.TimeSeriesForestClassifier()
    elif cls.lower() == "risecomposite":
        steps = [
            ("segment", RandomIntervalSegmenter(n_intervals=1, min_length=5)),
            (
                "transform",
                FeatureUnion([
                    (
                        "acf",
                        make_row_transformer(
                            FunctionTransformer(func=acf_coefs,
                                                validate=False)),
                    ),
                    (
                        "ps",
                        make_row_transformer(
                            FunctionTransformer(func=powerspectrum,
                                                validate=False)),
                    ),
                ]),
            ),
            ("tabularise", Tabularizer()),
            ("clf", DecisionTreeClassifier()),
        ]
        base_estimator = Pipeline(steps)
        return ensemble.TimeSeriesForestClassifier(estimator=base_estimator,
                                                   n_estimators=100)
    else:
        raise ValueError(f"Unknown classifier: {cls}")
Example 10
def test_bad_input_args(bad_interval):
    X = _make_nested_from_array(np.ones(10), n_instances=10, n_columns=2)
    with pytest.raises(ValueError):
        RandomIntervalSegmenter(n_intervals=bad_interval).fit(X)
Example 11
class RandomIntervalFeatureExtractor(BaseTransformer):
    """Random interval feature extractor transform.

    Transformer that segments time-series into random intervals
    and subsequently extracts series-to-primitives features from each interval.

    Parameters
    ----------
    n_intervals : str {'sqrt', 'log', 'random'}, int or float, optional (default='sqrt')
        Number of random intervals to generate, where m is the length of
        the time series:
        - If "log", log of m is used.
        - If "sqrt", sqrt of m is used.
        - If "random", a random number of intervals is generated.
        - If int, n_intervals intervals are generated.
        - If float, int(n_intervals * m) is used, with n_intervals giving
          the fraction of intervals relative to the time series length.

        For all arguments relative to the length of the time series, the
        generated number of intervals is always at least 1.

    min_length : int, optional (default=None)
        Minimum length of the generated intervals; passed through to the
        internal RandomIntervalSegmenter.

    max_length : int, optional (default=None)
        Maximum length of the generated intervals; passed through to the
        internal RandomIntervalSegmenter.

    features : list of functions, optional (default=None)
        Applies each function to random intervals to extract features.
        If None, the mean is extracted.

    random_state : int, RandomState instance, optional (default=None)
        - If int, random_state is the seed used by the random number
          generator;
        - If RandomState instance, random_state is the random number
          generator;
        - If None, the random number generator is the RandomState instance
          used by `np.random`.
    """

    _tags = {
        "fit_is_empty": False,
        "univariate-only": True,
        "scitype:transform-input": "Series",
        # what is the scitype of X: Series, or Panel
        "scitype:transform-output": "Primitives",
        # what is the scitype of y: None (not needed), Primitives, Series, Panel
        "scitype:instancewise": True,  # is this an instance-wise transform?
        "X_inner_mtype": "nested_univ",  # which mtypes do _fit/_predict support for X?
        "y_inner_mtype": "pd_Series_Table",  # and for y?
    }

    def __init__(
        self,
        n_intervals="sqrt",
        min_length=None,
        max_length=None,
        features=None,
        random_state=None,
    ):
        self.n_intervals = n_intervals
        self.min_length = min_length
        self.max_length = max_length
        self.random_state = random_state
        self.features = features
        super(RandomIntervalFeatureExtractor, self).__init__()

    def _fit(self, X, y=None):
        """
        Fit transformer, generating random interval indices.

        Parameters
        ----------
        X : pandas DataFrame of shape [n_samples, n_features]
            Input data
        y : pandas Series, shape (n_samples, ...), optional
            Targets for supervised learning.

        Returns
        -------
        self : RandomIntervalFeatureExtractor
            This estimator
        """
        # We use composition rather than inheritance here, because this transformer
        # has a different transform type (returns tabular) compared to the
        # RandomIntervalSegmenter (returns panel).
        self._interval_segmenter = RandomIntervalSegmenter(
            self.n_intervals, self.min_length, self.max_length, self.random_state
        )
        self._interval_segmenter.fit(X, y)
        self.intervals_ = self._interval_segmenter.intervals_
        self.input_shape_ = self._interval_segmenter.input_shape_
        self._time_index = self._interval_segmenter._time_index
        return self

    def _transform(self, X, y=None):
        """Transform X.

        Transform X, segments time-series in each column into random
        intervals using interval indices generated
        during `fit` and extracts features from each interval.

        Parameters
        ----------
        X : nested pandas.DataFrame of shape [n_instances, n_features]
            Nested dataframe with time-series in cells.

        Returns
        -------
        Xt : pandas.DataFrame
            Transformed pandas DataFrame with n_instances rows and one
            column for each extracted feature per generated interval.
        """
        # Check input of feature calculators, i.e. the list of functions to
        # be applied to the time series.
        features = _check_features(self.features)
        X = convert_to(X, "numpy3D")

        # Check that the input is of the same shape as the one passed
        # during fit.
        if X.shape[1] != self.input_shape_[1]:
            raise ValueError(
                "Number of columns of input is different from what was seen in `fit`"
            )
        # TODO: validate that the time indexes of the input series match
        # those seen in `fit`, and raise a ValueError otherwise.

        n_instances, _, _ = X.shape
        n_features = len(features)

        intervals = self.intervals_
        n_intervals = len(intervals)

        # Compute features on intervals.
        # Allocate the output array for the transformed data.
        Xt = np.zeros((n_instances, n_features * n_intervals))
        columns = []

        i = 0
        for func in features:
            # TODO generalise to series-to-series functions and function kwargs
            for start, end in intervals:
                interval = X[:, :, start:end]

                # Try to use optimised computations over axis if possible,
                # otherwise iterate over rows.
                try:
                    Xt[:, i] = func(interval, axis=-1).squeeze()
                except TypeError as e:
                    if (
                        str(e) == f"{func.__name__}() got an unexpected "
                        f"keyword argument 'axis'"
                    ):
                        Xt[:, i] = np.apply_along_axis(
                            func, axis=2, arr=interval
                        ).squeeze()
                    else:
                        raise
                i += 1
                columns.append(f"{start}_{end}_{func.__name__}")

        Xt = pd.DataFrame(Xt)
        Xt.columns = columns
        return Xt
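To make the class concrete, a minimal usage sketch on synthetic data. It assumes the _make_nested_from_array test helper used elsewhere in this section; the seed and interval settings are arbitrary:

X = _make_nested_from_array(np.random.randn(50), n_instances=20, n_columns=1)
t = RandomIntervalFeatureExtractor(
    n_intervals="sqrt", features=[np.mean, np.std], random_state=42
)
Xt = t.fit_transform(X)
# One column per (interval, feature) pair, named "{start}_{end}_{func}".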
Example 12
def set_classifier(cls, resampleId):
    """
    Basic way of determining the classifier to build. To differentiate settings just and another elif. So, for example, if
    you wanted tuned TSF, you just pass TuneTSF and set up the tuning mechanism in the elif.
    This may well get superceded, it is just how e have always done it
    :param cls: String indicating which classifier you want
    :return: A classifier.

    """
    if cls.lower() == "pf":
        return pf.ProximityForest(random_state=resampleId)
    elif cls.lower() == "pt":
        return pf.ProximityTree(random_state=resampleId)
    elif cls.lower() == "ps":
        return pf.ProximityStump(random_state=resampleId)
    elif cls.lower() == "rise":
        return fb.RandomIntervalSpectralForest(random_state=resampleId)
    elif cls.lower() == "tsf":
        return ib.TimeSeriesForest(random_state=resampleId)
    elif cls.lower() == "cif":
        return CanonicalIntervalForest(random_state=resampleId)
    elif cls.lower() == "boss":
        return BOSSEnsemble(random_state=resampleId)
    elif cls.lower() == "cboss":
        return ContractableBOSS(random_state=resampleId)
    elif cls.lower() == "tde":
        return TemporalDictionaryEnsemble(random_state=resampleId)
    elif cls.lower() == "st":
        return st.ShapeletTransformClassifier(time_contract_in_mins=1500)
    elif cls.lower() == "dtwcv":
        return nn.KNeighborsTimeSeriesClassifier(metric="dtwcv")
    elif cls.lower() == "ee" or cls.lower() == "elasticensemble":
        return dist.ElasticEnsemble()
    elif cls.lower() == "tsfcomposite":
        # It defaults to TSF
        return ensemble.TimeSeriesForestClassifier()
    elif cls.lower() == "risecomposite":
        steps = [
            ("segment", RandomIntervalSegmenter(n_intervals=1, min_length=5)),
            (
                "transform",
                FeatureUnion([
                    (
                        "acf",
                        make_row_transformer(
                            FunctionTransformer(func=acf_coefs,
                                                validate=False)),
                    ),
                    (
                        "ps",
                        make_row_transformer(
                            FunctionTransformer(func=powerspectrum,
                                                validate=False)),
                    ),
                ]),
            ),
            ("tabularise", Tabularizer()),
            ("clf", DecisionTreeClassifier()),
        ]
        base_estimator = Pipeline(steps)
        return ensemble.TimeSeriesForestClassifier(estimator=base_estimator,
                                                   n_estimators=100)
    elif cls.lower() == "rocket":
        rocket_pipeline = make_pipeline(
            Rocket(random_state=resampleId),
            # NOTE: `normalize` was removed from RidgeClassifierCV in
            # scikit-learn 1.2; on newer versions, scale via a separate
            # StandardScaler step instead.
            RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True),
        )
        return rocket_pipeline
    else:
        raise ValueError(f"Unknown classifier: {cls}")