def test_fit_transform_output(self, estimator_instance, scenario):
    """Check that fit/transform returns data of the expected scitype.

    Runs "fit" followed by "transform" through the scenario, confirms that
    the scenario's "X_scitype" tag matches the data it passes, confirms that
    the transform output has the scitype given by
    ``_expected_trafo_output_scitype``, and finally compares instance/panel
    counts of input and output for same-scitype transformations.
    """
    X = scenario.args["transform"]["X"]
    Xt = scenario.run(estimator_instance, method_sequence=["fit", "transform"])

    X_scitype = scenario.get_tag("X_scitype")
    trafo_input = estimator_instance.get_tag("scitype:transform-input")
    trafo_output = estimator_instance.get_tag("scitype:transform-output")

    # the scenario's X_scitype tag must agree with the data it provides
    X_ok, _, X_meta = check_is_scitype(X, scitype=X_scitype, return_metadata=True)
    tag_error = (
        f"error with scenario {type(scenario).__name__}, X_scitype tag "
        f'was "{X_scitype}", but check_is_scitype does not confirm this'
    )
    assert X_ok, tag_error

    expected_scitype = self._expected_trafo_output_scitype(
        X_scitype, trafo_input, trafo_output
    )

    # todo 0.11.0 or 0.12.0:
    #   remove this once #2219 is merged, which adds Hierarchical support
    #   until then, skip tests if expected scitype is Hierarchical
    if expected_scitype == "Hierarchical":
        return

    Xt_ok, _, Xt_meta = check_is_scitype(
        Xt, scitype=expected_scitype, return_metadata=True
    )
    output_error = (
        f"{type(estimator_instance).__name__}.transform should return an object of "
        f"scitype {expected_scitype} when given an input of scitype {X_scitype},"
        f" but found the following return: {Xt}"
    )
    assert Xt_ok, output_error

    # skip the "number of instances" checks below for Aggregator, Reconciler
    # reason: these add "pseudo-instances" for the __total and increase the count
    # todo: we probably want to mirror this into a "hierarchical" tag later on
    if type(estimator_instance).__name__ in ("Aggregator", "Reconciler"):
        return

    # from here on Xt is known to have its expected scitype
    trafo_scitypes = (trafo_input, trafo_output)
    data_scitypes = (X_scitype, expected_scitype)
    same_level_pairs = (("Panel", "Panel"), ("Hierarchical", "Hierarchical"))

    # if we vectorize, number of instances before/after transform should be same
    if trafo_scitypes == ("Series", "Series"):
        if data_scitypes == ("Series", "Series"):
            if estimator_instance.get_tag("transform-returns-same-time-index"):
                assert X.shape[0] == Xt.shape[0]
        elif data_scitypes in same_level_pairs:
            assert X_meta["n_instances"] == Xt_meta["n_instances"]

    if trafo_scitypes == ("Panel", "Panel"):
        if data_scitypes == ("Hierarchical", "Hierarchical"):
            assert X_meta["n_panels"] == Xt_meta["n_panels"]
def test_classifier_output(self, estimator_instance, scenario):
    """Check type, shape and values of classifier predictions.

    ``predict`` must return a 1D np.ndarray with one entry per instance of
    the predict input, containing only labels that occur in the training
    labels. ``predict_proba`` must return an np.ndarray of shape
    (number of instances, n_classes) whose rows sum to one.
    """
    n_classes = scenario.get_tag("n_classes")
    X_new = scenario.args["predict"]["X"]
    y_train = scenario.args["fit"]["y"]

    # the instance count is read from check_is_scitype metadata
    # this is more robust against different scitypes in X_new
    panel_check = check_is_scitype(X_new, "Panel", return_metadata=True)
    n_new_instances = panel_check[2]["n_instances"]

    # fit, then predict
    y_pred = scenario.run(estimator_instance, method_sequence=["fit", "predict"])

    assert isinstance(y_pred, np.ndarray)
    assert y_pred.shape == (n_new_instances,)
    predicted_labels = np.unique(y_pred)
    training_labels = np.unique(y_train)
    assert np.all(np.isin(predicted_labels, training_labels))

    # predict_proba is run on the already fitted estimator
    # (all classifiers have predict_proba by default)
    y_proba = scenario.run(estimator_instance, method_sequence=["predict_proba"])

    assert isinstance(y_proba, np.ndarray)
    assert y_proba.shape == (n_new_instances, n_classes)
    np.testing.assert_allclose(y_proba.sum(axis=1), 1)
def _pairwise_table_x_check(self, X, var_name="X"):
    """Validate Table input and coerce it to the inner mtype.

    Checks that X is of Table scitype and converts it to the format
    named by the "X_inner_mtype" tag.

    Parameters
    ----------
    X : pd.DataFrame, pd.Series, numpy 1D or 2D, list of dicts
        sktime data container compliant with the Table scitype
        The value to be checked and coerced
    var_name : str, variable name to print in error messages

    Returns
    -------
    X : Table data container of a supported format in X_inner_mtype
        usually a 2D np.ndarray or a pd.DataFrame, unless overridden

    Raises
    ------
    TypeError
        If X is not recognised as Table scitype by check_is_scitype.
    """
    is_table = check_is_scitype(
        X, "Table", return_metadata=False, var_name=var_name
    )

    if is_table:
        inner_mtype = self.get_tag("X_inner_mtype")
        return convert_to(X, to_type=inner_mtype, as_scitype="Table")

    raise TypeError(
        "X and X2 must be in an sktime compatible format, of scitype Table, "
        "for instance a pandas.DataFrame or a 2D numpy.ndarray. "
        "See the data format tutorial examples/AA_datatypes_and_datasets.ipynb"
    )
def test_series_in_series_out_supported_fit_in_transform():
    """Test that fit/transform runs and returns the correct output type.

    Setting: transformer has tags
        "scitype:transform-input" = "Series"
        "scitype:transform-output" = "Series"
        "fit_is_empty" = True
        "X_inner_mtype" supports "Series"

    X input to fit/transform has Series scitype
    X output from fit/transform should be Series
    """
    # one example for a transformer which supports Series internally
    est = ExponentTransformer.create_test_instance()

    # sanity checks that the example class still matches the setting above
    # if one of these fails, this is not a failure of the class itself,
    # (it may be due to implementing more scitypes) - choose another example
    assert "Series" in inner_X_scitypes(est)
    assert est.get_class_tag("fit_is_empty")
    for scitype_tag in ("scitype:transform-input", "scitype:transform-output"):
        assert est.get_class_tag(scitype_tag) == "Series"

    # scenario in which series are passed to fit/transform
    scenario = TransformerFitTransformSeriesUnivariate()
    Xt = scenario.run(est, method_sequence=["fit", "transform"])

    is_series = check_is_scitype(Xt, scitype="Series", return_metadata=True)[0]
    assert is_series, "fit.transform does not return a Series when given a Series"
def test_panel_in_primitives_out_supported_with_y_in_fit_but_not_transform():
    """Test that fit/transform runs and returns the correct output type.

    Setting: transformer has tags
        "scitype:transform-input" = "Series"
        "scitype:transform-output" = "Primitives"
        "fit_is_empty" = False
        "requires_y" = True
        "X_inner_mtype" supports "Panel"

    X input to fit/transform has Panel scitype
    X output from fit/transform should be Table
    """
    # one example for a transformer which supports Panel internally
    cls = TSFreshRelevantFeatureExtractor
    est = cls.create_test_instance()

    # ensure cls is a good example, if this fails, choose another example
    #   (if this changes, it may be due to implementing more scitypes)
    #   (then this is not a failure of cls, but we need to choose another example)
    assert "Panel" in inner_X_scitypes(est)
    assert not est.get_tag("fit_is_empty")
    assert est.get_tag("requires_y")
    assert est.get_tag("scitype:transform-input") == "Series"
    assert est.get_tag("scitype:transform-output") == "Primitives"

    # scenario in which a panel is passed to fit/transform, with y only in fit
    scenario = TransformerFitTransformPanelUnivariateWithClassYOnlyFit()
    Xt = scenario.run(est, method_sequence=["fit", "transform"])

    valid, _, _ = check_is_scitype(Xt, scitype="Table", return_metadata=True)
    # the input is a Panel and the output is checked to be a Table
    assert valid, "fit.transform does not return a Table when given a Panel"
    # todo: possibly, add mtype check, use metadata return

    # length of Xt should be seven = number of samples in the scenario
    assert len(Xt) == 7
def _check_classifier_input(
    X,
    y=None,
    enforce_min_instances=1,
):
    """Check whether input X and y are valid formats with minimum data.

    Parameters
    ----------
    X : check whether conformant with any sktime Panel mtype specification
    y : check whether a pd.Series or np.array
    enforce_min_instances : int, optional (default=1)
        check there are a minimum number of instances.

    Returns
    -------
    metadata : dict with metadata for X returned by datatypes.check_is_scitype

    Raises
    ------
    TypeError
        If X is not recognised as Panel scitype by check_is_scitype.
    ValueError
        If X has fewer than ``enforce_min_instances`` instances,
        if y is neither a pd.Series nor a np.ndarray,
        if the number of entries in y differs from the number of instances
        in X, or if y is a np.ndarray with more than one dimension.
    """
    # Check X is valid input type and recover the data characteristics
    X_valid, _, X_metadata = check_is_scitype(
        X, scitype="Panel", return_metadata=True
    )
    if not X_valid:
        # sentences are separated explicitly, adjacent literals are concatenated
        raise TypeError(
            "X is not of a supported input data type. "
            f"X must be in a supported mtype format for Panel, found {type(X)}. "
            "Use datatypes.check_is_mtype to check conformance with specifications."
        )

    n_cases = X_metadata["n_instances"]
    if n_cases < enforce_min_instances:
        raise ValueError(
            f"Minimum number of cases required is {enforce_min_instances} but X "
            f"has : {n_cases}"
        )

    # Check y if passed
    if y is not None:
        # Check y valid input
        if not isinstance(y, (pd.Series, np.ndarray)):
            raise ValueError(
                f"y must be a np.array or a pd.Series, but found type: {type(y)}"
            )
        # Check matching number of labels
        n_labels = y.shape[0]
        if n_cases != n_labels:
            raise ValueError(
                f"Mismatch in number of cases. Number in X = {n_cases} nos in y = "
                f"{n_labels}"
            )
        if isinstance(y, np.ndarray) and y.ndim > 1:
            raise ValueError(
                f"y must be 1-dimensional but is in fact {y.ndim} dimensional"
            )

    return X_metadata
def _check_clusterer_input(
    self, X: TimeSeriesInstances, enforce_min_instances: int = 1
) -> TimeSeriesInstances:
    """Validate the input and prepare for _fit.

    Parameters
    ----------
    X : np.ndarray (2d or 3d array of shape (n_instances, series_length) or shape
        (n_instances,n_dimensions,series_length)) or nested pd.DataFrame (
        n_instances,n_dimensions).
        Training time series instances to cluster.
    enforce_min_instances : int, optional (default=1)
        Minimum number of instances that X must contain.

    Returns
    -------
    X : np.ndarray (3d of shape (n_instances,n_dimensions,series_length)) or
        pd.Dataframe (n_instances,n_dimensions).
        X converted to the mtype given by the "X_inner_mtype" tag, ready for _fit.

    Raises
    ------
    TypeError
        If X, after the initial conversion, is not recognised as Panel scitype.
    ValueError
        If X has fewer than ``enforce_min_instances`` instances.
    """
    X = self._initial_conversion(X)

    X_valid, _, X_metadata = check_is_scitype(
        X, scitype="Panel", return_metadata=True
    )
    if not X_valid:
        # sentences are separated explicitly, adjacent literals are concatenated
        raise TypeError(
            "X is not of a supported input data type. "
            f"X must be of type np.ndarray or pd.DataFrame, found {type(X)}. "
            "Use datatypes.check_is_mtype to check conformance with "
            "specifications."
        )

    n_cases = X_metadata["n_instances"]
    if n_cases < enforce_min_instances:
        raise ValueError(
            f"Minimum number of cases required is {enforce_min_instances} but X "
            f"has : {n_cases}"
        )

    # data characteristics from the metadata are passed to the capability check
    missing = X_metadata["has_nans"]
    multivariate = not X_metadata["is_univariate"]
    unequal = not X_metadata["is_equal_length"]
    self._check_capabilities(missing, multivariate, unequal)

    return convert_to(
        X,
        to_type=self.get_tag("X_inner_mtype"),
        as_scitype="Panel",
    )
def _pairwise_panel_x_check(self, X, var_name="X"):
    """Check and coerce input data.

    Method used to check the input and convert Series/Panel input
        to internally used format, as defined in X_inner_mtype tag

    Parameters
    ----------
    X: List of dfs, Numpy of dfs, 3d numpy
        sktime data container compliant with the Series or Panel scitype
        The value to be checked
    var_name: str, variable name to print in error messages

    Returns
    -------
    X: Panel data container of a supported format in X_inner_mtype
        usually df-list, list of pd.DataFrame, unless overridden

    Raises
    ------
    TypeError
        If X is not recognised as Series or Panel scitype.
    RuntimeError
        If check_is_scitype reports a scitype other than Series or Panel
        for input it accepted as valid.
    """
    check_res = check_is_scitype(
        X, ["Series", "Panel"], return_metadata=True, var_name=var_name
    )
    X_valid = check_res[0]

    if not X_valid:
        msg = (
            "X and X2 must be in an sktime compatible format, "
            "of scitype Series or Panel, "
            "for instance a pandas.DataFrame with sktime compatible time indices, "
            "or with MultiIndex and lowest level a sktime compatible time index. "
            "See the data format tutorial examples/AA_datatypes_and_datasets.ipynb"
        )
        raise TypeError(msg)

    # metadata is read only after validation succeeded, so that invalid input
    # always raises the TypeError above rather than failing on the lookup
    # (the metadata return may not be populated for invalid input)
    metadata = check_res[2]
    X_scitype = metadata["scitype"]

    # if the input is a single series, convert it to a Panel
    if X_scitype == "Series":
        X = convert_Series_to_Panel(X)
    # can't be anything else if check_is_scitype is working properly
    elif X_scitype != "Panel":
        raise RuntimeError("Unexpected error in check_is_scitype, check validity")

    X_inner_mtype = self.get_tag("X_inner_mtype")
    X_coerced = convert_to(X, to_type=X_inner_mtype, as_scitype="Panel")

    return X_coerced
def _pairwise_panel_x_check(self, X, var_name="X"):
    """Check and coerce input data.

    Method used to check the input and convert Series/Panel input
        to internally used format, as defined in X_inner_mtype tag

    Parameters
    ----------
    X: List of dfs, Numpy of dfs, 3d numpy
        The value to be checked
    var_name: str, variable name to print in error messages

    Returns
    -------
    X: Panel data container of a supported format in X_inner_mtype
        usually df-list, list of pd.DataFrame, unless overridden

    Raises
    ------
    TypeError
        If X is not recognised as Series or Panel scitype.
    RuntimeError
        If check_is_scitype reports a scitype other than Series or Panel
        for input it accepted as valid.
    """
    check_res = check_is_scitype(
        X, ["Series", "Panel"], return_metadata=True, var_name=var_name
    )
    X_valid = check_res[0]

    if not X_valid:
        raise TypeError("X/X2 must be of Series or Panel scitype")

    # metadata is read only after validation succeeded, so that invalid input
    # always raises the TypeError above rather than failing on the lookup
    # (the metadata return may not be populated for invalid input)
    metadata = check_res[2]
    X_scitype = metadata["scitype"]

    # if the input is a single series, convert it to a Panel
    if X_scitype == "Series":
        X = convert_Series_to_Panel(X)
    # can't be anything else if check_is_scitype is working properly
    elif X_scitype != "Panel":
        raise RuntimeError("Unexpected error in check_is_scitype, check validity")

    X_inner_mtype = self.get_tag("X_inner_mtype")
    X_coerced = convert_to(X, to_type=X_inner_mtype, as_scitype="Panel")

    return X_coerced
def test_hierarchical_in_hierarchical_out_not_supported_but_series_fit_in_transform():
    """Test that fit/transform runs and returns the correct output type.

    Setting: transformer has tags
        "scitype:transform-input" = "Series"
        "scitype:transform-output" = "Series"
        "fit_is_empty" = True
        "X_inner_mtype" supports "Series" but not "Panel" and not "Hierarchical"

    X input to fit/transform has Hierarchical scitype
    X output from fit/transform should be Hierarchical
    """
    # one example for a transformer which supports Series internally
    cls = ExponentTransformer
    est = cls.create_test_instance()

    # ensure cls is a good example, if this fails, choose another example
    #   (if this changes, it may be due to implementing more scitypes)
    #   (then this is not a failure of cls, but we need to choose another example)
    assert "Series" in inner_X_scitypes(est)
    assert "Panel" not in inner_X_scitypes(est)
    assert "Hierarchical" not in inner_X_scitypes(est)
    assert est.get_tag("fit_is_empty")
    assert est.get_tag("scitype:transform-input") == "Series"
    assert est.get_tag("scitype:transform-output") == "Series"

    # scenario in which hierarchical data is passed to fit/transform
    scenario = TransformerFitTransformHierarchicalUnivariate()
    Xt = scenario.run(est, method_sequence=["fit", "transform"])

    valid, _, _ = check_is_scitype(Xt, scitype="Hierarchical", return_metadata=True)
    # the message names the scitypes that are actually checked here
    assert valid, (
        "fit.transform does not return a Hierarchical when given a Hierarchical"
    )
    # todo: possibly, add mtype check, use metadata return

    # length of Xt should be number of hierarchy levels times number of time points
    assert len(Xt) == 2 * 4 * 12
def _check_ys(self, y_true, y_pred, multioutput):
    """Validate y_pred as Proba scitype and convert it to the inner mtype.

    Parameters
    ----------
    y_true : passed through to ``_check_consistent_input``
    y_pred : checked against the "Proba" scitype, then converted to the
        mtype given by the "scitype:y_pred" tag
    multioutput : if None, ``self.multioutput`` is used instead

    Returns
    -------
    y_true : as returned by ``_check_consistent_input``
    y_pred_inner : y_pred converted to the mtype in the "scitype:y_pred" tag
    multioutput : as returned by ``_check_consistent_input``

    Raises
    ------
    TypeError
        If y_pred is not of Proba scitype, with the message returned by
        check_is_scitype.
    """
    multioutput = self.multioutput if multioutput is None else multioutput

    is_proba, error_msg, proba_metadata = check_is_scitype(
        y_pred, scitype="Proba", return_metadata=True, var_name="y_pred"
    )
    if not is_proba:
        raise TypeError(error_msg)

    y_pred_inner = convert(
        y_pred,
        from_type=proba_metadata["mtype"],
        to_type=self.get_tag("scitype:y_pred"),
        as_scitype="Proba",
    )

    # the consistency check receives the unconverted y_pred,
    # its y_pred return is not used - the converted y_pred_inner is returned
    y_true, _, multioutput = self._check_consistent_input(
        y_true, y_pred, multioutput
    )

    return y_true, y_pred_inner, multioutput
def get_window(obj, window_length=None, lag=0):
    """Slice obj to the time index window with given length and lag.

    Returns time series or time series panel with time indices
        strictly greater than cutoff - lag - window_length, and
        equal or less than cutoff - lag.
    Cutoff is that of obj, as determined by get_cutoff.

    Parameters
    ----------
    obj : sktime compatible time series data container or None
        if not None, must be of one of the following mtypes:
            pd.Series, pd.DataFrame, np.ndarray, of Series scitype
            pd.multiindex, numpy3D, nested_univ, df-list, of Panel scitype
            pd_multiindex_hier, of Hierarchical scitype
    window_length : int or timedelta, optional, default=None
        must be int if obj is int indexed, timedelta if datetime indexed
        length of the window to slice to
        None means a window of infinite size, obj is then returned unchanged
    lag : int or timedelta, optional, default = 0
        must be int if obj is int indexed, timedelta if datetime indexed
        lag of the latest time in the window, with respect to cutoff of obj

    Returns
    -------
    obj sub-set to time indices in the semi-open interval
        (cutoff - window_length - lag, cutoff - lag]
        obj itself, unchanged, if window_length is None
        None if obj was None

    Raises
    ------
    ValueError
        If obj is not of Series, Panel, or Hierarchical scitype.
    """
    # nothing to slice - return before the function-scope import below is needed
    if window_length is None or obj is None:
        return obj

    # kept as a function-scope import, as in the original
    from sktime.datatypes import check_is_scitype, convert_to

    valid, _, metadata = check_is_scitype(
        obj, scitype=["Series", "Panel", "Hierarchical"], return_metadata=True
    )
    if not valid:
        raise ValueError("obj must be of Series, Panel, or Hierarchical scitype")
    # remembered so that the DataFrame branch can convert back to the input mtype
    obj_in_mtype = metadata["mtype"]

    obj = convert_to(obj, GET_LATEST_WINDOW_SUPPORTED_MTYPES)

    # numpy3D (Panel) or np.ndarray (Series)
    if isinstance(obj, np.ndarray):
        # NOTE(review): the window is taken along the first axis (len(obj));
        #   confirm that this is the time axis for 3D input
        obj_len = len(obj)
        # negative positions counted from the end, clipped to the array length
        window_start = max(-window_length - lag, -obj_len)
        window_end = max(-lag, -obj_len)
        # an end position of 0 would give an empty slice, so slice to the end
        if window_end == 0:
            return obj[window_start:]
        return obj[window_start:window_end]

    # pd.DataFrame (Series), pd-multiindex (Panel), pd_multiindex_hier (Hierarchical)
    if isinstance(obj, pd.DataFrame):
        cutoff = get_cutoff(obj)
        win_start_excl = cutoff - window_length - lag
        win_end_incl = cutoff - lag

        # for MultiIndex, the time index is the last index level
        if not isinstance(obj.index, pd.MultiIndex):
            time_indices = obj.index
        else:
            time_indices = obj.index.get_level_values(-1)

        win_select = (time_indices > win_start_excl) & (time_indices <= win_end_incl)
        obj_subset = obj.iloc[win_select]

        return convert_to(obj_subset, obj_in_mtype)

    raise ValueError(
        "bug in get_window, unreachable condition, ifs should be exhaustive"
    )