def wrapper(self, data, labels=None, **kwargs):
    """Call ``func`` on a numpy array of shape [N, L, C] built from ``data``.

    ``func``, ``check_fitted`` and ``force_numpy`` are not defined in this
    function; they are taken from the enclosing scope.

    Parameters
    ----------
    data : pd.DataFrame or np.ndarray
        Input data, validated with ``check_X`` / ``check_X_y``.
    labels : optional (default=None)
        If given, validated together with ``data`` and forwarded to ``func``.
    **kwargs
        Passed through to ``func`` unchanged.

    Returns
    -------
    output
        Whatever ``func`` returns. A numpy result is wrapped in a
        pd.DataFrame carrying the original index when the input was a
        pd.DataFrame and ``force_numpy`` is falsy.
    """
    # Remember the input container so the result can be converted back
    came_as_frame = isinstance(data, pd.DataFrame)
    original_index = data.index if came_as_frame else None

    # Fit checks
    if check_fitted:
        self.check_is_fitted()

    # Bring everything into the same (nested pandas) format first
    if labels is not None:
        data, labels = check_X_y(data, labels, coerce_to_pandas=True)
    else:
        data = check_X(data, coerce_to_pandas=True)

    # sktime stores [N, C, L] while the signature code expects [N, L, C]
    # (C being channels), hence the transpose.
    as_array = from_nested_to_3d_numpy(data)
    data = np.transpose(as_array, [0, 2, 1])

    # Apply the function to the transposed array
    call_args = (data,) if labels is None else (data, labels)
    output = func(self, *call_args, **kwargs)

    # Convert back to pandas only when all three conditions hold
    convert_back = all(
        (came_as_frame, isinstance(output, np.ndarray), not force_numpy)
    )
    if convert_back:
        output = pd.DataFrame(index=original_index, data=output)
    return output
def _transform_words(self, X):
    """Transform X into one merged bag of SFA words per instance.

    For every column in ``self.col_names`` and every window size fitted for
    that column, the matching fitted SFA transformer is applied and its
    words are re-keyed with ``MUSE.shift_left`` so that words from
    different dimensions and window sizes do not collide.

    Parameters
    ----------
    X : nested pd.DataFrame or 3d np.ndarray
        Input data; coerced to a nested pd.DataFrame by ``check_X``.

    Returns
    -------
    bag_all_words : list of dict
        One dict per instance, mapping the prefixed word to its value.
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=False, coerce_to_pandas=True)

    if self.use_first_order_differences:
        X = self.add_first_order_differences(X)

    n_cases = len(X)
    bag_all_words = [dict() for _ in range(n_cases)]

    # Run SFA separately on each dimension
    for dim, col in enumerate(self.col_names):
        dim_array = from_nested_to_3d_numpy(X[[col]])

        for pos, window_size in enumerate(self.window_sizes[dim]):
            # SFA transform; the first element of the result is the bag
            transformed = self.SFA_transformers[dim][pos].transform(dim_array)
            bag = transformed[0]

            # Merge the bags of the different window sizes into a single
            # bag, prefixing each word with the window length used.
            highest = np.int32(self.highest_bits[dim])
            for case, case_bag in enumerate(bag):
                merged = bag_all_words[case]
                for key, value in case_bag.items():
                    # the prefix distinguishes between window sizes
                    word = MUSE.shift_left(
                        key, highest, dim, self.highest_dim_bit, window_size
                    )
                    merged[word] = value

    return bag_all_words
def test_from_nested_to_3d_numpy(n_instances, n_columns, n_timepoints):
    """Check type, shape and values of the nested -> 3d numpy conversion."""
    X_nested, _ = make_classification_problem(n_instances, n_columns, n_timepoints)
    X_3d = from_nested_to_3d_numpy(X_nested)

    # type and shape
    expected_shape = (n_instances, n_columns, n_timepoints)
    assert isinstance(X_3d, np.ndarray)
    assert X_3d.shape == expected_shape

    # spot-check one series: second instance, first column
    np.testing.assert_array_equal(X_nested.iloc[1, 0], X_3d[1, 0, :])
def _multivariate_nested_df_to_array(X):
    """Convert the nested data X with ``from_nested_to_3d_numpy``.

    The converted array is returned unchanged (no axis reordering).
    """
    converted = from_nested_to_3d_numpy(X)
    return converted
def check_X(
    X,
    enforce_univariate=False,
    enforce_min_instances=1,
    enforce_min_columns=1,
    coerce_to_numpy=False,
    coerce_to_pandas=False,
):
    """Validate input data and optionally convert its container type.

    Parameters
    ----------
    X : pd.DataFrame or np.array
        Input data
    enforce_univariate : bool, optional (default=False)
        Enforce that X is univariate.
    enforce_min_instances : int, optional (default=1)
        Enforce minimum number of instances.
    enforce_min_columns : int, optional (default=1)
        Enforce minimum number of columns (or time-series variables).
    coerce_to_numpy : bool, optional (default=False)
        If True, X will be coerced to a 3-dimensional numpy array.
    coerce_to_pandas : bool, optional (default=False)
        If True, X will be coerced to a nested pandas DataFrame.

    Returns
    -------
    X : pd.DataFrame or np.array
        Checked and possibly converted input data

    Raises
    ------
    ValueError
        If X is invalid input data, or if both coercion flags are set.
    """
    # the two coercion targets are mutually exclusive
    if coerce_to_pandas and coerce_to_numpy:
        raise ValueError(
            "`coerce_to_pandas` and `coerce_to_numpy` cannot both be set to True"
        )

    # input type
    if not isinstance(X, VALID_X_TYPES):
        raise ValueError(
            f"X must be a pd.DataFrame or a np.array, " f"but found: {type(X)}"
        )

    # np.array: the number of dimensions has to be verified first, otherwise
    # the shape of the second dimension may not be available below
    if isinstance(X, np.ndarray):
        if X.ndim != 3:
            raise ValueError(
                f"If passed as a np.array, X must be a 3-dimensional "
                f"array, but found shape: {X.shape}"
            )
        if coerce_to_pandas:
            X = from_3d_numpy_to_nested(X)

    # minimum number of columns
    n_columns = X.shape[1]
    if n_columns < enforce_min_columns:
        raise ValueError(
            f"X must contain at least: {enforce_min_columns} columns, "
            f"but found only: {n_columns}."
        )

    # univariate data
    if enforce_univariate and n_columns > 1:
        raise ValueError(
            f"X must be univariate with X.shape[1] == 1, but found: "
            f"X.shape[1] == {n_columns}."
        )

    # minimum number of instances
    if enforce_min_instances > 0:
        _enforce_min_instances(X, min_instances=enforce_min_instances)

    # everything below only concerns pd.DataFrame input
    if not isinstance(X, pd.DataFrame):
        return X

    if not is_nested_dataframe(X):
        raise ValueError(
            "If passed as a pd.DataFrame, X must be a nested "
            "pd.DataFrame, with pd.Series or np.arrays inside cells."
        )

    # convert pd.DataFrame
    if coerce_to_numpy:
        X = from_nested_to_3d_numpy(X)
    return X
def fit(self, X, y):
    """Build a WEASEL+MUSE classifier from the training set (X, y).

    For every column (dimension) of X a set of SFA transformers is fitted,
    one per window size. The resulting bags of words are optionally
    filtered with a chi-squared test, merged into one bag per instance and
    used to train a logistic regression pipeline stored in ``self.clf``.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, n_dims]
        Nested dataframe with one time series per cell; every column is
        processed as a separate dimension.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, coerce_to_pandas=True)
    y = np.asarray(y)

    # add first order differences in each dimension to TS
    if self.use_first_order_differences:
        X = self.add_first_order_differences(X)

    # Window length parameter space dependent on series length
    self.col_names = X.columns

    rng = check_random_state(self.random_state)

    self.n_dims = len(self.col_names)
    self.highest_dim_bit = (math.ceil(math.log2(self.n_dims))) + 1
    self.highest_bits = np.zeros(self.n_dims)

    self.SFA_transformers = [[] for _ in range(self.n_dims)]

    # Start from an empty list on every call: the loop below appends one
    # entry per dimension and then reads it back via ``window_sizes[ind]``,
    # so entries left over from an earlier ``fit`` would be picked up
    # instead of the ones computed for this data.
    self.window_sizes = []

    # the words of all dimensions and all time series
    all_words = [dict() for _ in range(X.shape[0])]

    # On each dimension, perform SFA
    for ind, column in enumerate(self.col_names):
        X_dim = X[[column]]
        X_dim = from_nested_to_3d_numpy(X_dim)
        series_length = X_dim.shape[-1]  # TODO compute minimum over all ts ?

        # increment window size in steps of 'win_inc'
        win_inc = self.compute_window_inc(series_length)

        # NOTE(review): this overwrites the ``max_window`` attribute with
        # the value clipped to the series length.
        self.max_window = int(min(series_length, self.max_window))

        self.window_sizes.append(
            list(range(self.min_window, self.max_window, win_inc))
        )

        self.highest_bits[ind] = math.ceil(math.log2(self.max_window)) + 1

        for window_size in self.window_sizes[ind]:
            # The order of the rng.choice calls (word_length, norm,
            # binning_method) determines the random draws; keep it.
            transformer = SFA(
                word_length=rng.choice(self.word_lengths),
                alphabet_size=self.alphabet_size,
                window_size=window_size,
                norm=rng.choice(self.norm_options),
                anova=self.anova,
                binning_method=rng.choice(self.binning_strategies),
                bigrams=self.bigrams,
                remove_repeat_words=False,
                lower_bounding=False,
                save_words=False,
            )

            sfa_words = transformer.fit_transform(X_dim, y)

            self.SFA_transformers[ind].append(transformer)
            bag = sfa_words[0]

            # chi-squared test to keep only relevant features
            relevant_features = set()
            apply_chi_squared = self.p_threshold < 1
            if apply_chi_squared:
                vectorizer = DictVectorizer(sparse=True, dtype=np.int32, sort=False)
                bag_vec = vectorizer.fit_transform(bag)

                # only the p-values are needed for the selection
                _, p = chi2(bag_vec, y)
                relevant_features_idx = np.where(p <= self.p_threshold)[0]
                relevant_features = set(
                    np.array(vectorizer.feature_names_)[relevant_features_idx]
                )

            # merging bag-of-patterns of different window_sizes
            # to single bag-of-patterns with prefix indicating
            # the used window-length
            highest = np.int32(self.highest_bits[ind])
            for j, instance_bag in enumerate(bag):
                for key, value in instance_bag.items():
                    # chi-squared test
                    if (not apply_chi_squared) or (key in relevant_features):
                        # append the prefixes to the words to
                        # distinguish between window-sizes
                        word = MUSE.shift_left(
                            key, highest, ind, self.highest_dim_bit, window_size
                        )
                        all_words[j][word] = value

    self.clf = make_pipeline(
        DictVectorizer(sparse=True, sort=False),
        # StandardScaler(with_mean=True, copy=False),
        LogisticRegression(
            max_iter=5000,
            solver="liblinear",
            dual=True,
            # class_weight="balanced",
            penalty="l2",
            random_state=self.random_state,
        ),
    )

    self.clf.fit(all_words, y)
    self._is_fitted = True
    return self
def _multivariate_nested_df_to_array(X):
    """Convert nested data X to a 3d array with its last two axes swapped.

    The result of ``from_nested_to_3d_numpy`` is reordered from
    [n][d][m] to [n][m][d].
    """
    converted = from_nested_to_3d_numpy(X)
    axis_order = (0, 2, 1)
    return converted.transpose(axis_order)