def distance(instance_a, instance_b, **params):
    """Compute the distance between two nested instances using ``distance_measure``.

    Both instances are tabularized to 2d numpy arrays and transposed
    (time points as rows) before the measure is applied.
    """
    # todo use specific dimension rather than whole thing?
    a = from_nested_to_2d_array(instance_a, return_numpy=True)
    b = from_nested_to_2d_array(instance_b, return_numpy=True)
    return distance_measure(np.transpose(a), np.transpose(b), **params)
def test_from_nested_to_2d_array(n_instances, n_columns, n_timepoints):
    """Test from_nested_to_2d_array for correctness."""
    nested, _ = make_classification_problem(n_instances, n_columns, n_timepoints)

    tabular = from_nested_to_2d_array(nested)

    # One row per instance, one column per (column, timepoint) pair,
    # and the original index carried over.
    assert tabular.shape == (n_instances, n_columns * n_timepoints)
    assert tabular.index.equals(nested.index)
def transform(self, X, y=None):
    """Concatenate multivariate time series/panel data into long univariate
    time series/panel data by simply concatenating times series in time.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_samples, n_features]
        Nested dataframe with time-series in cells.
    y : ignored, exists for API consistency.

    Returns
    -------
    Xt : pandas DataFrame
        Transformed pandas DataFrame with same number of rows and single
        column.
    """
    self.check_is_fitted()
    X = check_X(X)

    # Flatten all columns into one wide table, then re-nest the whole
    # table into a single nested column.
    if isinstance(X, pd.DataFrame):
        flat = from_nested_to_2d_array(X)
    else:
        flat = from_3d_numpy_to_2d_array(X)
    return from_2d_array_to_nested(flat)
def test_output_format_dim(len_series, n_instances, n_components):
    """PCATransformer output shape and type checks."""
    np.random.seed(42)
    X = from_2d_array_to_nested(
        pd.DataFrame(data=np.random.randn(n_instances, len_series))
    )

    Xt = PCATransformer(n_components=n_components).fit_transform(X)

    # Output is a nested DataFrame with one row per input instance.
    assert isinstance(Xt, pd.DataFrame)
    assert Xt.shape[0] == X.shape[0]

    # Number of principal components is capped by the series length.
    n_out = from_nested_to_2d_array(Xt).shape[1]
    n_in = from_nested_to_2d_array(X).shape[1]
    assert n_out == min(n_components, n_in)
def _combine_data_frames(self, dataFrames, weighting_factor, col_names):
    """Combine two dataframes together into a single dataframe.

    Used when the shape_descriptor_function is set to "compound".

    The second descriptor's values are scaled by ``weighting_factor``
    and appended, instance by instance, after the first descriptor's
    values.
    """
    first_desc = dataFrames[0]
    second_desc = dataFrames[1]

    first_desc_array = []
    second_desc_array = []

    # Convert the dataframes into arrays (one 2d numpy array per column)
    for x in first_desc.columns:
        first_desc_array.append(
            from_nested_to_2d_array(first_desc[x], return_numpy=True))

    for x in second_desc.columns:
        second_desc_array.append(
            from_nested_to_2d_array(second_desc[x], return_numpy=True))

    # Concatenate the arrays together: for each column x and instance y,
    # the combined row is [first_desc values, weighted second_desc values].
    res = []
    for x in range(len(first_desc_array)):
        dim1 = []
        for y in range(len(first_desc_array[x])):
            dim2 = []
            dim2.extend(first_desc_array[x][y])
            dim2.extend(second_desc_array[x][y] * weighting_factor)
            dim1.append(dim2)
        res.append(dim1)

    res = np.asarray(res)

    # Convert to pandas dataframe
    df = pd.DataFrame()

    # NOTE(review): `res[col]` indexes the first (column) axis of the
    # array with a column *name* — this assumes col_names are integer
    # positions 0..n-1; verify against callers.
    for col in col_names:
        colToAdd = []
        for row in range(len(res[col])):
            inst = res[col][row]
            colToAdd.append(pd.Series(inst))
        df[col] = colToAdd
    return df
def row_first(X):
    """Tabularize each column of ``X`` and keep its first value per row,
    concatenated into a single DataFrame (one output column per input column)."""
    if isinstance(X, pd.Series):
        X = pd.DataFrame(X)
    first_values = [
        pd.Series(from_nested_to_2d_array(column).iloc[:, 0])
        for _, column in X.items()
    ]
    return pd.concat(first_values, axis=1)
def test_padding_fill_value_transformer():
    """Test full fill padding."""
    X_train, y_train = load_basic_motions(split="train", return_X_y=True)

    padder = PaddingTransformer(pad_length=120, fill_value=1)
    padded = padder.fit_transform(X_train)

    # Tabularized data has 6 dimensions, each padded to 120 long.
    tabular = from_nested_to_2d_array(padded)
    assert len(tabular.columns) == 120 * 6
def test_padding_transformer():
    """Test the dimensions after padding."""
    X_train, y_train = load_basic_motions(split="train", return_X_y=True)

    padder = PaddingTransformer()
    padded = padder.fit_transform(X_train)

    # Tabularized data has 6 dimensions, padded to their normal length of 100.
    tabular = from_nested_to_2d_array(padded)
    assert len(tabular.columns) == 100 * 6
def test_truncation_transformer():
    """Test truncation to the shortest series length."""
    X_train, y_train = load_basic_motions(split="train", return_X_y=True)

    truncator = TruncationTransformer(5)
    truncated = truncator.fit_transform(X_train)

    # Tabularized data has 6 dimensions, each truncated to 5 long.
    tabular = from_nested_to_2d_array(truncated)
    assert len(tabular.columns) == 5 * 6
def test_truncation_paramterised_transformer():
    """Test truncation to the a user defined length."""
    X_train, y_train = load_basic_motions(split="train", return_X_y=True)

    truncator = TruncationTransformer(2, 10)
    truncated = truncator.fit_transform(X_train)

    # Tabularized data has 6 dimensions, each truncated to (10-2) long.
    tabular = from_nested_to_2d_array(truncated)
    assert len(tabular.columns) == 8 * 6
def test_dft_mft(use_fallback_dft, norm):
    """Check the MFT against a single and a windowed DFT transformation."""
    X, y = load_gunpoint(split="train", return_X_y=True)
    X_tab = from_nested_to_2d_array(X, return_numpy=True)
    first_series = X_tab[0]

    word_length = 6
    alphabet_size = 4

    # --- single DFT transformation: the window spans the whole series ---
    window_size = np.shape(X_tab)[1]
    sfa = SFA(
        word_length=6,
        alphabet_size=4,
        window_size=window_size,
        norm=norm,
        use_fallback_dft=use_fallback_dft,
    ).fit(X, y)

    if use_fallback_dft:
        dft = sfa._discrete_fourier_transform(first_series, word_length, norm, 1, True)
    else:
        dft = sfa._fast_fourier_transform(first_series)

    mft = sfa._mft(first_series)
    assert (mft - dft < 0.0001).all()

    # --- windowed DFT transformation ---
    window_size = 140
    sfa = SFA(
        word_length=word_length,
        alphabet_size=alphabet_size,
        window_size=window_size,
        norm=norm,
        use_fallback_dft=use_fallback_dft,
    ).fit(X, y)

    mft = sfa._mft(first_series)
    for start in range(len(first_series) - window_size + 1):
        window = X_tab[0, start : window_size + start]
        if use_fallback_dft:
            dft = sfa._discrete_fourier_transform(window, word_length, norm, 1, True)
        else:
            dft = sfa._fast_fourier_transform(window)
        assert (mft[start] - dft < 0.001).all()

    assert len(mft) == len(first_series) - window_size + 1
    assert len(mft[0]) == word_length
def test_tsfresh_extractor(default_fc_parameters):
    """Compare tsfresh "__mean" features against a manual row-wise mean."""
    X, y = make_classification_problem()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    extractor = TSFreshFeatureExtractor(
        default_fc_parameters=default_fc_parameters, disable_progressbar=True
    )
    Xt = extractor.fit_transform(X_train, y_train)

    actual = Xt.filter(like="__mean", axis=1).values.ravel()
    expected = from_nested_to_2d_array(X_train).mean(axis=1).values

    # Sanity check on the reference computation itself.
    assert expected[0] == X_train.iloc[0, 0].mean()
    np.testing.assert_allclose(actual, expected)
def test_pca_results(n_components):
    """PCATransformer must reproduce sklearn's PCA on the same data."""
    np.random.seed(42)

    # Reference: plain sklearn PCA on a tabular frame.
    X = pd.DataFrame(data=np.random.randn(10, 5))
    expected = PCA(n_components=n_components).fit_transform(X)

    # sktime: identical data in nested format.
    nested = from_2d_array_to_nested(X)
    actual = PCATransformer(n_components=n_components).fit_transform(nested)

    assert np.allclose(
        np.asarray(expected), np.asarray(from_nested_to_2d_array(actual))
    )
def transform(self, X, y=None):
    """Compute line-gradient features for every column of a nested frame.

    Parameters
    ----------
    X : a pandas dataframe of shape = [n_samples, num_dims]
        The training input samples.
    y : ignored, exists for API consistency.

    Returns
    -------
    df: a pandas data frame of shape = [num_intervals, num_dims]
    """
    # Check the data
    self.check_is_fitted()
    X = check_X(X, coerce_to_pandas=True)

    # Get information about the dataframe
    n_timepoints = len(X.iloc[0, 0])
    num_instances = X.shape[0]
    col_names = X.columns

    self._check_parameters(n_timepoints)

    df = pd.DataFrame()

    for col in col_names:
        # Convert one of the columns in the dataframe to a numpy array
        arr = from_nested_to_2d_array(pd.DataFrame(X[col]), return_numpy=True)

        # Calculate gradients per instance.
        # BUGFIX: the original inner loop variable was named `y`, shadowing
        # the `y` parameter of this method; renamed to `row`.
        transformed = np.asarray(
            [self._get_gradients_of_lines(arr[row]) for row in range(num_instances)]
        )

        # Add the gradient series for this column to the output frame.
        df[col] = [pd.Series(inst) for inst in transformed]

    return df
def plot_cluster_algorithm(model: BaseClusterer, predict_series: NumpyOrDF, k: int):
    """
    Method that is used to plot a clustering algorithms output

    Parameters
    ----------
    model: BaseClusterer
        Clustering model to plot

    predict_series: Numpy or Dataframe
        The series to predict the values for

    k: int
        Number of centers
    """
    _check_soft_dependencies("matplotlib")
    import matplotlib.pyplot as plt
    import matplotlib.patches as mpatches

    # Nested frames are flattened to a plain 2d numpy panel first.
    if isinstance(predict_series, pd.DataFrame):
        predict_series = from_nested_to_2d_array(predict_series, return_numpy=True)

    plt.figure(figsize=(5, 10))
    plt.rcParams["figure.dpi"] = 100

    cluster_indexes = model.predict(predict_series)
    cluster_centers = model.get_centers()
    grouped_series = TimeSeriesLloydsPartitioning.get_cluster_values(
        cluster_indexes, predict_series, k
    )

    fig, axes = plt.subplots(nrows=k, ncols=1)
    for cluster in range(k):
        _plot(grouped_series[cluster], cluster_centers[cluster], axes[cluster])

    blue_patch = mpatches.Patch(color="blue", label="Series that belong to the cluster")
    red_patch = mpatches.Patch(color="red", label="Cluster centers")
    plt.legend(
        handles=[red_patch, blue_patch],
        loc="upper center",
        bbox_to_anchor=(0.5, -0.40),
        fancybox=True,
        shadow=True,
        ncol=5,
    )
    plt.tight_layout()
    plt.show()
def _perform_paa_along_dim(self, X):
    """Apply Piecewise Aggregate Approximation along one dimension.

    Each series is compressed into ``self.num_intervals`` frames; frames
    may have a fractional length, so boundary values are split between
    adjacent frames in proportion to how much of the value falls in each.
    Returns a one-column DataFrame of pd.Series (one per instance).
    """
    X = from_nested_to_2d_array(X, return_numpy=True)
    num_atts = X.shape[1]
    num_insts = X.shape[0]
    dims = pd.DataFrame()
    data = []

    for i in range(num_insts):
        series = X[i, :]
        frames = []
        current_frame = 0
        # current_frame_size may become fractional when a value straddles
        # two frames.
        current_frame_size = 0
        frame_length = num_atts / self.num_intervals
        frame_sum = 0

        for n in range(num_atts):
            remaining = frame_length - current_frame_size
            if remaining > 1:
                # Whole value fits in the current frame.
                frame_sum += series[n]
                current_frame_size += 1
            else:
                # Only `remaining` of this value belongs to the current frame.
                frame_sum += remaining * series[n]
                current_frame_size += remaining

            if current_frame_size == frame_length:
                # Frame complete: emit its mean and carry the leftover
                # fraction of series[n] into the next frame.
                frames.append(frame_sum / frame_length)
                current_frame += 1
                frame_sum = (1 - remaining) * series[n]
                current_frame_size = 1 - remaining

        # if the last frame was lost due to double imprecision
        if current_frame == self.num_intervals - 1:
            frames.append(frame_sum / frame_length)

        data.append(pd.Series(frames))

    dims[0] = data
    return dims
def transform(self, X, y=None):
    """Extract wavelet coefficients from every column of a nested frame.

    Parameters
    ----------
    X : a pandas dataframe of shape = [n_samples, num_dims]
        The training input samples.
    y : ignored, exists for API consistency.

    Returns
    -------
    dims: a pandas data frame of shape = [n_samples, num_dims]
    """
    # Check the data
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=False, coerce_to_pandas=True)

    self._check_parameters()

    df = pd.DataFrame()
    for col in X.columns:
        # Tabularise this column, then compute its wavelet coefficients.
        col_array = from_nested_to_2d_array(pd.DataFrame(X[col]), return_numpy=True)
        coeffs = np.asarray(self._extract_wavelet_coefficients(col_array))
        df[col] = [pd.Series(instance) for instance in coeffs]
    return df
def transform(self, X, y=None):
    """Transform nested pandas dataframe into tabular dataframe.

    Parameters
    ----------
    X : pandas DataFrame
        Nested dataframe with pandas series or numpy arrays in cells.
    y : array-like, optional (default=None)

    Returns
    -------
    Xt : pandas DataFrame
        Transformed dataframe with only primitives in cells.
    """
    self.check_is_fitted()
    X = check_X(X)
    # Dispatch on the container type: nested frame vs 3d numpy panel.
    if isinstance(X, pd.DataFrame):
        return from_nested_to_2d_array(X)
    return from_3d_numpy_to_2d_array(X)
def _transform_single_feature(self, X, feature):
    """transforms data into a specified catch22 feature

    Parameters
    ----------
    X : pandas DataFrame, input time series
    feature : int, catch22 feature id or String, catch22 feature name.

    Returns
    -------
    Numpy array containing a catch22 feature for each input series
    """
    # Resolve `feature` to an integer id in [0, 21].
    # BUGFIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # use the abstract np.floating type instead. The two isinstance calls
    # are also merged into one.
    if isinstance(feature, (int, np.integer, float, np.floating)):
        if feature > 21 or feature < 0:
            raise ValueError("Invalid catch22 feature ID")
    elif isinstance(feature, str):
        if feature in feature_names:
            feature = feature_names.index(feature)
        else:
            raise ValueError("Invalid catch22 feature name")
    else:
        raise ValueError("catch22 feature name or ID required")

    if isinstance(X, pd.DataFrame):
        X = from_nested_to_2d_array(X, return_numpy=True)

    n_instances = X.shape[0]
    X = np.reshape(X, (n_instances, -1))

    # Compute the single feature per series, in parallel.
    c22_list = Parallel(n_jobs=self.n_jobs)(
        delayed(self._transform_case_single)(
            X[i],
            feature,
        )
        for i in range(n_instances)
    )

    return np.asarray(c22_list)
def predict(self, X: NumpyOrDF, y=None) -> NumpyArray:
    """
    Return cluster center index for data samples.

    Parameters
    ----------
    X: 2D np.array with shape (n_instances, n_timepoints) or pd.DataFrame
        in nested format
        panel of time series to cluster

    y: ignored, exists for API consistency reasons

    Returns
    -------
    Numpy_Array: 1D np.array of length n_instances
        Index of the cluster each sample belongs to
    """
    self.check_is_fitted()

    # Nested frames are flattened to a plain 2d numpy panel first.
    data = (
        from_nested_to_2d_array(X, return_numpy=True)
        if isinstance(X, pd.DataFrame)
        else X
    )

    self._check_params(data)
    return self._predict(data)
def fit(self, X: NumpyOrDF, y=None):
    """
    Fit the clustering algorithm on the dataset X

    Parameters
    ----------
    X: 2D np.array with shape (n_instances, n_timepoints) or pd.DataFrame
        in nested format
        panel of univariate time series to train the clustering model on

    y: ignored, exists for API consistency reasons

    Returns
    -------
    reference to self
    """
    # Nested frames are flattened to a plain 2d numpy panel first.
    data = (
        from_nested_to_2d_array(X, return_numpy=True)
        if isinstance(X, pd.DataFrame)
        else X
    )

    self._check_params(data)
    self._fit(data)
    self._is_fitted = True
    return self
def transform_single_feature(self, X, feature, case_id=None):
    """Transform data into a specified catch22 feature.

    Parameters
    ----------
    X : pandas DataFrame, input time series.
    feature : int, catch22 feature id or String, catch22 feature name.
    case_id : int, identifier for the current set of cases. If the case_id
        is not None and the same as the previously used case_id,
        calculations from previous features will be reused.

    Returns
    -------
    Numpy array containing a catch22 feature for each input series.
    """
    # Resolve `feature` to an integer id in [0, 21].
    # BUGFIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # use the abstract np.floating type instead. The two isinstance calls
    # are also merged into one.
    if isinstance(feature, (int, np.integer, float, np.floating)):
        if feature > 21 or feature < 0:
            raise ValueError("Invalid catch22 feature ID")
    elif isinstance(feature, str):
        if feature in feature_names:
            feature = feature_names.index(feature)
        else:
            raise ValueError("Invalid catch22 feature name")
    else:
        raise ValueError("catch22 feature name or ID required")

    if isinstance(X, pd.DataFrame):
        X = from_nested_to_2d_array(X, return_numpy=True)

    n_instances = X.shape[0]
    X = np.reshape(X, (n_instances, -1))
    series_length = X.shape[1]

    if case_id is not None:
        if case_id != self._case_id:
            # New case set: remember its shape and reset the per-instance
            # caches that later features may reuse.
            self._case_id = case_id
            self._st_n_instances = n_instances
            self._st_series_length = series_length
            self._outlier_series = [None] * n_instances
            self._smin = [None] * n_instances
            self._smax = [None] * n_instances
            self._smean = [None] * n_instances
            self._fft = [None] * n_instances
            self._ac = [None] * n_instances
            self._acfz = [None] * n_instances
        else:
            # Same case_id: the data must match what was cached.
            if (n_instances != self._st_n_instances
                    or series_length != self._st_series_length):
                raise ValueError(
                    "Catch22: case_is the same, but n_instances and "
                    "series_length do not match last seen for single "
                    "feature transform.")

    # Compute the single feature per series, in parallel.
    c22_list = Parallel(n_jobs=self.n_jobs)(
        delayed(self._transform_case_single)(
            X[i],
            feature,
            case_id,
            i,
        )
        for i in range(n_instances))

    if self.replace_nans:
        c22_list = np.nan_to_num(c22_list, False, 0, 0, 0)

    return np.asarray(c22_list)