def test_matches_original_paper_macro_F1(self):
    """TICC on synthetic data should beat the macro-F1 thresholds
    reported by the original paper's implementation, for several
    segment-label sequences."""
    # Each case: (n_features, segment label sequence, macro F1 to beat)
    cases = [
        (5, [0, 1, 0], 0.9),
        (5, [0, 1, 2, 1, 0], 0.9),
        (5, [0, 1, 2, 3, 0, 1, 2, 3], 0.9),
        (5, [0, 1, 1, 0, 2, 2, 2, 0], 0.9),
    ]
    for n_features, seq, threshold in cases:
        generator = RandomData(0, n_features, window_size=5)
        # Original paper code performs at 100p/cluster!
        per_segment = 120
        n_clusters = len(set(seq))
        n_segments = len(seq)
        # Total time-series length, split into equal-sized segments.
        total_len = per_segment * n_clusters * n_segments
        breaks = [total_len * i // n_segments
                  for i in range(1, n_segments + 1)]
        X, y_tru = generator.generate_points(seq, breaks)
        model = TICC(n_clusters=n_clusters, window_size=5, n_jobs=4,
                     random_state=0)
        y = model.fit_predict(X)
        # best_f1: label-to-segment assignment is arbitrary, so score
        # against the best matching of predicted to true labels.
        score = best_f1(y_tru, y, average='macro')
        assert score > threshold
# %% Generate data n_features = 5 label_seq = [0, 1, 2, 0, 2, 1] samples_per_segment = 250 window_size = 8 # Derived from above params k = len(set(label_seq)) # Num clusters t = samples_per_segment * len(label_seq) # total ts length breaks = [i * t // len(label_seq) for i in range(1, len(label_seq) + 1)] palette = {n: c['color'] for n, c in zip(range(n_features), colors)} randomdata = RandomData(seed=1234, n_features=n_features, window_size=window_size) X, y_true = randomdata.generate_points(label_seq, breaks) # Plot Synthetic Data plot_synthetic_data(X, breaks) # %% Fit TICC and GMM to data scaler = StandardScaler() X_scaled = scaler.fit_transform(X) ticc = TICC(n_clusters=k, window_size=window_size, random_state=1234, beta=200) gmm = GaussianMixture(n_components=k, random_state=1234) X_stacked = ticc.stack_data(X_scaled) y_ticc = ticc.fit_predict(X) y_gmm = gmm.fit_predict(X_stacked) # Macro F1 Scores