def test_pass_if_consistent_on_similar_random_data(self):
    test_cases = [
        (5, [0, 1, 0], 5),
        (5, [0, 1, 2, 1, 0], 5),
    ]
    for n, labels, repeats in test_cases:
        rdata = RandomData(seed=0, n_features=n, window_size=5)
        k = len(set(labels))  # number of clusters
        t = 200 * k * len(labels)  # total time-series length
        breaks = [i * t // len(labels) for i, _ in enumerate(labels, 1)]
        rdata.generate_cluster_params(k)
        # Reuse the same cluster parameters for each dataset
        data = [
            rdata.generate_points(labels, breaks, True)[0]
            for _ in range(repeats)
        ]
        ticc = TICC(n_clusters=k, window_size=5, beta=300, n_jobs=4,
                    random_state=0, cluster_reassignment=0.3, verbose=True)
        y_preds = [ticc.fit_predict(X) for X in data]
        for y1, y2 in combinations(y_preds, 2):
            # Fraction of timestamps where the two labelings disagree
            result = np.sum(np.not_equal(y1, y2)) / t
            assert result < 0.02
def test_example(self):
    X = np.loadtxt("test_data/example_data.txt", delimiter=",")
    ticc = TICC(n_clusters=8, window_size=1, lambda_parameter=11e-2,
                beta=600, max_iter=100, n_jobs=4, random_state=102,
                verbose=True)
    cluster_assignment = ticc.fit_predict(X)
    clusters = ticc.clusters_
    # np.savetxt("UnitTest_Data/Results.txt", cluster_assignment, fmt='%d', delimiter=",")
    assign = np.loadtxt("test_data/Results.txt")
    val = abs(assign - cluster_assignment)
    self.assertEqual(sum(val), 0)

    # Test that prediction works on a batch of data outside the `fit`
    # method. Perhaps there is a better way to run these in parallel so
    # they are more like unit tests than integration tests?
    batch_labels = ticc.predict(X[0:999, ])
    # np.savetxt("UnitTest_Data/batchLabels.txt", batch_labels, fmt="%d", delimiter=',')
    batch_val = abs(batch_labels - cluster_assignment[:999])
    self.assertEqual(sum(batch_val), 0)

    # Test streaming by passing in 5-row blocks at a time (the current
    # timestamp and the previous 4). Training on the whole set and then
    # reusing the trained model while streaming leaks data, but that is
    # acceptable for exercising the code path.
    # TODO: figure out why larger blocks don't improve predictions more.
    # Reference:
    # https://github.com/davidhallac/TICC/issues/18#issuecomment-384514116
    def test_streaming(block_size):
        test_stream = np.zeros(1000)
        test_stream[0:block_size] = cluster_assignment[0:block_size]
        for i in range(block_size, 1000):
            point = X[i - block_size:i, ]
            test_stream[i] = ticc.predict(point)[block_size - 1]
        percent_correct_streaming = 100 * sum(
            cluster_assignment[:1000] == test_stream) / 1000.0
        # This value is on a 0-100 scale, so compare against 90;
        # the earlier 0.9 threshold was a unit mistake.
        self.assertGreater(percent_correct_streaming, 90)

    test_streaming(5)

    for i in range(8):
        # np.savetxt(f"UnitTest_Data/cluster_{i}.txt", clusterMRFs[i], fmt='%.4e', delimiter=',')
        MRF = np.loadtxt(f"test_data/cluster_{i}.txt", delimiter=',')
        np.testing.assert_array_almost_equal(MRF, clusters[i].MRF_,
                                             decimal=3)
def test_fit_predict(self):
    rdata = RandomData(0, 5, 5)
    labels = [0, 1, 2, 3, 4, 5]
    breaks = [i * 1200 // len(labels) for i, _ in enumerate(labels, 1)]
    X, _ = rdata.generate_points(labels, breaks)
    ticc = TICC(n_clusters=6, window_size=5, beta=0, n_jobs=4,
                verbose=True, random_state=0)
    # fit followed by predict on the same data must match fit_predict
    A = ticc.fit(X).predict(X)
    B = ticc.fit_predict(X)
    np.testing.assert_array_equal(A, B)
def test_matches_original_paper_macro_F1(self):
    test_cases = [
        # n_features, label order, macro F1 to beat
        (5, [0, 1, 0], 0.9),
        (5, [0, 1, 2, 1, 0], 0.9),
        (5, [0, 1, 2, 3, 0, 1, 2, 3], 0.9),
        (5, [0, 1, 1, 0, 2, 2, 2, 0], 0.9),
    ]
    for n, labels, expected in test_cases:
        rdata = RandomData(0, n, window_size=5)
        # The original paper's code achieves this with 100 samples per cluster
        samples_per_segment = 120
        k = len(set(labels))  # number of clusters
        t = samples_per_segment * k * len(labels)  # total time-series length
        breaks = [i * t // len(labels) for i, _ in enumerate(labels, 1)]
        X, y_tru = rdata.generate_points(labels, breaks)
        ticc = TICC(n_clusters=k, window_size=5, n_jobs=4, random_state=0)
        y = ticc.fit_predict(X)
        # We use best_f1 because label:segment assignments are arbitrary
        result = best_f1(y_tru, y, average='macro')
        assert result > expected
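# NOTE: `best_f1` is imported from the ticclib test helpers. Because TICC's
# cluster indices are arbitrary, it scores predictions after choosing the
# label mapping that maximises F1. A minimal sketch of the idea, for
# reference only; `_best_f1_sketch` is a hypothetical name and not
# ticclib's actual implementation, which may differ (e.g. Hungarian
# matching for large k):
def _best_f1_sketch(y_true, y_pred, average='macro'):
    from itertools import permutations
    from sklearn.metrics import f1_score
    labels = sorted(set(y_true))
    best = 0.0
    # Brute-force over label permutations; factorial in k, so only
    # viable for the small cluster counts used in these tests.
    for perm in permutations(labels):
        mapping = dict(zip(labels, perm))
        remapped = [mapping.get(y, y) for y in y_pred]
        best = max(best, f1_score(y_true, remapped, average=average))
    return best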
def test_label_consistency_w_different_seeds(self):
    test_cases = [
        # seed1, seed2, n_features, label order
        (0, 1, 5, [0, 1, 2]),
        (3, 2, 5, [0, 1, 2, 1, 0]),
        (0, 1, 5, [0, 1, 2, 3, 0, 1, 2, 3]),
        (0, 9, 5, [0, 1, 1, 0, 2, 2, 2, 0]),
    ]
    for s1, s2, n, labels in test_cases:
        rdata = RandomData(0, n, window_size=5)
        k = len(set(labels))  # number of clusters
        t = 200 * k * len(labels)  # total time-series length
        breaks = [i * t // len(labels) for i, _ in enumerate(labels, 1)]
        X, _ = rdata.generate_points(labels, breaks)
        ticc1 = TICC(n_clusters=k, window_size=5, n_jobs=4, random_state=s1)
        ticc2 = TICC(n_clusters=k, window_size=5, n_jobs=4, random_state=s2)
        y1 = ticc1.fit_predict(X)
        y2 = ticc2.fit_predict(X)
        # Different seeds should converge to identical labelings
        np.testing.assert_array_equal(y1, y2)
def test_multiExample(self):
    X = np.loadtxt("test_data/example_data.txt", delimiter=",")
    ticc = TICC(n_clusters=5, window_size=5, lambda_parameter=11e-2,
                beta=600, max_iter=100, n_jobs=4, random_state=102,
                verbose=True)
    cluster_assignment = ticc.fit_predict(X)
    clusters = ticc.clusters_
    # np.savetxt("UnitTest_Data/multiResults.txt", cluster_assignment, fmt='%d', delimiter=',')
    assign = np.loadtxt("test_data/multiResults.txt")
    val = abs(assign - cluster_assignment)
    self.assertEqual(sum(val), 0)
    for i in range(5):
        # np.savetxt(f"UnitTest_Data/multiCluster_{i}.txt", clusterMRFs[i], fmt='%.4e', delimiter=",")
        MRF = np.loadtxt(f"test_data/multiCluster_{i}.txt", delimiter=',')
        np.testing.assert_array_almost_equal(MRF, clusters[i].MRF_,
                                             decimal=3)
def test_score_increases_with_f1(self):
    rdata = RandomData(0, 5, 5)
    samples_per_segment = 300
    labels = [0, 1, 2, 3]
    k = len(set(labels))  # number of clusters
    t = samples_per_segment * k * len(labels)  # total time-series length
    breaks = [i * t // len(labels) for i, _ in enumerate(labels, 1)]
    X, y_true = rdata.generate_points(labels, breaks)
    params = ParameterGrid({
        'beta': [0, 500, 1000],
        'window_size': [1, 5, 10],
    })
    models = (TICC(**p, random_state=0) for p in params)
    models = Parallel(n_jobs=-1)(
        delayed(lambda x: x.fit(X))(m) for m in models)
    scores = [model.score(X) for model in models]
    f1_scores = [best_f1(y_true, model.predict(X)) for model in models]
    # The model with the best score should also have the best F1
    assert scores.index(max(scores)) == f1_scores.index(max(f1_scores))
def test_empty_cluster_handling(self):
    # Check that no error is raised while handling empty clusters
    X = np.load('test_data/example_empty_clusters.npy')
    ticc = TICC(n_clusters=4, window_size=5, n_jobs=4, random_state=0)
    ticc.fit(X)
k = len(set(label_seq))  # number of clusters
t = samples_per_segment * len(label_seq)  # total time-series length
breaks = [i * t // len(label_seq) for i in range(1, len(label_seq) + 1)]
palette = {n: c['color'] for n, c in zip(range(n_features), colors)}
randomdata = RandomData(seed=1234, n_features=n_features,
                        window_size=window_size)
X, y_true = randomdata.generate_points(label_seq, breaks)

# Plot Synthetic Data
plot_synthetic_data(X, breaks)

# %% Fit TICC and GMM to data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
ticc = TICC(n_clusters=k, window_size=window_size, random_state=1234,
            beta=200)
gmm = GaussianMixture(n_components=k, random_state=1234)
X_stacked = ticc.stack_data(X_scaled)
y_ticc = ticc.fit_predict(X_scaled)  # fit on the scaled data, like the GMM
y_gmm = gmm.fit_predict(X_stacked)

# Macro F1 Scores
f1_ticc = f1_score(y_true, y_ticc, average='macro')
f1_gmm = f1_score(y_true, y_gmm, average='macro')
print(f"TICC F1 score = {f1_ticc}\nGMM F1 score = {f1_gmm}")

# %% Plot Cluster Assignments
fig, axes = plt.subplots(3, sharex=True, figsize=(14, 8))
axes[0].plot(y_true, color=palette[0], label='Ground Truth')
axes[1].plot(y_ticc, color=palette[1], label='TICC')
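# A plausible completion of the figure, assumed from the pattern above;
# the third panel, legends, and axis label are not in the original source:
axes[2].plot(y_gmm, color=palette[2], label='GMM')
for ax in axes:
    ax.legend(loc='upper right')
axes[2].set_xlabel('Time')
plt.show()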
def test_ticc_sklearn_compatibility():
    # NOTE: window_size=1 ensures the input is unchanged by stacking.
    # max_iter=15 and beta=0 speed up test completion.
    ticc = TICC(n_clusters=3, window_size=1, max_iter=15, beta=0, n_jobs=-1)
    return check_estimator(ticc)