def test_biased_vs_unbiased(self):
    fname = "example_data.txt"
    unbiased_ticc = TICC(window_size=1, number_of_clusters=8, lambda_parameter=11e-2,
                         beta=600, maxIters=100, threshold=2e-5, write_out_file=False,
                         prefix_string="output_folder/", num_proc=1)
    (unbiased_cluster_assignment, unbiased_cluster_MRFs) = unbiased_ticc.fit(input_file=fname)

    biased_ticc = TICC(window_size=1, number_of_clusters=8, lambda_parameter=11e-2,
                       beta=600, maxIters=100, threshold=2e-5, write_out_file=False,
                       prefix_string="output_folder/", num_proc=1, biased=True)
    (biased_cluster_assignment, biased_cluster_MRFs) = biased_ticc.fit(input_file=fname)

    np.testing.assert_array_equal(
        np.array(biased_cluster_assignment),
        np.array(unbiased_cluster_assignment),
        "Biased assignment is not equal to unbiased assignment!")
def test_multiExample(self):
    fname = "example_data.txt"
    ticc = TICC(window_size=5, number_of_clusters=5, lambda_parameter=11e-2,
                beta=600, maxIters=100, threshold=2e-5, write_out_file=False,
                prefix_string="output_folder/", num_proc=1)
    (cluster_assignment, cluster_MRFs) = ticc.fit(input_file=fname)

    assign = np.loadtxt("UnitTest_Data/multiResults.txt")
    val = abs(assign - cluster_assignment)
    self.assertEqual(sum(val), 0)

    for i in range(5):
        mrf = np.loadtxt("UnitTest_Data/multiCluster_" + str(i) + ".txt",
                         delimiter=',')
        np.testing.assert_array_almost_equal(mrf, cluster_MRFs[i], decimal=3)
def test_example(self):
    fname = "example_data.txt"
    ticc = TICC(window_size=1, number_of_clusters=8, lambda_parameter=11e-2,
                beta=600, maxIters=100, threshold=2e-5, write_out_file=False,
                prefix_string="output_folder/", num_proc=1)
    (cluster_assignment, cluster_MRFs) = ticc.fit(input_file=fname)

    assign = np.loadtxt("UnitTest_Data/Results.txt")
    val = abs(assign - cluster_assignment)
    self.assertEqual(sum(val), 0)

    # Test that prediction works on a batch of data outside of the `fit`
    # method. Perhaps there is a better way to run these in parallel, so they
    # read more like unit tests than integration tests?
    test_batch = ticc.predict_clusters(
        ticc.trained_model['complete_D_train'][0:1000, ])
    batch_val = abs(test_batch - cluster_assignment[0:1000])
    self.assertEqual(sum(batch_val), 0)

    # Test streaming by passing in 5-row blocks at a time (the current
    # timestamp plus the previous 4). Training on the whole set and then
    # reusing the trained model while streaming leaks data, but that is
    # acceptable for testing the code.
    # TODO: figure out why larger blocks don't improve predictions more. See:
    # https://github.com/davidhallac/TICC/issues/18#issuecomment-384514116
    def test_streaming(block_size):
        test_stream = np.zeros(1000)
        test_stream[0:block_size] = cluster_assignment[0:block_size]
        for i in range(block_size, 1000):
            point = ticc.trained_model['complete_D_train'][i - block_size:i, ]
            test_stream[i] = ticc.predict_clusters(point)[block_size - 1]
        percent_correct_streaming = 100 * sum(
            cluster_assignment[0:1000] == test_stream) / 1000.0
        # percent_correct_streaming is a percentage (0-100): require at least
        # 90% of the streamed predictions to match the batch assignment.
        self.assertGreater(percent_correct_streaming, 90)

    test_streaming(5)

    for i in range(8):
        mrf = np.loadtxt("UnitTest_Data/cluster_" + str(i) + ".txt",
                         delimiter=',')
        np.testing.assert_array_almost_equal(mrf, cluster_MRFs[i], decimal=3)
def test_failed_unbiased(self):
    with self.assertRaises(Exception) as context:
        # TICC will fail in iteration 2, because cluster 9 has only one
        # observation.
        fname = "example_data.txt"
        ticc = TICC(window_size=1, number_of_clusters=50, lambda_parameter=11e-2,
                    beta=600, maxIters=100, threshold=2e-5, write_out_file=False,
                    prefix_string="output_folder/", num_proc=1)
        (cluster_assignment, cluster_MRFs) = ticc.fit(input_file=fname)
    self.assertIsNotNone(context.exception)
def run_ticc(data):
    ticc = TICC(window_size=1, number_of_clusters=2, lambda_parameter=11e-2,
                beta=600, maxIters=50, threshold=2e-15, write_out_file=True,
                prefix_string="ration_folder/", num_proc=1)
    (cluster_assignment, cluster_MRFs) = ticc.fit(input_file=data, rf=-1, rl=-1,
                                                  rational=True)

    # Collect the indices where the cluster label changes, i.e. the segment
    # boundaries.
    cuts = []
    for i in range(1, len(cluster_assignment)):
        if cluster_assignment[i] != cluster_assignment[i - 1]:
            cuts.append(i)
    return cuts
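# A minimal usage sketch for run_ticc above. The data file name and the print
# format are illustrative assumptions, not part of the original script.
if __name__ == '__main__':
    # run_ticc returns the indices where the cluster label switches, i.e. the
    # detected change points of the time series.
    cuts = run_ticc("example_data.txt")
    print("Detected %d change points: %s" % (len(cuts), cuts))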
def run_ticc(data, save_file):
    num_cluster = 4  # chickendance
    # num_cluster = 3  # sudden_cardiac, synthetic
    n = data.shape[1]
    ticc = TICC(window_size=5, number_of_clusters=num_cluster,
                lambda_parameter=11e-2, beta=600, maxIters=100, threshold=2e-15,
                write_out_file=True, prefix_string="ration_folder/", num_proc=1)
    (cluster_assignment, cluster_MRFs) = ticc.fit(input_file=data, rf=-1, rl=-1,
                                                  rational=True)
    cluster_score = get_importance_score(cluster_MRFs, n, num_cluster, save_file)
    return np.array(cluster_score), cluster_assignment
# maxiters 1k
# window size 1
# Inner loop of a hyperparameter sweep; assumes numclust, lambvals, fname and
# biclist are defined by the surrounding (commented-out) outer loops.
for betavals in range(0, 6000, 50):
    try:
        ticc = TICC(window_size=1, number_of_clusters=numclust,
                    lambda_parameter=lambvals, beta=betavals, maxIters=10,
                    threshold=2e-5, write_out_file=False,
                    prefix_string="output_folder/", num_proc=1)
        (cluster_assignment, cluster_MRFs, bic) = ticc.fit(input_file=fname)
        biclist.append((numclust, lambvals, betavals, bic))
    except Exception:
        print("Fail")

# for numclust in range(3, 10, 1):
#     for lambvals in np.linspace(5e-2, 9e-2, 4):
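# A hypothetical follow-up, not in the original sweep: once `biclist` has been
# filled, the setting with the lowest BIC can be recovered like this. Assumes
# every appended tuple has the form (numclust, lambvals, betavals, bic).
if biclist:
    best_numclust, best_lambda, best_beta, best_bic = min(biclist,
                                                          key=lambda t: t[3])
    print("Best BIC %.3f at clusters=%d, lambda=%.3f, beta=%d"
          % (best_bic, best_numclust, best_lambda, best_beta))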
# Fragment of a synthetic-data script; assumes a matplotlib subplot grid `a`
# and the signals x, y, y0 were created above.
a[1, 0].scatter(x, y[2] + y0)
a[1, 1].scatter(x, y[3] + y0)
a[2, 0].scatter(x, y[4] + y0)
# pyplot.bar(x, yz)

np.savetxt("Syn_TimeSeries2.csv",
           np.transpose([y[0, ], y[1, ], y[2, ], y[3, ], y[4, ]]),
           delimiter=',')
# np.savetxt('test.csv', x, delimiter=',')

# Time Series using Pandas
if __name__ == '__main__':
    fname = "Syn_TimeSeries2.csv"
    ticc = TICC(window_size=1, number_of_clusters=2, lambda_parameter=11e-2,
                beta=600, maxIters=1000, threshold=2e-5, write_out_file=False,
                prefix_string="output_folder/", num_proc=1)
    (cluster_assignment, cluster_MRFs) = ticc.fit(input_file=fname)
    print(cluster_assignment)
    # np.savetxt('Results2.txt', cluster_assignment, fmt='%d', delimiter=',')
    np.savetxt('Results_SynData.csv', cluster_assignment, fmt='%d',
               delimiter=',')
    # print(np.size(cluster_assignment))
from TICC_solver import TICC
import numpy as np

fname = "example_data.txt"
ticc = TICC(window_size=1, number_of_clusters=8, lambda_parameter=11e-2,
            beta=600, maxIters=100, threshold=2e-5, write_out_file=False,
            prefix_string="output_folder/", num_proc=1)
(cluster_assignment, cluster_MRFs) = ticc.fit(input_file=fname)

print(cluster_assignment)
np.savetxt('Results.txt', cluster_assignment, fmt='%d', delimiter=',')
def genera_cluster(cluster=0, window=0, p_lambda=0, beta=0, percorsoD="/",
                   percorsoS="/", percorsoG="", percorsoDN="", seed=102):
    percorsoS = percorsoS + "/TICC"
    if not os.path.exists(percorsoS):
        os.makedirs(percorsoS)

    # Read the dataset
    base_name = percorsoD
    file_name = base_name
    df, latitude, longitude, experiment = get_dataset(file_name)
    df = df.astype(float)

    ticc = TICC(window_size=int(window), number_of_clusters=int(cluster),
                lambda_parameter=float(p_lambda), beta=int(beta), seed=seed)
    print("[XM]> ========= Generating TICC clustering model ===========")
    cluster_assignment, cluster_MRFs, bic, aic, ll = ticc.fit(input_file=percorsoD)
    cluster_assignment = [int(item) for item in cluster_assignment]

    # TICC only assigns the last T - window + 1 points; pad the assignment back
    # to the original series length by repeating the first label.
    surplus = []
    if int(window) > 1:
        for i in range(int(window) - 1):
            surplus.append(cluster_assignment[0])
    cluster_assignment = surplus + cluster_assignment

    if percorsoG != "":
        # Ground-truth labels are available: evaluate the clustering.
        y = open(percorsoG, 'r').readlines()
        y = [float(item) for item in y]
        evaluation_names = ["Acc.", "CE", "F1", "Entropy", "Purity", "NbClust."]
        df_evaluation = pd.DataFrame()
        df_evaluation["clusters_found"] = list(cluster_assignment)
        df_evaluation["clusters_hidden"] = list(y)
        clusters_found = df_evaluation["clusters_found"]
        clusters_hidden = df_evaluation["clusters_hidden"]
        evaluation_temp = generate_evaluation(clusters_found, clusters_hidden)

        print("[XM]> Generating results files")
        fileClusters = open(percorsoS + "/cl.txt", "w")
        for item in clusters_found:
            fileClusters.write("%s\n" % item)
        fileClusters.close()
        print("[XM]> Clustering generated")

        # Write each cluster's MRF (model parameters) to its own file.
        coordinate = cluster_MRFs.values()
        for cl, mat in enumerate(coordinate):
            centri = open(percorsoS + "/model_parameters" + str(cl) + ".txt", "w")
            mat = pd.DataFrame(mat).to_string()
            centri.write("{}".format(mat))
            centri.close()
        print("[XM]> Model parameters generated")

        general_info(righe=df.shape[0], colonne=df.shape[1], clust=cluster,
                     window=window, p_lambda=p_lambda, beta=beta, bic=bic,
                     aic=aic, ll=ll, percorsoD=percorsoD, percorsoS=percorsoS,
                     accuracy_local=evaluation_temp[0],
                     ce_local=evaluation_temp[1],
                     f1_local=evaluation_temp[2],
                     entropy_local=evaluation_temp[3],
                     purity_local=evaluation_temp[4],
                     nb_clusters_found=evaluation_temp[5])

        if percorsoDN != "":
            fileDN = open(percorsoDN, 'r')
            fileDND = open(percorsoS + "/dataStandardization.csv", 'w')
            for line in fileDN:
                fileDND.write(line)
            fileDN.close()
            fileDND.close()
    else:
        # No ground truth: write out the clustering and model parameters only.
        df_evaluation = pd.DataFrame()
        df_evaluation["clusters_found"] = list(cluster_assignment)
        clusters_found = df_evaluation["clusters_found"]

        print("[XM]> Generating results files")
        fileClusters = open(percorsoS + "/cl.txt", "w")
        for item in clusters_found:
            fileClusters.write("%s\n" % item)
        fileClusters.close()
        print("[XM]> Clustering generated")

        coordinate = cluster_MRFs.values()
        for cl, mat in enumerate(coordinate):
            centri = open(percorsoS + "/model_parameters" + str(cl) + ".txt", "w")
            mat = pd.DataFrame(mat).to_string()
            centri.write("{}".format(mat))
            centri.close()
        print("[XM]> Model parameters generated")

        general_info(righe=df.shape[0], colonne=df.shape[1], clust=cluster,
                     window=window, p_lambda=p_lambda, beta=beta, bic=bic,
                     aic=aic, ll=ll, percorsoD=percorsoD, percorsoS=percorsoS,
                     accuracy_local="NA", ce_local="NA", f1_local="NA",
                     entropy_local="NA", purity_local="NA",
                     nb_clusters_found=np.unique(clusters_found).size)

        if percorsoDN != "":
            fileDN = open(percorsoDN, 'r')
            fileDND = open(percorsoS + "/dataStandardization.csv", 'w')
            for line in fileDN:
                fileDND.write(line)
            fileDN.close()
            fileDND.close()
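# A hypothetical call to genera_cluster, for illustration only: every path and
# hyperparameter below is an assumption, not taken from the original code.
if __name__ == "__main__":
    genera_cluster(cluster=4, window=5, p_lambda=0.11, beta=600,
                   percorsoD="data/input.csv",  # dataset to cluster
                   percorsoS="results",         # a /TICC subfolder is created here
                   percorsoG="")                # no ground-truth labels: skip evaluation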