def clustering_pro(args):
    """Run the full clustering pipeline driven by command-line *args*.

    Loads sampled (training) traces and validation traces, clusters the
    feature matrix X, then writes models/statistics via compute_statistics.

    Exits early (status 0, by existing convention) when there are not
    strictly more instances than requested clusters.
    """
    print(args.output_folder)
    lib.init_dir(args.output_folder)

    # Collect feature matrix X and the traces it was derived from.
    (X, generated_traces, additional_val_traces, method_list) = parse_sampled_traces(
        args.generated_traces_folder, 'd')
    print("Training data:", len(generated_traces))

    # Read validation traces and merge in the extra ones found during sampling.
    validation_traces = parse_validation_traces(args.validation_traces_folder, 'seed_')
    validation_traces |= additional_val_traces
    print("Validating:", len(validation_traces), "data")

    number_of_clusters = args.num_cluster
    # Clustering requires strictly more instances than clusters.
    if len(X) <= number_of_clusters:
        print("WARNING: number of clusters must be < number of instances!",
              args.num_cluster, len(X))
        sys.exit(0)
    print("Length of X", len(X))

    # Cluster, then emit FSM/DFA models and statistics.
    estimator = do_clustering(args, X)
    compute_statistics(X, method_list, args, estimator, generated_traces,
                       validation_traces)
def when_ending_method_available(ending_methods, fsm, output_folder, make_dfa=False):
    """Extend *fsm* with ending states and persist the extended models.

    Writes fsm.txt/dfa.txt (and mindfa.txt when *make_dfa* is True) plus
    their dot renderings under <output_folder>/extended_endings_fsm.
    Files are opened with context managers so handles are always closed
    (the original used open(...).write(...), leaking the handles).
    """
    extended_fsm_dir = output_folder + '/extended_endings_fsm'
    lib.init_dir(extended_fsm_dir)

    extended_fsm = extending_ending_states(fsm, ending_methods)
    with open(extended_fsm_dir + '/fsm.txt', 'w') as f:
        f.write(extended_fsm.to_string())
    drawing_dot(extended_fsm, extended_fsm_dir + '/fsm')

    extended_dfa = extended_fsm.nfa2dfa()
    with open(extended_fsm_dir + '/dfa.txt', 'w') as f:
        f.write(extended_dfa.to_string())
    drawing_dot(extended_dfa, extended_fsm_dir + '/dfa')

    if make_dfa:
        extended_mindfa = graph_lib.minimize_dfa(extended_dfa)
        with open(extended_fsm_dir + '/mindfa.txt', 'w') as f:
            f.write(extended_mindfa.to_string())
        drawing_dot(extended_mindfa, extended_fsm_dir + '/mindfa')
def create_fsm_for_unit_traces(elementID2cluster, training_traces, output_folder):
    """Build and persist one FSM/DFA/MinDFA per training trace.

    For the i-th trace (1-based), artifacts go to <output_folder>/fsm_d<i>:
    fsm.txt, dfa.txt, mindfa.txt and the corresponding dot drawings.
    Uses enumerate instead of a manual counter, and context managers so
    file handles are closed (the original leaked them via open().write()).
    """
    lib.init_dir(output_folder)
    for unit_id, one_trace in enumerate(training_traces, start=1):
        unit_dir = output_folder + '/fsm_d' + str(unit_id)
        lib.init_dir(unit_dir)

        fsm, log_fsm = create_fsm(elementID2cluster, [one_trace])
        dfa = fsm.nfa2dfa()
        mindfa = graph_lib.minimize_dfa(dfa)

        # Write the textual form of each model, then its dot rendering
        # (same ordering as before: all texts first, then all drawings).
        for graph, tag in ((fsm, 'fsm'), (dfa, 'dfa'), (mindfa, 'mindfa')):
            with open(unit_dir + '/' + tag + '.txt', 'w') as f:
                f.write(graph.to_string())
        drawing_dot(fsm, unit_dir + '/fsm')
        drawing_dot(dfa, unit_dir + '/dfa')
        drawing_dot(mindfa, unit_dir + '/mindfa')
def compute_statistics(X, method_list, args, estimator, generated_traces,
                       validation_traces, output_folder=None, X_id_mapping=None,
                       create_fsm_per_unit_trace=False, ending_methods=None,
                       minimize_dfa=True, ktails=False, check_accepted_traces=True):
    """Derive cluster assignments, build FSM/DFA models, and write statistics.

    Params (beyond the obvious):
      estimator  -- fitted clustering estimator; if None and *ktails* is True,
                    k-tails clustering is used instead; otherwise this exits.
      output_folder -- destination dir; defaults to args.output_folder.
      check_accepted_traces -- when False, DFA validation is skipped and the
                    validation-dependent statistics are omitted.

    Fixes vs. original:
      * open(...).write(...) calls replaced with `with` blocks (handle leaks).
      * When validation is skipped, the count is now None instead of -1 —
        all downstream guards already test `is not None`, and -1 previously
        leaked into statistic.txt as DFA_validation:-1 and a negative recall.
    """
    if output_folder is None:
        output_folder = args.output_folder
    lib.init_dir(output_folder)

    # Obtain per-element cluster assignments from whichever backend applies.
    if estimator is not None:
        elementID2cluster, centroids, X_labels = read_clusters(
            estimator, X, X_id_mapping=X_id_mapping)
    elif ktails:
        elementID2cluster, centroids, X_labels = read_ktails_clusters(
            X, X_id_mapping=X_id_mapping)
    else:
        print("ERROR: no estimators!")
        sys.exit(0)

    # Persist cluster info, centroids, and element-to-centroid distances.
    write_cluster(elementID2cluster, X, output_folder + '/resultant_cluster.gz',
                  X_id_mapping=X_id_mapping)
    write_centroids_to_file(centroids, output_folder + '/centroids.txt')
    write_cluster_contents_distance(
        elementID2cluster, X, centroids,
        output_folder + '/cluster_element_distances.txt')

    if create_fsm_per_unit_trace:
        create_fsm_for_unit_traces(elementID2cluster, generated_traces,
                                   output_folder + '/unit_fsms')

    # Build the FSM over all training traces and record debug/trace info.
    fsm, log_fsm = create_fsm(elementID2cluster, generated_traces)
    write_trace_cluster_info(elementID2cluster, generated_traces,
                             output_folder + '/trace_cluster_info.txt')
    write_log_to_file(log_fsm, output_folder + '/debug_fsm.txt')

    # Determinize, then optionally validate against the held-out traces.
    dfa = fsm.nfa2dfa()
    if check_accepted_traces:
        dfa_num_accepted_traces = count_accepted_traces(
            dfa, validation_traces,
            output_file=output_folder + '/dfa_uncovered_traces.txt')
    else:
        # None (not -1) so the `is not None` guards below skip validation stats.
        dfa_num_accepted_traces = None
    print("Finished validating DFA:", dfa_num_accepted_traces,
          "validation traces accepted by DFA")

    mindfa = graph_lib.minimize_dfa(dfa) if minimize_dfa else None

    with open(output_folder + '/fsm.txt', 'w') as f:
        f.write(fsm.to_string())
    drawing_dot(fsm, output_folder + '/fsm')
    with open(output_folder + '/dfa.txt', 'w') as f:
        f.write(dfa.to_string())
    if minimize_dfa:
        with open(output_folder + '/mindfa.txt', 'w') as f:
            f.write(mindfa.to_string())
    drawing_dot(dfa, output_folder + '/dfa')
    if minimize_dfa:
        drawing_dot(mindfa, output_folder + '/mindfa')
    print("after drawing dot")
    print(output_folder)

    # Best-effort serialization: failures are reported but never fatal.
    try:
        fsm.serialize(output_folder + "/serialized_fsa.json")
    except Exception as e:
        print("Serialization problem:")
        print(e)

    # Write summary statistics.
    with open(output_folder + '/statistic.txt', 'w') as writer:
        writer.write('FSM_size:' + '\t' + str(len(fsm.states)) + '\n')
        if dfa is not None:
            writer.write('DFA_size:' + '\t' + str(len(dfa.states)) + '\n')
        if mindfa is not None:
            writer.write('MinDFA_size:' + '\t' + str(len(mindfa.states)) + '\n')
        if dfa_num_accepted_traces is not None:
            writer.write('DFA_validation:' + '\t'
                         + str(dfa_num_accepted_traces) + '\n')
        if hasattr(estimator, 'n_clusters'):
            writer.write('num_cluster:\t' + str(estimator.n_clusters) + '\n')
        else:
            # DBSCAN-style labels: -1 marks noise, not a cluster.
            n_clusters_ = len(set(X_labels)) - (1 if -1 in X_labels else 0)
            writer.write('num_cluster:\t' + str(n_clusters_) + '\n')
        writer.write('total_validation_traces:\t'
                     + str(len(validation_traces)) + '\n')
        if dfa_num_accepted_traces is not None:
            possible_recall = float(dfa_num_accepted_traces) / float(
                len(validation_traces))
            writer.write('recall:\t' + str(possible_recall) + '\n')
    print("after writing stats")

    if ending_methods is not None:
        when_ending_method_available(ending_methods, fsm, output_folder,
                                     make_dfa=minimize_dfa)