def _eval_previous(self):
    print('Parsing Previous ... ', end='', flush=True)
    fn_recent = Name_functions.parameter_evaluation_evaluation_metric_file('Previous')
    if Filefunctions.exists(fn_recent):
        print('Already done')
        return

    with open(fn_recent, 'w+') as wf:
        wf.write('S;Day;NumEntries;accuracy;f1\n')
        for S in self.Multi['S']:
            predictor = Classifiers.PreviousClassifier(S)
            fn = Name_functions.DS_file(S)
            _, labels, times, ids = Di(fn).get_data(fn_subset_ids=self.test_ids_fn,
                                                    return_split_values=True,
                                                    return_identifiers=True)
            data = pd.DataFrame(index=ids)
            data['time'] = times
            data['y_true'] = [label[0] for label in labels]
            data['Day'] = np.floor(data['time'])

            # Calculate the accuracy and F1 scores for each day
            for day in data['Day'].unique():
                subset = data[data['Day'] == day]
                acc_score, f1_score = self.get_scores(predictor=predictor,
                                                      true_labels=subset['y_true'],
                                                      times=subset['time'],
                                                      ids=subset.index)
                if not (acc_score is None or f1_score is None):
                    wf.write('{};{};{};{};{}\n'.format(S, day, len(subset), acc_score, f1_score))
    print('Done')
def parse_s(s):
    fn_data = Name_functions.DS_file(s)
    fn_subset = Name_functions.DS_reduced_ids_DSJ(s)
    x, y = Di(fn_data).split_data(int(s), fn_subset_ids=fn_subset)
    print('\tM^{}_j ... '.format(s), end='', flush=True)
    good_splits = 0
    for i in sorted(x):
        fn_model = Name_functions.model_SJ(s, i)
        c, cc = np.unique(y[i], return_counts=True)
        # Skip splits whose smallest class cannot fill the cross-validation folds
        if min(cc) < cv * 2:
            continue
        # Skip splits with only a single class
        if len(c) <= 1:
            continue
        if not os.path.exists(fn_model):
            generate_model(x[i], y[i], s, i)
        good_splits += 1
    print('Done ({}/{} D^{}_j met requirements)'.format(good_splits, len(x), s))
    return good_splits, len(x), 100 * good_splits / len(x)
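# A minimal driver sketch (hypothetical helper, not part of the original code): the other
# STEP scripts expose a run() that iterates over Parameters.S_values, and parse_s would
# presumably be driven the same way, using its (good_splits, total, percentage) return
# value for a short summary.
def _example_train_all_split_models():
    for s in Parameters.S_values:
        good_splits, total_splits, percentage = parse_s(s)
        print('S = {}: {}/{} splits trained ({:.1f}%)'.format(s, good_splits, total_splits, percentage))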
def parse_naive(s):
    fn_target = Name_functions.DS_reduced_ids_naive(s)
    print('\tD^S_naive ... ', end='', flush=True)

    # Check existence
    if Filefunctions.exists(fn_target):
        print('Already done')
        return

    fn_input = Name_functions.DS_file(s)
    x, y, timestamps, ids = DataImporter(fn_input).get_data(return_identifiers=True,
                                                            return_split_values=True)
    first_year_indices = [i for i in range(len(timestamps))
                          if timestamps[i] < Parameters.train_time_naive_stop]
    x = x[first_year_indices]
    y = y[first_year_indices]
    ids = ids[first_year_indices]

    x, y, medoid_indices = KMedoids.reduce_to_medoids(x, y,
                                                      return_indices=True,
                                                      factor=Parameters.LargeSmallFactor)
    ids_keep = [ids[i] for i in medoid_indices]
    with open(fn_target, 'w+') as wf:
        for CaseID in ids_keep:
            wf.write('{}\n'.format(CaseID))
    print('Done')
def __init__(self, s, score_function):
    super().__init__()
    assert (isinstance(score_function, BEPTScoring) or score_function is None)
    self.score_function = score_function
    self.predictions = Name_functions.import_probabilities_split(S=s)
    self.times = Name_functions.SJ_period_mid_times(S=s)
def run():
    assert (len(topics) == len(activities))
    assert (number_activities in Parameters.S_values)
    concept_drifter = Parameters.concept_drifter
    ConceptDriftedEventLogCreator(concept_drifter).run(fn_cases=Name_functions.cases_info(),
                                                       fn_event_log=Name_functions.event_log())
def parse_ms(s):
    fn_data = Name_functions.DS_file(s)
    x, y, time, case_id = Di(fn_data).get_data(return_identifiers=True,
                                               return_split_values=True)

    # S predictions
    print('\tM^{}_j ... '.format(s), end='', flush=True)
    for i in sorted([int(i) for i in Name_functions.S_J_values(s)], reverse=True):
        if Filefunctions.exists(Name_functions.DSJ_probabilities(s, i)):
            continue
        model_i = Model_Functions.loadModel(Name_functions.model_SJ(s, i))
        model_labels = model_i.classes_.tolist()
        model_end_time = Name_functions.SJ_period_end_time(s, i)
        with open(Name_functions.DSJ_probabilities(s, i), 'w+') as wf:
            for dx, t, idn in zip(x, time, case_id):
                if t < model_end_time:
                    # Only test if the model existed before the data point
                    continue
                model_predictions = model_i.predict_proba(dx.reshape(1, -1))[0]
                actual_predictions = [
                    (0 if (label not in model_labels)
                     else model_predictions[model_labels.index(label)])
                    for label in all_labels
                ]
                wf.write('{};{};{}\n'.format(
                    idn, t, ';'.join(['{:.4f}'.format(p) for p in actual_predictions])))
    print('Done')

    # Naive predictions
    print('\tM^{}_naive ... '.format(s), end='', flush=True)
    if Filefunctions.exists(Name_functions.DS_probabilities_naive(s)):
        print('Already done')
        return
    model_naive = Model_Functions.loadModel(Name_functions.model_S_naive(s))
    model_naive_labels = model_naive.classes_.tolist()
    model_naive_end_time = Parameters.train_time_naive_stop
    with open(Name_functions.DS_probabilities_naive(s), 'w+') as wf:
        for dx, t, idn in zip(x, time, case_id):
            if t < model_naive_end_time:
                # Only test if the model existed before the data point
                continue
            model_predictions = model_naive.predict_proba(dx.reshape(1, -1))[0]
            actual_predictions = [
                (0 if (label not in model_naive_labels)
                 else model_predictions[model_naive_labels.index(label)])
                for label in all_labels
            ]
            wf.write('{};{};{}\n'.format(
                idn, t, ';'.join(['{:.4f}'.format(p) for p in actual_predictions])))
    print('Done')
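# The probability files written above are plain semicolon-separated text: the case id,
# the timestamp, and one probability per label in all_labels. A minimal reader sketch
# (the parsing below is an assumption based purely on the format written in parse_ms,
# not a helper that exists in the repository):
def _example_read_probabilities(fn):
    probabilities = {}
    with open(fn, 'r') as rf:
        for line in rf:
            parts = line.strip().split(';')
            case_id, t = parts[0], float(parts[1])
            probabilities[case_id] = (t, [float(p) for p in parts[2:]])
    return probabilities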
def run():
    with open(Name_functions.best_graec(), 'r') as rf:
        (S, B, T, P) = rf.readline()[:-1].split(';')[0:4]
    CalculateDailyScores(single={'S': [int(S)], 'Tau': [float(T)], 'P': [float(P)], 'Beta': [float(B)]},
                         multi={'Beta': Parameters.GRAEC_beta,
                                'Tau': Parameters.GRAEC_tau,
                                'S': Parameters.S_values},
                         test_ids_fn=Name_functions.DS_reduced_ids_DSJ(S)).run()
def parse_previous(s):
    # do PREVIOUS SCORE
    df_previous = DataFrameOperations.import_df(fn=Name_functions.S_recent_test_predictions(s))
    predicted_label = df_previous['Predicted_label']
    true_label = df_previous['True_label']

    acc_score = Metrics.accuracy(true_label=true_label, predicted_label=predicted_label)
    with open(Name_functions.S_recent_score(s, 'accuracy'), 'w+') as wf:
        wf.write('{}\n'.format(acc_score))

    f1_score = Metrics.f1(true_label=true_label, predicted_label=predicted_label)
    with open(Name_functions.S_recent_score(s, 'f1'), 'w+') as wf:
        wf.write('{}\n'.format(f1_score))
def parse_naive(s):
    # do NAIVE SCORE
    df_naive = DataFrameOperations.import_df(fn=Name_functions.S_naive_test_predictions(s))
    predicted_label = df_naive['Predicted_label']
    true_label = df_naive['True_label']

    acc_score = Metrics.accuracy(true_label=true_label, predicted_label=predicted_label)
    with open(Name_functions.S_naive_score(s, 'accuracy'), 'w+') as wf:
        wf.write('{}\n'.format(acc_score))

    f1_score = Metrics.f1(true_label=true_label, predicted_label=predicted_label)
    with open(Name_functions.S_naive_score(s, 'f1'), 'w+') as wf:
        wf.write('{}\n'.format(f1_score))
def run():
    fn = Name_functions.full_GRAEC_table()
    # Clear Test Scores file
    with open(fn, 'w+') as wf:
        wf.write('Beta;Tau;P;S;Score_type;Score_Value\n')

    for S in Parameters.S_values:
        Filefunctions.make_directory(Name_functions.S_score_folder(S))
        print('S = {}'.format(S))
        parse_graec(S)
        parse_previous(S)
        parse_naive(S)
def _eval_param(self, evaluated_parameter):
    print('Parsing parameter {} ... '.format(evaluated_parameter), end='', flush=True)
    fn = Name_functions.parameter_evaluation_evaluation_metric_file(evaluated_parameter)
    if Filefunctions.exists(fn):
        print('Already done')
        return

    with open(fn, 'w+') as wf:
        wf.write('S;Beta;Tau;P;Day;NumEntries;accuracy;f1\n')
        for S in self.values(evaluated_parameter, 'S'):
            predictor = Classifiers.BPTSClassifier(s=S, score_function=None)
            fn = Name_functions.DS_file(S)
            _, labels, times, ids = Di(fn).get_data(fn_subset_ids=self.test_ids_fn,
                                                    return_split_values=True,
                                                    return_identifiers=True)
            data = pd.DataFrame(index=ids)
            data['time'] = times
            data['y_true'] = [label[0] for label in labels]
            data['Day'] = np.floor(data['time'])

            for beta in self.values(evaluated_parameter, 'Beta'):
                for p in self.values(evaluated_parameter, 'P'):
                    for tau in self.values(evaluated_parameter, 'Tau'):
                        scoring_function = PeriodScoring(s=S, beta=beta, tau=tau, p=p)
                        predictor.set_scoring_function(scoring_function)
                        for day in data['Day'].unique():
                            subset = data[data['Day'] == day]
                            acc_score, f1_score = self.get_scores(predictor=predictor,
                                                                  ids=subset.index,
                                                                  times=subset['time'],
                                                                  true_labels=subset['y_true'])
                            if not (acc_score is None or f1_score is None):
                                wf.write('{};{};{};{};{};{};{};{}\n'.format(
                                    S, beta, tau, p, day, len(subset), acc_score, f1_score))
    print('Done')
def parse_naive(s):
    print('\tM^{}_naive ... '.format(s), end='', flush=True)
    fn_model = Name_functions.model_S_naive(s)
    if os.path.exists(fn_model):
        print("Already done")
        return 1.0, 1.0, 100

    fn_data = Name_functions.DS_file(s)
    fn_subset = Name_functions.DS_reduced_ids_DSJ(s)
    x, y, t = Di(fn_data).get_data(fn_subset, True, False)
    y = y.ravel()

    # Only take data that is in the first year
    x = [x[i] for i in range(len(t)) if t[i] < Parameters.train_time_naive_stop]
    y = [y[i] for i in range(len(t)) if t[i] < Parameters.train_time_naive_stop]

    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0, test_size=0.2)

    # Get the best model
    best_model = None
    best_score = -1
    for c in used_models:
        score, model = train_classifier(c, x_train, x_test, y_train, y_test)
        if score > best_score:
            best_score = score
            best_model = model

    # save the model
    Model_Functions.saveModel(best_model, fn_model)
    print("Done")
    return 1.0, 1.0, 100
def parse_ms(s):
    print('D^{} ... '.format(s), end='', flush=True)
    if Filefunctions.exists(Name_functions.DS_train_ids(s)):
        if Filefunctions.exists(Name_functions.DS_test_ids(s)):
            print('Already done')
            return

    np.random.seed(0)
    X, y, times, ids = DI(Name_functions.DS_file(s)).get_data(
        Name_functions.DS_reduced_ids_DSJ(s), True, True)

    if Parameters.take_test_split_chronological:
        # Chronological split: order everything after the warm-up period by time,
        # the earliest part becomes the train set and the remainder the test set
        test_case_ids = []
        train_case_ids = []
        times_post_warm_up = [t for t in times if t > Parameters.test_time_start]
        times_post_warm_up.sort()
        train_start_index = int((1 - Parameters.assessment_test_split) * len(times_post_warm_up))
        train_time_end = times_post_warm_up[train_start_index]
        for case_start_time, case_id in zip(times, ids):
            if case_start_time <= Parameters.test_time_start:
                continue
            if case_start_time < train_time_end:
                train_case_ids.append(case_id)
            else:
                test_case_ids.append(case_id)
    else:
        # Stratified random split: sample the test set per label so the class
        # distribution is preserved
        indices = [i for i in range(len(ids)) if times[i] > Parameters.test_time_start]
        test_indices = []
        train_indices = []
        c, cc = np.unique(y[indices], return_counts=True)
        for label, label_count in zip(c, cc):
            num_test = int(label_count * Parameters.assessment_test_split)
            indices_c = [i for i in indices if y[i] == label]
            indices_c_test = np.random.choice(indices_c, num_test, replace=False)
            test_indices.extend(indices_c_test.tolist())
            train_indices.extend([i for i in indices_c if i not in indices_c_test])
        test_case_ids = ids[test_indices]
        train_case_ids = ids[train_indices]

    with open(Name_functions.DS_train_ids(s), 'w+') as wf:
        for case_id in train_case_ids:
            wf.write('{}\n'.format(case_id))
    with open(Name_functions.DS_test_ids(s), 'w+') as wf:
        for case_id in test_case_ids:
            wf.write('{}\n'.format(case_id))
    print('Done')
def parse_ms(s):
    fn_target = Name_functions.DS_reduced_ids_DSJ(s)

    # Check existence
    print('\tD^S_j ... ', end='', flush=True)
    if Filefunctions.exists(fn_target):
        print('Already done')
        return

    fn_input = Name_functions.DS_file(s)
    x, y, ids = DataImporter(fn_input).split_data(int(s), return_identifiers=True)
    ids_keep = []
    for i in sorted(x):
        xi, yi, indices = KMedoids.reduce_to_medoids(x[i], y[i], return_indices=True)
        ids_keep.extend([ids[i][j] for j in indices])

    with open(fn_target, 'w+') as wf:
        for caseID in ids_keep:
            wf.write('{}\n'.format(caseID))
    print('Done')
def generate_model(x, y, s, i):
    # Generate Train/Test
    x_train, x_test, y_train, y_test = train_test_split(x, y.ravel(), random_state=0, test_size=0.2)

    # Get the best model
    best_model = None
    best_score = -1
    for c in used_models:
        score, model = train_classifier(c, x_train, x_test, y_train, y_test)
        if score > best_score:
            best_score = score
            best_model = model

    # save the model
    Model_Functions.saveModel(best_model, Name_functions.model_SJ(s, i))
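# generate_model relies on the module-level used_models and train_classifier helpers,
# which are defined elsewhere in the repository. As a self-contained illustration of the
# same "train several candidates, keep the one with the best held-out score" pattern,
# a rough sketch using plain scikit-learn (the candidate set and the accuracy scoring
# below are assumptions, not the repository's own configuration):
def _example_select_best_candidate(x_train, x_test, y_train, y_test):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    from sklearn.tree import DecisionTreeClassifier

    best_model, best_score = None, -1
    for candidate in (DecisionTreeClassifier(random_state=0),
                      RandomForestClassifier(random_state=0)):
        candidate.fit(x_train, y_train)
        score = accuracy_score(y_test, candidate.predict(x_test))
        if score > best_score:
            best_score, best_model = score, candidate
    return best_model, best_score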
def parse_graec(s):
    # do GRAEC SCORE
    enumeration_encoder = Human_Functions.load_dict_from_csv(Name_functions.S_GRAEC_enumeration_dictionary(s))
    best_acc_train = -1
    best_acc_test = -1
    best_acc_enum = -1
    best_f1_train = -1
    best_f1_test = -1
    best_f1_enum = -1
    with open(Name_functions.full_GRAEC_table(), 'a+') as wf:
        for e in enumeration_encoder:
            # Each parameter setting is selected on its train score and reported on its test score
            df_train = DataFrameOperations.import_df(fn=Name_functions.S_GRAEC_train_predictions(s, e))
            acc_train = Metrics.accuracy(df_train['True_label'], df_train['Predicted_label'])
            f1_train = Metrics.f1(df_train['True_label'], df_train['Predicted_label'])

            df_test = DataFrameOperations.import_df(fn=Name_functions.S_GRAEC_test_predictions(s, e))
            acc_test = Metrics.accuracy(df_test['True_label'], df_test['Predicted_label'])
            f1_test = Metrics.f1(df_test['True_label'], df_test['Predicted_label'])

            wf.write(enumeration_encoder[e] + ';{};ACC;{}\n'.format(s, acc_test))
            wf.write(enumeration_encoder[e] + ';{};F1;{}\n'.format(s, f1_test))

            if acc_train > best_acc_train:
                best_acc_train = acc_train
                best_acc_test = acc_test
                best_acc_enum = e
            if f1_train > best_f1_train:
                best_f1_train = f1_train
                best_f1_test = f1_test
                best_f1_enum = e

    with open(Name_functions.S_GRAEC_score(s, 'accuracy'), 'w+') as wf:
        wf.write(enumeration_encoder[best_acc_enum] + '\n')
        wf.write('{}\n'.format(best_acc_test))
    with open(Name_functions.S_GRAEC_score(s, 'f1'), 'w+') as wf:
        wf.write(enumeration_encoder[best_f1_enum] + '\n')
        wf.write('{}\n'.format(best_f1_test))
def run():
    for s in Parameters.S_values:
        create_labelled_dataset(event_log=Name_functions.event_log(),
                                s=s,
                                feature_filename=Name_functions.cases_info(),
                                output_file=Name_functions.DS_file(s))
def undo():
    Filefunctions.delete(Name_functions.parameter_evaluation_data_folder())
    STEP9_Over_Time_Results.undo()
def __init__(self, s):
    super().__init__()
    self.predictions = Name_functions.import_probabilities_naive(S=s)
def __init__(self, s, score_function):
    super().__init__()
    assert (score_function is None or isinstance(score_function, PeriodScoring))
    self.score_function = score_function
    self.predictions = Name_functions.import_probabilities_split(S=s)
def __init__(self, s):
    super().__init__()
    self.predictions = Name_functions.import_probabilities_split(S=s)
    self.times = Name_functions.SJ_period_end_times(S=s)
def parse_ms(s):
    print('\tGRAEC ... ', end='', flush=True)
    if Filefunctions.exists(Name_functions.S_GRAEC_enumeration_dictionary(s)):
        print('Already done')
        return

    enumeration_encoder = dict()
    fn_data = Name_functions.DS_file(s)
    fn_train_ids = Name_functions.DS_train_ids(s)
    fn_test_ids = Name_functions.DS_test_ids(s)
    x_train, labels_train, times_train, ids_train = DI(fn_data).get_data(fn_subset_ids=fn_train_ids,
                                                                         return_split_values=True,
                                                                         return_identifiers=True)
    x_test, labels_test, times_test, ids_test = DI(fn_data).get_data(fn_subset_ids=fn_test_ids,
                                                                     return_split_values=True,
                                                                     return_identifiers=True)

    # Write train and test predictions for every enumerated (Beta, Tau, P) setting
    enumeration = 0
    predictor = Classifiers.BPTSClassifier(s=s, score_function=None)
    for B in Parameters.GRAEC_beta:
        for T in Parameters.GRAEC_tau:
            for P in (Parameters.GRAEC_p if not T == 0 else [0]):  # P has no use for T == 0
                enumeration_encoder[enumeration] = '{};{};{}'.format(B, T, P)
                predictor.set_scoring_function(score_function=PeriodScoring(beta=B, p=P, tau=T, s=s))
                with open(Name_functions.S_GRAEC_train_predictions(s, enumeration), 'w+') as wf:
                    wf.write('SOID;time;True_label;Predicted_label\n')
                    for case_id, t, true_label in zip(ids_train, times_train, labels_train):
                        predicted_label = predictor.predict(case_id=case_id, time=t)
                        wf.write('{};{};{};{}\n'.format(case_id, t, true_label[0], predicted_label))
                with open(Name_functions.S_GRAEC_test_predictions(s, enumeration), 'w+') as wf:
                    wf.write('Case_id;time;True_label;Predicted_label\n')
                    for case_id, t, true_label in zip(ids_test, times_test, labels_test):
                        predicted_label = predictor.predict(case_id=case_id, time=t)
                        wf.write('{};{};{};{}\n'.format(case_id, t, true_label[0], predicted_label))
                enumeration += 1
    Human_Functions.save_dict_to_csv(enumeration_encoder, Name_functions.S_GRAEC_enumeration_dictionary(s))

    fn_data = Name_functions.DS_file(s)
    fn_ids = Name_functions.DS_test_ids(s)
    x, labels, times, ids = DI(fn_data).get_data(fn_subset_ids=fn_ids,
                                                 return_split_values=True,
                                                 return_identifiers=True)
    print('Done')

    # Write test predictions for the naive and previous baselines
    print('\tNaive and Previous ... ', end='', flush=True)
    naive_predictor = Classifiers.NaiveClassifier(s)
    previous_predictor = Classifiers.PreviousClassifier(s)
    with open(Name_functions.S_naive_test_predictions(s), 'w+') as wf_naive:
        with open(Name_functions.S_recent_test_predictions(s), 'w+') as wf_previous:
            wf_naive.write('{};{};{};{}\n'.format('Case_id', 'time', 'True_label', 'Predicted_label'))
            wf_previous.write('{};{};{};{}\n'.format('Case_id', 'time', 'True_label', 'Predicted_label'))
            for case_id, t, true_label in zip(ids, times, labels):
                predicted_label_naive = naive_predictor.predict(case_id=case_id, time=t)
                if predicted_label_naive is not None:
                    wf_naive.write('{};{};{};{}\n'.format(case_id, t, true_label[0], predicted_label_naive))
                predicted_label_previous = previous_predictor.predict(case_id=case_id, time=t)
                if predicted_label_previous is not None:
                    wf_previous.write('{};{};{};{}\n'.format(case_id, t, true_label[0], predicted_label_previous))
    print('Done')
def undo():
    for S in Parameters.S_values:
        Filefunctions.delete(Name_functions.S_score_folder(S))
    STEP7_Global_Results.undo()
def run():
    for metric in ['accuracy', 'f1']:
        best_graec_score = -1
        best_graec_parameters = None
        fig, ax = plt.subplots()
        y = []
        x_labels = []
        colours = []

        # Build the LaTeX results table for this metric
        with open(Name_functions.metric_table(metric), 'w+') as wf:
            wf.write('\\begin{table}[]\n')
            wf.write('\\centering\n')
            wf.write('\\begin{tabular}{|c|c|c|c|}\n')
            wf.write('\\hline\n')
            wf.write('$S$ & $\\beta$ & $\\tau$ & {}\\\\\n'.format(Parameters.Tex_dict[metric]))
            wf.write('\\hline\n')
            for S in Parameters.S_values:
                names = ['N{}', 'R{}', 'GR{}']
                fn_scores = [
                    Name_functions.S_naive_score(S, metric),
                    Name_functions.S_recent_score(S, metric),
                    Name_functions.S_GRAEC_score(S, metric)
                ]
                colour_values = ['r', 'orange', 'b']
                for i in range(3):
                    with open(fn_scores[i], 'r') as rf:
                        if i == 2:
                            # Our solution
                            (B, T, P) = rf.readline()[:-1].split(';')[0:4]
                            score = float(rf.readline()[:-1])
                            wf.write('{}&{}&{}&{:.3f}\\\\\n'.format(S, B, T, score))
                            if score > best_graec_score:
                                best_graec_score = score
                                best_graec_parameters = '{};{};{};{}\n'.format(S, B, T, P)
                        else:
                            score = float(rf.readline()[:-1])
                    x_labels.append(names[i].format(S))
                    y.append(score)
                    colours.append(colour_values[i])
            wf.write('\\hline\n')
            wf.write('\\end{tabular}\n')
            wf.write('\\caption{{Optimal values for $S$, $\\beta$, and $\\tau$, and their {} scores}}\n'.format(
                Parameters.Tex_dict[metric]))
            wf.write('\\label{}\n')
            wf.write('\\end{table}\n')

        # Bar chart comparing the naive (N), previous (R), and GRAEC (GR) scores per S
        ax.bar(range(len(x_labels)), y, color=colours)
        ax.set_xticks(range(len(x_labels)))
        ax.set_xticklabels(x_labels)
        ax.set_xlabel('Method')
        ax.set_ylim(0, max(y) + 0.05)
        ax.set_ylabel('{} score'.format(metric))
        ax.set_title('{} scores for different concept drift solutions'.format(metric))
        for (xp, yp) in zip(range(len(x_labels)), y):
            ax.text(xp - 0.36, yp, '{:.2f}'.format(yp))

        # Save graph to disc
        fn = Name_functions.metric_figure(metric)
        fig.set_size_inches(20 / 2.56, 10 / 2.56)
        plt.savefig(fn, bbox_inches='tight')
        plt.close()

        # Save best graec parameters to disc
        fn = Name_functions.best_graec()
        with open(fn, 'w+') as wf:
            wf.write(best_graec_parameters)
def undo():
    for S in Parameters.S_values:
        Filefunctions.delete(Name_functions.S_GRAEC_enumeration_folder(S))
        Filefunctions.delete(Name_functions.S_naive_test_predictions(S))
        Filefunctions.delete(Name_functions.S_recent_test_predictions(S))
    STEP6_Global_Scoring.undo()
def undo():
    Filefunctions.delete(Name_functions.results_folder())
    STEP8_Over_Time_Scoring.undo()