# Put the directory one level above `current_dir` at the front of sys.path;
# presumably this is what makes the project imports below resolvable
# (TODO confirm the package layout).
parent_dir = current_dir[:current_dir.rfind(os.path.sep)]
sys.path.insert(0, parent_dir)

from formula_verificator import verify_formula_as_compliant
from shared_variables import path_to_model_file, eventlog
from support_scripts.prepare_data import encode
from support_scripts.prepare_data import getSymbol
from support_scripts.prepare_data import prepare_testing_data
import time

# Wall-clock reference taken at script start.
start_time = time.time()

# Flag handed to prepare_testing_data as its second argument.
only_compliant = True
# Test traces (lines) with their three parallel time sequences, plus the
# encoding parameters returned alongside them.
lines, lines_t, lines_t2, lines_t3, maxlen, chars, char_indices,divisor, divisor2, divisor3, predict_size,target_indices_char = prepare_testing_data(eventlog, only_compliant)
# Replace the four trace lists with whatever selectFormulaVerifiedTraces returns.
lines, lines_t, lines_t2, lines_t3 = selectFormulaVerifiedTraces(lines, lines_t, lines_t2, lines_t3)

# Original note: this is the beam stack size, i.e. how many "best" alternatives
# will be stored.
# NOTE(review): no beam-size assignment is visible in this part of the script.
# Debug aid (disabled): restrict the run to the first 300 traces.
# lines = lines[0:300]
# lines_t= lines_t[0:300]
# lines_t2=lines_t2[0:300]
# lines_t3=lines_t3[0:300]

# Accumulators for ground-truth and predicted values.
one_ahead_gt = []
one_ahead_pred = []

# load model, set this to the model generated by train.py
model = load_model(path_to_model_file)

# make predictions
#find cycles and modify the probability functionality goes here stop_symbol_probability_amplifier_current = 1 start_of_the_cycle_symbol = " " one_ahead_gt = [] one_ahead_pred = [] two_ahead_gt = [] two_ahead_pred = [] three_ahead_gt = [] three_ahead_pred = [] lines_s, lines_t_s, lines_t2_s, lines_t3_s = selectFormulaVerifiedTraces( lines, lines_t, lines_t2, lines_t3) with open('../output_files/results/suffix_and_remaining_time3_%s' % eventlog, 'wb') as csvfile: spamwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) spamwriter.writerow([ "Prefix length", "Groud truth", "Predicted", "Levenshtein", "Damerau", "Jaccard", "Ground truth times", "Predicted times", "RMSE", "MAE", "Median AE" ]) for prefix_size in range(prefix_size_pred_from, prefix_size_pred_to): print("prefix size: " + str(prefix_size))
def run_experiments(log_identificator, formula_type):
    """Predict trace suffixes and remaining time with a formula-checked beam search.

    For every prefix length in the configured range, the traces returned by
    ``selectFormulaVerifiedTraces`` are extended one symbol at a time with the
    loaded model. Candidate continuations are stored in a ``MultileafTree`` of
    width ``beam_size``. A predicted end-of-case symbol (``'!'``) stops the
    search only when ``verify_formula_as_compliant`` accepts the trace built so
    far. One CSV row is written per trace that has a non-empty ground-truth
    suffix, to
    ``output_files/results/<formula_type>/suffix_and_remaining_time2_<eventlog>``.

    Args:
        log_identificator: Forwarded to ``activateSettings``.
        formula_type: Forwarded to ``activateSettings``; also used as the name
            of the results sub-directory.

    Returns:
        None. Results are written to the CSV file and progress is printed.
    """
    (eventlog, path_to_model_file, beam_size, prefix_size_pred_from,
     prefix_size_pred_to, formula) = activateSettings(log_identificator,
                                                      formula_type)

    # Put the directory above this source file at the front of sys.path.
    current_dir = os.path.dirname(os.path.abspath(getsourcefile(lambda: 0)))
    parent_dir = current_dir[:current_dir.rfind(os.path.sep)]
    sys.path.insert(0, parent_dir)

    start_time = time.time()

    (lines, lines_t, lines_t2, lines_t3, maxlen, chars, char_indices, divisor,
     divisor2, divisor3, predict_size, target_indices_char,
     target_char_indices) = prepare_testing_data(eventlog)

    # Remaining-time pairs collected whenever an end of case is accepted.
    one_ahead_gt = []
    one_ahead_pred = []

    # load model, set this to the model generated by train.py
    model = load_model(path_to_model_file)

    results_path = ('output_files/results/' + formula_type +
                    '/suffix_and_remaining_time2_%s' % eventlog)
    # Binary mode is what the Python 2 csv module expects.
    with open(results_path, 'wb') as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=',',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow([
            "Prefix length", "Groud truth", "Predicted", "Levenshtein",
            "Damerau", "Jaccard", "Ground truth times", "Predicted times",
            "RMSE", "MAE", "Median AE"
        ])

        for prefix_size in range(prefix_size_pred_from, prefix_size_pred_to):
            # The trace selection is redone for every prefix length because
            # the call depends on prefix_size.
            lines_s, lines_t_s, lines_t2_s, lines_t3_s = \
                selectFormulaVerifiedTraces(lines, lines_t, lines_t2,
                                            lines_t3, formula, prefix_size)

            print("prefix size: " + str(prefix_size))
            print("formulas verifited: " + str(len(lines_s)) +
                  " out of : " + str(len(lines)))

            for line, times, times2, times3 in zip(lines_s, lines_t_s,
                                                   lines_t2_s, lines_t3_s):
                # Kept from the original: a trailing 0 is appended to the
                # trace's time list in place before it is cropped.
                times.append(0)
                cropped_line = ''.join(line[:prefix_size])
                cropped_times = times[:prefix_size]
                cropped_times3 = times3[:prefix_size]
                if len(times2) < prefix_size:
                    # make no prediction for this case, since this case has
                    # ended already
                    continue

                # Root of the beam-search tree: the encoded prefix with zero
                # predicted time so far.
                search_tree_root = MultileafTree(
                    beam_size,
                    encode(cropped_line, cropped_times, cropped_times3,
                           maxlen, chars, char_indices, divisor, divisor2),
                    cropped_line, 0)

                ground_truth = ''.join(
                    line[prefix_size:prefix_size + predict_size])
                # Remaining time: last value of times2 minus the value at the
                # final prefix position.
                ground_truth_t = times2[len(times2) - 1] - times2[prefix_size - 1]

                for _ in range(predict_size):
                    # The current tree node carries the already encoded state.
                    enc = search_tree_root.data
                    y = model.predict(enc, verbose=0)
                    # split predictions into separate activity and time
                    # predictions
                    y_char = y[0][0]
                    y_t = y[1][0][0]

                    stop_symbol_probability_amplifier_current, \
                        start_of_the_cycle_symbol = amplify(
                            search_tree_root.cropped_line)

                    if y_t < 0:
                        y_t = 0
                    # TODO (from the original author): not normalizing here
                    # seems like a bug.
                    cropped_times.append(y_t)

                    # First pass over the ranked candidates: stop as soon as
                    # an end-of-case symbol is predicted for a trace that the
                    # formula check accepts.
                    end_accepted = False
                    for rank in range(beam_size):
                        prediction = getSymbolAmpl(
                            y_char, target_indices_char, target_char_indices,
                            start_of_the_cycle_symbol,
                            stop_symbol_probability_amplifier_current, rank)
                        if prediction == '!':
                            if verify_formula_as_compliant(
                                    search_tree_root.cropped_line, formula,
                                    prefix_size):
                                one_ahead_pred.append(
                                    search_tree_root.total_predicted_time)
                                one_ahead_gt.append(ground_truth_t)
                                print('! predicted, end case')
                                end_accepted = True
                                break
                    if end_accepted:
                        break

                    # No accepted end of case: expand the node. According to
                    # the original author's note, choose_next_top_descendant
                    # then either continues forward or backtracks.
                    y_t = y_t * divisor3
                    cropped_times3.append(cropped_times3[-1] +
                                          timedelta(seconds=y_t))

                    for rank in range(beam_size):
                        temp_prediction = getSymbolAmpl(
                            y_char, target_indices_char, target_char_indices,
                            start_of_the_cycle_symbol,
                            stop_symbol_probability_amplifier_current, rank)
                        if temp_prediction == '!':
                            # End-of-case candidates get no child node.
                            continue
                        temp_cropped_line = (search_tree_root.cropped_line +
                                             temp_prediction)
                        temp_total_predicted_time = (
                            search_tree_root.total_predicted_time + y_t)
                        temp_state_data = encode(temp_cropped_line,
                                                 cropped_times,
                                                 cropped_times3, maxlen, chars,
                                                 char_indices, divisor,
                                                 divisor2)
                        search_tree_root.descendants[rank] = MultileafTree(
                            beam_size, temp_state_data, temp_cropped_line,
                            temp_total_predicted_time, search_tree_root)

                    search_tree_root = \
                        search_tree_root.choose_next_top_descendant()
                    if search_tree_root is None:
                        print(
                            "Cannot find any trace that is compliant with formula given current beam size"
                        )
                        break

                if search_tree_root is None:
                    predicted = u""
                    total_predicted_time = 0
                else:
                    predicted = search_tree_root.cropped_line[prefix_size:]
                    total_predicted_time = search_tree_root.total_predicted_time

                if len(ground_truth) > 0:
                    levenshtein_sim = 1 - distance.nlevenshtein(
                        predicted, ground_truth)
                    # float() keeps this a true division even where int / int
                    # truncates (Python 2 without __future__.division).
                    dls = 1 - (damerau_levenshtein_distance(
                        unicode(predicted), unicode(ground_truth)) /
                               float(max(len(predicted), len(ground_truth))))
                    if dls < 0:
                        # we encountered problems with Damerau-Levenshtein
                        # Similarity on some linux machines where the default
                        # character encoding of the operating system caused
                        # it to be negative, this should never be the case
                        dls = 0
                    jaccard_sim = 1 - distance.jaccard(predicted, ground_truth)
                    mae = metrics.mean_absolute_error([ground_truth_t],
                                                      [total_predicted_time])
                    median_ae = metrics.median_absolute_error(
                        [ground_truth_t], [total_predicted_time])

                    spamwriter.writerow([
                        prefix_size,
                        unicode(ground_truth).encode("utf-8"),
                        unicode(predicted).encode("utf-8"),
                        levenshtein_sim,
                        dls,
                        jaccard_sim,
                        ground_truth_t,
                        total_predicted_time,
                        '',  # the RMSE column is left empty
                        mae,
                        median_ae,
                    ])

    print("TIME TO FINISH --- %s seconds ---" % (time.time() - start_time))
def _read_event_log(csv_path):
    """Read an event log CSV and group its rows into per-case sequences.

    The first row is skipped as a header. For every other row, column 0 is the
    case id, column 1 is passed to ``getUnicode_fromInt`` to obtain the
    activity symbol, and column 2 is a ``"%Y-%m-%d %H:%M:%S"`` timestamp.
    Consecutive rows with the same case id form one trace.

    Returns:
        A tuple ``(lines, timeseqs, timeseqs2, timeseqs3, numlines)`` where
        ``lines`` holds one symbol string per case, ``timeseqs`` the seconds
        since the previous event of the case, ``timeseqs2`` the seconds since
        the case start, ``timeseqs3`` the event datetimes, and ``numlines`` the
        case counter maintained exactly as in the original code.
    """
    lines = []
    timeseqs = []   # relative time since previous event
    timeseqs2 = []  # relative time since case start
    timeseqs3 = []  # absolute time of the event
    lastcase = ''
    line = ''
    firstLine = True
    times = []
    times2 = []
    times3 = []
    numlines = 0
    casestarttime = None
    lasteventtime = None

    # The context manager closes the log even if a row fails to parse.
    with open(csv_path, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
        next(spamreader, None)  # skip the headers
        for row in spamreader:
            # Convert the timestamp once per row and reuse the datetime.
            event_time = datetime.fromtimestamp(
                time.mktime(time.strptime(row[2], "%Y-%m-%d %H:%M:%S")))
            if row[0] != lastcase:
                # A new case starts: flush the previous one (if any) and
                # reset the per-case buffers.
                casestarttime = event_time
                lasteventtime = event_time
                lastcase = row[0]
                if not firstLine:
                    lines.append(line)
                    timeseqs.append(times)
                    timeseqs2.append(times2)
                    timeseqs3.append(times3)
                line = ''
                times = []
                times2 = []
                times3 = []
                numlines += 1
            line += getUnicode_fromInt(row[1])
            timesincelastevent = event_time - lasteventtime
            timesincecasestart = event_time - casestarttime
            times.append(86400 * timesincelastevent.days +
                         timesincelastevent.seconds)
            times2.append(86400 * timesincecasestart.days +
                          timesincecasestart.seconds)
            times3.append(event_time)
            lasteventtime = event_time
            firstLine = False

    # add last case
    lines.append(line)
    timeseqs.append(times)
    timeseqs2.append(times2)
    timeseqs3.append(times3)
    # NOTE(review): this extra increment makes numlines one larger than
    # len(lines) for a normal log. It is kept because the fold boundary below
    # is derived from it; presumably the training script uses the same count -
    # confirm before changing.
    numlines += 1
    return lines, timeseqs, timeseqs2, timeseqs3, numlines


def runExperiments(logIdentificator, formulaType):
    """Run the baseline suffix and remaining-time prediction on the test fold.

    The event log ``../data/<eventlog>`` is read and split by case. The first
    two thirds of the cases define the vocabulary and the maximum trace
    length; the remaining cases are the test fold. For every prefix length in
    the configured range, the test traces returned by
    ``selectFormulaVerifiedTraces`` are extended greedily (best symbol per
    step) until ``'!'`` is predicted or ``predict_size`` steps were made. One
    CSV row is written per trace with a non-empty ground-truth suffix, to
    ``output_files/results/<formulaType>/suffix_and_remaining_time0_<eventlog>``.

    Args:
        logIdentificator: Forwarded to ``activateSettings``.
        formulaType: Forwarded to ``activateSettings``; also used as the name
            of the results sub-directory.

    Returns:
        None. Results are written to the CSV file and progress is printed.
    """
    (eventlog, path_to_model_file, beam_size, prefix_size_pred_from,
     prefix_size_pred_to, formula) = activateSettings(logIdentificator,
                                                      formulaType)

    start_time = time.time()

    lines, timeseqs, timeseqs2, timeseqs3, numlines = _read_event_log(
        '../data/%s' % eventlog)

    # Normalisation constants: mean time since previous event, mean time since
    # case start, and mean remaining time until the end of the case.
    divisor = np.mean([item for sublist in timeseqs for item in sublist])
    print('divisor: {}'.format(divisor))
    divisor2 = np.mean([item for sublist in timeseqs2 for item in sublist])
    print('divisor2: {}'.format(divisor2))
    divisor3 = np.mean([
        np.mean([seq[len(seq) - 1] - elapsed for elapsed in seq])
        for seq in timeseqs2
    ])
    print('divisor3: {}'.format(divisor3))

    elems_per_fold = int(round(numlines/3))

    # Vocabulary and maximum length come from the first two folds, with the
    # end-of-case marker '!' appended to every trace.
    fold1and2lines = [trace + '!' for trace in lines[:2*elems_per_fold]]
    maxlen = max(len(trace) for trace in fold1and2lines)

    chars = sorted(set().union(*fold1and2lines))
    target_chars = list(chars)
    # '!' can be predicted but never appears in the encoded input.
    chars.remove('!')
    print('total chars: {}, target chars: {}'.format(len(chars),
                                                     len(target_chars)))
    char_indices = {c: i for i, c in enumerate(chars)}
    indices_char = {i: c for i, c in enumerate(chars)}
    target_char_indices = {c: i for i, c in enumerate(target_chars)}
    target_indices_char = {i: c for i, c in enumerate(target_chars)}
    print(indices_char)

    # we only need the third fold, because first two were used for training
    test_start = 2*elems_per_fold
    lines_t = timeseqs[test_start:]
    lines_t2 = timeseqs2[test_start:]
    lines_t3 = timeseqs3[test_start:]
    lines = lines[test_start:]

    # set parameters
    predict_size = maxlen

    # load model, set this to the model generated by train.py
    model = load_model(path_to_model_file)

    def encode(sentence, times, times3, maxlen=maxlen):
        """Encode a (partial) trace as a left-padded (1, maxlen, features) array.

        Each position holds a one-hot activity vector followed by five extra
        features: 1-based position, time since previous event / divisor,
        cumulative time / divisor2, time of day and weekday.
        """
        num_features = len(chars)+5
        X = np.zeros((1, maxlen, num_features), dtype=np.float32)
        leftpad = maxlen-len(sentence)
        times2 = np.cumsum(times)
        for t, char in enumerate(sentence):
            midnight = times3[t].replace(hour=0, minute=0, second=0,
                                         microsecond=0)
            timesincemidnight = times3[t]-midnight
            row = t+leftpad
            # Symbols outside the vocabulary keep an all-zero one-hot part.
            if char in char_indices:
                X[0, row, char_indices[char]] = 1
            X[0, row, len(chars)] = t+1
            X[0, row, len(chars)+1] = times[t]/divisor
            X[0, row, len(chars)+2] = times2[t]/divisor2
            # NOTE(review): the next two divisions are int / int; they only
            # produce fractions if true division is enabled for this module
            # (not visible here). Left unchanged so that the features stay
            # consistent with whatever the training script produced.
            X[0, row, len(chars)+3] = timesincemidnight.seconds/86400
            X[0, row, len(chars)+4] = times3[t].weekday()/7
        return X

    def getSymbol(predictions, ith_best=0):
        """Return the target symbol with the ``ith_best`` highest score (0 = best)."""
        i = np.argsort(predictions)[len(predictions) - ith_best - 1]
        return target_indices_char[i]

    # Remaining-time pairs collected whenever an end of case is predicted.
    one_ahead_gt = []
    one_ahead_pred = []

    # Binary mode is what the Python 2 csv module expects.
    with open('output_files/results/'+formulaType+'/suffix_and_remaining_time0_%s' % eventlog, 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow(["Prefix length", "Groud truth", "Predicted",
                             "Levenshtein", "Damerau", "Jaccard",
                             "Ground truth times", "Predicted times", "RMSE",
                             "MAE", "Median AE"])

        for prefix_size in range(prefix_size_pred_from, prefix_size_pred_to):
            lines_s, lines_t_s, lines_t2_s, lines_t3_s = \
                selectFormulaVerifiedTraces(lines, lines_t, lines_t2,
                                            lines_t3, formula, prefix_size)
            print(prefix_size)
            print("formulas verifited: " + str(len(lines_s)) +
                  " out of : " + str(len(lines)))

            for line, times, times2, times3 in izip(lines_s, lines_t_s,
                                                    lines_t2_s, lines_t3_s):
                # Kept from the original: a trailing 0 is appended to the
                # trace's time list in place before it is cropped.
                times.append(0)
                cropped_line = ''.join(line[:prefix_size])
                cropped_times = times[:prefix_size]
                cropped_times3 = times3[:prefix_size]
                if len(times2) < prefix_size:
                    # make no prediction for this case, since this case has
                    # ended already
                    continue

                ground_truth = ''.join(
                    line[prefix_size:prefix_size+predict_size])
                # Remaining time: last value of times2 minus the value at the
                # final prefix position.
                ground_truth_t = times2[len(times2)-1] - times2[prefix_size-1]

                predicted = ''
                total_predicted_time = 0
                for _ in range(predict_size):
                    enc = encode(cropped_line, cropped_times, cropped_times3)
                    y = model.predict(enc, verbose=0)  # make predictions
                    # split predictions into separate activity and time
                    # predictions
                    y_char = y[0][0]
                    y_t = y[1][0][0]
                    prediction = getSymbol(y_char)  # undo one-hot encoding
                    cropped_line += prediction
                    if y_t < 0:
                        y_t = 0
                    cropped_times.append(y_t)
                    if prediction == '!':
                        # end of case was just predicted, therefore, stop
                        # predicting further into the future
                        one_ahead_pred.append(total_predicted_time)
                        one_ahead_gt.append(ground_truth_t)
                        print('! predicted, end case')
                        break
                    y_t = y_t * divisor3
                    cropped_times3.append(cropped_times3[-1] +
                                          timedelta(seconds=y_t))
                    total_predicted_time = total_predicted_time + y_t
                    predicted += prediction

                if len(ground_truth) > 0:
                    levenshtein_sim = 1 - distance.nlevenshtein(predicted,
                                                                ground_truth)
                    # float() keeps this a true division even where int / int
                    # truncates (Python 2 without __future__.division).
                    dls = 1 - (damerau_levenshtein_distance(
                        unicode(predicted), unicode(ground_truth)) /
                               float(max(len(predicted), len(ground_truth))))
                    if dls < 0:
                        # we encountered problems with Damerau-Levenshtein
                        # Similarity on some linux machines where the default
                        # character encoding of the operating system caused
                        # it to be negative, this should never be the case
                        dls = 0
                    jaccard_sim = 1 - distance.jaccard(predicted, ground_truth)
                    mae = metrics.mean_absolute_error([ground_truth_t],
                                                      [total_predicted_time])
                    median_ae = metrics.median_absolute_error(
                        [ground_truth_t], [total_predicted_time])

                    spamwriter.writerow([
                        prefix_size,
                        unicode(ground_truth).encode("utf-8"),
                        unicode(predicted).encode("utf-8"),
                        levenshtein_sim,
                        dls,
                        jaccard_sim,
                        ground_truth_t,
                        total_predicted_time,
                        '',  # the RMSE column is left empty
                        mae,
                        median_ae,
                    ])

    print("TIME TO FINISH --- %s seconds ---" % (time.time() - start_time))