def DL_Distance(str1, str2):
    """Print several string distance measures for a pair of strings.

    Printed, in order: the two inputs, ``distance.nlevenshtein``, the raw
    Damerau-Levenshtein distance, that distance divided by the length of the
    longer input, and ``distance.jaccard``.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        None. All results are printed.
    """
    print(str1, str2)
    print("distance 1: ", distance.nlevenshtein(str1, str2))

    # Compute the edit distance once and reuse it for the normalised value.
    dl_distance = damerau_levenshtein_distance(str1, str2)
    print("distance 2: ", dl_distance)

    # Two empty strings have no length to normalise by; report 0.0 instead of
    # dividing by zero. float() keeps true division under Python 2 as well.
    longest = max(len(str1), len(str2))
    dls = dl_distance / float(longest) if longest else 0.0
    print("distance 3: ", dls)

    print("distance 4: ", distance.jaccard(str1, str2))
def dameraulevenshtein(seq1, seq2):
    """Calculate the Damerau-Levenshtein distance between sequences.

    This distance is the number of operations (consisting of insertions,
    deletions or substitutions of a single character, or transposition of two
    adjacent characters) required to change one sequence into the other.

    Arguments may be text or UTF-8 encoded byte strings; byte strings are
    decoded before comparison.

    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2
    >>> dameraulevenshtein(u'abcd', u'bacde')
    2
    >>> dameraulevenshtein(u'number e', u'number \u03c0')
    1
    """
    # On Python 2 ``bytes`` is ``str``, so this decodes exactly the inputs the
    # former ``unicode(seq, 'utf-8')`` call handled. On Python 3 text passes
    # through untouched and only real byte strings are decoded.
    if isinstance(seq1, bytes):
        seq1 = seq1.decode('utf-8')
    if isinstance(seq2, bytes):
        seq2 = seq2.decode('utf-8')

    # Fall back onto Python implementation for code points unsupported by the C
    # implementation.
    # https://github.com/jamesturk/jellyfish/issues/55#issuecomment-312509263
    try:
        return jellyfish.damerau_levenshtein_distance(seq1, seq2)
    except ValueError:
        return py_jellyfish.damerau_levenshtein_distance(seq1, seq2)
def mapperSimilarity(self, _, line):
    """Emit a word pair when its normalised similarity exceeds 0.8.

    ``line`` is split on ';'; the first two fields are compared with
    ``damerau_levenshtein_distance`` and the result is normalised through
    ``self.normalizeDistanceIndex``.

    Yields:
        ``(first_word, [second_word, similarity])`` when the similarity is
        strictly greater than the threshold; nothing otherwise.
    """
    threshold = 0.8
    fields = line.split(';')
    first = fields[0]
    second = fields[1]

    edit_distance = damerau_levenshtein_distance(first, second)
    score = self.normalizeDistanceIndex(len(first), len(second), edit_distance)

    # Written as "not greater than" so the comparison matches the original
    # for every value the normaliser may return.
    if not (score > threshold):
        return
    yield (first, [second, score])
def match(data1, data2, fields1, fields2):
    """Pair up records whose compared fields are all close enough.

    Every record of ``data1`` is compared with every record of ``data2``.
    For each ``(field1, field2)`` pair from ``zip(fields1, fields2)`` the
    Damerau-Levenshtein distance of the two values is divided by the length
    of the longer value; the records match when no pair exceeds 0.4.

    Args:
        data1: Mapping of key -> record (mapping of field name -> string).
        data2: Mapping of key -> record (mapping of field name -> string).
        fields1: Field names to read from records of ``data1``.
        fields2: Field names to read from records of ``data2``.

    Returns:
        List of ``(data1key, data2key)`` tuples for all matching pairs.
    """
    threshold = 0.4
    matches = []
    for data1key, data1values in data1.items():
        for data2key, data2values in data2.items():
            is_match = True
            for field1, field2 in zip(fields1, fields2):
                value1 = data1values[field1]
                value2 = data2values[field2]
                longest = max(len(value1), len(value2))
                if longest == 0:
                    # Two empty values are identical; dividing by their
                    # length would raise ZeroDivisionError.
                    continue
                dist = jellyfish.damerau_levenshtein_distance(value1, value2)
                if dist / float(longest) > threshold:
                    is_match = False
                    break  # one mismatching field already decides the pair
            if is_match:
                matches.append((data1key, data2key))
    return matches
def match(data1, data2, fields1, fields2):
    """Pair up records where at least one compared field is similar enough.

    Every record of ``data1`` is compared with every record of ``data2``.
    For each ``(field1, field2)`` pair from ``zip(fields1, fields2)`` a
    similarity degree ``1 - distance / longest_length`` is computed from the
    Damerau-Levenshtein distance; the records match when any degree is
    strictly greater than 0.6.

    Args:
        data1: Mapping of key -> record (mapping of field name -> string).
        data2: Mapping of key -> record (mapping of field name -> string).
        fields1: Field names to read from records of ``data1``.
        fields2: Field names to read from records of ``data2``.

    Returns:
        List of ``(data1key, data2key, degree)`` tuples. ``degree`` is the
        value computed for the last compared field pair.
    """
    threshold = 0.6
    matches = []
    for data1key, data1values in data1.items():
        for data2key, data2values in data2.items():
            matched = False
            degree = None
            for field1, field2 in zip(fields1, fields2):
                value1 = data1values[field1]
                value2 = data2values[field2]
                longest = max(len(value1), len(value2))
                if longest == 0:
                    # Two empty values are identical; avoid dividing by zero.
                    degree = 1.0
                else:
                    dist = jellyfish.damerau_levenshtein_distance(value1, value2)
                    degree = 1 - dist / float(longest)
                if degree > threshold:
                    matched = True
            if matched:
                # NOTE(review): with several field pairs this reports the degree
                # of the last pair, which is not necessarily the pair that
                # exceeded the threshold -- confirm this is intended.
                matches.append((data1key, data2key, degree))
    return matches
def mapperSimilarity(self, key, data):
    """Score a pair of strings and fan the result out to three downstream keys.

    ``key`` selects the measure: "dl" uses ``damerau_levenshtein_distance``
    normalised by ``self.normalizeDistanceIndex``; "rat" uses
    ``SequenceMatcher(...).ratio()``. Any other key yields nothing.

    Yields:
        ``(out_key, [data[0], data[1], similarity])`` for each of "ap", "cos"
        and "jw", but only when the similarity is strictly greater than 0.8.
    """
    if key == "dl":
        edit_distance = damerau_levenshtein_distance(data[0], data[1])
        score = self.normalizeDistanceIndex(
            len(data[0]), len(data[1]), edit_distance)
    elif key == "rat":
        score = SequenceMatcher(a=data[0], b=data[1]).ratio()
    else:
        return

    # Both measures share the same threshold.
    if not (score > 0.8):
        return

    for out_key in ("ap", "cos", "jw"):
        yield (out_key, [data[0], data[1], score])
def levenshtein(s1, s2):
    """Return the Damerau-Levenshtein distance between two values.

    Despite the name, the distance is computed with
    ``jellyfish.damerau_levenshtein_distance`` (transpositions count as one
    edit). Falsy arguments (``None``, ``""``, ...) are treated as the empty
    string, and both arguments are passed through ``str()`` before comparison.

    Args:
        s1: First value.
        s2: Second value.

    Returns:
        The edit distance as an integer. When either side is empty this is
        simply the length of the other side.
    """
    s1 = s1 or ""
    s2 = s2 or ""

    # Keep the longer value in s1, as the original recursive swap did.
    if len(s1) < len(s2):
        s1, s2 = s2, s1

    if len(s2) == 0:
        return len(s1)

    try:
        return jellyfish.damerau_levenshtein_distance(str(s1), str(s2))
    except Exception:
        # Workaround for unicode: fall back from the C to the Python version.
        # ``Exception`` keeps the best-effort fallback without also swallowing
        # KeyboardInterrupt / SystemExit like a bare ``except`` would.
        return py_jellyfish.damerau_levenshtein_distance(str(s1), str(s2))
def runExperiments(logIdentificator, formulaType):
    """Predict case suffixes and remaining time with a formula-aware beam search.

    Settings come from ``activateSettings``, test data from
    ``prepare_testing_data`` and the network from ``load_model``. For every
    prefix length in ``range(prefix_size_pred_from, prefix_size_pred_to)`` the
    traces returned by ``selectFormulaVerifiedTraces`` are extended step by
    step, and one CSV row per trace with a non-empty ground-truth suffix is
    written to
    ``output_files/results/<formulaType>/suffix_and_remaining_time3_<eventlog>``.

    Args:
        logIdentificator: Passed through to ``activateSettings``.
        formulaType: Passed through to ``activateSettings``; also used as the
            results sub-directory name.

    Returns:
        None. Results go to the CSV file; progress is printed.

    NOTE(review): the block nesting below was reconstructed from statement
    order -- confirm it against the version-controlled original.
    """
    eventlog, path_to_model_file, beam_size, \
        prefix_size_pred_from, prefix_size_pred_to, formula = activateSettings(
            logIdentificator, formulaType)

    start_time = time.time()

    lines, lines_t, lines_t2, lines_t3, maxlen, chars, char_indices, divisor, divisor2, \
        divisor3, predict_size, target_indices_char, target_char_indices \
        = prepare_testing_data(eventlog)

    one_ahead_gt = []
    one_ahead_pred = []

    # Neutral amplifier until amplify() supplies a value (from the second step on).
    stop_symbol_probability_amplifier_current = 1

    # load model, set this to the model generated by train.py
    model = load_model(path_to_model_file)

    class NodePrediction():
        """Beam-search node: encoded state, trace so far, accumulated time, score."""

        def __init__(self, data, cropped_line, total_predicted_time, probability_of=0):
            self.data = data
            self.cropped_line = cropped_line
            self.total_predicted_time = total_predicted_time
            self.probability_of = probability_of

    # make predictions
    with open(
            'output_files/results/' + formulaType +
            '/suffix_and_remaining_time3_%s' % eventlog, 'wb') as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=',',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow([
            "Prefix length", "Groud truth", "Predicted", "Levenshtein",
            "Damerau", "Jaccard", "Ground truth times", "Predicted times",
            "RMSE", "MAE", "Median AE"
        ])
        for prefix_size in range(prefix_size_pred_from, prefix_size_pred_to):
            print(prefix_size)

            lines_s, lines_t_s, lines_t2_s, lines_t3_s = selectFormulaVerifiedTraces(
                lines, lines_t, lines_t2, lines_t3, formula, prefix_size)
            print("prefix size: " + str(prefix_size))
            print("formulas verifited: " + str(len(lines_s)) + " out of : " +
                  str(len(lines)))
            counterr = 0
            for line, times, times2, times3 in izip(lines_s, lines_t_s,
                                                    lines_t2_s, lines_t3_s):
                times.append(0)
                cropped_line = ''.join(line[:prefix_size])
                cropped_times = times[:prefix_size]
                cropped_times3 = times3[:prefix_size]
                if len(times2) < prefix_size:
                    continue  # make no prediction for this case, since this case has ended already

                # initialize root of the tree for beam search
                total_predicted_time_initialization = 0
                search_node_root = NodePrediction(
                    encode(cropped_line, cropped_times, cropped_times3, maxlen,
                           chars, char_indices, divisor, divisor2),
                    cropped_line, total_predicted_time_initialization)

                ground_truth = ''.join(line[prefix_size:prefix_size + predict_size])
                ground_truth_t = times2[prefix_size - 1]
                case_end_time = times2[len(times2) - 1]
                ground_truth_t = case_end_time - ground_truth_t
                predicted = ''

                # Scores are stored negated, so the queue returns the
                # highest-scoring node first.
                queue_next_steps = PriorityQueue()
                queue_next_steps.put(
                    (-search_node_root.probability_of, search_node_root))

                queue_next_steps_future = PriorityQueue()
                start_of_the_cycle_symbol = " "
                found_sattisfying_constraint = False

                current_beam_size = beam_size
                for i in range(predict_size):
                    for k in range(current_beam_size):
                        if queue_next_steps.empty():
                            break

                        _, current_prediction_premis = queue_next_steps.get()

                        if not found_sattisfying_constraint:
                            if verify_formula_as_compliant(
                                    current_prediction_premis.cropped_line,
                                    formula, prefix_size):
                                # the formula verified and we can just finish the predictions
                                # beam size is 1 because predict only sequence of events
                                current_beam_size = 1
                                # overwrite new queue
                                queue_next_steps_future = PriorityQueue()
                                found_sattisfying_constraint = True

                        enc = current_prediction_premis.data
                        temp_cropped_line = current_prediction_premis.cropped_line
                        y = model.predict(enc, verbose=0)  # make predictions
                        # split predictions into seperate activity and time predictions
                        y_char = y[0][0]
                        y_t = y[1][0][0]

                        if y_t < 0:
                            y_t = 0
                        cropped_times.append(y_t)

                        if not i == 0:
                            stop_symbol_probability_amplifier_current, start_of_the_cycle_symbol = amplify(
                                temp_cropped_line)

                        # in not reached, function :choose_next_top_descendant: will backtrack
                        y_t = y_t * divisor3
                        cropped_times3.append(cropped_times3[-1] +
                                              timedelta(seconds=y_t))

                        for j in range(current_beam_size):
                            temp_prediction = getSymbolAmpl(
                                y_char, target_indices_char,
                                target_char_indices, start_of_the_cycle_symbol,
                                stop_symbol_probability_amplifier_current, j)

                            if temp_prediction == '!':
                                # end of case was just predicted, therefore, stop predicting further into the future
                                if verify_formula_as_compliant(
                                        temp_cropped_line, formula, prefix_size):
                                    one_ahead_pred.append(
                                        current_prediction_premis.total_predicted_time)
                                    one_ahead_gt.append(ground_truth_t)
                                    stop_symbol_probability_amplifier_current = 1
                                    print('! predicted, end case')
                                    # Emptying the queue ends the search for this trace.
                                    queue_next_steps = PriorityQueue()
                                    break
                                else:
                                    continue

                            temp_cropped_line = current_prediction_premis.cropped_line + temp_prediction
                            temp_total_predicted_time = current_prediction_premis.total_predicted_time + y_t
                            temp_state_data = encode(temp_cropped_line,
                                                     cropped_times,
                                                     cropped_times3, maxlen,
                                                     chars, char_indices,
                                                     divisor, divisor2)
                            # j-th largest activity probability
                            probability_this = np.sort(y_char)[len(y_char) - 1 - j]

                            temp = NodePrediction(
                                temp_state_data, temp_cropped_line,
                                temp_total_predicted_time,
                                current_prediction_premis.probability_of +
                                np.log(probability_this))

                            queue_next_steps_future.put(
                                (-temp.probability_of, temp))

                    queue_next_steps = queue_next_steps_future
                    queue_next_steps_future = PriorityQueue()

                counterr += 1

                # NOTE(review): every node popped above is a NodePrediction, so
                # this branch looks unreachable in the visible code -- confirm.
                if current_prediction_premis is None:
                    print("Cannot find any trace that is compliant with formula given current beam size")
                    break

                output = []

                if current_prediction_premis is None:
                    predicted = u""
                    total_predicted_time = 0
                else:
                    predicted = (
                        current_prediction_premis.cropped_line[prefix_size:])
                    total_predicted_time = current_prediction_premis.total_predicted_time

                if len(ground_truth) > 0:
                    output.append(prefix_size)
                    output.append(unicode(ground_truth).encode("utf-8"))
                    output.append(unicode(predicted).encode("utf-8"))
                    output.append(
                        1 - distance.nlevenshtein(predicted, ground_truth))
                    # float() keeps this a true ratio even when the module does
                    # not enable ``from __future__ import division``.
                    longest = max(len(predicted), len(ground_truth))
                    dls = 1 - (damerau_levenshtein_distance(
                        unicode(predicted), unicode(ground_truth)) / float(longest))
                    if dls < 0:
                        # we encountered problems with Damerau-Levenshtein Similarity on some linux
                        # machines where the default character encoding of the operating system
                        # caused it to be negative, this should never be the case
                        dls = 0
                    output.append(dls)
                    output.append(1 - distance.jaccard(predicted, ground_truth))
                    output.append(ground_truth_t)
                    output.append(total_predicted_time)
                    output.append('')  # the "RMSE" column is left empty
                    output.append(
                        metrics.mean_absolute_error([ground_truth_t],
                                                    [total_predicted_time]))
                    output.append(
                        metrics.median_absolute_error([ground_truth_t],
                                                      [total_predicted_time]))
                    spamwriter.writerow(output)
    print("TIME TO FINISH --- %s seconds ---" % (time.time() - start_time))
def test_damerau_levenshtein_distance(jf, s1, s2, value): value = int(value) assert jf.damerau_levenshtein_distance(s1, s2) == value
def test_damerau_levenshtein_distance_type(jf):
    """Text input is accepted; bytes input must raise a TypeError.

    The error message is expected to contain the word 'expected'.
    """
    text = u'abc'
    raw = b'abc'

    # Must not raise for text input.
    jf.damerau_levenshtein_distance(text, text)

    with pytest.raises(TypeError) as excinfo:
        jf.damerau_levenshtein_distance(raw, raw)

    message = str(excinfo.value)
    assert 'expected' in message
def test(args, preprocess_manager):
    """Predict suffix and remaining time for every case prefix and write a CSV.

    The test set comes from ``preprocess_manager.create_test_set()`` and the
    model from ``<checkpoint_dir>model_suffix_prediction_<fold>.h5``. For each
    case and each prefix length from 2 up to ``sequence_max_length - 1`` a
    prediction is made (with or without next-best-action, depending on
    ``args.next_best_action``) and one ';'-separated row is written to
    ``<result_dir><data_set>__<task>_<fold>.csv``.

    Args:
        args: Run configuration; reads ``result_dir``, ``task``,
            ``checkpoint_dir``, ``data_set`` and ``next_best_action``.
        preprocess_manager: Supplies the test set, the cross-validation fold
            index and the candidate conformance check.

    Returns:
        None. Results are written to the CSV file.

    NOTE(review): the block nesting below was reconstructed from statement
    order -- confirm it against the version-controlled original.
    """
    result_dir = args.result_dir
    task = args.task

    # get test set
    if preprocess_manager.num_features_additional > 0:
        lines, caseids, lines_t, lines_t2, lines_t3, lines_add, sequence_max_length, \
            num_features_all, num_features_activities = preprocess_manager.create_test_set()
    else:
        lines, caseids, lines_t, lines_t2, lines_t3, sequence_max_length, \
            num_features_all, num_features_activities = preprocess_manager.create_test_set()
        # Without additional features there is no lines_add; the loop below
        # still zips over it, which previously raised NameError. One empty
        # list per case keeps the zip aligned.
        # NOTE(review): assumes the prediction helpers ignore an empty
        # "line_add" when there are no additional features -- confirm.
        lines_add = [[] for _ in lines]

    # load model
    model_suffix_prediction = load_model(
        '%smodel_suffix_prediction_%s.h5' %
        (args.checkpoint_dir, preprocess_manager.iteration_cross_validation))

    # set options for result output
    data_set_name = args.data_set.split('.csv')[0]
    generic_result_dir = result_dir + data_set_name + "__" + task
    fold_result_dir = generic_result_dir + "_%d%s" % (
        preprocess_manager.iteration_cross_validation, ".csv")
    result_dir = fold_result_dir

    # start prediction
    with open(result_dir, 'w') as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=';',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow([
            "CaseID", "Prefix length", "Ground truth", "Predicted",
            "Levenshtein", "Damerau", "Jaccard", "Ground truth times",
            "Predicted times", "MAE", "In time", "Dev. in time",
            "Num corrections"
        ])

        for line, caseid, times, times2, times3, line_add in zip(
                lines, caseids, lines_t, lines_t2, lines_t3, lines_add):

            # for each prefix of a case with a size > 1
            for prefix_size in range(2, sequence_max_length):
                num_corrections = 0

                util.llprint("\nPrefix size: %d\n" % prefix_size)

                # current = ground truth prefix + predicted suffix
                current = {
                    "line": ''.join(line[:prefix_size]),
                    "times": times[:prefix_size],
                    "times2": times2[:prefix_size],
                    "times3": times3[:prefix_size],
                    "line_add": line_add[:prefix_size],
                }

                # termination: the prefix already contains the end symbol
                if '!' in current["line"]:
                    break

                ground_truth = {
                    "total_event": ''.join(line[:]),
                    "prefix_event": ''.join(line[:prefix_size]),
                    "suffix_event": ''.join(line[prefix_size:]),
                    "total_time": times2[len(times2) - 1],
                    "prefix_time": times2[prefix_size - 1],
                    "suffix_time": times2[len(times2) - 1] - times2[prefix_size - 1]
                }

                predict = {
                    "size": sequence_max_length - 1,
                    "predicted": '',
                    "suffix_time": 0
                }

                # result for each prefix of a case
                if args.next_best_action:
                    # check prefix conformance
                    if preprocess_manager.checkCandidate(
                            args,
                            preprocess_manager.transformNewInstance(
                                ground_truth["prefix_event"])):
                        predict, in_time, deviation_in_time, num_corrections = predictSuffixAndTimeForPrefixNextBestEvent(
                            args, model_suffix_prediction, preprocess_manager,
                            current, predict, ground_truth, num_corrections)
                    else:
                        break
                else:
                    predict, in_time, deviation_in_time = predictSuffixAndTimeForPrefix(
                        args, model_suffix_prediction, preprocess_manager,
                        current, predict, ground_truth)

                # termination: nothing was predicted for this prefix
                if predict["predicted"] == "":
                    continue

                output = []
                if len(ground_truth["suffix_event"]) > 0:
                    output.append(caseid)
                    output.append(prefix_size)
                    # NOTE(review): on Python 3 these are bytes objects, which
                    # csv writes as "b'...'" -- confirm downstream consumers
                    # expect that before changing it.
                    output.append(
                        str(ground_truth["suffix_event"]).encode("utf-8"))
                    output.append(str(predict["predicted"]).encode("utf-8"))
                    output.append(1 - distance.nlevenshtein(
                        predict["predicted"], ground_truth["suffix_event"]))
                    dls = 1 - (damerau_levenshtein_distance(
                        str(predict["predicted"]),
                        str(ground_truth["suffix_event"])) /
                               max(len(predict["predicted"]),
                                   len(ground_truth["suffix_event"])))
                    if dls < 0:
                        dls = 0
                    output.append(dls)
                    output.append(1 - distance.jaccard(
                        predict["predicted"], ground_truth["suffix_event"]))
                    output.append(ground_truth["suffix_time"])
                    output.append(predict["suffix_time"])
                    output.append(
                        metrics.mean_absolute_error(
                            [ground_truth["suffix_time"]],
                            [predict["suffix_time"]]))
                    output.append(in_time)
                    output.append(deviation_in_time)
                    output.append(num_corrections if num_corrections > 0 else 0)
                    spamwriter.writerow(output)
def run_experiments(log_identificator, formula_type, rnn_type):
    """Run the baseline (greedy) suffix, group and remaining-time prediction.

    Settings come from ``activate_settings``, test data from
    ``prepare_testing_data`` and the network from ``load_model``. For every
    prefix length in ``range(prefix_size_pred_from, prefix_size_pred_to)`` the
    traces returned by ``select_declare_verified_traces`` are extended one
    symbol at a time using the most probable prediction, and one CSV row per
    trace with a non-empty ground-truth suffix is written to
    ``output_files/final_experiments/results/baseline/<eventlog>_<rnn_type>.csv``.

    Args:
        log_identificator: Passed through to ``activate_settings``.
        formula_type: Passed through to ``activate_settings``.
        rnn_type: ``"CF"`` or ``"CFR"``; selects which model file is loaded.

    Returns:
        None. Results go to the CSV file; progress is printed.

    Raises:
        ValueError: If ``rnn_type`` is neither ``"CF"`` nor ``"CFR"``.

    NOTE(review): the block nesting below was reconstructed from statement
    order -- confirm it against the version-controlled original.
    """
    eventlog, \
        path_to_model_file_cf, \
        path_to_model_file_cfr, \
        path_to_declare_model_file, \
        beam_size, \
        prefix_size_pred_from, \
        prefix_size_pred_to, \
        formula = activate_settings(log_identificator, formula_type)

    if rnn_type == "CF":
        path_to_model_file = path_to_model_file_cf
    elif rnn_type == "CFR":
        path_to_model_file = path_to_model_file_cfr
    else:
        # Previously this surfaced only later as an unbound-variable error,
        # after the expensive data preparation had already run.
        raise ValueError("unsupported rnn_type: %r (expected 'CF' or 'CFR')" % (rnn_type,))

    start_time = time.time()

    # prepare the data N.B. maxlen == predict_size
    lines, \
        lines_id, \
        lines_group, \
        lines_t, \
        lines_t2, \
        lines_t3, \
        lines_t4, \
        maxlen, \
        chars, \
        chars_group, \
        char_indices, \
        char_indices_group, \
        divisor, \
        divisor2, \
        divisor3, \
        predict_size, \
        target_indices_char, \
        target_indices_char_group, \
        target_char_indices, \
        target_char_indices_group = prepare_testing_data(eventlog)

    # load model, set this to the model generated by train.py
    model = load_model(path_to_model_file)

    # define helper functions

    def encode(sentence, sentence_group, times_enc, times3_enc, maxlen_enc=maxlen):
        """Encode the current sentence into the left-padded model input array.

        Per position: one-hot activity, one-hot group, then five numeric
        features (position, time since previous event, cumulative time,
        time since midnight, weekday).
        """
        num_features = len(chars) + len(chars_group) + 5
        x = np.zeros((1, maxlen_enc, num_features), dtype=np.float32)
        leftpad = maxlen_enc - len(sentence)
        times2_enc = np.cumsum(times_enc)
        for v, char in enumerate(sentence):
            midnight = times3_enc[v].replace(hour=0, minute=0, second=0, microsecond=0)
            timesincemidnight = times3_enc[v] - midnight
            for c in chars:
                if c == char:
                    x[0, v + leftpad, char_indices[c]] = 1
            for g in chars_group:
                if g == sentence_group[v]:
                    x[0, v + leftpad, len(char_indices) + char_indices_group[g]] = 1
            x[0, v + leftpad, len(chars) + len(chars_group)] = v + 1
            x[0, v + leftpad, len(chars) + len(chars_group) + 1] = times_enc[v] / divisor
            x[0, v + leftpad, len(chars) + len(chars_group) + 2] = times2_enc[v] / divisor2
            x[0, v + leftpad, len(chars) + len(chars_group) + 3] = timesincemidnight.seconds / 86400
            x[0, v + leftpad, len(chars) + len(chars_group) + 4] = times3_enc[v].weekday() / 7
        return x

    # modify to be able to get second best prediction
    def get_symbol(predictions, vth_best=0):
        """Return the activity symbol with the (vth_best + 1)-th highest score."""
        v = np.argsort(predictions)[len(predictions) - vth_best - 1]
        return target_indices_char[v]

    def get_symbol_group(predictions, vth_best=0):
        """Return the group symbol with the (vth_best + 1)-th highest score."""
        v = np.argsort(predictions)[len(predictions) - vth_best - 1]
        return target_indices_char_group[v]

    one_ahead_gt = []
    one_ahead_pred = []

    with open(
            'output_files/final_experiments/results/baseline/%s_%s.csv' %
            (eventlog[:-4], rnn_type), 'wb') as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=',',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow([
            "Prefix length", "Ground truth", "Predicted", "Levenshtein",
            "Damerau", "Jaccard", "Ground truth times", "Predicted times",
            "RMSE", "MAE", "Median AE", "Ground Truth Group",
            "Predicted Group", "Levenshtein Group"
        ])
        for prefix_size in range(prefix_size_pred_from, prefix_size_pred_to):
            lines_s, \
                lines_id_s, \
                lines_group_s, \
                lines_t_s, \
                lines_t2_s, \
                lines_t3_s, \
                lines_t4_s = select_declare_verified_traces(path_to_declare_model_file,
                                                            lines,
                                                            lines_id,
                                                            lines_group,
                                                            lines_t,
                                                            lines_t2,
                                                            lines_t3,
                                                            lines_t4,
                                                            prefix_size)
            print(prefix_size)
            print("formulas verified: " + str(len(lines_s)) + " out of : " +
                  str(len(lines)))
            for line, line_id, line_group, times, times2, times3, times4 in zip(
                    lines_s, lines_id_s, lines_group_s, lines_t_s, lines_t2_s,
                    lines_t3_s, lines_t4_s):
                times.append(0)
                cropped_line = ''.join(line[:prefix_size])
                cropped_line_group = ''.join(line_group[:prefix_size])
                cropped_times = times[:prefix_size]
                cropped_times3 = times3[:prefix_size]
                cropped_times4 = times4[:prefix_size]
                if len(times2) < prefix_size:
                    continue  # make no prediction for this case, since this case has ended already
                ground_truth = ''.join(line[prefix_size:prefix_size + predict_size])
                ground_truth_group = ''.join(
                    line_group[prefix_size:prefix_size + predict_size])
                ground_truth_t = times2[prefix_size - 1]
                case_end_time = times2[len(times2) - 1]
                ground_truth_t = case_end_time - ground_truth_t
                predicted = ''
                predicted_group = ''
                total_predicted_time = 0
                for i in range(predict_size):
                    enc = encode(cropped_line, cropped_line_group,
                                 cropped_times, cropped_times3)
                    y = model.predict(enc, verbose=0)  # make predictions
                    # split predictions into seperate activity and time predictions
                    y_char = y[0][0]
                    y_group = y[1][0]
                    y_t = y[2][0][0]
                    prediction = get_symbol(y_char)  # undo one-hot encoding
                    prediction_group = get_symbol_group(y_group)  # undo one-hot encoding
                    cropped_line += prediction
                    cropped_line_group += prediction_group

                    # adds a fake timestamp to the list (2000 seconds after the last one)
                    t = time.strptime(cropped_times4[-1], "%Y-%m-%d %H:%M:%S")
                    new_timestamp = datetime.fromtimestamp(
                        time.mktime(t)) + timedelta(0, 2000)
                    cropped_times4.append(
                        new_timestamp.strftime("%Y-%m-%d %H:%M:%S"))

                    if y_t < 0:
                        y_t = 0
                    cropped_times.append(y_t)
                    # end of case was just predicted, therefore, stop predicting further into the future
                    if prediction == '!':
                        one_ahead_pred.append(total_predicted_time)
                        one_ahead_gt.append(ground_truth_t)
                        print('! predicted, end case')
                        break
                    y_t = y_t * divisor3
                    cropped_times3.append(cropped_times3[-1] +
                                          timedelta(seconds=y_t))
                    total_predicted_time = total_predicted_time + y_t
                    predicted += prediction
                    predicted_group += prediction_group
                output = []
                if len(ground_truth) > 0:
                    output.append(prefix_size)
                    output.append(unicode(ground_truth).encode("utf-8"))
                    output.append(unicode(predicted).encode("utf-8"))
                    output.append(
                        1 - distance.nlevenshtein(predicted, ground_truth))
                    # float() keeps this a true ratio even when the module does
                    # not enable ``from __future__ import division``.
                    longest = max(len(predicted), len(ground_truth))
                    dls = 1 - (damerau_levenshtein_distance(
                        unicode(predicted), unicode(ground_truth)) / float(longest))
                    # we encountered problems with Damerau-Levenshtein Similarity on some linux machines where
                    # the default character encoding of the operating system caused it to be negative,
                    # this should never be the case
                    if dls < 0:
                        dls = 0
                    output.append(dls)
                    output.append(1 - distance.jaccard(predicted, ground_truth))
                    output.append(ground_truth_t)
                    output.append(total_predicted_time)
                    output.append('')  # the "RMSE" column is left empty
                    output.append(
                        metrics.mean_absolute_error([ground_truth_t],
                                                    [total_predicted_time]))
                    output.append(
                        metrics.median_absolute_error([ground_truth_t],
                                                      [total_predicted_time]))
                    output.append(unicode(ground_truth_group).encode("utf-8"))
                    output.append(unicode(predicted_group).encode("utf-8"))
                    output.append(1 - distance.nlevenshtein(
                        predicted_group, ground_truth_group))
                    spamwriter.writerow(output)
    print("TIME TO FINISH --- %s seconds ---" % (time.time() - start_time))
total_predicted_time = total_predicted_time + y_t predicted += prediction output = [] if len(ground_truth) > 0: output.append(caseid) output.append(prefix_size) output.append(ground_truth) output.append(predicted) #print('predicted: ' , predicted) #print('ground_truth: ' , ground_truth) output.append(1 - distance.nlevenshtein(predicted, ground_truth)) #print('distaance nlevenshtein: ' , distance.nlevenshtein(predicted, ground_truth)) #dls = 1 - (damerau_levenshtein_distance(unicode(predicted), unicode(ground_truth)) / max(len(predicted),len(ground_truth))) dls = 1 - ( damerau_levenshtein_distance(predicted, ground_truth) / max(len(predicted), len(ground_truth))) #print('distaance damerau_levenshtein_distance: ' , damerau_levenshtein_distance(predicted, ground_truth)) #print( 'max ',max(len(predicted),len(ground_truth))) #print( 'jaccard ',distance.jaccard(predicted, ground_truth)) #print( 'cos ',cossim(predicted, ground_truth)) if dls < 0: dls = 0 # we encountered problems with Damerau-Levenshtein Similarity on some linux machines where the default character encoding of the operating system caused it to be negative, this should never be the case output.append(dls) output.append(1 - distance.jaccard(predicted, ground_truth)) output.append(ground_truth_t) output.append(total_predicted_time) output.append('') output.append( metrics.mean_absolute_error([ground_truth_t],
def damerau_levenshtein_distance(a, b):
    """Return the Damerau-Levenshtein distance between ``a`` and ``b``.

    The ``jellyfish`` implementation is tried first; when it raises
    ``ValueError`` the ``py_jellyfish`` implementation is used instead.
    """
    try:
        result = jellyfish.damerau_levenshtein_distance(a, b)
    except ValueError:
        # c implementation can't deal with unicode, fall back to (slower) python
        result = py_jellyfish.damerau_levenshtein_distance(a, b)
    return result
def runExperiments(logIdentificator, formulaType):
    """Run the baseline (greedy) suffix and remaining-time prediction.

    Reads ``../data/<eventlog>`` (columns used: case id, activity, timestamp
    in ``%Y-%m-%d %H:%M:%S``), rebuilds the character vocabulary from the first
    two thirds of the cases, and evaluates on the remaining cases. For every
    prefix length in ``range(prefix_size_pred_from, prefix_size_pred_to)`` the
    traces returned by ``selectFormulaVerifiedTraces`` are extended one symbol
    at a time with the most probable prediction, and one CSV row per trace
    with a non-empty ground-truth suffix is written to
    ``output_files/results/<formulaType>/suffix_and_remaining_time0_<eventlog>``.

    Args:
        logIdentificator: Passed through to ``activateSettings``.
        formulaType: Passed through to ``activateSettings``; also used as the
            results sub-directory name.

    Returns:
        None. Results go to the CSV file; progress is printed.

    NOTE(review): the block nesting below was reconstructed from statement
    order -- confirm it against the version-controlled original.
    """
    eventlog, path_to_model_file, beam_size, \
        prefix_size_pred_from, prefix_size_pred_to, formula = activateSettings(
            logIdentificator, formulaType)

    start_time = time.time()

    lastcase = ''
    line = ''
    firstLine = True
    lines = []
    timeseqs = []  # relative time since previous event
    timeseqs2 = []  # relative time since case start
    timeseqs3 = []  # absolute time of previous event
    times = []
    times2 = []
    times3 = []
    numlines = 0
    casestarttime = None
    lasteventtime = None

    # The input handle used to be opened without ever being closed.
    with open('../data/%s' % eventlog, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
        next(spamreader, None)  # skip the headers
        for row in spamreader:
            t = time.strptime(row[2], "%Y-%m-%d %H:%M:%S")
            if row[0] != lastcase:
                # A new case starts: flush the previous one (if any).
                casestarttime = t
                lasteventtime = t
                lastcase = row[0]
                if not firstLine:
                    lines.append(line)
                    timeseqs.append(times)
                    timeseqs2.append(times2)
                    timeseqs3.append(times3)
                line = ''
                times = []
                times2 = []
                times3 = []
                numlines += 1
            line += getUnicode_fromInt(row[1])
            event_time = datetime.fromtimestamp(time.mktime(t))
            timesincelastevent = event_time - datetime.fromtimestamp(time.mktime(lasteventtime))
            timesincecasestart = event_time - datetime.fromtimestamp(time.mktime(casestarttime))
            timediff = 86400 * timesincelastevent.days + timesincelastevent.seconds
            timediff2 = 86400 * timesincecasestart.days + timesincecasestart.seconds
            times.append(timediff)
            times2.append(timediff2)
            times3.append(event_time)
            lasteventtime = t
            firstLine = False

    # add last case
    lines.append(line)
    timeseqs.append(times)
    timeseqs2.append(times2)
    timeseqs3.append(times3)
    numlines += 1

    divisor = np.mean([item for sublist in timeseqs for item in sublist])
    print('divisor: {}'.format(divisor))
    divisor2 = np.mean([item for sublist in timeseqs2 for item in sublist])
    print('divisor2: {}'.format(divisor2))
    # Mean over cases of the mean remaining time (case end minus each event offset).
    divisor3 = np.mean([np.mean([x[len(x) - 1] - y for y in x]) for x in timeseqs2])
    print('divisor3: {}'.format(divisor3))

    elems_per_fold = int(round(numlines / 3))
    fold1and2lines = lines[:2 * elems_per_fold]

    # Lists (not lazy map objects) because the result is iterated more than once.
    fold1and2lines = [x + '!' for x in fold1and2lines]
    maxlen = max([len(x) for x in fold1and2lines])

    chars = list(set().union(*[set(x) for x in fold1and2lines]))
    chars.sort()
    target_chars = copy.copy(chars)
    chars.remove('!')  # '!' (end of case) is only a prediction target, not an input symbol
    print('total chars: {}, target chars: {}'.format(len(chars), len(target_chars)))
    char_indices = dict((c, i) for i, c in enumerate(chars))
    indices_char = dict((i, c) for i, c in enumerate(chars))
    target_char_indices = dict((c, i) for i, c in enumerate(target_chars))
    target_indices_char = dict((i, c) for i, c in enumerate(target_chars))
    print(indices_char)

    # we only need the third fold, because first two were used for training
    lines = lines[2 * elems_per_fold:]
    lines_t = timeseqs[2 * elems_per_fold:]
    lines_t2 = timeseqs2[2 * elems_per_fold:]
    lines_t3 = timeseqs3[2 * elems_per_fold:]

    # set parameters
    predict_size = maxlen

    # load model, set this to the model generated by train.py
    model = load_model(path_to_model_file)

    # define helper functions

    def encode(sentence, times, times3, maxlen=maxlen):
        """Encode the current sentence into the left-padded model input array.

        Per position: one-hot activity, then five numeric features (position,
        time since previous event, cumulative time, time since midnight,
        weekday).
        """
        num_features = len(chars) + 5
        X = np.zeros((1, maxlen, num_features), dtype=np.float32)
        leftpad = maxlen - len(sentence)
        times2 = np.cumsum(times)
        for t, char in enumerate(sentence):
            midnight = times3[t].replace(hour=0, minute=0, second=0, microsecond=0)
            timesincemidnight = times3[t] - midnight
            for c in chars:
                if c == char:
                    X[0, t + leftpad, char_indices[c]] = 1
            X[0, t + leftpad, len(chars)] = t + 1
            X[0, t + leftpad, len(chars) + 1] = times[t] / divisor
            X[0, t + leftpad, len(chars) + 2] = times2[t] / divisor2
            X[0, t + leftpad, len(chars) + 3] = timesincemidnight.seconds / 86400
            X[0, t + leftpad, len(chars) + 4] = times3[t].weekday() / 7
        return X

    # modify to be able to get second best prediction
    def getSymbol(predictions, ith_best=0):
        """Return the symbol with the (ith_best + 1)-th highest score."""
        i = np.argsort(predictions)[len(predictions) - ith_best - 1]
        return target_indices_char[i]

    one_ahead_gt = []
    one_ahead_pred = []

    with open('output_files/results/' + formulaType + '/suffix_and_remaining_time0_%s' % eventlog, 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow(["Prefix length", "Groud truth", "Predicted", "Levenshtein", "Damerau", "Jaccard",
                             "Ground truth times", "Predicted times", "RMSE", "MAE", "Median AE"])
        for prefix_size in range(prefix_size_pred_from, prefix_size_pred_to):
            lines_s, lines_t_s, lines_t2_s, lines_t3_s = selectFormulaVerifiedTraces(
                lines, lines_t, lines_t2, lines_t3, formula, prefix_size)
            print(prefix_size)
            print("formulas verifited: " + str(len(lines_s)) + " out of : " + str(len(lines)))
            for line, times, times2, times3 in izip(lines_s, lines_t_s, lines_t2_s, lines_t3_s):
                times.append(0)
                cropped_line = ''.join(line[:prefix_size])
                cropped_times = times[:prefix_size]
                cropped_times3 = times3[:prefix_size]
                if len(times2) < prefix_size:
                    continue  # make no prediction for this case, since this case has ended already
                ground_truth = ''.join(line[prefix_size:prefix_size + predict_size])
                ground_truth_t = times2[prefix_size - 1]
                case_end_time = times2[len(times2) - 1]
                ground_truth_t = case_end_time - ground_truth_t
                predicted = ''
                total_predicted_time = 0
                for i in range(predict_size):
                    enc = encode(cropped_line, cropped_times, cropped_times3)
                    y = model.predict(enc, verbose=0)  # make predictions
                    # split predictions into seperate activity and time predictions
                    y_char = y[0][0]
                    y_t = y[1][0][0]
                    prediction = getSymbol(y_char)  # undo one-hot encoding
                    cropped_line += prediction
                    if y_t < 0:
                        y_t = 0
                    cropped_times.append(y_t)
                    if prediction == '!':
                        # end of case was just predicted, therefore, stop predicting further into the future
                        one_ahead_pred.append(total_predicted_time)
                        one_ahead_gt.append(ground_truth_t)
                        print('! predicted, end case')
                        break
                    y_t = y_t * divisor3
                    cropped_times3.append(cropped_times3[-1] + timedelta(seconds=y_t))
                    total_predicted_time = total_predicted_time + y_t
                    predicted += prediction
                output = []
                if len(ground_truth) > 0:
                    output.append(prefix_size)
                    output.append(unicode(ground_truth).encode("utf-8"))
                    output.append(unicode(predicted).encode("utf-8"))
                    output.append(1 - distance.nlevenshtein(predicted, ground_truth))
                    dls = 1 - (damerau_levenshtein_distance(unicode(predicted), unicode(ground_truth)) /
                               max(len(predicted), len(ground_truth)))
                    if dls < 0:
                        # we encountered problems with Damerau-Levenshtein Similarity on some linux
                        # machines where the default character encoding of the operating system
                        # caused it to be negative, this should never be the case
                        dls = 0
                    output.append(dls)
                    output.append(1 - distance.jaccard(predicted, ground_truth))
                    output.append(ground_truth_t)
                    output.append(total_predicted_time)
                    output.append('')  # the "RMSE" column is left empty
                    output.append(metrics.mean_absolute_error([ground_truth_t], [total_predicted_time]))
                    output.append(metrics.median_absolute_error([ground_truth_t], [total_predicted_time]))
                    spamwriter.writerow(output)
    print("TIME TO FINISH --- %s seconds ---" % (time.time() - start_time))
def test(args, preprocess_manager):
    """Predict suffixes for test-case prefixes and write one CSV row per prediction.

    The test set comes from ``preprocess_manager.create_test_set()`` and the
    model is loaded from ``'%smodel_suffix_prediction.h5' % args.checkpoint_dir``.
    Results go to ``args.result_dir + <data set name> + "_" + args.task + ".csv"``
    using ``;`` as delimiter.

    For every case, prefixes of length 2 .. ``seq_max_length - 1`` are tried.
    Processing of a case stops (``break``) as soon as a prefix contains the
    end symbol ``'!'`` or ``preprocess_manager.check_candidate`` rejects the
    prefix; a prefix whose prediction is the empty string is skipped.

    Args:
        args: options object; this function reads ``checkpoint_dir``,
            ``data_set``, ``result_dir``, ``task`` and ``next_best_action``
            and forwards ``args`` to project helpers.
        preprocess_manager: project object providing ``create_test_set``,
            ``check_candidate`` and ``transform_new_instance``.
    """
    # get test set
    lines, case_ids, lines_t, lines_t2, lines_t3, seq_max_length, num_features_all, num_features_activities = preprocess_manager.create_test_set(
    )

    # load model
    model_suffix_prediction = load_model('%smodel_suffix_prediction.h5' %
                                         args.checkpoint_dir)

    # set options for result output: strip everything from '.csv' onwards
    data_set_name = args.data_set.split('.csv')[
        0]
    generic_result_dir = args.result_dir + data_set_name + "_" + args.task
    result_dir = generic_result_dir + ".csv"

    # start prediction
    # NOTE(review): the csv module documentation recommends opening the file
    # with newline='' -- confirm whether the omission matters on the target OS.
    with open(result_dir, 'w') as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=';',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow([
            "CaseID", "Prefix length", "Ground truth", "Predicted",
            "Levenshtein", "Damerau", "Jaccard", "Ground truth times",
            "Predicted times", "MAE", "In time", "Dev. in time",
            "Num interventions"
        ])

        for line, case_id, times, times2, times3 in zip(
                lines, case_ids, lines_t, lines_t2, lines_t3):

            for prefix_size in range(2, seq_max_length):  # size > 1
                print("\nPrefix size: %d" % prefix_size)

                # current = ground truth prefix + predicted suffix
                current = {
                    "line": ''.join(line[:prefix_size]),
                    "times": times[:prefix_size],
                    "times2": times2[:prefix_size],
                    "times3": times3[:prefix_size]
                }

                # termination: the prefix already contains the end symbol, so
                # every longer prefix of this case would as well
                if '!' in current["line"]:
                    break

                # suffix_time is the last times2 entry minus the entry at the
                # end of the prefix
                ground_truth = {
                    "total_event": ''.join(line[:]),
                    "prefix_event": ''.join(line[:prefix_size]),
                    "suffix_event": ''.join(line[prefix_size:]),
                    "total_time": times2[len(times2) - 1],
                    "prefix_time": times2[prefix_size - 1],
                    "suffix_time": times2[len(times2) - 1] - times2[prefix_size - 1]
                }

                # initial prediction state handed to the prediction helper
                predict = {
                    "size": seq_max_length - 1,
                    "predicted": '',
                    "suffix_time": 0
                }

                if args.next_best_action:
                    # Check prefix conformance
                    if preprocess_manager.check_candidate(
                            args,
                            preprocess_manager.transform_new_instance(
                                ground_truth["prefix_event"])):
                        predict, in_time, deviation_in_time, num_interventions = predict_suffix_and_time_for_prefix_next_best_event(
                            args, model_suffix_prediction, preprocess_manager,
                            current, predict, ground_truth)
                    else:
                        # rejected prefix: give up on the rest of this case
                        break
                else:
                    # Check prefix conformance
                    if preprocess_manager.check_candidate(
                            args,
                            preprocess_manager.transform_new_instance(
                                ground_truth["prefix_event"])):
                        predict, in_time, deviation_in_time = predict_suffix_and_time_for_prefix(
                            model_suffix_prediction, preprocess_manager,
                            current, predict, ground_truth)
                    else:
                        # rejected prefix: give up on the rest of this case
                        break

                # termination: nothing was predicted for this prefix
                if predict["predicted"] == "":
                    continue

                output = []
                if len(ground_truth["suffix_event"]) > 0:
                    output.append(case_id)
                    output.append(prefix_size)
                    # NOTE(review): the file is opened in text mode, so under
                    # Python 3 these bytes objects would be written as their
                    # repr (b'...') -- confirm this is intended.
                    output.append(
                        str(ground_truth["suffix_event"]).encode("utf-8"))
                    output.append(str(predict["predicted"]).encode("utf-8"))
                    # the three distance values are written as 1 - distance
                    output.append(1 - distance.nlevenshtein(
                        predict["predicted"], ground_truth["suffix_event"]))
                    # Damerau-Levenshtein distance divided by the longer
                    # string's length, clamped below at 0
                    dls = 1 - (damerau_levenshtein_distance(
                        str(predict["predicted"]),
                        str(ground_truth["suffix_event"])) /
                               max(len(predict["predicted"]),
                                   len(ground_truth["suffix_event"])))
                    if dls < 0:
                        dls = 0
                    output.append(dls)
                    output.append(1 - distance.jaccard(
                        predict["predicted"], ground_truth["suffix_event"]))
                    output.append(ground_truth["suffix_time"])
                    output.append(predict["suffix_time"])
                    output.append(
                        metrics.mean_absolute_error(
                            [ground_truth["suffix_time"]],
                            [predict["suffix_time"]]))
                    output.append(in_time)
                    output.append(deviation_in_time)
                    # num_interventions is only returned by the
                    # next-best-action helper; 0 is written otherwise
                    if args.next_best_action:
                        output.append(num_interventions)
                    else:
                        output.append(0)
                    spamwriter.writerow(output)
def test_damerau_levenshtein_distance_type(jf): jf.damerau_levenshtein_distance(u"abc", u"abc") with pytest.raises(TypeError) as exc: jf.damerau_levenshtein_distance(b"abc", b"abc") assert "expected" in str(exc.value)
def fuzzy_value_scoring(values_list1, values_list2):
    """
    Pairwise fuzzy string matcher (best match only, not all-by-all).

    Every value of the shorter list is paired with the value of the longer
    list that has the highest Jaro-Winkler score. For those best pairs the
    mean, standard deviation and 0.9 quantile of the Jaro-Winkler score, the
    Damerau-Levenshtein distance and the Hamming distance are returned in a
    dict.

    A list with more than 1000 values is first replaced by its 1000 most
    frequent values (frequencies are looked up in ``value_info``), which
    keeps the computation tractable.

    If either list is empty every entry of the returned dict is 'N.A.'.
    """
    if not (len(values_list1) > 0 and len(values_list2) > 0):
        # 'N.A.' returned if one or both of the facets dont have any values.
        return {
            'jaro_distance_quant': 'N.A.',
            'jaro_distance_mean': 'N.A.',
            'jaro_distance_std': 'N.A.',
            'damerau_levenshtein_distance_quant': 'N.A.',
            'damerau_levenshtein_distance_mean': 'N.A.',
            'damerau_levenshtein_distance_std': 'N.A.',
            'hamming_distance_quant': 'N.A.',
            'hamming_distance_mean': 'N.A.',
            'hamming_distance_std': 'N.A.'
        }

    def most_frequent_values(facet):
        # Rank the facet's values by frequency and keep the top 1000.
        frequencies = value_info.get(facet)
        frame = pd.DataFrame(columns=['frequency']).from_dict(
            frequencies, orient='index')
        frame = frame.reset_index().rename(columns={
            "index": "value",
            0: "frequency"
        })
        frame = frame.sort_values(['frequency'], ascending=False).head(n=1000)
        return frame['value'].tolist()

    if len(values_list1) > 1000:
        values_list1 = most_frequent_values(facet1)
    if len(values_list2) > 1000:
        values_list2 = most_frequent_values(facet2)

    # Iterate over the shorter list, searching the longer one.
    if len(values_list1) > len(values_list2):
        short_list, long_list = values_list2, values_list1
    else:
        short_list, long_list = values_list1, values_list2

    # calculate the best fuzzy matches
    best_rows = []
    for needle in short_list:
        candidates = []
        for candidate in long_list:
            try:
                dl_score = jellyfish.damerau_levenshtein_distance(
                    needle, candidate)
            except ValueError:
                # fall back when the primary implementation rejects the input
                dl_score = py_jellyfish.damerau_levenshtein_distance(
                    needle, candidate)
            jw_score = jellyfish.jaro_winkler(needle, candidate)
            ham_score = jellyfish.hamming_distance(needle, candidate)
            candidates.append(
                (needle, candidate, jw_score, dl_score, ham_score))
        best_rows.append(max(candidates, key=lambda entry: entry[2]))

    df = pd.DataFrame(best_rows,
                      columns=[
                          'facet1', 'facet2', 'jaro_distance',
                          'damerau_levenshtein_distance', 'hamming_distance'
                      ])
    jaro = df['jaro_distance']
    damerau = df['damerau_levenshtein_distance']
    hamming = df['hamming_distance']

    # so a good match will be a high mean, low std. The quantile is prob better than mean.
    return {
        'jaro_distance_quant': jaro.quantile(0.9),
        'jaro_distance_mean': jaro.mean(),
        'jaro_distance_std': jaro.std(),
        'damerau_levenshtein_distance_quant': damerau.quantile(0.9),
        'damerau_levenshtein_distance_mean': damerau.mean(),
        'damerau_levenshtein_distance_std': damerau.std(),
        'hamming_distance_quant': hamming.quantile(0.9),
        'hamming_distance_mean': hamming.mean(),
        'hamming_distance_std': hamming.std()
    }
def evaluate(train_log, test_log, model_folder, model_file):
    """Predict the next activity for every prefix of every test case.

    Both logs are CSV files (``,`` delimiter, ``|`` quote char) with one
    header row; column 0 holds the case id and column 1 an integer activity
    id. Each activity id is mapped to the single character
    ``chr(id + 161)``, so a case becomes a string of activities.

    The vocabulary and the maximum trace length are derived from the train
    log followed by the test log; predictions are made for the test log only
    and written to ``<model_folder>/predictions.csv``.

    Args:
        train_log: path of the training event log.
        test_log: path of the test event log.
        model_folder: folder that contains the model file and receives
            ``predictions.csv``.
        model_file: file name of the model inside ``model_folder``.
    """
    caseid_col = 0
    task_col = 1
    ascii_offset = 161

    # Pass 1: collect the traces of both logs to derive vocabulary and maxlen.
    # The grouping state (lastcase / line / firstLine) deliberately carries
    # over from the train log into the test log, exactly as before, so the
    # derived maxlen stays identical to the one the model was built with.
    lastcase = ''
    line = ''
    firstLine = True
    lines = []
    for log_path in (train_log, test_log):
        with open(log_path, 'r') as csvfile:  # closed even if parsing fails
            spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
            next(spamreader, None)  # skip the headers
            for row in spamreader:
                if row[caseid_col] != lastcase:
                    lastcase = row[caseid_col]
                    if not firstLine:
                        lines.append(line)
                    line = ''
                line += chr(int(row[task_col]) + ascii_offset)
                firstLine = False
        # add last case
        lines.append(line)

    # '!' marks the end of a case; it is a valid prediction target but never
    # an input symbol.
    lines = [trace + '!' for trace in lines]
    maxlen = max(len(trace) for trace in lines)
    chars = sorted(set().union(*(set(trace) for trace in lines)))
    target_chars = list(chars)
    chars.remove('!')
    print('total chars: {}, target chars: {}'.format(len(chars), len(target_chars)))
    char_indices = dict((c, i) for i, c in enumerate(chars))
    indices_char = dict((i, c) for i, c in enumerate(chars))
    target_indices_char = dict((i, c) for i, c in enumerate(target_chars))
    print(indices_char)

    # Pass 2: the traces that are actually evaluated (test log only, no '!').
    lastcase = ''
    line = ''
    firstLine = True
    lines = []
    caseids = []
    with open(test_log, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
        next(spamreader, None)  # skip the headers
        for row in spamreader:
            if row[caseid_col] != lastcase:
                caseids.append(row[caseid_col])
                lastcase = row[caseid_col]
                if not firstLine:
                    lines.append(line)
                line = ''
            line += chr(int(row[task_col]) + ascii_offset)
            firstLine = False
    # add last case
    lines.append(line)

    # set parameters
    predict_size = 1

    # load model, set this to the model generated by train.py
    model = load_model(os.path.join(model_folder, model_file))

    # define helper functions
    def encode(sentence, maxlen=maxlen):
        """One-hot encode a prefix, left-padded to ``maxlen`` time steps.

        The last feature of every used time step is its 1-based position.
        """
        num_features = len(chars) + 1
        X = np.zeros((1, maxlen, num_features), dtype=np.float32)
        leftpad = maxlen - len(sentence)
        for t, char in enumerate(sentence):
            if char in char_indices:  # symbols outside the vocabulary stay all-zero
                X[0, t + leftpad, char_indices[char]] = 1
            X[0, t + leftpad, len(chars)] = t + 1
        return X

    def getSymbol(predictions):
        """Return the target symbol with the highest score (last one wins ties)."""
        maxPrediction = 0
        symbol = ''
        for i, prediction in enumerate(predictions):
            if prediction >= maxPrediction:
                maxPrediction = prediction
                symbol = target_indices_char[i]
        return symbol

    # make predictions
    # newline='' is what the csv module requires for files handed to csv.writer
    with open(os.path.join(model_folder, "predictions.csv"), 'w', newline='') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow(["CaseID", "Prefix length", "Groud truth", "Predicted", "Levenshtein", "Damerau", "Jaccard", "Ground truth times", "Predicted times", "RMSE", "MAE"])
        for prefix_size in range(1, maxlen):
            print(prefix_size)
            for line, caseid in zip(lines, caseids):
                cropped_line = ''.join(line[:prefix_size])
                if '!' in cropped_line:
                    continue  # make no prediction for this case, since this case has ended already
                ground_truth = ''.join(line[prefix_size:prefix_size + predict_size])
                predicted = ''
                for i in range(predict_size):
                    if len(ground_truth) <= i:
                        continue
                    enc = encode(cropped_line)
                    y = model.predict(enc, verbose=0)
                    y_char = y[0]
                    prediction = getSymbol(y_char)
                    cropped_line += prediction
                    if prediction == '!':  # end of case was just predicted, therefore, stop predicting further into the future
                        break
                    predicted += prediction
                output = []
                if len(ground_truth) > 0:
                    output.append(caseid)
                    output.append(prefix_size)
                    output.append(str(ground_truth))
                    output.append(str(predicted))
                    output.append(1 - distance.nlevenshtein(predicted, ground_truth))
                    dls = 1 - (damerau_levenshtein_distance(str(predicted), str(ground_truth)) / max(len(predicted), len(ground_truth)))
                    if dls < 0:
                        dls = 0  # we encountered problems with Damerau-Levenshtein Similarity on some linux machines where the default character encoding of the operating system caused it to be negative, this should never be the case
                    output.append(dls)
                    output.append(1 - distance.jaccard(predicted, ground_truth))
                    # no time prediction in this variant: the time columns stay blank
                    output.append(' ')
                    output.append(' ')
                    output.append('')
                    output.append('')
                    output.append('')
                    spamwriter.writerow(output)
def run_experiments(log_identificator, formula_type):
    """Run the suffix / remaining-time prediction experiment with beam search.

    Settings are obtained from ``activateSettings`` and the test data from
    ``prepare_testing_data``. For every prefix size in
    ``range(prefix_size_pred_from, prefix_size_pred_to)`` the traces returned
    by ``selectFormulaVerifiedTraces`` are extended symbol by symbol through
    a ``MultileafTree`` search, and one row per trace with a non-empty
    ground truth is written to
    ``output_files/results/<formula_type>/suffix_and_remaining_time2_<eventlog>``.

    NOTE(review): the body calls ``unicode`` and opens the CSV in ``'wb'``
    mode, so as written it appears to target Python 2 -- confirm before
    running under Python 3.

    Args:
        log_identificator: passed to ``activateSettings`` to select the log.
        formula_type: passed to ``activateSettings``; also used as the result
            sub-folder name.
    """
    eventlog, path_to_model_file, beam_size, \
        prefix_size_pred_from, prefix_size_pred_to, formula = activateSettings(log_identificator, formula_type)

    # Put the parent of this source file's directory first on sys.path.
    current_path = os.path.abspath(getsourcefile(lambda: 0))
    current_dir = os.path.dirname(current_path)
    parent_dir = current_dir[:current_dir.rfind(os.path.sep)]
    sys.path.insert(0, parent_dir)

    start_time = time.time()

    lines, lines_t, lines_t2, lines_t3, maxlen, chars, char_indices, divisor, divisor2, \
        divisor3, predict_size, target_indices_char, target_char_indices = prepare_testing_data(eventlog)

    # find cycles and modify the probability functionality goes here
    stop_symbol_probability_amplifier_current = 1

    # modify to be able to get second best prediction
    # NOTE(review): nothing in this function calls getSymbol; the loops below
    # use getSymbolAmpl instead. It also scales predictions[0] in place.
    def getSymbol(predictions, ith_best=0):
        predictions[
            0] = predictions[0] * stop_symbol_probability_amplifier_current
        i = np.argsort(predictions)[len(predictions) - ith_best - 1]
        return target_indices_char[i]

    one_ahead_gt = []
    one_ahead_pred = []

    # load model, set this to the model generated by train.py
    model = load_model(path_to_model_file)

    stop_symbol_probability_amplifier_current = 1

    # make predictions
    with open(
            'output_files/results/' + formula_type +
            '/suffix_and_remaining_time2_%s' % eventlog, 'wb') as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=',',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow([
            "Prefix length", "Groud truth", "Predicted", "Levenshtein",
            "Damerau", "Jaccard", "Ground truth times", "Predicted times",
            "RMSE", "MAE", "Median AE"
        ])
        for prefix_size in range(prefix_size_pred_from, prefix_size_pred_to):

            # here we checkout the prefixes with formulas verified only on the suffix phase
            lines_s, lines_t_s, lines_t2_s, lines_t3_s = selectFormulaVerifiedTraces(
                lines, lines_t, lines_t2, lines_t3, formula, prefix_size)

            print("prefix size: " + str(prefix_size))
            print("formulas verifited: " + str(len(lines_s)) + " out of : " +
                  str(len(lines)))
            for line, times, times2, times3 in zip(lines_s, lines_t_s,
                                                   lines_t2_s, lines_t3_s):
                prediction_end_reached = False
                # NOTE(review): appends to the list taken from lines_t_s; if
                # that is the caller's list it grows on every prefix size --
                # confirm.
                times.append(0)
                cropped_line = ''.join(line[:prefix_size])
                cropped_times = times[:prefix_size]
                cropped_times3 = times3[:prefix_size]
                if len(times2) < prefix_size:
                    continue  # make no prediction for this case, since this case has ended already

                # initialize root of the tree for beam search
                total_predicted_time_initialization = 0
                search_tree_root = MultileafTree(
                    beam_size,
                    encode(cropped_line, cropped_times, cropped_times3, maxlen,
                           chars, char_indices, divisor, divisor2),
                    cropped_line, total_predicted_time_initialization)

                prediction_end_reached = False

                ground_truth = ''.join(line[prefix_size:prefix_size +
                                            predict_size])
                # ground-truth remaining time: last times2 entry minus the
                # entry at the end of the prefix
                ground_truth_t = times2[prefix_size - 1]
                case_end_time = times2[len(times2) - 1]
                ground_truth_t = case_end_time - ground_truth_t
                predicted = ''

                for i in range(predict_size):
                    # here we will take data from the node in the tree used to prun
                    enc = search_tree_root.data  # encode(cropped_line, cropped_times, cropped_times3)
                    y = model.predict(enc, verbose=0)  # make predictions
                    # split predictions into seperate activity and time predictions
                    y_char = y[0][0]
                    y_t = y[1][0][0]

                    stop_symbol_probability_amplifier_current, \
                        start_of_the_cycle_symbol = amplify(search_tree_root.cropped_line)

                    # cropped_line += prediction
                    if y_t < 0:
                        y_t = 0
                    # TODO (original author): not normalizing here seems like a bug
                    cropped_times.append(y_t)

                    # ma is set when one of the beam candidates is the end
                    # symbol and the current line passes the formula check
                    ma = False
                    # NOTE(review): this inner loop reuses the name i of the
                    # enclosing predict_size loop.
                    for i in range(beam_size):
                        prediction = getSymbolAmpl(
                            y_char, target_indices_char, target_char_indices,
                            start_of_the_cycle_symbol,
                            stop_symbol_probability_amplifier_current, i)
                        # end of case was just predicted, therefore, stop predicting further into the future
                        if prediction == '!':
                            if verify_formula_as_compliant(
                                    search_tree_root.cropped_line, formula,
                                    prefix_size):
                                one_ahead_pred.append(
                                    search_tree_root.total_predicted_time)
                                one_ahead_gt.append(ground_truth_t)
                                print('! predicted, end case')
                                ma = True
                                break
                            # else:
                            #     prediction_end_reached = True;
                    if ma:
                        break

                    # Original author's note: if the end of the prediction was
                    # not reached we continue as always and
                    # choose_next_top_descendant searches for the future
                    # prediction; otherwise choose_next_top_descendant
                    # backtracks.
                    y_t = y_t * divisor3
                    if not prediction_end_reached:
                        cropped_times3.append(cropped_times3[-1] +
                                              timedelta(seconds=y_t))

                    # expand the current node with one descendant per beam
                    # candidate, skipping the end symbol
                    for i in range(beam_size):
                        temp_prediction = getSymbolAmpl(
                            y_char, target_indices_char, target_char_indices,
                            start_of_the_cycle_symbol,
                            stop_symbol_probability_amplifier_current, i)
                        if temp_prediction == '!':
                            continue
                        temp_cropped_line = search_tree_root.cropped_line + temp_prediction

                        # this means that we found the end in one of the alternatives.
                        temp_total_predicted_time = search_tree_root.total_predicted_time + y_t
                        temp_state_data = encode(temp_cropped_line,
                                                 cropped_times, cropped_times3,
                                                 maxlen, chars, char_indices,
                                                 divisor, divisor2)
                        search_tree_root.descendants[i] = MultileafTree(
                            beam_size, temp_state_data, temp_cropped_line,
                            temp_total_predicted_time, search_tree_root)

                    search_tree_root = search_tree_root.choose_next_top_descendant(
                    )
                    if prediction_end_reached:
                        prediction_end_reached = False
                    if search_tree_root is None:
                        print(
                            "Cannot find any trace that is compliant with formula given current beam size"
                        )
                        break

                output = []

                # no compliant trace found: report an empty prediction
                if search_tree_root is None:
                    predicted = u""
                    total_predicted_time = 0
                else:
                    predicted = (search_tree_root.cropped_line[prefix_size:])
                    total_predicted_time = search_tree_root.total_predicted_time

                if len(ground_truth) > 0:
                    output.append(prefix_size)
                    output.append(unicode(ground_truth).encode("utf-8"))
                    output.append(unicode(predicted).encode("utf-8"))
                    output.append(
                        1 - distance.nlevenshtein(predicted, ground_truth))
                    # NOTE(review): under Python 2 without
                    # `from __future__ import division` this `/` is integer
                    # division if the distance is an int -- confirm.
                    dls = 1 - (damerau_levenshtein_distance(
                        unicode(predicted), unicode(ground_truth)) /
                               max(len(predicted), len(ground_truth)))
                    if dls < 0:
                        dls = 0
                        # we encountered problems with Damerau-Levenshtein Similarity on some linux machines where the
                        # default character encoding of the operating system caused it to be negative, this should never
                        # be the case
                    output.append(dls)
                    output.append(1 - distance.jaccard(predicted, ground_truth))
                    output.append(ground_truth_t)
                    output.append(total_predicted_time)
                    # the "RMSE" column is left empty
                    output.append('')
                    output.append(
                        metrics.mean_absolute_error([ground_truth_t],
                                                    [total_predicted_time]))
                    output.append(
                        metrics.median_absolute_error([ground_truth_t],
                                                      [total_predicted_time]))
                    spamwriter.writerow(output)
    print("TIME TO FINISH --- %s seconds ---" % (time.time() - start_time))
def fuzzy_value_scoring(values_list1, values_list2): """ string pairwise matcher NB only best matches are taken this is not all by all gets fuzzy pair match based on jarowinkler returns dict with mean, stc and 0.9 qualtile for jarowinkler, damerau levenshtein and hamming distances """ if len(values_list1) > 0 and len(values_list2) > 0: if len(values_list1) > len(values_list2): short_list = values_list2 long_list = values_list1 else: short_list = values_list1 long_list = values_list2 # calculate the best fuzzy matches best_match_list = [] for value1 in short_list: jaro_distance_list = [] for value2 in long_list: try: damerau_levenshtein_distance = jellyfish.damerau_levenshtein_distance( value1, value2) except ValueError: damerau_levenshtein_distance = py_jellyfish.damerau_levenshtein_distance( value1, value2) jaro_winkler = jellyfish.jaro_winkler(value1, value2) hamming_distance = jellyfish.hamming_distance(value1, value2) jaro_tuple = (value1, value2, jaro_winkler, damerau_levenshtein_distance, hamming_distance) jaro_distance_list.append(jaro_tuple) best_match = max(jaro_distance_list, key=lambda x: x[2]) best_match_list.append(best_match) df = pd.DataFrame(best_match_list, columns=[ 'facet1', 'facet2', 'jaro_distance', 'damerau_levenshtein_distance', 'hamming_distance' ]) jaro_distance_quant = df['jaro_distance'].quantile(0.9) jaro_distance_mean = df['jaro_distance'].mean() jaro_distance_std = df['jaro_distance'].std() damerau_levenshtein_distance_quant = df[ 'damerau_levenshtein_distance'].quantile(0.9) damerau_levenshtein_distance_mean = df[ 'damerau_levenshtein_distance'].mean() damerau_levenshtein_distance_std = df[ 'damerau_levenshtein_distance'].std() hamming_distance_quant = df['hamming_distance'].quantile(0.9) hamming_distance_mean = df['hamming_distance'].mean() hamming_distance_std = df['hamming_distance'].std() results = {'jaro_distance_quant':jaro_distance_quant, \ 'jaro_distance_mean':jaro_distance_mean, \ 'jaro_distance_std':jaro_distance_std, \ 
'damerau_levenshtein_distance_quant':damerau_levenshtein_distance_quant, \ 'damerau_levenshtein_distance_mean':damerau_levenshtein_distance_mean, \ 'damerau_levenshtein_distance_std':damerau_levenshtein_distance_std, \ 'hamming_distance_quant':hamming_distance_quant, \ 'hamming_distance_mean':hamming_distance_mean, \ 'hamming_distance_std':hamming_distance_std} # so a good match will be a high mean, low std. The quantile is prob better than mean. return results else: # 'N.A.' returned if one or both of the facets dont have any values. results = {'jaro_distance_quant':'N.A.', \ 'jaro_distance_mean':'N.A.', \ 'jaro_distance_std':'N.A.', \ 'damerau_levenshtein_distance_quant':'N.A.', \ 'damerau_levenshtein_distance_mean':'N.A.', \ 'damerau_levenshtein_distance_std':'N.A.', \ 'hamming_distance_quant':'N.A.', \ 'hamming_distance_mean':'N.A.', \ 'hamming_distance_std':'N.A.'} return results
def test(self, ):
    """
    Generates a file with predictions for next activities and time.

    The model stored at ``self.model_name`` is loaded and, for every prefix
    size in ``range(2, self.maxlen)`` and every case, the next activity and
    its time are predicted. Rows are written to
    ``Results/1hotnext_activity_and_time_<model folder name><self.eventlog>``.

    ***Helper Variables***
    predict_size : int
        number of predictions
    model : tf.keras.models
        trained model
    path1 : str
        complete path of the model
    filename : str
        local path and name of the model
    path : str
        local path of model
    model_type : str
        name of the model
    file_name : str
        name of the output file
    spamwriter : object
        csv writer object
    prefix_size : int
        size of eventlog prefix
    self.lines : list
        these are all the activity sequences (ActivityIDs)
    self.char_indices : dict
        ascii coded characters of the unique activities to integer indices
    self.indices_char : dict
        integer indices to ascii coded characters of the unique activities
    self.target_char_indices : dict
        ascii coded characters of the target unique activities to integer
        indices (target includes one excess activity '!' case end)
    self.target_indices_char : dict
        integer indices to ascii coded characters of the target unique
        activities
    self.lines_t : list
        differences between two events
    self.lines_t2 : list
        differences between the current and first of test_set
    self.lines_t3 : list
        Midnight time
    self.lines_t4 : list
        Day of the week
    self.one_ahead_gt, self.one_ahead_pred : list
        helper variables to predict one ahead
    self.two_ahead_gt, self.two_ahead_pred : list
        helper variables to predict two ahead
    self.three_ahead_gt, self.three_ahead_pred : list
        helper variables to predict three ahead
    cropped_line : str
        running activities while predicting
    cropped_times : list
        running time differences while predicting
    cropped_times3 : list
        running time difference from case start
    line : char
        activity items
    times : float
        time difference between current and previous event
    times3 : float
        time difference between current and first event
    ground_truth : char
        ground truth activity
    ground_truth_t : float
        ground truth time difference
    predicted : char
        predicted activity as a char
    predicted_t : list
        list storing the predicted times
    y : dict
        all predictions
    y_char : float
        numerical prediction for activities
    y_t : float
        direct time prediction
    output : list
        complete list of output
    """

    # set parameters
    predict_size = 1

    # load model, set this to the model generated by train.py
    model = tf.keras.models.load_model(
        self.model_name,
        compile=False,
        custom_objects={"TLSTM_layer": TLSTM_layer})

    # name of the output: the name of the folder that contains the model
    # file, followed by the event log name
    path1, filename = os.path.split(self.model_name)
    path, model_type = os.path.split(path1)
    file_name = model_type + self.eventlog

    # make predictions
    with open('Results/1hotnext_activity_and_time_%s' % file_name,
              'w',
              encoding="utf-8") as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=',',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        # NOTE(review): this header has 12 columns while each data row below
        # appends 13 values (RMSE placeholder, MAE and median AE at the end)
        # -- confirm the intended layout.
        spamwriter.writerow([
            "CaseID", "Prefix length", "Groud truth", "Predicted",
            "Confidence", "Levenshtein", "Damerau", "Jaccard",
            "Ground truth times", "Predicted times", "RMSE", "MAE"
        ])
        for prefix_size in range(2, self.maxlen):
            print(prefix_size)
            for line, caseid, times, times3 in zip(self.lines, self.caseids,
                                                   self.lines_t,
                                                   self.lines_t3):
                # NOTE(review): `times` is an element of self.lines_t, so this
                # append grows the stored list on every prefix size -- confirm
                # that this is intended.
                times.append(0)
                cropped_line = ''.join(line[:prefix_size])
                cropped_times = times[:prefix_size]
                cropped_times3 = times3[:prefix_size]
                if '!' in cropped_line:
                    continue  # make no prediction for this case, since this case has ended already
                ground_truth = ''.join(line[prefix_size:prefix_size +
                                            predict_size])
                ground_truth_t = times[prefix_size:prefix_size + predict_size]
                predicted = ''
                predicted_t = []
                for i in range(predict_size):
                    # nothing left to compare against for this step
                    if len(ground_truth) <= i:
                        continue
                    enc = self.encode(cropped_line, cropped_times,
                                      cropped_times3, self.num_features)
                    y = model.predict(enc, verbose=0)
                    y_char = y[0][0]
                    y_t = y[1][0][0]
                    prediction = self.getSymbol(y_char)
                    # highest activity score as a rounded percentage
                    confidence = np.round(np.max(y_char) * 100)
                    cropped_line += prediction
                    # negative time predictions are clamped to 0
                    if y_t < 0:
                        y_t = 0
                    cropped_times.append(y_t)
                    # rescale the time prediction by self.divisor before it
                    # is used as a number of seconds
                    y_t = y_t * self.divisor
                    cropped_times3.append(cropped_times3[-1] +
                                          timedelta(seconds=y_t))
                    predicted_t.append(y_t)
                    if i == 0:
                        if len(ground_truth_t) > 0:
                            self.one_ahead_pred.append(y_t)
                            self.one_ahead_gt.append(ground_truth_t[0])
                    if i == 1:
                        if len(ground_truth_t) > 1:
                            self.two_ahead_pred.append(y_t)
                            self.two_ahead_gt.append(ground_truth_t[1])
                    if i == 2:
                        if len(ground_truth_t) > 2:
                            self.three_ahead_pred.append(y_t)
                            self.three_ahead_gt.append(ground_truth_t[2])
                    if prediction == '!':  # end of case was just predicted, therefore, stop predicting further into the future
                        print('! predicted, end case')
                        break
                    predicted += prediction
                output = []
                if len(ground_truth) > 0:
                    output.append(caseid)
                    output.append(prefix_size)
                    output.append(str(ground_truth))
                    output.append(str(predicted))
                    output.append(confidence)
                    # the three distance values are written as 1 - distance
                    output.append(
                        1 - distance.nlevenshtein(predicted, ground_truth))
                    dls = 1 - (damerau_levenshtein_distance(
                        str(predicted), str(ground_truth)) /
                               max(len(predicted), len(ground_truth)))
                    if dls < 0:
                        dls = 0  # we encountered problems with Damerau-Levenshtein Similarity on some linux machines where the default character encoding of the operating system caused it to be negative, this should never be the case
                    output.append(dls)
                    output.append(
                        1 - distance.jaccard(predicted, ground_truth))
                    output.append('; '.join(
                        str(x) for x in ground_truth_t))
                    output.append('; '.join(str(x) for x in predicted_t))
                    if len(predicted_t) > len(
                            ground_truth_t
                    ):  # if predicted more events than length of case, only use needed number of events for time evaluation
                        predicted_t = predicted_t[:len(ground_truth_t)]
                    # NOTE(review): range(k) pads with 0, 1, ..., k-1 rather
                    # than k zeros; identical only while k <= 1 -- confirm.
                    if len(ground_truth_t) > len(
                            predicted_t
                    ):  # if predicted less events than length of case, put 0 as placeholder prediction
                        predicted_t.extend(
                            range(len(ground_truth_t) - len(predicted_t)))
                    if len(ground_truth_t) > 0 and len(predicted_t) > 0:
                        # the "RMSE" column is left empty; the errors compare
                        # only the first ground-truth and predicted times
                        output.append('')
                        output.append(
                            metrics.mean_absolute_error(
                                [ground_truth_t[0]], [predicted_t[0]]))
                        output.append(
                            metrics.median_absolute_error(
                                [ground_truth_t[0]], [predicted_t[0]]))
                    else:
                        output.append('')
                        output.append('')
                        output.append('')
                    spamwriter.writerow(output)
def calc_lev_dist(a, b):
    """Return the Damerau-Levenshtein distance between ``a`` and ``b``.

    Thin wrapper that delegates to
    ``py_jellyfish.damerau_levenshtein_distance``.
    """
    edit_distance = py_jellyfish.damerau_levenshtein_distance(a, b)
    return edit_distance
def load_data(row):
    """Build the flat feature list for one pair of texts.

    ``row[0]`` and ``row[1]`` are the two texts. The result is the
    concatenation of three groups, in this order:

    * counts: entity labels, entity texts, matching suffix length, matching
      prefix length, the three string distances, then one count per
      dependency / part-of-speech / SVO feature (the ``dobj`` count is
      computed but not part of this group),
    * weighted values for the same features (including ``dobj``),
    * rates for the same features (including ``dobj``).
    """
    first_text = str(row[0])
    second_text = str(row[1])
    first_lower = first_text.lower()
    second_lower = second_text.lower()

    # String distances work on the lower-cased texts.
    lev_dist = Levenshtein.distance(first_lower, second_lower)
    jar_dist = jaro_distance(first_lower, second_lower)
    dam_dist = damerau_levenshtein_distance(first_lower, second_lower)

    # The parser receives the texts in their original case.
    q1 = parser(first_text)
    q2 = parser(second_text)

    def tokens_where(doc, attribute, wanted):
        # Lower-cased forms of the tokens whose attribute equals `wanted`.
        return set(tok.lower_ for tok in doc if getattr(tok, attribute) == wanted)

    def compare_tokens(attribute, wanted):
        return feat(tokens_where(q1, attribute, wanted),
                    tokens_where(q2, attribute, wanted))

    def matching_run(left_words, right_words):
        # Length and accumulated weight of the common leading run.
        count = 0
        total = 0.
        for left, right in zip(left_words, right_words):
            if not (left == right or match_rating_comparison(left, right)):
                break
            count += 1
            total += weights.get(left, 0)
        return count, total

    # Named entities: first by label, then by surface text.
    num_ent, val_ent, rate_ent = feat(
        set(span.label_.lower() for span in q1.ents),
        set(span.label_.lower() for span in q2.ents))
    num_ent2, val_ent2, rate_ent2 = feat(
        set(' '.join(tok.orth_ for tok in span) for span in q1.ents),
        set(' '.join(tok.orth_ for tok in span) for span in q2.ents))

    # Common prefix and common suffix over the non-punctuation tokens.
    words1 = [tok.lower_ for tok in q1 if tok.pos_ != 'PUNCT']
    words2 = [tok.lower_ for tok in q2 if tok.pos_ != 'PUNCT']
    num_for, val_for = matching_run(words1, words2)
    num_clean2_rev, val_clean2_rev = matching_run(words1[::-1], words2[::-1])

    # Dependency-label features.
    num_sub, val_sub, rate_sub = compare_tokens('dep_', 'nsubj')
    num_root, val_root, rate_root = compare_tokens('dep_', 'ROOT')
    num_advmod, val_advmod, rate_advmod = compare_tokens('dep_', 'advmod')
    num_advcl, val_advcl, rate_advcl = compare_tokens('dep_', 'advcl')
    num_aux, val_aux, rate_aux = compare_tokens('dep_', 'aux')
    num_dobj, val_dobj, rate_dobj = compare_tokens('dep_', 'dobj')

    # Part-of-speech features.
    num_noun, val_noun, rate_noun = compare_tokens('pos_', 'NOUN')
    num_verb, val_verb, rate_verb = compare_tokens('pos_', 'VERB')
    num_adv, val_adv, rate_adv = compare_tokens('pos_', 'ADV')

    # Subject-verb-object triples: lower-case both sides first, then lemmatize.
    def lowered_triples(doc):
        return set((svo[0].lower(), svo[1].lower(), svo[2].lower())
                   for svo in findSVOs(doc))

    def lemmatized(triples):
        return set((wnl.lemmatize(svo[0]), wnl.lemmatize(svo[1]),
                    wnl.lemmatize(svo[2])) for svo in triples)

    raw_svo1 = lowered_triples(q1)
    raw_svo2 = lowered_triples(q2)
    set_svo1 = lemmatized(raw_svo1)
    set_svo2 = lemmatized(raw_svo2)
    num_svo, val_svo, rate_svo = feat(set_svo1, set_svo2)

    def component(triples, position):
        return set(svo[position] for svo in triples)

    subjects1, verbs1, objects1 = (component(set_svo1, k) for k in range(3))
    subjects2, verbs2, objects2 = (component(set_svo2, k) for k in range(3))
    num_s, val_s, rate_s = feat(subjects1, subjects2)
    num_v, val_v, rate_v = feat(verbs1, verbs2)
    num_o, val_o, rate_o = feat(objects1, objects2)

    counts = [
        num_ent, num_ent2, num_clean2_rev, num_for,
        lev_dist, jar_dist, dam_dist,
        num_sub, num_root, num_advmod, num_advcl, num_aux,
        num_noun, num_verb, num_adv,
        num_svo, num_s, num_v, num_o,
    ]
    values = [
        val_ent, val_ent2, val_clean2_rev, val_for,
        val_sub, val_root, val_advmod, val_advcl, val_aux, val_dobj,
        val_noun, val_verb, val_adv,
        val_svo, val_s, val_v, val_o,
    ]
    rates = [
        rate_ent, rate_ent2,
        rate_sub, rate_root, rate_advmod, rate_advcl, rate_aux, rate_dobj,
        rate_noun, rate_verb, rate_adv,
        rate_svo, rate_s, rate_v, rate_o,
    ]
    return counts + values + rates
if i==2: if len(ground_truth_t)>2: three_ahead_pred.append(y_t) three_ahead_gt.append(ground_truth_t[2]) if prediction == '!': # end of case was just predicted, therefore, stop predicting further into the future print('! predicted, end case') break predicted += prediction output = [] if len(ground_truth)>0: output.append(caseid) output.append(prefix_size) output.append(unicode(ground_truth).encode("utf-8")) output.append(unicode(predicted).encode("utf-8")) output.append(1 - distance.nlevenshtein(predicted, ground_truth)) dls = 1 - (damerau_levenshtein_distance(unicode(predicted), unicode(ground_truth)) / max(len(predicted),len(ground_truth))) if dls<0: dls=0 # we encountered problems with Damerau-Levenshtein Similarity on some linux machines where the default character encoding of the operating system caused it to be negative, this should never be the case output.append(dls) output.append(1 - distance.jaccard(predicted, ground_truth)) output.append('; '.join(str(x) for x in ground_truth_t)) output.append('; '.join(str(x) for x in predicted_t)) if len(predicted_t)>len(ground_truth_t): # if predicted more events than length of case, only use needed number of events for time evaluation predicted_t = predicted_t[:len(ground_truth_t)] if len(ground_truth_t)>len(predicted_t): # if predicted less events than length of case, put 0 as placeholder prediction predicted_t.extend(range(len(ground_truth_t)-len(predicted_t))) if len(ground_truth_t)>0 and len(predicted_t)>0: output.append('') output.append(metrics.mean_absolute_error([ground_truth_t[0]], [predicted_t[0]])) #output.append(metrics.median_absolute_error([ground_truth_t[0]], [predicted_t[0]])) else:
def run_experiments(log_identificator, formula_type, rnn_type):
    """Predict trace suffixes with a formula-aware beam search and log the results to CSV.

    For every prefix size in ``range(prefix_size_pred_from, prefix_size_pred_to)``
    the traces returned by ``select_declare_verified_traces`` are cropped to that
    prefix, continued with a beam search over the loaded model's predictions, and
    compared against the real continuation. One CSV row is written per trace whose
    ground-truth suffix is non-empty.

    Args:
        log_identificator: forwarded to ``activate_settings``.
        formula_type: forwarded to ``activate_settings``.
        rnn_type: ``"CF"`` or ``"CFR"``; selects which of the two model files
            returned by ``activate_settings`` is loaded and is part of the output
            file name.

    Raises:
        ValueError: if ``rnn_type`` is neither ``"CF"`` nor ``"CFR"``.
    """
    (eventlog,
     path_to_model_file_cf,
     path_to_model_file_cfr,
     path_to_declare_model_file,
     beam_size,
     prefix_size_pred_from,
     prefix_size_pred_to,
     formula) = activate_settings(log_identificator, formula_type)

    if rnn_type == "CF":
        path_to_model_file = path_to_model_file_cf
    elif rnn_type == "CFR":
        path_to_model_file = path_to_model_file_cfr
    else:
        # Fail fast with a clear message instead of hitting an unbound
        # ``path_to_model_file`` only after the data has been prepared.
        raise ValueError(
            "unsupported rnn_type %r, expected 'CF' or 'CFR'" % (rnn_type,))

    start_time = time.time()

    # prepare the data
    (lines,
     lines_id,
     lines_group,
     lines_t,
     lines_t2,
     lines_t3,
     lines_t4,
     maxlen,
     chars,
     chars_group,
     char_indices,
     char_indices_group,
     divisor,
     divisor2,
     divisor3,
     predict_size,
     target_indices_char,
     target_indices_char_group,
     target_char_indices,
     target_char_indices_group) = prepare_testing_data(eventlog)

    # find cycles and modify the probability functionality goes here
    stop_symbol_probability_amplifier_current = 1

    # load model, set this to the model generated by train.py
    model = load_model(path_to_model_file)

    def get_symbol_group(predictions, vth_best=0):
        """Return the group symbol with the (vth_best + 1)-th highest score."""
        v = np.argsort(predictions)[len(predictions) - vth_best - 1]
        return target_indices_char_group[v]

    class NodePrediction:
        """One beam-search node: encoded model input plus the trace built so far."""

        def __init__(self, data, trace_id, crop_line, crop_line_group,
                     crop_times, tot_predicted_time, probability_of=0):
            self.data = data
            self.trace_id = trace_id
            self.cropped_line = crop_line
            self.cropped_line_group = crop_line_group
            self.cropped_times = crop_times
            self.total_predicted_time = tot_predicted_time
            # accumulated log-probability of the path leading to this node
            self.probability_of = probability_of

    # make predictions
    with open(
            'output_files/final_experiments/results/LTL/%s_%s.csv' %
            (eventlog[:-4], rnn_type), 'wb') as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=',',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        # headers for the new file
        spamwriter.writerow([
            "Prefix length", "Ground truth", "Predicted", "Levenshtein",
            "Damerau", "Jaccard", "Ground truth times", "Predicted times",
            "RMSE", "MAE", "Median AE", "Ground Truth Group",
            "Predicted Group", "Levenshtein Group"
        ])

        # make predictions for different prefix sizes as specified in 'shared variables'
        for prefix_size in range(prefix_size_pred_from, prefix_size_pred_to):
            print(prefix_size)

            (lines_s,
             lines_id_s,
             lines_group_s,
             lines_t_s,
             lines_t2_s,
             lines_t3_s,
             lines_t4_s) = select_declare_verified_traces(
                 path_to_declare_model_file, lines, lines_id, lines_group,
                 lines_t, lines_t2, lines_t3, lines_t4, prefix_size)

            print("prefix size: " + str(prefix_size))
            print("formulas verified: " + str(len(lines_s)) + " out of : " +
                  str(len(lines)))

            counterr = 0
            for line, line_id, line_group, times, times2, times3, times4 in zip(
                    lines_s, lines_id_s, lines_group_s, lines_t_s, lines_t2_s,
                    lines_t3_s, lines_t4_s):
                times.append(0)
                cropped_line_id = line_id
                cropped_line = ''.join(line[:prefix_size])
                cropped_line_group = ''.join(line_group[:prefix_size])
                cropped_times = times[:prefix_size]
                cropped_times3 = times3[:prefix_size]
                cropped_times4 = times4[:prefix_size]

                if len(times2) < prefix_size:
                    # make no prediction for this case, since this case has ended already
                    continue

                # initialize root of the tree for beam search
                total_predicted_time_initialization = 0
                search_node_root = NodePrediction(
                    encode(cropped_line, cropped_line_group, cropped_times,
                           cropped_times3, maxlen, chars, chars_group,
                           char_indices, char_indices_group, divisor,
                           divisor2), cropped_line_id, cropped_line,
                    cropped_line_group, cropped_times4,
                    total_predicted_time_initialization)

                ground_truth = ''.join(line[prefix_size:prefix_size +
                                            predict_size])
                ground_truth_group = ''.join(
                    line_group[prefix_size:prefix_size + predict_size])
                # remaining time: last entry of times2 minus the entry at the prefix end
                ground_truth_t = times2[prefix_size - 1]
                case_end_time = times2[len(times2) - 1]
                ground_truth_t = case_end_time - ground_truth_t

                # Priorities are negated log-probabilities, so the most
                # probable node is retrieved first.
                queue_next_steps = PriorityQueue()
                queue_next_steps.put(
                    (-search_node_root.probability_of, search_node_root))

                queue_next_steps_future = PriorityQueue()
                start_of_the_cycle_symbol = " "
                found_sattisfying_constraint = False

                current_beam_size = beam_size
                current_prediction_premis = None

                for i in range(predict_size):
                    for k in range(current_beam_size):
                        if queue_next_steps.empty():
                            break

                        _, current_prediction_premis = queue_next_steps.get()

                        if not found_sattisfying_constraint:
                            if verify_formula_as_compliant(
                                    current_prediction_premis.cropped_line,
                                    formula, prefix_size):
                                # the formula verified and we can just finish the predictions
                                # beam size is 1 because predict only sequence of events
                                current_beam_size = 1
                                current_prediction_premis.probability_of = 0.0
                                # overwrite new queue
                                queue_next_steps_future = PriorityQueue()
                                found_sattisfying_constraint = True

                        enc = current_prediction_premis.data
                        temp_cropped_line = current_prediction_premis.cropped_line
                        y = model.predict(enc, verbose=0)  # make predictions
                        # split predictions into separate activity, group and time predictions
                        y_char = y[0][0]
                        y_group = y[1][0]
                        y_t = y[2][0][0]

                        if y_t < 0:
                            y_t = 0
                        # NOTE(review): cropped_times / cropped_times3 / cropped_times4
                        # are shared by every node of this trace's beam, so each
                        # expansion appends to the same lists - confirm this is intended.
                        cropped_times.append(y_t)

                        if not i == 0:
                            stop_symbol_probability_amplifier_current, start_of_the_cycle_symbol = \
                                amplify(temp_cropped_line)

                        # in not reached, function :choose_next_top_descendant: will backtrack
                        y_t = y_t * divisor3
                        cropped_times3.append(cropped_times3[-1] +
                                              timedelta(seconds=y_t))

                        for j in range(current_beam_size):
                            temp_prediction = get_symbol_ampl(
                                y_char, target_indices_char,
                                target_char_indices,
                                start_of_the_cycle_symbol,
                                stop_symbol_probability_amplifier_current, j)
                            temp_prediction_group = get_symbol_group(y_group)

                            # end of case was just predicted, therefore, stop predicting
                            # further into the future
                            if temp_prediction == '!':
                                if verify_formula_as_compliant(
                                        temp_cropped_line, formula,
                                        prefix_size):
                                    stop_symbol_probability_amplifier_current = 1
                                    print('! predicted, end case')
                                    queue_next_steps = PriorityQueue()
                                    break
                                else:
                                    continue

                            temp_cropped_line = current_prediction_premis.cropped_line + temp_prediction
                            temp_cropped_line_group = \
                                current_prediction_premis.cropped_line_group + temp_prediction_group

                            # adds a fake timestamp to the list (last timestamp + 2000 seconds)
                            t = time.strptime(cropped_times4[-1],
                                              "%Y-%m-%d %H:%M:%S")
                            new_timestamp = datetime.fromtimestamp(
                                time.mktime(t)) + timedelta(0, 2000)
                            cropped_times4.append(
                                new_timestamp.strftime("%Y-%m-%d %H:%M:%S"))

                            temp_total_predicted_time = current_prediction_premis.total_predicted_time + y_t

                            temp_state_data = encode(
                                temp_cropped_line, temp_cropped_line_group,
                                cropped_times, cropped_times3, maxlen, chars,
                                chars_group, char_indices, char_indices_group,
                                divisor, divisor2)

                            # j-th largest activity probability
                            probability_this = np.sort(y_char)[len(y_char) -
                                                               1 - j]

                            temp = NodePrediction(
                                temp_state_data, cropped_line_id,
                                temp_cropped_line, temp_cropped_line_group,
                                cropped_times4, temp_total_predicted_time,
                                current_prediction_premis.probability_of +
                                np.log(probability_this))

                            queue_next_steps_future.put(
                                (-temp.probability_of, temp))

                            print ('INFORMATION: ' + str(counterr) + ' ' + str(i) + ' ' + str(k) + ' ' + str(j) + ' ' + \
                                   temp_cropped_line[prefix_size:] + " " + str(temp.probability_of))

                    # the candidates collected in this step become the next step's frontier
                    queue_next_steps = queue_next_steps_future
                    queue_next_steps_future = PriorityQueue()

                counterr += 1

                if current_prediction_premis is None:
                    print(
                        "Cannot find any trace that is compliant with formula given current beam size"
                    )
                    # NOTE(review): this break leaves the trace loop for the current
                    # prefix size and makes the ``is None`` branch below unreachable;
                    # confirm whether ``continue`` (or no jump at all) was intended.
                    break

                output = []

                if current_prediction_premis is None:
                    predicted = u""
                    predicted_group = u""
                    total_predicted_time = 0
                else:
                    predicted = (
                        current_prediction_premis.cropped_line[prefix_size:])
                    predicted_group = (
                        current_prediction_premis.cropped_line_group[prefix_size:])
                    total_predicted_time = current_prediction_premis.total_predicted_time

                if len(ground_truth) > 0:
                    output.append(prefix_size)
                    output.append(unicode(ground_truth).encode("utf-8"))
                    output.append(unicode(predicted).encode("utf-8"))
                    output.append(
                        1 - distance.nlevenshtein(predicted, ground_truth))
                    # float() keeps the ratio fractional even when ``/`` is integer
                    # division (Python 2 without ``from __future__ import division``),
                    # where the similarity would otherwise collapse to 0 or 1.
                    dls = 1 - (damerau_levenshtein_distance(
                        unicode(predicted), unicode(ground_truth)) /
                               float(max(len(predicted), len(ground_truth))))
                    # we encountered problems with Damerau-Levenshtein Similarity on some
                    # linux machines where the default character encoding of the operating system
                    # caused it to be negative, this should never be the case
                    if dls < 0:
                        dls = 0
                    output.append(dls)
                    output.append(1 - distance.jaccard(predicted, ground_truth))
                    output.append(ground_truth_t)
                    output.append(total_predicted_time)
                    # placeholder for the "RMSE" column, which is not computed
                    output.append('')
                    output.append(
                        metrics.mean_absolute_error([ground_truth_t],
                                                    [total_predicted_time]))
                    output.append(
                        metrics.median_absolute_error([ground_truth_t],
                                                      [total_predicted_time]))
                    output.append(unicode(ground_truth_group).encode("utf-8"))
                    output.append(unicode(predicted_group).encode("utf-8"))
                    output.append(1 - distance.nlevenshtein(
                        predicted_group, ground_truth_group))
                    spamwriter.writerow(output)

    print("TIME TO FINISH --- %s seconds ---" % (time.time() - start_time))
def _encode_test_prefix(args, preprocess_manager, cropped_line,
                        cropped_line_add, batch_size):
    """Encode a prefix into the model input, with or without additional attributes.

    ``cropped_line_add`` is None when the data set has no additional attributes.
    """
    if cropped_line_add is None:
        return preprocess_manager.encode_test_set(cropped_line, batch_size)
    # encode_test_set_add returns two more values besides the input vector;
    # they are not needed for prediction.
    input_vec, _, _ = preprocess_manager.encode_test_set_add(
        args, cropped_line, cropped_line_add, batch_size)
    return input_vec


def _build_result_row(caseid, prefix_size, ground_truth, predicted):
    """Build one CSV row of similarity scores; ``ground_truth`` must be non-empty."""
    dls = 1 - (damerau_levenshtein_distance(str(predicted), str(ground_truth))
               / max(len(predicted), len(ground_truth)))
    # a similarity below zero is clamped to 0
    if dls < 0:
        dls = 0
    return [
        caseid,
        prefix_size,
        # Plain text on purpose: csv.writer stringifies its cells, so passing
        # ``bytes`` would put the literal b'...' representation into the file.
        str(ground_truth),
        str(predicted),
        1 - distance.nlevenshtein(predicted, ground_truth),
        dls,
        1 - distance.jaccard(predicted, ground_truth),
    ]


def test(args, preprocess_manager):
    """Predict the next symbol for every test prefix and write the scores to CSV.

    Loads the model of the current cross-validation fold from
    ``args.checkpoint_dir``. For each prefix size from 2 up to (excluding) the
    maximum sequence length, it predicts ``predict_size`` (= 1) symbols per case
    and writes one ';'-separated row per case to
    ``<result_dir><data_set_name>__<task>_<fold>.csv``. Prefixes that already
    contain the end symbol '!' and cases without a remaining ground-truth
    symbol produce no row.

    Args:
        args: run configuration; reads ``batch_size_test``, ``result_dir``,
            ``task``, ``checkpoint_dir`` and ``data_set``.
        preprocess_manager: supplies the test set, the encoders and
            ``getSymbol``; ``num_features_additional > 0`` selects the variant
            with additional attributes.
    """
    batch_size = args.batch_size_test
    result_dir = args.result_dir
    task = args.task

    has_additional = preprocess_manager.num_features_additional > 0
    if has_additional:
        lines, caseids, lines_add, sequence_max_length, _, _ = \
            preprocess_manager.create_test_set()
    else:
        lines, caseids, sequence_max_length, _, _ = \
            preprocess_manager.create_test_set()
        lines_add = None

    model = keras.models.load_model(
        '%smodel_%s.h5' %
        (args.checkpoint_dir, preprocess_manager.iteration_cross_validation))

    predict_size = 1
    data_set_name = args.data_set.split('.csv')[0]
    generic_result_dir = result_dir + data_set_name + "__" + task
    result_path = generic_result_dir + "_%d%s" % (
        preprocess_manager.iteration_cross_validation, ".csv")

    # newline='' lets the csv module control line endings (no blank rows on Windows).
    with open(result_path, 'w', newline='') as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=';',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        # The "Groud truth" spelling is kept as-is because readers of the
        # result files may look the column up by this exact name.
        spamwriter.writerow([
            "CaseID", "Prefix length", "Groud truth", "Predicted",
            "Levenshtein", "Damerau", "Jaccard"
        ])

        for prefix_size in range(2, sequence_max_length):
            util.llprint("\nPrefix size: %d\n" % prefix_size)

            if has_additional:
                cases = zip(lines, caseids, lines_add)
            else:
                cases = ((line, caseid, None)
                         for line, caseid in zip(lines, caseids))

            for line, caseid, line_add in cases:
                cropped_line = ''.join(line[:prefix_size])
                cropped_line_add = line_add[:prefix_size] if has_additional else None
                if '!' in cropped_line:
                    # the case has already ended inside the prefix
                    continue

                ground_truth = ''.join(line[prefix_size:prefix_size +
                                            predict_size])
                if len(ground_truth) == 0:
                    # nothing left to predict, so nothing to score or write
                    continue

                predicted = ''
                # never predict more symbols than the ground truth contains
                for _ in range(min(predict_size, len(ground_truth))):
                    input_vec = _encode_test_prefix(args, preprocess_manager,
                                                    cropped_line,
                                                    cropped_line_add,
                                                    batch_size)
                    y = model.predict(input_vec, verbose=0)
                    y_char = y[0][:]
                    prediction = preprocess_manager.getSymbol(y_char)
                    cropped_line += prediction
                    predicted += prediction
                    if prediction == '!':
                        print('! predicted, end case')
                        break

                spamwriter.writerow(
                    _build_result_row(caseid, prefix_size, ground_truth,
                                      predicted))